// #include <omp.h>       // (assumed) only the commented-out multi-GPU code needs it
#include <cstdio>
#include <cassert>
#include "cuda_pointer.h"

enum{
	MAX_GPU    = 4,
	MAX_CPU    = 4,
	NBODY_MAX  = (1048576), // 2^20 j-particles
	NB_MAX     = 256,       // neighbours per (i-particle, j-block)
	MAX_NB_BUF = (1048576), // total entries of the packed neighbour list
};

#include "gpu.h"
#include "particle.h"

#define _out_

__global__ void kernel_jp_scatter(
		const int       nj,
		const Jparticle jpsrc[],
		_out_ Jparticle jpdst[])
{
	const int tid = threadIdx.x + blockDim.x * blockIdx.x;
	if(tid < nj){
		// scatter each queued update to its home address
		const Jparticle jp = jpsrc[tid];
		jpdst[jp.addr] = jp;
	}
}

__global__ void kernel_predict(
		const int       nj,
		const float2    ti,
		const Jparticle jptcl[],
		_out_ Jppred    jpred[])
{
#if 0
	const int tid = threadIdx.x + blockDim.x * blockIdx.x;
	if(tid < nj){
		jpred[tid].predict(jptcl[tid], ti);
	}
#else
	const int tid = threadIdx.x;
	const int off = blockDim.x * blockIdx.x;
	const int nth = blockDim.x;
	__shared__ float4 sbuf[NTHREADS*5];
	Jparticle *sptcl = (Jparticle *)sbuf;
	Jppred    *spred = (Jppred    *)sbuf;
	{ // LOAD, coalesced: 5 float4 words per Jparticle
		float4 *src = (float4 *)(jptcl + off);
		float4 *dst = (float4 *)(sptcl);
#pragma unroll
		for(int k=0; k<5; k++, src+=nth, dst+=nth){
			dst[tid] = src[tid];
		}
	}
	// Predict
	__syncthreads();
	Jppred pp;
	pp.predict(sptcl[tid], ti);
	__syncthreads();
	spred[tid] = pp; // sbuf is reused in place, hence the fences around it
	__syncthreads();
	{ // STORE, coalesced: 3 float4 words per Jppred
		float4 *src = (float4 *)(spred);
		float4 *dst = (float4 *)(jpred + off);
#pragma unroll
		for(int k=0; k<3; k++, src+=nth, dst+=nth){
			dst[tid] = src[tid];
		}
	}
#endif
}

#define INTERACTION Interaction_NB

__global__ void kernel_gravity(
		const int       ni,
		const int       nj,
		const Iparticle ipbuf[],
		const Jppred    jpbuf[],
		_out_ Force     fodev[][NJBLOCKS],
		_out_ int       nbbuf[][NJBLOCKS][NB_MAX],
		const bool      with_neib)
{
	int ibid = blockIdx.x;
	int jbid = blockIdx.y;
	int tid  = threadIdx.x;
	int iaddr  = tid + blockDim.x * ibid;
	int jstart = (nj * (jbid  )) / NJBLOCKS;
	int jend   = (nj * (jbid+1)) / NJBLOCKS;

	int *nbdst = nbbuf[iaddr][jbid];
	__shared__ Jppred jpshare[NJPSHRE]; // 32

	const Iparticle ip = ipbuf[iaddr];
	Force fo;
	fo.clear();

	/* The two j-loops below are a reconstruction (this part of the source was
	 * lost): each pass stages NJPSHRE predicted particles into shared memory
	 * as float4 words, then accumulates them with INTERACTION. The exact
	 * INTERACTION signature is an assumption; Interaction_NB is assumed to
	 * record neighbour indices into nbdst (never past NB_MAX) while counting
	 * all of them in fo.num_neib. */
	if(with_neib){
		for(int j=jstart; j<jend; j+=NJPSHRE){
			__syncthreads();
			if(tid < NJPSHRE * Jppred::SIZE_F4){
				((float4 *)jpshare)[tid] = ((float4 *)(jpbuf + j))[tid];
			}
			__syncthreads();
			const int njj = min(NJPSHRE, jend-j);
			for(int jj=0; jj<njj; jj++){
				INTERACTION(fo, ip, jpshare[jj], j+jj, nbdst);
			}
		}
	}else{
		for(int j=jstart; j<jend; j+=NJPSHRE){
			__syncthreads();
			if(tid < NJPSHRE * Jppred::SIZE_F4){
				((float4 *)jpshare)[tid] = ((float4 *)(jpbuf + j))[tid];
			}
			__syncthreads();
			const int njj = min(NJPSHRE, jend-j);
			for(int jj=0; jj<njj; jj++){
				INTERACTION(fo, ip, jpshare[jj], j+jj, (int *)0);
			}
		}
	}
	if(fo.num_neib > NB_MAX) fo.num_neib = -1; // flag list overflow
	fodev[iaddr][jbid] = fo; // rows with iaddr >= ni are scratch
}

/* Reconstruction (this kernel was lost): sum the NJBLOCKS partial forces of
 * each i-particle with a warp-synchronous tree reduction, one y-row of the
 * block per i-particle. Force::operator+= is assumed, and assumed to keep a
 * negative (overflowed) neighbour count negative. */
__global__ void kernel_reduce(
		const int   ni,
		const Force fodev[][NJBLOCKS],
		_out_ Force fosum[])
{
	const int xid   = threadIdx.x;
	const int yid   = threadIdx.y;
	const int bid   = blockIdx.x;
	const int iaddr = yid + blockDim.y * bid;

	__shared__ Force fshare[NYREDUCE][NXREDUCE];
	Force *fs = fshare[yid];
	if(xid < NJBLOCKS && iaddr < ni){
		fs[xid] = fodev[iaddr][xid];
	}else{
		fs[xid].clear();
	}
	if(32 == NXREDUCE){
		if(xid < 16) fs[xid] += fs[xid+16];
	}
	if(xid < 8) fs[xid] += fs[xid+8];
	if(xid < 4) fs[xid] += fs[xid+4];
	if(xid < 2) fs[xid] += fs[xid+2];
	if(xid < 1) fs[xid] += fs[xid+1];
	if(iaddr < ni && 0 == xid){
		fosum[iaddr] = fs[0];
	}
}

__global__ void kernel_gather_nb(
		const int   ni,
		const Force fodev[][NJBLOCKS],
		const int2  nbcnt[], // {num, off} per i-particle
		const int   nbbuf[][NJBLOCKS][NB_MAX],
		_out_ int   nblst[])
{
	const int xid   = threadIdx.x;
	const int yid   = threadIdx.y;
	const int bid   = blockIdx.x;
	const int iaddr = yid + blockDim.y * bid;
	if(iaddr >= ni) return;
	if(nbcnt[iaddr].x < 0) return; // overflow

	const int mynnb = (xid < NJBLOCKS) ? fodev[iaddr][xid].num_neib : 0;
	// now perform a prefix sum: warp-synchronous Hillis-Steele scan,
	// each y-row lives inside one warp (hence assert(NXREDUCE <= 32))
	__shared__ int ishare[NYREDUCE][NXREDUCE];
	ishare[yid][xid] = mynnb;
	int *ish = ishare[yid];
	if(xid>=1) ish[xid] += ish[xid-1];
	if(xid>=2) ish[xid] += ish[xid-2];
	if(xid>=4) ish[xid] += ish[xid-4];
	if(xid>=8) ish[xid] += ish[xid-8];
	if(32 == NXREDUCE){
		if(xid>=16) ish[xid] += ish[xid-16];
	}
	// exclusive offset of this j-block within the i-particle's packed list
	const int off = (xid == 0) ? 0 : ish[xid-1];
	int *nbdst = nblst + nbcnt[iaddr].y + off;
	if(xid < NJBLOCKS){
		for(int k=0; k<mynnb; k++){ // loop body reconstructed
			nbdst[k] = nbbuf[iaddr][xid][k];
		}
	}
}
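/* Illustration only (not called by the library): a serial host-side
 * reference for kernel_reduce + kernel_gather_nb's neighbour packing. Each
 * i-particle's per-j-block lists land contiguously at its offset, which is
 * exactly what the device-side scan computes in parallel. */
static int gather_nb_host_sketch(
		const int   ni,
		const Force fopart[][NJBLOCKS],      // partial forces; .num_neib per j-block
		const int   nbbuf [][NJBLOCKS][NB_MAX],
		_out_ int   nboff [],                // starting offset of each i-particle
		_out_ int   nblst [])                // packed neighbour lists
{
	int tot = 0;
	for(int i=0; i<ni; i++){
		nboff[i] = tot;
		for(int jb=0; jb<NJBLOCKS; jb++){
			const int nnb = fopart[i][jb].num_neib;
			for(int k=0; k<nnb; k++){ // nnb may be -1 (overflow): copies nothing
				nblst[tot++] = nbbuf[i][jb][k];
			}
		}
	}
	return tot; // must not exceed MAX_NB_BUF
}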
// split a double into a high/low float pair (double-single representation);
// definition reconstructed from its use in set_ti() below
static float2 float2_split(const double x){
	float2 ret;
	ret.x = (float)x;
	ret.y = (float)(x - (double)ret.x);
	return ret;
}

/* Per-device context. The class name and the member declarations were lost
 * in this part of the source; everything below the comment is reconstructed
 * from its use in the surviving method bodies. */
class Resource{
	int    gpid;        // CUDA device bound to this context
	bool   is_open;
	int    ni_save;     // #i-particles of the current step
	int    njp_in_que;  // queued j-particle updates not yet on the GPU
	float2 ti;          // current time, double-single
	bool   predicted;   // jpred[] is up to date for time ti
	bool   grav_called; // kernel_gravity results pending in fodev/nbbuf
	bool   jp_flushed;  // jpque[] has been scattered into jpbuf[]

	cudaPointer<Iparticle> ipbuf;
	cudaPointer<Jparticle> jpbuf;
	cudaPointer<Jparticle> jpque;
	cudaPointer<Jppred   > jpred;
	cudaPointer<Force[NJBLOCKS]> fodev;
	cudaPointer<Force    > fosum;
	cudaPointer<int[NJBLOCKS][NB_MAX]> nbbuf;
	cudaPointer<int      > nblst;
	cudaPointer<int2     > nbcnt; // {num, off}

	void allocate(){
		ipbuf.allocate(NIMAX);
		jpbuf.allocate(NBODY_MAX);
		jpque.allocate(NBODY_MAX);
		jpred.allocate(NBODY_MAX);
		fodev.allocate(NIMAX);
		fosum.allocate(NIMAX);
		nbbuf.allocate(NIMAX);
		nblst.allocate(MAX_NB_BUF);
		nbcnt.allocate(NIMAX);
	}
	void free(){
		ipbuf.free();
		jpbuf.free();
		jpque.free();
		jpred.free();
		fodev.free();
		fosum.free();
		nbbuf.free();
		nblst.free();
		nbcnt.free();
	}
public:
	void set_gpid(){
		int dev;
		cudaGetDevice(&dev);
		gpid = dev;
	}
	void set_gpid(const int id){
		gpid = id;
		int dev;
		cudaGetDevice(&dev);
		assert(dev == gpid);
	}
	void open(){
		assert(!is_open);
		allocate();
		is_open     = true;
		njp_in_que  = 0;
		predicted   = false;
		grav_called = false;
		jp_flushed  = false;
	}
	void close(){
		assert(is_open);
		free();
		is_open = false;
	}
	void set_ip(
			const int    ni,
			const double pos [][3],
			const double vel [][3],
			const double eps2[],
			const double h2  [],
			const int    id  [])
	{
		assert(is_open);
		assert(ni <= NIMAX);
		// loop body reconstructed; an Iparticle packing constructor is assumed
		for(int i=0; i<ni; i++){
			ipbuf[i] = Iparticle(pos[i], vel[i], eps2[i], h2[i], id[i]);
		}
		ipbuf.htod(ni);
		ni_save = ni;
	}
	/* Reconstructed (the definition was lost): queue one j-particle update.
	 * The argument list and the Jparticle constructor are assumptions; the
	 * destination address jp.addr is what kernel_jp_scatter consumes. */
	void set_jp(
			const int    addr,
			const double mass,
			const double pos[3],
			const double vel[3],
			const double acc[3],
			const double jrk[3],
			const double time)
	{
		assert(is_open);
		assert(njp_in_que < NBODY_MAX);
		jpque[njp_in_que++] = Jparticle(mass, pos, vel, acc, jrk, time, addr);
		jp_flushed = false;
	}
	void transfer_jp(){
		assert(is_open);
		const int njq = njp_in_que;
		jpque.htod(njq);
		const int nblocks = 1 + (njq-1) / NTHREADS;
		kernel_jp_scatter <<< nblocks, NTHREADS >>> (njq, jpque, jpbuf);
		njp_in_que = 0;
		jp_flushed = true;
		predicted  = false;
	}
	void set_ti(const double dbl_ti){
		assert(is_open);
		ti = float2_split(dbl_ti);
		predicted = false;
	}
	void predict_all(const int nj){
		assert(is_open);
		const int Blocks = 1 + (nj-1)/NTHREADS;
		kernel_predict <<< Blocks, NTHREADS >>> (nj, ti, jpbuf, jpred);
		predicted = true;
	}
	void launch_gravity(
			const int  ni,
			const int  nj,
			const bool with_neib)
	{
		assert(is_open);
		assert(ni == ni_save);
		assert(ni <= NIMAX);
		assert(nj <  NBODY_MAX);
		if(!jp_flushed) transfer_jp();
		if(!predicted ) predict_all(nj);
		if(ni <= 64){ // a single i-block is enough
			dim3 grid   ( 1, NJBLOCKS, 1);
			dim3 threads(64, 1, 1);
			kernel_gravity <<< grid, threads >>>
				(ni, nj, ipbuf, jpred, fodev, nbbuf, with_neib);
		}else{
			const int niblocks = 1 + (ni-1) / NTHREADS;
			dim3 grid   (niblocks, NJBLOCKS, 1);
			dim3 threads(NTHREADS, 1, 1);
			kernel_gravity <<< grid, threads >>>
				(ni, nj, ipbuf, jpred, fodev, nbbuf, with_neib);
		}
		grav_called = true;
	}
	void get_force(
			const int    ni,
			_out_ double acc[][3],
			_out_ double jrk[][3],
			_out_ double pot[],
			_out_ int    nnb_id[])
	{
		assert(is_open);
		assert(grav_called);
		assert(ni == ni_save);
		assert(ni <= NIMAX);
		{
			const int ni8 = 1 + (ni-1) / NYREDUCE;
			dim3 grid   (ni8, 1, 1);
			dim3 threads(NXREDUCE, NYREDUCE, 1);
			kernel_reduce <<< grid, threads >>> (ni, fodev, fosum);
		}
		fosum.dtoh(ni);
		grav_called = false;
		// copy-out loop reconstructed; the Force members other than num_neib
		// are assumed names
		for(int i=0; i<ni; i++){
			const Force &fo = fosum[i];
			for(int k=0; k<3; k++){
				acc[i][k] = fo.acc[k];
				jrk[i][k] = fo.jrk[k];
			}
			pot[i]     = fo.pot;
			nnb_id[i]  = fo.neib_id;
			nbcnt[i].x = fo.num_neib; // negative marks an overflowed list
		}
		{ // pack the per-j-block neighbour lists into nblst
			int nbsum = 0;
			for(int i=0; i<ni; i++){
				nbcnt[i].y = nbsum; // starting offset of particle i
				if(nbcnt[i].x >= 0) nbsum += nbcnt[i].x;
			}
			assert(nbsum <= MAX_NB_BUF);
			nbcnt.htod(ni);
			const int ni8 = 1 + (ni-1) / NYREDUCE;
			dim3 grid   (ni8, 1, 1);
			dim3 threads(NXREDUCE, NYREDUCE, 1);
			kernel_gather_nb <<< grid, threads >>> (ni, fodev, nbcnt, nbbuf, nblst);
			nblst.dtoh(nbsum);
		}
	}
	void get_neighbor_list(
			const int ipipe,
			const int maxlen,
			_out_ int *num_neib,
			_out_ int list[])
	{
		assert(is_open);
		assert(ipipe < ni_save);
		const int  nnb = nbcnt[ipipe].x;
		const int  off = nbcnt[ipipe].y;
		const int *src = &nblst[off];
		if(nnb > 0 && maxlen >= nnb){
			for(int k=0; k<nnb; k++){
				list[k] = src[k];
			}
			*num_neib = nnb;
		}else{
			// reconstructed: the original failure convention was lost;
			// report -1 if the caller's buffer is too small, else pass
			// the raw count through (zero, or negative on overflow)
			*num_neib = (nnb > 0) ? -1 : nnb;
		}
	}
};
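/* Usage sketch (illustration only; nothing here is called by the library).
 * It mirrors the intended call sequence of the methods above; "Resource",
 * set_jp's argument list and the particle constructors are reconstructions,
 * not a confirmed API. */
static void example_step(Resource &res){
	const int    ni = 1, nj = 1, id[1] = {0};
	const double pos [1][3] = {{0.0, 0.0, 0.0}};
	const double vel [1][3] = {{0.0, 0.0, 0.0}};
	const double zero[3]    = {0.0, 0.0, 0.0};
	const double eps2[1] = {1.0e-6}, h2[1] = {0.01};

	res.open();                                           // allocate device buffers
	res.set_jp(0, 1.0, pos[0], vel[0], zero, zero, 0.0);  // queue a j-particle
	res.set_ip(ni, pos, vel, eps2, h2, id);               // load active i-particles
	res.set_ti(0.5);                                      // time to predict to
	res.launch_gravity(ni, nj, true);                     // flushes the queue, predicts

	double acc[1][3], jrk[1][3], pot[1];
	int    nnb_id[1], nnb, list[NB_MAX];
	res.get_force(ni, acc, jrk, pot, nnb_id);             // reduce + copy back + gather
	res.get_neighbor_list(0, NB_MAX, &nnb, list);
	res.close();
}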
static Resource resource[MAX_CPU];
static int numCPU = 1; // this version drives a single GPU through resource[0]
static int numGPU = 0;

/* Library initialisation. The original entry-point name and signature were
 * lost; "gpu_open(gpid)" is an assumed form consistent with the banner and
 * the calls below. */
void gpu_open(const int gpid){
	assert(NXREDUCE >= NJBLOCKS);
	assert(NXREDUCE <= 32);
	assert(sizeof(Jppred) % sizeof(float4) == 0);
	assert(sizeof(Jppred) / sizeof(float4) == Jppred::SIZE_F4);
	assert(NJPSHRE * Jppred::SIZE_F4 <= NTHREADS);

	// int devid[MAX_GPU];
	// cudaGetDeviceCount(&numGPU);
	// assert(numGPU <= MAX_GPU);
	/* multi-GPU device selection, disabled in this version:
	char *gpu_list = getenv("GPU_LIST");
	if(gpu_list){
		// get GPU list from environment variable
		numGPU = 0;
		char *p = strtok(gpu_list, " ");
		while(p){
			devid[numGPU++] = atoi(p);
			p = strtok(NULL, " ");
			assert(numGPU <= MAX_GPU);
		}
	}else{
		// use all GPUs
		for(int i=0; i<numGPU; i++) devid[i] = i;
	}
#pragma omp parallel
	{
		const int tid = omp_get_thread_num();
		if(devid[tid] >= 0){
			cudaSetDevice(devid[tid]);
			resource[tid].set_gpid(devid[tid]);
		}else{
			fprintf(stderr, "Skipping cudaSetDevice(), using the default GPU \n");
			resource[tid].set_gpid();
		}
	}
	*/

	cudaGetDeviceCount(&numGPU);
	assert(numGPU <= MAX_GPU);
	cudaSetDevice(gpid);
	resource[0].set_gpid(gpid);

	fprintf(stderr, "***********************\n");
	fprintf(stderr, "Initializing Yebisu/G6 library\n");
	fprintf(stderr, "#CPU %d, #GPU %d, device: %d\n", numCPU, numGPU, gpid);
	// fprintf(stderr, "device: %d\n", gpid);
	// fprintf(stderr, "device: ");
	// for(int i=0; i<numGPU; i++) fprintf(stderr, " %d", devid[i]);
	fprintf(stderr, "***********************\n");
}
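/* Optional smoke test (illustration only, not part of the library; gpu_open
 * is the reconstructed entry-point name from above). Compile this file with
 * -DYEBISU_SELFTEST to enable it. */
#ifdef YEBISU_SELFTEST
#include <cmath>
int main(){
	// the double-single split should reproduce its input to ~1e-14
	const double t  = 1.0 / 3.0;
	const float2 ts = float2_split(t);
	assert(fabs(((double)ts.x + (double)ts.y) - t) < 1.0e-13);

	gpu_open(0);         // bind device 0 and print the banner
	resource[0].open();  // round-trip the buffer allocation
	resource[0].set_ti(t);
	resource[0].close();
	return 0;
}
#endif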