//#define EPS_RED //#define EPS_MUL 0.0001f static inline __host__ float2 float2_split(const double x) { const float fx = float(x); const float fy = float(x - double(fx)); return make_float2(fx, fy); } static inline __host__ double float2_todouble(const float2 f) { return double(f.x) + double(f.y); } static inline __device__ float2 float2_accum(const float2 acc, const float x) { const float ax = acc.x + x; const float ay = acc.y - ((ax - acc.x) - x); return make_float2(ax, ay); } static inline __device__ float float2_diff(const float2 xj, const float2 xi){ return (xj.x - xi.x) + (xj.y - xi.y); } struct Iparticle{ float2 pos[3]; // 6 float vel[3]; // 9 float eps2; // 10 float h2; // 11 int id; // 12 __host__ void read( const double h_pos[], const double h_vel[], const double h_eps2, const double h_h2, const int h_id) { for(int k=0; k<3; k++){ pos[k] = float2_split(h_pos[k]); vel[k] = float(h_vel[k]); } eps2 = float(h_eps2); h2 = float(h_h2); id = h_id; } }; struct Jparticle{ float2 pos [3]; // 6 float vel [3]; // 9 float acc2[3]; // 12 float jrk6[3]; // 15 float mass; // 16 float2 tj; // 18 int id; // 19 int addr; // 20 __host__ void read( const double h_pos[], const double h_vel[], const double h_acc2[], const double h_jrk6[], const double h_mass, const double h_tj, const int h_id, const int h_addr) { for(int k=0; k<3; k++){ pos [k] = float2_split(h_pos[k]); vel [k] = float(h_vel [k]); acc2[k] = float(h_acc2[k]); jrk6[k] = float(h_jrk6[k]); } mass = float(h_mass); tj = float2_split(h_tj); id = h_id; addr = h_addr; assert(addr < NBODY_MAX); } }; struct Jppred{ float2 pos[3]; // 6 float vel[3]; // 9 float mass; // 10 int id; // 11 int pad; // 12 enum{ SIZE_F4 = 3, }; __device__ void predict( const Jparticle &jp, const float2 ti) { const float dt = float2_diff(ti, jp.tj); #pragma unroll for(int k=0; k<3; k++){ pos[k].x = jp.pos[k].x; pos[k].y = jp.pos[k].y + dt*(jp.vel[k] + dt*(jp.acc2[k] + dt*jp.jrk6[k])); vel[k] = jp.vel[k] + (2.f*dt)*(jp.acc2[k] + (1.5f*dt)*(jp.jrk6[k])); } mass = jp.mass; id = jp.id ; } }; struct Interaction{ float3 acc; float3 jrk; float pot; __device__ Interaction( const Iparticle &ip, const Jppred &jp){ const float dx = float2_diff(jp.pos[0], ip.pos[0]); const float dy = float2_diff(jp.pos[1], ip.pos[1]); const float dz = float2_diff(jp.pos[2], ip.pos[2]); const float dvx = jp.vel[0] - ip.vel[0]; const float dvy = jp.vel[1] - ip.vel[1]; const float dvz = jp.vel[2] - ip.vel[2]; #ifdef EPS_RED float r2, tmp_eps2; tmp_eps2 = ip.eps2; // default value ----> eps = 1e-5 !!! // larger ----> 10 x eps if mass > 1e-6 (i.e. high mass part.) if( (jp.id < 99998) || (ip.id < 99998) ) tmp_eps2 *= 100.0f; // if i or j is a BH's ----> 1e-2 * eps if( (ip.id == 999998) || (jp.id == 999998) || (ip.id == 999999) || (jp.id == 999999) ) { r2 = EPS_MUL*tmp_eps2 + dx*dx + dy*dy + dz*dz; } else { r2 = tmp_eps2 + dx*dx + dy*dy + dz*dz; } #else const float r2 = ip.eps2 + dx*dx + dy*dy + dz*dz; #endif /* #ifdef EPS_RED float r2; // if i or j is a BH's if( (ip.id == 0) || (jp.id == 0) || (ip.id == 1) || (jp.id == 1) ) { r2 = EPS_MUL*ip.eps2 + dx*dx + dy*dy + dz*dz; } else { r2 = ip.eps2 + dx*dx + dy*dy + dz*dz; } #else const float r2 = ip.eps2 + dx*dx + dy*dy + dz*dz; #endif */ // const float r2 = ip.eps2 + dx*dx + dy*dy + dz*dz; const float rv = dx*dvx + dy*dvy + dz*dvz; const float rinv1 = (jp.id == ip.id) ? 0.0f : rsqrtf(r2); const float rinv2 = rinv1 * rinv1; const float mrinv1 = jp.mass * rinv1; const float mrinv3 = mrinv1 * rinv2; const float alpha = -3.f * rv * rinv2; acc.x = mrinv3 * dx; acc.y = mrinv3 * dy; acc.z = mrinv3 * dz; jrk.x = mrinv3 * (dvx + alpha * dx); jrk.y = mrinv3 * (dvy + alpha * dy); jrk.z = mrinv3 * (dvz + alpha * dz); pot = mrinv1; // use positive definition here } __device__ void set_neib(int &dst) const{ // do nothing } }; struct Interaction_NB{ float3 acc; float3 jrk; float pot; float nb_rinv; int jid; bool is_neib; __device__ Interaction_NB( const Iparticle &ip, const Jppred &jp) { const float dx = float2_diff(jp.pos[0], ip.pos[0]); const float dy = float2_diff(jp.pos[1], ip.pos[1]); const float dz = float2_diff(jp.pos[2], ip.pos[2]); const float dvx = jp.vel[0] - ip.vel[0]; const float dvy = jp.vel[1] - ip.vel[1]; const float dvz = jp.vel[2] - ip.vel[2]; // if( (jp.id) > 1 && (jp.id < 200000) ) ip.eps2 *= 100; const float r2 = ip.eps2 + dx*dx + dy*dy + dz*dz; const float rv = dx*dvx + dy*dvy + dz*dvz; const float rinv1 = (jp.id == ip.id) ? 0.0f : rsqrtf(r2); const float rinv2 = rinv1 * rinv1; const float mrinv1 = jp.mass * rinv1; const float mrinv3 = mrinv1 * rinv2; const float alpha = -3.f * rv * rinv2; acc.x = mrinv3 * dx; acc.y = mrinv3 * dy; acc.z = mrinv3 * dz; jrk.x = mrinv3 * (dvx + alpha * dx); jrk.y = mrinv3 * (dvy + alpha * dy); jrk.z = mrinv3 * (dvz + alpha * dz); pot = mrinv1; // use positive definition here nb_rinv = rinv1; jid = jp.id; is_neib = (r2 < ip.h2) && (jp.id != ip.id); } __device__ void set_neib(int &dst) const{ if(is_neib) dst = jid; } }; struct Force{ float2 acc[3]; // 6 float jrk[3]; // 9 float2 pot; // 11 int nnb_id; // 12 ID of nearest neighbor float nnb_rinv; // 13 rinv of nearest neighbor int num_neib; // 14 __host__ void write( double h_acc[], double h_jrk[], double &h_pot, int &h_nnb_id, int &h_num_neib) const { for(int k=0; k<3; k++){ h_acc[k] = float2_todouble(acc[k]); h_jrk[k] = double(jrk[k]); } h_pot = - float2_todouble(pot); h_nnb_id = nnb_id; h_num_neib = num_neib; } __device__ void clear() { #pragma unroll for(int k=0; k<3; k++){ acc[k] = make_float2(0.0f, 0.0f); jrk[k] = 0.0f; } pot = make_float2(0.0f, 0.0f); nnb_id = -1; nnb_rinv = 0.0f; num_neib = 0; } __device__ void check_overflow(){ if(num_neib > NB_MAX) num_neib = -1; } // for the redction kernel __device__ void operator+=( const Force &fo) { #pragma unroll for(int k=0; k<3; k++){ acc[k] = float2_accum(acc[k], fo.acc[k].x); acc[k] = float2_accum(acc[k], fo.acc[k].y); jrk[k] += fo.jrk[k]; } pot = float2_accum(pot, fo.pot.x); pot = float2_accum(pot, fo.pot.y); if(num_neib>=0 && fo.num_neib>=0){ num_neib += fo.num_neib; }else{ // overflow num_neib = -1; } // nearest neighbor if(nnb_rinv < fo.nnb_rinv){ nnb_id = fo.nnb_id; nnb_rinv = fo.nnb_rinv; } } // for the gravity kernel __device__ void operator+=( const Interaction &fo) { acc[0] = float2_accum(acc[0], fo.acc.x); acc[1] = float2_accum(acc[1], fo.acc.y); acc[2] = float2_accum(acc[2], fo.acc.z); pot = float2_accum(pot, fo.pot); jrk[0] += fo.jrk.x; jrk[1] += fo.jrk.y; jrk[2] += fo.jrk.z; } __device__ void operator+=( const Interaction_NB &fo) { acc[0] = float2_accum(acc[0], fo.acc.x); acc[1] = float2_accum(acc[1], fo.acc.y); acc[2] = float2_accum(acc[2], fo.acc.z); jrk[0] += fo.jrk.x; jrk[1] += fo.jrk.y; jrk[2] += fo.jrk.z; pot = float2_accum(pot, fo.pot); // neighbor list counter if(fo.is_neib) num_neib++; // nearest neighbor if(nnb_rinv < fo.nb_rinv){ nnb_id = fo.jid; nnb_rinv = fo.nb_rinv; } } };