first commit

This commit is contained in:
Yohai 2019-08-25 16:12:39 +08:00
commit 604f1cb18a
9 changed files with 1683 additions and 0 deletions

30
Makefile Normal file
View file

@ -0,0 +1,30 @@
ETICS_LMAX ?= 2
ETICS_NMAX ?= 10
GPUARCH ?= sm_75
CUDAHOME ?= /usr/local/cuda
OPTIMIZATION ?= 3
CXX ?= g++
NVCC ?= $(CUDAHOME)/bin/nvcc
CXXFLAGS += -O$(OPTIMIZATION)
all : libyebisug6.a libyebisug6.nb.a
yebisu_g6.o : yebisu_g6.cu yebisu_g6.h particle.h
$(NVCC) -arch=$(GPUARCH) $(CUDAFLAGS) -Xcompiler "$(CXXFLAGS)" -I$(CUDAHOME)/include -I$(CUDAHOME)/samples/common/inc -c yebisu_g6.cu
grape6.o : grape6.cpp
$(CXX) $(CXXFLAGS) -c -DNB_FLAG=0 -o $@ $<
grape6.nb.o : grape6.cpp
$(CXX) $(CXXFLAGS) -c -DNB_FLAG=1 -o $@ $<
libyebisug6.a : yebisu_g6.o grape6.o
ar -r $@ $^
ranlib $@
libyebisug6.nb.a : yebisu_g6.o grape6.nb.o
ar -r $@ $^
ranlib $@
clean:
rm -f *.o *.a

54
cuda_pointer.h Normal file
View file

@ -0,0 +1,54 @@
#include <assert.h>
// #include <cutil.h>
#include <helper_cuda.h>
template <typename T>
struct cudaPointer{
T *dev_pointer;
T *host_pointer;
int size;
cudaPointer(){
dev_pointer = NULL;
host_pointer = NULL;
size = 0;
}
~cudaPointer(){
// free();
}
void allocate(int _size){
size = _size;
void *p;
checkCudaErrors(cudaMalloc(&p, size * sizeof(T)));
assert(p);
dev_pointer = (T*)p;
checkCudaErrors(cudaMallocHost(&p, size * sizeof(T)));
assert(p);
host_pointer = (T*)p;
}
void free(){
checkCudaErrors(cudaFree(dev_pointer));
checkCudaErrors(cudaFreeHost(host_pointer));
dev_pointer = NULL;
host_pointer = NULL;
size = 0;
}
void htod(int count){
checkCudaErrors(cudaMemcpy(dev_pointer, host_pointer, count * sizeof(T), cudaMemcpyHostToDevice));
}
void htod(){
this->htod(size);
}
void dtoh(int count){
checkCudaErrors(cudaMemcpy(host_pointer, dev_pointer, count * sizeof(T), cudaMemcpyDeviceToHost));
}
void dtoh(){
this->dtoh(size);
}
T &operator [] (int i){
return host_pointer[i];
}
operator T* (){
return dev_pointer;
}
};

90
g6util.h Normal file
View file

@ -0,0 +1,90 @@
#ifndef G6UTIL_H
#define G6UTIL_H
// #include "typedef.h"
/* constant definitions */
#if !defined(__LANGUAGE_FORTRAN__) && !defined(_LANGUAGE_FORTRAN)
#if defined(__cplusplus)
extern "C"
{
#endif
/* C interface */
/*
* standard functions.
* the number of the cards is hidden to the user.
*/
void g6_open_all(void);
void g6_close_all(void);
int g6_set_j_particle_all(int address, int index, double tj, double dtj, double mass,
double a2by18[3], double a1by6[3], double aby2[3], double v[3], double x[3]);
int g6_set_j_particle_mxonly_all(int address, int index, double mass, double x[3]);
void g6_set_ti_all(double ti);
void g6calc_firsthalf_all(int nj, int ni, int index[], double xi[][3], double vi[][3],
double fold[][3], double jold[][3], double phiold[], double eps2, double h2[]);
void g6calc_firsthalf0_all(int nj, int ni, int index[], double xi[][3], double vi[][3],
double fold[][3], double jold[][3], double phiold[], double *eps2, double h2[], int mode);
int g6calc_lasthalf_all(int nj, int ni, int index[], double xi[][3], double vi[][3],
double eps2, double h2[], double acc[][3], double jerk[][3], double pot[]);
int g6calc_lasthalf0_all(int nj, int ni, int index[], double xi[][3], double vi[][3],
double *eps2, double h2[], double acc[][3], double jerk[][3], double pot[], int mode);
int g6calc_lasthalf2_all(int nj, int ni, int index[], double xi[][3], double vi[][3],
double eps2, double h2[], double acc[][3], double jerk[][3], double pot[], int nnbindex[]);
int g6_read_neighbour_list_all(void);
int g6_get_neighbour_list_all(int ipipe, int maxlength, int *nblen, int nbl[]);
void g6_set_nip_all(int nip);
void g6_set_njp_all(int njp);
void g6_set_i_particle_scales_from_real_value_all(int address, double acc[3], double jerk[3], double phi,
double jfactor, double ffactor);
void g6_set_i_particle_all(int address, int index, double x[3], double v[3], double eps2, double h2);
int g6_get_force_all(double acc[][3], double jerk[][3], double phi[], int flag[]);
int g6_get_force_etc_all(double acc[][3], double jerk[][3], double phi[], int nnbindex[], int flag[]);
void g6_get_predicted_j_particles_all(int nj, double (*x)[3], double (*v)[3]);
int g6_getnjmax_all(void);
/*
* primitive functions to control multiple cards individually.
* the user needs to specify card's device id explicitly.
*/
void g6_open(int clusterid);
void g6_close(int clusterid);
void g6_set_tunit(int newtunit);
void g6_set_xunit(int newxunit);
int g6_set_j_particle(int clusterid, int address, int index, double tj, double dtj, double mass,
double a2by18[3], double a1by6[3], double aby2[3], double v[3], double x[3]);
int g6_set_j_particle_mxonly(int clusterid, int address, int index, double mass, double x[3]);
void g6_set_ti(int clusterid, double ti);
void g6calc_firsthalf(int clusterid, int nj, int ni, int index[], double xi[][3], double vi[][3],
double fold[][3], double jold[][3], double phiold[], double eps2, double h2[]);
void g6calc_firsthalf0(int clusterid, int nj, int ni, int index[], double xi[][3], double vi[][3],
double fold[][3], double jold[][3], double phiold[], double *eps2, double h2[], int mode);
int g6calc_lasthalf(int clusterid, int nj, int ni, int index[], double xi[][3], double vi[][3],
double eps2, double h2[], double acc[][3], double jerk[][3], double pot[]);
int g6calc_lasthalf0(int clusterid, int nj, int ni, int index[], double xi[][3], double vi[][3],
double *eps2, double h2[], double acc[][3], double jerk[][3], double pot[], int mode);
int g6calc_lasthalf2(int clusterid, int nj, int ni, int index[], double xi[][3], double vi[][3],
double eps2, double h2[], double acc[][3], double jerk[][3], double pot[], int nnbindex[]);
int g6_read_neighbour_list(int clusterid);
int g6_get_neighbour_list(int clusterid, int ipipe, int maxlength, int *nblen, int nbl[]);
void g6_set_neighbour_list_sort_mode(int mode);
int g6_get_neighbour_list_sort_mode(void);
int g6_npipes(void);
void g6_set_nip(int clusterid, int nip);
void g6_set_njp(int clusterid, int njp);
void g6_set_i_particle_scales_from_real_value(int clusterid, int address, double acc[3], double jerk[3], double phi,
double jfactor, double ffactor);
void g6_set_i_particle(int clusterid, int address, int index, double x[3], double v[3], double eps2, double h2);
int g6_get_force(int clusterid, double acc[][3], double jerk[][3], double phi[], int flag[]);
int g6_get_force_etc(int clusterid, double acc[][3], double jerk[][3], double phi[], int nnbindex[], int flag[]);
void g6_get_predicted_j_particles(int clusterid, int nj, double x[][3], double v[][3]);
int g6_getnjmax(int clusterid);
#if defined(__cplusplus)
}
#endif
#endif /* LANGUAGE_FORTRAN */
#endif /* G6UTIL_H */

80
gpu.h Normal file
View file

@ -0,0 +1,80 @@
//#define GPUTYPE 8800 // MP=16
//#define NJBL_value 32
//#define GPUTYPE C1060 // MP=30
//#define NJBL_value 30
//#define GPUTYPE M2070 // MP=14
//#define NJBL_value 28
//#define GPUTYPE GF460 // MP=7
//#define NJBL_value 14
//#define GPUTYPE GF470 // MP=14
//#define NJBL_value 28
//#define GPUTYPE GF480 // MP=15
//#define NJBL_value 30
//#define GPUTYPE GF570 // MP=15
//#define NJBL_value 30
#define GPUTYPE K20m // MP=13
#define NJBL_value 26
//#define GPUTYPE GF660 // MP=5
//#define NJBL_value 10
//#define GPUTYPE GF780 // MP=15
//#define NJBL_value 30
//#define GPUTYPE GF1080 // MP=20
//#define NJBL_value 20
//#define GPUTYPE GFTITANX // MP=24
//#define NJBL_value 24
// NJBLOCKS = 1 or 2 x MP
// NXREDUCE is a power of 2 and >= NJBLOCKS ~32 is good for all
// NYREDUCE is a power of 2 < NXREDUCE ~4 is good for all
enum{
SM_VER = 35,
NTHREADS = 128,
NTHSCAT = 64,
NIBLOCKS = 16,
NJPSHRE = 32,
NIMAX = (NTHREADS * NIBLOCKS),
NJBLOCKS = NJBL_value,
NXREDUCE = 32,
NYREDUCE = 4,
};
/*
----------------------------------------------------------------------------
type MP x Cores x GPU clock rate SP Gflop/s DP Gflop/s
per MP (GHz) (nbody n=200k) (nbody n=200k)
8800 GT 14 x 8 x 1.62 = 181.44 -
8800 512 16 x 8 x 1.80 = 230.40 298.266 -
9800 GTX 16 x 8 x 1.85 = 236.80 331.754 -
250 GTS 16 x 8 x 1.73 = 221.44
C1060 30 x 8 x 1.30 = 312.00 450.861 55.412
M2070 14 x 32 x 1.15 = 515.20 546.416 250.454
460 v2 7 x 48 x 1.56 = 524.16 421.335 67.939
470 14 x 32 x 1.22 = 546.56 578.572 105.796
480 15 x 32 x 1.40 = 672.00 705.514 145.502
570 15 x 32 x 1.46 = 700.80 736.882 151.960
670 7 x 192 x 0.71 = 954.24 1151.030 100.092
680 8 x 192 x 0.71 = 1090.56 1187.710 101.211
K20m 13 x 192 x 0.71 = 1772.16 1322.374 576.523
TITAN 14 x 192 x 0.88 = 2365.44 2026.287 751.671
780 Ti 15 x 192 x 0.93 = 2678.40 2691.603 198.987
----------------------------------------------------------------------------
*/

173
grape6.cpp Normal file
View file

@ -0,0 +1,173 @@
#include <algorithm>
#include <cassert>
#include "yebisu_g6.h"
#include "g6util.h"
#define NIMAX 2048
#define MAXDEV 4
#ifndef NB_FLAG
#define NB_FLAG 0
#endif
extern "C"
{
static int sort_mode = 0;
static double eps2_buf[MAXDEV][NIMAX];
static int nnb_buf[MAXDEV][NIMAX];
void g6_open(int clusterid){
assert(clusterid < MAXDEV);
assert(NIMAX >= g6_npipes());
yebisu_g6_open(clusterid);
}
void g6_close(int clusterid){
yebisu_g6_close(clusterid);
}
void g6_set_tunit(int newtunit){}
void g6_set_xunit(int newxunit){}
int g6_set_j_particle(
int clusterid,
int address,
int index,
double tj,
double dtj,
double mass,
double a2by18[3],
double a1by6[3],
double aby2[3],
double v[3],
double x[3])
{
yebisu_g6_push_jp(clusterid, x, v, aby2, a1by6, mass, tj, index, address);
return 0;
}
void g6_set_ti(int clusterid, double ti){
yebisu_g6_set_ti(clusterid, ti);
}
void g6calc_firsthalf0(
int clusterid,
int nj,
int ni,
int index[],
double xi[][3],
double vi[][3],
double fold[][3],
double jold[][3],
double phiold[],
double *eps2,
double h2[],
int mode)
{
double *eps2_ptr = eps2;
if(mode){ // constand eps2
eps2_ptr = eps2_buf[clusterid];
for(int i=0; i<ni; i++){
eps2_ptr[i] = *eps2;
}
}
yebisu_g6_set_ip(clusterid, ni, xi, vi, eps2_ptr, h2, index);
yebisu_g6_launch_gravity(clusterid, ni, nj, NB_FLAG);
}
void g6calc_firsthalf(
int clusterid,
int nj,
int ni,
int index[],
double xi[][3],
double vi[][3],
double fold[][3],
double jold[][3],
double phiold[],
double eps2,
double h2[])
{
g6calc_firsthalf0(clusterid, nj, ni, index, xi, vi, fold, jold, phiold, &eps2, h2, 1);
}
int g6calc_lasthalf(
int clusterid,
int nj,
int ni,
int index[],
double xi[][3],
double vi[][3],
double eps2,
double h2[],
double acc[][3],
double jerk[][3],
double pot[])
{
yebisu_g6_get_force(clusterid, ni, acc, jerk, pot, nnb_buf[clusterid]);
return 0;
}
int g6calc_lasthalf0(
int clusterid,
int nj,
int ni,
int index[],
double xi[][3],
double vi[][3],
double *eps2,
double h2[],
double acc[][3],
double jerk[][3],
double pot[],
int mode)
{
yebisu_g6_get_force(clusterid, ni, acc, jerk, pot, nnb_buf[clusterid]);
return 0;
}
int g6calc_lasthalf2(
int clusterid,
int nj,
int ni,
int index[],
double xi[][3],
double vi[][3],
double eps2,
double h2[],
double acc[][3],
double jerk[][3],
double pot[],
int nnbindex[])
{
assert(NB_FLAG);
yebisu_g6_get_force(clusterid, ni, acc, jerk, pot, nnbindex);
return 0;
}
int g6_read_neighbour_list(int clusterid){
assert(NB_FLAG);
yebisu_g6_receive_neighbor_list(clusterid);
return 0;
}
int g6_get_neighbour_list(
int clusterid,
int ipipe,
int maxlength,
int *nblen,
int nbl[])
{
assert(NB_FLAG);
yebisu_g6_get_neighbor_list(clusterid, ipipe, maxlength, nblen, nbl);
const int nnb = *nblen;
if(nnb < 0) return -1;
if(sort_mode){
std::sort(nbl, nbl+nnb);
}
return 0;
}
void g6_set_neighbour_list_sort_mode(int mode){
sort_mode = mode;
}
int g6_get_neighbour_list_sort_mode(void){
return sort_mode;
}
int g6_npipes(void){
return yebisu_g6_get_nimax();
}
int g6_getnjmax(int clusterid){
return yebisu_g6_get_njmax();
}
}

90
grape6.h Normal file
View file

@ -0,0 +1,90 @@
#ifndef G6UTIL_H
#define G6UTIL_H
// #include "typedef.h"
/* constant definitions */
#if !defined(__LANGUAGE_FORTRAN__) && !defined(_LANGUAGE_FORTRAN)
#if defined(__cplusplus)
extern "C"
{
#endif
/* C interface */
/*
* standard functions.
* the number of the cards is hidden to the user.
*/
void g6_open_all(void);
void g6_close_all(void);
int g6_set_j_particle_all(int address, int index, double tj, double dtj, double mass,
double a2by18[3], double a1by6[3], double aby2[3], double v[3], double x[3]);
int g6_set_j_particle_mxonly_all(int address, int index, double mass, double x[3]);
void g6_set_ti_all(double ti);
void g6calc_firsthalf_all(int nj, int ni, int index[], double xi[][3], double vi[][3],
double fold[][3], double jold[][3], double phiold[], double eps2, double h2[]);
void g6calc_firsthalf0_all(int nj, int ni, int index[], double xi[][3], double vi[][3],
double fold[][3], double jold[][3], double phiold[], double *eps2, double h2[], int mode);
int g6calc_lasthalf_all(int nj, int ni, int index[], double xi[][3], double vi[][3],
double eps2, double h2[], double acc[][3], double jerk[][3], double pot[]);
int g6calc_lasthalf0_all(int nj, int ni, int index[], double xi[][3], double vi[][3],
double *eps2, double h2[], double acc[][3], double jerk[][3], double pot[], int mode);
int g6calc_lasthalf2_all(int nj, int ni, int index[], double xi[][3], double vi[][3],
double eps2, double h2[], double acc[][3], double jerk[][3], double pot[], int nnbindex[]);
int g6_read_neighbour_list_all(void);
int g6_get_neighbour_list_all(int ipipe, int maxlength, int *nblen, int nbl[]);
void g6_set_nip_all(int nip);
void g6_set_njp_all(int njp);
void g6_set_i_particle_scales_from_real_value_all(int address, double acc[3], double jerk[3], double phi,
double jfactor, double ffactor);
void g6_set_i_particle_all(int address, int index, double x[3], double v[3], double eps2, double h2);
int g6_get_force_all(double acc[][3], double jerk[][3], double phi[], int flag[]);
int g6_get_force_etc_all(double acc[][3], double jerk[][3], double phi[], int nnbindex[], int flag[]);
void g6_get_predicted_j_particles_all(int nj, double (*x)[3], double (*v)[3]);
int g6_getnjmax_all(void);
/*
* primitive functions to control multiple cards individually.
* the user needs to specify card's device id explicitly.
*/
void g6_open(int clusterid);
void g6_close(int clusterid);
void g6_set_tunit(int newtunit);
void g6_set_xunit(int newxunit);
int g6_set_j_particle(int clusterid, int address, int index, double tj, double dtj, double mass,
double a2by18[3], double a1by6[3], double aby2[3], double v[3], double x[3]);
int g6_set_j_particle_mxonly(int clusterid, int address, int index, double mass, double x[3]);
void g6_set_ti(int clusterid, double ti);
void g6calc_firsthalf(int clusterid, int nj, int ni, int index[], double xi[][3], double vi[][3],
double fold[][3], double jold[][3], double phiold[], double eps2, double h2[]);
void g6calc_firsthalf0(int clusterid, int nj, int ni, int index[], double xi[][3], double vi[][3],
double fold[][3], double jold[][3], double phiold[], double *eps2, double h2[], int mode);
int g6calc_lasthalf(int clusterid, int nj, int ni, int index[], double xi[][3], double vi[][3],
double eps2, double h2[], double acc[][3], double jerk[][3], double pot[]);
int g6calc_lasthalf0(int clusterid, int nj, int ni, int index[], double xi[][3], double vi[][3],
double *eps2, double h2[], double acc[][3], double jerk[][3], double pot[], int mode);
int g6calc_lasthalf2(int clusterid, int nj, int ni, int index[], double xi[][3], double vi[][3],
double eps2, double h2[], double acc[][3], double jerk[][3], double pot[], int nnbindex[]);
int g6_read_neighbour_list(int clusterid);
int g6_get_neighbour_list(int clusterid, int ipipe, int maxlength, int *nblen, int nbl[]);
void g6_set_neighbour_list_sort_mode(int mode);
int g6_get_neighbour_list_sort_mode(void);
int g6_npipes(void);
void g6_set_nip(int clusterid, int nip);
void g6_set_njp(int clusterid, int njp);
void g6_set_i_particle_scales_from_real_value(int clusterid, int address, double acc[3], double jerk[3], double phi,
double jfactor, double ffactor);
void g6_set_i_particle(int clusterid, int address, int index, double x[3], double v[3], double eps2, double h2);
int g6_get_force(int clusterid, double acc[][3], double jerk[][3], double phi[], int flag[]);
int g6_get_force_etc(int clusterid, double acc[][3], double jerk[][3], double phi[], int nnbindex[], int flag[]);
void g6_get_predicted_j_particles(int clusterid, int nj, double x[][3], double v[][3]);
int g6_getnjmax(int clusterid);
#if defined(__cplusplus)
}
#endif
#endif /* LANGUAGE_FORTRAN */
#endif /* G6UTIL_H */

343
particle.h Normal file
View file

@ -0,0 +1,343 @@
//#define EPS_RED
//#define EPS_MUL 0.0001f
static inline __host__
float2 float2_split(const double x)
{
const float fx = float(x);
const float fy = float(x - double(fx));
return make_float2(fx, fy);
}
static inline __host__
double float2_todouble(const float2 f)
{
return double(f.x) + double(f.y);
}
static inline __device__
float2 float2_accum(const float2 acc, const float x)
{
const float ax = acc.x + x;
const float ay = acc.y - ((ax - acc.x) - x);
return make_float2(ax, ay);
}
static inline __device__
float float2_diff(const float2 xj, const float2 xi){
return (xj.x - xi.x) + (xj.y - xi.y);
}
struct Iparticle{
float2 pos[3]; // 6
float vel[3]; // 9
float eps2; // 10
float h2; // 11
int id; // 12
__host__ void read(
const double h_pos[],
const double h_vel[],
const double h_eps2,
const double h_h2,
const int h_id)
{
for(int k=0; k<3; k++){
pos[k] = float2_split(h_pos[k]);
vel[k] = float(h_vel[k]);
}
eps2 = float(h_eps2);
h2 = float(h_h2);
id = h_id;
}
};
struct Jparticle{
float2 pos [3]; // 6
float vel [3]; // 9
float acc2[3]; // 12
float jrk6[3]; // 15
float mass; // 16
float2 tj; // 18
int id; // 19
int addr; // 20
__host__ void read(
const double h_pos[],
const double h_vel[],
const double h_acc2[],
const double h_jrk6[],
const double h_mass,
const double h_tj,
const int h_id,
const int h_addr)
{
for(int k=0; k<3; k++){
pos [k] = float2_split(h_pos[k]);
vel [k] = float(h_vel [k]);
acc2[k] = float(h_acc2[k]);
jrk6[k] = float(h_jrk6[k]);
}
mass = float(h_mass);
tj = float2_split(h_tj);
id = h_id;
addr = h_addr;
assert(addr < NBODY_MAX);
}
};
struct Jppred{
float2 pos[3]; // 6
float vel[3]; // 9
float mass; // 10
int id; // 11
int pad; // 12
enum{
SIZE_F4 = 3,
};
__device__ void predict(
const Jparticle &jp,
const float2 ti)
{
const float dt = float2_diff(ti, jp.tj);
#pragma unroll
for(int k=0; k<3; k++){
pos[k].x = jp.pos[k].x;
pos[k].y = jp.pos[k].y + dt*(jp.vel[k] + dt*(jp.acc2[k] + dt*jp.jrk6[k]));
vel[k] = jp.vel[k] + (2.f*dt)*(jp.acc2[k] + (1.5f*dt)*(jp.jrk6[k]));
}
mass = jp.mass;
id = jp.id ;
}
};
struct Interaction{
float3 acc;
float3 jrk;
float pot;
__device__ Interaction(
const Iparticle &ip,
const Jppred &jp){
const float dx = float2_diff(jp.pos[0], ip.pos[0]);
const float dy = float2_diff(jp.pos[1], ip.pos[1]);
const float dz = float2_diff(jp.pos[2], ip.pos[2]);
const float dvx = jp.vel[0] - ip.vel[0];
const float dvy = jp.vel[1] - ip.vel[1];
const float dvz = jp.vel[2] - ip.vel[2];
#ifdef EPS_RED
float r2, tmp_eps2;
tmp_eps2 = ip.eps2; // default value ----> eps = 1e-5 !!!
// larger ----> 10 x eps if mass > 1e-6 (i.e. high mass part.)
if( (jp.id < 99998) || (ip.id < 99998) ) tmp_eps2 *= 100.0f;
// if i or j is a BH's ----> 1e-2 * eps
if( (ip.id == 999998) || (jp.id == 999998) || (ip.id == 999999) || (jp.id == 999999) )
{
r2 = EPS_MUL*tmp_eps2 + dx*dx + dy*dy + dz*dz;
}
else
{
r2 = tmp_eps2 + dx*dx + dy*dy + dz*dz;
}
#else
const float r2 = ip.eps2 + dx*dx + dy*dy + dz*dz;
#endif
/*
#ifdef EPS_RED
float r2;
// if i or j is a BH's
if( (ip.id == 0) || (jp.id == 0) || (ip.id == 1) || (jp.id == 1) )
{
r2 = EPS_MUL*ip.eps2 + dx*dx + dy*dy + dz*dz;
}
else
{
r2 = ip.eps2 + dx*dx + dy*dy + dz*dz;
}
#else
const float r2 = ip.eps2 + dx*dx + dy*dy + dz*dz;
#endif
*/
// const float r2 = ip.eps2 + dx*dx + dy*dy + dz*dz;
const float rv = dx*dvx + dy*dvy + dz*dvz;
const float rinv1 = (jp.id == ip.id) ? 0.0f
: rsqrtf(r2);
const float rinv2 = rinv1 * rinv1;
const float mrinv1 = jp.mass * rinv1;
const float mrinv3 = mrinv1 * rinv2;
const float alpha = -3.f * rv * rinv2;
acc.x = mrinv3 * dx;
acc.y = mrinv3 * dy;
acc.z = mrinv3 * dz;
jrk.x = mrinv3 * (dvx + alpha * dx);
jrk.y = mrinv3 * (dvy + alpha * dy);
jrk.z = mrinv3 * (dvz + alpha * dz);
pot = mrinv1; // use positive definition here
}
__device__ void set_neib(int &dst) const{
// do nothing
}
};
struct Interaction_NB{
float3 acc;
float3 jrk;
float pot;
float nb_rinv;
int jid;
bool is_neib;
__device__ Interaction_NB(
const Iparticle &ip,
const Jppred &jp)
{
const float dx = float2_diff(jp.pos[0], ip.pos[0]);
const float dy = float2_diff(jp.pos[1], ip.pos[1]);
const float dz = float2_diff(jp.pos[2], ip.pos[2]);
const float dvx = jp.vel[0] - ip.vel[0];
const float dvy = jp.vel[1] - ip.vel[1];
const float dvz = jp.vel[2] - ip.vel[2];
// if( (jp.id) > 1 && (jp.id < 200000) ) ip.eps2 *= 100;
const float r2 = ip.eps2 + dx*dx + dy*dy + dz*dz;
const float rv = dx*dvx + dy*dvy + dz*dvz;
const float rinv1 = (jp.id == ip.id) ? 0.0f
: rsqrtf(r2);
const float rinv2 = rinv1 * rinv1;
const float mrinv1 = jp.mass * rinv1;
const float mrinv3 = mrinv1 * rinv2;
const float alpha = -3.f * rv * rinv2;
acc.x = mrinv3 * dx;
acc.y = mrinv3 * dy;
acc.z = mrinv3 * dz;
jrk.x = mrinv3 * (dvx + alpha * dx);
jrk.y = mrinv3 * (dvy + alpha * dy);
jrk.z = mrinv3 * (dvz + alpha * dz);
pot = mrinv1; // use positive definition here
nb_rinv = rinv1;
jid = jp.id;
is_neib = (r2 < ip.h2) && (jp.id != ip.id);
}
__device__ void set_neib(int &dst) const{
if(is_neib) dst = jid;
}
};
struct Force{
float2 acc[3]; // 6
float jrk[3]; // 9
float2 pot; // 11
int nnb_id; // 12 ID of nearest neighbor
float nnb_rinv; // 13 rinv of nearest neighbor
int num_neib; // 14
__host__ void write(
double h_acc[],
double h_jrk[],
double &h_pot,
int &h_nnb_id,
int &h_num_neib) const
{
for(int k=0; k<3; k++){
h_acc[k] = float2_todouble(acc[k]);
h_jrk[k] = double(jrk[k]);
}
h_pot = - float2_todouble(pot);
h_nnb_id = nnb_id;
h_num_neib = num_neib;
}
__device__ void clear()
{
#pragma unroll
for(int k=0; k<3; k++){
acc[k] = make_float2(0.0f, 0.0f);
jrk[k] = 0.0f;
}
pot = make_float2(0.0f, 0.0f);
nnb_id = -1;
nnb_rinv = 0.0f;
num_neib = 0;
}
__device__ void check_overflow(){
if(num_neib > NB_MAX) num_neib = -1;
}
// for the redction kernel
__device__ void operator+=(
const Force &fo)
{
#pragma unroll
for(int k=0; k<3; k++){
acc[k] = float2_accum(acc[k], fo.acc[k].x);
acc[k] = float2_accum(acc[k], fo.acc[k].y);
jrk[k] += fo.jrk[k];
}
pot = float2_accum(pot, fo.pot.x);
pot = float2_accum(pot, fo.pot.y);
if(num_neib>=0 && fo.num_neib>=0){
num_neib += fo.num_neib;
}else{ // overflow
num_neib = -1;
}
// nearest neighbor
if(nnb_rinv < fo.nnb_rinv){
nnb_id = fo.nnb_id;
nnb_rinv = fo.nnb_rinv;
}
}
// for the gravity kernel
__device__ void operator+=(
const Interaction &fo)
{
acc[0] = float2_accum(acc[0], fo.acc.x);
acc[1] = float2_accum(acc[1], fo.acc.y);
acc[2] = float2_accum(acc[2], fo.acc.z);
pot = float2_accum(pot, fo.pot);
jrk[0] += fo.jrk.x;
jrk[1] += fo.jrk.y;
jrk[2] += fo.jrk.z;
}
__device__ void operator+=(
const Interaction_NB &fo)
{
acc[0] = float2_accum(acc[0], fo.acc.x);
acc[1] = float2_accum(acc[1], fo.acc.y);
acc[2] = float2_accum(acc[2], fo.acc.z);
jrk[0] += fo.jrk.x;
jrk[1] += fo.jrk.y;
jrk[2] += fo.jrk.z;
pot = float2_accum(pot, fo.pot);
// neighbor list counter
if(fo.is_neib) num_neib++;
// nearest neighbor
if(nnb_rinv < fo.nb_rinv){
nnb_id = fo.jid;
nnb_rinv = fo.nb_rinv;
}
}
};

762
yebisu_g6.cu Normal file
View file

@ -0,0 +1,762 @@
//#include <cutil.h>
#include <omp.h>
#include "cuda_pointer.h"
enum{
MAX_GPU = 4,
MAX_CPU = 4,
NBODY_MAX = (1048576), // 256KB
NB_MAX = 256, // per block
MAX_NB_BUF = (1048576), // 256KB
};
#include "gpu.h"
#include "particle.h"
#define _out_
__global__ void kernel_jp_scatter(
const int nj,
const Jparticle jpsrc[],
_out_ Jparticle jpdst[])
{
const int tid = threadIdx.x + blockDim.x * blockIdx.x;
if(tid < nj){
const Jparticle jp = jpsrc[tid];
jpdst[jp.addr] = jp;
}
}
__global__ void kernel_predict(
const int nj,
const float2 ti,
const Jparticle jptcl[],
_out_ Jppred jpred[])
{
#if 0
const int tid = threadIdx.x + blockDim.x * blockIdx.x;
if(tid < nj){
jpred[tid].predict(jptcl[tid], ti);
}
#else
const int tid = threadIdx.x;
const int off = blockDim.x * blockIdx.x;
const int nth = blockDim.x;
__shared__ float4 sbuf[NTHREADS*5];
Jparticle *sptcl = (Jparticle *)sbuf;
Jppred *spred = (Jppred *)sbuf;
{ // LOAD
float4 *src = (float4 *)(jptcl + off);
float4 *dst = (float4 *)(sptcl);
#pragma unroll
for(int k=0; k<5; k++, src+=nth, dst+=nth){
dst[tid] = src[tid];
}
}
// Predict
__syncthreads();
Jppred pp;
pp.predict(sptcl[tid], ti);
__syncthreads();
spred[tid] = pp;
__syncthreads();
{ // STORE
float4 *src = (float4 *)(spred);
float4 *dst = (float4 *)(jpred + off);
#pragma unroll
for(int k=0; k<3; k++, src+=nth, dst+=nth){
dst[tid] = src[tid];
}
}
#endif
}
#define INTERACTION Interaction_NB
__global__ void kernel_gravity(
const int ni,
const int nj,
const Iparticle ipbuf[],
const Jppred jpbuf[],
_out_ Force fodev[][NJBLOCKS],
_out_ int nbbuf[][NJBLOCKS][NB_MAX],
const bool with_neib)
{
int ibid = blockIdx.x;
int jbid = blockIdx.y;
int tid = threadIdx.x;
int iaddr = tid + blockDim.x * ibid;
int jstart = (nj * (jbid )) / NJBLOCKS;
int jend = (nj * (jbid+1)) / NJBLOCKS;
int *nbdst = nbbuf[iaddr][jbid];
__shared__ Jppred jpshare[NJPSHRE]; // 32
const Iparticle ip = ipbuf[iaddr];
Force fo;
fo.clear();
if(with_neib){
for(int j=jstart; j<jend; j+=NJPSHRE){
const int jsize = NJPSHRE * Jppred::SIZE_F4; // 96
__syncthreads();
if(tid < jsize){ // 96 of 128
float4 *src = (float4 *)(jpbuf + j);
float4 *dst = (float4 *)(jpshare );
dst[tid] = src[tid];
}
if(tid+32 < jsize){ // for the case of 64 threads
float4 *src = (float4 *)(jpbuf + j);
float4 *dst = (float4 *)(jpshare );
dst[tid+32] = src[tid+32];
}
__syncthreads();
if(jend-j < NJPSHRE){
#pragma unroll 4
for(int jj=0; jj<jend-j; jj++){
const Jppred jp = jpshare[jj];
const Interaction_NB inter(ip, jp);
inter.set_neib(nbdst[fo.num_neib % NB_MAX]);
fo += inter;
}
}else{
#pragma unroll 32
for(int jj=0; jj<NJPSHRE; jj++){
const Jppred jp = jpshare[jj];
const Interaction_NB inter(ip, jp);
inter.set_neib(nbdst[fo.num_neib % NB_MAX]);
fo += inter;
}
}
}
}else{ // no neib
for(int j=jstart; j<jend; j+=NJPSHRE){
const int jsize = NJPSHRE * Jppred::SIZE_F4; // 96
__syncthreads();
if(tid < jsize){ // 96 of 128
float4 *src = (float4 *)(jpbuf + j);
float4 *dst = (float4 *)(jpshare );
dst[tid] = src[tid];
}
if(tid+32 < jsize){ // for the case of 64 threads
float4 *src = (float4 *)(jpbuf + j);
float4 *dst = (float4 *)(jpshare );
dst[tid+32] = src[tid+32];
}
__syncthreads();
if(jend-j < NJPSHRE){
#pragma unroll 4
for(int jj=0; jj<jend-j; jj++){
const Jppred jp = jpshare[jj];
const Interaction inter(ip, jp);
inter.set_neib(nbdst[fo.num_neib % NB_MAX]);
fo += inter;
}
}else{
#pragma unroll 32
for(int jj=0; jj<NJPSHRE; jj++){
const Jppred jp = jpshare[jj];
const Interaction inter(ip, jp);
inter.set_neib(nbdst[fo.num_neib % NB_MAX]);
fo += inter;
}
}
}
}
if(iaddr < ni){
fodev[iaddr][jbid] = fo;
}
}
__global__ void kernel_reduce(
const int ni,
const Force fodev[][NJBLOCKS],
_out_ Force fosum[]){
const int xid = threadIdx.x;
const int yid = threadIdx.y;
const int bid = blockIdx.x;
const int iaddr = yid + blockDim.y * bid;
__shared__ Force fshare[NYREDUCE][NXREDUCE];
if(xid < NJBLOCKS){
fshare[yid][xid] = fodev[iaddr][xid];
}else{
fshare[yid][xid].clear();
}
Force *fs = fshare[yid];
if(32 == NXREDUCE){
if(xid < 16) fs[xid] += fs[xid + 16];
}
if(xid < 8) fs[xid] += fs[xid + 8];
if(xid < 4) fs[xid] += fs[xid + 4];
if(xid < 2) fs[xid] += fs[xid + 2];
if(xid < 1) fs[xid] += fs[xid + 1];
if(iaddr < ni && 0 == xid){
fosum[iaddr] = fs[0];
}
}
__global__ void kernel_gather_nb(
const int ni,
const Force fodev[][NJBLOCKS],
const int2 nbcnt[],
const int nbbuf[][NJBLOCKS][NB_MAX],
_out_ int nblst[])
{
const int xid = threadIdx.x;
const int yid = threadIdx.y;
const int bid = blockIdx.x;
const int iaddr = yid + blockDim.y * bid;
if(iaddr >= ni) return;
if(nbcnt[iaddr].x < 0) return; // overflow
const int mynnb = (xid < NJBLOCKS) ? fodev[iaddr][xid].num_neib
: 0;
// now performe prefix sum
__shared__ int ishare[NYREDUCE][NXREDUCE];
ishare[yid][xid] = mynnb;
int *ish = ishare[yid];
if(xid>=1) ish[xid] += ish[xid-1];
if(xid>=2) ish[xid] += ish[xid-2];
if(xid>=4) ish[xid] += ish[xid-4];
if(xid>=8) ish[xid] += ish[xid-8];
if(32 == NXREDUCE){
if(xid>=16) ish[xid] += ish[xid-16];
}
const int off = (xid == 0) ? 0
: ish[xid-1];
int *nbdst = nblst + nbcnt[iaddr].y + off;
if(xid < NJBLOCKS){
for(int k=0; k<mynnb; k++){
const int nbid = nbbuf[iaddr][xid][k];
nbdst[k] = nbid;
}
}
}
class Resource{
private:
bool is_open;
bool predicted;
bool grav_called;
bool jp_flushed;
int gpid;
int njp_in_que;
int ni_save;
float2 ti;
cudaPointer<Iparticle> ipbuf;
cudaPointer<Jparticle> jpbuf;
cudaPointer<Jparticle> jpque;
cudaPointer<Jppred> jpred;
cudaPointer <Force[NJBLOCKS]> fodev;
cudaPointer <Force> fosum;
cudaPointer <int[NJBLOCKS][NB_MAX]> nbbuf;
cudaPointer <int> nblst;
cudaPointer <int2> nbcnt; // {num, off}
void allocate(){
ipbuf.allocate(NIMAX);
jpbuf.allocate(NBODY_MAX);
jpque.allocate(NBODY_MAX);
jpred.allocate(NBODY_MAX);
fodev.allocate(NIMAX);
fosum.allocate(NIMAX);
nbbuf.allocate(NIMAX);
nblst.allocate(MAX_NB_BUF);
nbcnt.allocate(NIMAX);
}
void free(){
ipbuf.free();
jpbuf.free();
jpque.free();
jpred.free();
fodev.free();
fosum.free();
nbbuf.free();
nblst.free();
nbcnt.free();
}
public:
void set_gpid(){
int dev;
cudaGetDevice(&dev);
gpid = dev;
}
void set_gpid(const int id){
gpid = id;
int dev;
cudaGetDevice(&dev);
assert(dev == gpid);
}
void open(){
assert(!is_open);
allocate();
is_open = true;
njp_in_que = 0;
predicted = false;
grav_called = false;
jp_flushed = false;
}
void close(){
assert(is_open);
free();
is_open = false;
}
void set_ip(
const int ni,
const double pos[][3],
const double vel[][3],
const double eps2[],
const double h2 [],
const int id [])
{
assert(is_open);
assert(ni <= NIMAX);
for(int i=0; i<ni; i++){
ipbuf[i].read(pos[i], vel[i], eps2[i], h2[i], id[i]);
}
ipbuf.htod(ni);
ni_save = ni;
}
void push_jp(
const double pos [3],
const double vel [3],
const double acc2[3],
const double jrk6[6],
const double mass,
const double tj,
const int id,
const int addr)
{
assert(is_open);
assert(addr < NBODY_MAX);
jpque[njp_in_que].read(pos, vel, acc2, jrk6, mass, tj, id, addr);
njp_in_que++;
jp_flushed = false;
}
void transter_jp(){
assert(is_open);
const int njq = njp_in_que;
jpque.htod(njq);
njp_in_que = 0;
const int Blocks = 1 + (njq-1)/NTHSCAT;
kernel_jp_scatter <<< Blocks, NTHSCAT >>>
(njq, jpque, jpbuf);
jp_flushed = true;
predicted = false;
}
void set_ti(const double dbl_ti){
assert(is_open);
ti = float2_split(dbl_ti);
predicted = false;
}
void predict_all(const int nj){
assert(is_open);
const int Blocks = 1 + (nj-1)/NTHREADS;
kernel_predict <<< Blocks, NTHREADS >>>
(nj, ti, jpbuf, jpred);
predicted = true;
}
void launch_gravity(
const int ni,
const int nj,
const bool with_neib)
{
assert(is_open);
assert(ni == ni_save);
assert(ni <= NIMAX);
assert(nj < NBODY_MAX);
if(!jp_flushed) transter_jp();
if(!predicted ) predict_all(nj);
if(ni <= 64){
dim3 grid ( 1, NJBLOCKS, 1);
dim3 threads(64, 1, 1);
kernel_gravity <<< grid, threads >>>
(ni, nj, ipbuf, jpred, fodev, nbbuf, with_neib);
}else{
const int niblocks = 1 + (ni-1) / NTHREADS;
dim3 grid (niblocks, NJBLOCKS, 1);
dim3 threads(NTHREADS, 1, 1);
kernel_gravity <<< grid, threads >>>
(ni, nj, ipbuf, jpred, fodev, nbbuf, with_neib);
}
grav_called = true;
}
void get_force(
const int ni,
_out_ double acc [][3],
_out_ double jrk [][3],
_out_ double pot [],
_out_ int nnb_id[])
{
assert(is_open);
assert(grav_called);
assert(ni == ni_save);
assert(ni <= NIMAX);
const int ni8 = 1 + (ni-1) / NYREDUCE;
dim3 grid (ni8, 1, 1);
dim3 threads(NXREDUCE, NYREDUCE, 1);
kernel_reduce <<< grid, threads >>>
(ni, fodev, fosum);
fosum.dtoh(ni);
grav_called = false;
for(int i=0; i<ni; i++){
fosum[i].write(acc[i], jrk[i], pot[i], nnb_id[i], nbcnt[i].x);
}
}
void receive_neighbor_list(){
assert(is_open);
const int ni = ni_save;
int nbsum = 0;
for(int i=0; i<ni; i++){
nbcnt[i].y = nbsum;
if(nbcnt[i].x >= 0) nbsum += nbcnt[i].x;
}
assert(nbsum <= MAX_NB_BUF);
nbcnt.htod(ni);
const int ni8 = 1 + (ni-1) / NYREDUCE;
dim3 grid (ni8, 1, 1);
dim3 threads(NXREDUCE, NYREDUCE, 1);
kernel_gather_nb <<< grid, threads >>>
(ni, fodev, nbcnt, nbbuf, nblst);
nblst.dtoh(nbsum);
}
void get_neighbor_list(
const int ipipe,
const int maxlen,
_out_ int *num_neib,
_out_ int list[])
{
assert(is_open);
assert(ipipe < ni_save);
const int nnb = nbcnt[ipipe].x;
const int off = nbcnt[ipipe].y;
const int *src = &nblst[off];
if(nnb > 0 && maxlen >= nnb){
for(int k=0; k<nnb; k++){
list[k] = src[k];
}
*num_neib = nnb;
}else{
*num_neib = -1;
}
}
void DEBUG_read_pred(
const int nj,
const int addr,
_out_ double pos [3],
_out_ double vel [3],
_out_ double mass[1],
_out_ int id [1])
{
jpred.dtoh(nj);
const Jppred &p = jpred[addr];
for(int k=0; k<3; k++){
pos[k] = p.pos[k].x + p.pos[k].y;
vel[k] = p.vel[k];
}
mass[0] = p.mass;
id [0] = p.id;
}
};
static Resource resource[MAX_GPU];
static int numGPU, numCPU;
static bool initialized = false;
static void lib_initialize(const int gpid){
if(initialized) return;
initialized = true;
assert(NXREDUCE >= NJBLOCKS);
assert(NXREDUCE <= 32);
assert(sizeof(Jppred) % sizeof(float4) == 0);
assert(sizeof(Jppred) / sizeof(float4) == Jppred::SIZE_F4);
assert(NJPSHRE * Jppred::SIZE_F4 <= NTHREADS);
// int devid[MAX_GPU];
// cudaGetDeviceCount(&numGPU);
// assert(numGPU <= MAX_GPU);
/*
char *gpu_list = getenv("GPU_LIST");
if(gpu_list)
{
// get GPU list from environment variable
numGPU = 0;
char *p = strtok(gpu_list, " ");
while(p)
{
devid[numGPU++] = atoi(p);
p = strtok(NULL, " ");
assert(numGPU <= MAX_GPU);
}
}
else
{
// use all GPUs
// for(int i=0; i<numGPU; i++) devid[i] = i;
// using the default GPU
numGPU = 1;
numCPU = 1;
devid[0] = -1;
}
*/
// using the default GPU
// numGPU = 1;
// numCPU = 1;
// devid[0] = -1;
// using the gpid GPU
numGPU = 1;
numCPU = 1;
// devid[0] = gpid;
/*
#pragma omp parallel
{
int tid = omp_get_thread_num();
if(tid == 0) numCPU = omp_get_num_threads();
numCPU = 1;
}
*/
// assert(numCPU <= MAX_CPU);
// assert(numGPU <= numCPU);
//#pragma omp parallel
// {
/*
int tid = omp_get_thread_num();
tid = 0;
if(tid < numGPU)
{
fprintf(stderr, "tid %d, numGPU %d, devid[tid] %d \n", tid, numGPU, devid[tid]);
if(devid[tid] >= 0)
{
cudaSetDevice(devid[tid]);
resource[tid].set_gpid(devid[tid]);
}
else
{
fprintf(stderr, "Skipping cudaSetDevice(), using the default GPU \n");
resource[tid].set_gpid();
}
}
*/
// }
cudaGetDeviceCount(&numGPU);
assert(numGPU <= MAX_GPU);
cudaSetDevice(gpid);
resource[0].set_gpid(gpid);
fprintf(stderr, "***********************\n");
fprintf(stderr, "Initializing Yebisu/G6 library\n");
fprintf(stderr, "#CPU %d, #GPU %d, device: %d\n", numCPU, numGPU, gpid);
// fprintf(stderr, "device: %d\n" gpid);
// fprintf(stderr, "device: ");
// for(int i=0; i<numGPU; i++) fprintf(stderr, " %d", devid[i]);
// fprintf(stderr, "\n");
fprintf(stderr, "***********************\n");
#if 1
cudaFuncSetCacheConfig(kernel_jp_scatter, cudaFuncCachePreferL1);
cudaFuncSetCacheConfig(kernel_predict, cudaFuncCachePreferShared);
cudaFuncSetCacheConfig(kernel_gravity, cudaFuncCachePreferL1);
cudaFuncSetCacheConfig(kernel_reduce, cudaFuncCachePreferShared);
cudaFuncSetCacheConfig(kernel_gather_nb, cudaFuncCachePreferL1);
#endif
}
#include "yebisu_g6.h"
extern "C"{
void yebisu_g6_open (const int gpid)
{
// lib_initialize();
lib_initialize(gpid);
resource[gpid].open();
}
void yebisu_g6_close(const int gpid)
{
// lib_initialize();
lib_initialize(gpid);
resource[gpid].close();
}
void yebisu_g6_set_ip(
const int gpid,
const int ni,
const double pos[][3],
const double vel[][3],
const double eps2[],
const double h2 [],
const int id [])
{
// lib_initialize();
lib_initialize(gpid);
resource[gpid].set_ip(ni, pos, vel, eps2, h2, id);
}
void yebisu_g6_push_jp(
const int gpid,
const double pos [3],
const double vel [3],
const double acc2[3],
const double jrk6[6],
const double mass,
const double tj,
const int id,
const int addr)
{
// lib_initialize();
lib_initialize(gpid);
resource[gpid].push_jp( pos, vel, acc2, jrk6, mass, tj, id, addr);
}
void yebisu_g6_transfer_jp(const int gpid)
{
// lib_initialize();
lib_initialize(gpid);
resource[gpid].transter_jp();
}
void yebisu_g6_set_ti(
const int gpid,
const double ti)
{
// lib_initialize();
lib_initialize(gpid);
resource[gpid].set_ti(ti);
}
void yebisu_g6_predict_all(
const int gpid,
const int nj)
{
// lib_initialize();
lib_initialize(gpid);
resource[gpid].predict_all(nj);
}
void yebisu_g6_launch_gravity(
const int gpid,
const int ni,
const int nj,
const int with_neib)
{
// lib_initialize();
lib_initialize(gpid);
resource[gpid].launch_gravity(ni, nj, bool(with_neib));
}
void yebisu_g6_get_force(
const int gpid,
const int ni,
_out_ double acc [][3],
_out_ double jrk [][3],
_out_ double pot [],
_out_ int nnb_id[])
{
// lib_initialize();
lib_initialize(gpid);
resource[gpid].get_force(ni, acc, jrk, pot, nnb_id);
}
void yebisu_g6_receive_neighbor_list(const int gpid)
{
// lib_initialize();
lib_initialize(gpid);
resource[gpid].receive_neighbor_list();
}
void yebisu_g6_get_neighbor_list(
const int gpid,
const int ipipe,
const int maxlen,
_out_ int *num_neib,
_out_ int list[])
{
// lib_initialize();
lib_initialize(gpid);
resource[gpid].get_neighbor_list(ipipe, maxlen, num_neib, list);
}
void yebisu_g6_DEBUG_read_pred(
const int gpid,
const int nj,
const int addr,
_out_ double pos [3],
_out_ double vel [3],
_out_ double mass[1],
_out_ int id [1])
{
resource[gpid].DEBUG_read_pred(nj, addr, pos, vel, mass, id);
}
int yebisu_g6_get_nimax()
{
return NIMAX;
}
int yebisu_g6_get_njmax()
{
return NBODY_MAX;
}
}

61
yebisu_g6.h Normal file
View file

@ -0,0 +1,61 @@
extern "C"
{
void yebisu_g6_open (const int gpid);
void yebisu_g6_close(const int gpid);
void yebisu_g6_set_ip(
const int gpid,
const int ni,
const double pos[][3],
const double vel[][3],
const double eps2[],
const double h2 [],
const int id []);
void yebisu_g6_push_jp(
const int gpid,
const double pos [3],
const double vel [3],
const double acc2[3],
const double jrk6[6],
const double mass,
const double tj,
const int id,
const int addr);
void yebisu_g6_transfer_jp(const int gpid);
void yebisu_g6_set_ti(
const int gpid,
const double ti);
void yebisu_g6_predict_all(
const int gpid,
const int nj);
void yebisu_g6_launch_gravity(
const int gpid,
const int ni,
const int nj,
const int with_neib);
void yebisu_g6_get_force(
const int gpid,
const int ni,
double acc [][3],
double jrk [][3],
double pot [],
int nnb_id[]);
void yebisu_g6_receive_neighbor_list(const int gpid);
void yebisu_g6_get_neighbor_list(
const int gpid,
const int ipipe,
const int maxlen,
int *num_neib,
int list[]);
void yebisu_g6_DEBUG_read_pred(
const int gpid,
const int nj,
const int addr,
double pos [3],
double vel [3],
double mass[1],
int id [1]);
int yebisu_g6_get_nimax();
int yebisu_g6_get_njmax();
}