first commit
This commit is contained in:
commit
604f1cb18a
9 changed files with 1683 additions and 0 deletions
30
Makefile
Normal file
30
Makefile
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
ETICS_LMAX ?= 2
|
||||
ETICS_NMAX ?= 10
|
||||
GPUARCH ?= sm_75
|
||||
CUDAHOME ?= /usr/local/cuda
|
||||
OPTIMIZATION ?= 3
|
||||
|
||||
CXX ?= g++
|
||||
NVCC ?= $(CUDAHOME)/bin/nvcc
|
||||
CXXFLAGS += -O$(OPTIMIZATION)
|
||||
|
||||
all : libyebisug6.a libyebisug6.nb.a
|
||||
|
||||
yebisu_g6.o : yebisu_g6.cu yebisu_g6.h particle.h
|
||||
$(NVCC) -arch=$(GPUARCH) $(CUDAFLAGS) -Xcompiler "$(CXXFLAGS)" -I$(CUDAHOME)/include -I$(CUDAHOME)/samples/common/inc -c yebisu_g6.cu
|
||||
|
||||
grape6.o : grape6.cpp
|
||||
$(CXX) $(CXXFLAGS) -c -DNB_FLAG=0 -o $@ $<
|
||||
|
||||
grape6.nb.o : grape6.cpp
|
||||
$(CXX) $(CXXFLAGS) -c -DNB_FLAG=1 -o $@ $<
|
||||
|
||||
libyebisug6.a : yebisu_g6.o grape6.o
|
||||
ar -r $@ $^
|
||||
ranlib $@
|
||||
|
||||
libyebisug6.nb.a : yebisu_g6.o grape6.nb.o
|
||||
ar -r $@ $^
|
||||
ranlib $@
|
||||
clean:
|
||||
rm -f *.o *.a
|
||||
54
cuda_pointer.h
Normal file
54
cuda_pointer.h
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
#include <assert.h>
|
||||
// #include <cutil.h>
|
||||
#include <helper_cuda.h>
|
||||
|
||||
|
||||
template <typename T>
|
||||
struct cudaPointer{
|
||||
T *dev_pointer;
|
||||
T *host_pointer;
|
||||
int size;
|
||||
cudaPointer(){
|
||||
dev_pointer = NULL;
|
||||
host_pointer = NULL;
|
||||
size = 0;
|
||||
}
|
||||
~cudaPointer(){
|
||||
// free();
|
||||
}
|
||||
void allocate(int _size){
|
||||
size = _size;
|
||||
void *p;
|
||||
checkCudaErrors(cudaMalloc(&p, size * sizeof(T)));
|
||||
assert(p);
|
||||
dev_pointer = (T*)p;
|
||||
checkCudaErrors(cudaMallocHost(&p, size * sizeof(T)));
|
||||
assert(p);
|
||||
host_pointer = (T*)p;
|
||||
}
|
||||
void free(){
|
||||
checkCudaErrors(cudaFree(dev_pointer));
|
||||
checkCudaErrors(cudaFreeHost(host_pointer));
|
||||
dev_pointer = NULL;
|
||||
host_pointer = NULL;
|
||||
size = 0;
|
||||
}
|
||||
void htod(int count){
|
||||
checkCudaErrors(cudaMemcpy(dev_pointer, host_pointer, count * sizeof(T), cudaMemcpyHostToDevice));
|
||||
}
|
||||
void htod(){
|
||||
this->htod(size);
|
||||
}
|
||||
void dtoh(int count){
|
||||
checkCudaErrors(cudaMemcpy(host_pointer, dev_pointer, count * sizeof(T), cudaMemcpyDeviceToHost));
|
||||
}
|
||||
void dtoh(){
|
||||
this->dtoh(size);
|
||||
}
|
||||
T &operator [] (int i){
|
||||
return host_pointer[i];
|
||||
}
|
||||
operator T* (){
|
||||
return dev_pointer;
|
||||
}
|
||||
};
|
||||
90
g6util.h
Normal file
90
g6util.h
Normal file
|
|
@ -0,0 +1,90 @@
|
|||
#ifndef G6UTIL_H
|
||||
#define G6UTIL_H
|
||||
|
||||
// #include "typedef.h"
|
||||
|
||||
/* constant definitions */
|
||||
#if !defined(__LANGUAGE_FORTRAN__) && !defined(_LANGUAGE_FORTRAN)
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C"
|
||||
{
|
||||
#endif
|
||||
/* C interface */
|
||||
|
||||
/*
|
||||
* standard functions.
|
||||
* the number of the cards is hidden to the user.
|
||||
*/
|
||||
void g6_open_all(void);
|
||||
void g6_close_all(void);
|
||||
int g6_set_j_particle_all(int address, int index, double tj, double dtj, double mass,
|
||||
double a2by18[3], double a1by6[3], double aby2[3], double v[3], double x[3]);
|
||||
int g6_set_j_particle_mxonly_all(int address, int index, double mass, double x[3]);
|
||||
void g6_set_ti_all(double ti);
|
||||
void g6calc_firsthalf_all(int nj, int ni, int index[], double xi[][3], double vi[][3],
|
||||
double fold[][3], double jold[][3], double phiold[], double eps2, double h2[]);
|
||||
void g6calc_firsthalf0_all(int nj, int ni, int index[], double xi[][3], double vi[][3],
|
||||
double fold[][3], double jold[][3], double phiold[], double *eps2, double h2[], int mode);
|
||||
int g6calc_lasthalf_all(int nj, int ni, int index[], double xi[][3], double vi[][3],
|
||||
double eps2, double h2[], double acc[][3], double jerk[][3], double pot[]);
|
||||
int g6calc_lasthalf0_all(int nj, int ni, int index[], double xi[][3], double vi[][3],
|
||||
double *eps2, double h2[], double acc[][3], double jerk[][3], double pot[], int mode);
|
||||
int g6calc_lasthalf2_all(int nj, int ni, int index[], double xi[][3], double vi[][3],
|
||||
double eps2, double h2[], double acc[][3], double jerk[][3], double pot[], int nnbindex[]);
|
||||
int g6_read_neighbour_list_all(void);
|
||||
int g6_get_neighbour_list_all(int ipipe, int maxlength, int *nblen, int nbl[]);
|
||||
void g6_set_nip_all(int nip);
|
||||
void g6_set_njp_all(int njp);
|
||||
void g6_set_i_particle_scales_from_real_value_all(int address, double acc[3], double jerk[3], double phi,
|
||||
double jfactor, double ffactor);
|
||||
void g6_set_i_particle_all(int address, int index, double x[3], double v[3], double eps2, double h2);
|
||||
int g6_get_force_all(double acc[][3], double jerk[][3], double phi[], int flag[]);
|
||||
int g6_get_force_etc_all(double acc[][3], double jerk[][3], double phi[], int nnbindex[], int flag[]);
|
||||
void g6_get_predicted_j_particles_all(int nj, double (*x)[3], double (*v)[3]);
|
||||
int g6_getnjmax_all(void);
|
||||
|
||||
/*
|
||||
* primitive functions to control multiple cards individually.
|
||||
* the user needs to specify card's device id explicitly.
|
||||
*/
|
||||
void g6_open(int clusterid);
|
||||
void g6_close(int clusterid);
|
||||
void g6_set_tunit(int newtunit);
|
||||
void g6_set_xunit(int newxunit);
|
||||
int g6_set_j_particle(int clusterid, int address, int index, double tj, double dtj, double mass,
|
||||
double a2by18[3], double a1by6[3], double aby2[3], double v[3], double x[3]);
|
||||
int g6_set_j_particle_mxonly(int clusterid, int address, int index, double mass, double x[3]);
|
||||
void g6_set_ti(int clusterid, double ti);
|
||||
void g6calc_firsthalf(int clusterid, int nj, int ni, int index[], double xi[][3], double vi[][3],
|
||||
double fold[][3], double jold[][3], double phiold[], double eps2, double h2[]);
|
||||
void g6calc_firsthalf0(int clusterid, int nj, int ni, int index[], double xi[][3], double vi[][3],
|
||||
double fold[][3], double jold[][3], double phiold[], double *eps2, double h2[], int mode);
|
||||
int g6calc_lasthalf(int clusterid, int nj, int ni, int index[], double xi[][3], double vi[][3],
|
||||
double eps2, double h2[], double acc[][3], double jerk[][3], double pot[]);
|
||||
int g6calc_lasthalf0(int clusterid, int nj, int ni, int index[], double xi[][3], double vi[][3],
|
||||
double *eps2, double h2[], double acc[][3], double jerk[][3], double pot[], int mode);
|
||||
int g6calc_lasthalf2(int clusterid, int nj, int ni, int index[], double xi[][3], double vi[][3],
|
||||
double eps2, double h2[], double acc[][3], double jerk[][3], double pot[], int nnbindex[]);
|
||||
int g6_read_neighbour_list(int clusterid);
|
||||
int g6_get_neighbour_list(int clusterid, int ipipe, int maxlength, int *nblen, int nbl[]);
|
||||
void g6_set_neighbour_list_sort_mode(int mode);
|
||||
int g6_get_neighbour_list_sort_mode(void);
|
||||
int g6_npipes(void);
|
||||
void g6_set_nip(int clusterid, int nip);
|
||||
void g6_set_njp(int clusterid, int njp);
|
||||
void g6_set_i_particle_scales_from_real_value(int clusterid, int address, double acc[3], double jerk[3], double phi,
|
||||
double jfactor, double ffactor);
|
||||
void g6_set_i_particle(int clusterid, int address, int index, double x[3], double v[3], double eps2, double h2);
|
||||
int g6_get_force(int clusterid, double acc[][3], double jerk[][3], double phi[], int flag[]);
|
||||
int g6_get_force_etc(int clusterid, double acc[][3], double jerk[][3], double phi[], int nnbindex[], int flag[]);
|
||||
void g6_get_predicted_j_particles(int clusterid, int nj, double x[][3], double v[][3]);
|
||||
int g6_getnjmax(int clusterid);
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* LANGUAGE_FORTRAN */
|
||||
|
||||
#endif /* G6UTIL_H */
|
||||
80
gpu.h
Normal file
80
gpu.h
Normal file
|
|
@ -0,0 +1,80 @@
|
|||
//#define GPUTYPE 8800 // MP=16
|
||||
//#define NJBL_value 32
|
||||
|
||||
//#define GPUTYPE C1060 // MP=30
|
||||
//#define NJBL_value 30
|
||||
|
||||
//#define GPUTYPE M2070 // MP=14
|
||||
//#define NJBL_value 28
|
||||
|
||||
//#define GPUTYPE GF460 // MP=7
|
||||
//#define NJBL_value 14
|
||||
|
||||
//#define GPUTYPE GF470 // MP=14
|
||||
//#define NJBL_value 28
|
||||
|
||||
//#define GPUTYPE GF480 // MP=15
|
||||
//#define NJBL_value 30
|
||||
|
||||
//#define GPUTYPE GF570 // MP=15
|
||||
//#define NJBL_value 30
|
||||
|
||||
#define GPUTYPE K20m // MP=13
|
||||
#define NJBL_value 26
|
||||
|
||||
//#define GPUTYPE GF660 // MP=5
|
||||
//#define NJBL_value 10
|
||||
|
||||
//#define GPUTYPE GF780 // MP=15
|
||||
//#define NJBL_value 30
|
||||
|
||||
//#define GPUTYPE GF1080 // MP=20
|
||||
//#define NJBL_value 20
|
||||
|
||||
//#define GPUTYPE GFTITANX // MP=24
|
||||
//#define NJBL_value 24
|
||||
|
||||
|
||||
// NJBLOCKS = 1 or 2 x MP
|
||||
// NXREDUCE is a power of 2 and >= NJBLOCKS ~32 is good for all
|
||||
// NYREDUCE is a power of 2 < NXREDUCE ~4 is good for all
|
||||
|
||||
enum{
|
||||
SM_VER = 35,
|
||||
NTHREADS = 128,
|
||||
NTHSCAT = 64,
|
||||
NIBLOCKS = 16,
|
||||
NJPSHRE = 32,
|
||||
NIMAX = (NTHREADS * NIBLOCKS),
|
||||
NJBLOCKS = NJBL_value,
|
||||
NXREDUCE = 32,
|
||||
NYREDUCE = 4,
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
----------------------------------------------------------------------------
|
||||
type MP x Cores x GPU clock rate SP Gflop/s DP Gflop/s
|
||||
per MP (GHz) (nbody n=200k) (nbody n=200k)
|
||||
|
||||
8800 GT 14 x 8 x 1.62 = 181.44 -
|
||||
8800 512 16 x 8 x 1.80 = 230.40 298.266 -
|
||||
9800 GTX 16 x 8 x 1.85 = 236.80 331.754 -
|
||||
250 GTS 16 x 8 x 1.73 = 221.44
|
||||
|
||||
C1060 30 x 8 x 1.30 = 312.00 450.861 55.412
|
||||
|
||||
M2070 14 x 32 x 1.15 = 515.20 546.416 250.454
|
||||
|
||||
460 v2 7 x 48 x 1.56 = 524.16 421.335 67.939
|
||||
470 14 x 32 x 1.22 = 546.56 578.572 105.796
|
||||
480 15 x 32 x 1.40 = 672.00 705.514 145.502
|
||||
570 15 x 32 x 1.46 = 700.80 736.882 151.960
|
||||
|
||||
670 7 x 192 x 0.71 = 954.24 1151.030 100.092
|
||||
680 8 x 192 x 0.71 = 1090.56 1187.710 101.211
|
||||
K20m 13 x 192 x 0.71 = 1772.16 1322.374 576.523
|
||||
TITAN 14 x 192 x 0.88 = 2365.44 2026.287 751.671
|
||||
780 Ti 15 x 192 x 0.93 = 2678.40 2691.603 198.987
|
||||
----------------------------------------------------------------------------
|
||||
*/
|
||||
173
grape6.cpp
Normal file
173
grape6.cpp
Normal file
|
|
@ -0,0 +1,173 @@
|
|||
#include <algorithm>
|
||||
#include <cassert>
|
||||
|
||||
#include "yebisu_g6.h"
|
||||
#include "g6util.h"
|
||||
|
||||
#define NIMAX 2048
|
||||
#define MAXDEV 4
|
||||
|
||||
#ifndef NB_FLAG
|
||||
#define NB_FLAG 0
|
||||
#endif
|
||||
|
||||
extern "C"
|
||||
{
|
||||
static int sort_mode = 0;
|
||||
static double eps2_buf[MAXDEV][NIMAX];
|
||||
static int nnb_buf[MAXDEV][NIMAX];
|
||||
|
||||
void g6_open(int clusterid){
|
||||
assert(clusterid < MAXDEV);
|
||||
assert(NIMAX >= g6_npipes());
|
||||
yebisu_g6_open(clusterid);
|
||||
}
|
||||
void g6_close(int clusterid){
|
||||
yebisu_g6_close(clusterid);
|
||||
}
|
||||
void g6_set_tunit(int newtunit){}
|
||||
void g6_set_xunit(int newxunit){}
|
||||
int g6_set_j_particle(
|
||||
int clusterid,
|
||||
int address,
|
||||
int index,
|
||||
double tj,
|
||||
double dtj,
|
||||
double mass,
|
||||
double a2by18[3],
|
||||
double a1by6[3],
|
||||
double aby2[3],
|
||||
double v[3],
|
||||
double x[3])
|
||||
{
|
||||
yebisu_g6_push_jp(clusterid, x, v, aby2, a1by6, mass, tj, index, address);
|
||||
return 0;
|
||||
}
|
||||
void g6_set_ti(int clusterid, double ti){
|
||||
yebisu_g6_set_ti(clusterid, ti);
|
||||
}
|
||||
void g6calc_firsthalf0(
|
||||
int clusterid,
|
||||
int nj,
|
||||
int ni,
|
||||
int index[],
|
||||
double xi[][3],
|
||||
double vi[][3],
|
||||
double fold[][3],
|
||||
double jold[][3],
|
||||
double phiold[],
|
||||
double *eps2,
|
||||
double h2[],
|
||||
int mode)
|
||||
{
|
||||
double *eps2_ptr = eps2;
|
||||
if(mode){ // constand eps2
|
||||
eps2_ptr = eps2_buf[clusterid];
|
||||
for(int i=0; i<ni; i++){
|
||||
eps2_ptr[i] = *eps2;
|
||||
}
|
||||
}
|
||||
|
||||
yebisu_g6_set_ip(clusterid, ni, xi, vi, eps2_ptr, h2, index);
|
||||
yebisu_g6_launch_gravity(clusterid, ni, nj, NB_FLAG);
|
||||
}
|
||||
void g6calc_firsthalf(
|
||||
int clusterid,
|
||||
int nj,
|
||||
int ni,
|
||||
int index[],
|
||||
double xi[][3],
|
||||
double vi[][3],
|
||||
double fold[][3],
|
||||
double jold[][3],
|
||||
double phiold[],
|
||||
double eps2,
|
||||
double h2[])
|
||||
{
|
||||
g6calc_firsthalf0(clusterid, nj, ni, index, xi, vi, fold, jold, phiold, &eps2, h2, 1);
|
||||
}
|
||||
int g6calc_lasthalf(
|
||||
int clusterid,
|
||||
int nj,
|
||||
int ni,
|
||||
int index[],
|
||||
double xi[][3],
|
||||
double vi[][3],
|
||||
double eps2,
|
||||
double h2[],
|
||||
double acc[][3],
|
||||
double jerk[][3],
|
||||
double pot[])
|
||||
{
|
||||
yebisu_g6_get_force(clusterid, ni, acc, jerk, pot, nnb_buf[clusterid]);
|
||||
return 0;
|
||||
}
|
||||
int g6calc_lasthalf0(
|
||||
int clusterid,
|
||||
int nj,
|
||||
int ni,
|
||||
int index[],
|
||||
double xi[][3],
|
||||
double vi[][3],
|
||||
double *eps2,
|
||||
double h2[],
|
||||
double acc[][3],
|
||||
double jerk[][3],
|
||||
double pot[],
|
||||
int mode)
|
||||
{
|
||||
yebisu_g6_get_force(clusterid, ni, acc, jerk, pot, nnb_buf[clusterid]);
|
||||
return 0;
|
||||
}
|
||||
int g6calc_lasthalf2(
|
||||
int clusterid,
|
||||
int nj,
|
||||
int ni,
|
||||
int index[],
|
||||
double xi[][3],
|
||||
double vi[][3],
|
||||
double eps2,
|
||||
double h2[],
|
||||
double acc[][3],
|
||||
double jerk[][3],
|
||||
double pot[],
|
||||
int nnbindex[])
|
||||
{
|
||||
assert(NB_FLAG);
|
||||
yebisu_g6_get_force(clusterid, ni, acc, jerk, pot, nnbindex);
|
||||
return 0;
|
||||
}
|
||||
int g6_read_neighbour_list(int clusterid){
|
||||
assert(NB_FLAG);
|
||||
yebisu_g6_receive_neighbor_list(clusterid);
|
||||
return 0;
|
||||
}
|
||||
int g6_get_neighbour_list(
|
||||
int clusterid,
|
||||
int ipipe,
|
||||
int maxlength,
|
||||
int *nblen,
|
||||
int nbl[])
|
||||
{
|
||||
assert(NB_FLAG);
|
||||
yebisu_g6_get_neighbor_list(clusterid, ipipe, maxlength, nblen, nbl);
|
||||
const int nnb = *nblen;
|
||||
if(nnb < 0) return -1;
|
||||
if(sort_mode){
|
||||
std::sort(nbl, nbl+nnb);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
void g6_set_neighbour_list_sort_mode(int mode){
|
||||
sort_mode = mode;
|
||||
}
|
||||
int g6_get_neighbour_list_sort_mode(void){
|
||||
return sort_mode;
|
||||
}
|
||||
int g6_npipes(void){
|
||||
return yebisu_g6_get_nimax();
|
||||
}
|
||||
int g6_getnjmax(int clusterid){
|
||||
return yebisu_g6_get_njmax();
|
||||
}
|
||||
}
|
||||
90
grape6.h
Normal file
90
grape6.h
Normal file
|
|
@ -0,0 +1,90 @@
|
|||
#ifndef G6UTIL_H
|
||||
#define G6UTIL_H
|
||||
|
||||
// #include "typedef.h"
|
||||
|
||||
/* constant definitions */
|
||||
#if !defined(__LANGUAGE_FORTRAN__) && !defined(_LANGUAGE_FORTRAN)
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C"
|
||||
{
|
||||
#endif
|
||||
/* C interface */
|
||||
|
||||
/*
|
||||
* standard functions.
|
||||
* the number of the cards is hidden to the user.
|
||||
*/
|
||||
void g6_open_all(void);
|
||||
void g6_close_all(void);
|
||||
int g6_set_j_particle_all(int address, int index, double tj, double dtj, double mass,
|
||||
double a2by18[3], double a1by6[3], double aby2[3], double v[3], double x[3]);
|
||||
int g6_set_j_particle_mxonly_all(int address, int index, double mass, double x[3]);
|
||||
void g6_set_ti_all(double ti);
|
||||
void g6calc_firsthalf_all(int nj, int ni, int index[], double xi[][3], double vi[][3],
|
||||
double fold[][3], double jold[][3], double phiold[], double eps2, double h2[]);
|
||||
void g6calc_firsthalf0_all(int nj, int ni, int index[], double xi[][3], double vi[][3],
|
||||
double fold[][3], double jold[][3], double phiold[], double *eps2, double h2[], int mode);
|
||||
int g6calc_lasthalf_all(int nj, int ni, int index[], double xi[][3], double vi[][3],
|
||||
double eps2, double h2[], double acc[][3], double jerk[][3], double pot[]);
|
||||
int g6calc_lasthalf0_all(int nj, int ni, int index[], double xi[][3], double vi[][3],
|
||||
double *eps2, double h2[], double acc[][3], double jerk[][3], double pot[], int mode);
|
||||
int g6calc_lasthalf2_all(int nj, int ni, int index[], double xi[][3], double vi[][3],
|
||||
double eps2, double h2[], double acc[][3], double jerk[][3], double pot[], int nnbindex[]);
|
||||
int g6_read_neighbour_list_all(void);
|
||||
int g6_get_neighbour_list_all(int ipipe, int maxlength, int *nblen, int nbl[]);
|
||||
void g6_set_nip_all(int nip);
|
||||
void g6_set_njp_all(int njp);
|
||||
void g6_set_i_particle_scales_from_real_value_all(int address, double acc[3], double jerk[3], double phi,
|
||||
double jfactor, double ffactor);
|
||||
void g6_set_i_particle_all(int address, int index, double x[3], double v[3], double eps2, double h2);
|
||||
int g6_get_force_all(double acc[][3], double jerk[][3], double phi[], int flag[]);
|
||||
int g6_get_force_etc_all(double acc[][3], double jerk[][3], double phi[], int nnbindex[], int flag[]);
|
||||
void g6_get_predicted_j_particles_all(int nj, double (*x)[3], double (*v)[3]);
|
||||
int g6_getnjmax_all(void);
|
||||
|
||||
/*
|
||||
* primitive functions to control multiple cards individually.
|
||||
* the user needs to specify card's device id explicitly.
|
||||
*/
|
||||
void g6_open(int clusterid);
|
||||
void g6_close(int clusterid);
|
||||
void g6_set_tunit(int newtunit);
|
||||
void g6_set_xunit(int newxunit);
|
||||
int g6_set_j_particle(int clusterid, int address, int index, double tj, double dtj, double mass,
|
||||
double a2by18[3], double a1by6[3], double aby2[3], double v[3], double x[3]);
|
||||
int g6_set_j_particle_mxonly(int clusterid, int address, int index, double mass, double x[3]);
|
||||
void g6_set_ti(int clusterid, double ti);
|
||||
void g6calc_firsthalf(int clusterid, int nj, int ni, int index[], double xi[][3], double vi[][3],
|
||||
double fold[][3], double jold[][3], double phiold[], double eps2, double h2[]);
|
||||
void g6calc_firsthalf0(int clusterid, int nj, int ni, int index[], double xi[][3], double vi[][3],
|
||||
double fold[][3], double jold[][3], double phiold[], double *eps2, double h2[], int mode);
|
||||
int g6calc_lasthalf(int clusterid, int nj, int ni, int index[], double xi[][3], double vi[][3],
|
||||
double eps2, double h2[], double acc[][3], double jerk[][3], double pot[]);
|
||||
int g6calc_lasthalf0(int clusterid, int nj, int ni, int index[], double xi[][3], double vi[][3],
|
||||
double *eps2, double h2[], double acc[][3], double jerk[][3], double pot[], int mode);
|
||||
int g6calc_lasthalf2(int clusterid, int nj, int ni, int index[], double xi[][3], double vi[][3],
|
||||
double eps2, double h2[], double acc[][3], double jerk[][3], double pot[], int nnbindex[]);
|
||||
int g6_read_neighbour_list(int clusterid);
|
||||
int g6_get_neighbour_list(int clusterid, int ipipe, int maxlength, int *nblen, int nbl[]);
|
||||
void g6_set_neighbour_list_sort_mode(int mode);
|
||||
int g6_get_neighbour_list_sort_mode(void);
|
||||
int g6_npipes(void);
|
||||
void g6_set_nip(int clusterid, int nip);
|
||||
void g6_set_njp(int clusterid, int njp);
|
||||
void g6_set_i_particle_scales_from_real_value(int clusterid, int address, double acc[3], double jerk[3], double phi,
|
||||
double jfactor, double ffactor);
|
||||
void g6_set_i_particle(int clusterid, int address, int index, double x[3], double v[3], double eps2, double h2);
|
||||
int g6_get_force(int clusterid, double acc[][3], double jerk[][3], double phi[], int flag[]);
|
||||
int g6_get_force_etc(int clusterid, double acc[][3], double jerk[][3], double phi[], int nnbindex[], int flag[]);
|
||||
void g6_get_predicted_j_particles(int clusterid, int nj, double x[][3], double v[][3]);
|
||||
int g6_getnjmax(int clusterid);
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* LANGUAGE_FORTRAN */
|
||||
|
||||
#endif /* G6UTIL_H */
|
||||
343
particle.h
Normal file
343
particle.h
Normal file
|
|
@ -0,0 +1,343 @@
|
|||
//#define EPS_RED
|
||||
//#define EPS_MUL 0.0001f
|
||||
|
||||
static inline __host__
|
||||
float2 float2_split(const double x)
|
||||
{
|
||||
const float fx = float(x);
|
||||
const float fy = float(x - double(fx));
|
||||
return make_float2(fx, fy);
|
||||
}
|
||||
|
||||
static inline __host__
|
||||
double float2_todouble(const float2 f)
|
||||
{
|
||||
return double(f.x) + double(f.y);
|
||||
}
|
||||
|
||||
static inline __device__
|
||||
float2 float2_accum(const float2 acc, const float x)
|
||||
{
|
||||
const float ax = acc.x + x;
|
||||
const float ay = acc.y - ((ax - acc.x) - x);
|
||||
return make_float2(ax, ay);
|
||||
}
|
||||
|
||||
static inline __device__
|
||||
float float2_diff(const float2 xj, const float2 xi){
|
||||
return (xj.x - xi.x) + (xj.y - xi.y);
|
||||
}
|
||||
|
||||
struct Iparticle{
|
||||
float2 pos[3]; // 6
|
||||
float vel[3]; // 9
|
||||
float eps2; // 10
|
||||
float h2; // 11
|
||||
int id; // 12
|
||||
|
||||
__host__ void read(
|
||||
const double h_pos[],
|
||||
const double h_vel[],
|
||||
const double h_eps2,
|
||||
const double h_h2,
|
||||
const int h_id)
|
||||
{
|
||||
for(int k=0; k<3; k++){
|
||||
pos[k] = float2_split(h_pos[k]);
|
||||
vel[k] = float(h_vel[k]);
|
||||
}
|
||||
eps2 = float(h_eps2);
|
||||
h2 = float(h_h2);
|
||||
id = h_id;
|
||||
}
|
||||
};
|
||||
|
||||
struct Jparticle{
|
||||
float2 pos [3]; // 6
|
||||
float vel [3]; // 9
|
||||
float acc2[3]; // 12
|
||||
float jrk6[3]; // 15
|
||||
float mass; // 16
|
||||
float2 tj; // 18
|
||||
int id; // 19
|
||||
int addr; // 20
|
||||
|
||||
__host__ void read(
|
||||
const double h_pos[],
|
||||
const double h_vel[],
|
||||
const double h_acc2[],
|
||||
const double h_jrk6[],
|
||||
const double h_mass,
|
||||
const double h_tj,
|
||||
const int h_id,
|
||||
const int h_addr)
|
||||
{
|
||||
for(int k=0; k<3; k++){
|
||||
pos [k] = float2_split(h_pos[k]);
|
||||
vel [k] = float(h_vel [k]);
|
||||
acc2[k] = float(h_acc2[k]);
|
||||
jrk6[k] = float(h_jrk6[k]);
|
||||
}
|
||||
mass = float(h_mass);
|
||||
tj = float2_split(h_tj);
|
||||
id = h_id;
|
||||
addr = h_addr;
|
||||
assert(addr < NBODY_MAX);
|
||||
}
|
||||
};
|
||||
|
||||
struct Jppred{
|
||||
float2 pos[3]; // 6
|
||||
float vel[3]; // 9
|
||||
float mass; // 10
|
||||
int id; // 11
|
||||
int pad; // 12
|
||||
|
||||
enum{
|
||||
SIZE_F4 = 3,
|
||||
};
|
||||
|
||||
__device__ void predict(
|
||||
const Jparticle &jp,
|
||||
const float2 ti)
|
||||
{
|
||||
const float dt = float2_diff(ti, jp.tj);
|
||||
#pragma unroll
|
||||
for(int k=0; k<3; k++){
|
||||
pos[k].x = jp.pos[k].x;
|
||||
pos[k].y = jp.pos[k].y + dt*(jp.vel[k] + dt*(jp.acc2[k] + dt*jp.jrk6[k]));
|
||||
vel[k] = jp.vel[k] + (2.f*dt)*(jp.acc2[k] + (1.5f*dt)*(jp.jrk6[k]));
|
||||
}
|
||||
mass = jp.mass;
|
||||
id = jp.id ;
|
||||
}
|
||||
};
|
||||
|
||||
struct Interaction{
|
||||
float3 acc;
|
||||
float3 jrk;
|
||||
float pot;
|
||||
|
||||
__device__ Interaction(
|
||||
const Iparticle &ip,
|
||||
const Jppred &jp){
|
||||
const float dx = float2_diff(jp.pos[0], ip.pos[0]);
|
||||
const float dy = float2_diff(jp.pos[1], ip.pos[1]);
|
||||
const float dz = float2_diff(jp.pos[2], ip.pos[2]);
|
||||
const float dvx = jp.vel[0] - ip.vel[0];
|
||||
const float dvy = jp.vel[1] - ip.vel[1];
|
||||
const float dvz = jp.vel[2] - ip.vel[2];
|
||||
|
||||
#ifdef EPS_RED
|
||||
float r2, tmp_eps2;
|
||||
|
||||
tmp_eps2 = ip.eps2; // default value ----> eps = 1e-5 !!!
|
||||
|
||||
// larger ----> 10 x eps if mass > 1e-6 (i.e. high mass part.)
|
||||
|
||||
if( (jp.id < 99998) || (ip.id < 99998) ) tmp_eps2 *= 100.0f;
|
||||
|
||||
// if i or j is a BH's ----> 1e-2 * eps
|
||||
if( (ip.id == 999998) || (jp.id == 999998) || (ip.id == 999999) || (jp.id == 999999) )
|
||||
{
|
||||
r2 = EPS_MUL*tmp_eps2 + dx*dx + dy*dy + dz*dz;
|
||||
}
|
||||
else
|
||||
{
|
||||
r2 = tmp_eps2 + dx*dx + dy*dy + dz*dz;
|
||||
}
|
||||
#else
|
||||
const float r2 = ip.eps2 + dx*dx + dy*dy + dz*dz;
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
#ifdef EPS_RED
|
||||
float r2;
|
||||
|
||||
// if i or j is a BH's
|
||||
|
||||
if( (ip.id == 0) || (jp.id == 0) || (ip.id == 1) || (jp.id == 1) )
|
||||
{
|
||||
r2 = EPS_MUL*ip.eps2 + dx*dx + dy*dy + dz*dz;
|
||||
}
|
||||
else
|
||||
{
|
||||
r2 = ip.eps2 + dx*dx + dy*dy + dz*dz;
|
||||
}
|
||||
#else
|
||||
const float r2 = ip.eps2 + dx*dx + dy*dy + dz*dz;
|
||||
#endif
|
||||
*/
|
||||
|
||||
// const float r2 = ip.eps2 + dx*dx + dy*dy + dz*dz;
|
||||
const float rv = dx*dvx + dy*dvy + dz*dvz;
|
||||
|
||||
const float rinv1 = (jp.id == ip.id) ? 0.0f
|
||||
: rsqrtf(r2);
|
||||
|
||||
const float rinv2 = rinv1 * rinv1;
|
||||
const float mrinv1 = jp.mass * rinv1;
|
||||
const float mrinv3 = mrinv1 * rinv2;
|
||||
const float alpha = -3.f * rv * rinv2;
|
||||
|
||||
acc.x = mrinv3 * dx;
|
||||
acc.y = mrinv3 * dy;
|
||||
acc.z = mrinv3 * dz;
|
||||
jrk.x = mrinv3 * (dvx + alpha * dx);
|
||||
jrk.y = mrinv3 * (dvy + alpha * dy);
|
||||
jrk.z = mrinv3 * (dvz + alpha * dz);
|
||||
pot = mrinv1; // use positive definition here
|
||||
}
|
||||
__device__ void set_neib(int &dst) const{
|
||||
// do nothing
|
||||
}
|
||||
};
|
||||
|
||||
struct Interaction_NB{
|
||||
float3 acc;
|
||||
float3 jrk;
|
||||
float pot;
|
||||
float nb_rinv;
|
||||
int jid;
|
||||
bool is_neib;
|
||||
|
||||
__device__ Interaction_NB(
|
||||
const Iparticle &ip,
|
||||
const Jppred &jp)
|
||||
{
|
||||
const float dx = float2_diff(jp.pos[0], ip.pos[0]);
|
||||
const float dy = float2_diff(jp.pos[1], ip.pos[1]);
|
||||
const float dz = float2_diff(jp.pos[2], ip.pos[2]);
|
||||
const float dvx = jp.vel[0] - ip.vel[0];
|
||||
const float dvy = jp.vel[1] - ip.vel[1];
|
||||
const float dvz = jp.vel[2] - ip.vel[2];
|
||||
|
||||
// if( (jp.id) > 1 && (jp.id < 200000) ) ip.eps2 *= 100;
|
||||
|
||||
const float r2 = ip.eps2 + dx*dx + dy*dy + dz*dz;
|
||||
const float rv = dx*dvx + dy*dvy + dz*dvz;
|
||||
|
||||
const float rinv1 = (jp.id == ip.id) ? 0.0f
|
||||
: rsqrtf(r2);
|
||||
|
||||
const float rinv2 = rinv1 * rinv1;
|
||||
const float mrinv1 = jp.mass * rinv1;
|
||||
const float mrinv3 = mrinv1 * rinv2;
|
||||
const float alpha = -3.f * rv * rinv2;
|
||||
|
||||
acc.x = mrinv3 * dx;
|
||||
acc.y = mrinv3 * dy;
|
||||
acc.z = mrinv3 * dz;
|
||||
jrk.x = mrinv3 * (dvx + alpha * dx);
|
||||
jrk.y = mrinv3 * (dvy + alpha * dy);
|
||||
jrk.z = mrinv3 * (dvz + alpha * dz);
|
||||
pot = mrinv1; // use positive definition here
|
||||
nb_rinv = rinv1;
|
||||
jid = jp.id;
|
||||
is_neib = (r2 < ip.h2) && (jp.id != ip.id);
|
||||
}
|
||||
__device__ void set_neib(int &dst) const{
|
||||
if(is_neib) dst = jid;
|
||||
}
|
||||
};
|
||||
|
||||
struct Force{
|
||||
float2 acc[3]; // 6
|
||||
float jrk[3]; // 9
|
||||
float2 pot; // 11
|
||||
int nnb_id; // 12 ID of nearest neighbor
|
||||
float nnb_rinv; // 13 rinv of nearest neighbor
|
||||
int num_neib; // 14
|
||||
|
||||
__host__ void write(
|
||||
double h_acc[],
|
||||
double h_jrk[],
|
||||
double &h_pot,
|
||||
int &h_nnb_id,
|
||||
int &h_num_neib) const
|
||||
{
|
||||
for(int k=0; k<3; k++){
|
||||
h_acc[k] = float2_todouble(acc[k]);
|
||||
h_jrk[k] = double(jrk[k]);
|
||||
}
|
||||
h_pot = - float2_todouble(pot);
|
||||
h_nnb_id = nnb_id;
|
||||
h_num_neib = num_neib;
|
||||
}
|
||||
|
||||
__device__ void clear()
|
||||
{
|
||||
#pragma unroll
|
||||
for(int k=0; k<3; k++){
|
||||
acc[k] = make_float2(0.0f, 0.0f);
|
||||
jrk[k] = 0.0f;
|
||||
}
|
||||
pot = make_float2(0.0f, 0.0f);
|
||||
nnb_id = -1;
|
||||
nnb_rinv = 0.0f;
|
||||
num_neib = 0;
|
||||
}
|
||||
|
||||
__device__ void check_overflow(){
|
||||
if(num_neib > NB_MAX) num_neib = -1;
|
||||
}
|
||||
|
||||
// for the redction kernel
|
||||
__device__ void operator+=(
|
||||
const Force &fo)
|
||||
{
|
||||
#pragma unroll
|
||||
for(int k=0; k<3; k++){
|
||||
acc[k] = float2_accum(acc[k], fo.acc[k].x);
|
||||
acc[k] = float2_accum(acc[k], fo.acc[k].y);
|
||||
jrk[k] += fo.jrk[k];
|
||||
}
|
||||
pot = float2_accum(pot, fo.pot.x);
|
||||
pot = float2_accum(pot, fo.pot.y);
|
||||
if(num_neib>=0 && fo.num_neib>=0){
|
||||
num_neib += fo.num_neib;
|
||||
}else{ // overflow
|
||||
num_neib = -1;
|
||||
}
|
||||
// nearest neighbor
|
||||
if(nnb_rinv < fo.nnb_rinv){
|
||||
nnb_id = fo.nnb_id;
|
||||
nnb_rinv = fo.nnb_rinv;
|
||||
}
|
||||
}
|
||||
|
||||
// for the gravity kernel
|
||||
__device__ void operator+=(
|
||||
const Interaction &fo)
|
||||
{
|
||||
acc[0] = float2_accum(acc[0], fo.acc.x);
|
||||
acc[1] = float2_accum(acc[1], fo.acc.y);
|
||||
acc[2] = float2_accum(acc[2], fo.acc.z);
|
||||
pot = float2_accum(pot, fo.pot);
|
||||
jrk[0] += fo.jrk.x;
|
||||
jrk[1] += fo.jrk.y;
|
||||
jrk[2] += fo.jrk.z;
|
||||
}
|
||||
|
||||
__device__ void operator+=(
|
||||
const Interaction_NB &fo)
|
||||
{
|
||||
acc[0] = float2_accum(acc[0], fo.acc.x);
|
||||
acc[1] = float2_accum(acc[1], fo.acc.y);
|
||||
acc[2] = float2_accum(acc[2], fo.acc.z);
|
||||
jrk[0] += fo.jrk.x;
|
||||
jrk[1] += fo.jrk.y;
|
||||
jrk[2] += fo.jrk.z;
|
||||
pot = float2_accum(pot, fo.pot);
|
||||
// neighbor list counter
|
||||
if(fo.is_neib) num_neib++;
|
||||
// nearest neighbor
|
||||
if(nnb_rinv < fo.nb_rinv){
|
||||
nnb_id = fo.jid;
|
||||
nnb_rinv = fo.nb_rinv;
|
||||
}
|
||||
}
|
||||
};
|
||||
762
yebisu_g6.cu
Normal file
762
yebisu_g6.cu
Normal file
|
|
@ -0,0 +1,762 @@
|
|||
//#include <cutil.h>
|
||||
#include <omp.h>
|
||||
#include "cuda_pointer.h"
|
||||
|
||||
enum{
|
||||
MAX_GPU = 4,
|
||||
MAX_CPU = 4,
|
||||
NBODY_MAX = (1048576), // 256KB
|
||||
NB_MAX = 256, // per block
|
||||
MAX_NB_BUF = (1048576), // 256KB
|
||||
};
|
||||
|
||||
#include "gpu.h"
|
||||
|
||||
#include "particle.h"
|
||||
|
||||
#define _out_
|
||||
|
||||
__global__ void kernel_jp_scatter(
|
||||
const int nj,
|
||||
const Jparticle jpsrc[],
|
||||
_out_ Jparticle jpdst[])
|
||||
{
|
||||
const int tid = threadIdx.x + blockDim.x * blockIdx.x;
|
||||
if(tid < nj){
|
||||
const Jparticle jp = jpsrc[tid];
|
||||
jpdst[jp.addr] = jp;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void kernel_predict(
|
||||
const int nj,
|
||||
const float2 ti,
|
||||
const Jparticle jptcl[],
|
||||
_out_ Jppred jpred[])
|
||||
{
|
||||
#if 0
|
||||
const int tid = threadIdx.x + blockDim.x * blockIdx.x;
|
||||
if(tid < nj){
|
||||
jpred[tid].predict(jptcl[tid], ti);
|
||||
}
|
||||
#else
|
||||
const int tid = threadIdx.x;
|
||||
const int off = blockDim.x * blockIdx.x;
|
||||
const int nth = blockDim.x;
|
||||
__shared__ float4 sbuf[NTHREADS*5];
|
||||
Jparticle *sptcl = (Jparticle *)sbuf;
|
||||
Jppred *spred = (Jppred *)sbuf;
|
||||
|
||||
{ // LOAD
|
||||
float4 *src = (float4 *)(jptcl + off);
|
||||
float4 *dst = (float4 *)(sptcl);
|
||||
#pragma unroll
|
||||
for(int k=0; k<5; k++, src+=nth, dst+=nth){
|
||||
dst[tid] = src[tid];
|
||||
}
|
||||
}
|
||||
|
||||
// Predict
|
||||
__syncthreads();
|
||||
Jppred pp;
|
||||
pp.predict(sptcl[tid], ti);
|
||||
__syncthreads();
|
||||
spred[tid] = pp;
|
||||
__syncthreads();
|
||||
|
||||
{ // STORE
|
||||
float4 *src = (float4 *)(spred);
|
||||
float4 *dst = (float4 *)(jpred + off);
|
||||
#pragma unroll
|
||||
for(int k=0; k<3; k++, src+=nth, dst+=nth){
|
||||
dst[tid] = src[tid];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#define INTERACTION Interaction_NB
|
||||
__global__ void kernel_gravity(
|
||||
const int ni,
|
||||
const int nj,
|
||||
const Iparticle ipbuf[],
|
||||
const Jppred jpbuf[],
|
||||
_out_ Force fodev[][NJBLOCKS],
|
||||
_out_ int nbbuf[][NJBLOCKS][NB_MAX],
|
||||
const bool with_neib)
|
||||
{
|
||||
int ibid = blockIdx.x;
|
||||
int jbid = blockIdx.y;
|
||||
int tid = threadIdx.x;
|
||||
int iaddr = tid + blockDim.x * ibid;
|
||||
int jstart = (nj * (jbid )) / NJBLOCKS;
|
||||
int jend = (nj * (jbid+1)) / NJBLOCKS;
|
||||
int *nbdst = nbbuf[iaddr][jbid];
|
||||
|
||||
__shared__ Jppred jpshare[NJPSHRE]; // 32
|
||||
|
||||
const Iparticle ip = ipbuf[iaddr];
|
||||
Force fo;
|
||||
fo.clear();
|
||||
|
||||
if(with_neib){
|
||||
for(int j=jstart; j<jend; j+=NJPSHRE){
|
||||
const int jsize = NJPSHRE * Jppred::SIZE_F4; // 96
|
||||
|
||||
__syncthreads();
|
||||
if(tid < jsize){ // 96 of 128
|
||||
float4 *src = (float4 *)(jpbuf + j);
|
||||
float4 *dst = (float4 *)(jpshare );
|
||||
dst[tid] = src[tid];
|
||||
}
|
||||
if(tid+32 < jsize){ // for the case of 64 threads
|
||||
float4 *src = (float4 *)(jpbuf + j);
|
||||
float4 *dst = (float4 *)(jpshare );
|
||||
dst[tid+32] = src[tid+32];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if(jend-j < NJPSHRE){
|
||||
#pragma unroll 4
|
||||
for(int jj=0; jj<jend-j; jj++){
|
||||
const Jppred jp = jpshare[jj];
|
||||
const Interaction_NB inter(ip, jp);
|
||||
inter.set_neib(nbdst[fo.num_neib % NB_MAX]);
|
||||
fo += inter;
|
||||
}
|
||||
}else{
|
||||
#pragma unroll 32
|
||||
for(int jj=0; jj<NJPSHRE; jj++){
|
||||
const Jppred jp = jpshare[jj];
|
||||
const Interaction_NB inter(ip, jp);
|
||||
inter.set_neib(nbdst[fo.num_neib % NB_MAX]);
|
||||
fo += inter;
|
||||
}
|
||||
}
|
||||
}
|
||||
}else{ // no neib
|
||||
for(int j=jstart; j<jend; j+=NJPSHRE){
|
||||
const int jsize = NJPSHRE * Jppred::SIZE_F4; // 96
|
||||
|
||||
__syncthreads();
|
||||
if(tid < jsize){ // 96 of 128
|
||||
float4 *src = (float4 *)(jpbuf + j);
|
||||
float4 *dst = (float4 *)(jpshare );
|
||||
dst[tid] = src[tid];
|
||||
}
|
||||
if(tid+32 < jsize){ // for the case of 64 threads
|
||||
float4 *src = (float4 *)(jpbuf + j);
|
||||
float4 *dst = (float4 *)(jpshare );
|
||||
dst[tid+32] = src[tid+32];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if(jend-j < NJPSHRE){
|
||||
#pragma unroll 4
|
||||
for(int jj=0; jj<jend-j; jj++){
|
||||
const Jppred jp = jpshare[jj];
|
||||
const Interaction inter(ip, jp);
|
||||
inter.set_neib(nbdst[fo.num_neib % NB_MAX]);
|
||||
fo += inter;
|
||||
}
|
||||
}else{
|
||||
#pragma unroll 32
|
||||
for(int jj=0; jj<NJPSHRE; jj++){
|
||||
const Jppred jp = jpshare[jj];
|
||||
const Interaction inter(ip, jp);
|
||||
inter.set_neib(nbdst[fo.num_neib % NB_MAX]);
|
||||
fo += inter;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(iaddr < ni){
|
||||
fodev[iaddr][jbid] = fo;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void kernel_reduce(
|
||||
const int ni,
|
||||
const Force fodev[][NJBLOCKS],
|
||||
_out_ Force fosum[]){
|
||||
const int xid = threadIdx.x;
|
||||
const int yid = threadIdx.y;
|
||||
const int bid = blockIdx.x;
|
||||
const int iaddr = yid + blockDim.y * bid;
|
||||
|
||||
__shared__ Force fshare[NYREDUCE][NXREDUCE];
|
||||
if(xid < NJBLOCKS){
|
||||
fshare[yid][xid] = fodev[iaddr][xid];
|
||||
}else{
|
||||
fshare[yid][xid].clear();
|
||||
}
|
||||
Force *fs = fshare[yid];
|
||||
|
||||
if(32 == NXREDUCE){
|
||||
if(xid < 16) fs[xid] += fs[xid + 16];
|
||||
}
|
||||
if(xid < 8) fs[xid] += fs[xid + 8];
|
||||
if(xid < 4) fs[xid] += fs[xid + 4];
|
||||
if(xid < 2) fs[xid] += fs[xid + 2];
|
||||
if(xid < 1) fs[xid] += fs[xid + 1];
|
||||
|
||||
if(iaddr < ni && 0 == xid){
|
||||
fosum[iaddr] = fs[0];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void kernel_gather_nb(
|
||||
const int ni,
|
||||
const Force fodev[][NJBLOCKS],
|
||||
const int2 nbcnt[],
|
||||
const int nbbuf[][NJBLOCKS][NB_MAX],
|
||||
_out_ int nblst[])
|
||||
{
|
||||
const int xid = threadIdx.x;
|
||||
const int yid = threadIdx.y;
|
||||
const int bid = blockIdx.x;
|
||||
const int iaddr = yid + blockDim.y * bid;
|
||||
if(iaddr >= ni) return;
|
||||
if(nbcnt[iaddr].x < 0) return; // overflow
|
||||
|
||||
const int mynnb = (xid < NJBLOCKS) ? fodev[iaddr][xid].num_neib
|
||||
: 0;
|
||||
|
||||
// now performe prefix sum
|
||||
__shared__ int ishare[NYREDUCE][NXREDUCE];
|
||||
ishare[yid][xid] = mynnb;
|
||||
int *ish = ishare[yid];
|
||||
if(xid>=1) ish[xid] += ish[xid-1];
|
||||
if(xid>=2) ish[xid] += ish[xid-2];
|
||||
if(xid>=4) ish[xid] += ish[xid-4];
|
||||
if(xid>=8) ish[xid] += ish[xid-8];
|
||||
if(32 == NXREDUCE){
|
||||
if(xid>=16) ish[xid] += ish[xid-16];
|
||||
}
|
||||
|
||||
const int off = (xid == 0) ? 0
|
||||
: ish[xid-1];
|
||||
int *nbdst = nblst + nbcnt[iaddr].y + off;
|
||||
if(xid < NJBLOCKS){
|
||||
for(int k=0; k<mynnb; k++){
|
||||
const int nbid = nbbuf[iaddr][xid][k];
|
||||
nbdst[k] = nbid;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class Resource{
|
||||
private:
|
||||
bool is_open;
|
||||
bool predicted;
|
||||
bool grav_called;
|
||||
bool jp_flushed;
|
||||
int gpid;
|
||||
int njp_in_que;
|
||||
int ni_save;
|
||||
float2 ti;
|
||||
|
||||
cudaPointer<Iparticle> ipbuf;
|
||||
cudaPointer<Jparticle> jpbuf;
|
||||
cudaPointer<Jparticle> jpque;
|
||||
cudaPointer<Jppred> jpred;
|
||||
cudaPointer <Force[NJBLOCKS]> fodev;
|
||||
cudaPointer <Force> fosum;
|
||||
cudaPointer <int[NJBLOCKS][NB_MAX]> nbbuf;
|
||||
cudaPointer <int> nblst;
|
||||
cudaPointer <int2> nbcnt; // {num, off}
|
||||
|
||||
void allocate(){
|
||||
ipbuf.allocate(NIMAX);
|
||||
jpbuf.allocate(NBODY_MAX);
|
||||
jpque.allocate(NBODY_MAX);
|
||||
jpred.allocate(NBODY_MAX);
|
||||
fodev.allocate(NIMAX);
|
||||
fosum.allocate(NIMAX);
|
||||
nbbuf.allocate(NIMAX);
|
||||
nblst.allocate(MAX_NB_BUF);
|
||||
nbcnt.allocate(NIMAX);
|
||||
}
|
||||
void free(){
|
||||
ipbuf.free();
|
||||
jpbuf.free();
|
||||
jpque.free();
|
||||
jpred.free();
|
||||
fodev.free();
|
||||
fosum.free();
|
||||
nbbuf.free();
|
||||
nblst.free();
|
||||
nbcnt.free();
|
||||
}
|
||||
public:
|
||||
void set_gpid(){
|
||||
int dev;
|
||||
cudaGetDevice(&dev);
|
||||
gpid = dev;
|
||||
}
|
||||
void set_gpid(const int id){
|
||||
gpid = id;
|
||||
int dev;
|
||||
cudaGetDevice(&dev);
|
||||
assert(dev == gpid);
|
||||
}
|
||||
void open(){
|
||||
assert(!is_open);
|
||||
allocate();
|
||||
is_open = true;
|
||||
njp_in_que = 0;
|
||||
predicted = false;
|
||||
grav_called = false;
|
||||
jp_flushed = false;
|
||||
}
|
||||
void close(){
|
||||
assert(is_open);
|
||||
free();
|
||||
is_open = false;
|
||||
}
|
||||
|
||||
void set_ip(
|
||||
const int ni,
|
||||
const double pos[][3],
|
||||
const double vel[][3],
|
||||
const double eps2[],
|
||||
const double h2 [],
|
||||
const int id [])
|
||||
{
|
||||
assert(is_open);
|
||||
assert(ni <= NIMAX);
|
||||
|
||||
for(int i=0; i<ni; i++){
|
||||
ipbuf[i].read(pos[i], vel[i], eps2[i], h2[i], id[i]);
|
||||
}
|
||||
ipbuf.htod(ni);
|
||||
ni_save = ni;
|
||||
}
|
||||
|
||||
void push_jp(
|
||||
const double pos [3],
|
||||
const double vel [3],
|
||||
const double acc2[3],
|
||||
const double jrk6[6],
|
||||
const double mass,
|
||||
const double tj,
|
||||
const int id,
|
||||
const int addr)
|
||||
{
|
||||
assert(is_open);
|
||||
assert(addr < NBODY_MAX);
|
||||
|
||||
jpque[njp_in_que].read(pos, vel, acc2, jrk6, mass, tj, id, addr);
|
||||
njp_in_que++;
|
||||
jp_flushed = false;
|
||||
}
|
||||
void transter_jp(){
|
||||
assert(is_open);
|
||||
|
||||
const int njq = njp_in_que;
|
||||
jpque.htod(njq);
|
||||
njp_in_que = 0;
|
||||
const int Blocks = 1 + (njq-1)/NTHSCAT;
|
||||
kernel_jp_scatter <<< Blocks, NTHSCAT >>>
|
||||
(njq, jpque, jpbuf);
|
||||
jp_flushed = true;
|
||||
predicted = false;
|
||||
}
|
||||
void set_ti(const double dbl_ti){
|
||||
assert(is_open);
|
||||
|
||||
ti = float2_split(dbl_ti);
|
||||
predicted = false;
|
||||
}
|
||||
void predict_all(const int nj){
|
||||
assert(is_open);
|
||||
|
||||
const int Blocks = 1 + (nj-1)/NTHREADS;
|
||||
kernel_predict <<< Blocks, NTHREADS >>>
|
||||
(nj, ti, jpbuf, jpred);
|
||||
predicted = true;
|
||||
}
|
||||
void launch_gravity(
|
||||
const int ni,
|
||||
const int nj,
|
||||
const bool with_neib)
|
||||
{
|
||||
assert(is_open);
|
||||
assert(ni == ni_save);
|
||||
assert(ni <= NIMAX);
|
||||
assert(nj < NBODY_MAX);
|
||||
|
||||
if(!jp_flushed) transter_jp();
|
||||
if(!predicted ) predict_all(nj);
|
||||
if(ni <= 64){
|
||||
dim3 grid ( 1, NJBLOCKS, 1);
|
||||
dim3 threads(64, 1, 1);
|
||||
kernel_gravity <<< grid, threads >>>
|
||||
(ni, nj, ipbuf, jpred, fodev, nbbuf, with_neib);
|
||||
}else{
|
||||
const int niblocks = 1 + (ni-1) / NTHREADS;
|
||||
dim3 grid (niblocks, NJBLOCKS, 1);
|
||||
dim3 threads(NTHREADS, 1, 1);
|
||||
kernel_gravity <<< grid, threads >>>
|
||||
(ni, nj, ipbuf, jpred, fodev, nbbuf, with_neib);
|
||||
}
|
||||
grav_called = true;
|
||||
}
|
||||
void get_force(
|
||||
const int ni,
|
||||
_out_ double acc [][3],
|
||||
_out_ double jrk [][3],
|
||||
_out_ double pot [],
|
||||
_out_ int nnb_id[])
|
||||
{
|
||||
assert(is_open);
|
||||
assert(grav_called);
|
||||
assert(ni == ni_save);
|
||||
assert(ni <= NIMAX);
|
||||
|
||||
const int ni8 = 1 + (ni-1) / NYREDUCE;
|
||||
dim3 grid (ni8, 1, 1);
|
||||
dim3 threads(NXREDUCE, NYREDUCE, 1);
|
||||
kernel_reduce <<< grid, threads >>>
|
||||
(ni, fodev, fosum);
|
||||
fosum.dtoh(ni);
|
||||
grav_called = false;
|
||||
|
||||
for(int i=0; i<ni; i++){
|
||||
fosum[i].write(acc[i], jrk[i], pot[i], nnb_id[i], nbcnt[i].x);
|
||||
}
|
||||
}
|
||||
void receive_neighbor_list(){
|
||||
assert(is_open);
|
||||
|
||||
const int ni = ni_save;
|
||||
int nbsum = 0;
|
||||
for(int i=0; i<ni; i++){
|
||||
nbcnt[i].y = nbsum;
|
||||
if(nbcnt[i].x >= 0) nbsum += nbcnt[i].x;
|
||||
}
|
||||
assert(nbsum <= MAX_NB_BUF);
|
||||
nbcnt.htod(ni);
|
||||
|
||||
const int ni8 = 1 + (ni-1) / NYREDUCE;
|
||||
dim3 grid (ni8, 1, 1);
|
||||
dim3 threads(NXREDUCE, NYREDUCE, 1);
|
||||
kernel_gather_nb <<< grid, threads >>>
|
||||
(ni, fodev, nbcnt, nbbuf, nblst);
|
||||
nblst.dtoh(nbsum);
|
||||
}
|
||||
void get_neighbor_list(
|
||||
const int ipipe,
|
||||
const int maxlen,
|
||||
_out_ int *num_neib,
|
||||
_out_ int list[])
|
||||
{
|
||||
assert(is_open);
|
||||
assert(ipipe < ni_save);
|
||||
|
||||
const int nnb = nbcnt[ipipe].x;
|
||||
const int off = nbcnt[ipipe].y;
|
||||
const int *src = &nblst[off];
|
||||
if(nnb > 0 && maxlen >= nnb){
|
||||
for(int k=0; k<nnb; k++){
|
||||
list[k] = src[k];
|
||||
}
|
||||
*num_neib = nnb;
|
||||
}else{
|
||||
*num_neib = -1;
|
||||
}
|
||||
}
|
||||
|
||||
void DEBUG_read_pred(
|
||||
const int nj,
|
||||
const int addr,
|
||||
_out_ double pos [3],
|
||||
_out_ double vel [3],
|
||||
_out_ double mass[1],
|
||||
_out_ int id [1])
|
||||
{
|
||||
jpred.dtoh(nj);
|
||||
const Jppred &p = jpred[addr];
|
||||
for(int k=0; k<3; k++){
|
||||
pos[k] = p.pos[k].x + p.pos[k].y;
|
||||
vel[k] = p.vel[k];
|
||||
}
|
||||
mass[0] = p.mass;
|
||||
id [0] = p.id;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
static Resource resource[MAX_GPU];
|
||||
static int numGPU, numCPU;
|
||||
static bool initialized = false;
|
||||
|
||||
static void lib_initialize(const int gpid){
|
||||
if(initialized) return;
|
||||
initialized = true;
|
||||
|
||||
assert(NXREDUCE >= NJBLOCKS);
|
||||
assert(NXREDUCE <= 32);
|
||||
assert(sizeof(Jppred) % sizeof(float4) == 0);
|
||||
assert(sizeof(Jppred) / sizeof(float4) == Jppred::SIZE_F4);
|
||||
assert(NJPSHRE * Jppred::SIZE_F4 <= NTHREADS);
|
||||
|
||||
// int devid[MAX_GPU];
|
||||
|
||||
// cudaGetDeviceCount(&numGPU);
|
||||
// assert(numGPU <= MAX_GPU);
|
||||
|
||||
/*
|
||||
char *gpu_list = getenv("GPU_LIST");
|
||||
|
||||
if(gpu_list)
|
||||
{
|
||||
// get GPU list from environment variable
|
||||
numGPU = 0;
|
||||
char *p = strtok(gpu_list, " ");
|
||||
while(p)
|
||||
{
|
||||
devid[numGPU++] = atoi(p);
|
||||
p = strtok(NULL, " ");
|
||||
assert(numGPU <= MAX_GPU);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// use all GPUs
|
||||
// for(int i=0; i<numGPU; i++) devid[i] = i;
|
||||
|
||||
// using the default GPU
|
||||
numGPU = 1;
|
||||
numCPU = 1;
|
||||
devid[0] = -1;
|
||||
}
|
||||
*/
|
||||
|
||||
// using the default GPU
|
||||
// numGPU = 1;
|
||||
// numCPU = 1;
|
||||
// devid[0] = -1;
|
||||
|
||||
|
||||
// using the gpid GPU
|
||||
numGPU = 1;
|
||||
numCPU = 1;
|
||||
// devid[0] = gpid;
|
||||
|
||||
|
||||
|
||||
/*
|
||||
#pragma omp parallel
|
||||
{
|
||||
int tid = omp_get_thread_num();
|
||||
if(tid == 0) numCPU = omp_get_num_threads();
|
||||
numCPU = 1;
|
||||
}
|
||||
*/
|
||||
|
||||
// assert(numCPU <= MAX_CPU);
|
||||
// assert(numGPU <= numCPU);
|
||||
|
||||
//#pragma omp parallel
|
||||
// {
|
||||
|
||||
/*
|
||||
int tid = omp_get_thread_num();
|
||||
tid = 0;
|
||||
if(tid < numGPU)
|
||||
{
|
||||
|
||||
fprintf(stderr, "tid %d, numGPU %d, devid[tid] %d \n", tid, numGPU, devid[tid]);
|
||||
|
||||
if(devid[tid] >= 0)
|
||||
{
|
||||
cudaSetDevice(devid[tid]);
|
||||
resource[tid].set_gpid(devid[tid]);
|
||||
}
|
||||
else
|
||||
{
|
||||
fprintf(stderr, "Skipping cudaSetDevice(), using the default GPU \n");
|
||||
resource[tid].set_gpid();
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
// }
|
||||
|
||||
|
||||
cudaGetDeviceCount(&numGPU);
|
||||
assert(numGPU <= MAX_GPU);
|
||||
cudaSetDevice(gpid);
|
||||
resource[0].set_gpid(gpid);
|
||||
|
||||
|
||||
fprintf(stderr, "***********************\n");
|
||||
fprintf(stderr, "Initializing Yebisu/G6 library\n");
|
||||
fprintf(stderr, "#CPU %d, #GPU %d, device: %d\n", numCPU, numGPU, gpid);
|
||||
// fprintf(stderr, "device: %d\n" gpid);
|
||||
// fprintf(stderr, "device: ");
|
||||
// for(int i=0; i<numGPU; i++) fprintf(stderr, " %d", devid[i]);
|
||||
// fprintf(stderr, "\n");
|
||||
fprintf(stderr, "***********************\n");
|
||||
|
||||
#if 1
|
||||
cudaFuncSetCacheConfig(kernel_jp_scatter, cudaFuncCachePreferL1);
|
||||
cudaFuncSetCacheConfig(kernel_predict, cudaFuncCachePreferShared);
|
||||
cudaFuncSetCacheConfig(kernel_gravity, cudaFuncCachePreferL1);
|
||||
cudaFuncSetCacheConfig(kernel_reduce, cudaFuncCachePreferShared);
|
||||
cudaFuncSetCacheConfig(kernel_gather_nb, cudaFuncCachePreferL1);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
||||
#include "yebisu_g6.h"
|
||||
extern "C"{
|
||||
|
||||
void yebisu_g6_open (const int gpid)
|
||||
{
|
||||
// lib_initialize();
|
||||
lib_initialize(gpid);
|
||||
resource[gpid].open();
|
||||
}
|
||||
|
||||
void yebisu_g6_close(const int gpid)
|
||||
{
|
||||
// lib_initialize();
|
||||
lib_initialize(gpid);
|
||||
resource[gpid].close();
|
||||
}
|
||||
|
||||
void yebisu_g6_set_ip(
|
||||
const int gpid,
|
||||
const int ni,
|
||||
const double pos[][3],
|
||||
const double vel[][3],
|
||||
const double eps2[],
|
||||
const double h2 [],
|
||||
const int id [])
|
||||
{
|
||||
// lib_initialize();
|
||||
lib_initialize(gpid);
|
||||
resource[gpid].set_ip(ni, pos, vel, eps2, h2, id);
|
||||
}
|
||||
|
||||
void yebisu_g6_push_jp(
|
||||
const int gpid,
|
||||
const double pos [3],
|
||||
const double vel [3],
|
||||
const double acc2[3],
|
||||
const double jrk6[6],
|
||||
const double mass,
|
||||
const double tj,
|
||||
const int id,
|
||||
const int addr)
|
||||
{
|
||||
// lib_initialize();
|
||||
lib_initialize(gpid);
|
||||
resource[gpid].push_jp( pos, vel, acc2, jrk6, mass, tj, id, addr);
|
||||
}
|
||||
|
||||
|
||||
void yebisu_g6_transfer_jp(const int gpid)
|
||||
{
|
||||
// lib_initialize();
|
||||
lib_initialize(gpid);
|
||||
resource[gpid].transter_jp();
|
||||
}
|
||||
|
||||
|
||||
void yebisu_g6_set_ti(
|
||||
const int gpid,
|
||||
const double ti)
|
||||
{
|
||||
// lib_initialize();
|
||||
lib_initialize(gpid);
|
||||
resource[gpid].set_ti(ti);
|
||||
}
|
||||
|
||||
|
||||
void yebisu_g6_predict_all(
|
||||
const int gpid,
|
||||
const int nj)
|
||||
{
|
||||
// lib_initialize();
|
||||
lib_initialize(gpid);
|
||||
resource[gpid].predict_all(nj);
|
||||
}
|
||||
|
||||
|
||||
void yebisu_g6_launch_gravity(
|
||||
const int gpid,
|
||||
const int ni,
|
||||
const int nj,
|
||||
const int with_neib)
|
||||
{
|
||||
// lib_initialize();
|
||||
lib_initialize(gpid);
|
||||
resource[gpid].launch_gravity(ni, nj, bool(with_neib));
|
||||
}
|
||||
|
||||
|
||||
void yebisu_g6_get_force(
|
||||
const int gpid,
|
||||
const int ni,
|
||||
_out_ double acc [][3],
|
||||
_out_ double jrk [][3],
|
||||
_out_ double pot [],
|
||||
_out_ int nnb_id[])
|
||||
{
|
||||
// lib_initialize();
|
||||
lib_initialize(gpid);
|
||||
resource[gpid].get_force(ni, acc, jrk, pot, nnb_id);
|
||||
}
|
||||
|
||||
|
||||
void yebisu_g6_receive_neighbor_list(const int gpid)
|
||||
{
|
||||
// lib_initialize();
|
||||
lib_initialize(gpid);
|
||||
resource[gpid].receive_neighbor_list();
|
||||
}
|
||||
|
||||
|
||||
void yebisu_g6_get_neighbor_list(
|
||||
const int gpid,
|
||||
const int ipipe,
|
||||
const int maxlen,
|
||||
_out_ int *num_neib,
|
||||
_out_ int list[])
|
||||
{
|
||||
// lib_initialize();
|
||||
lib_initialize(gpid);
|
||||
resource[gpid].get_neighbor_list(ipipe, maxlen, num_neib, list);
|
||||
}
|
||||
|
||||
|
||||
void yebisu_g6_DEBUG_read_pred(
|
||||
const int gpid,
|
||||
const int nj,
|
||||
const int addr,
|
||||
_out_ double pos [3],
|
||||
_out_ double vel [3],
|
||||
_out_ double mass[1],
|
||||
_out_ int id [1])
|
||||
{
|
||||
resource[gpid].DEBUG_read_pred(nj, addr, pos, vel, mass, id);
|
||||
}
|
||||
|
||||
int yebisu_g6_get_nimax()
|
||||
{
|
||||
return NIMAX;
|
||||
}
|
||||
|
||||
int yebisu_g6_get_njmax()
|
||||
{
|
||||
return NBODY_MAX;
|
||||
}
|
||||
|
||||
}
|
||||
61
yebisu_g6.h
Normal file
61
yebisu_g6.h
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
|
||||
extern "C"
|
||||
{
|
||||
void yebisu_g6_open (const int gpid);
|
||||
void yebisu_g6_close(const int gpid);
|
||||
void yebisu_g6_set_ip(
|
||||
const int gpid,
|
||||
const int ni,
|
||||
const double pos[][3],
|
||||
const double vel[][3],
|
||||
const double eps2[],
|
||||
const double h2 [],
|
||||
const int id []);
|
||||
void yebisu_g6_push_jp(
|
||||
const int gpid,
|
||||
const double pos [3],
|
||||
const double vel [3],
|
||||
const double acc2[3],
|
||||
const double jrk6[6],
|
||||
const double mass,
|
||||
const double tj,
|
||||
const int id,
|
||||
const int addr);
|
||||
void yebisu_g6_transfer_jp(const int gpid);
|
||||
void yebisu_g6_set_ti(
|
||||
const int gpid,
|
||||
const double ti);
|
||||
void yebisu_g6_predict_all(
|
||||
const int gpid,
|
||||
const int nj);
|
||||
void yebisu_g6_launch_gravity(
|
||||
const int gpid,
|
||||
const int ni,
|
||||
const int nj,
|
||||
const int with_neib);
|
||||
void yebisu_g6_get_force(
|
||||
const int gpid,
|
||||
const int ni,
|
||||
double acc [][3],
|
||||
double jrk [][3],
|
||||
double pot [],
|
||||
int nnb_id[]);
|
||||
void yebisu_g6_receive_neighbor_list(const int gpid);
|
||||
void yebisu_g6_get_neighbor_list(
|
||||
const int gpid,
|
||||
const int ipipe,
|
||||
const int maxlen,
|
||||
int *num_neib,
|
||||
int list[]);
|
||||
|
||||
void yebisu_g6_DEBUG_read_pred(
|
||||
const int gpid,
|
||||
const int nj,
|
||||
const int addr,
|
||||
double pos [3],
|
||||
double vel [3],
|
||||
double mass[1],
|
||||
int id [1]);
|
||||
int yebisu_g6_get_nimax();
|
||||
int yebisu_g6_get_njmax();
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue