From 604f1cb18abfc8e72d858d8922fa5176c6dba636 Mon Sep 17 00:00:00 2001 From: Yohai Date: Sun, 25 Aug 2019 16:12:39 +0800 Subject: [PATCH] first commit --- Makefile | 30 ++ cuda_pointer.h | 54 ++++ g6util.h | 90 ++++++ gpu.h | 80 ++++++ grape6.cpp | 173 +++++++++++ grape6.h | 90 ++++++ particle.h | 343 ++++++++++++++++++++++ yebisu_g6.cu | 762 +++++++++++++++++++++++++++++++++++++++++++++++++ yebisu_g6.h | 61 ++++ 9 files changed, 1683 insertions(+) create mode 100644 Makefile create mode 100644 cuda_pointer.h create mode 100644 g6util.h create mode 100644 gpu.h create mode 100644 grape6.cpp create mode 100644 grape6.h create mode 100644 particle.h create mode 100644 yebisu_g6.cu create mode 100644 yebisu_g6.h diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..2c33d0b --- /dev/null +++ b/Makefile @@ -0,0 +1,30 @@ +ETICS_LMAX ?= 2 +ETICS_NMAX ?= 10 +GPUARCH ?= sm_75 +CUDAHOME ?= /usr/local/cuda +OPTIMIZATION ?= 3 + +CXX ?= g++ +NVCC ?= $(CUDAHOME)/bin/nvcc +CXXFLAGS += -O$(OPTIMIZATION) + +all : libyebisug6.a libyebisug6.nb.a + +yebisu_g6.o : yebisu_g6.cu yebisu_g6.h particle.h + $(NVCC) -arch=$(GPUARCH) $(CUDAFLAGS) -Xcompiler "$(CXXFLAGS)" -I$(CUDAHOME)/include -I$(CUDAHOME)/samples/common/inc -c yebisu_g6.cu + +grape6.o : grape6.cpp + $(CXX) $(CXXFLAGS) -c -DNB_FLAG=0 -o $@ $< + +grape6.nb.o : grape6.cpp + $(CXX) $(CXXFLAGS) -c -DNB_FLAG=1 -o $@ $< + +libyebisug6.a : yebisu_g6.o grape6.o + ar -r $@ $^ + ranlib $@ + +libyebisug6.nb.a : yebisu_g6.o grape6.nb.o + ar -r $@ $^ + ranlib $@ +clean: + rm -f *.o *.a diff --git a/cuda_pointer.h b/cuda_pointer.h new file mode 100644 index 0000000..953169b --- /dev/null +++ b/cuda_pointer.h @@ -0,0 +1,54 @@ +#include +// #include +#include + + +template +struct cudaPointer{ + T *dev_pointer; + T *host_pointer; + int size; + cudaPointer(){ + dev_pointer = NULL; + host_pointer = NULL; + size = 0; + } + ~cudaPointer(){ + // free(); + } + void allocate(int _size){ + size = _size; + void *p; + checkCudaErrors(cudaMalloc(&p, size * sizeof(T))); + assert(p); + dev_pointer = (T*)p; + checkCudaErrors(cudaMallocHost(&p, size * sizeof(T))); + assert(p); + host_pointer = (T*)p; + } + void free(){ + checkCudaErrors(cudaFree(dev_pointer)); + checkCudaErrors(cudaFreeHost(host_pointer)); + dev_pointer = NULL; + host_pointer = NULL; + size = 0; + } + void htod(int count){ + checkCudaErrors(cudaMemcpy(dev_pointer, host_pointer, count * sizeof(T), cudaMemcpyHostToDevice)); + } + void htod(){ + this->htod(size); + } + void dtoh(int count){ + checkCudaErrors(cudaMemcpy(host_pointer, dev_pointer, count * sizeof(T), cudaMemcpyDeviceToHost)); + } + void dtoh(){ + this->dtoh(size); + } + T &operator [] (int i){ + return host_pointer[i]; + } + operator T* (){ + return dev_pointer; + } +}; diff --git a/g6util.h b/g6util.h new file mode 100644 index 0000000..1b61148 --- /dev/null +++ b/g6util.h @@ -0,0 +1,90 @@ +#ifndef G6UTIL_H +#define G6UTIL_H + +// #include "typedef.h" + +/* constant definitions */ +#if !defined(__LANGUAGE_FORTRAN__) && !defined(_LANGUAGE_FORTRAN) + +#if defined(__cplusplus) +extern "C" +{ +#endif + /* C interface */ + + /* + * standard functions. + * the number of the cards is hidden to the user. + */ + void g6_open_all(void); + void g6_close_all(void); + int g6_set_j_particle_all(int address, int index, double tj, double dtj, double mass, + double a2by18[3], double a1by6[3], double aby2[3], double v[3], double x[3]); + int g6_set_j_particle_mxonly_all(int address, int index, double mass, double x[3]); + void g6_set_ti_all(double ti); + void g6calc_firsthalf_all(int nj, int ni, int index[], double xi[][3], double vi[][3], + double fold[][3], double jold[][3], double phiold[], double eps2, double h2[]); + void g6calc_firsthalf0_all(int nj, int ni, int index[], double xi[][3], double vi[][3], + double fold[][3], double jold[][3], double phiold[], double *eps2, double h2[], int mode); + int g6calc_lasthalf_all(int nj, int ni, int index[], double xi[][3], double vi[][3], + double eps2, double h2[], double acc[][3], double jerk[][3], double pot[]); + int g6calc_lasthalf0_all(int nj, int ni, int index[], double xi[][3], double vi[][3], + double *eps2, double h2[], double acc[][3], double jerk[][3], double pot[], int mode); + int g6calc_lasthalf2_all(int nj, int ni, int index[], double xi[][3], double vi[][3], + double eps2, double h2[], double acc[][3], double jerk[][3], double pot[], int nnbindex[]); + int g6_read_neighbour_list_all(void); + int g6_get_neighbour_list_all(int ipipe, int maxlength, int *nblen, int nbl[]); + void g6_set_nip_all(int nip); + void g6_set_njp_all(int njp); + void g6_set_i_particle_scales_from_real_value_all(int address, double acc[3], double jerk[3], double phi, + double jfactor, double ffactor); + void g6_set_i_particle_all(int address, int index, double x[3], double v[3], double eps2, double h2); + int g6_get_force_all(double acc[][3], double jerk[][3], double phi[], int flag[]); + int g6_get_force_etc_all(double acc[][3], double jerk[][3], double phi[], int nnbindex[], int flag[]); + void g6_get_predicted_j_particles_all(int nj, double (*x)[3], double (*v)[3]); + int g6_getnjmax_all(void); + + /* + * primitive functions to control multiple cards individually. + * the user needs to specify card's device id explicitly. + */ + void g6_open(int clusterid); + void g6_close(int clusterid); + void g6_set_tunit(int newtunit); + void g6_set_xunit(int newxunit); + int g6_set_j_particle(int clusterid, int address, int index, double tj, double dtj, double mass, + double a2by18[3], double a1by6[3], double aby2[3], double v[3], double x[3]); + int g6_set_j_particle_mxonly(int clusterid, int address, int index, double mass, double x[3]); + void g6_set_ti(int clusterid, double ti); + void g6calc_firsthalf(int clusterid, int nj, int ni, int index[], double xi[][3], double vi[][3], + double fold[][3], double jold[][3], double phiold[], double eps2, double h2[]); + void g6calc_firsthalf0(int clusterid, int nj, int ni, int index[], double xi[][3], double vi[][3], + double fold[][3], double jold[][3], double phiold[], double *eps2, double h2[], int mode); + int g6calc_lasthalf(int clusterid, int nj, int ni, int index[], double xi[][3], double vi[][3], + double eps2, double h2[], double acc[][3], double jerk[][3], double pot[]); + int g6calc_lasthalf0(int clusterid, int nj, int ni, int index[], double xi[][3], double vi[][3], + double *eps2, double h2[], double acc[][3], double jerk[][3], double pot[], int mode); + int g6calc_lasthalf2(int clusterid, int nj, int ni, int index[], double xi[][3], double vi[][3], + double eps2, double h2[], double acc[][3], double jerk[][3], double pot[], int nnbindex[]); + int g6_read_neighbour_list(int clusterid); + int g6_get_neighbour_list(int clusterid, int ipipe, int maxlength, int *nblen, int nbl[]); + void g6_set_neighbour_list_sort_mode(int mode); + int g6_get_neighbour_list_sort_mode(void); + int g6_npipes(void); + void g6_set_nip(int clusterid, int nip); + void g6_set_njp(int clusterid, int njp); + void g6_set_i_particle_scales_from_real_value(int clusterid, int address, double acc[3], double jerk[3], double phi, + double jfactor, double ffactor); + void g6_set_i_particle(int clusterid, int address, int index, double x[3], double v[3], double eps2, double h2); + int g6_get_force(int clusterid, double acc[][3], double jerk[][3], double phi[], int flag[]); + int g6_get_force_etc(int clusterid, double acc[][3], double jerk[][3], double phi[], int nnbindex[], int flag[]); + void g6_get_predicted_j_particles(int clusterid, int nj, double x[][3], double v[][3]); + int g6_getnjmax(int clusterid); + +#if defined(__cplusplus) +} +#endif + +#endif /* LANGUAGE_FORTRAN */ + +#endif /* G6UTIL_H */ diff --git a/gpu.h b/gpu.h new file mode 100644 index 0000000..e1ae350 --- /dev/null +++ b/gpu.h @@ -0,0 +1,80 @@ +//#define GPUTYPE 8800 // MP=16 +//#define NJBL_value 32 + +//#define GPUTYPE C1060 // MP=30 +//#define NJBL_value 30 + +//#define GPUTYPE M2070 // MP=14 +//#define NJBL_value 28 + +//#define GPUTYPE GF460 // MP=7 +//#define NJBL_value 14 + +//#define GPUTYPE GF470 // MP=14 +//#define NJBL_value 28 + +//#define GPUTYPE GF480 // MP=15 +//#define NJBL_value 30 + +//#define GPUTYPE GF570 // MP=15 +//#define NJBL_value 30 + +#define GPUTYPE K20m // MP=13 +#define NJBL_value 26 + +//#define GPUTYPE GF660 // MP=5 +//#define NJBL_value 10 + +//#define GPUTYPE GF780 // MP=15 +//#define NJBL_value 30 + +//#define GPUTYPE GF1080 // MP=20 +//#define NJBL_value 20 + +//#define GPUTYPE GFTITANX // MP=24 +//#define NJBL_value 24 + + +// NJBLOCKS = 1 or 2 x MP +// NXREDUCE is a power of 2 and >= NJBLOCKS ~32 is good for all +// NYREDUCE is a power of 2 < NXREDUCE ~4 is good for all + +enum{ + SM_VER = 35, + NTHREADS = 128, + NTHSCAT = 64, + NIBLOCKS = 16, + NJPSHRE = 32, + NIMAX = (NTHREADS * NIBLOCKS), + NJBLOCKS = NJBL_value, + NXREDUCE = 32, + NYREDUCE = 4, +}; + + +/* +---------------------------------------------------------------------------- +type MP x Cores x GPU clock rate SP Gflop/s DP Gflop/s + per MP (GHz) (nbody n=200k) (nbody n=200k) + +8800 GT 14 x 8 x 1.62 = 181.44 - +8800 512 16 x 8 x 1.80 = 230.40 298.266 - +9800 GTX 16 x 8 x 1.85 = 236.80 331.754 - +250 GTS 16 x 8 x 1.73 = 221.44 + +C1060 30 x 8 x 1.30 = 312.00 450.861 55.412 + +M2070 14 x 32 x 1.15 = 515.20 546.416 250.454 + +460 v2 7 x 48 x 1.56 = 524.16 421.335 67.939 +470 14 x 32 x 1.22 = 546.56 578.572 105.796 +480 15 x 32 x 1.40 = 672.00 705.514 145.502 +570 15 x 32 x 1.46 = 700.80 736.882 151.960 + +670 7 x 192 x 0.71 = 954.24 1151.030 100.092 +680 8 x 192 x 0.71 = 1090.56 1187.710 101.211 +K20m 13 x 192 x 0.71 = 1772.16 1322.374 576.523 +TITAN 14 x 192 x 0.88 = 2365.44 2026.287 751.671 +780 Ti 15 x 192 x 0.93 = 2678.40 2691.603 198.987 +---------------------------------------------------------------------------- +*/ \ No newline at end of file diff --git a/grape6.cpp b/grape6.cpp new file mode 100644 index 0000000..1a0f2c4 --- /dev/null +++ b/grape6.cpp @@ -0,0 +1,173 @@ +#include +#include + +#include "yebisu_g6.h" +#include "g6util.h" + +#define NIMAX 2048 +#define MAXDEV 4 + +#ifndef NB_FLAG +#define NB_FLAG 0 +#endif + +extern "C" +{ + static int sort_mode = 0; + static double eps2_buf[MAXDEV][NIMAX]; + static int nnb_buf[MAXDEV][NIMAX]; + + void g6_open(int clusterid){ + assert(clusterid < MAXDEV); + assert(NIMAX >= g6_npipes()); + yebisu_g6_open(clusterid); + } + void g6_close(int clusterid){ + yebisu_g6_close(clusterid); + } + void g6_set_tunit(int newtunit){} + void g6_set_xunit(int newxunit){} + int g6_set_j_particle( + int clusterid, + int address, + int index, + double tj, + double dtj, + double mass, + double a2by18[3], + double a1by6[3], + double aby2[3], + double v[3], + double x[3]) + { + yebisu_g6_push_jp(clusterid, x, v, aby2, a1by6, mass, tj, index, address); + return 0; + } + void g6_set_ti(int clusterid, double ti){ + yebisu_g6_set_ti(clusterid, ti); + } + void g6calc_firsthalf0( + int clusterid, + int nj, + int ni, + int index[], + double xi[][3], + double vi[][3], + double fold[][3], + double jold[][3], + double phiold[], + double *eps2, + double h2[], + int mode) + { + double *eps2_ptr = eps2; + if(mode){ // constand eps2 + eps2_ptr = eps2_buf[clusterid]; + for(int i=0; i