From 1718e6287bee102f3f46f868e405811c87c18887 Mon Sep 17 00:00:00 2001 From: Yohai Date: Sun, 25 Aug 2019 16:08:44 +0800 Subject: [PATCH] added project files --- Makefile | 49 +++++ README.md | 102 +++++++++ WORKPLAN.md | 26 +++ __init__.py | 1 + format-convert.py | 126 +++++++++++ interface.cu | 378 ++++++++++++++++++++++++++++++++ interface.py | 160 ++++++++++++++ src/Makefile | 72 ++++++ src/common.hpp | 117 ++++++++++ src/file.ini | 33 +++ src/ic.cpp | 269 +++++++++++++++++++++++ src/ic.hpp | 7 + src/integrate.cu | 127 +++++++++++ src/integrate.hpp | 27 +++ src/io.cpp | 367 +++++++++++++++++++++++++++++++ src/io.hpp | 19 ++ src/main.cu | 347 +++++++++++++++++++++++++++++ src/mathaux.cu | 57 +++++ src/mathaux.hpp | 30 +++ src/multicenter.cu | 131 +++++++++++ src/multicenter.hpp | 6 + src/noboost.inc | 15 ++ src/scf.cu | 518 ++++++++++++++++++++++++++++++++++++++++++++ src/scf.hpp | 61 ++++++ 24 files changed, 3045 insertions(+) create mode 100644 Makefile create mode 100644 README.md create mode 100644 WORKPLAN.md create mode 100644 __init__.py create mode 100644 format-convert.py create mode 100644 interface.cu create mode 100644 interface.py create mode 100644 src/Makefile create mode 100644 src/common.hpp create mode 100644 src/file.ini create mode 100644 src/ic.cpp create mode 100644 src/ic.hpp create mode 100644 src/integrate.cu create mode 100644 src/integrate.hpp create mode 100644 src/io.cpp create mode 100644 src/io.hpp create mode 100644 src/main.cu create mode 100644 src/mathaux.cu create mode 100644 src/mathaux.hpp create mode 100644 src/multicenter.cu create mode 100644 src/multicenter.hpp create mode 100644 src/noboost.inc create mode 100644 src/scf.cu create mode 100644 src/scf.hpp diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..fe49095 --- /dev/null +++ b/Makefile @@ -0,0 +1,49 @@ +# standard amuse configuration include +# config.mk will be made after ./configure has run +AMUSE_DIR?=../../../.. +-include $(AMUSE_DIR)/config.mk +GPUARCH ?= 60 +CUDAFLAGS += -arch=sm_$(GPUARCH) +######### ugly!!!! should be in the amuse base make file + + +MPICXX ?= mpicxx + +CFLAGS += -Wall -g +CXXFLAGS += $(CFLAGS) +LDFLAGS += -lm $(MUSE_LD_FLAGS) + +OBJS = interface.o + +CODELIB = src/libetics.a + +CODE_GENERATOR = $(AMUSE_DIR)/build.py + +all: etics_worker + +clean: + $(RM) *.so *.o *.pyc worker_code.cc worker_code.h + $(RM) *~ etics_worker worker_code.cc + make -C src clean + +$(CODELIB): + make -C src libetics.a + +worker_code.cc: interface.py + $(CODE_GENERATOR) --type=c interface.py EticsInterface -o $@ + +worker_code.h: interface.py + $(CODE_GENERATOR) --type=H interface.py EticsInterface -o $@ + +etics_worker: worker_code.cc worker_code.h $(CODELIB) $(OBJS) + $(MPICXX) $(CXXFLAGS) -L/usr/local/cuda/lib64 $< $(OBJS) $(CODELIB) -o $@ -lcudart + echo the above compilation directive is not nice + +.SUFFIXES: .cu .o + +.cu.o: $< + $(NVCC) -Xcompiler="$(CXXFLAGS)" -c -o $@ $< + + +# .cc.o: $< +# $(CXX) $(CXXFLAGS) -c -o $@ $< diff --git a/README.md b/README.md new file mode 100644 index 0000000..6475b1f --- /dev/null +++ b/README.md @@ -0,0 +1,102 @@ +ETICS +===== + +This is ETICS (Expansion Techniques In Collisionless Systems), a GPU (currently +**CUDA only**) N-body code which uses series expansion to calculate the +gravitational field. See more details in this publication: + +Meiron, Y., Li, B., Holley-Bockelmann, K., & Spurzem, R. 2014, ApJ, 792, 98 + +http://adsabs.harvard.edu/abs/2014ApJ...792...98M + + +What's inside +------------- + +- ETICS standalone program +- ETICS static library +- ETICS module for AMUSE + + +Prerequisites +------------- + +- CUDA (>= 6; mandatory) +- HDF5 (optional) +- Boost (optional) +- AMUSE (mandatory only for AMUSE module) + +To disable HDF5 [insert explanation here] +To disable Boost [insert explanation here] + + +Compilation +----------- + +### Standalone program + + make standalone + +builds in `src/`. + + +### Static library + + make library + +builds in `src/`. + + +### AMUSE module + + make + +builds in top level directory. The whole `etics` directory has to be placed (or +linked) inside: + + $AMUSE_DIR/src/amuse/community/ + + +How to use +---------- + +The `file.ini` is a self-explanatory input file; if compiled without Boost, fill +in the relevant variables in file `noboost.inc` which is compiled into the +executable (any change of parameters requires re-compilation). Start simulation +with: + + ./etics file.ini + +Any input file name is acceptable. + + +Known issues +------------ + +* No MEX + +The MEX (Multipole Expansion) method is not available in this version; thus, the +SCF (Self-Consistent Field) method is the only expansion technique available. +The ETICS program has been heavily restructured and the MEX routines are no +longer compatible. Hopefully this will be fixed. + +* Hardcoded launch configuration + +For at least one of the CUDA kernels, for various reasons, it seems that "brute +force" search is needed to find the optimal launch configuration. Currently it +is hardcoded, and a primitive search routine is provided. + +* Problem for particles with |θ| << 1 + +Due to using an unstable recursion relation to calculate the Associated Legendre +polynomials, particles in a narrow cone around the z-axis cannot be considered +accurately. This means that they give an erroneous contribution to the +gravitational field and also are assigned erroneous force and potential. The +size of this cone increases with the angular term of the expansion. To partly +solve this, the current (ugly) fix is to only consider particles with cos(θ) > +0.999 at the monopole level. This is not so bad because the monopole is always +the most dominant term (and is error free) and the number of particles in this +cone is small and they are transient (i.e. they come out of it usually in a +small number of steps). A somewhat better solution is to make this arbitrary +cutoff of 0.999 l-dependent, and an even better solution would be to use an +asymptotic expression for the Associated Legendre polynomials. diff --git a/WORKPLAN.md b/WORKPLAN.md new file mode 100644 index 0000000..591474c --- /dev/null +++ b/WORKPLAN.md @@ -0,0 +1,26 @@ +The idea is to be able to handle mergers and sinking galactic satellites by +having each subsystem centered on one (massless?) particle, and have the +expansion calculated separately in each center of mass system (only for +particles belonging to it). After the internal forces are calculated for each +subsystem, the mutual forces are calculated as well, and all the particles are +integrated in a global coordinate system. + +Technically, most of the SCF module has to become a class. Some of the arrays +like RadCoeff and AngCoeff (anything else?) should remain global and would be +initialized by some global initialization routine. Then each subsystem would be +a different instance of the SCF class, with its own list of coefficients and +particle cache. It seems that all subsystems can (should?) share the integrator +object. Maybe (probably?) the integrator should be agnostic to how particles are +divided into subsystems, and the call to CalculateGravity() would take care of +everything. In this case we are talking about a class for the current SCF +routines but also some wrapper that knows how to CalculateGravity() by calling +the appropriate methods for each subsystem's object, and also work out the +mutual gravity. Need a good name for this intermediate level between the +integrator and the SCF core. + +This intermediate level or even a different class would deal how to associate +particles to the different subsystems and how to figure out the initial +conditions. Also, the central particles might need some special treatment +(possibly the integrator considers them normal particles but this other level +knows not to use them for force calculation; I should think whether those should +be omitted from output, but it might be helpful to keep them). diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..abe3ba8 --- /dev/null +++ b/__init__.py @@ -0,0 +1 @@ +# generated file \ No newline at end of file diff --git a/format-convert.py b/format-convert.py new file mode 100644 index 0000000..bab62eb --- /dev/null +++ b/format-convert.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python +# This script converts phiGRAPE snapshot files to HDF5. +# Author: Yohai Meiron + +import argparse, os, sys, numpy, re + +# Set commandline parameters +parser = argparse.ArgumentParser(description='Convert phiGRPAE snapshot files into HDF5 (h5part) format.') +parser.add_argument('InputFile', metavar='InputFile', type=str, help='first input file (numbers in the file name must represet numerical order)') +parser.add_argument('--num', type=int, help='number of snapshot to convert (by default, all are converted)') +parser.add_argument('-o', '--output', default='output', help='output file name (or pattern)') +parser.add_argument('--particles', type=int, help='number of particles (if not the whole thing)') +parser.add_argument('--maxfs', type=float, help='maximum size in MB of a single HDF5 output') +parser.add_argument('--single', action='store_true', help='use single precision instead of double') +parser.add_argument('--bh', type=int, nargs='?', const=1, default=0, help='number of black holes to grab from the end of the file') + +# Print message and parse input +print 'Welcome to Yohai\'s format converter! (phiGRAPE ---> HDF5)\n' +args = parser.parse_args() + +# Parameter conflict tests +if (args.bh > 0) and (args.particles==None): + parser.print_usage() + print '%s: error: cannot --bh without --particles' % sys.argv[0] + sys.exit(2) +Nbh = args.bh + +# Verify h5py +try: + import h5py +except: + print '%s: error: booooo, can\'t find h5py.' % sys.argv[0] + sys.exit(1) + +# Check if input file exists, separate name from path and get file name pattern +if not os.path.exists(args.InputFile): + print 'File `%s` not found!' % args.InputFile + sys.exit(1) +InputFileName = os.path.basename(args.InputFile) +InputFilePath = os.path.dirname(os.path.realpath(args.InputFile)) +Pattern = re.sub(r'\d', r'\d', InputFileName) +Pattern = '^' + Pattern + "$" +r = re.compile(Pattern) + +# Look for the other files +ListDir = os.listdir(InputFilePath) +FileList = [] +for Filename in ListDir: + m = r.search(Filename) + if not m is None: + FileList.append(Filename) +FileList.sort() +FileList = FileList[FileList.index(InputFileName):] + +# Verify that enough files were found; shorten list if needed +if (not args.num==None) and args.num > len(FileList): + print 'Requested to convert %d files, but only %d found!' % (args.num, len(FileList)) + sys.exit(1) +if (not args.num==None): num = args.num +else: num = len(FileList) +FileList = FileList[:num] + +print 'Will convert %d files in the directory `%s`.' % (num, InputFilePath) + +NumperOfStepsPerFile = 2147483647 +OutputFileNum = 2 +StepCounter = 0 +FirstTime = True + +for Filename in FileList: + if StepCounter > NumperOfStepsPerFile: + OutputFile.close() + OutputFile = h5py.File('output(%d).h5part' % OutputFileNum, 'w') + StepCounter = 0 + OutputFileNum += 1 + FullFilename = os.path.join(InputFilePath, Filename) + with open(FullFilename, 'r') as File: + Line = File.readline() + try: + StepNumer = int(Line) + except: + print 'Error to read snapshot number (line 1 in file `%s`)' % Filename + sys.exit(1) + File.readline() + Line = File.readline() + try: + Time = numpy.double(Line) + except: + print 'Error to read snapshot time (line 3 in file `%s`)' % Filename + sys.exit(1) + Data = numpy.loadtxt(FullFilename, skiprows=3) + if (not args.particles==None) and (args.particles > Data.shape[0]): + print 'Requested to use %d paticles, but file %s has only %d!' % (args.particles, Filename, Data.shape[0]) + sys.exit(1) + if (not args.particles==None): N = args.particles + else: N = Data.shape[0] + if FirstTime: + if Nbh == 0: print 'Reading %d particles.' % (N) + else: print 'Reading %d normal particles + %d black hole(s).' % (N, Nbh) + if args.single: EstimatedDataSize = 32*N/1024.0**2 + else: EstimatedDataSize = 60*N/1024.0**2 + if not args.maxfs==None: NumperOfStepsPerFile = (int)(args.maxfs/EstimatedDataSize) + FirstTime = False + if NumperOfStepsPerFile > num: OutputFile = h5py.File('output.h5part', 'w') + else: OutputFile = h5py.File('output(1).h5part', 'w') + + DataBH = Data[-Nbh-1:-1] + Data = Data[:N] + Data = numpy.vstack((Data, DataBH)) + Group = OutputFile.create_group('Step#%d' % (StepNumer)) + Group.attrs['Time'] = Time + Group.attrs['TotalN'] = N + Nbh + if args.single: T='f' + else: T='d' + DataSet = Group.create_dataset('ID', (N+Nbh,), dtype='i'); DataSet[...] = Data[:,0] + DataSet = Group.create_dataset('Mass', (N+Nbh,), dtype=T); DataSet[...] = Data[:,1] + DataSet = Group.create_dataset('X', (N+Nbh,), dtype=T); DataSet[...] = Data[:,2] + DataSet = Group.create_dataset('Y', (N+Nbh,), dtype=T); DataSet[...] = Data[:,3] + DataSet = Group.create_dataset('Z', (N+Nbh,), dtype=T); DataSet[...] = Data[:,4] + DataSet = Group.create_dataset('VX', (N+Nbh,), dtype=T); DataSet[...] = Data[:,5] + DataSet = Group.create_dataset('VY', (N+Nbh,), dtype=T); DataSet[...] = Data[:,6] + DataSet = Group.create_dataset('VZ', (N+Nbh,), dtype=T); DataSet[...] = Data[:,7] + OutputFile.flush() + print 'Completed /Step#%d' % (StepNumer) + StepCounter += 1 +OutputFile.close() diff --git a/interface.cu b/interface.cu new file mode 100644 index 0000000..203ce2a --- /dev/null +++ b/interface.cu @@ -0,0 +1,378 @@ +#include + +#include "src/common.hpp" +#include "src/scf.hpp" +#include "src/integrate.hpp" +#include +#include +#include + +using namespace std; +using namespace etics; + +Integrator IntegratorObj; + +// GLOBAL VARIABLES +Real ConstantStep = 0.001953125; +Real T, Step, dT1, dT2, Tcrit; +int N; +extern Real mass; +extern int k3gs, k3bs, k4gs, k4bs; + +/*extern*/ Particle *hostP; +thrust::host_vector PPP; +/*extern*/ thrust::device_vector PPPPP; + +// /*extern*/ thrust::device_vector F0xxxxxx; +// /*extern*/ thrust::device_vector PotPotPot; // ugly name +// /*extern*/ thrust::device_vector F1xxxxxx; +/*extern*/ vec3 *F1_ptr; + + + +// extern Particle *P_h; +// extern thrust::device_vector P; +// +// extern thrust::device_vector F0; +// extern thrust::device_vector Potential; +// extern thrust::device_vector F1; + + +void CommitParticles(); +// void InitSCF(int N); +// void ForceSCF(int N, Real *Potential, Particle *PPPPP, vec3 *F); +void DriftStep(); +void KickStep(); +void CommitForces(); +int InitilizeIntegratorMemory(); +#define PTR(x) (thrust::raw_pointer_cast((x).data())) + + +int initialize_code() { +#warning initscf should be here!!! just the problem is that N is requires, so fix it + return 0; +} + +int recommit_parameters() { + return 0; +} + +int commit_parameters() { + return 0; +} + +int new_particle(int *id, double mass, double x, double y, double z, double vx, double vy, double vz, double radius) { + Particle p; + p.m = mass; + p.pos = vec3(x, y, z); + p.vel = vec3(vx, vy, vz); + PPP.push_back(p); + *id = N; + N++; + return 0; +} + +int commit_particles() { +// cerr << "calling commit_particles" << endl; + cerr << "we did commit_particles()" << endl; + etics::scf::Init(N, 180, 64, 2605, 384); +#warning hardcoded launch configuration + IntegratorObj = Integrator(&PPP[0], N); + return 0; +} + +struct CenterMassFunctor { + Real ConstantMass; + __host__ __device__ CenterMassFunctor(Real _ConstantMass) : ConstantMass(_ConstantMass) {} + __host__ __device__ vec3 operator() (const Particle &p) const {return p.pos;} +}; + +struct ShiftFunctor { + vec3 Shift; + __host__ __device__ ShiftFunctor(vec3 _Shift) : Shift(_Shift) {} + __host__ __device__ Particle operator() (Particle &p) const { + p.pos += Shift; + p.CalculateR2(); + return p; + } +}; + +int counttt = 0; +bool FirstStep = true; +int evolve_model(double t) { +// PPPPP = PPP; + cerr << "call evolve_model t_end = " << t << " dt = " << t - T << "****************" << endl; + +// vec3 CenterMass = thrust::transform_reduce(PPPPP.begin(), PPPPP.end(), CenterMassFunctor(mass), vec3(0,0,0), thrust::plus()); +// CenterMass = CenterMass * (1.0/N); //ugly should divide by the total mass +// cerr << "CENTER OF MASS " << CenterMass.x << endl; +// +// // thrust::transform(PPPPP.begin(), PPPPP.end(), PPPPP.begin(), ShiftFunctor(-CenterMass)); +// +// vec3 CenterMass2 = thrust::transform_reduce(PPPPP.begin(), PPPPP.end(), CenterMassFunctor(mass), vec3(0,0,0), thrust::plus()); +// CenterMass2 = CenterMass2 * (1.0/N); //ugly should divide by the total mass +// cerr << "CENTER OF MASS after correction " << CenterMass2.x << endl; +// + + Step = ConstantStep; + while (T <= t) { + // Take the drift step. + IntegratorObj.DriftStep(Step); + + // Calculate the forces in the new positions. +// ForceSCF(N, PTR(PotPotPot), PTR(PPPPP), PTR(F1xxxxxx)); + IntegratorObj.CalculateGravity(); + + // Finish by taking the kick step. + // The kick functor also "commits" the predicted forces into the "acc" member. + IntegratorObj.KickStep(Step); + + // N particles were implicitly propagated in this iteration. + + // Advance global time. + T += Step; + } +// +// vec3 CenterMass3 = thrust::transform_reduce(PPPPP.begin(), PPPPP.end(), CenterMassFunctor(mass), vec3(0,0,0), thrust::plus()); +// CenterMass3 = CenterMass3 * (1.0/N); //ugly should divide by the total mass +// cerr << "CENTER OF MASS after evolve " << CenterMass3.x << endl; +// +// cerr << "done evolve; transform" << endl; +// // thrust::transform(PPPPP.begin(), PPPPP.end(), PPPPP.begin(), ShiftFunctor(+CenterMass)); // antishift +// +// vec3 CenterMass4 = thrust::transform_reduce(PPPPP.begin(), PPPPP.end(), CenterMassFunctor(mass), vec3(0,0,0), thrust::plus()); +// CenterMass4 = CenterMass4 * (1.0/N); //ugly should divide by the total mass +// cerr << "CENTER OF MASS after antishift " << CenterMass4.x << endl; +// +// cerr << "done transform; download to RAM" << endl; + IntegratorObj.CopyParticlesToHost(&PPP[0]); +// +// cerr << "done download; return" << endl; + return 0; +} + +int set_begin_time(double time_begin) { +// cerr << "called set_begin_time(" << time_begin << endl; + return 0; +} + + +int get_begin_time(double *time_begin) { + *time_begin = 0; + return 0; +} + +int get_mass(int index_of_the_particle, double *mass) { + *mass = PPP[index_of_the_particle].m; + return 0; +} + +int get_time(double *time) { + *time = T; + return 0; +} + +int set_mass(int index_of_the_particle, double mass) { +// cerr << "calling set_mass" << endl; + PPP[index_of_the_particle].m = mass; + return 0; +} + +int get_index_of_first_particle(int *index_of_the_particle) { +// cerr << "calling get_index_of_first_particle" << endl; + *index_of_the_particle = 0; + return 0; +} + +int get_total_radius(double *radius) { + return -2; +} + +int get_potential_at_point(double soft, double x, double y, double z, double *phi) { + return -2; +} + +int get_total_mass(double *mass) { + return -2; +} + +int set_eps2(double epsilon_squared) { + return -1; +} + +int get_eps2(double *epsilon_squared) { + *epsilon_squared = 0; + return -1; +} + +int get_number_of_particles(int *number_of_particles) { +// cerr << "calling get_number_of_particles" << endl; + *number_of_particles = PPP.size(); + return 0; +} + +int get_index_of_next_particle(int index_of_the_particle, int *index_of_the_next_particle) { + *index_of_the_next_particle = index_of_the_particle + 1; + return 0; +} + +int delete_particle(int index_of_the_particle) { + return -2; +} + +int get_potential(int index_of_the_particle, double *potential) { + return -2; +} + +int synchronize_model() { +// cerr << "calling synchronize_model" << endl; + return 0; +} + +int set_state(int index_of_the_particle, double mass, double radius, double x, double y, double z, double vx, double vy, double vz) { + cerr << "calling set_state" << endl; +// cerr << "calling set_state" << endl; + PPP[index_of_the_particle].pos = vec3(x, y, z); + PPP[index_of_the_particle].vel = vec3(vx, vy, vz); + return 0; +} + +int get_state(int index_of_the_particle, double *mass, double *radius, double *x, double *y, double *z, double *vx, double *vy, double *vz) { +// cerr << "calling get_state" << endl; + Particle p = PPP[index_of_the_particle]; + *mass = index_of_the_particle; + *x = p.pos.x; + *y = p.pos.y; + *z = p.pos.z; + *vx = p.vel.x; + *vy = p.vel.y; + *vz = p.vel.z; + return 0; +} + +int get_time_step(double *time_step) { +// cerr << "calling get_time_step" << endl; + *time_step = ConstantStep; + return 0; +} + +int set_time_step(double time_step) { + cerr << "calling set_time_step" << endl; + ConstantStep = time_step; + return 0; +} + +int get_launch_config(int **launch_config) { + return 0; +} + +int set_launch_config(int *launch_config) { +// k3gs = launch_config[0]; +// k3bs = launch_config[1]; +// k4gs = launch_config[2]; +// k4bs = launch_config[3]; + return -2; +} + +int recommit_particles() { +// cerr << "calling recommit_particles" << endl; +#warning put something here + cerr << "hhhhhhhhhhhhhhhhhhhhhhhhhhhhhh" << endl; + PPPPP = PPP; + return -2; +} + +int set_acceleration(int index_of_the_particle, double ax, double ay, double az) { + return -2; +} + +int get_center_of_mass_position(double *x, double *y, double *z) { +// vec3 CenterMass = thrust::transform_reduce(PPPPP.begin(), PPPPP.end(), CenterMassFunctor(mass), vec3(0,0,0), thrust::plus()); +// CenterMass = CenterMass * (1.0/N); //ugly should divide by the total mass +// *x = CenterMass.x; +// *y = CenterMass.y; +// *z = CenterMass.z; + return 0; +} + +int get_center_of_mass_velocity(double *vx, double *vy, double *vz) { + return -2; +} + +int get_radius(int index_of_the_particle, double *radius) { + *radius = 0; + return 0; +} + +int set_radius(int index_of_the_particle, double radius) { + // should store the radius somewhere but completely ignored by code +// cerr << "calling set_radius" << endl; + return 0; +} + +int cleanup_code() { + IntegratorObj.~Integrator(); + cerr << "bye" << endl; + return 0; +} + +int get_gravity_at_point(double soft, double x, double y, double z, double *forcex, double *forcey, double *forcez) { + return -2; +} + +int get_velocity(int index_of_the_particle, double *vx, double *vy, double *vz) { + *vx = PPP[index_of_the_particle].vel.x; + *vy = PPP[index_of_the_particle].vel.y; + *vz = PPP[index_of_the_particle].vel.z; + return 0; +} + +int get_position(int index_of_the_particle, double *x, double *y, double *z) { + *x = PPP[index_of_the_particle].pos.x; + *y = PPP[index_of_the_particle].pos.y; + *z = PPP[index_of_the_particle].pos.z; + return 0; +} + +bool already_printed = false; + +int set_position(int index_of_the_particle, double x, double y, double z) { + if (already_printed == false) { + cerr << "calling set_position" << endl; + cerr << "---------index_of_the_particle=" << index_of_the_particle << endl; + cerr << "--------- x" << PPP[index_of_the_particle].pos.x << "--->" << x << endl; + cerr << "--------- y" << PPP[index_of_the_particle].pos.y << "--->" << y << endl; + cerr << "--------- z" << PPP[index_of_the_particle].pos.z << "--->" << z << endl; + already_printed = true; + } + PPP[index_of_the_particle].pos = vec3(x, y, z); + counttt++; + return 0; +} + +int get_acceleration(int index_of_the_particle, double *ax, double *ay, double *az) { + return -2; +} + +int set_velocity(int index_of_the_particle, double vx, double vy, double vz) { +// cerr << "calling set_velocity" << endl; + PPP[index_of_the_particle].vel = vec3(vx, vy, vz); + return 0; +} + + +int get_kinetic_energy(double *kinetic_energy) { + *kinetic_energy = IntegratorObj.KineticEnergy(); + return 0; +} + +int get_potential_energy(double *potential_energy) { + *potential_energy = IntegratorObj.PotentialEnergy(); + return 0; +} + +int update_force_potential_arrays(double tttt) { +#warning time shouldnt be a parameter to this one +// ForceSCF(N, PTR(PotPotPot), PTR(PPPPP), PTR(F0xxxxxx)); + return 0; +} \ No newline at end of file diff --git a/interface.py b/interface.py new file mode 100644 index 0000000..4db3881 --- /dev/null +++ b/interface.py @@ -0,0 +1,160 @@ +from amuse.community import * +from amuse.community.interface.gd import GravitationalDynamics +from amuse.community.interface.gd import GravitationalDynamicsInterface + +class EticsInterface(CodeInterface, GravitationalDynamicsInterface, LiteratureReferencesMixIn): + """ + .. [#] Meiron, Y., Li, B., Holley-Bockelmann, K., & Spurzem, R. 2014, ApJ, 792, 98: + .. [#] ... "Expansion techniques for collisionless stellar dynamical simulations" + """ + + include_headers = ['worker_code.h'] + + def __init__(self, **keyword_arguments): + CodeInterface.__init__(self, name_of_the_worker='etics_worker', **keyword_arguments) + LiteratureReferencesMixIn.__init__(self) + + @legacy_function + def new_particle(): + function = LegacyFunctionSpecification() + function.can_handle_array = True + function.addParameter('index_of_the_particle', dtype='int32', direction=function.OUT) + function.addParameter('mass', dtype='float64', direction=function.IN, + description = 'The mass of the particle') + function.addParameter('x', dtype='float64', direction=function.IN, + description = 'The initial position vector of the particle') + function.addParameter('y', dtype='float64', direction=function.IN, + description = 'The initial position vector of the particle') + function.addParameter('z', dtype='float64', direction=function.IN, + description = 'The initial position vector of the particle') + function.addParameter('vx', dtype='float64', direction=function.IN, + description = 'The initial velocity vector of the particle') + function.addParameter('vy', dtype='float64', direction=function.IN, + description = 'The initial velocity vector of the particle') + function.addParameter('vz', dtype='float64', direction=function.IN, + description = 'The initial velocity vector of the particle') + function.addParameter('radius', dtype='float64', direction=function.IN, + description = 'The radius of the particle', default = 0) + function.result_type = 'int32' + return function + + @legacy_function + def set_state(): + function = LegacyFunctionSpecification() + function.can_handle_array = True + function.addParameter('index_of_the_particle', dtype='int32', direction=function.IN) + function.addParameter('mass', dtype='float64', direction=function.IN, + description = 'The mass of the particle') + function.addParameter('radius', dtype='float64', direction=function.IN, + description = 'The radius of the particle') + function.addParameter('x', dtype='float64', direction=function.IN, + description = 'The initial position vector of the particle') + function.addParameter('y', dtype='float64', direction=function.IN, + description = 'The initial position vector of the particle') + function.addParameter('z', dtype='float64', direction=function.IN, + description = 'The initial position vector of the particle') + function.addParameter('vx', dtype='float64', direction=function.IN, + description = 'The initial velocity vector of the particle') + function.addParameter('vy', dtype='float64', direction=function.IN, + description = 'The initial velocity vector of the particle') + function.addParameter('vz', dtype='float64', direction=function.IN, + description = 'The initial velocity vector of the particle') + function.result_type = 'int32' + return function + + @legacy_function + def get_state(): + function = LegacyFunctionSpecification() + function.can_handle_array = True + function.addParameter('index_of_the_particle', dtype='int32', direction=function.IN) + function.addParameter('mass', dtype='float64', direction=function.OUT, + description = 'The mass of the particle') + function.addParameter('radius', dtype='float64', direction=function.OUT, + description = 'The radius of the particle') + function.addParameter('x', dtype='float64', direction=function.OUT, + description = 'The initial position vector of the particle') + function.addParameter('y', dtype='float64', direction=function.OUT, + description = 'The initial position vector of the particle') + function.addParameter('z', dtype='float64', direction=function.OUT, + description = 'The initial position vector of the particle') + function.addParameter('vx', dtype='float64', direction=function.OUT, + description = 'The initial velocity vector of the particle') + function.addParameter('vy', dtype='float64', direction=function.OUT, + description = 'The initial velocity vector of the particle') + function.addParameter('vz', dtype='float64', direction=function.OUT, + description = 'The initial velocity vector of the particle') + function.result_type = 'int32' + return function + + @legacy_function + def evolve_model(): + function = LegacyFunctionSpecification() + function.can_handle_array = True + function.addParameter('dt', dtype='float64', direction=function.IN) + function.result_type = 'int32' + return function + + @legacy_function + def get_number_of_particles(): + function = LegacyFunctionSpecification() + function.addParameter('number_of_particles', dtype='int32', direction=function.OUT, description = 'number of particles') + function.result_type = 'int32' + return function + + @legacy_function + def set_time_step(): + function = LegacyFunctionSpecification() + function.addParameter('time_step', dtype='float64', direction=function.IN, description = 'time step') + function.result_type = 'int32' + return function + + @legacy_function + def update_force_potential_arrays(): + function = LegacyFunctionSpecification() + function.can_handle_array = True + function.addParameter('dt', dtype='float64', direction=function.IN) + function.result_type = 'int32' + return function + +class Etics(GravitationalDynamics): + def __init__(self, convert_nbody = None, **keyword_arguments): + legacy_interface = EticsInterface(**keyword_arguments) + GravitationalDynamics.__init__(self, legacy_interface, convert_nbody, **keyword_arguments) + + def define_parameters(self, object): + object.add_method_parameter( + 'get_begin_time', + 'set_begin_time', + 'begin_time', + 'model time to start the simulation at', + default_value = 0.0 | nbody_system.time + ) + object.add_method_parameter( + 'get_time_step', + 'set_time_step', + 'time_step', + 'constant timestep for iteration', + default_value = 0.001953125 | nbody_system.time + ) + + def define_methods(self, object): + GravitationalDynamics.define_methods(self, object) + # Define some shortcuts for better readability. + M = nbody_system.mass + L = nbody_system.length + V = nbody_system.speed + T = nbody_system.time + object.add_method('new_particle', (M,L,L,L,V,V,V,L), (object.INDEX, object.ERROR_CODE)) + object.add_method('set_state', (object.INDEX, M,L,L,L,L,V,V,V), (object.ERROR_CODE)) + object.add_method('get_state', (object.INDEX), (M,L,L,L,L,V,V,V, object.ERROR_CODE)) + object.add_method('set_time_begin', (T), (object.ERROR_CODE)) + object.add_method('get_time_begin', (), (T, object.ERROR_CODE)) + object.add_method('get_number_of_particles', (), (units.none, object.ERROR_CODE)) + object.add_method('get_time_step', (), (T, object.ERROR_CODE)) + object.add_method('set_time_step', (T), (object.ERROR_CODE)) + object.add_method('update_force_potential_arrays', (T), (object.ERROR_CODE)) + + def define_particle_sets(self, object): + GravitationalDynamics.define_particle_sets(self, object) + + diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 0000000..a65224e --- /dev/null +++ b/src/Makefile @@ -0,0 +1,72 @@ +ETICS_LMAX ?= 2 +ETICS_NMAX ?= 10 +GPUARCH ?= sm_75 +CUDAHOME ?= /usr/local/cuda +OPTIMIZATION ?= 3 + +#Optional CPPFLAGS: -DETICS_HDF5 -DETICS_BOOST -DETICS_MPI +#Note that -DETICS_MPI is /not/ needed if ETICS is used as a library +CPPFLAGS = -DSCF -DLMAX=$(ETICS_LMAX) -DNMAX=$(ETICS_NMAX) + +CXX ?= g++ +NVCC ?= $(CUDAHOME)/bin/nvcc +LIB += -lm -L$(CUDAHOME)/lib64 -lcudadevrt -lcudart -lcuda +CXXFLAGS += -O$(OPTIMIZATION) +CUDAFLAGS += -arch=$(GPUARCH) + +# If -DETICS_HDF5 is used, please specify the following +#H5HOME = /home/meiron/local +#INC += -I$(H5HOME)/include +#LIB += -L$(H5HOME)/lib $(H5HOME)/lib/libhdf5_hl_cpp.a $(H5HOME)/lib/libhdf5_cpp.a $(H5HOME)/lib/libhdf5_hl.a $(H5HOME)/lib/libhdf5.a -lz -ldl -lm +#LDFLAGS += -Wl,-rpath -Wl,$(H5HOME)/lib + +# If -DETICS_BOOST is used, please specify the following +#BOOSTHOME ?= /home/meiron/local +#INC += -I$(BOOSTHOME)/include + +# If -DETICS_MPI is used, please specify the following +#MPIHOME ?= /home/meiron/local +#INC += -I$(MPIHOME)/include +#LIB += -L$(MPIHOME)/lib -lmpi +#CXXFLAGS += -pthread +#LDFLAGS += -Wl,-rpath -Wl,$(MPIHOME) -Wl,--enable-new-dtags + +CODELIB = libetics.a + +# object files needed for the library and the standalone +CODEOBJS1 = integrate.o mathaux.o scf.o ic.o + +# object files only needed for the standalone +CODEOBJS2 = io.o main.o + +AR = ar ruv +RANLIB = ranlib + +all: $(CODELIB) standalone + +clean: + $(RM) *.o *.a etics + +$(CODELIB): $(CODEOBJS1) + $(NVCC) $(CUDAFLAGS) -dlink $(CODEOBJS1) -o dlink.o + $(AR) $@ $(CODEOBJS1) dlink.o + $(RANLIB) $@ + +library: $(CODELIB) + +standalone: $(CODEOBJS1) $(CODEOBJS2) + echo -e "\e[31mWARNING! \e[0mThe standalone program has not been tested for a long time. Use extreme caution." + $(NVCC) $(CUDAFLAGS) -dlink $(CODEOBJS1) $(CODEOBJS2) -o dlink.o + $(CXX) $(CXXFLAGS) -o etics $(CODEOBJS1) $(CODEOBJS2) dlink.o $(LIB) $(LDFLAGS) + +.SUFFIXES: .cu .cpp .o + +# only io.cpp should be compiled with all the weird HDF5 flags +io.o: io.cpp + $(CXX) $(CPPFLAGS) $(INC) $(CXXFLAGS) -c -o io.o io.cpp + +.cu.o: $< + $(NVCC) $(CPPFLAGS) $(INC) -Xcompiler "$(CXXFLAGS)" $(CUDAFLAGS) -dc -o $@ $< + +.cpp.o: $< + $(CXX) $(CPPFLAGS) $(INC) $(CXXFLAGS) -c -o $@ $< diff --git a/src/common.hpp b/src/common.hpp new file mode 100644 index 0000000..2724751 --- /dev/null +++ b/src/common.hpp @@ -0,0 +1,117 @@ +#pragma once +#include +#include + +#ifndef __CUDACC__ + #define __host__ + #define __device__ +#endif + +#if defined ETICS_SINGLE_PRECISION && defined ETICS_DOUBLE_PRECISION + #error Contradictory precision flags! +#endif + +#ifndef ETICS_SINGLE_PRECISION + #define ETICS_DOUBLE_PRECISION + #define Real double + #define MPI_ETICS_REAL MPI_DOUBLE +#else + #define Real float + #define MPI_ETICS_REAL MPI_FLOAT +#endif + +#if defined MEX && defined SCF + #error Contradictory method flags! +#endif + +#if defined MEX && !defined LMAX + #error LMAX not defined. +#endif + +#if defined SCF && ((!defined NMAX) || (!defined LMAX)) + #error Both NMAX and LMAX should be defined! +#endif + +#if defined MEX && defined NMAX + #warning NMAX is defined, but will be ignored since the method is MEX! +#endif + +struct vec3 { + Real x, y, z; + + __host__ __device__ vec3() : x(0), y(0), z(0) {} + __host__ __device__ vec3(Real _x, Real _y, Real _z) : x(_x), y(_y), z(_z) {} + __host__ __device__ Real abs2() const {return x*x + y*y + z*z;} + __host__ __device__ vec3 operator+ (const vec3& V) const {return vec3(this->x + V.x, this->y + V.y, this->z + V.z);} + __host__ __device__ vec3 operator- (const vec3& V) const {return vec3(this->x - V.x, this->y - V.y, this->z - V.z);} + __host__ __device__ vec3 operator* (const Real& C) const {return vec3(this->x*C, this->y*C, this->z*C);} + __host__ __device__ vec3& operator+= (const vec3& V) {this->x += V.x; this->y += V.y; this->z += V.z; return *this;} + __host__ __device__ vec3& operator-= (const vec3& V) {this->x -= V.x; this->y -= V.y; this->z -= V.z; return *this;} + __host__ __device__ vec3& operator/= (const Real& C) {this->x /= C; this->y /= C; this->z /= C; return *this;} + __host__ __device__ vec3 operator- () const {return vec3(-this->x, -this->y, -this->z);} + __host__ __device__ vec3 operator+ () const {return vec3(this->x, this->y, this->z);} +}; + + +class Particle { + public: + int ID; + unsigned char Status; + Real m; + vec3 pos, vel, acc; + Real R2; + + __host__ __device__ Particle() {} + __host__ __device__ void CalculateR2() {R2 = pos.abs2();} + __host__ __device__ bool operator< (const Particle& p) const {return (this->R2 < p.R2);} +}; + +#define ETICS_MY_CLOCK CLOCK_MONOTONIC +class HostTimer { + timespec StartTime, StopTime; + public: + void Start() { + clock_gettime(ETICS_MY_CLOCK, &StartTime); + } + void Stop() { + clock_gettime(ETICS_MY_CLOCK, &StopTime); + } + double Difference() { + timespec TimeDiff; + if ((StopTime.tv_nsec - StartTime.tv_nsec) < 0) { + TimeDiff.tv_sec = StopTime.tv_sec - StartTime.tv_sec - 1; + TimeDiff.tv_nsec = 1000000000 + StopTime.tv_nsec - StartTime.tv_nsec; + } else { + TimeDiff.tv_sec = StopTime.tv_sec - StartTime.tv_sec; + TimeDiff.tv_nsec = StopTime.tv_nsec - StartTime.tv_nsec; + } + return (double)TimeDiff.tv_sec + ((double)TimeDiff.tv_nsec)*1.0e-9; + } +}; + +#ifdef __CUDACC__ +class DeviceTimer { + cudaEvent_t StartTime, StopTime; + public: + DeviceTimer() { + cudaEventCreate(&StartTime); + cudaEventCreate(&StopTime); + } + ~DeviceTimer() { + cudaEventDestroy(StartTime); + cudaEventDestroy(StopTime); + } + void Start() { + cudaEventRecord(StartTime); + } + void Stop() { + cudaEventRecord(StopTime); + cudaEventSynchronize(StopTime); + } + double Difference(){ + float TimeDiff; + cudaEventElapsedTime(&TimeDiff, StartTime, StopTime); + return (double)TimeDiff * 1.0e-3; + } +}; +#endif diff --git a/src/file.ini b/src/file.ini new file mode 100644 index 0000000..3188564 --- /dev/null +++ b/src/file.ini @@ -0,0 +1,33 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; This is an ETICS parameter file +; Required parameters are: +; Filename, N, Tcrit, StepSize, dT1, dT2 +; +; Special values for Filename: +; _nofile_ or _hernquist_ Generates a Hernquist sphere +; _plummer_ Generates a Plummer sphere +; The _xxx_ can be followed by a integer indicating the random seed, otherwise, +; the system time is used. The generated initial conditions are only saved at +; the t=0 snapshot. +; +; Optional parameters are: +; device (CUDA device ID; by default or if negative, cudaSetDevice is skipped) +; Prefix (="", prefix for the snapshot file names) +; OutputFormat (="ascii", the output format, either ascii or hdf5) + + +Filename = _hernquist_ 0 +;N = 1000000 +;Tcrit = 0.03125 +;StepSize = 0.001953125 +;dT1 = 0.001953125 +;dT2 = 0.25 + +N = 1000000 +Tcrit = 500 +StepSize = 0.03125 +dT1 = 1.0 +dT2 = 5.0 + +Prefix = /home/ym/tmp/merger +OutputFormat = ascii diff --git a/src/ic.cpp b/src/ic.cpp new file mode 100644 index 0000000..f093713 --- /dev/null +++ b/src/ic.cpp @@ -0,0 +1,269 @@ +/** + * @file ic.cpp + * @author Yohai Meiron + * @brief Functions to generate initial conditions. + * + * These are functions to generate a Hernquist or a Plummer sphere. In both + * cases the model has a total mass of 1 and the total energy is -0.25 Henon + * energy units. The Hernquist model generator was adapted from a program by + * John Dubinski (1999, original version) from the NEMO toolbox; the accessory + * functions are from Numerical Recipes in C (Press et al. 1992). The Plummer + * generator is adapted from the gen-plum program by Peter Berczik. + */ + +#include +#include +#include +#include +#include "common.hpp" +#include "ic.hpp" + +// Internal functions +namespace etics { + namespace ic { + double E; + + void polint(double xa[], double ya[], int n, double x, double *y, double *dy); + double trapzd(double (*func)(double), double a, double b, int n); + double qromb(double (*func)(double), double a, double b); + double xrandom(double a, double b); + double xrandom(); + double f(double E2), drho2(double u); + } +} + +double etics::ic::trapzd(double (*func)(double), double a, double b, int n) { + double x, tnm, sum, del; + static double s; + int it, j; + + if (n == 1) { + return (s=0.5*(b-a)*((*func)(a) + (*func)(b))); + } else { + for (it = 1, j = 1; j < n-1; j++) it <<= 1; + tnm = it; + del = (b-a)/tnm; + x = a + 0.5*del; + for (sum = 0, j = 1; j <= it; j++, x += del) sum += (*func)(x); + s = 0.5*(s+(b-a)*sum/tnm); + return s; + } +} + +void etics::ic::polint(double xa[], double ya[], int n, double x, double *y, double *dy) { + int ns = 1; + double den, dif, dift, ho, hp, w; + double *c, *d; + + dif = fabs(x-xa[1]); + c = new double[n+1]; + d = new double[n+1]; + for (int i = 1; i <= n; i++) { + if ((dift=fabs(x-xa[i])) < dif) { + ns = i; + dif = dift; + } + c[i] = ya[i]; + d[i] = ya[i]; + } + *y = ya[ns--]; + for (int m = 1; m < n; m++) { + for (int i = 1; i <= n-m; i++) { + ho = xa[i]-x; + hp = xa[i+m]-x; + w = c[i+1]-d[i]; + if ((den=ho-hp) == 0.0) { + std::cerr << "Error in routine polint" << std::endl; + exit(1); + } + den = w/den; + d[i] = hp*den; + c[i] = ho*den; + } + *y += (*dy=(2*ns < (n-m) ? c[ns+1] : d[ns--])); + } + delete[] c; + delete[] d; +} + +#define EPS 1.0e-6 +#define JMAX 20 +#define JMAXP (JMAX+1) +#define K 5 +double etics::ic::qromb(double (*func)(double), double a, double b) { + double ss, dss; + double s[JMAXP+1], h[JMAXP+1]; + + h[1] = 1.0; + for (int j = 1; j <= JMAX; j++) { + s[j] = trapzd(func, a, b, j); + if (j >= K) { + polint(&h[j-K], &s[j-K], K, 0, &ss, &dss); + if (fabs(dss) < EPS*fabs(ss)) return ss; + } + s[j+1] = s[j]; + h[j+1] = 0.25*h[j]; + } + std::cerr << "Too many steps in routine qromb" << std::endl; + exit(1); + return 0; +} +#undef EPS +#undef JMAX +#undef JMAXP +#undef K + +double etics::ic::xrandom(double a, double b) { + double Result = ((double)rand())/((double)RAND_MAX); + Result *= (b - a); + Result += a; + return Result; +} + +double etics::ic::xrandom() { + return xrandom(0, 1); +} + +/* Distribution function of a Hernquist model */ +double etics::ic::f(double E2) { + E = E2; + return qromb(drho2, 0.0, 2.0*sqrt(E))*(sqrt(32.)*(M_PI*M_PI*M_PI)); +} + +double etics::ic::drho2(double u) { + double p = E - 0.25*u*u; + double r; + if (p <= 0) return 0; + else { + double p1 = 1/p; + double b0 = 0.5*(2-p1); + double c0 = (1-p1); + r = -b0 + sqrt(b0*b0 - c0); + } + + double r2 = r*r, r3 = r2*r; + + double c0 = 1.0 + r; + double c02 = c0*c0; + double c03 = c02*c0; + double c04 = c02*c02; + double dp1 = -1/c02; + double dp2 = 2/c03; + + double drho1r = -(1+4*r)/(r2*c04); + double drho2r = 2.0*(14*r2 + 5*r + 1)/(r3*c04*c0); + + return drho2r/(dp1*dp1) - drho1r/(dp1*dp1*dp1)*dp2; +} + +void etics::ic::hernquist(int N, int Seed, Particle **ParticleList) { + using namespace etics::ic; + double x, y, z, vx, vy, vz; + double fmax, f0, f1, v2, vmax, vmax2, Ep=0, Ek=0; + + srand(Seed); + + *ParticleList = new Particle[N]; + + for(int i = 0; i < N; i++) { + double eta = xrandom(0.0, 1.0); + double radius = sqrt(eta)/(1-sqrt(eta)); + double phi = xrandom(0.0, 2*M_PI); + double cth = xrandom(-1.0, 1.0); + double sth = sqrt(1.0 - cth*cth); + x = radius*sth*cos(phi); + y = radius*sth*sin(phi); + z = radius*cth; + double psi0 = 1/(1 + radius); + Ep += -psi0; + vmax2 = 2.0*psi0; + vmax = sqrt(vmax2); + fmax = f(psi0); + f0 = fmax; f1 = 1.1*fmax; // just some initial values + while(f1 > f0) { + // pick a velocity + v2 = 2.0*vmax2; + while(v2 > vmax2) { // rejection technique + vx = vmax*xrandom(-1.0, 1.0); + vy = vmax*xrandom(-1.0, 1.0); + vz = vmax*xrandom(-1.0, 1.0); + v2 = vx*vx + vy*vy + vz*vz; + } + f0 = f(psi0 - 0.5*v2); + f1 = fmax*xrandom(0.0, 1.0); + } + Ek += 0.5*v2; + Particle p; + p.m = 1.0/N; + p.pos = vec3(x, y, z); + p.vel = vec3(vx, vy, vz); + p.ID = i; + p.Status = 0; + p.CalculateR2(); + (*ParticleList)[i] = p; + } + Ep /= 2*N; Ek /= N; + double E = Ep + Ek; + double RadiusFactor = -4*E; // should be close to 1/3 + double VelocityFactor = 1/sqrt(-4*E); // should be close to sqrt(3) + for(int i = 0; i < N; i++) { + (*ParticleList)[i].pos = (*ParticleList)[i].pos * RadiusFactor; + (*ParticleList)[i].vel = (*ParticleList)[i].vel * VelocityFactor; + } +} + +#define RMAX 10.0 +void etics::ic::plummer(int N, int Seed, Particle **ParticleList) { + double X, Y, Z, Vx, Vy, Vz, + X1, X2, X3, X4, X5, X6, X7, + R, Ve, V; + + srand(Seed); + + *ParticleList = new Particle[N]; + + int i = 0; + while(i < N) { + X1 = xrandom(); X2 = xrandom(); X3 = xrandom(); + R = 1.0/sqrt( (pow(X1,-2.0/3.0) - 1.0) ); + if (R > RMAX) continue; + Z = (1.0 - 2.0*X2)*R; + X = sqrt(R*R - Z*Z) * cos(2.0*M_PI*X3); + Y = sqrt(R*R - Z*Z) * sin(2.0*M_PI*X3); + + Ve = sqrt(2.0)*pow( (1.0 + R*R), -0.25 ); + + X4 = 0.0; + X5 = 0.0; + while( 0.1*X5 >= X4*X4*pow( (1.0-X4*X4), 3.5) ) { + X4 = xrandom(); + X5 = xrandom(); + } + + V = Ve*X4; + + X6 = xrandom(); + X7 = xrandom(); + + Vz = (1.0 - 2.0*X6)*V; + Vx = sqrt(V*V - Vz*Vz) * cos(2.0*M_PI*X7); + Vy = sqrt(V*V - Vz*Vz) * sin(2.0*M_PI*X7); + +#define CONV (3.0*M_PI/16.0) + X *= CONV; Y *= CONV; Z *= CONV; + Vx /= sqrt(CONV); Vy /= sqrt(CONV); Vz /= sqrt(CONV); +#undef CONV + + Particle p; + p.m = 1.0/N; + p.pos = vec3(X, Y, Z); + p.vel = vec3(Vx, Vy, Vz); + p.ID = i; + p.Status = 0; + p.CalculateR2(); + (*ParticleList)[i] = p; + + i++; + } /* while(i < N) */ +} +#undef RMAX diff --git a/src/ic.hpp b/src/ic.hpp new file mode 100644 index 0000000..f9d121d --- /dev/null +++ b/src/ic.hpp @@ -0,0 +1,7 @@ +#pragma once +namespace etics { + namespace ic { + void hernquist(int N, int Seed, Particle **ParticleList); + void plummer(int N, int Seed, Particle **ParticleList); + } +} \ No newline at end of file diff --git a/src/integrate.cu b/src/integrate.cu new file mode 100644 index 0000000..a6638d9 --- /dev/null +++ b/src/integrate.cu @@ -0,0 +1,127 @@ +#include +#include +#include "common.hpp" +#include "integrate.hpp" + +#ifdef MEX + #define method mex + #include "mex.hpp" +#elif defined(SCF) + #define method scf + #include "scf.hpp" +#elif defined(MULTICENTER) + #define method multicenter + #include "multicenter.hpp" +#endif + +#define PTR(x) (thrust::raw_pointer_cast((x).data())) + +namespace etics +{ + +// The 'drift' step is performed using the 'acc' member. +struct DriftFunctor { + Real Step, Step2; + __host__ __device__ DriftFunctor(Real _Step) : Step(_Step), Step2(_Step*_Step) {} + __host__ __device__ Particle operator() (Particle &p) const { + p.pos += p.vel*Step + p.acc*0.5*Step2; + p.CalculateR2(); // needed for both MEX and SCF, but not generally needed in leapfrog + return p; + } +}; + +// The 'kick' step is performed using the 'acc' member and also the force F, +// calculated at the new (predicted) position. +struct KickFunctor { + Real Step; + __host__ __device__ KickFunctor(Real _Step) : Step(_Step) {} + __host__ __device__ Particle operator() (Particle& p, const vec3& F) const { + p.vel += (p.acc + F)*0.5*Step; + p.acc = F; + return p; + } +}; + +struct KineticEnergyFunctor { + __host__ __device__ Real operator() (const Particle &p) const {return 0.5*p.m*p.vel.abs2();} +}; + +struct PotentialEnergyFunctor { + __host__ __device__ Real operator() (const Particle &p, const Real &Potential) const {return p.m*Potential;} +}; + +Integrator::Integrator() { + N = 0; + Time = 0; +} + +Integrator::Integrator(Particle *P_h, int _N) { + N = _N; + Time = 0; + P = thrust::device_vector(P_h, P_h+N); + Potential = thrust::device_vector(N); + Force = thrust::device_vector(N); + Method = &etics::method::CalculateGravity; + CalculateGravity(); + KickStep(0); // Just to "commit" the forces to the particle list. +} + +Integrator::~Integrator() { + N = 0; + // Weird way to free Thrust memory. + P.clear(); P.shrink_to_fit(); + Potential.clear(); Potential.shrink_to_fit(); + Force.clear(); Force.shrink_to_fit(); +} + +void Integrator::CalculateGravity() { + (*Method)(PTR(P), N, PTR(Potential), PTR(Force)); +} + +void Integrator::DriftStep(Real Step) { + thrust::transform(P.begin(), P.end(), P.begin(), DriftFunctor(Step)); +} + +void Integrator::KickStep(Real Step) { + thrust::transform(P.begin(), P.end(), Force.begin(), P.begin(), KickFunctor(Step)); +} + +Real Integrator::GetTime() { + return Time; +} + +int Integrator::GetN() { + return N; +} + +Real Integrator::KineticEnergy() { + return thrust::transform_reduce( + P.begin(), P.end(), + KineticEnergyFunctor(), + (Real)0, // It must be clear to the function that this zero is a Real. + thrust::plus() + ); +} + +Real Integrator::PotentialEnergy() { + return 0.5*thrust::inner_product( + P.begin(), P.end(), + Potential.begin(), + (Real)0, + thrust::plus(), + PotentialEnergyFunctor() + ); +} + +void Integrator::CopyParticlesToHost(Particle *P_h) { + thrust::copy(P.begin(), P.end(), P_h); +} + +void Integrator::CopyParticlesToHost(Particle **P_h, int *_N) { + Particle *LocalList = new Particle[N]; + thrust::copy(P.begin(), P.end(), LocalList); + *P_h = LocalList; + *_N = N; +} + +} // namespace etics diff --git a/src/integrate.hpp b/src/integrate.hpp new file mode 100644 index 0000000..69c4bab --- /dev/null +++ b/src/integrate.hpp @@ -0,0 +1,27 @@ +#pragma once +#include + +namespace etics { + class Integrator { + public: + Integrator(); + Integrator(Particle *P_h, int _N); + ~Integrator(); + void CalculateGravity(); + void DriftStep(Real Step); + void KickStep(Real Step); + Real GetTime(); + int GetN(); + Real KineticEnergy(); + Real PotentialEnergy(); + void CopyParticlesToHost(Particle *P_h); + void CopyParticlesToHost(Particle **P_h, int *_N); + private: + int N; + double Time; + thrust::device_vector P; + thrust::device_vector Potential; + thrust::device_vector Force; + void (*Method)(Particle*, int, Real*, vec3*); + }; +} diff --git a/src/io.cpp b/src/io.cpp new file mode 100644 index 0000000..a4df58a --- /dev/null +++ b/src/io.cpp @@ -0,0 +1,367 @@ +#include +#include +#include +#include +#include +#include +#include +#include "common.hpp" +#include "io.hpp" + +#ifdef ETICS_BOOST + #include + #include +#endif + +using namespace std; + +void ReadICsASCII(string Filename, int N, Particle **P_h, int *FileSnapshotNum, Real *FileTime) { + *P_h = new Particle[N]; + string Line; + ifstream InputFile(Filename.c_str()); + int LineNum; + + int i = 0; + if (InputFile.is_open()) { + getline(InputFile, Line); // First line is the snapshot number. + int Count = sscanf(Line.c_str(), "%d", FileSnapshotNum); + if (Count != 1) { + cerr << "Problem in input file, line 1." << endl; + exit(1); + } + getline(InputFile, Line); // Second line is the expected number of particles. + int ExpectedN; + Count = sscanf(Line.c_str(), "%d", &ExpectedN); // We ignore it actually. + if (Count != 1) { + cerr << "Problem in input file, line 2." << endl; + exit(1); + } + getline(InputFile, Line); + double T0; + Count = sscanf(Line.c_str(), "%lf", &T0); + if (Count != 1) { + cerr << "Problem in input file, line 3." << endl; + exit(1); + } + *FileTime = (Real)T0; + LineNum = 3; + while (getline(InputFile, Line)) + { + double m, x, y, z, vx, vy, vz; + int id; + Particle p; + Count = sscanf(Line.c_str(), "%d %lf %lf %lf %lf %lf %lf %lf", &id, &m, &x, &y, &z, &vx, &vy, &vz); + if (Count != 8) { + cerr << "Problem in input file, line " << LineNum + 1 << "." << endl; + exit(1); + } + p.ID = id; + p.m = (Real)m; + p.pos = vec3((Real)x, (Real)y, (Real)z); // This is to ensure proper casting in the case of single prec. + p.vel = vec3((Real)vx, (Real)vy, (Real)vz); + p.Status = 0; + p.CalculateR2(); + (*P_h)[i] = p; + i++; + LineNum++; + if (i == N) break; + } + InputFile.close(); + } else { + cerr << "Can't open file" << Filename << "." << endl; + exit(1); + } + if (i < N) { + cerr << "Was only able to read " << i << " particles while " << N << " were requested." << endl; + exit(1); + } +} + +void WriteSnapshotASCII(string Prefix, int SnapNumber, Particle *P_h, int N, Real T) { + char S[512]; + sprintf(S, "%s%04d.dat", Prefix.c_str(), SnapNumber); + ofstream SnapshotFile; + SnapshotFile.open(S); + sprintf(S, "%06d\n", SnapNumber); SnapshotFile << S; + sprintf(S, "%06d\n", N); SnapshotFile << S; + sprintf(S, "%.16E\n", T); SnapshotFile << S; + for (int i = 0; i < N; i++) { + sprintf(S, "%06d%14.6e%14.6e%14.6e%14.6e%14.6e%14.6e%14.6e\n", P_h[i].ID, P_h[i].m, P_h[i].pos.x, P_h[i].pos.y, P_h[i].pos.z, P_h[i].vel.x, P_h[i].vel.y, P_h[i].vel.z); SnapshotFile << S; + } + SnapshotFile.close(); +} + +#define THROW_EXCEPTION(str, stat) {cerr << str << endl; exit((stat));} +void ParseInput(int argc, char *argv[], ParametersStruct *Params) { + ParametersStruct P; +#ifdef ETICS_BOOST + if (argc < 2) THROW_EXCEPTION("Usage: etics [FILE]...", 1) + ifstream TestFileObject(argv[1]); + if (!TestFileObject) THROW_EXCEPTION("Problem reading file...", 1) + boost::property_tree::ptree pt; + boost::property_tree::ini_parser::read_ini(argv[1], pt); // what if file doesn't exist? + + P.N = pt.get("N", 0); + if (P.N <= 0) THROW_EXCEPTION("Could not read number of particles (N) from ini file.", 1) + + P.Tcrit = pt.get("Tcrit", -1); + if (P.Tcrit < 0) THROW_EXCEPTION("Could not read finish time (Tcrit) from ini file.", 1) + + P.dT1 = pt.get("dT1", 0); + if (P.dT1 <= 0) THROW_EXCEPTION("Could output time interval (dT1) from ini file.", 1) + + P.dT2 = pt.get("dT2", 0); + if (P.dT2 <= 0) THROW_EXCEPTION("Could snapshot time interval (dT2) from ini file.", 1) + + P.ConstantStep = pt.get("StepSize", -1); + if (P.ConstantStep < 0) THROW_EXCEPTION("Could not read step size (StepSize) from ini file.", 1) + + P.Filename = pt.get("Filename", "\n"); + if (P.Filename == "\n") THROW_EXCEPTION("Could not read initial condition file name (Filename) from ini file.", 1) + + P.Prefix = pt.get("Prefix", ""); + P.OutputFormat = pt.get("OutputFormat", "ascii"); + P.DeviceID = pt.get("device", -1); + if (argc >= 3) { // If there is a argument after the file, it must be either --device or -d + int Version1 = strncmp(argv[2], "--device", 8); + int Version2 = strncmp(argv[2], "-d", 2); + if ((Version1!=0) && (Version2!=0)) THROW_EXCEPTION("Commandline argument after input file must be device number.", 1) + char *DeviceIDStr = strchr(argv[2], '='); + if (DeviceIDStr!=NULL) DeviceIDStr = DeviceIDStr + 1; + else { + if (argc <= 3) THROW_EXCEPTION("Device number must follow.", 1) + DeviceIDStr = argv[3]; + } + P.DeviceID = atoi(DeviceIDStr); + if ((P.DeviceID==0) && (strcmp(DeviceIDStr, "0")!=0)) THROW_EXCEPTION("Error understanding device number.", 1) + } +#else + // If no BOOST, we include a file with the parameters and compile it. + int N, DeviceID = -1; + Real dT1, dT2, Tcrit, StepSize; + std::string Filename, Prefix = "", OutputFormat=""; + #include "noboost.inc" + P.N = N; + P.Filename = Filename; + P.dT1 = dT1; + P.dT2 = dT2; + P.Tcrit = Tcrit; + P.ConstantStep = StepSize; + P.DeviceID = DeviceID; + P.OutputFormat = OutputFormat; +#endif + P.Seed = INT_MIN; + if ((P.Filename[0]=='_') && (P.Filename.rfind("_") > 0)) { + int Break = P.Filename.rfind("_") + 1; + string SeedStr = P.Filename.substr(Break, P.Filename.length() - Break); + int SeedInt; + istringstream iss(SeedStr); + iss >> ws >> P.Seed >> ws; + if(!iss.eof()) THROW_EXCEPTION("Could not understand random seed (in Filename).", 1) + P.Filename = P.Filename.substr(0, Break); + } + if (P.Seed == INT_MIN) P.Seed = (int)time(NULL); + *Params = P; +} + + +// make safety: if single precision, fail to compile +#ifdef ETICS_HDF5 +#include "H5Cpp.h" + +int H5IterNum; +int *H5IterSnapNumbers; +string *H5IterGroupNames; + +herr_t file_info(hid_t loc_id, const char *name, void *opdata) { + int res = sscanf(name, "Step#%d", H5IterSnapNumbers + H5IterNum); + if (res != 1) { + cerr << "Problem understanding group \"" << name << "\" in HDF5 file" << endl; + exit(1); + } + H5IterGroupNames[H5IterNum] = name; + H5IterNum++; + return 0; +} + + +void ReadICsHDF5(string Filename, int N, Particle **P_h, int *FileSnapshotNum, Real *FileTime) { + double *Mass, *X, *Y, *Z, *VX, *VY, *VZ; + int *ID; + Mass = new double[N]; + X = new double[N]; + Y = new double[N]; + Z = new double[N]; + VX = new double[N]; + VY = new double[N]; + VZ = new double[N]; + ID = new int[N]; + + H5::H5File file; + file = H5::H5File(Filename, H5F_ACC_RDONLY); + + H5::Group group = file.openGroup("/"); + int NumberOfGroups = group.getNumObjs(); + H5IterSnapNumbers = new int[NumberOfGroups]; + H5IterGroupNames = new string[NumberOfGroups]; + H5IterNum = 0; + int ret = file.iterateElems("/", NULL, file_info, NULL); + int MaxSnapIndex = 0; + for (int i=1; i < NumberOfGroups; i++) MaxSnapIndex = (H5IterSnapNumbers[i]>H5IterSnapNumbers[MaxSnapIndex])?i:MaxSnapIndex; + string GroupName = H5IterGroupNames[MaxSnapIndex]; + + + group = file.openGroup("/" + GroupName); + H5::Attribute attribute = group.openAttribute("Time"); + attribute.read(H5::PredType::NATIVE_DOUBLE, FileTime); + *FileSnapshotNum = H5IterSnapNumbers[MaxSnapIndex]; + + H5::DataSet dataset = file.openDataSet("/" + GroupName + "/ID"); + int NfromFile = dataset.getStorageSize()/sizeof(int); + if (NfromFile < N) { + cerr << "Was only able to read " << NfromFile << " particles while " << N << " were requested." << endl; + exit(1); + } + dataset.read(ID, H5::PredType::NATIVE_UINT32); + + dataset = file.openDataSet("/" + GroupName + "/Mass"); dataset.read(Mass, H5::PredType::NATIVE_DOUBLE); + dataset = file.openDataSet("/" + GroupName + "/X"); dataset.read(X, H5::PredType::NATIVE_DOUBLE); + dataset = file.openDataSet("/" + GroupName + "/Y"); dataset.read(Y, H5::PredType::NATIVE_DOUBLE); + dataset = file.openDataSet("/" + GroupName + "/Z"); dataset.read(Z, H5::PredType::NATIVE_DOUBLE); + dataset = file.openDataSet("/" + GroupName + "/VX"); dataset.read(VX, H5::PredType::NATIVE_DOUBLE); + dataset = file.openDataSet("/" + GroupName + "/VY"); dataset.read(VY, H5::PredType::NATIVE_DOUBLE); + dataset = file.openDataSet("/" + GroupName + "/VZ"); dataset.read(VZ, H5::PredType::NATIVE_DOUBLE); + + dataset.close(); + group.close(); + file.close(); + + *P_h = new Particle[N]; + for (int i = 0; i < N; i++) { + Particle p; + p.ID = ID[i]; + p.m = (Real)Mass[i]; + p.pos = vec3((Real)X[i], (Real)Y[i], (Real)Z[i]); // This is to ensure proper casting in the case of single prec. + p.vel = vec3((Real)VX[i], (Real)VY[i], (Real)VZ[i]); + p.Status = 0; + p.CalculateR2(); + (*P_h)[i] = p; + } +} + +bool H5FirstSnapshot = true; + +void WriteSnapshotHDF5(string Prefix, int SnapNumber, Particle *P_h, int N, Real T) { + string Filename = Prefix + ".h5part"; + + H5::H5File file; + + if (H5FirstSnapshot && std::ifstream(Filename.c_str())) { + cerr << "File \"" << Filename << "\" already exist!" << endl; + exit(1); + } + H5FirstSnapshot = false; + + if (std::ifstream(Filename.c_str())) { + file = H5::H5File(Filename, H5F_ACC_RDWR); + } + else file = H5::H5File(Filename, H5F_ACC_TRUNC); + + char GroupName[64]; + sprintf(GroupName, "/Step#%d", SnapNumber); + + H5::Group group; + try { + H5::Exception::dontPrint(); + group = H5::Group(file.createGroup(GroupName)); + } + catch ( H5::FileIException error ) { + cerr << "Couldn't create group \"" << GroupName << "\" in HDF5 file, maybe it already exists?" << endl; + exit(1); + } + + double *Mass, *X, *Y, *Z, *VX, *VY, *VZ, *AX, *AY, *AZ; + Mass = new double[N]; + X = new double[N]; + Y = new double[N]; + Z = new double[N]; + VX = new double[N]; + VY = new double[N]; + VZ = new double[N]; + AX = new double[N]; + AY = new double[N]; + AZ = new double[N]; + int *ID; + ID = new int[N]; + + for (int i = 0; i < N; i++) { + Particle p = P_h[i]; + ID[i] = p.ID; + Mass[i] = p.m; + X[i] = p.pos.x; + Y[i] = p.pos.y; + Z[i] = p.pos.z; + VX[i] = p.vel.x; + VY[i] = p.vel.y; + VZ[i] = p.vel.z; + AX[i] = p.acc.x; + AY[i] = p.acc.y; + AZ[i] = p.acc.z; + } + + H5::DataSpace attr_dataspace = H5::DataSpace(H5S_SCALAR); + + H5::Attribute attribute = group.createAttribute("Time", H5::PredType::NATIVE_DOUBLE, attr_dataspace); + attribute.write( H5::PredType::NATIVE_DOUBLE, &T); + + attribute = group.createAttribute("TotalN", H5::PredType::NATIVE_UINT32, attr_dataspace); + attribute.write( H5::PredType::NATIVE_UINT32, &N); + + hsize_t dimsxxx = N; + H5::DataSpace dataspacexxx(1, &dimsxxx); + H5::DataSet dataset3; + dataset3 = H5::DataSet(group.createDataSet("ID", H5::PredType::NATIVE_UINT32, dataspacexxx)); + dataset3.write(ID, H5::PredType::NATIVE_UINT32); + dataset3 = H5::DataSet(group.createDataSet("Mass", H5::PredType::NATIVE_DOUBLE, dataspacexxx)); + dataset3.write(Mass, H5::PredType::NATIVE_DOUBLE); + dataset3 = H5::DataSet(group.createDataSet("X", H5::PredType::NATIVE_DOUBLE, dataspacexxx)); + dataset3.write(X, H5::PredType::NATIVE_DOUBLE); + dataset3 = H5::DataSet(group.createDataSet("Y", H5::PredType::NATIVE_DOUBLE, dataspacexxx)); + dataset3.write(Y, H5::PredType::NATIVE_DOUBLE); + dataset3 = H5::DataSet(group.createDataSet("Z", H5::PredType::NATIVE_DOUBLE, dataspacexxx)); + dataset3.write(Z, H5::PredType::NATIVE_DOUBLE); + dataset3 = H5::DataSet(group.createDataSet("VX", H5::PredType::NATIVE_DOUBLE, dataspacexxx)); + dataset3.write(VX, H5::PredType::NATIVE_DOUBLE); + dataset3 = H5::DataSet(group.createDataSet("VY", H5::PredType::NATIVE_DOUBLE, dataspacexxx)); + dataset3.write(VY, H5::PredType::NATIVE_DOUBLE); + dataset3 = H5::DataSet(group.createDataSet("VZ", H5::PredType::NATIVE_DOUBLE, dataspacexxx)); + dataset3.write(VZ, H5::PredType::NATIVE_DOUBLE); + dataset3 = H5::DataSet(group.createDataSet("AX", H5::PredType::NATIVE_DOUBLE, dataspacexxx)); + dataset3.write(AX, H5::PredType::NATIVE_DOUBLE); + dataset3 = H5::DataSet(group.createDataSet("AY", H5::PredType::NATIVE_DOUBLE, dataspacexxx)); + dataset3.write(AY, H5::PredType::NATIVE_DOUBLE); + dataset3 = H5::DataSet(group.createDataSet("AZ", H5::PredType::NATIVE_DOUBLE, dataspacexxx)); + dataset3.write(AZ, H5::PredType::NATIVE_DOUBLE); + + dataset3.close(); + dataspacexxx.close(); + + attr_dataspace.close(); + attribute.close(); + group.close(); + file.close(); + + delete [] Mass; + delete [] X; + delete [] Y; + delete [] Z; + delete [] VX; + delete [] VY; + delete [] VZ; + delete [] AX; + delete [] AY; + delete [] AZ; + delete [] ID; +} + +#endif diff --git a/src/io.hpp b/src/io.hpp new file mode 100644 index 0000000..8b02b5e --- /dev/null +++ b/src/io.hpp @@ -0,0 +1,19 @@ +#pragma once + +struct ParametersStruct { + int N; + std::string Filename; + int Seed; + Real dT1, dT2, Tcrit, ConstantStep; + std::string Prefix; + std::string OutputFormat; + int DeviceID; +}; + +void ReadICsASCII(std::string Filename, int N, Particle **P_h, int *FileSnapshotNum, Real *FileTime); +void WriteSnapshotASCII(std::string Prefix, int SnapNumber, Particle *hostP, int N, Real T); +void ParseInput(int argc, char *argv[], ParametersStruct *Params); +#ifdef ETICS_HDF5 +void ReadICsHDF5(std::string Filename, int N, Particle **P_h, int *FileSnapshotNum, Real *FileTime); +void WriteSnapshotHDF5(std::string Prefix, int SnapNumber, Particle *hostP, int N, Real T); +#endif diff --git a/src/main.cu b/src/main.cu new file mode 100644 index 0000000..33f3bf0 --- /dev/null +++ b/src/main.cu @@ -0,0 +1,347 @@ +/** + * @file main.cu + * @author Yohai Meiron + * @version 1.0 + */ + +// If you have CUDA then compile with: +// nvcc main.cu -lm -O3 -arch=sm_20 -o output +// Otherwise enable OpenMP and compile with GCC: +// g++ -x c++ -O3 -o output main.cu -DOMP -fopenmp -lgomp -I/home/ym +// The -I is the path to the parent directory where thrust is. + +#include +#include +#include +#include +#include +#include +#ifdef ETICS_MPI +#include +#endif + +#define CUDA + +#ifdef OMP + #error Sorry, OpenMP is currently disabled. + #define THRUST_DEVICE_SYSTEM THRUST_DEVICE_BACKEND_OMP + #undef CUDA + #define PARALLEL_GET_TID omp_get_thread_num() + #define PARALLEL_ADVANCE omp_get_num_threads() + #define __global__ +// #include +#else + #define PARALLEL_GET_TID threadIdx.x + blockIdx.x * blockDim.x + #define PARALLEL_ADVANCE blockDim.x * gridDim.x +// #include "cuda_complex.hpp" +#endif +// #define complex complex + +#include +#include +#include +#include +#include + +#include "common.hpp" +#include "io.hpp" +#include "ic.hpp" +#include "integrate.hpp" + +#ifdef MEX + #define method mex + #include "mex.hpp" +#elif defined(SCF) + #define method scf + #include "scf.hpp" +#elif defined(MULTICENTER) + #define method multicenter + #include "multicenter.hpp" +#endif + +using namespace std; +using namespace etics; + +// GLOBAL VARIABLES +int MyRank, NumProcs; +Real ConstantStep = 0.001953125; +Real T, Step, dT1, dT2, Tcrit, FileTime; +int NSteps = 0, FileSnapshotNum; + +struct ReorderingFunctor { + __host__ __device__ bool operator() (const Particle &lhs, const Particle &rhs) { + return (lhs.ID <= rhs.ID); + } +}; + +Real CalculateStepSize() { + return ConstantStep; +} + +void DisplayInformation(Integrator IntegratorObj) { + Real Ek = IntegratorObj.KineticEnergy(); + Real Ep = IntegratorObj.PotentialEnergy(); + Real Energy = Ek + Ep; + int N=IntegratorObj.GetN(); + + Real TotalEnergy; + int TotalN; + +#ifdef ETICS_MPI + MPI_Reduce(&Energy, &TotalEnergy, 1, MPI_ETICS_REAL, MPI_SUM, 0, MPI_COMM_WORLD); + MPI_Reduce(&N, &TotalN, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); +#else + TotalEnergy = Energy; + TotalN = N; +#endif + + if (MyRank==0) { + printf(" TIME =%6.2f NSTEPS =%6d ENERGY =%20.16f N = %d\n", T, NSteps, TotalEnergy, TotalN); + fflush(stdout); + } +} + +void PrepareSnapshot(Integrator IntegratorObj, Particle **ParticleList, int *CurrentTotalN) { + Particle *LocalList; + int LocalBufferSize; + IntegratorObj.CopyParticlesToHost(&LocalList, &LocalBufferSize); + int TotalN = 0; + +#ifdef ETICS_MPI + LocalBufferSize *= sizeof(Particle); + int BufferSizes[NumProcs]; + MPI_Gather(&LocalBufferSize, 1, MPI_INT, BufferSizes, 1, MPI_INT, 0, MPI_COMM_WORLD); + int Displacements[NumProcs]; + if (MyRank==0) { + for (int p = 0; p < NumProcs; p++) TotalN += BufferSizes[p]/sizeof(Particle); + Displacements[0] = 0; + for (int p = 1; p < NumProcs; p++) Displacements[p] = Displacements[p-1] + BufferSizes[p-1]; + *ParticleList = new Particle[TotalN]; + } + MPI_Gatherv(LocalList, LocalBufferSize, MPI_BYTE, *ParticleList, BufferSizes, Displacements, MPI_BYTE, 0, MPI_COMM_WORLD); +#else + TotalN = LocalBufferSize/sizeof(Particle); + *ParticleList = new Particle[TotalN]; + std::copy(LocalList, LocalList+TotalN, *ParticleList); +#endif + +#ifdef MEX + thrust::sort(*ParticleList, (*ParticleList)+TotalN, ReorderingFunctor()); +#endif + *CurrentTotalN = TotalN; + free(LocalList); +//WARNING Honestly I dont get why I use allocate memory inside the fuction, its very confusing and makes it harder to find leaks. +//Future Yohai (2019): Yeah, it's done both here for **ParticleList and internally for *LocalList in Integrator::CopyParticlesToHost that does it, pretty ugly. +} + +int main(int argc, char *argv[]) { +#ifdef ETICS_MPI + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &MyRank); + MPI_Comm_size(MPI_COMM_WORLD, &NumProcs); +#endif + + if (MyRank==0) { + cerr << "Welcome to ETICS..." << endl; +#ifdef MEX + cerr << "Using method: MEX" << endl; + cerr << "LMAX=" << LMAX << endl; +#elif defined(SCF) + cerr << "Using method: SCF" << endl; + cerr << "LMAX=" << LMAX << endl; + cerr << "NMAX=" << NMAX << endl; +#endif + } + + string Filename; + int DeviceID = 0; + + ParametersStruct Params; + // Instead of reading the input file with MyRank=0 and broadcast the result, we let every rank read the file. This probably saves ~20 lines of ugly MPI code. + ParseInput(argc, argv, &Params); + int N = Params.N; // total; will be divided by number of processes + Filename = Params.Filename; + Tcrit = Params.Tcrit; + ConstantStep = Params.ConstantStep; + DeviceID = Params.DeviceID; + dT1 = Params.dT1; + dT2 = Params.dT2; + + if (DeviceID >= 0) { + if (cudaSetDevice(DeviceID) != cudaSuccess) { + cerr << "Problem opening device (ID=" << DeviceID << ")" << endl; + exit(1); + } + } else { + cerr << "Skipping call to cudaSetDevice." << endl; + } + + // Read an input file and initialize the global particle structure. + Particle *FullList; + if (MyRank==0) { + if ((Filename == "_nofile_") || (Filename == "_hernquist_")) { + cout << "Generating a Hernquist sphere..." << endl; + etics::ic::hernquist(N, Params.Seed, &FullList); + FileSnapshotNum = 0; + FileTime = 0; + cout << "Done." << endl; + } else if (Filename == "_plummer_") { + cout << "Generating a Plummer sphere..." << endl; + etics::ic::plummer(N, Params.Seed, &FullList); + FileSnapshotNum = 0; + FileTime = 0; + cout << "Done." << endl; + } + else { + string InputFileSuffix = Filename.substr(Filename.find_last_of("."), Filename.length()-Filename.find_last_of(".")); + + if ((InputFileSuffix==".h5part") || (InputFileSuffix==".hdf5") || (InputFileSuffix==".h5")) { +#ifndef ETICS_HDF5 + cerr << "Compiled without the \"ETICS_HDF5\" flag; cannot read input in this format." << endl; + exit(1); +#else + ReadICsHDF5(Filename, N, &FullList, &FileSnapshotNum, &FileTime); +#endif + } else ReadICsASCII(Filename, N, &FullList, &FileSnapshotNum, &FileTime); + } + } + +#ifndef ETICS_HDF5 + if (Params.OutputFormat == "hdf5") { + cerr << "Compiled without the \"ETICS_HDF5\" flag; cannot output in requested format." << endl; + exit(1); + } +#endif + if (!(Params.OutputFormat == "hdf5") && !(Params.OutputFormat == "ascii")) { + cerr << "Requested output format unrecognized." << endl; + exit(1); + } + +#ifdef ETICS_MPI + int LocalN = N / NumProcs; + + int Remainder = N - LocalN*NumProcs; + if (MyRank==NumProcs-1) LocalN += Remainder; + Particle *LocalList = new Particle[LocalN]; + int BufferSizes[NumProcs]; + int Displacements[NumProcs]; + if (MyRank==0) { + for (int p = 0; p < NumProcs; p++) BufferSizes[p] = (N / NumProcs)*sizeof(Particle); + BufferSizes[NumProcs-1] += Remainder*sizeof(Particle); + Displacements[0] = 0; + for (int p = 1; p < NumProcs; p++) Displacements[p] = Displacements[p-1] + BufferSizes[p-1]; + } + MPI_Scatterv(FullList, BufferSizes, Displacements, MPI_BYTE, LocalList, LocalN*sizeof(Particle), MPI_BYTE, 0, MPI_COMM_WORLD); + + if (MyRank==0) free(FullList); + N = LocalN; +#else + Particle *LocalList = new Particle[N]; + copy(FullList, FullList+N, LocalList); +#endif + +#ifdef ETICS_MPI +// This code block is just to ask each MPI process to report, and find if there is a conflict (i.e. multiple processes using the same physical device) + cudaDeviceProp DeviceProperties; + const int etics_str_len = 256; + cudaGetDeviceProperties(&DeviceProperties, 0); + char ProcessorName[etics_str_len]; + int tmp; + MPI_Get_processor_name(ProcessorName, &tmp); + char UniqueDeviceID[etics_str_len]; + sprintf(UniqueDeviceID, "%d$$$%s", DeviceProperties.pciBusID, ProcessorName); + char Message[etics_str_len]; + sprintf(Message, "Hello from rank %d (of %d) on %s, using \"%s\" with PCI bus ID %d; this rank has %d particles.\n", MyRank, NumProcs, ProcessorName, DeviceProperties.name, DeviceProperties.pciBusID, LocalN); + if (MyRank == 0) { + printf("%s", Message); + fflush(stdout); + for (int Rank = 1; Rank < NumProcs; Rank++) { + MPI_Recv(Message, etics_str_len, MPI_CHAR, Rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + printf("%s", Message); + fflush(stdout); + } + } else { + MPI_Send(Message, etics_str_len, MPI_CHAR, 0, 0, MPI_COMM_WORLD); + } + + // Here we collect the GPU IDs from all MPI processes and print a warning if one GPU is assigned to more than one process. + char *StrBuf; + StrBuf = (char*)malloc(NumProcs*etics_str_len*sizeof(char)); + MPI_Gather( UniqueDeviceID, etics_str_len, MPI_CHAR, StrBuf, etics_str_len, MPI_CHAR, 0, MPI_COMM_WORLD); + if (MyRank == 0) { + bool DuplicateFound = false; + for (int i = 0; i < NumProcs; i++) { + for (int j = i+1; j < NumProcs; j++) { + if (strcmp(StrBuf+i*etics_str_len, StrBuf+j*etics_str_len) == 0) { + DuplicateFound = true; + break; + } + if (DuplicateFound) break; + } + } + if (DuplicateFound) { + printf("\x1B[31m!!SEVERE WARNING!!\x1B[0m It seems the same physical GPU device was assigned to multiple processes; check the submission script.\n"); + } + } + free(StrBuf); +#endif + + + // Now initiate the code + method::Init(N, 0, 0, 0, 0); + + // Initiate the integrator + Integrator IntegratorObj(LocalList, N); + + // More initializations. + Real NextOutput = 0, NextSnapshot = 0; + T = FileTime; + int SnapNumber = FileSnapshotNum; + Step = CalculateStepSize(); + + while (T <= Tcrit) { + if (T >= NextOutput) { + DisplayInformation(IntegratorObj); + NextOutput += dT1; + } + if (T >= NextSnapshot) { +// cerr << "TESTING MODE!!! NOT WRITING SNAPSHOTS!!!" << endl; + int CurrentTotalN; + PrepareSnapshot(IntegratorObj, &FullList, &CurrentTotalN); + if (MyRank==0) { + if (Params.OutputFormat == "ascii") WriteSnapshotASCII(Params.Prefix, SnapNumber, FullList, CurrentTotalN, T); +#ifdef ETICS_HDF5 + else if (Params.OutputFormat == "hdf5") WriteSnapshotHDF5(Params.Prefix, SnapNumber, FullList, CurrentTotalN, T); +#endif + else {cerr << "Error" << endl; exit(1);} + free(FullList); + } + SnapNumber++; + NextSnapshot += dT2; + } + + // Take the drift step. + IntegratorObj.DriftStep(Step); + + // Calculate the forces in the new positions. + IntegratorObj.CalculateGravity(); + + // Finish by taking the kick step. + // The kick functor also "commits" the predicted forces into the "acc" member. + IntegratorObj.KickStep(Step); + + // N particles were implicitly propagated in this iteration. + NSteps += 1; + + // Advance global time. + T += Step; + + // Calculate the next step. + Step = CalculateStepSize(); + } + IntegratorObj.~Integrator(); +#ifdef ETICS_MPI + MPI_Finalize(); +#endif + return 0; +} diff --git a/src/mathaux.cu b/src/mathaux.cu new file mode 100644 index 0000000..c2d6aee --- /dev/null +++ b/src/mathaux.cu @@ -0,0 +1,57 @@ +#include "common.hpp" +// #include "mathaux.hpp" + +double FactorialSCF_tmpname(int x) { + return (x <= 1) ? 1 : x*FactorialSCF_tmpname(x - 1); +} + +double GammaPlusHalf(int x) { + return pow(0.25, x) * sqrt(M_PI) * FactorialSCF_tmpname(2*x) / FactorialSCF_tmpname(x); +} + +void RadialCoefficients(Real *Result) { + for (int n = 0; n <= NMAX; n++) { + for (int l = 0; l <= LMAX; l++) { + double AnlTilde = - pow(2.0, 8*l + 6) * FactorialSCF_tmpname(n) * (n + 2*l + 1.5); + AnlTilde *= GammaPlusHalf(2*l + 1) * GammaPlusHalf(2*l + 1); + AnlTilde /= (4*M_PI * (n*(n + 3 + 4*l)/2 + (l+1)*(2*l+1)) * FactorialSCF_tmpname(n+4*l+2)); + Result[(LMAX+1)*n+l] = (Real)AnlTilde; + } + } +} + +__host__ __device__ Real Pl(int l, Real x) { + // No range check or anything... The first 10 polynomials are hardcoded + switch (l) { + case 0 : return 1; + case 1 : return x; + case 2 : return (1.5*x*x - 0.5); + case 3 : return (2.5*x*x*x - 1.5*x); + case 4 : {Real x2=x*x; return (4.375*x2*x2 - 3.75*x2 + 0.375);} + case 5 : {Real x2=x*x, x3=x2*x; return (7.875*x3*x2 - 8.75*x3 + 1.875*x);} + case 6 : {Real x2=x*x, x4=x2*x2; return (14.4375*x4*x2 - 19.6875*x4 + 6.5625*x2 - 0.3125);} + case 7 : {Real x2=x*x, x3=x2*x, x5=x3*x2; return (26.8125*x5*x2 - 43.3125*x5 + 19.6875*x3 - 2.1875*x);} + case 8 : {Real x2=x*x, x4=x2*x2, x6=x4*x2, x8=x4*x4; return (50.2734375*x8 - 93.84375*x6 + 54.140625*x4 - 9.84375*x2 + 0.2734375);} + case 9 : {Real x2=x*x, x3=x2*x, x5=x3*x2, x7=x5*x2, x9=x7*x2; + return (94.9609375*x9 - 201.09375*x7 + 140.765625*x5 - 36.09375*x3 + 2.4609375*x);} + case 10 : {Real x2=x*x, x4=x2*x2, x6=x4*x2, x8=x4*x4, x10=x6*x4; + return (180.42578125*x10 - 427.32421875*x8 + 351.9140625*x6 - 117.3046875*x4 + 13.53515625*x2 - 0.24609375);} + case 11 : {Real x2=x*x, x3=x2*x, x5=x3*x2, x7=x5*x2, x9=x7*x2, x11=x9*x2; + return (344.44921875*x11 - 902.12890625*x9 + 854.6484375*x7 - 351.9140625*x5 + 58.65234375*x3 - 2.70703125*x);} + case 12 : {Real x2=x*x, x4=x2*x2, x6=x4*x2, x8=x4*x4, x10=x6*x4, x12=x6*x6; + return (660.1943359375*x12 - 1894.470703125*x10 + 2029.7900390625*x8 - 997.08984375*x6 + 219.9462890625*x4 - 17.595703125*x2 + 0.2255859375);} + case 13 : {Real x2=x*x, x3=x2*x, x5=x3*x2, x7=x5*x2, x9=x7*x2, x11=x9*x2, x13=x9*x2; + return (1269.6044921875*x13 - 3961.166015625*x11 + 4736.1767578125*x9 - 2706.38671875*x7 + 747.8173828125*x5 -87.978515625*x3 + 2.9326171875*x);} + case 14 : {Real x2=x*x, x4=x2*x2, x6=x4*x2, x8=x4*x4, x10=x6*x4, x12=x6*x6, x14=x8*x6; + return (2448.52294921875*x14 - 8252.42919921875*x12 + 10893.2065429688*x10 - 7104.26513671875*x8 + 2368.08837890625*x6 - 373.90869140625*x4 + 21.99462890625*x2 - 0.20947265625);} + default : return -1e30; + } +} + +void AngularCoefficients(Real *Result) { + for (int l = 0; l <= LMAX; l++) { + for (int m = 0; m <= l; m++) { + Result[(l+1)*l/2+m] = (sqrt((2*l+1)/(4*M_PI)) * sqrt(FactorialSCF_tmpname(l-m)/FactorialSCF_tmpname(l+m))); + } + } +} diff --git a/src/mathaux.hpp b/src/mathaux.hpp new file mode 100644 index 0000000..415a3b3 --- /dev/null +++ b/src/mathaux.hpp @@ -0,0 +1,30 @@ +#pragma once +#include "common.hpp" +#include +#ifdef ETICS_DOUBLE_PRECISION + #define Complex cuDoubleComplex + #define make_Complex make_cuDoubleComplex + #define Complex_add cuCadd + #define Complex_mul cuCmul + #define Complex_imag cuCimag + #define Complex_real cuCreal + #define Complex_conj cuConj +#else + #define Complex cuFloatComplex + #define make_Complex make_cuFloatComplex + #define Complex_add cuCaddf + #define Complex_mul cuCmulf + #define Complex_imag cuCimagf + #define Complex_real cuCrealf + #define Complex_conj cuConjf +#endif + +#define SQRT_4_PI 3.5449077018110320545963349666822903655950989122447742564276 + + +double FactorialSCF_tmpname(int x); +double GammaPlusHalf(int x); +void RadialCoefficients(Real *Result); +#define MAX_HARDCODED_PL 14 +__host__ __device__ Real Pl(int l, Real x); +void AngularCoefficients(Real *Result); diff --git a/src/multicenter.cu b/src/multicenter.cu new file mode 100644 index 0000000..975417b --- /dev/null +++ b/src/multicenter.cu @@ -0,0 +1,131 @@ +#include "common.hpp" +#include "mathaux.hpp" +#include "scf.hpp" +#include "integrate.hpp" +#include "multicenter.hpp" +#include +#include +#include + +etics::scf::scfclass SCFObject1; +etics::scf::scfclass SCFObject2; + +vec3 *F_tmp; +vec3 *F_tmp2; +Real *Potential_tmp; +Real *Potential_tmp2; +Particle Origins[2]; + +void etics::multicenter::Init(int Nmax, int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new) { // to be removed!! + cudaMalloc((void**)&F_tmp, Nmax*sizeof(vec3)); + cudaMalloc((void**)&F_tmp2, Nmax*sizeof(vec3)); + cudaMalloc((void**)&Potential_tmp, Nmax*sizeof(Real)); + cudaMalloc((void**)&Potential_tmp2, Nmax*sizeof(Real)); + SCFObject1.Init(Nmax, k3gs_new, k3bs_new, k4gs_new, k4bs_new); +// SCFObject1.OriginPos = vec3(5,0,0); +// SCFObject1.OriginVel = vec3(0,0.1,0); + + SCFObject2.Init(Nmax, k3gs_new, k3bs_new, k4gs_new, k4bs_new); + Particle p; + p.m = 0; + p.pos = vec3(-6.000000e-01 , 0 , 0); + p.vel = vec3(0 , -3.803629e-03 , 0); + Origins[0] = p; + p.pos = vec3(2.940000e+01 , 0 , 0); + p.vel = vec3(0 , 1.863778e-01 , 0); + Origins[1] = p; +// OriginsIntegrator = etics::Integrator(Origins, 2); + +} + +void etics::multicenter::CalculateGravity(Particle *P, int N, Real *Potential, vec3 *F) { + Particle PP[2]; + cudaMemcpy(PP, P+N-2, 2*sizeof(Particle), cudaMemcpyDeviceToHost); +// std::cerr << PP[0].pos.x << std::endl; +// std::cerr << PP[0].pos.y << std::endl; +// std::cerr << PP[0].pos.z << std::endl; +// std::cerr << PP[1].pos.x << std::endl; +// std::cerr << PP[1].pos.y << std::endl; +// std::cerr << PP[1].pos.z << std::endl; + + +// #error all we did last time was to add the offset to the subroutines, now the energy is nan. must be some kind of self force + + SCFObject1.GetGpuLock(); + SCFObject1.SendCachePointersToGPU(); + SCFObject1.LoadParticlesToCache(P, 800000, PP[0].pos); + SCFObject1.CalculateCoefficients(); + SCFObject1.ReleaseGpuLock(); + + + SCFObject2.GetGpuLock(); + SCFObject2.SendCachePointersToGPU(); + SCFObject2.LoadParticlesToCache(P+800000, 200000-2, PP[1].pos); + SCFObject2.CalculateCoefficients(); + SCFObject2.ReleaseGpuLock(); + + + SCFObject1.GetGpuLock(); + SCFObject1.SendCoeffsToGPU(); + SCFObject1.SendCachePointersToGPU(); + SCFObject1.LoadParticlesToCache(P, N-2, PP[0].pos); + SCFObject1.CalculateGravityFromCoefficients(Potential_tmp, F_tmp); + SCFObject1.ReleaseGpuLock(); + + SCFObject2.GetGpuLock(); + SCFObject2.SendCoeffsToGPU(); + SCFObject2.SendCachePointersToGPU(); + SCFObject2.LoadParticlesToCache(P, N-2, PP[1].pos); + SCFObject2.CalculateGravityFromCoefficients(Potential_tmp2, F_tmp2); + SCFObject2.ReleaseGpuLock(); + + // now deal just with the central particles + SCFObject1.GetGpuLock(); + SCFObject1.SendCoeffsToGPU(); + SCFObject1.SendCachePointersToGPU(); + SCFObject1.LoadParticlesToCache(P+N-1, 1, PP[0].pos); //instead of doing that, you need to "push" it at the end of the list + SCFObject1.CalculateGravityFromCoefficients(Potential_tmp+N-1, F_tmp+N-1); // the last particle is the second center, so should feel the first's force + SCFObject1.ReleaseGpuLock(); + + SCFObject2.GetGpuLock(); + SCFObject2.SendCoeffsToGPU(); + SCFObject2.SendCachePointersToGPU(); + SCFObject2.LoadParticlesToCache(P+N-2, 1, PP[1].pos); + SCFObject2.CalculateGravityFromCoefficients(Potential_tmp+N-2, F_tmp+N-2); + SCFObject2.ReleaseGpuLock(); + + +thrust::plus op; +thrust::transform(thrust::device, Potential_tmp, Potential_tmp+N-2, Potential_tmp2, Potential_tmp, thrust::plus()); +thrust::transform(thrust::device, F_tmp, F_tmp+N-2, F_tmp2, F_tmp, thrust::plus()); + +/* + vec3 FFF; + Real PPP; + SCFObject1.CalculateGravityAtPoint(vec3(100,0.001,0.001), &PPP, &FFF); + + std::cerr << PPP << std::endl; + std::cerr << FFF.x << std::endl; + std::cerr << FFF.y << std::endl; + std::cerr << FFF.z << std::endl;*/ + +// vec3 FFF; +// Real PPP; +// cudaMemcpy(&PPP, Potential_tmp+N-2, 1*sizeof(Real), cudaMemcpyDeviceToHost); +// cudaMemcpy(&FFF, F_tmp+N-2, 1*sizeof(vec3), cudaMemcpyDeviceToHost); +// std::cerr << "===" << std::endl; +// std::cerr << PPP << std::endl; +// std::cerr << FFF.x << std::endl; +// std::cerr << FFF.y << std::endl; +// std::cerr << FFF.z << std::endl; +// cudaMemcpy(&PPP, Potential_tmp+N-1, 1*sizeof(Real), cudaMemcpyDeviceToHost); +// cudaMemcpy(&FFF, F_tmp+N-1, 1*sizeof(vec3), cudaMemcpyDeviceToHost); +// std::cerr << "+++" << std::endl; +// std::cerr << PPP << std::endl; +// std::cerr << FFF.x << std::endl; +// std::cerr << FFF.y << std::endl; +// std::cerr << FFF.z << std::endl; + + cudaMemcpy(Potential, Potential_tmp, N*sizeof(Real), cudaMemcpyDeviceToDevice); + cudaMemcpy(F, F_tmp, N*sizeof(vec3), cudaMemcpyDeviceToDevice); +} \ No newline at end of file diff --git a/src/multicenter.hpp b/src/multicenter.hpp new file mode 100644 index 0000000..4723798 --- /dev/null +++ b/src/multicenter.hpp @@ -0,0 +1,6 @@ +namespace etics { + namespace multicenter { + void Init(int Nmax, int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new); + void CalculateGravity(Particle *P, int N, Real *Potential, vec3 *F); + } +} \ No newline at end of file diff --git a/src/noboost.inc b/src/noboost.inc new file mode 100644 index 0000000..4735d28 --- /dev/null +++ b/src/noboost.inc @@ -0,0 +1,15 @@ +/******************************************************************************* +* This is an ETICS parameter file used when the BOOST library is not available. +* When changing a parameter, the code has to be recompiled; if using BOOST, this +* is not required. The syntax is C++, just fill in the variables used in the +* regular initialization file. No error messages will be shown if input is bad. +*******************************************************************************/ + +Filename = "_nofile_ 0"; +N = 1048576; +Tcrit = 16; +StepSize = 0.001953125; +dT1 = 0.001953125; +dT2 = 0.1; +Prefix = "test_del_me"; +OutputFormat = "ascii"; diff --git a/src/scf.cu b/src/scf.cu new file mode 100644 index 0000000..5cbed0a --- /dev/null +++ b/src/scf.cu @@ -0,0 +1,518 @@ +/** + * @file + * @author Yohai Meiron + * @brief Functions to calculate gravitational force using the SCF method. + */ +#include "common.hpp" +#include "mathaux.hpp" +#include "scf.hpp" + +#include +using std::cout; +using std::cerr; +using std::endl; + +#include +#include +#include +#include +#include + +// Use the MPI flag only for the standalone program +#ifdef ETICS_MPI +#include +#endif + +namespace etics { + namespace scf { + scfclass SCFObject; + + // These are truly global variables that are initialized once and can be read by every scfclass object. + bool ScfGloballyInitialized = false; + __constant__ Real RadCoeff[(NMAX+1)*(LMAX+1)]; + __constant__ Real AngCoeff[(LMAX+1)*(LMAX+2)/2]; + Real RadCoeff_h[(NMAX+1)*(LMAX+1)]; + Real AngCoeff_h[(LMAX+1)*(LMAX+2)/2]; + __constant__ Real LengthScale; + Real LengthScale_h; + + // These are global variables that are read/written by each scfclass object; they must be locked before use. + __constant__ Complex A[(NMAX+1)*(LMAX+1)*(LMAX+2)/2]; + __constant__ CacheStruct Cache; + + // These variables provide the mechanism to lock and release the global constant memory variables above. + scfclass *GpuLockOwner = NULL; + #define GPU_LOCK_PANIC \ + { \ + std::cerr << "Error: GPU lock failed; lock not owned by this object!" << endl; \ + exit(1); \ + } + } +} + +namespace etics { + namespace scf { + int blockSizeToDynamicSMemSize(int BlockSize) { // Should be a lambda function + return (LMAX+1)*sizeof(Complex)*BlockSize; + } + } +} + +// Class implementation +etics::scf::scfclass::scfclass() { + N=0; + Nmax=0; + k3gs=-1; k3bs=-1; k4gs=-1; k4bs=-1; +} + +etics::scf::scfclass::~scfclass() { + if (etics::scf::GpuLockOwner == this) ReleaseGpuLock(); + cudaFree(Cache_h.xi); + cudaFree(Cache_h.Phi0l); + cudaFree(Cache_h.Wprev1); + cudaFree(Cache_h.Wprev2); + cudaFree(Cache_h.costheta); + cudaFree(Cache_h.sintheta_I); + cudaFree(Cache_h.Exponent); + cudaFree(Cache_h.mass); + free(PartialSum_h); + cudaFree(PartialSum); +} + +void etics::scf::scfclass::GetGpuLock() { + if (etics::scf::GpuLockOwner == this) return; + if (etics::scf::GpuLockOwner == NULL) etics::scf::GpuLockOwner = this; + else GPU_LOCK_PANIC; +} + +void etics::scf::scfclass::ReleaseGpuLock() { + if (etics::scf::GpuLockOwner == NULL) return; + if (etics::scf::GpuLockOwner == this) etics::scf::GpuLockOwner = NULL; + else GPU_LOCK_PANIC; +} + +void etics::scf::scfclass::CalculateCoefficients(int n, int l) { + int BaseAddress = n*(LMAX+1)*(LMAX+2)/2 + l*(l+1)/2; + etics::scf::CalculateCoefficientsPartial<<>>(N, n, l, PartialSum); + cudaMemcpy(PartialSum_h, PartialSum, k3gs*(l+1)*sizeof(Complex), cudaMemcpyDeviceToHost); + for (int m = 0; m <= l; m++) + for (int Block=0; Block 0) etics::scf::CalculatePhi0l<<<128,128>>>(N, l); // wouldn't it make sense just putting it after the n-loop finishes? Probably not becasue then we need to skip at the last iter + for (int n = 0; n <= NMAX; n++) { + CalculateCoefficients(n, l); + } + } +} + +void etics::scf::scfclass::GetCoefficients(Complex *A) { + std::copy(A_h, A_h+(NMAX+1)*(LMAX+1)*(LMAX+2)/2, A); +} + + +void etics::scf::scfclass::SendCoeffsToGPU() { + if (etics::scf::GpuLockOwner != this) GPU_LOCK_PANIC; + cudaMemcpyToSymbol(etics::scf::A, A_h, (NMAX+1)*(LMAX+1)*(LMAX+2)/2 * sizeof(Complex)); +} + +void etics::scf::scfclass::SendCachePointersToGPU() { + if (etics::scf::GpuLockOwner != this) GPU_LOCK_PANIC; + cudaMemcpyToSymbol(etics::scf::Cache, &Cache_h, sizeof(etics::scf::CacheStruct)); +} + +void etics::scf::scfclass::LoadParticlesToCache(Particle *P, int N, vec3 Offset) { + if (N > Nmax) { + fprintf(stderr, "You tried to load too many particles! (%d > %d)", N, Nmax); + exit(1); + } + this->N = N; + etics::scf::LoadParticlesToCache<<<128,128>>>(P, N, Offset); +} + +void etics::scf::scfclass::CalculateGravityFromCoefficients(Real *Potential, vec3 *F) { + etics::scf::CalculateGravityFromCoefficients<<<(N+k4bs-1)/k4bs,k4bs>>>(N, Potential, F); +} + +void etics::scf::scfclass::CalculateGravityAtPoint(vec3 Pos, Real *Potential, vec3 *F) { + static bool first = true; + if (first) { + printf("WARNING!!! the code has been modified and CalculateGravityAtPoint may not work as planned.\n"); + first = false; + } + Real r = sqrt(Pos.x*Pos.x + Pos.y*Pos.y + Pos.z*Pos.z); + Real xi = (r-1)/(r+1); + Real costheta = Pos.z/r; + Real Normal_I = rsqrt(Pos.x*Pos.x + Pos.y*Pos.y); + Complex Exponent = make_Complex(Pos.x*Normal_I, Pos.y*Normal_I); + + etics::scf::CalculateGravityTemplate<0>(xi, costheta, Exponent, A_h, F, Potential); +} + +void etics::scf::scfclass::CalculateGravity(Particle *P, int N, Real *Potential, vec3 *F) { + GetGpuLock(); + SendCachePointersToGPU(); + LoadParticlesToCache(P, N); + CalculateCoefficients(); +#ifdef ETICS_MPI + Complex ATotal[(NMAX+1)*(LMAX+1)*(LMAX+2)/2]; + MPI_Allreduce(&A_h, &ATotal, (NMAX+1)*(LMAX+1)*(LMAX+2)/2*2, MPI_ETICS_REAL, MPI_SUM, MPI_COMM_WORLD); + std::copy(ATotal, ATotal+(NMAX+1)*(LMAX+1)*(LMAX+2)/2, A_h); + // not really need this copy, just calculate the coefficients in some local array, then sum it into a global array (A_h or somthing) and copy it to GPUs +#endif + SendCoeffsToGPU(); + CalculateGravityFromCoefficients(Potential, F); + ReleaseGpuLock(); +} + +void etics::scf::scfclass::SetLaunchConfiguration(int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new) { + if (k3gs_new != k3gs) { + if (PartialSum_h != NULL) free(PartialSum_h); + if (PartialSum != NULL) cudaFree(PartialSum_h); + PartialSum_h = (Complex*)malloc(k3gs_new*(LMAX+1)*sizeof(Complex)); + cudaMalloc((void**)&PartialSum, k3gs_new*(LMAX+1)*sizeof(Complex)); + } + k3gs = k3gs_new; + k3bs = k3bs_new; + k4gs = k4gs_new; + k4bs = k4bs_new; +} + +void etics::scf::scfclass::Init(int Nmax, int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new) { + // First, initialize global arrays if not already initialized. + if (!etics::scf::ScfGloballyInitialized) etics::scf::GlobalInit(); + + // Next, set the launch configuation (either guess it or use what the user specified). + if ((k3gs_new<=0) || (k3bs_new<=0) || (k4gs_new<=0) || (k4bs_new<=0)) { + int k3gs_new, k3bs_new, k4gs_new, k4bs_new; + etics::scf::GuessLaunchConfiguration(Nmax, &k3gs_new, &k3bs_new, &k4gs_new, &k4bs_new); + SetLaunchConfiguration(k3gs_new, k3bs_new, k4gs_new, k4bs_new); + } + else SetLaunchConfiguration(k3gs_new, k3bs_new, k4gs_new, k4bs_new); + + // If Nmax > 0 then initialize the cahche memory. + if (Nmax > 0) { + this->Nmax = Nmax; + cudaMalloc((void**)&Cache_h.xi, Nmax * sizeof(Real)); + cudaMalloc((void**)&Cache_h.Phi0l, Nmax * sizeof(Real)); + cudaMalloc((void**)&Cache_h.Wprev1, Nmax * sizeof(Real)); + cudaMalloc((void**)&Cache_h.Wprev2, Nmax * sizeof(Real)); + cudaMalloc((void**)&Cache_h.costheta, Nmax * sizeof(Real)); + cudaMalloc((void**)&Cache_h.sintheta_I, Nmax * sizeof(Real)); + cudaMalloc((void**)&Cache_h.Exponent, Nmax * sizeof(Complex)); + cudaMalloc((void**)&Cache_h.mass, Nmax * sizeof(Real)); + } else { + Cache_h.xi = NULL; + Cache_h.Phi0l = NULL; + Cache_h.Wprev1 = NULL; + Cache_h.Wprev2 = NULL; + Cache_h.costheta = NULL; + Cache_h.sintheta_I = NULL; + Cache_h.Exponent = NULL; + Cache_h.mass = NULL; + } +} + +// Kernels + +__global__ void etics::scf::LoadParticlesToCache(Particle *P, int N, vec3 Offset) { // formerly "Kernel1" + int i = threadIdx.x + blockIdx.x * blockDim.x; + while (i < N) { + vec3 Pos = (P[i].pos - Offset)*etics::scf::LengthScale; + Real r = sqrt(Pos.x*Pos.x + Pos.y*Pos.y + Pos.z*Pos.z); + Real xi = (r-1)/(r+1); + Real costheta = Pos.z/r; + Real sintheta_I = rsqrt(1-costheta*costheta); + + Cache.xi[i] = xi; + Cache.Phi0l[i] = 0.5 * (1 - xi); + Cache.costheta[i] = costheta; + Cache.sintheta_I[i] = sintheta_I; + + Real Normal_I = rsqrt(Pos.x*Pos.x + Pos.y*Pos.y); + Complex Exponent = make_Complex(Pos.x*Normal_I, -Pos.y*Normal_I); + Cache.Exponent[i] = Exponent; + Cache.mass[i] = P[i].m; + + i += blockDim.x * gridDim.x; + } +} + +__global__ void etics::scf::CalculatePhi0l(int N, int l) { // formerly "Kernel2" + int i = threadIdx.x + blockIdx.x * blockDim.x; + while (i < N) { + Real xi = Cache.xi[i]; + Cache.Phi0l[i] *= 0.25*(1-xi*xi); + + i += blockDim.x * gridDim.x; + } +} + +__global__ void etics::scf::CalculateCoefficientsPartial(int N, int n, int l, Complex *PartialSum) { // formerly "Kernel3" + extern __shared__ Complex ReductionCache[]; // size is determined in kernel launch + int tid = threadIdx.x; + for (int m = 0; m <= l; m++) ReductionCache[m*blockDim.x+tid] = make_Complex(0, 0); + int i = threadIdx.x + blockIdx.x * blockDim.x; + while (i < N) { + Real xi = Cache.xi[i]; + Real Wnl; + if (n == 0) Wnl = 1; + else if (n == 1) {Wnl = (4*l+3)*xi; Cache.Wprev2[i] = Wnl;} + else if (n == 2) {Wnl = -(2*l+1.5)+( 8*l*(l+2) +7.5)*xi*xi; Cache.Wprev1[i] = Wnl;} + else { + Real Wprev1 = Cache.Wprev1[i]; + Wnl = (xi*(2*n+4*l+1)*Wprev1 - (n+4*l+1)*Cache.Wprev2[i])/(Real)n; + if (n < NMAX) { // Writing is expensive, avoid if possible. + Cache.Wprev2[i] = Wprev1; + Cache.Wprev1[i] = Wnl; + } + } + Real RadialPart = - Cache.mass[i] * SQRT_4_PI * Cache.Phi0l[i] * Wnl * RadCoeff[(LMAX+1)*n+l]; + Real costheta = Cache.costheta[i]; + Real Plm = Pl(l, costheta); + ReductionCache[tid] = Complex_add(ReductionCache[tid], make_Complex(RadialPart * Plm * AngCoeff[(l+1)*l/2],0)); + if (l == 0) {i += blockDim.x * gridDim.x; continue;} + + //////////////////////////////// ugly fix + if ((costheta < -0.999) || (costheta > +0.999)) { + i += blockDim.x * gridDim.x; + continue; + } + //////////////////////////////// ugly fix + + Real Plm_prev1 = Plm; + Real sintheta_I = Cache.sintheta_I[i]; + Plm = (costheta*Plm - Pl(l-1, costheta))*l*sintheta_I; + Complex Exponent = Cache.Exponent[i]; + Real tmp0 = RadialPart * Plm * AngCoeff[(l+1)*l/2+1]; + ReductionCache[blockDim.x+tid] = Complex_add(ReductionCache[blockDim.x+tid], make_Complex(tmp0 * Exponent.x, tmp0 * Exponent.y)); + + if (l == 1) {i += blockDim.x * gridDim.x; continue;} + + Complex TorodialPart = Exponent; + for (int m = 2; m <= l; m++) { // make sure no redundancy at the end of the loop + Real Plm_prev2 = Plm_prev1; + Plm_prev1 = Plm; + Plm = - 2*(m-1)*costheta*sintheta_I*Plm_prev1 - (l+m-1)*(l-m+2)*Plm_prev2; + TorodialPart = Complex_mul(TorodialPart, Exponent); + tmp0 = RadialPart * Plm * AngCoeff[(l+1)*l/2+m]; + ReductionCache[m*blockDim.x+tid] = Complex_add(ReductionCache[m*blockDim.x+tid], make_Complex(tmp0 * TorodialPart.x, tmp0 * TorodialPart.y)); + } + i += blockDim.x * gridDim.x; + } + __syncthreads(); + for (int m = 0; m <= l; m++) { + i = blockDim.x/2; + while (i != 0) { + if (tid < i) + ReductionCache[m*blockDim.x+tid] = Complex_add(ReductionCache[m*blockDim.x+tid], ReductionCache[m*blockDim.x+tid+i]); + __syncthreads(); + i /= 2; + } + if (tid == 0) + PartialSum[blockIdx.x*(l+1) + m] = ReductionCache[m*blockDim.x]; + } +} + +template +__host__ __device__ void etics::scf::CalculateGravityTemplate(Real xi, Real costheta, Complex Exponent, Complex *A, vec3 *F, Real *Potential) { +// it gets A as parameter because it can be either on host or device +// 0 = both force and potential, 1 = only force, 2 = only pot +//WARNING !!! This cannot really be a host function because it needs device cahce, angular coefficients which are on device !!! +#define A(n,l,m) A[n*(LMAX+1)*(LMAX+2)/2 + l*(l+1)/2 + m] +#ifndef __CUDA_ARCH__ + #define AngCoeff AngCoeff_h + #define LengthScale LengthScale_h +#endif + Real dPhiLeft; + Real dPhiLeftMul; + Real dPhiRight; + Real dPhiRightAdd; + Real dPhi; + Real RadialPart2; + Real PlmDerivTheta; + + Real Pot = 0; + Real Fr = 0, Ftheta = 0, Fphi = 0; + Real OneOverXiPlusOne = 1/(1+xi); + Real r_I = (1-xi)*OneOverXiPlusOne; + Real r = 1/r_I; // It's quite likely we can do everything without r. + Real sintheta_I = rsqrt(1-costheta*costheta); // faster than using cachei // You sure??? in K3 it's the opposite + + Complex ExponentTmp[LMAX]; + ExponentTmp[0] = Exponent; + for (int m = 1; m < LMAX; m++) ExponentTmp[m] = Complex_mul(ExponentTmp[m-1],Exponent); + if (Mode != 2) { + Real xi2 = xi*xi; + Real xi3 = xi2*xi; + dPhiLeft = -0.25*OneOverXiPlusOne; + dPhiLeftMul = 0.25*(1-xi2); + dPhiRight = xi3 - xi2 - xi + 1; + dPhiRightAdd = 2*(xi3 - 2*xi2 + xi); + } + Real Phi0l = 1/(1+r); + Real tmp1 = Phi0l*Phi0l*r; + for (int l = 0; l <= LMAX; l++) { + if (Mode != 2) { + if (l > 0) { + dPhiLeft *= dPhiLeftMul; + dPhiRight += dPhiRightAdd; + } + } + if (Mode != 2) dPhi = dPhiLeft * dPhiRight; + for (int n = 0; n <= NMAX; n++) { + Real Wnl, Wprev1, Wprev2; + if (n == 0) Wnl = 1; + else if (n == 1) {Wnl = (4*l+3)*xi; Wprev2 = Wnl;} + else if (n == 2) {Wnl = -(2*l+1.5)+( 8*l*(l+2) +7.5)*xi*xi; Wprev1 = Wnl;} + else { + Wnl = (xi*(2*n+4*l+1)*Wprev1 - (n+4*l+1)*Wprev2)/(Real)n; + Wprev2 = Wprev1; + Wprev1 = Wnl; + } + + Real Wderiv = 0; + if (n == 1) {Wderiv = 4*l + 3;} + else if (n > 1) { + Wderiv = (-n*xi*Wnl + (n+4*l+2)*Wprev2)/(1-xi*xi); + } // From an unknown reason it's faster to have this Block separate from the previous one. + + Real RadialPart = - SQRT_4_PI * Phi0l * Wnl; + if (Mode != 2) RadialPart2 = SQRT_4_PI * (dPhi*Wnl + Phi0l*Wderiv*2/pow(1+r,2)); + + Real Plm = Pl(l, costheta); + Real tmp2 = Complex_real(A(n,l,0)) * AngCoeff[(l+1)*l/2] * Plm; + if (Mode != 1) Pot += RadialPart * tmp2; + if (Mode != 2) Fr += RadialPart2 * tmp2; + + if (l == 0) continue; + + //////////////////////////////// ugly fix + if ((costheta < -0.999) || (costheta > +0.999)) { + continue; + } + //////////////////////////////// ugly fix + + // The Block below is l>=1, m=0. + if (Mode != 2) { + PlmDerivTheta = (costheta*Plm - Pl(l-1, costheta))*l*sintheta_I; //TODO check if storing Pl(l-1) somewhere makes it faster + Ftheta += - PlmDerivTheta * AngCoeff[(l+1)*l/2] * Complex_real(A(n,l,0)) * RadialPart * r_I; + } + + // The Block below is l>=1, m=1. + if (Mode == 2) PlmDerivTheta = (costheta*Plm - Pl(l-1, costheta))*l*sintheta_I; //TODO see above regarding storing Pl(l-1) + Real Plm_prev1 = Plm; + Plm = PlmDerivTheta; // PlmDerivTheta equals Plm for m=1. + if (Mode != 2) PlmDerivTheta = - Plm*costheta*sintheta_I - l*(l+1)*Plm_prev1; + tmp2 = 2 * AngCoeff[(l+1)*l/2+1]; + Complex tmp3 = Complex_mul(ExponentTmp[0], A(n,l,1)); + Complex tmp4 = make_Complex(tmp2 * tmp3.x, tmp2 * tmp3.y); + Complex tmp5 = make_Complex(Plm * tmp4.x, Plm * tmp4.y); + Complex tmp6 = make_Complex(RadialPart * tmp5.x, RadialPart * tmp5.y); + if (Mode != 1) Pot += Complex_real(tmp6); + if (Mode != 2) { + Fr += RadialPart2 * Complex_real(tmp5); + Fphi += Complex_imag(tmp6) * sintheta_I * r_I; + Ftheta += - RadialPart * PlmDerivTheta * Complex_real(tmp4) * r_I; + } + + if (l == 1) continue; + + for (int m = 2; m <= l; m++) { + Real Plm_prev2 = Plm_prev1; + Plm_prev1 = Plm; + Plm = - 2*(m-1)*costheta*sintheta_I*Plm_prev1 - (l+m-1)*(l-m+2)*Plm_prev2; + tmp2 = 2 * AngCoeff[(l+1)*l/2+m]; + tmp3 = Complex_mul(ExponentTmp[m-1], A(n,l,m)); + tmp4 = make_Complex(tmp2 * tmp3.x, tmp2 * tmp3.y); + tmp5 = make_Complex(Plm * tmp4.x, Plm * tmp4.y); + tmp6 = make_Complex(RadialPart * tmp5.x, RadialPart * tmp5.y); + if (Mode != 1) Pot += Complex_real(tmp6); + if (Mode != 2) { + PlmDerivTheta = - m*Plm*costheta*sintheta_I - (l+m)*(l-m+1)*Plm_prev1; + Fr += RadialPart2 * Complex_real(tmp5); + Fphi += m * Complex_imag(tmp6) * sintheta_I * r_I; + Ftheta += - RadialPart * PlmDerivTheta * Complex_real(tmp4) * r_I; + } + } + } + Phi0l *= tmp1; + } + + if (Mode != 2) { + Real sintheta = 1/sintheta_I; + Real tanphi = Exponent.y/Exponent.x; + Real cosphi = ((Exponent.x >= 0)?(+1):(-1)) * rsqrt(1+tanphi*tanphi); // no simpler way to get sign bit? + Real sinphi = tanphi*cosphi; + *F = vec3(sintheta*cosphi*Fr + costheta*cosphi*Ftheta - sinphi*Fphi, sintheta*sinphi*Fr + costheta*sinphi*Ftheta + cosphi*Fphi, costheta*Fr - sintheta*Ftheta); + *F = (*F)*LengthScale*LengthScale; + } + if (Mode != 1) *Potential = Pot*LengthScale; +#ifndef __CUDA_ARCH__ + #undef AngCoeff + #undef LengthScale +#endif +#undef A +} + +__global__ void etics::scf::CalculateGravityFromCoefficients(int N, Real *Potential, vec3 *F) { // formerly "Kernel4" + int i = threadIdx.x + blockIdx.x * blockDim.x; + while (i < N) { + CalculateGravityTemplate<0>(Cache.xi[i], Cache.costheta[i], Complex_conj(Cache.Exponent[i]), A, &F[i], &Potential[i]); + i += blockDim.x * gridDim.x; + } +} + +// Global initialization +void etics::scf::GlobalInit() { + int pid = getpid(); + int devid; + cudaGetDevice(&devid); + char hostname[256]; + gethostname(hostname, 256); + char output_string[256]; + sprintf(output_string, "ETICS: pid %05d using GPU device number %d on %s", pid, devid, hostname); + std::cout << output_string << std::endl; + + RadialCoefficients(etics::scf::RadCoeff_h); + cudaMemcpyToSymbol(etics::scf::RadCoeff, etics::scf::RadCoeff_h, (NMAX+1)*(LMAX+1) * sizeof(Real)); + AngularCoefficients(etics::scf::AngCoeff_h); + cudaMemcpyToSymbol(etics::scf::AngCoeff, etics::scf::AngCoeff_h, (LMAX+1)*(LMAX+2)/2 * sizeof(Real)); + etics::scf::LengthScale_h = 1; + cudaMemcpyToSymbol(etics::scf::LengthScale, &etics::scf::LengthScale_h, sizeof(Real)); + ScfGloballyInitialized = true; +} + +void etics::scf::SetLengthScale(Real NewLengthScale) { + etics::scf::LengthScale_h = NewLengthScale; + cudaMemcpyToSymbol(etics::scf::LengthScale, &etics::scf::LengthScale_h, sizeof(Real)); +} + +// Old API +void etics::scf::GuessLaunchConfiguration(int N, int *k3gs_new, int *k3bs_new, int *k4gs_new, int *k4bs_new) { + int blockSize; + int minGridSize; + int gridSize; +//WARNING setting blockSizeLimit=128 for cudaOccupancyMaxPotentialBlockSizeVariableSMem. + cudaOccupancyMaxPotentialBlockSizeVariableSMem(&minGridSize, &blockSize, CalculateCoefficientsPartial, blockSizeToDynamicSMemSize, 128); + gridSize = minGridSize; + *k3gs_new = gridSize; + *k3bs_new = blockSize; + + cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, CalculateGravityFromCoefficients, 0, N); + gridSize = (N + blockSize - 1) / blockSize; + *k4gs_new = gridSize; + *k4bs_new = blockSize; +} + +void etics::scf::Init(int N, int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new) { // to be removed!! + SCFObject.Init(N, k3gs_new, k3bs_new, k4gs_new, k4bs_new); +} + +void etics::scf::CalculateGravity(Particle *P, int N, Real *Potential, vec3 *F) { + SCFObject.CalculateGravity(P, N, Potential, F); +} diff --git a/src/scf.hpp b/src/scf.hpp new file mode 100644 index 0000000..87e902d --- /dev/null +++ b/src/scf.hpp @@ -0,0 +1,61 @@ +#pragma once +#include "common.hpp" +#include "mathaux.hpp" +namespace etics { + namespace scf { + // CUDA Kernels and device functions cannot be in class space + __global__ void LoadParticlesToCache(Particle *P, int N, vec3 Offset=vec3(0,0,0)); + __global__ void CalculatePhi0l(int N, int l); + __global__ void CalculateCoefficientsPartial(int N, int n, int l, Complex *PartialSum); + template __host__ __device__ void CalculateGravityTemplate(Real xi, Real costheta, Complex Exponent, Complex *A, vec3 *F, Real *Potential); + __global__ void CalculateGravityFromCoefficients(int N, Real *Potential, vec3 *F); + + void CalculateGravity(Particle *P, int N, Real *Potential, vec3 *F); + void Init(int Nmax, int k3gs_new=0, int k3bs_new=0, int k4gs_new=0, int k4bs_new=0); // to be removed! + void GlobalInit(); + void GuessLaunchConfiguration(int N, int *k3gs_new, int *k3bs_new, int *k4gs_new, int *k4bs_new); + void SetLengthScale(Real NewLengthScale); + + struct CacheStruct { + Real *xi; + Real *Phi0l; + Real *Wprev1; + Real *Wprev2; + Real *costheta; + Real *sintheta_I; + Complex *Exponent; + Real *mass; + }; + + class scfclass { + public: + scfclass(); + ~scfclass(); + void SendCachePointersToGPU(); + void SendCoeffsToGPU(); + void LoadParticlesToCache(Particle *P, int N, vec3 Offset=vec3(0,0,0)); + void CalculateCoefficients(); + void CalculateGravityFromCoefficients(Real *Potential, vec3 *F); + void CalculateGravity(Particle *P, int N, Real *Potential, vec3 *F); + void CalculateGravityAtPoint(vec3 Pos, Real *Potential, vec3 *F); + void Init(int N, int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new); + void SetLaunchConfiguration(int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new); + void GetCoefficients(Complex *A); + void GetGpuLock(); + void ReleaseGpuLock(); +//WARNING: the following two functions are not implemented but they should be + int GetNMAX(); + int GetLMAX(); +//TODO We'll make them private in the final version, now we need access for debugging + int N; + int Nmax; +//WARNING we just hardcode 1024 as the maximum size of the coefficient structure. This is because having NMAX and LMAX here causes problems. + Complex A_h[1024]; + CacheStruct Cache_h; + Complex *PartialSum = NULL; + Complex *PartialSum_h = NULL; + int k3gs, k3bs, k4gs, k4bs; + void CalculateCoefficients(int n, int l); + }; + } +}