added project files

This commit is contained in:
Yohai 2019-08-25 16:08:44 +08:00
commit 1718e6287b
24 changed files with 3045 additions and 0 deletions

49
Makefile Normal file
View file

@ -0,0 +1,49 @@
# standard amuse configuration include
# config.mk will be made after ./configure has run
AMUSE_DIR?=../../../..
-include $(AMUSE_DIR)/config.mk
GPUARCH ?= 60
CUDAFLAGS += -arch=sm_$(GPUARCH)
######### ugly!!!! should be in the amuse base make file
MPICXX ?= mpicxx
CFLAGS += -Wall -g
CXXFLAGS += $(CFLAGS)
LDFLAGS += -lm $(MUSE_LD_FLAGS)
OBJS = interface.o
CODELIB = src/libetics.a
CODE_GENERATOR = $(AMUSE_DIR)/build.py
all: etics_worker
clean:
$(RM) *.so *.o *.pyc worker_code.cc worker_code.h
$(RM) *~ etics_worker worker_code.cc
make -C src clean
$(CODELIB):
make -C src libetics.a
worker_code.cc: interface.py
$(CODE_GENERATOR) --type=c interface.py EticsInterface -o $@
worker_code.h: interface.py
$(CODE_GENERATOR) --type=H interface.py EticsInterface -o $@
etics_worker: worker_code.cc worker_code.h $(CODELIB) $(OBJS)
$(MPICXX) $(CXXFLAGS) -L/usr/local/cuda/lib64 $< $(OBJS) $(CODELIB) -o $@ -lcudart
echo the above compilation directive is not nice
.SUFFIXES: .cu .o
.cu.o: $<
$(NVCC) -Xcompiler="$(CXXFLAGS)" -c -o $@ $<
# .cc.o: $<
# $(CXX) $(CXXFLAGS) -c -o $@ $<

102
README.md Normal file
View file

@ -0,0 +1,102 @@
ETICS
=====
This is ETICS (Expansion Techniques In Collisionless Systems), a GPU (currently
**CUDA only**) N-body code which uses series expansion to calculate the
gravitational field. See more details in this publication:
Meiron, Y., Li, B., Holley-Bockelmann, K., & Spurzem, R. 2014, ApJ, 792, 98
http://adsabs.harvard.edu/abs/2014ApJ...792...98M
What's inside
-------------
- ETICS standalone program
- ETICS static library
- ETICS module for AMUSE
Prerequisites
-------------
- CUDA (>= 6; mandatory)
- HDF5 (optional)
- Boost (optional)
- AMUSE (mandatory only for AMUSE module)
To disable HDF5 [insert explanation here]
To disable Boost [insert explanation here]
Compilation
-----------
### Standalone program
make standalone
builds in `src/`.
### Static library
make library
builds in `src/`.
### AMUSE module
make
builds in top level directory. The whole `etics` directory has to be placed (or
linked) inside:
$AMUSE_DIR/src/amuse/community/
How to use
----------
The `file.ini` is a self-explanatory input file; if compiled without Boost, fill
in the relevant variables in file `noboost.inc` which is compiled into the
executable (any change of parameters requires re-compilation). Start simulation
with:
./etics file.ini
Any input file name is acceptable.
Known issues
------------
* No MEX
The MEX (Multipole Expansion) method is not available in this version; thus, the
SCF (Self-Consistent Field) method is the only expansion technique available.
The ETICS program has been heavily restructured and the MEX routines are no
longer compatible. Hopefully this will be fixed.
* Hardcoded launch configuration
For at least one of the CUDA kernels, for various reasons, it seems that "brute
force" search is needed to find the optimal launch configuration. Currently it
is hardcoded, and a primitive search routine is provided.
* Problem for particles with |θ| << 1
Due to using an unstable recursion relation to calculate the Associated Legendre
polynomials, particles in a narrow cone around the z-axis cannot be considered
accurately. This means that they give an erroneous contribution to the
gravitational field and also are assigned erroneous force and potential. The
size of this cone increases with the angular term of the expansion. To partly
solve this, the current (ugly) fix is to only consider particles with cos(θ) >
0.999 at the monopole level. This is not so bad because the monopole is always
the most dominant term (and is error free) and the number of particles in this
cone is small and they are transient (i.e. they come out of it usually in a
small number of steps). A somewhat better solution is to make this arbitrary
cutoff of 0.999 l-dependent, and an even better solution would be to use an
asymptotic expression for the Associated Legendre polynomials.

26
WORKPLAN.md Normal file
View file

@ -0,0 +1,26 @@
The idea is to be able to handle mergers and sinking galactic satellites by
having each subsystem centered on one (massless?) particle, and have the
expansion calculated separately in each center of mass system (only for
particles belonging to it). After the internal forces are calculated for each
subsystem, the mutual forces are calculated as well, and all the particles are
integrated in a global coordinate system.
Technically, most of the SCF module has to become a class. Some of the arrays
like RadCoeff and AngCoeff (anything else?) should remain global and would be
initialized by some global initialization routine. Then each subsystem would be
a different instance of the SCF class, with its own list of coefficients and
particle cache. It seems that all subsystems can (should?) share the integrator
object. Maybe (probably?) the integrator should be agnostic to how particles are
divided into subsystems, and the call to CalculateGravity() would take care of
everything. In this case we are talking about a class for the current SCF
routines but also some wrapper that knows how to CalculateGravity() by calling
the appropriate methods for each subsystem's object, and also work out the
mutual gravity. Need a good name for this intermediate level between the
integrator and the SCF core.
This intermediate level or even a different class would deal how to associate
particles to the different subsystems and how to figure out the initial
conditions. Also, the central particles might need some special treatment
(possibly the integrator considers them normal particles but this other level
knows not to use them for force calculation; I should think whether those should
be omitted from output, but it might be helpful to keep them).

1
__init__.py Normal file
View file

@ -0,0 +1 @@
# generated file

126
format-convert.py Normal file
View file

@ -0,0 +1,126 @@
#!/usr/bin/env python
# This script converts phiGRAPE snapshot files to HDF5.
# Author: Yohai Meiron
import argparse, os, sys, numpy, re
# Set commandline parameters
parser = argparse.ArgumentParser(description='Convert phiGRPAE snapshot files into HDF5 (h5part) format.')
parser.add_argument('InputFile', metavar='InputFile', type=str, help='first input file (numbers in the file name must represet numerical order)')
parser.add_argument('--num', type=int, help='number of snapshot to convert (by default, all are converted)')
parser.add_argument('-o', '--output', default='output', help='output file name (or pattern)')
parser.add_argument('--particles', type=int, help='number of particles (if not the whole thing)')
parser.add_argument('--maxfs', type=float, help='maximum size in MB of a single HDF5 output')
parser.add_argument('--single', action='store_true', help='use single precision instead of double')
parser.add_argument('--bh', type=int, nargs='?', const=1, default=0, help='number of black holes to grab from the end of the file')
# Print message and parse input
print 'Welcome to Yohai\'s format converter! (phiGRAPE ---> HDF5)\n'
args = parser.parse_args()
# Parameter conflict tests
if (args.bh > 0) and (args.particles==None):
parser.print_usage()
print '%s: error: cannot --bh without --particles' % sys.argv[0]
sys.exit(2)
Nbh = args.bh
# Verify h5py
try:
import h5py
except:
print '%s: error: booooo, can\'t find h5py.' % sys.argv[0]
sys.exit(1)
# Check if input file exists, separate name from path and get file name pattern
if not os.path.exists(args.InputFile):
print 'File `%s` not found!' % args.InputFile
sys.exit(1)
InputFileName = os.path.basename(args.InputFile)
InputFilePath = os.path.dirname(os.path.realpath(args.InputFile))
Pattern = re.sub(r'\d', r'\d', InputFileName)
Pattern = '^' + Pattern + "$"
r = re.compile(Pattern)
# Look for the other files
ListDir = os.listdir(InputFilePath)
FileList = []
for Filename in ListDir:
m = r.search(Filename)
if not m is None:
FileList.append(Filename)
FileList.sort()
FileList = FileList[FileList.index(InputFileName):]
# Verify that enough files were found; shorten list if needed
if (not args.num==None) and args.num > len(FileList):
print 'Requested to convert %d files, but only %d found!' % (args.num, len(FileList))
sys.exit(1)
if (not args.num==None): num = args.num
else: num = len(FileList)
FileList = FileList[:num]
print 'Will convert %d files in the directory `%s`.' % (num, InputFilePath)
NumperOfStepsPerFile = 2147483647
OutputFileNum = 2
StepCounter = 0
FirstTime = True
for Filename in FileList:
if StepCounter > NumperOfStepsPerFile:
OutputFile.close()
OutputFile = h5py.File('output(%d).h5part' % OutputFileNum, 'w')
StepCounter = 0
OutputFileNum += 1
FullFilename = os.path.join(InputFilePath, Filename)
with open(FullFilename, 'r') as File:
Line = File.readline()
try:
StepNumer = int(Line)
except:
print 'Error to read snapshot number (line 1 in file `%s`)' % Filename
sys.exit(1)
File.readline()
Line = File.readline()
try:
Time = numpy.double(Line)
except:
print 'Error to read snapshot time (line 3 in file `%s`)' % Filename
sys.exit(1)
Data = numpy.loadtxt(FullFilename, skiprows=3)
if (not args.particles==None) and (args.particles > Data.shape[0]):
print 'Requested to use %d paticles, but file %s has only %d!' % (args.particles, Filename, Data.shape[0])
sys.exit(1)
if (not args.particles==None): N = args.particles
else: N = Data.shape[0]
if FirstTime:
if Nbh == 0: print 'Reading %d particles.' % (N)
else: print 'Reading %d normal particles + %d black hole(s).' % (N, Nbh)
if args.single: EstimatedDataSize = 32*N/1024.0**2
else: EstimatedDataSize = 60*N/1024.0**2
if not args.maxfs==None: NumperOfStepsPerFile = (int)(args.maxfs/EstimatedDataSize)
FirstTime = False
if NumperOfStepsPerFile > num: OutputFile = h5py.File('output.h5part', 'w')
else: OutputFile = h5py.File('output(1).h5part', 'w')
DataBH = Data[-Nbh-1:-1]
Data = Data[:N]
Data = numpy.vstack((Data, DataBH))
Group = OutputFile.create_group('Step#%d' % (StepNumer))
Group.attrs['Time'] = Time
Group.attrs['TotalN'] = N + Nbh
if args.single: T='f'
else: T='d'
DataSet = Group.create_dataset('ID', (N+Nbh,), dtype='i'); DataSet[...] = Data[:,0]
DataSet = Group.create_dataset('Mass', (N+Nbh,), dtype=T); DataSet[...] = Data[:,1]
DataSet = Group.create_dataset('X', (N+Nbh,), dtype=T); DataSet[...] = Data[:,2]
DataSet = Group.create_dataset('Y', (N+Nbh,), dtype=T); DataSet[...] = Data[:,3]
DataSet = Group.create_dataset('Z', (N+Nbh,), dtype=T); DataSet[...] = Data[:,4]
DataSet = Group.create_dataset('VX', (N+Nbh,), dtype=T); DataSet[...] = Data[:,5]
DataSet = Group.create_dataset('VY', (N+Nbh,), dtype=T); DataSet[...] = Data[:,6]
DataSet = Group.create_dataset('VZ', (N+Nbh,), dtype=T); DataSet[...] = Data[:,7]
OutputFile.flush()
print 'Completed /Step#%d' % (StepNumer)
StepCounter += 1
OutputFile.close()

378
interface.cu Normal file
View file

@ -0,0 +1,378 @@
#include <iostream>
#include "src/common.hpp"
#include "src/scf.hpp"
#include "src/integrate.hpp"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/inner_product.h>
using namespace std;
using namespace etics;
Integrator IntegratorObj;
// GLOBAL VARIABLES
Real ConstantStep = 0.001953125;
Real T, Step, dT1, dT2, Tcrit;
int N;
extern Real mass;
extern int k3gs, k3bs, k4gs, k4bs;
/*extern*/ Particle *hostP;
thrust::host_vector<Particle> PPP;
/*extern*/ thrust::device_vector<Particle> PPPPP;
// /*extern*/ thrust::device_vector<vec3> F0xxxxxx;
// /*extern*/ thrust::device_vector<Real> PotPotPot; // ugly name
// /*extern*/ thrust::device_vector<vec3> F1xxxxxx;
/*extern*/ vec3 *F1_ptr;
// extern Particle *P_h;
// extern thrust::device_vector<Particle> P;
//
// extern thrust::device_vector<vec3> F0;
// extern thrust::device_vector<Real> Potential;
// extern thrust::device_vector<vec3> F1;
void CommitParticles();
// void InitSCF(int N);
// void ForceSCF(int N, Real *Potential, Particle *PPPPP, vec3 *F);
void DriftStep();
void KickStep();
void CommitForces();
int InitilizeIntegratorMemory();
#define PTR(x) (thrust::raw_pointer_cast((x).data()))
int initialize_code() {
#warning initscf should be here!!! just the problem is that N is requires, so fix it
return 0;
}
int recommit_parameters() {
return 0;
}
int commit_parameters() {
return 0;
}
int new_particle(int *id, double mass, double x, double y, double z, double vx, double vy, double vz, double radius) {
Particle p;
p.m = mass;
p.pos = vec3(x, y, z);
p.vel = vec3(vx, vy, vz);
PPP.push_back(p);
*id = N;
N++;
return 0;
}
int commit_particles() {
// cerr << "calling commit_particles" << endl;
cerr << "we did commit_particles()" << endl;
etics::scf::Init(N, 180, 64, 2605, 384);
#warning hardcoded launch configuration
IntegratorObj = Integrator(&PPP[0], N);
return 0;
}
struct CenterMassFunctor {
Real ConstantMass;
__host__ __device__ CenterMassFunctor(Real _ConstantMass) : ConstantMass(_ConstantMass) {}
__host__ __device__ vec3 operator() (const Particle &p) const {return p.pos;}
};
struct ShiftFunctor {
vec3 Shift;
__host__ __device__ ShiftFunctor(vec3 _Shift) : Shift(_Shift) {}
__host__ __device__ Particle operator() (Particle &p) const {
p.pos += Shift;
p.CalculateR2();
return p;
}
};
int counttt = 0;
bool FirstStep = true;
int evolve_model(double t) {
// PPPPP = PPP;
cerr << "call evolve_model t_end = " << t << " dt = " << t - T << "****************" << endl;
// vec3 CenterMass = thrust::transform_reduce(PPPPP.begin(), PPPPP.end(), CenterMassFunctor(mass), vec3(0,0,0), thrust::plus<vec3>());
// CenterMass = CenterMass * (1.0/N); //ugly should divide by the total mass
// cerr << "CENTER OF MASS " << CenterMass.x << endl;
//
// // thrust::transform(PPPPP.begin(), PPPPP.end(), PPPPP.begin(), ShiftFunctor(-CenterMass));
//
// vec3 CenterMass2 = thrust::transform_reduce(PPPPP.begin(), PPPPP.end(), CenterMassFunctor(mass), vec3(0,0,0), thrust::plus<vec3>());
// CenterMass2 = CenterMass2 * (1.0/N); //ugly should divide by the total mass
// cerr << "CENTER OF MASS after correction " << CenterMass2.x << endl;
//
Step = ConstantStep;
while (T <= t) {
// Take the drift step.
IntegratorObj.DriftStep(Step);
// Calculate the forces in the new positions.
// ForceSCF(N, PTR(PotPotPot), PTR(PPPPP), PTR(F1xxxxxx));
IntegratorObj.CalculateGravity();
// Finish by taking the kick step.
// The kick functor also "commits" the predicted forces into the "acc" member.
IntegratorObj.KickStep(Step);
// N particles were implicitly propagated in this iteration.
// Advance global time.
T += Step;
}
//
// vec3 CenterMass3 = thrust::transform_reduce(PPPPP.begin(), PPPPP.end(), CenterMassFunctor(mass), vec3(0,0,0), thrust::plus<vec3>());
// CenterMass3 = CenterMass3 * (1.0/N); //ugly should divide by the total mass
// cerr << "CENTER OF MASS after evolve " << CenterMass3.x << endl;
//
// cerr << "done evolve; transform" << endl;
// // thrust::transform(PPPPP.begin(), PPPPP.end(), PPPPP.begin(), ShiftFunctor(+CenterMass)); // antishift
//
// vec3 CenterMass4 = thrust::transform_reduce(PPPPP.begin(), PPPPP.end(), CenterMassFunctor(mass), vec3(0,0,0), thrust::plus<vec3>());
// CenterMass4 = CenterMass4 * (1.0/N); //ugly should divide by the total mass
// cerr << "CENTER OF MASS after antishift " << CenterMass4.x << endl;
//
// cerr << "done transform; download to RAM" << endl;
IntegratorObj.CopyParticlesToHost(&PPP[0]);
//
// cerr << "done download; return" << endl;
return 0;
}
int set_begin_time(double time_begin) {
// cerr << "called set_begin_time(" << time_begin << endl;
return 0;
}
int get_begin_time(double *time_begin) {
*time_begin = 0;
return 0;
}
int get_mass(int index_of_the_particle, double *mass) {
*mass = PPP[index_of_the_particle].m;
return 0;
}
int get_time(double *time) {
*time = T;
return 0;
}
int set_mass(int index_of_the_particle, double mass) {
// cerr << "calling set_mass" << endl;
PPP[index_of_the_particle].m = mass;
return 0;
}
int get_index_of_first_particle(int *index_of_the_particle) {
// cerr << "calling get_index_of_first_particle" << endl;
*index_of_the_particle = 0;
return 0;
}
int get_total_radius(double *radius) {
return -2;
}
int get_potential_at_point(double soft, double x, double y, double z, double *phi) {
return -2;
}
int get_total_mass(double *mass) {
return -2;
}
int set_eps2(double epsilon_squared) {
return -1;
}
int get_eps2(double *epsilon_squared) {
*epsilon_squared = 0;
return -1;
}
int get_number_of_particles(int *number_of_particles) {
// cerr << "calling get_number_of_particles" << endl;
*number_of_particles = PPP.size();
return 0;
}
int get_index_of_next_particle(int index_of_the_particle, int *index_of_the_next_particle) {
*index_of_the_next_particle = index_of_the_particle + 1;
return 0;
}
int delete_particle(int index_of_the_particle) {
return -2;
}
int get_potential(int index_of_the_particle, double *potential) {
return -2;
}
int synchronize_model() {
// cerr << "calling synchronize_model" << endl;
return 0;
}
int set_state(int index_of_the_particle, double mass, double radius, double x, double y, double z, double vx, double vy, double vz) {
cerr << "calling set_state" << endl;
// cerr << "calling set_state" << endl;
PPP[index_of_the_particle].pos = vec3(x, y, z);
PPP[index_of_the_particle].vel = vec3(vx, vy, vz);
return 0;
}
int get_state(int index_of_the_particle, double *mass, double *radius, double *x, double *y, double *z, double *vx, double *vy, double *vz) {
// cerr << "calling get_state" << endl;
Particle p = PPP[index_of_the_particle];
*mass = index_of_the_particle;
*x = p.pos.x;
*y = p.pos.y;
*z = p.pos.z;
*vx = p.vel.x;
*vy = p.vel.y;
*vz = p.vel.z;
return 0;
}
int get_time_step(double *time_step) {
// cerr << "calling get_time_step" << endl;
*time_step = ConstantStep;
return 0;
}
int set_time_step(double time_step) {
cerr << "calling set_time_step" << endl;
ConstantStep = time_step;
return 0;
}
int get_launch_config(int **launch_config) {
return 0;
}
int set_launch_config(int *launch_config) {
// k3gs = launch_config[0];
// k3bs = launch_config[1];
// k4gs = launch_config[2];
// k4bs = launch_config[3];
return -2;
}
int recommit_particles() {
// cerr << "calling recommit_particles" << endl;
#warning put something here
cerr << "hhhhhhhhhhhhhhhhhhhhhhhhhhhhhh" << endl;
PPPPP = PPP;
return -2;
}
int set_acceleration(int index_of_the_particle, double ax, double ay, double az) {
return -2;
}
int get_center_of_mass_position(double *x, double *y, double *z) {
// vec3 CenterMass = thrust::transform_reduce(PPPPP.begin(), PPPPP.end(), CenterMassFunctor(mass), vec3(0,0,0), thrust::plus<vec3>());
// CenterMass = CenterMass * (1.0/N); //ugly should divide by the total mass
// *x = CenterMass.x;
// *y = CenterMass.y;
// *z = CenterMass.z;
return 0;
}
int get_center_of_mass_velocity(double *vx, double *vy, double *vz) {
return -2;
}
int get_radius(int index_of_the_particle, double *radius) {
*radius = 0;
return 0;
}
int set_radius(int index_of_the_particle, double radius) {
// should store the radius somewhere but completely ignored by code
// cerr << "calling set_radius" << endl;
return 0;
}
int cleanup_code() {
IntegratorObj.~Integrator();
cerr << "bye" << endl;
return 0;
}
int get_gravity_at_point(double soft, double x, double y, double z, double *forcex, double *forcey, double *forcez) {
return -2;
}
int get_velocity(int index_of_the_particle, double *vx, double *vy, double *vz) {
*vx = PPP[index_of_the_particle].vel.x;
*vy = PPP[index_of_the_particle].vel.y;
*vz = PPP[index_of_the_particle].vel.z;
return 0;
}
int get_position(int index_of_the_particle, double *x, double *y, double *z) {
*x = PPP[index_of_the_particle].pos.x;
*y = PPP[index_of_the_particle].pos.y;
*z = PPP[index_of_the_particle].pos.z;
return 0;
}
bool already_printed = false;
int set_position(int index_of_the_particle, double x, double y, double z) {
if (already_printed == false) {
cerr << "calling set_position" << endl;
cerr << "---------index_of_the_particle=" << index_of_the_particle << endl;
cerr << "--------- x" << PPP[index_of_the_particle].pos.x << "--->" << x << endl;
cerr << "--------- y" << PPP[index_of_the_particle].pos.y << "--->" << y << endl;
cerr << "--------- z" << PPP[index_of_the_particle].pos.z << "--->" << z << endl;
already_printed = true;
}
PPP[index_of_the_particle].pos = vec3(x, y, z);
counttt++;
return 0;
}
int get_acceleration(int index_of_the_particle, double *ax, double *ay, double *az) {
return -2;
}
int set_velocity(int index_of_the_particle, double vx, double vy, double vz) {
// cerr << "calling set_velocity" << endl;
PPP[index_of_the_particle].vel = vec3(vx, vy, vz);
return 0;
}
int get_kinetic_energy(double *kinetic_energy) {
*kinetic_energy = IntegratorObj.KineticEnergy();
return 0;
}
int get_potential_energy(double *potential_energy) {
*potential_energy = IntegratorObj.PotentialEnergy();
return 0;
}
int update_force_potential_arrays(double tttt) {
#warning time shouldnt be a parameter to this one
// ForceSCF(N, PTR(PotPotPot), PTR(PPPPP), PTR(F0xxxxxx));
return 0;
}

160
interface.py Normal file
View file

@ -0,0 +1,160 @@
from amuse.community import *
from amuse.community.interface.gd import GravitationalDynamics
from amuse.community.interface.gd import GravitationalDynamicsInterface
class EticsInterface(CodeInterface, GravitationalDynamicsInterface, LiteratureReferencesMixIn):
"""
.. [#] Meiron, Y., Li, B., Holley-Bockelmann, K., & Spurzem, R. 2014, ApJ, 792, 98:
.. [#] ... "Expansion techniques for collisionless stellar dynamical simulations"
"""
include_headers = ['worker_code.h']
def __init__(self, **keyword_arguments):
CodeInterface.__init__(self, name_of_the_worker='etics_worker', **keyword_arguments)
LiteratureReferencesMixIn.__init__(self)
@legacy_function
def new_particle():
function = LegacyFunctionSpecification()
function.can_handle_array = True
function.addParameter('index_of_the_particle', dtype='int32', direction=function.OUT)
function.addParameter('mass', dtype='float64', direction=function.IN,
description = 'The mass of the particle')
function.addParameter('x', dtype='float64', direction=function.IN,
description = 'The initial position vector of the particle')
function.addParameter('y', dtype='float64', direction=function.IN,
description = 'The initial position vector of the particle')
function.addParameter('z', dtype='float64', direction=function.IN,
description = 'The initial position vector of the particle')
function.addParameter('vx', dtype='float64', direction=function.IN,
description = 'The initial velocity vector of the particle')
function.addParameter('vy', dtype='float64', direction=function.IN,
description = 'The initial velocity vector of the particle')
function.addParameter('vz', dtype='float64', direction=function.IN,
description = 'The initial velocity vector of the particle')
function.addParameter('radius', dtype='float64', direction=function.IN,
description = 'The radius of the particle', default = 0)
function.result_type = 'int32'
return function
@legacy_function
def set_state():
function = LegacyFunctionSpecification()
function.can_handle_array = True
function.addParameter('index_of_the_particle', dtype='int32', direction=function.IN)
function.addParameter('mass', dtype='float64', direction=function.IN,
description = 'The mass of the particle')
function.addParameter('radius', dtype='float64', direction=function.IN,
description = 'The radius of the particle')
function.addParameter('x', dtype='float64', direction=function.IN,
description = 'The initial position vector of the particle')
function.addParameter('y', dtype='float64', direction=function.IN,
description = 'The initial position vector of the particle')
function.addParameter('z', dtype='float64', direction=function.IN,
description = 'The initial position vector of the particle')
function.addParameter('vx', dtype='float64', direction=function.IN,
description = 'The initial velocity vector of the particle')
function.addParameter('vy', dtype='float64', direction=function.IN,
description = 'The initial velocity vector of the particle')
function.addParameter('vz', dtype='float64', direction=function.IN,
description = 'The initial velocity vector of the particle')
function.result_type = 'int32'
return function
@legacy_function
def get_state():
function = LegacyFunctionSpecification()
function.can_handle_array = True
function.addParameter('index_of_the_particle', dtype='int32', direction=function.IN)
function.addParameter('mass', dtype='float64', direction=function.OUT,
description = 'The mass of the particle')
function.addParameter('radius', dtype='float64', direction=function.OUT,
description = 'The radius of the particle')
function.addParameter('x', dtype='float64', direction=function.OUT,
description = 'The initial position vector of the particle')
function.addParameter('y', dtype='float64', direction=function.OUT,
description = 'The initial position vector of the particle')
function.addParameter('z', dtype='float64', direction=function.OUT,
description = 'The initial position vector of the particle')
function.addParameter('vx', dtype='float64', direction=function.OUT,
description = 'The initial velocity vector of the particle')
function.addParameter('vy', dtype='float64', direction=function.OUT,
description = 'The initial velocity vector of the particle')
function.addParameter('vz', dtype='float64', direction=function.OUT,
description = 'The initial velocity vector of the particle')
function.result_type = 'int32'
return function
@legacy_function
def evolve_model():
function = LegacyFunctionSpecification()
function.can_handle_array = True
function.addParameter('dt', dtype='float64', direction=function.IN)
function.result_type = 'int32'
return function
@legacy_function
def get_number_of_particles():
function = LegacyFunctionSpecification()
function.addParameter('number_of_particles', dtype='int32', direction=function.OUT, description = 'number of particles')
function.result_type = 'int32'
return function
@legacy_function
def set_time_step():
function = LegacyFunctionSpecification()
function.addParameter('time_step', dtype='float64', direction=function.IN, description = 'time step')
function.result_type = 'int32'
return function
@legacy_function
def update_force_potential_arrays():
function = LegacyFunctionSpecification()
function.can_handle_array = True
function.addParameter('dt', dtype='float64', direction=function.IN)
function.result_type = 'int32'
return function
class Etics(GravitationalDynamics):
def __init__(self, convert_nbody = None, **keyword_arguments):
legacy_interface = EticsInterface(**keyword_arguments)
GravitationalDynamics.__init__(self, legacy_interface, convert_nbody, **keyword_arguments)
def define_parameters(self, object):
object.add_method_parameter(
'get_begin_time',
'set_begin_time',
'begin_time',
'model time to start the simulation at',
default_value = 0.0 | nbody_system.time
)
object.add_method_parameter(
'get_time_step',
'set_time_step',
'time_step',
'constant timestep for iteration',
default_value = 0.001953125 | nbody_system.time
)
def define_methods(self, object):
GravitationalDynamics.define_methods(self, object)
# Define some shortcuts for better readability.
M = nbody_system.mass
L = nbody_system.length
V = nbody_system.speed
T = nbody_system.time
object.add_method('new_particle', (M,L,L,L,V,V,V,L), (object.INDEX, object.ERROR_CODE))
object.add_method('set_state', (object.INDEX, M,L,L,L,L,V,V,V), (object.ERROR_CODE))
object.add_method('get_state', (object.INDEX), (M,L,L,L,L,V,V,V, object.ERROR_CODE))
object.add_method('set_time_begin', (T), (object.ERROR_CODE))
object.add_method('get_time_begin', (), (T, object.ERROR_CODE))
object.add_method('get_number_of_particles', (), (units.none, object.ERROR_CODE))
object.add_method('get_time_step', (), (T, object.ERROR_CODE))
object.add_method('set_time_step', (T), (object.ERROR_CODE))
object.add_method('update_force_potential_arrays', (T), (object.ERROR_CODE))
def define_particle_sets(self, object):
GravitationalDynamics.define_particle_sets(self, object)

72
src/Makefile Normal file
View file

@ -0,0 +1,72 @@
ETICS_LMAX ?= 2
ETICS_NMAX ?= 10
GPUARCH ?= sm_75
CUDAHOME ?= /usr/local/cuda
OPTIMIZATION ?= 3
#Optional CPPFLAGS: -DETICS_HDF5 -DETICS_BOOST -DETICS_MPI
#Note that -DETICS_MPI is /not/ needed if ETICS is used as a library
CPPFLAGS = -DSCF -DLMAX=$(ETICS_LMAX) -DNMAX=$(ETICS_NMAX)
CXX ?= g++
NVCC ?= $(CUDAHOME)/bin/nvcc
LIB += -lm -L$(CUDAHOME)/lib64 -lcudadevrt -lcudart -lcuda
CXXFLAGS += -O$(OPTIMIZATION)
CUDAFLAGS += -arch=$(GPUARCH)
# If -DETICS_HDF5 is used, please specify the following
#H5HOME = /home/meiron/local
#INC += -I$(H5HOME)/include
#LIB += -L$(H5HOME)/lib $(H5HOME)/lib/libhdf5_hl_cpp.a $(H5HOME)/lib/libhdf5_cpp.a $(H5HOME)/lib/libhdf5_hl.a $(H5HOME)/lib/libhdf5.a -lz -ldl -lm
#LDFLAGS += -Wl,-rpath -Wl,$(H5HOME)/lib
# If -DETICS_BOOST is used, please specify the following
#BOOSTHOME ?= /home/meiron/local
#INC += -I$(BOOSTHOME)/include
# If -DETICS_MPI is used, please specify the following
#MPIHOME ?= /home/meiron/local
#INC += -I$(MPIHOME)/include
#LIB += -L$(MPIHOME)/lib -lmpi
#CXXFLAGS += -pthread
#LDFLAGS += -Wl,-rpath -Wl,$(MPIHOME) -Wl,--enable-new-dtags
CODELIB = libetics.a
# object files needed for the library and the standalone
CODEOBJS1 = integrate.o mathaux.o scf.o ic.o
# object files only needed for the standalone
CODEOBJS2 = io.o main.o
AR = ar ruv
RANLIB = ranlib
all: $(CODELIB) standalone
clean:
$(RM) *.o *.a etics
$(CODELIB): $(CODEOBJS1)
$(NVCC) $(CUDAFLAGS) -dlink $(CODEOBJS1) -o dlink.o
$(AR) $@ $(CODEOBJS1) dlink.o
$(RANLIB) $@
library: $(CODELIB)
standalone: $(CODEOBJS1) $(CODEOBJS2)
echo -e "\e[31mWARNING! \e[0mThe standalone program has not been tested for a long time. Use extreme caution."
$(NVCC) $(CUDAFLAGS) -dlink $(CODEOBJS1) $(CODEOBJS2) -o dlink.o
$(CXX) $(CXXFLAGS) -o etics $(CODEOBJS1) $(CODEOBJS2) dlink.o $(LIB) $(LDFLAGS)
.SUFFIXES: .cu .cpp .o
# only io.cpp should be compiled with all the weird HDF5 flags
io.o: io.cpp
$(CXX) $(CPPFLAGS) $(INC) $(CXXFLAGS) -c -o io.o io.cpp
.cu.o: $<
$(NVCC) $(CPPFLAGS) $(INC) -Xcompiler "$(CXXFLAGS)" $(CUDAFLAGS) -dc -o $@ $<
.cpp.o: $<
$(CXX) $(CPPFLAGS) $(INC) $(CXXFLAGS) -c -o $@ $<

117
src/common.hpp Normal file
View file

@ -0,0 +1,117 @@
#pragma once
#include <string>
#include <time.h>
#ifndef __CUDACC__
#define __host__
#define __device__
#endif
#if defined ETICS_SINGLE_PRECISION && defined ETICS_DOUBLE_PRECISION
#error Contradictory precision flags!
#endif
#ifndef ETICS_SINGLE_PRECISION
#define ETICS_DOUBLE_PRECISION
#define Real double
#define MPI_ETICS_REAL MPI_DOUBLE
#else
#define Real float
#define MPI_ETICS_REAL MPI_FLOAT
#endif
#if defined MEX && defined SCF
#error Contradictory method flags!
#endif
#if defined MEX && !defined LMAX
#error LMAX not defined.
#endif
#if defined SCF && ((!defined NMAX) || (!defined LMAX))
#error Both NMAX and LMAX should be defined!
#endif
#if defined MEX && defined NMAX
#warning NMAX is defined, but will be ignored since the method is MEX!
#endif
struct vec3 {
Real x, y, z;
__host__ __device__ vec3() : x(0), y(0), z(0) {}
__host__ __device__ vec3(Real _x, Real _y, Real _z) : x(_x), y(_y), z(_z) {}
__host__ __device__ Real abs2() const {return x*x + y*y + z*z;}
__host__ __device__ vec3 operator+ (const vec3& V) const {return vec3(this->x + V.x, this->y + V.y, this->z + V.z);}
__host__ __device__ vec3 operator- (const vec3& V) const {return vec3(this->x - V.x, this->y - V.y, this->z - V.z);}
__host__ __device__ vec3 operator* (const Real& C) const {return vec3(this->x*C, this->y*C, this->z*C);}
__host__ __device__ vec3& operator+= (const vec3& V) {this->x += V.x; this->y += V.y; this->z += V.z; return *this;}
__host__ __device__ vec3& operator-= (const vec3& V) {this->x -= V.x; this->y -= V.y; this->z -= V.z; return *this;}
__host__ __device__ vec3& operator/= (const Real& C) {this->x /= C; this->y /= C; this->z /= C; return *this;}
__host__ __device__ vec3 operator- () const {return vec3(-this->x, -this->y, -this->z);}
__host__ __device__ vec3 operator+ () const {return vec3(this->x, this->y, this->z);}
};
class Particle {
public:
int ID;
unsigned char Status;
Real m;
vec3 pos, vel, acc;
Real R2;
__host__ __device__ Particle() {}
__host__ __device__ void CalculateR2() {R2 = pos.abs2();}
__host__ __device__ bool operator< (const Particle& p) const {return (this->R2 < p.R2);}
};
#define ETICS_MY_CLOCK CLOCK_MONOTONIC
class HostTimer {
timespec StartTime, StopTime;
public:
void Start() {
clock_gettime(ETICS_MY_CLOCK, &StartTime);
}
void Stop() {
clock_gettime(ETICS_MY_CLOCK, &StopTime);
}
double Difference() {
timespec TimeDiff;
if ((StopTime.tv_nsec - StartTime.tv_nsec) < 0) {
TimeDiff.tv_sec = StopTime.tv_sec - StartTime.tv_sec - 1;
TimeDiff.tv_nsec = 1000000000 + StopTime.tv_nsec - StartTime.tv_nsec;
} else {
TimeDiff.tv_sec = StopTime.tv_sec - StartTime.tv_sec;
TimeDiff.tv_nsec = StopTime.tv_nsec - StartTime.tv_nsec;
}
return (double)TimeDiff.tv_sec + ((double)TimeDiff.tv_nsec)*1.0e-9;
}
};
#ifdef __CUDACC__
class DeviceTimer {
cudaEvent_t StartTime, StopTime;
public:
DeviceTimer() {
cudaEventCreate(&StartTime);
cudaEventCreate(&StopTime);
}
~DeviceTimer() {
cudaEventDestroy(StartTime);
cudaEventDestroy(StopTime);
}
void Start() {
cudaEventRecord(StartTime);
}
void Stop() {
cudaEventRecord(StopTime);
cudaEventSynchronize(StopTime);
}
double Difference(){
float TimeDiff;
cudaEventElapsedTime(&TimeDiff, StartTime, StopTime);
return (double)TimeDiff * 1.0e-3;
}
};
#endif

33
src/file.ini Normal file
View file

@ -0,0 +1,33 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This is an ETICS parameter file
; Required parameters are:
; Filename, N, Tcrit, StepSize, dT1, dT2
;
; Special values for Filename:
; _nofile_ or _hernquist_ Generates a Hernquist sphere
; _plummer_ Generates a Plummer sphere
; The _xxx_ can be followed by a integer indicating the random seed, otherwise,
; the system time is used. The generated initial conditions are only saved at
; the t=0 snapshot.
;
; Optional parameters are:
; device (CUDA device ID; by default or if negative, cudaSetDevice is skipped)
; Prefix (="", prefix for the snapshot file names)
; OutputFormat (="ascii", the output format, either ascii or hdf5)
Filename = _hernquist_ 0
;N = 1000000
;Tcrit = 0.03125
;StepSize = 0.001953125
;dT1 = 0.001953125
;dT2 = 0.25
N = 1000000
Tcrit = 500
StepSize = 0.03125
dT1 = 1.0
dT2 = 5.0
Prefix = /home/ym/tmp/merger
OutputFormat = ascii

269
src/ic.cpp Normal file
View file

@ -0,0 +1,269 @@
/**
* @file ic.cpp
* @author Yohai Meiron <ymeiron@pku.edu.cn>
* @brief Functions to generate initial conditions.
*
* These are functions to generate a Hernquist or a Plummer sphere. In both
* cases the model has a total mass of 1 and the total energy is -0.25 Henon
* energy units. The Hernquist model generator was adapted from a program by
* John Dubinski (1999, original version) from the NEMO toolbox; the accessory
* functions are from Numerical Recipes in C (Press et al. 1992). The Plummer
* generator is adapted from the gen-plum program by Peter Berczik.
*/
#include <iostream>
#include <cstdlib>
#include <cstdio>
#include <cmath>
#include "common.hpp"
#include "ic.hpp"
// Internal functions
namespace etics {
namespace ic {
double E;
void polint(double xa[], double ya[], int n, double x, double *y, double *dy);
double trapzd(double (*func)(double), double a, double b, int n);
double qromb(double (*func)(double), double a, double b);
double xrandom(double a, double b);
double xrandom();
double f(double E2), drho2(double u);
}
}
double etics::ic::trapzd(double (*func)(double), double a, double b, int n) {
double x, tnm, sum, del;
static double s;
int it, j;
if (n == 1) {
return (s=0.5*(b-a)*((*func)(a) + (*func)(b)));
} else {
for (it = 1, j = 1; j < n-1; j++) it <<= 1;
tnm = it;
del = (b-a)/tnm;
x = a + 0.5*del;
for (sum = 0, j = 1; j <= it; j++, x += del) sum += (*func)(x);
s = 0.5*(s+(b-a)*sum/tnm);
return s;
}
}
void etics::ic::polint(double xa[], double ya[], int n, double x, double *y, double *dy) {
int ns = 1;
double den, dif, dift, ho, hp, w;
double *c, *d;
dif = fabs(x-xa[1]);
c = new double[n+1];
d = new double[n+1];
for (int i = 1; i <= n; i++) {
if ((dift=fabs(x-xa[i])) < dif) {
ns = i;
dif = dift;
}
c[i] = ya[i];
d[i] = ya[i];
}
*y = ya[ns--];
for (int m = 1; m < n; m++) {
for (int i = 1; i <= n-m; i++) {
ho = xa[i]-x;
hp = xa[i+m]-x;
w = c[i+1]-d[i];
if ((den=ho-hp) == 0.0) {
std::cerr << "Error in routine polint" << std::endl;
exit(1);
}
den = w/den;
d[i] = hp*den;
c[i] = ho*den;
}
*y += (*dy=(2*ns < (n-m) ? c[ns+1] : d[ns--]));
}
delete[] c;
delete[] d;
}
#define EPS 1.0e-6
#define JMAX 20
#define JMAXP (JMAX+1)
#define K 5
double etics::ic::qromb(double (*func)(double), double a, double b) {
double ss, dss;
double s[JMAXP+1], h[JMAXP+1];
h[1] = 1.0;
for (int j = 1; j <= JMAX; j++) {
s[j] = trapzd(func, a, b, j);
if (j >= K) {
polint(&h[j-K], &s[j-K], K, 0, &ss, &dss);
if (fabs(dss) < EPS*fabs(ss)) return ss;
}
s[j+1] = s[j];
h[j+1] = 0.25*h[j];
}
std::cerr << "Too many steps in routine qromb" << std::endl;
exit(1);
return 0;
}
#undef EPS
#undef JMAX
#undef JMAXP
#undef K
double etics::ic::xrandom(double a, double b) {
double Result = ((double)rand())/((double)RAND_MAX);
Result *= (b - a);
Result += a;
return Result;
}
double etics::ic::xrandom() {
return xrandom(0, 1);
}
/* Distribution function of a Hernquist model */
double etics::ic::f(double E2) {
E = E2;
return qromb(drho2, 0.0, 2.0*sqrt(E))*(sqrt(32.)*(M_PI*M_PI*M_PI));
}
double etics::ic::drho2(double u) {
double p = E - 0.25*u*u;
double r;
if (p <= 0) return 0;
else {
double p1 = 1/p;
double b0 = 0.5*(2-p1);
double c0 = (1-p1);
r = -b0 + sqrt(b0*b0 - c0);
}
double r2 = r*r, r3 = r2*r;
double c0 = 1.0 + r;
double c02 = c0*c0;
double c03 = c02*c0;
double c04 = c02*c02;
double dp1 = -1/c02;
double dp2 = 2/c03;
double drho1r = -(1+4*r)/(r2*c04);
double drho2r = 2.0*(14*r2 + 5*r + 1)/(r3*c04*c0);
return drho2r/(dp1*dp1) - drho1r/(dp1*dp1*dp1)*dp2;
}
void etics::ic::hernquist(int N, int Seed, Particle **ParticleList) {
using namespace etics::ic;
double x, y, z, vx, vy, vz;
double fmax, f0, f1, v2, vmax, vmax2, Ep=0, Ek=0;
srand(Seed);
*ParticleList = new Particle[N];
for(int i = 0; i < N; i++) {
double eta = xrandom(0.0, 1.0);
double radius = sqrt(eta)/(1-sqrt(eta));
double phi = xrandom(0.0, 2*M_PI);
double cth = xrandom(-1.0, 1.0);
double sth = sqrt(1.0 - cth*cth);
x = radius*sth*cos(phi);
y = radius*sth*sin(phi);
z = radius*cth;
double psi0 = 1/(1 + radius);
Ep += -psi0;
vmax2 = 2.0*psi0;
vmax = sqrt(vmax2);
fmax = f(psi0);
f0 = fmax; f1 = 1.1*fmax; // just some initial values
while(f1 > f0) {
// pick a velocity
v2 = 2.0*vmax2;
while(v2 > vmax2) { // rejection technique
vx = vmax*xrandom(-1.0, 1.0);
vy = vmax*xrandom(-1.0, 1.0);
vz = vmax*xrandom(-1.0, 1.0);
v2 = vx*vx + vy*vy + vz*vz;
}
f0 = f(psi0 - 0.5*v2);
f1 = fmax*xrandom(0.0, 1.0);
}
Ek += 0.5*v2;
Particle p;
p.m = 1.0/N;
p.pos = vec3(x, y, z);
p.vel = vec3(vx, vy, vz);
p.ID = i;
p.Status = 0;
p.CalculateR2();
(*ParticleList)[i] = p;
}
Ep /= 2*N; Ek /= N;
double E = Ep + Ek;
double RadiusFactor = -4*E; // should be close to 1/3
double VelocityFactor = 1/sqrt(-4*E); // should be close to sqrt(3)
for(int i = 0; i < N; i++) {
(*ParticleList)[i].pos = (*ParticleList)[i].pos * RadiusFactor;
(*ParticleList)[i].vel = (*ParticleList)[i].vel * VelocityFactor;
}
}
#define RMAX 10.0
void etics::ic::plummer(int N, int Seed, Particle **ParticleList) {
double X, Y, Z, Vx, Vy, Vz,
X1, X2, X3, X4, X5, X6, X7,
R, Ve, V;
srand(Seed);
*ParticleList = new Particle[N];
int i = 0;
while(i < N) {
X1 = xrandom(); X2 = xrandom(); X3 = xrandom();
R = 1.0/sqrt( (pow(X1,-2.0/3.0) - 1.0) );
if (R > RMAX) continue;
Z = (1.0 - 2.0*X2)*R;
X = sqrt(R*R - Z*Z) * cos(2.0*M_PI*X3);
Y = sqrt(R*R - Z*Z) * sin(2.0*M_PI*X3);
Ve = sqrt(2.0)*pow( (1.0 + R*R), -0.25 );
X4 = 0.0;
X5 = 0.0;
while( 0.1*X5 >= X4*X4*pow( (1.0-X4*X4), 3.5) ) {
X4 = xrandom();
X5 = xrandom();
}
V = Ve*X4;
X6 = xrandom();
X7 = xrandom();
Vz = (1.0 - 2.0*X6)*V;
Vx = sqrt(V*V - Vz*Vz) * cos(2.0*M_PI*X7);
Vy = sqrt(V*V - Vz*Vz) * sin(2.0*M_PI*X7);
#define CONV (3.0*M_PI/16.0)
X *= CONV; Y *= CONV; Z *= CONV;
Vx /= sqrt(CONV); Vy /= sqrt(CONV); Vz /= sqrt(CONV);
#undef CONV
Particle p;
p.m = 1.0/N;
p.pos = vec3(X, Y, Z);
p.vel = vec3(Vx, Vy, Vz);
p.ID = i;
p.Status = 0;
p.CalculateR2();
(*ParticleList)[i] = p;
i++;
} /* while(i < N) */
}
#undef RMAX

7
src/ic.hpp Normal file
View file

@ -0,0 +1,7 @@
#pragma once
namespace etics {
namespace ic {
void hernquist(int N, int Seed, Particle **ParticleList);
void plummer(int N, int Seed, Particle **ParticleList);
}
}

127
src/integrate.cu Normal file
View file

@ -0,0 +1,127 @@
#include <thrust/device_vector.h>
#include <thrust/inner_product.h>
#include "common.hpp"
#include "integrate.hpp"
#ifdef MEX
#define method mex
#include "mex.hpp"
#elif defined(SCF)
#define method scf
#include "scf.hpp"
#elif defined(MULTICENTER)
#define method multicenter
#include "multicenter.hpp"
#endif
#define PTR(x) (thrust::raw_pointer_cast((x).data()))
namespace etics
{
// The 'drift' step is performed using the 'acc' member.
struct DriftFunctor {
Real Step, Step2;
__host__ __device__ DriftFunctor(Real _Step) : Step(_Step), Step2(_Step*_Step) {}
__host__ __device__ Particle operator() (Particle &p) const {
p.pos += p.vel*Step + p.acc*0.5*Step2;
p.CalculateR2(); // needed for both MEX and SCF, but not generally needed in leapfrog
return p;
}
};
// The 'kick' step is performed using the 'acc' member and also the force F,
// calculated at the new (predicted) position.
struct KickFunctor {
Real Step;
__host__ __device__ KickFunctor(Real _Step) : Step(_Step) {}
__host__ __device__ Particle operator() (Particle& p, const vec3& F) const {
p.vel += (p.acc + F)*0.5*Step;
p.acc = F;
return p;
}
};
struct KineticEnergyFunctor {
__host__ __device__ Real operator() (const Particle &p) const {return 0.5*p.m*p.vel.abs2();}
};
struct PotentialEnergyFunctor {
__host__ __device__ Real operator() (const Particle &p, const Real &Potential) const {return p.m*Potential;}
};
Integrator::Integrator() {
N = 0;
Time = 0;
}
Integrator::Integrator(Particle *P_h, int _N) {
N = _N;
Time = 0;
P = thrust::device_vector<Particle>(P_h, P_h+N);
Potential = thrust::device_vector<Real>(N);
Force = thrust::device_vector<vec3>(N);
Method = &etics::method::CalculateGravity;
CalculateGravity();
KickStep(0); // Just to "commit" the forces to the particle list.
}
Integrator::~Integrator() {
N = 0;
// Weird way to free Thrust memory.
P.clear(); P.shrink_to_fit();
Potential.clear(); Potential.shrink_to_fit();
Force.clear(); Force.shrink_to_fit();
}
void Integrator::CalculateGravity() {
(*Method)(PTR(P), N, PTR(Potential), PTR(Force));
}
void Integrator::DriftStep(Real Step) {
thrust::transform(P.begin(), P.end(), P.begin(), DriftFunctor(Step));
}
void Integrator::KickStep(Real Step) {
thrust::transform(P.begin(), P.end(), Force.begin(), P.begin(), KickFunctor(Step));
}
Real Integrator::GetTime() {
return Time;
}
int Integrator::GetN() {
return N;
}
Real Integrator::KineticEnergy() {
return thrust::transform_reduce(
P.begin(), P.end(),
KineticEnergyFunctor(),
(Real)0, // It must be clear to the function that this zero is a Real.
thrust::plus<Real>()
);
}
Real Integrator::PotentialEnergy() {
return 0.5*thrust::inner_product(
P.begin(), P.end(),
Potential.begin(),
(Real)0,
thrust::plus<Real>(),
PotentialEnergyFunctor()
);
}
void Integrator::CopyParticlesToHost(Particle *P_h) {
thrust::copy(P.begin(), P.end(), P_h);
}
void Integrator::CopyParticlesToHost(Particle **P_h, int *_N) {
Particle *LocalList = new Particle[N];
thrust::copy(P.begin(), P.end(), LocalList);
*P_h = LocalList;
*_N = N;
}
} // namespace etics

27
src/integrate.hpp Normal file
View file

@ -0,0 +1,27 @@
#pragma once
#include <thrust/device_vector.h>
namespace etics {
class Integrator {
public:
Integrator();
Integrator(Particle *P_h, int _N);
~Integrator();
void CalculateGravity();
void DriftStep(Real Step);
void KickStep(Real Step);
Real GetTime();
int GetN();
Real KineticEnergy();
Real PotentialEnergy();
void CopyParticlesToHost(Particle *P_h);
void CopyParticlesToHost(Particle **P_h, int *_N);
private:
int N;
double Time;
thrust::device_vector<Particle> P;
thrust::device_vector<Real> Potential;
thrust::device_vector<vec3> Force;
void (*Method)(Particle*, int, Real*, vec3*);
};
}

367
src/io.cpp Normal file
View file

@ -0,0 +1,367 @@
#include <iostream>
#include <cstring>
#include <cstdlib>
#include <cstdio>
#include <climits>
#include <fstream>
#include <sstream>
#include "common.hpp"
#include "io.hpp"
#ifdef ETICS_BOOST
#include <boost/property_tree/ptree.hpp>
#include <boost/property_tree/ini_parser.hpp>
#endif
using namespace std;
void ReadICsASCII(string Filename, int N, Particle **P_h, int *FileSnapshotNum, Real *FileTime) {
*P_h = new Particle[N];
string Line;
ifstream InputFile(Filename.c_str());
int LineNum;
int i = 0;
if (InputFile.is_open()) {
getline(InputFile, Line); // First line is the snapshot number.
int Count = sscanf(Line.c_str(), "%d", FileSnapshotNum);
if (Count != 1) {
cerr << "Problem in input file, line 1." << endl;
exit(1);
}
getline(InputFile, Line); // Second line is the expected number of particles.
int ExpectedN;
Count = sscanf(Line.c_str(), "%d", &ExpectedN); // We ignore it actually.
if (Count != 1) {
cerr << "Problem in input file, line 2." << endl;
exit(1);
}
getline(InputFile, Line);
double T0;
Count = sscanf(Line.c_str(), "%lf", &T0);
if (Count != 1) {
cerr << "Problem in input file, line 3." << endl;
exit(1);
}
*FileTime = (Real)T0;
LineNum = 3;
while (getline(InputFile, Line))
{
double m, x, y, z, vx, vy, vz;
int id;
Particle p;
Count = sscanf(Line.c_str(), "%d %lf %lf %lf %lf %lf %lf %lf", &id, &m, &x, &y, &z, &vx, &vy, &vz);
if (Count != 8) {
cerr << "Problem in input file, line " << LineNum + 1 << "." << endl;
exit(1);
}
p.ID = id;
p.m = (Real)m;
p.pos = vec3((Real)x, (Real)y, (Real)z); // This is to ensure proper casting in the case of single prec.
p.vel = vec3((Real)vx, (Real)vy, (Real)vz);
p.Status = 0;
p.CalculateR2();
(*P_h)[i] = p;
i++;
LineNum++;
if (i == N) break;
}
InputFile.close();
} else {
cerr << "Can't open file" << Filename << "." << endl;
exit(1);
}
if (i < N) {
cerr << "Was only able to read " << i << " particles while " << N << " were requested." << endl;
exit(1);
}
}
void WriteSnapshotASCII(string Prefix, int SnapNumber, Particle *P_h, int N, Real T) {
char S[512];
sprintf(S, "%s%04d.dat", Prefix.c_str(), SnapNumber);
ofstream SnapshotFile;
SnapshotFile.open(S);
sprintf(S, "%06d\n", SnapNumber); SnapshotFile << S;
sprintf(S, "%06d\n", N); SnapshotFile << S;
sprintf(S, "%.16E\n", T); SnapshotFile << S;
for (int i = 0; i < N; i++) {
sprintf(S, "%06d%14.6e%14.6e%14.6e%14.6e%14.6e%14.6e%14.6e\n", P_h[i].ID, P_h[i].m, P_h[i].pos.x, P_h[i].pos.y, P_h[i].pos.z, P_h[i].vel.x, P_h[i].vel.y, P_h[i].vel.z); SnapshotFile << S;
}
SnapshotFile.close();
}
#define THROW_EXCEPTION(str, stat) {cerr << str << endl; exit((stat));}
void ParseInput(int argc, char *argv[], ParametersStruct *Params) {
ParametersStruct P;
#ifdef ETICS_BOOST
if (argc < 2) THROW_EXCEPTION("Usage: etics [FILE]...", 1)
ifstream TestFileObject(argv[1]);
if (!TestFileObject) THROW_EXCEPTION("Problem reading file...", 1)
boost::property_tree::ptree pt;
boost::property_tree::ini_parser::read_ini(argv[1], pt); // what if file doesn't exist?
P.N = pt.get<int>("N", 0);
if (P.N <= 0) THROW_EXCEPTION("Could not read number of particles (N) from ini file.", 1)
P.Tcrit = pt.get<Real>("Tcrit", -1);
if (P.Tcrit < 0) THROW_EXCEPTION("Could not read finish time (Tcrit) from ini file.", 1)
P.dT1 = pt.get<Real>("dT1", 0);
if (P.dT1 <= 0) THROW_EXCEPTION("Could output time interval (dT1) from ini file.", 1)
P.dT2 = pt.get<Real>("dT2", 0);
if (P.dT2 <= 0) THROW_EXCEPTION("Could snapshot time interval (dT2) from ini file.", 1)
P.ConstantStep = pt.get<Real>("StepSize", -1);
if (P.ConstantStep < 0) THROW_EXCEPTION("Could not read step size (StepSize) from ini file.", 1)
P.Filename = pt.get<string>("Filename", "\n");
if (P.Filename == "\n") THROW_EXCEPTION("Could not read initial condition file name (Filename) from ini file.", 1)
P.Prefix = pt.get<string>("Prefix", "");
P.OutputFormat = pt.get<string>("OutputFormat", "ascii");
P.DeviceID = pt.get<int>("device", -1);
if (argc >= 3) { // If there is a argument after the file, it must be either --device or -d
int Version1 = strncmp(argv[2], "--device", 8);
int Version2 = strncmp(argv[2], "-d", 2);
if ((Version1!=0) && (Version2!=0)) THROW_EXCEPTION("Commandline argument after input file must be device number.", 1)
char *DeviceIDStr = strchr(argv[2], '=');
if (DeviceIDStr!=NULL) DeviceIDStr = DeviceIDStr + 1;
else {
if (argc <= 3) THROW_EXCEPTION("Device number must follow.", 1)
DeviceIDStr = argv[3];
}
P.DeviceID = atoi(DeviceIDStr);
if ((P.DeviceID==0) && (strcmp(DeviceIDStr, "0")!=0)) THROW_EXCEPTION("Error understanding device number.", 1)
}
#else
// If no BOOST, we include a file with the parameters and compile it.
int N, DeviceID = -1;
Real dT1, dT2, Tcrit, StepSize;
std::string Filename, Prefix = "", OutputFormat="";
#include "noboost.inc"
P.N = N;
P.Filename = Filename;
P.dT1 = dT1;
P.dT2 = dT2;
P.Tcrit = Tcrit;
P.ConstantStep = StepSize;
P.DeviceID = DeviceID;
P.OutputFormat = OutputFormat;
#endif
P.Seed = INT_MIN;
if ((P.Filename[0]=='_') && (P.Filename.rfind("_") > 0)) {
int Break = P.Filename.rfind("_") + 1;
string SeedStr = P.Filename.substr(Break, P.Filename.length() - Break);
int SeedInt;
istringstream iss(SeedStr);
iss >> ws >> P.Seed >> ws;
if(!iss.eof()) THROW_EXCEPTION("Could not understand random seed (in Filename).", 1)
P.Filename = P.Filename.substr(0, Break);
}
if (P.Seed == INT_MIN) P.Seed = (int)time(NULL);
*Params = P;
}
// make safety: if single precision, fail to compile
#ifdef ETICS_HDF5
#include "H5Cpp.h"
int H5IterNum;
int *H5IterSnapNumbers;
string *H5IterGroupNames;
herr_t file_info(hid_t loc_id, const char *name, void *opdata) {
int res = sscanf(name, "Step#%d", H5IterSnapNumbers + H5IterNum);
if (res != 1) {
cerr << "Problem understanding group \"" << name << "\" in HDF5 file" << endl;
exit(1);
}
H5IterGroupNames[H5IterNum] = name;
H5IterNum++;
return 0;
}
void ReadICsHDF5(string Filename, int N, Particle **P_h, int *FileSnapshotNum, Real *FileTime) {
double *Mass, *X, *Y, *Z, *VX, *VY, *VZ;
int *ID;
Mass = new double[N];
X = new double[N];
Y = new double[N];
Z = new double[N];
VX = new double[N];
VY = new double[N];
VZ = new double[N];
ID = new int[N];
H5::H5File file;
file = H5::H5File(Filename, H5F_ACC_RDONLY);
H5::Group group = file.openGroup("/");
int NumberOfGroups = group.getNumObjs();
H5IterSnapNumbers = new int[NumberOfGroups];
H5IterGroupNames = new string[NumberOfGroups];
H5IterNum = 0;
int ret = file.iterateElems("/", NULL, file_info, NULL);
int MaxSnapIndex = 0;
for (int i=1; i < NumberOfGroups; i++) MaxSnapIndex = (H5IterSnapNumbers[i]>H5IterSnapNumbers[MaxSnapIndex])?i:MaxSnapIndex;
string GroupName = H5IterGroupNames[MaxSnapIndex];
group = file.openGroup("/" + GroupName);
H5::Attribute attribute = group.openAttribute("Time");
attribute.read(H5::PredType::NATIVE_DOUBLE, FileTime);
*FileSnapshotNum = H5IterSnapNumbers[MaxSnapIndex];
H5::DataSet dataset = file.openDataSet("/" + GroupName + "/ID");
int NfromFile = dataset.getStorageSize()/sizeof(int);
if (NfromFile < N) {
cerr << "Was only able to read " << NfromFile << " particles while " << N << " were requested." << endl;
exit(1);
}
dataset.read(ID, H5::PredType::NATIVE_UINT32);
dataset = file.openDataSet("/" + GroupName + "/Mass"); dataset.read(Mass, H5::PredType::NATIVE_DOUBLE);
dataset = file.openDataSet("/" + GroupName + "/X"); dataset.read(X, H5::PredType::NATIVE_DOUBLE);
dataset = file.openDataSet("/" + GroupName + "/Y"); dataset.read(Y, H5::PredType::NATIVE_DOUBLE);
dataset = file.openDataSet("/" + GroupName + "/Z"); dataset.read(Z, H5::PredType::NATIVE_DOUBLE);
dataset = file.openDataSet("/" + GroupName + "/VX"); dataset.read(VX, H5::PredType::NATIVE_DOUBLE);
dataset = file.openDataSet("/" + GroupName + "/VY"); dataset.read(VY, H5::PredType::NATIVE_DOUBLE);
dataset = file.openDataSet("/" + GroupName + "/VZ"); dataset.read(VZ, H5::PredType::NATIVE_DOUBLE);
dataset.close();
group.close();
file.close();
*P_h = new Particle[N];
for (int i = 0; i < N; i++) {
Particle p;
p.ID = ID[i];
p.m = (Real)Mass[i];
p.pos = vec3((Real)X[i], (Real)Y[i], (Real)Z[i]); // This is to ensure proper casting in the case of single prec.
p.vel = vec3((Real)VX[i], (Real)VY[i], (Real)VZ[i]);
p.Status = 0;
p.CalculateR2();
(*P_h)[i] = p;
}
}
bool H5FirstSnapshot = true;
void WriteSnapshotHDF5(string Prefix, int SnapNumber, Particle *P_h, int N, Real T) {
string Filename = Prefix + ".h5part";
H5::H5File file;
if (H5FirstSnapshot && std::ifstream(Filename.c_str())) {
cerr << "File \"" << Filename << "\" already exist!" << endl;
exit(1);
}
H5FirstSnapshot = false;
if (std::ifstream(Filename.c_str())) {
file = H5::H5File(Filename, H5F_ACC_RDWR);
}
else file = H5::H5File(Filename, H5F_ACC_TRUNC);
char GroupName[64];
sprintf(GroupName, "/Step#%d", SnapNumber);
H5::Group group;
try {
H5::Exception::dontPrint();
group = H5::Group(file.createGroup(GroupName));
}
catch ( H5::FileIException error ) {
cerr << "Couldn't create group \"" << GroupName << "\" in HDF5 file, maybe it already exists?" << endl;
exit(1);
}
double *Mass, *X, *Y, *Z, *VX, *VY, *VZ, *AX, *AY, *AZ;
Mass = new double[N];
X = new double[N];
Y = new double[N];
Z = new double[N];
VX = new double[N];
VY = new double[N];
VZ = new double[N];
AX = new double[N];
AY = new double[N];
AZ = new double[N];
int *ID;
ID = new int[N];
for (int i = 0; i < N; i++) {
Particle p = P_h[i];
ID[i] = p.ID;
Mass[i] = p.m;
X[i] = p.pos.x;
Y[i] = p.pos.y;
Z[i] = p.pos.z;
VX[i] = p.vel.x;
VY[i] = p.vel.y;
VZ[i] = p.vel.z;
AX[i] = p.acc.x;
AY[i] = p.acc.y;
AZ[i] = p.acc.z;
}
H5::DataSpace attr_dataspace = H5::DataSpace(H5S_SCALAR);
H5::Attribute attribute = group.createAttribute("Time", H5::PredType::NATIVE_DOUBLE, attr_dataspace);
attribute.write( H5::PredType::NATIVE_DOUBLE, &T);
attribute = group.createAttribute("TotalN", H5::PredType::NATIVE_UINT32, attr_dataspace);
attribute.write( H5::PredType::NATIVE_UINT32, &N);
hsize_t dimsxxx = N;
H5::DataSpace dataspacexxx(1, &dimsxxx);
H5::DataSet dataset3;
dataset3 = H5::DataSet(group.createDataSet("ID", H5::PredType::NATIVE_UINT32, dataspacexxx));
dataset3.write(ID, H5::PredType::NATIVE_UINT32);
dataset3 = H5::DataSet(group.createDataSet("Mass", H5::PredType::NATIVE_DOUBLE, dataspacexxx));
dataset3.write(Mass, H5::PredType::NATIVE_DOUBLE);
dataset3 = H5::DataSet(group.createDataSet("X", H5::PredType::NATIVE_DOUBLE, dataspacexxx));
dataset3.write(X, H5::PredType::NATIVE_DOUBLE);
dataset3 = H5::DataSet(group.createDataSet("Y", H5::PredType::NATIVE_DOUBLE, dataspacexxx));
dataset3.write(Y, H5::PredType::NATIVE_DOUBLE);
dataset3 = H5::DataSet(group.createDataSet("Z", H5::PredType::NATIVE_DOUBLE, dataspacexxx));
dataset3.write(Z, H5::PredType::NATIVE_DOUBLE);
dataset3 = H5::DataSet(group.createDataSet("VX", H5::PredType::NATIVE_DOUBLE, dataspacexxx));
dataset3.write(VX, H5::PredType::NATIVE_DOUBLE);
dataset3 = H5::DataSet(group.createDataSet("VY", H5::PredType::NATIVE_DOUBLE, dataspacexxx));
dataset3.write(VY, H5::PredType::NATIVE_DOUBLE);
dataset3 = H5::DataSet(group.createDataSet("VZ", H5::PredType::NATIVE_DOUBLE, dataspacexxx));
dataset3.write(VZ, H5::PredType::NATIVE_DOUBLE);
dataset3 = H5::DataSet(group.createDataSet("AX", H5::PredType::NATIVE_DOUBLE, dataspacexxx));
dataset3.write(AX, H5::PredType::NATIVE_DOUBLE);
dataset3 = H5::DataSet(group.createDataSet("AY", H5::PredType::NATIVE_DOUBLE, dataspacexxx));
dataset3.write(AY, H5::PredType::NATIVE_DOUBLE);
dataset3 = H5::DataSet(group.createDataSet("AZ", H5::PredType::NATIVE_DOUBLE, dataspacexxx));
dataset3.write(AZ, H5::PredType::NATIVE_DOUBLE);
dataset3.close();
dataspacexxx.close();
attr_dataspace.close();
attribute.close();
group.close();
file.close();
delete [] Mass;
delete [] X;
delete [] Y;
delete [] Z;
delete [] VX;
delete [] VY;
delete [] VZ;
delete [] AX;
delete [] AY;
delete [] AZ;
delete [] ID;
}
#endif

19
src/io.hpp Normal file
View file

@ -0,0 +1,19 @@
#pragma once
struct ParametersStruct {
int N;
std::string Filename;
int Seed;
Real dT1, dT2, Tcrit, ConstantStep;
std::string Prefix;
std::string OutputFormat;
int DeviceID;
};
void ReadICsASCII(std::string Filename, int N, Particle **P_h, int *FileSnapshotNum, Real *FileTime);
void WriteSnapshotASCII(std::string Prefix, int SnapNumber, Particle *hostP, int N, Real T);
void ParseInput(int argc, char *argv[], ParametersStruct *Params);
#ifdef ETICS_HDF5
void ReadICsHDF5(std::string Filename, int N, Particle **P_h, int *FileSnapshotNum, Real *FileTime);
void WriteSnapshotHDF5(std::string Prefix, int SnapNumber, Particle *hostP, int N, Real T);
#endif

347
src/main.cu Normal file
View file

@ -0,0 +1,347 @@
/**
* @file main.cu
* @author Yohai Meiron <ymeiron@pku.edu.cn>
* @version 1.0
*/
// If you have CUDA then compile with:
// nvcc main.cu -lm -O3 -arch=sm_20 -o output
// Otherwise enable OpenMP and compile with GCC:
// g++ -x c++ -O3 -o output main.cu -DOMP -fopenmp -lgomp -I/home/ym
// The -I is the path to the parent directory where thrust is.
#include <iostream>
#include <fstream>
#include <cmath>
#include <cstdlib>
#include <cstdio>
#include <cstring>
#ifdef ETICS_MPI
#include <mpi.h>
#endif
#define CUDA
#ifdef OMP
#error Sorry, OpenMP is currently disabled.
#define THRUST_DEVICE_SYSTEM THRUST_DEVICE_BACKEND_OMP
#undef CUDA
#define PARALLEL_GET_TID omp_get_thread_num()
#define PARALLEL_ADVANCE omp_get_num_threads()
#define __global__
// #include <complex>
#else
#define PARALLEL_GET_TID threadIdx.x + blockIdx.x * blockDim.x
#define PARALLEL_ADVANCE blockDim.x * gridDim.x
// #include "cuda_complex.hpp"
#endif
// #define complex complex<Real>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/partition.h>
#include <thrust/inner_product.h>
#include "common.hpp"
#include "io.hpp"
#include "ic.hpp"
#include "integrate.hpp"
#ifdef MEX
#define method mex
#include "mex.hpp"
#elif defined(SCF)
#define method scf
#include "scf.hpp"
#elif defined(MULTICENTER)
#define method multicenter
#include "multicenter.hpp"
#endif
using namespace std;
using namespace etics;
// GLOBAL VARIABLES
int MyRank, NumProcs;
Real ConstantStep = 0.001953125;
Real T, Step, dT1, dT2, Tcrit, FileTime;
int NSteps = 0, FileSnapshotNum;
struct ReorderingFunctor {
__host__ __device__ bool operator() (const Particle &lhs, const Particle &rhs) {
return (lhs.ID <= rhs.ID);
}
};
Real CalculateStepSize() {
return ConstantStep;
}
void DisplayInformation(Integrator IntegratorObj) {
Real Ek = IntegratorObj.KineticEnergy();
Real Ep = IntegratorObj.PotentialEnergy();
Real Energy = Ek + Ep;
int N=IntegratorObj.GetN();
Real TotalEnergy;
int TotalN;
#ifdef ETICS_MPI
MPI_Reduce(&Energy, &TotalEnergy, 1, MPI_ETICS_REAL, MPI_SUM, 0, MPI_COMM_WORLD);
MPI_Reduce(&N, &TotalN, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
#else
TotalEnergy = Energy;
TotalN = N;
#endif
if (MyRank==0) {
printf(" TIME =%6.2f NSTEPS =%6d ENERGY =%20.16f N = %d\n", T, NSteps, TotalEnergy, TotalN);
fflush(stdout);
}
}
void PrepareSnapshot(Integrator IntegratorObj, Particle **ParticleList, int *CurrentTotalN) {
Particle *LocalList;
int LocalBufferSize;
IntegratorObj.CopyParticlesToHost(&LocalList, &LocalBufferSize);
int TotalN = 0;
#ifdef ETICS_MPI
LocalBufferSize *= sizeof(Particle);
int BufferSizes[NumProcs];
MPI_Gather(&LocalBufferSize, 1, MPI_INT, BufferSizes, 1, MPI_INT, 0, MPI_COMM_WORLD);
int Displacements[NumProcs];
if (MyRank==0) {
for (int p = 0; p < NumProcs; p++) TotalN += BufferSizes[p]/sizeof(Particle);
Displacements[0] = 0;
for (int p = 1; p < NumProcs; p++) Displacements[p] = Displacements[p-1] + BufferSizes[p-1];
*ParticleList = new Particle[TotalN];
}
MPI_Gatherv(LocalList, LocalBufferSize, MPI_BYTE, *ParticleList, BufferSizes, Displacements, MPI_BYTE, 0, MPI_COMM_WORLD);
#else
TotalN = LocalBufferSize/sizeof(Particle);
*ParticleList = new Particle[TotalN];
std::copy(LocalList, LocalList+TotalN, *ParticleList);
#endif
#ifdef MEX
thrust::sort(*ParticleList, (*ParticleList)+TotalN, ReorderingFunctor());
#endif
*CurrentTotalN = TotalN;
free(LocalList);
//WARNING Honestly I dont get why I use allocate memory inside the fuction, its very confusing and makes it harder to find leaks.
//Future Yohai (2019): Yeah, it's done both here for **ParticleList and internally for *LocalList in Integrator::CopyParticlesToHost that does it, pretty ugly.
}
int main(int argc, char *argv[]) {
#ifdef ETICS_MPI
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &MyRank);
MPI_Comm_size(MPI_COMM_WORLD, &NumProcs);
#endif
if (MyRank==0) {
cerr << "Welcome to ETICS..." << endl;
#ifdef MEX
cerr << "Using method: MEX" << endl;
cerr << "LMAX=" << LMAX << endl;
#elif defined(SCF)
cerr << "Using method: SCF" << endl;
cerr << "LMAX=" << LMAX << endl;
cerr << "NMAX=" << NMAX << endl;
#endif
}
string Filename;
int DeviceID = 0;
ParametersStruct Params;
// Instead of reading the input file with MyRank=0 and broadcast the result, we let every rank read the file. This probably saves ~20 lines of ugly MPI code.
ParseInput(argc, argv, &Params);
int N = Params.N; // total; will be divided by number of processes
Filename = Params.Filename;
Tcrit = Params.Tcrit;
ConstantStep = Params.ConstantStep;
DeviceID = Params.DeviceID;
dT1 = Params.dT1;
dT2 = Params.dT2;
if (DeviceID >= 0) {
if (cudaSetDevice(DeviceID) != cudaSuccess) {
cerr << "Problem opening device (ID=" << DeviceID << ")" << endl;
exit(1);
}
} else {
cerr << "Skipping call to cudaSetDevice." << endl;
}
// Read an input file and initialize the global particle structure.
Particle *FullList;
if (MyRank==0) {
if ((Filename == "_nofile_") || (Filename == "_hernquist_")) {
cout << "Generating a Hernquist sphere..." << endl;
etics::ic::hernquist(N, Params.Seed, &FullList);
FileSnapshotNum = 0;
FileTime = 0;
cout << "Done." << endl;
} else if (Filename == "_plummer_") {
cout << "Generating a Plummer sphere..." << endl;
etics::ic::plummer(N, Params.Seed, &FullList);
FileSnapshotNum = 0;
FileTime = 0;
cout << "Done." << endl;
}
else {
string InputFileSuffix = Filename.substr(Filename.find_last_of("."), Filename.length()-Filename.find_last_of("."));
if ((InputFileSuffix==".h5part") || (InputFileSuffix==".hdf5") || (InputFileSuffix==".h5")) {
#ifndef ETICS_HDF5
cerr << "Compiled without the \"ETICS_HDF5\" flag; cannot read input in this format." << endl;
exit(1);
#else
ReadICsHDF5(Filename, N, &FullList, &FileSnapshotNum, &FileTime);
#endif
} else ReadICsASCII(Filename, N, &FullList, &FileSnapshotNum, &FileTime);
}
}
#ifndef ETICS_HDF5
if (Params.OutputFormat == "hdf5") {
cerr << "Compiled without the \"ETICS_HDF5\" flag; cannot output in requested format." << endl;
exit(1);
}
#endif
if (!(Params.OutputFormat == "hdf5") && !(Params.OutputFormat == "ascii")) {
cerr << "Requested output format unrecognized." << endl;
exit(1);
}
#ifdef ETICS_MPI
int LocalN = N / NumProcs;
int Remainder = N - LocalN*NumProcs;
if (MyRank==NumProcs-1) LocalN += Remainder;
Particle *LocalList = new Particle[LocalN];
int BufferSizes[NumProcs];
int Displacements[NumProcs];
if (MyRank==0) {
for (int p = 0; p < NumProcs; p++) BufferSizes[p] = (N / NumProcs)*sizeof(Particle);
BufferSizes[NumProcs-1] += Remainder*sizeof(Particle);
Displacements[0] = 0;
for (int p = 1; p < NumProcs; p++) Displacements[p] = Displacements[p-1] + BufferSizes[p-1];
}
MPI_Scatterv(FullList, BufferSizes, Displacements, MPI_BYTE, LocalList, LocalN*sizeof(Particle), MPI_BYTE, 0, MPI_COMM_WORLD);
if (MyRank==0) free(FullList);
N = LocalN;
#else
Particle *LocalList = new Particle[N];
copy(FullList, FullList+N, LocalList);
#endif
#ifdef ETICS_MPI
// This code block is just to ask each MPI process to report, and find if there is a conflict (i.e. multiple processes using the same physical device)
cudaDeviceProp DeviceProperties;
const int etics_str_len = 256;
cudaGetDeviceProperties(&DeviceProperties, 0);
char ProcessorName[etics_str_len];
int tmp;
MPI_Get_processor_name(ProcessorName, &tmp);
char UniqueDeviceID[etics_str_len];
sprintf(UniqueDeviceID, "%d$$$%s", DeviceProperties.pciBusID, ProcessorName);
char Message[etics_str_len];
sprintf(Message, "Hello from rank %d (of %d) on %s, using \"%s\" with PCI bus ID %d; this rank has %d particles.\n", MyRank, NumProcs, ProcessorName, DeviceProperties.name, DeviceProperties.pciBusID, LocalN);
if (MyRank == 0) {
printf("%s", Message);
fflush(stdout);
for (int Rank = 1; Rank < NumProcs; Rank++) {
MPI_Recv(Message, etics_str_len, MPI_CHAR, Rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
printf("%s", Message);
fflush(stdout);
}
} else {
MPI_Send(Message, etics_str_len, MPI_CHAR, 0, 0, MPI_COMM_WORLD);
}
// Here we collect the GPU IDs from all MPI processes and print a warning if one GPU is assigned to more than one process.
char *StrBuf;
StrBuf = (char*)malloc(NumProcs*etics_str_len*sizeof(char));
MPI_Gather( UniqueDeviceID, etics_str_len, MPI_CHAR, StrBuf, etics_str_len, MPI_CHAR, 0, MPI_COMM_WORLD);
if (MyRank == 0) {
bool DuplicateFound = false;
for (int i = 0; i < NumProcs; i++) {
for (int j = i+1; j < NumProcs; j++) {
if (strcmp(StrBuf+i*etics_str_len, StrBuf+j*etics_str_len) == 0) {
DuplicateFound = true;
break;
}
if (DuplicateFound) break;
}
}
if (DuplicateFound) {
printf("\x1B[31m!!SEVERE WARNING!!\x1B[0m It seems the same physical GPU device was assigned to multiple processes; check the submission script.\n");
}
}
free(StrBuf);
#endif
// Now initiate the code
method::Init(N, 0, 0, 0, 0);
// Initiate the integrator
Integrator IntegratorObj(LocalList, N);
// More initializations.
Real NextOutput = 0, NextSnapshot = 0;
T = FileTime;
int SnapNumber = FileSnapshotNum;
Step = CalculateStepSize();
while (T <= Tcrit) {
if (T >= NextOutput) {
DisplayInformation(IntegratorObj);
NextOutput += dT1;
}
if (T >= NextSnapshot) {
// cerr << "TESTING MODE!!! NOT WRITING SNAPSHOTS!!!" << endl;
int CurrentTotalN;
PrepareSnapshot(IntegratorObj, &FullList, &CurrentTotalN);
if (MyRank==0) {
if (Params.OutputFormat == "ascii") WriteSnapshotASCII(Params.Prefix, SnapNumber, FullList, CurrentTotalN, T);
#ifdef ETICS_HDF5
else if (Params.OutputFormat == "hdf5") WriteSnapshotHDF5(Params.Prefix, SnapNumber, FullList, CurrentTotalN, T);
#endif
else {cerr << "Error" << endl; exit(1);}
free(FullList);
}
SnapNumber++;
NextSnapshot += dT2;
}
// Take the drift step.
IntegratorObj.DriftStep(Step);
// Calculate the forces in the new positions.
IntegratorObj.CalculateGravity();
// Finish by taking the kick step.
// The kick functor also "commits" the predicted forces into the "acc" member.
IntegratorObj.KickStep(Step);
// N particles were implicitly propagated in this iteration.
NSteps += 1;
// Advance global time.
T += Step;
// Calculate the next step.
Step = CalculateStepSize();
}
IntegratorObj.~Integrator();
#ifdef ETICS_MPI
MPI_Finalize();
#endif
return 0;
}

57
src/mathaux.cu Normal file
View file

@ -0,0 +1,57 @@
#include "common.hpp"
// #include "mathaux.hpp"
double FactorialSCF_tmpname(int x) {
return (x <= 1) ? 1 : x*FactorialSCF_tmpname(x - 1);
}
double GammaPlusHalf(int x) {
return pow(0.25, x) * sqrt(M_PI) * FactorialSCF_tmpname(2*x) / FactorialSCF_tmpname(x);
}
void RadialCoefficients(Real *Result) {
for (int n = 0; n <= NMAX; n++) {
for (int l = 0; l <= LMAX; l++) {
double AnlTilde = - pow(2.0, 8*l + 6) * FactorialSCF_tmpname(n) * (n + 2*l + 1.5);
AnlTilde *= GammaPlusHalf(2*l + 1) * GammaPlusHalf(2*l + 1);
AnlTilde /= (4*M_PI * (n*(n + 3 + 4*l)/2 + (l+1)*(2*l+1)) * FactorialSCF_tmpname(n+4*l+2));
Result[(LMAX+1)*n+l] = (Real)AnlTilde;
}
}
}
__host__ __device__ Real Pl(int l, Real x) {
// No range check or anything... The first 10 polynomials are hardcoded
switch (l) {
case 0 : return 1;
case 1 : return x;
case 2 : return (1.5*x*x - 0.5);
case 3 : return (2.5*x*x*x - 1.5*x);
case 4 : {Real x2=x*x; return (4.375*x2*x2 - 3.75*x2 + 0.375);}
case 5 : {Real x2=x*x, x3=x2*x; return (7.875*x3*x2 - 8.75*x3 + 1.875*x);}
case 6 : {Real x2=x*x, x4=x2*x2; return (14.4375*x4*x2 - 19.6875*x4 + 6.5625*x2 - 0.3125);}
case 7 : {Real x2=x*x, x3=x2*x, x5=x3*x2; return (26.8125*x5*x2 - 43.3125*x5 + 19.6875*x3 - 2.1875*x);}
case 8 : {Real x2=x*x, x4=x2*x2, x6=x4*x2, x8=x4*x4; return (50.2734375*x8 - 93.84375*x6 + 54.140625*x4 - 9.84375*x2 + 0.2734375);}
case 9 : {Real x2=x*x, x3=x2*x, x5=x3*x2, x7=x5*x2, x9=x7*x2;
return (94.9609375*x9 - 201.09375*x7 + 140.765625*x5 - 36.09375*x3 + 2.4609375*x);}
case 10 : {Real x2=x*x, x4=x2*x2, x6=x4*x2, x8=x4*x4, x10=x6*x4;
return (180.42578125*x10 - 427.32421875*x8 + 351.9140625*x6 - 117.3046875*x4 + 13.53515625*x2 - 0.24609375);}
case 11 : {Real x2=x*x, x3=x2*x, x5=x3*x2, x7=x5*x2, x9=x7*x2, x11=x9*x2;
return (344.44921875*x11 - 902.12890625*x9 + 854.6484375*x7 - 351.9140625*x5 + 58.65234375*x3 - 2.70703125*x);}
case 12 : {Real x2=x*x, x4=x2*x2, x6=x4*x2, x8=x4*x4, x10=x6*x4, x12=x6*x6;
return (660.1943359375*x12 - 1894.470703125*x10 + 2029.7900390625*x8 - 997.08984375*x6 + 219.9462890625*x4 - 17.595703125*x2 + 0.2255859375);}
case 13 : {Real x2=x*x, x3=x2*x, x5=x3*x2, x7=x5*x2, x9=x7*x2, x11=x9*x2, x13=x9*x2;
return (1269.6044921875*x13 - 3961.166015625*x11 + 4736.1767578125*x9 - 2706.38671875*x7 + 747.8173828125*x5 -87.978515625*x3 + 2.9326171875*x);}
case 14 : {Real x2=x*x, x4=x2*x2, x6=x4*x2, x8=x4*x4, x10=x6*x4, x12=x6*x6, x14=x8*x6;
return (2448.52294921875*x14 - 8252.42919921875*x12 + 10893.2065429688*x10 - 7104.26513671875*x8 + 2368.08837890625*x6 - 373.90869140625*x4 + 21.99462890625*x2 - 0.20947265625);}
default : return -1e30;
}
}
void AngularCoefficients(Real *Result) {
for (int l = 0; l <= LMAX; l++) {
for (int m = 0; m <= l; m++) {
Result[(l+1)*l/2+m] = (sqrt((2*l+1)/(4*M_PI)) * sqrt(FactorialSCF_tmpname(l-m)/FactorialSCF_tmpname(l+m)));
}
}
}

30
src/mathaux.hpp Normal file
View file

@ -0,0 +1,30 @@
#pragma once
#include "common.hpp"
#include <cuComplex.h>
#ifdef ETICS_DOUBLE_PRECISION
#define Complex cuDoubleComplex
#define make_Complex make_cuDoubleComplex
#define Complex_add cuCadd
#define Complex_mul cuCmul
#define Complex_imag cuCimag
#define Complex_real cuCreal
#define Complex_conj cuConj
#else
#define Complex cuFloatComplex
#define make_Complex make_cuFloatComplex
#define Complex_add cuCaddf
#define Complex_mul cuCmulf
#define Complex_imag cuCimagf
#define Complex_real cuCrealf
#define Complex_conj cuConjf
#endif
#define SQRT_4_PI 3.5449077018110320545963349666822903655950989122447742564276
double FactorialSCF_tmpname(int x);
double GammaPlusHalf(int x);
void RadialCoefficients(Real *Result);
#define MAX_HARDCODED_PL 14
__host__ __device__ Real Pl(int l, Real x);
void AngularCoefficients(Real *Result);

131
src/multicenter.cu Normal file
View file

@ -0,0 +1,131 @@
#include "common.hpp"
#include "mathaux.hpp"
#include "scf.hpp"
#include "integrate.hpp"
#include "multicenter.hpp"
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/execution_policy.h>
etics::scf::scfclass SCFObject1;
etics::scf::scfclass SCFObject2;
vec3 *F_tmp;
vec3 *F_tmp2;
Real *Potential_tmp;
Real *Potential_tmp2;
Particle Origins[2];
void etics::multicenter::Init(int Nmax, int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new) { // to be removed!!
cudaMalloc((void**)&F_tmp, Nmax*sizeof(vec3));
cudaMalloc((void**)&F_tmp2, Nmax*sizeof(vec3));
cudaMalloc((void**)&Potential_tmp, Nmax*sizeof(Real));
cudaMalloc((void**)&Potential_tmp2, Nmax*sizeof(Real));
SCFObject1.Init(Nmax, k3gs_new, k3bs_new, k4gs_new, k4bs_new);
// SCFObject1.OriginPos = vec3(5,0,0);
// SCFObject1.OriginVel = vec3(0,0.1,0);
SCFObject2.Init(Nmax, k3gs_new, k3bs_new, k4gs_new, k4bs_new);
Particle p;
p.m = 0;
p.pos = vec3(-6.000000e-01 , 0 , 0);
p.vel = vec3(0 , -3.803629e-03 , 0);
Origins[0] = p;
p.pos = vec3(2.940000e+01 , 0 , 0);
p.vel = vec3(0 , 1.863778e-01 , 0);
Origins[1] = p;
// OriginsIntegrator = etics::Integrator(Origins, 2);
}
void etics::multicenter::CalculateGravity(Particle *P, int N, Real *Potential, vec3 *F) {
Particle PP[2];
cudaMemcpy(PP, P+N-2, 2*sizeof(Particle), cudaMemcpyDeviceToHost);
// std::cerr << PP[0].pos.x << std::endl;
// std::cerr << PP[0].pos.y << std::endl;
// std::cerr << PP[0].pos.z << std::endl;
// std::cerr << PP[1].pos.x << std::endl;
// std::cerr << PP[1].pos.y << std::endl;
// std::cerr << PP[1].pos.z << std::endl;
// #error all we did last time was to add the offset to the subroutines, now the energy is nan. must be some kind of self force
SCFObject1.GetGpuLock();
SCFObject1.SendCachePointersToGPU();
SCFObject1.LoadParticlesToCache(P, 800000, PP[0].pos);
SCFObject1.CalculateCoefficients();
SCFObject1.ReleaseGpuLock();
SCFObject2.GetGpuLock();
SCFObject2.SendCachePointersToGPU();
SCFObject2.LoadParticlesToCache(P+800000, 200000-2, PP[1].pos);
SCFObject2.CalculateCoefficients();
SCFObject2.ReleaseGpuLock();
SCFObject1.GetGpuLock();
SCFObject1.SendCoeffsToGPU();
SCFObject1.SendCachePointersToGPU();
SCFObject1.LoadParticlesToCache(P, N-2, PP[0].pos);
SCFObject1.CalculateGravityFromCoefficients(Potential_tmp, F_tmp);
SCFObject1.ReleaseGpuLock();
SCFObject2.GetGpuLock();
SCFObject2.SendCoeffsToGPU();
SCFObject2.SendCachePointersToGPU();
SCFObject2.LoadParticlesToCache(P, N-2, PP[1].pos);
SCFObject2.CalculateGravityFromCoefficients(Potential_tmp2, F_tmp2);
SCFObject2.ReleaseGpuLock();
// now deal just with the central particles
SCFObject1.GetGpuLock();
SCFObject1.SendCoeffsToGPU();
SCFObject1.SendCachePointersToGPU();
SCFObject1.LoadParticlesToCache(P+N-1, 1, PP[0].pos); //instead of doing that, you need to "push" it at the end of the list
SCFObject1.CalculateGravityFromCoefficients(Potential_tmp+N-1, F_tmp+N-1); // the last particle is the second center, so should feel the first's force
SCFObject1.ReleaseGpuLock();
SCFObject2.GetGpuLock();
SCFObject2.SendCoeffsToGPU();
SCFObject2.SendCachePointersToGPU();
SCFObject2.LoadParticlesToCache(P+N-2, 1, PP[1].pos);
SCFObject2.CalculateGravityFromCoefficients(Potential_tmp+N-2, F_tmp+N-2);
SCFObject2.ReleaseGpuLock();
thrust::plus<Real> op;
thrust::transform(thrust::device, Potential_tmp, Potential_tmp+N-2, Potential_tmp2, Potential_tmp, thrust::plus<Real>());
thrust::transform(thrust::device, F_tmp, F_tmp+N-2, F_tmp2, F_tmp, thrust::plus<vec3>());
/*
vec3 FFF;
Real PPP;
SCFObject1.CalculateGravityAtPoint(vec3(100,0.001,0.001), &PPP, &FFF);
std::cerr << PPP << std::endl;
std::cerr << FFF.x << std::endl;
std::cerr << FFF.y << std::endl;
std::cerr << FFF.z << std::endl;*/
// vec3 FFF;
// Real PPP;
// cudaMemcpy(&PPP, Potential_tmp+N-2, 1*sizeof(Real), cudaMemcpyDeviceToHost);
// cudaMemcpy(&FFF, F_tmp+N-2, 1*sizeof(vec3), cudaMemcpyDeviceToHost);
// std::cerr << "===" << std::endl;
// std::cerr << PPP << std::endl;
// std::cerr << FFF.x << std::endl;
// std::cerr << FFF.y << std::endl;
// std::cerr << FFF.z << std::endl;
// cudaMemcpy(&PPP, Potential_tmp+N-1, 1*sizeof(Real), cudaMemcpyDeviceToHost);
// cudaMemcpy(&FFF, F_tmp+N-1, 1*sizeof(vec3), cudaMemcpyDeviceToHost);
// std::cerr << "+++" << std::endl;
// std::cerr << PPP << std::endl;
// std::cerr << FFF.x << std::endl;
// std::cerr << FFF.y << std::endl;
// std::cerr << FFF.z << std::endl;
cudaMemcpy(Potential, Potential_tmp, N*sizeof(Real), cudaMemcpyDeviceToDevice);
cudaMemcpy(F, F_tmp, N*sizeof(vec3), cudaMemcpyDeviceToDevice);
}

6
src/multicenter.hpp Normal file
View file

@ -0,0 +1,6 @@
namespace etics {
namespace multicenter {
void Init(int Nmax, int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new);
void CalculateGravity(Particle *P, int N, Real *Potential, vec3 *F);
}
}

15
src/noboost.inc Normal file
View file

@ -0,0 +1,15 @@
/*******************************************************************************
* This is an ETICS parameter file used when the BOOST library is not available.
* When changing a parameter, the code has to be recompiled; if using BOOST, this
* is not required. The syntax is C++, just fill in the variables used in the
* regular initialization file. No error messages will be shown if input is bad.
*******************************************************************************/
Filename = "_nofile_ 0";
N = 1048576;
Tcrit = 16;
StepSize = 0.001953125;
dT1 = 0.001953125;
dT2 = 0.1;
Prefix = "test_del_me";
OutputFormat = "ascii";

518
src/scf.cu Normal file
View file

@ -0,0 +1,518 @@
/**
* @file
* @author Yohai Meiron <ymeiron@pku.edu.cn>
* @brief Functions to calculate gravitational force using the SCF method.
*/
#include "common.hpp"
#include "mathaux.hpp"
#include "scf.hpp"
#include <iostream>
using std::cout;
using std::cerr;
using std::endl;
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cmath>
#include <unistd.h>
// Use the MPI flag only for the standalone program
#ifdef ETICS_MPI
#include <mpi.h>
#endif
namespace etics {
namespace scf {
scfclass SCFObject;
// These are truly global variables that are initialized once and can be read by every scfclass object.
bool ScfGloballyInitialized = false;
__constant__ Real RadCoeff[(NMAX+1)*(LMAX+1)];
__constant__ Real AngCoeff[(LMAX+1)*(LMAX+2)/2];
Real RadCoeff_h[(NMAX+1)*(LMAX+1)];
Real AngCoeff_h[(LMAX+1)*(LMAX+2)/2];
__constant__ Real LengthScale;
Real LengthScale_h;
// These are global variables that are read/written by each scfclass object; they must be locked before use.
__constant__ Complex A[(NMAX+1)*(LMAX+1)*(LMAX+2)/2];
__constant__ CacheStruct Cache;
// These variables provide the mechanism to lock and release the global constant memory variables above.
scfclass *GpuLockOwner = NULL;
#define GPU_LOCK_PANIC \
{ \
std::cerr << "Error: GPU lock failed; lock not owned by this object!" << endl; \
exit(1); \
}
}
}
namespace etics {
namespace scf {
int blockSizeToDynamicSMemSize(int BlockSize) { // Should be a lambda function
return (LMAX+1)*sizeof(Complex)*BlockSize;
}
}
}
// Class implementation
etics::scf::scfclass::scfclass() {
N=0;
Nmax=0;
k3gs=-1; k3bs=-1; k4gs=-1; k4bs=-1;
}
etics::scf::scfclass::~scfclass() {
if (etics::scf::GpuLockOwner == this) ReleaseGpuLock();
cudaFree(Cache_h.xi);
cudaFree(Cache_h.Phi0l);
cudaFree(Cache_h.Wprev1);
cudaFree(Cache_h.Wprev2);
cudaFree(Cache_h.costheta);
cudaFree(Cache_h.sintheta_I);
cudaFree(Cache_h.Exponent);
cudaFree(Cache_h.mass);
free(PartialSum_h);
cudaFree(PartialSum);
}
void etics::scf::scfclass::GetGpuLock() {
if (etics::scf::GpuLockOwner == this) return;
if (etics::scf::GpuLockOwner == NULL) etics::scf::GpuLockOwner = this;
else GPU_LOCK_PANIC;
}
void etics::scf::scfclass::ReleaseGpuLock() {
if (etics::scf::GpuLockOwner == NULL) return;
if (etics::scf::GpuLockOwner == this) etics::scf::GpuLockOwner = NULL;
else GPU_LOCK_PANIC;
}
void etics::scf::scfclass::CalculateCoefficients(int n, int l) {
int BaseAddress = n*(LMAX+1)*(LMAX+2)/2 + l*(l+1)/2;
etics::scf::CalculateCoefficientsPartial<<<k3gs,k3bs,k3bs*sizeof(Complex)*(LMAX+1)>>>(N, n, l, PartialSum);
cudaMemcpy(PartialSum_h, PartialSum, k3gs*(l+1)*sizeof(Complex), cudaMemcpyDeviceToHost);
for (int m = 0; m <= l; m++)
for (int Block=0; Block<k3gs; Block++)
A_h[BaseAddress + m] = Complex_add(A_h[BaseAddress + m], PartialSum_h[Block*(l+1) + m]);
}
void etics::scf::scfclass::CalculateCoefficients() {
memset(A_h, 0, (NMAX+1)*(LMAX+1)*(LMAX+2)/2 * sizeof(Complex));
for (int l = 0; l <= LMAX; l++) {
if (l > 0) etics::scf::CalculatePhi0l<<<128,128>>>(N, l); // wouldn't it make sense just putting it after the n-loop finishes? Probably not becasue then we need to skip at the last iter
for (int n = 0; n <= NMAX; n++) {
CalculateCoefficients(n, l);
}
}
}
void etics::scf::scfclass::GetCoefficients(Complex *A) {
std::copy(A_h, A_h+(NMAX+1)*(LMAX+1)*(LMAX+2)/2, A);
}
void etics::scf::scfclass::SendCoeffsToGPU() {
if (etics::scf::GpuLockOwner != this) GPU_LOCK_PANIC;
cudaMemcpyToSymbol(etics::scf::A, A_h, (NMAX+1)*(LMAX+1)*(LMAX+2)/2 * sizeof(Complex));
}
void etics::scf::scfclass::SendCachePointersToGPU() {
if (etics::scf::GpuLockOwner != this) GPU_LOCK_PANIC;
cudaMemcpyToSymbol(etics::scf::Cache, &Cache_h, sizeof(etics::scf::CacheStruct));
}
void etics::scf::scfclass::LoadParticlesToCache(Particle *P, int N, vec3 Offset) {
if (N > Nmax) {
fprintf(stderr, "You tried to load too many particles! (%d > %d)", N, Nmax);
exit(1);
}
this->N = N;
etics::scf::LoadParticlesToCache<<<128,128>>>(P, N, Offset);
}
void etics::scf::scfclass::CalculateGravityFromCoefficients(Real *Potential, vec3 *F) {
etics::scf::CalculateGravityFromCoefficients<<<(N+k4bs-1)/k4bs,k4bs>>>(N, Potential, F);
}
void etics::scf::scfclass::CalculateGravityAtPoint(vec3 Pos, Real *Potential, vec3 *F) {
static bool first = true;
if (first) {
printf("WARNING!!! the code has been modified and CalculateGravityAtPoint may not work as planned.\n");
first = false;
}
Real r = sqrt(Pos.x*Pos.x + Pos.y*Pos.y + Pos.z*Pos.z);
Real xi = (r-1)/(r+1);
Real costheta = Pos.z/r;
Real Normal_I = rsqrt(Pos.x*Pos.x + Pos.y*Pos.y);
Complex Exponent = make_Complex(Pos.x*Normal_I, Pos.y*Normal_I);
etics::scf::CalculateGravityTemplate<0>(xi, costheta, Exponent, A_h, F, Potential);
}
void etics::scf::scfclass::CalculateGravity(Particle *P, int N, Real *Potential, vec3 *F) {
GetGpuLock();
SendCachePointersToGPU();
LoadParticlesToCache(P, N);
CalculateCoefficients();
#ifdef ETICS_MPI
Complex ATotal[(NMAX+1)*(LMAX+1)*(LMAX+2)/2];
MPI_Allreduce(&A_h, &ATotal, (NMAX+1)*(LMAX+1)*(LMAX+2)/2*2, MPI_ETICS_REAL, MPI_SUM, MPI_COMM_WORLD);
std::copy(ATotal, ATotal+(NMAX+1)*(LMAX+1)*(LMAX+2)/2, A_h);
// not really need this copy, just calculate the coefficients in some local array, then sum it into a global array (A_h or somthing) and copy it to GPUs
#endif
SendCoeffsToGPU();
CalculateGravityFromCoefficients(Potential, F);
ReleaseGpuLock();
}
void etics::scf::scfclass::SetLaunchConfiguration(int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new) {
if (k3gs_new != k3gs) {
if (PartialSum_h != NULL) free(PartialSum_h);
if (PartialSum != NULL) cudaFree(PartialSum_h);
PartialSum_h = (Complex*)malloc(k3gs_new*(LMAX+1)*sizeof(Complex));
cudaMalloc((void**)&PartialSum, k3gs_new*(LMAX+1)*sizeof(Complex));
}
k3gs = k3gs_new;
k3bs = k3bs_new;
k4gs = k4gs_new;
k4bs = k4bs_new;
}
void etics::scf::scfclass::Init(int Nmax, int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new) {
// First, initialize global arrays if not already initialized.
if (!etics::scf::ScfGloballyInitialized) etics::scf::GlobalInit();
// Next, set the launch configuation (either guess it or use what the user specified).
if ((k3gs_new<=0) || (k3bs_new<=0) || (k4gs_new<=0) || (k4bs_new<=0)) {
int k3gs_new, k3bs_new, k4gs_new, k4bs_new;
etics::scf::GuessLaunchConfiguration(Nmax, &k3gs_new, &k3bs_new, &k4gs_new, &k4bs_new);
SetLaunchConfiguration(k3gs_new, k3bs_new, k4gs_new, k4bs_new);
}
else SetLaunchConfiguration(k3gs_new, k3bs_new, k4gs_new, k4bs_new);
// If Nmax > 0 then initialize the cahche memory.
if (Nmax > 0) {
this->Nmax = Nmax;
cudaMalloc((void**)&Cache_h.xi, Nmax * sizeof(Real));
cudaMalloc((void**)&Cache_h.Phi0l, Nmax * sizeof(Real));
cudaMalloc((void**)&Cache_h.Wprev1, Nmax * sizeof(Real));
cudaMalloc((void**)&Cache_h.Wprev2, Nmax * sizeof(Real));
cudaMalloc((void**)&Cache_h.costheta, Nmax * sizeof(Real));
cudaMalloc((void**)&Cache_h.sintheta_I, Nmax * sizeof(Real));
cudaMalloc((void**)&Cache_h.Exponent, Nmax * sizeof(Complex));
cudaMalloc((void**)&Cache_h.mass, Nmax * sizeof(Real));
} else {
Cache_h.xi = NULL;
Cache_h.Phi0l = NULL;
Cache_h.Wprev1 = NULL;
Cache_h.Wprev2 = NULL;
Cache_h.costheta = NULL;
Cache_h.sintheta_I = NULL;
Cache_h.Exponent = NULL;
Cache_h.mass = NULL;
}
}
// Kernels
__global__ void etics::scf::LoadParticlesToCache(Particle *P, int N, vec3 Offset) { // formerly "Kernel1"
int i = threadIdx.x + blockIdx.x * blockDim.x;
while (i < N) {
vec3 Pos = (P[i].pos - Offset)*etics::scf::LengthScale;
Real r = sqrt(Pos.x*Pos.x + Pos.y*Pos.y + Pos.z*Pos.z);
Real xi = (r-1)/(r+1);
Real costheta = Pos.z/r;
Real sintheta_I = rsqrt(1-costheta*costheta);
Cache.xi[i] = xi;
Cache.Phi0l[i] = 0.5 * (1 - xi);
Cache.costheta[i] = costheta;
Cache.sintheta_I[i] = sintheta_I;
Real Normal_I = rsqrt(Pos.x*Pos.x + Pos.y*Pos.y);
Complex Exponent = make_Complex(Pos.x*Normal_I, -Pos.y*Normal_I);
Cache.Exponent[i] = Exponent;
Cache.mass[i] = P[i].m;
i += blockDim.x * gridDim.x;
}
}
__global__ void etics::scf::CalculatePhi0l(int N, int l) { // formerly "Kernel2"
int i = threadIdx.x + blockIdx.x * blockDim.x;
while (i < N) {
Real xi = Cache.xi[i];
Cache.Phi0l[i] *= 0.25*(1-xi*xi);
i += blockDim.x * gridDim.x;
}
}
__global__ void etics::scf::CalculateCoefficientsPartial(int N, int n, int l, Complex *PartialSum) { // formerly "Kernel3"
extern __shared__ Complex ReductionCache[]; // size is determined in kernel launch
int tid = threadIdx.x;
for (int m = 0; m <= l; m++) ReductionCache[m*blockDim.x+tid] = make_Complex(0, 0);
int i = threadIdx.x + blockIdx.x * blockDim.x;
while (i < N) {
Real xi = Cache.xi[i];
Real Wnl;
if (n == 0) Wnl = 1;
else if (n == 1) {Wnl = (4*l+3)*xi; Cache.Wprev2[i] = Wnl;}
else if (n == 2) {Wnl = -(2*l+1.5)+( 8*l*(l+2) +7.5)*xi*xi; Cache.Wprev1[i] = Wnl;}
else {
Real Wprev1 = Cache.Wprev1[i];
Wnl = (xi*(2*n+4*l+1)*Wprev1 - (n+4*l+1)*Cache.Wprev2[i])/(Real)n;
if (n < NMAX) { // Writing is expensive, avoid if possible.
Cache.Wprev2[i] = Wprev1;
Cache.Wprev1[i] = Wnl;
}
}
Real RadialPart = - Cache.mass[i] * SQRT_4_PI * Cache.Phi0l[i] * Wnl * RadCoeff[(LMAX+1)*n+l];
Real costheta = Cache.costheta[i];
Real Plm = Pl(l, costheta);
ReductionCache[tid] = Complex_add(ReductionCache[tid], make_Complex(RadialPart * Plm * AngCoeff[(l+1)*l/2],0));
if (l == 0) {i += blockDim.x * gridDim.x; continue;}
//////////////////////////////// ugly fix
if ((costheta < -0.999) || (costheta > +0.999)) {
i += blockDim.x * gridDim.x;
continue;
}
//////////////////////////////// ugly fix
Real Plm_prev1 = Plm;
Real sintheta_I = Cache.sintheta_I[i];
Plm = (costheta*Plm - Pl(l-1, costheta))*l*sintheta_I;
Complex Exponent = Cache.Exponent[i];
Real tmp0 = RadialPart * Plm * AngCoeff[(l+1)*l/2+1];
ReductionCache[blockDim.x+tid] = Complex_add(ReductionCache[blockDim.x+tid], make_Complex(tmp0 * Exponent.x, tmp0 * Exponent.y));
if (l == 1) {i += blockDim.x * gridDim.x; continue;}
Complex TorodialPart = Exponent;
for (int m = 2; m <= l; m++) { // make sure no redundancy at the end of the loop
Real Plm_prev2 = Plm_prev1;
Plm_prev1 = Plm;
Plm = - 2*(m-1)*costheta*sintheta_I*Plm_prev1 - (l+m-1)*(l-m+2)*Plm_prev2;
TorodialPart = Complex_mul(TorodialPart, Exponent);
tmp0 = RadialPart * Plm * AngCoeff[(l+1)*l/2+m];
ReductionCache[m*blockDim.x+tid] = Complex_add(ReductionCache[m*blockDim.x+tid], make_Complex(tmp0 * TorodialPart.x, tmp0 * TorodialPart.y));
}
i += blockDim.x * gridDim.x;
}
__syncthreads();
for (int m = 0; m <= l; m++) {
i = blockDim.x/2;
while (i != 0) {
if (tid < i)
ReductionCache[m*blockDim.x+tid] = Complex_add(ReductionCache[m*blockDim.x+tid], ReductionCache[m*blockDim.x+tid+i]);
__syncthreads();
i /= 2;
}
if (tid == 0)
PartialSum[blockIdx.x*(l+1) + m] = ReductionCache[m*blockDim.x];
}
}
template<int Mode>
__host__ __device__ void etics::scf::CalculateGravityTemplate(Real xi, Real costheta, Complex Exponent, Complex *A, vec3 *F, Real *Potential) {
// it gets A as parameter because it can be either on host or device
// 0 = both force and potential, 1 = only force, 2 = only pot
//WARNING !!! This cannot really be a host function because it needs device cahce, angular coefficients which are on device !!!
#define A(n,l,m) A[n*(LMAX+1)*(LMAX+2)/2 + l*(l+1)/2 + m]
#ifndef __CUDA_ARCH__
#define AngCoeff AngCoeff_h
#define LengthScale LengthScale_h
#endif
Real dPhiLeft;
Real dPhiLeftMul;
Real dPhiRight;
Real dPhiRightAdd;
Real dPhi;
Real RadialPart2;
Real PlmDerivTheta;
Real Pot = 0;
Real Fr = 0, Ftheta = 0, Fphi = 0;
Real OneOverXiPlusOne = 1/(1+xi);
Real r_I = (1-xi)*OneOverXiPlusOne;
Real r = 1/r_I; // It's quite likely we can do everything without r.
Real sintheta_I = rsqrt(1-costheta*costheta); // faster than using cachei // You sure??? in K3 it's the opposite
Complex ExponentTmp[LMAX];
ExponentTmp[0] = Exponent;
for (int m = 1; m < LMAX; m++) ExponentTmp[m] = Complex_mul(ExponentTmp[m-1],Exponent);
if (Mode != 2) {
Real xi2 = xi*xi;
Real xi3 = xi2*xi;
dPhiLeft = -0.25*OneOverXiPlusOne;
dPhiLeftMul = 0.25*(1-xi2);
dPhiRight = xi3 - xi2 - xi + 1;
dPhiRightAdd = 2*(xi3 - 2*xi2 + xi);
}
Real Phi0l = 1/(1+r);
Real tmp1 = Phi0l*Phi0l*r;
for (int l = 0; l <= LMAX; l++) {
if (Mode != 2) {
if (l > 0) {
dPhiLeft *= dPhiLeftMul;
dPhiRight += dPhiRightAdd;
}
}
if (Mode != 2) dPhi = dPhiLeft * dPhiRight;
for (int n = 0; n <= NMAX; n++) {
Real Wnl, Wprev1, Wprev2;
if (n == 0) Wnl = 1;
else if (n == 1) {Wnl = (4*l+3)*xi; Wprev2 = Wnl;}
else if (n == 2) {Wnl = -(2*l+1.5)+( 8*l*(l+2) +7.5)*xi*xi; Wprev1 = Wnl;}
else {
Wnl = (xi*(2*n+4*l+1)*Wprev1 - (n+4*l+1)*Wprev2)/(Real)n;
Wprev2 = Wprev1;
Wprev1 = Wnl;
}
Real Wderiv = 0;
if (n == 1) {Wderiv = 4*l + 3;}
else if (n > 1) {
Wderiv = (-n*xi*Wnl + (n+4*l+2)*Wprev2)/(1-xi*xi);
} // From an unknown reason it's faster to have this Block separate from the previous one.
Real RadialPart = - SQRT_4_PI * Phi0l * Wnl;
if (Mode != 2) RadialPart2 = SQRT_4_PI * (dPhi*Wnl + Phi0l*Wderiv*2/pow(1+r,2));
Real Plm = Pl(l, costheta);
Real tmp2 = Complex_real(A(n,l,0)) * AngCoeff[(l+1)*l/2] * Plm;
if (Mode != 1) Pot += RadialPart * tmp2;
if (Mode != 2) Fr += RadialPart2 * tmp2;
if (l == 0) continue;
//////////////////////////////// ugly fix
if ((costheta < -0.999) || (costheta > +0.999)) {
continue;
}
//////////////////////////////// ugly fix
// The Block below is l>=1, m=0.
if (Mode != 2) {
PlmDerivTheta = (costheta*Plm - Pl(l-1, costheta))*l*sintheta_I; //TODO check if storing Pl(l-1) somewhere makes it faster
Ftheta += - PlmDerivTheta * AngCoeff[(l+1)*l/2] * Complex_real(A(n,l,0)) * RadialPart * r_I;
}
// The Block below is l>=1, m=1.
if (Mode == 2) PlmDerivTheta = (costheta*Plm - Pl(l-1, costheta))*l*sintheta_I; //TODO see above regarding storing Pl(l-1)
Real Plm_prev1 = Plm;
Plm = PlmDerivTheta; // PlmDerivTheta equals Plm for m=1.
if (Mode != 2) PlmDerivTheta = - Plm*costheta*sintheta_I - l*(l+1)*Plm_prev1;
tmp2 = 2 * AngCoeff[(l+1)*l/2+1];
Complex tmp3 = Complex_mul(ExponentTmp[0], A(n,l,1));
Complex tmp4 = make_Complex(tmp2 * tmp3.x, tmp2 * tmp3.y);
Complex tmp5 = make_Complex(Plm * tmp4.x, Plm * tmp4.y);
Complex tmp6 = make_Complex(RadialPart * tmp5.x, RadialPart * tmp5.y);
if (Mode != 1) Pot += Complex_real(tmp6);
if (Mode != 2) {
Fr += RadialPart2 * Complex_real(tmp5);
Fphi += Complex_imag(tmp6) * sintheta_I * r_I;
Ftheta += - RadialPart * PlmDerivTheta * Complex_real(tmp4) * r_I;
}
if (l == 1) continue;
for (int m = 2; m <= l; m++) {
Real Plm_prev2 = Plm_prev1;
Plm_prev1 = Plm;
Plm = - 2*(m-1)*costheta*sintheta_I*Plm_prev1 - (l+m-1)*(l-m+2)*Plm_prev2;
tmp2 = 2 * AngCoeff[(l+1)*l/2+m];
tmp3 = Complex_mul(ExponentTmp[m-1], A(n,l,m));
tmp4 = make_Complex(tmp2 * tmp3.x, tmp2 * tmp3.y);
tmp5 = make_Complex(Plm * tmp4.x, Plm * tmp4.y);
tmp6 = make_Complex(RadialPart * tmp5.x, RadialPart * tmp5.y);
if (Mode != 1) Pot += Complex_real(tmp6);
if (Mode != 2) {
PlmDerivTheta = - m*Plm*costheta*sintheta_I - (l+m)*(l-m+1)*Plm_prev1;
Fr += RadialPart2 * Complex_real(tmp5);
Fphi += m * Complex_imag(tmp6) * sintheta_I * r_I;
Ftheta += - RadialPart * PlmDerivTheta * Complex_real(tmp4) * r_I;
}
}
}
Phi0l *= tmp1;
}
if (Mode != 2) {
Real sintheta = 1/sintheta_I;
Real tanphi = Exponent.y/Exponent.x;
Real cosphi = ((Exponent.x >= 0)?(+1):(-1)) * rsqrt(1+tanphi*tanphi); // no simpler way to get sign bit?
Real sinphi = tanphi*cosphi;
*F = vec3(sintheta*cosphi*Fr + costheta*cosphi*Ftheta - sinphi*Fphi, sintheta*sinphi*Fr + costheta*sinphi*Ftheta + cosphi*Fphi, costheta*Fr - sintheta*Ftheta);
*F = (*F)*LengthScale*LengthScale;
}
if (Mode != 1) *Potential = Pot*LengthScale;
#ifndef __CUDA_ARCH__
#undef AngCoeff
#undef LengthScale
#endif
#undef A
}
__global__ void etics::scf::CalculateGravityFromCoefficients(int N, Real *Potential, vec3 *F) { // formerly "Kernel4"
int i = threadIdx.x + blockIdx.x * blockDim.x;
while (i < N) {
CalculateGravityTemplate<0>(Cache.xi[i], Cache.costheta[i], Complex_conj(Cache.Exponent[i]), A, &F[i], &Potential[i]);
i += blockDim.x * gridDim.x;
}
}
// Global initialization
void etics::scf::GlobalInit() {
int pid = getpid();
int devid;
cudaGetDevice(&devid);
char hostname[256];
gethostname(hostname, 256);
char output_string[256];
sprintf(output_string, "ETICS: pid %05d using GPU device number %d on %s", pid, devid, hostname);
std::cout << output_string << std::endl;
RadialCoefficients(etics::scf::RadCoeff_h);
cudaMemcpyToSymbol(etics::scf::RadCoeff, etics::scf::RadCoeff_h, (NMAX+1)*(LMAX+1) * sizeof(Real));
AngularCoefficients(etics::scf::AngCoeff_h);
cudaMemcpyToSymbol(etics::scf::AngCoeff, etics::scf::AngCoeff_h, (LMAX+1)*(LMAX+2)/2 * sizeof(Real));
etics::scf::LengthScale_h = 1;
cudaMemcpyToSymbol(etics::scf::LengthScale, &etics::scf::LengthScale_h, sizeof(Real));
ScfGloballyInitialized = true;
}
void etics::scf::SetLengthScale(Real NewLengthScale) {
etics::scf::LengthScale_h = NewLengthScale;
cudaMemcpyToSymbol(etics::scf::LengthScale, &etics::scf::LengthScale_h, sizeof(Real));
}
// Old API
void etics::scf::GuessLaunchConfiguration(int N, int *k3gs_new, int *k3bs_new, int *k4gs_new, int *k4bs_new) {
int blockSize;
int minGridSize;
int gridSize;
//WARNING setting blockSizeLimit=128 for cudaOccupancyMaxPotentialBlockSizeVariableSMem.
cudaOccupancyMaxPotentialBlockSizeVariableSMem(&minGridSize, &blockSize, CalculateCoefficientsPartial, blockSizeToDynamicSMemSize, 128);
gridSize = minGridSize;
*k3gs_new = gridSize;
*k3bs_new = blockSize;
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, CalculateGravityFromCoefficients, 0, N);
gridSize = (N + blockSize - 1) / blockSize;
*k4gs_new = gridSize;
*k4bs_new = blockSize;
}
void etics::scf::Init(int N, int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new) { // to be removed!!
SCFObject.Init(N, k3gs_new, k3bs_new, k4gs_new, k4bs_new);
}
void etics::scf::CalculateGravity(Particle *P, int N, Real *Potential, vec3 *F) {
SCFObject.CalculateGravity(P, N, Potential, F);
}

61
src/scf.hpp Normal file
View file

@ -0,0 +1,61 @@
#pragma once
#include "common.hpp"
#include "mathaux.hpp"
namespace etics {
namespace scf {
// CUDA Kernels and device functions cannot be in class space
__global__ void LoadParticlesToCache(Particle *P, int N, vec3 Offset=vec3(0,0,0));
__global__ void CalculatePhi0l(int N, int l);
__global__ void CalculateCoefficientsPartial(int N, int n, int l, Complex *PartialSum);
template<int Mode> __host__ __device__ void CalculateGravityTemplate(Real xi, Real costheta, Complex Exponent, Complex *A, vec3 *F, Real *Potential);
__global__ void CalculateGravityFromCoefficients(int N, Real *Potential, vec3 *F);
void CalculateGravity(Particle *P, int N, Real *Potential, vec3 *F);
void Init(int Nmax, int k3gs_new=0, int k3bs_new=0, int k4gs_new=0, int k4bs_new=0); // to be removed!
void GlobalInit();
void GuessLaunchConfiguration(int N, int *k3gs_new, int *k3bs_new, int *k4gs_new, int *k4bs_new);
void SetLengthScale(Real NewLengthScale);
struct CacheStruct {
Real *xi;
Real *Phi0l;
Real *Wprev1;
Real *Wprev2;
Real *costheta;
Real *sintheta_I;
Complex *Exponent;
Real *mass;
};
class scfclass {
public:
scfclass();
~scfclass();
void SendCachePointersToGPU();
void SendCoeffsToGPU();
void LoadParticlesToCache(Particle *P, int N, vec3 Offset=vec3(0,0,0));
void CalculateCoefficients();
void CalculateGravityFromCoefficients(Real *Potential, vec3 *F);
void CalculateGravity(Particle *P, int N, Real *Potential, vec3 *F);
void CalculateGravityAtPoint(vec3 Pos, Real *Potential, vec3 *F);
void Init(int N, int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new);
void SetLaunchConfiguration(int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new);
void GetCoefficients(Complex *A);
void GetGpuLock();
void ReleaseGpuLock();
//WARNING: the following two functions are not implemented but they should be
int GetNMAX();
int GetLMAX();
//TODO We'll make them private in the final version, now we need access for debugging
int N;
int Nmax;
//WARNING we just hardcode 1024 as the maximum size of the coefficient structure. This is because having NMAX and LMAX here causes problems.
Complex A_h[1024];
CacheStruct Cache_h;
Complex *PartialSum = NULL;
Complex *PartialSum_h = NULL;
int k3gs, k3bs, k4gs, k4bs;
void CalculateCoefficients(int n, int l);
};
}
}