From 1718e6287bee102f3f46f868e405811c87c18887 Mon Sep 17 00:00:00 2001
From: Yohai <yohai@meiron.org>
Date: Sun, 25 Aug 2019 16:08:44 +0800
Subject: [PATCH] added project files

---
 Makefile            |  49 +++++
 README.md           | 102 +++++++++
 WORKPLAN.md         |  26 +++
 __init__.py         |   1 +
 format-convert.py   | 126 +++++++++++
 interface.cu        | 378 ++++++++++++++++++++++++++++++++
 interface.py        | 160 ++++++++++++++
 src/Makefile        |  72 ++++++
 src/common.hpp      | 117 ++++++++++
 src/file.ini        |  33 +++
 src/ic.cpp          | 269 +++++++++++++++++++++++
 src/ic.hpp          |   7 +
 src/integrate.cu    | 127 +++++++++++
 src/integrate.hpp   |  27 +++
 src/io.cpp          | 367 +++++++++++++++++++++++++++++++
 src/io.hpp          |  19 ++
 src/main.cu         | 347 +++++++++++++++++++++++++++++
 src/mathaux.cu      |  57 +++++
 src/mathaux.hpp     |  30 +++
 src/multicenter.cu  | 131 +++++++++++
 src/multicenter.hpp |   6 +
 src/noboost.inc     |  15 ++
 src/scf.cu          | 518 ++++++++++++++++++++++++++++++++++++++++++++
 src/scf.hpp         |  61 ++++++
 24 files changed, 3045 insertions(+)
 create mode 100644 Makefile
 create mode 100644 README.md
 create mode 100644 WORKPLAN.md
 create mode 100644 __init__.py
 create mode 100644 format-convert.py
 create mode 100644 interface.cu
 create mode 100644 interface.py
 create mode 100644 src/Makefile
 create mode 100644 src/common.hpp
 create mode 100644 src/file.ini
 create mode 100644 src/ic.cpp
 create mode 100644 src/ic.hpp
 create mode 100644 src/integrate.cu
 create mode 100644 src/integrate.hpp
 create mode 100644 src/io.cpp
 create mode 100644 src/io.hpp
 create mode 100644 src/main.cu
 create mode 100644 src/mathaux.cu
 create mode 100644 src/mathaux.hpp
 create mode 100644 src/multicenter.cu
 create mode 100644 src/multicenter.hpp
 create mode 100644 src/noboost.inc
 create mode 100644 src/scf.cu
 create mode 100644 src/scf.hpp

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..fe49095
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,49 @@
+# standard amuse configuration include
+# config.mk will be made after ./configure has run
+AMUSE_DIR?=../../../..
+-include $(AMUSE_DIR)/config.mk
+GPUARCH ?= 60
+CUDAFLAGS += -arch=sm_$(GPUARCH)
+######### ugly!!!! should be in the amuse base make file
+
+
+MPICXX   ?= mpicxx
+
+CFLAGS   += -Wall -g
+CXXFLAGS += $(CFLAGS) 
+LDFLAGS  += -lm $(MUSE_LD_FLAGS)
+
+OBJS = interface.o
+
+CODELIB = src/libetics.a
+
+CODE_GENERATOR = $(AMUSE_DIR)/build.py
+
+all: etics_worker
+
+clean:
+	$(RM) *.so *.o *.pyc worker_code.cc worker_code.h 
+	$(RM) *~ etics_worker worker_code.cc
+	make -C src clean
+
+$(CODELIB):
+	make -C src libetics.a
+
+worker_code.cc: interface.py
+	$(CODE_GENERATOR) --type=c interface.py EticsInterface -o $@
+
+worker_code.h: interface.py
+	$(CODE_GENERATOR) --type=H interface.py EticsInterface -o $@
+
+etics_worker: worker_code.cc worker_code.h $(CODELIB) $(OBJS)
+	$(MPICXX) $(CXXFLAGS) -L/usr/local/cuda/lib64 $< $(OBJS) $(CODELIB) -o $@ -lcudart
+	echo the above compilation directive is not nice
+
+.SUFFIXES: .cu .o
+
+.cu.o: $<
+	$(NVCC) -Xcompiler="$(CXXFLAGS)" -c -o $@ $<
+
+
+# .cc.o: $<
+# 	$(CXX) $(CXXFLAGS) -c -o $@ $<
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..6475b1f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,102 @@
+ETICS
+=====
+
+This is ETICS (Expansion Techniques In Collisionless Systems), a GPU (currently
+**CUDA only**) N-body code which uses series expansion to calculate the
+gravitational field. See more details in this publication:
+
+Meiron, Y., Li, B., Holley-Bockelmann, K., & Spurzem, R. 2014, ApJ, 792, 98
+
+http://adsabs.harvard.edu/abs/2014ApJ...792...98M
+
+
+What's inside
+-------------
+
+- ETICS standalone program
+- ETICS static library
+- ETICS module for AMUSE
+
+
+Prerequisites
+-------------
+
+- CUDA (>= 6; mandatory)
+- HDF5 (optional)
+- Boost (optional)
+- AMUSE (mandatory only for AMUSE module)
+
+To disable HDF5 [insert explanation here]
+To disable Boost [insert explanation here]
+
+
+Compilation
+-----------
+
+### Standalone program
+
+    make standalone
+
+builds in `src/`.
+
+
+### Static library
+
+    make library
+
+builds in `src/`.
+
+
+### AMUSE module
+
+    make
+
+builds in top level directory. The whole `etics` directory has to be placed (or
+linked) inside:
+
+    $AMUSE_DIR/src/amuse/community/
+
+
+How to use
+----------
+
+The `file.ini` is a self-explanatory input file; if compiled without Boost, fill
+in the relevant variables in file `noboost.inc` which is compiled into the
+executable (any change of parameters requires re-compilation). Start simulation
+with:
+
+    ./etics file.ini
+
+Any input file name is acceptable.
+
+
+Known issues
+------------
+
+* No MEX
+
+The MEX (Multipole Expansion) method is not available in this version; thus, the
+SCF (Self-Consistent Field) method is the only expansion technique available.
+The ETICS program has been heavily restructured and the MEX routines are no
+longer compatible. Hopefully this will be fixed.
+
+* Hardcoded launch configuration
+
+For at least one of the CUDA kernels, for various reasons, it seems that "brute
+force" search is needed to find the optimal launch configuration. Currently it
+is hardcoded, and a primitive search routine is provided.
+
+* Problem for particles with |θ| << 1
+
+Due to using an unstable recursion relation to calculate the Associated Legendre
+polynomials, particles in a narrow cone around the z-axis cannot be considered
+accurately. This means that they give an erroneous contribution to the
+gravitational field and also are assigned erroneous force and potential. The
+size of this cone increases with the angular term of the expansion. To partly
+solve this, the current (ugly) fix is to only consider particles with cos(θ) >
+0.999 at the monopole level. This is not so bad because the monopole is always
+the most dominant term (and is error free) and the number of particles in this
+cone is small and they are transient (i.e. they come out of it usually in a
+small number of steps). A somewhat better solution is to make this arbitrary
+cutoff of 0.999 l-dependent, and an even better solution would be to use an
+asymptotic expression for the Associated Legendre polynomials.
diff --git a/WORKPLAN.md b/WORKPLAN.md
new file mode 100644
index 0000000..591474c
--- /dev/null
+++ b/WORKPLAN.md
@@ -0,0 +1,26 @@
+The idea is to be able to handle mergers and sinking galactic satellites by 
+having each subsystem centered on one (massless?) particle, and have the 
+expansion calculated separately in each center of mass system (only for 
+particles belonging to it). After the internal forces are calculated for each 
+subsystem, the mutual forces are calculated as well, and all the particles are 
+integrated in a global coordinate system.
+
+Technically, most of the SCF module has to become a class. Some of the arrays 
+like RadCoeff and AngCoeff (anything else?) should remain global and would be 
+initialized by some global initialization routine. Then each subsystem would be 
+a different instance of the SCF class, with its own list of coefficients and 
+particle cache. It seems that all subsystems can (should?) share the integrator 
+object. Maybe (probably?) the integrator should be agnostic to how particles are 
+divided into subsystems, and the call to CalculateGravity() would take care of 
+everything. In this case we are talking about a class for the current SCF 
+routines but also some wrapper that knows how to CalculateGravity() by calling 
+the appropriate methods for each subsystem's object, and also work out the 
+mutual gravity. Need a good name for this intermediate level between the 
+integrator and the SCF core.
+
+This intermediate level or even a different class would deal how to associate 
+particles to the different subsystems and how to figure out the initial 
+conditions. Also, the central particles might need some special treatment 
+(possibly the integrator considers them normal particles but this other level 
+knows not to use them for force calculation; I should think whether those should 
+be omitted from output, but it might be helpful to keep them).
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..abe3ba8
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1 @@
+# generated file
\ No newline at end of file
diff --git a/format-convert.py b/format-convert.py
new file mode 100644
index 0000000..bab62eb
--- /dev/null
+++ b/format-convert.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python
+# This script converts phiGRAPE snapshot files to HDF5.
+# Author: Yohai Meiron
+
+import argparse, os, sys, numpy, re
+
+# Set commandline parameters
+parser = argparse.ArgumentParser(description='Convert phiGRPAE snapshot files into HDF5 (h5part) format.')
+parser.add_argument('InputFile', metavar='InputFile', type=str, help='first input file (numbers in the file name must represet numerical order)')
+parser.add_argument('--num', type=int, help='number of snapshot to convert (by default, all are converted)')
+parser.add_argument('-o', '--output', default='output', help='output file name (or pattern)')
+parser.add_argument('--particles', type=int, help='number of particles (if not the whole thing)')
+parser.add_argument('--maxfs', type=float, help='maximum size in MB of a single HDF5 output')
+parser.add_argument('--single', action='store_true', help='use single precision instead of double')
+parser.add_argument('--bh', type=int, nargs='?', const=1, default=0, help='number of black holes to grab from the end of the file')
+
+# Print message and parse input
+print 'Welcome to Yohai\'s format converter! (phiGRAPE ---> HDF5)\n'
+args = parser.parse_args()
+
+# Parameter conflict tests
+if (args.bh > 0) and (args.particles==None):
+    parser.print_usage()
+    print '%s: error: cannot --bh without --particles' % sys.argv[0]
+    sys.exit(2)
+Nbh = args.bh
+
+# Verify h5py
+try:
+    import h5py
+except:
+    print '%s: error: booooo, can\'t find h5py.' % sys.argv[0]
+    sys.exit(1)
+
+# Check if input file exists, separate name from path and get file name pattern
+if not os.path.exists(args.InputFile):
+    print 'File `%s` not found!' % args.InputFile
+    sys.exit(1)
+InputFileName = os.path.basename(args.InputFile)
+InputFilePath = os.path.dirname(os.path.realpath(args.InputFile))
+Pattern = re.sub(r'\d', r'\d', InputFileName)
+Pattern = '^' + Pattern + "$"
+r = re.compile(Pattern)
+
+# Look for the other files
+ListDir = os.listdir(InputFilePath)
+FileList = []
+for Filename in ListDir:
+    m = r.search(Filename)
+    if not m is None:
+        FileList.append(Filename)
+FileList.sort()
+FileList = FileList[FileList.index(InputFileName):]
+
+# Verify that enough files were found; shorten list if needed
+if (not args.num==None) and args.num > len(FileList):
+    print 'Requested to convert %d files, but only %d found!' % (args.num, len(FileList))
+    sys.exit(1)
+if (not args.num==None): num = args.num
+else: num = len(FileList)
+FileList = FileList[:num]
+
+print 'Will convert %d files in the directory `%s`.' % (num, InputFilePath)
+
+NumperOfStepsPerFile = 2147483647
+OutputFileNum = 2
+StepCounter = 0
+FirstTime = True
+
+for Filename in FileList:
+    if StepCounter > NumperOfStepsPerFile:
+        OutputFile.close()
+        OutputFile = h5py.File('output(%d).h5part' % OutputFileNum, 'w')
+        StepCounter = 0
+        OutputFileNum += 1
+    FullFilename = os.path.join(InputFilePath, Filename)
+    with open(FullFilename, 'r') as File:
+        Line = File.readline()
+        try:
+            StepNumer = int(Line)
+        except:
+            print 'Error to read snapshot number (line 1 in file `%s`)' % Filename
+            sys.exit(1)
+        File.readline()
+        Line = File.readline()
+        try:
+            Time = numpy.double(Line)
+        except:
+            print 'Error to read snapshot time (line 3 in file `%s`)' % Filename
+            sys.exit(1)
+    Data = numpy.loadtxt(FullFilename, skiprows=3)
+    if (not args.particles==None) and (args.particles > Data.shape[0]):
+        print 'Requested to use %d paticles, but file %s has only %d!' % (args.particles, Filename, Data.shape[0])
+        sys.exit(1)
+    if (not args.particles==None): N = args.particles
+    else: N = Data.shape[0]
+    if FirstTime:
+        if Nbh == 0: print 'Reading %d particles.' % (N)
+        else: print 'Reading %d normal particles + %d black hole(s).' % (N, Nbh)
+        if args.single: EstimatedDataSize = 32*N/1024.0**2
+        else: EstimatedDataSize = 60*N/1024.0**2
+        if not args.maxfs==None: NumperOfStepsPerFile = (int)(args.maxfs/EstimatedDataSize)
+        FirstTime = False
+        if NumperOfStepsPerFile > num: OutputFile = h5py.File('output.h5part', 'w')
+        else: OutputFile = h5py.File('output(1).h5part', 'w')
+
+    DataBH = Data[-Nbh-1:-1]
+    Data = Data[:N]
+    Data = numpy.vstack((Data, DataBH))
+    Group = OutputFile.create_group('Step#%d' % (StepNumer))
+    Group.attrs['Time'] = Time
+    Group.attrs['TotalN'] = N + Nbh
+    if args.single: T='f'
+    else: T='d'
+    DataSet = Group.create_dataset('ID',   (N+Nbh,), dtype='i'); DataSet[...] = Data[:,0]
+    DataSet = Group.create_dataset('Mass', (N+Nbh,), dtype=T);   DataSet[...] = Data[:,1]
+    DataSet = Group.create_dataset('X',    (N+Nbh,), dtype=T);   DataSet[...] = Data[:,2]
+    DataSet = Group.create_dataset('Y',    (N+Nbh,), dtype=T);   DataSet[...] = Data[:,3]
+    DataSet = Group.create_dataset('Z',    (N+Nbh,), dtype=T);   DataSet[...] = Data[:,4]
+    DataSet = Group.create_dataset('VX',   (N+Nbh,), dtype=T);   DataSet[...] = Data[:,5]
+    DataSet = Group.create_dataset('VY',   (N+Nbh,), dtype=T);   DataSet[...] = Data[:,6]
+    DataSet = Group.create_dataset('VZ',   (N+Nbh,), dtype=T);   DataSet[...] = Data[:,7]
+    OutputFile.flush()
+    print 'Completed /Step#%d' % (StepNumer)
+    StepCounter += 1
+OutputFile.close()
diff --git a/interface.cu b/interface.cu
new file mode 100644
index 0000000..203ce2a
--- /dev/null
+++ b/interface.cu
@@ -0,0 +1,378 @@
+#include <iostream>
+
+#include "src/common.hpp"
+#include "src/scf.hpp"
+#include "src/integrate.hpp"
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/inner_product.h>
+
+using namespace std;
+using namespace etics;
+
+Integrator IntegratorObj;
+
+// GLOBAL VARIABLES
+Real ConstantStep = 0.001953125;
+Real T, Step, dT1, dT2, Tcrit;
+int N;
+extern Real mass;
+extern int k3gs, k3bs, k4gs, k4bs;
+
+/*extern*/ Particle *hostP;
+thrust::host_vector<Particle> PPP;
+/*extern*/ thrust::device_vector<Particle> PPPPP;
+
+// /*extern*/   thrust::device_vector<vec3>   F0xxxxxx;
+// /*extern*/   thrust::device_vector<Real>   PotPotPot; // ugly name
+// /*extern*/   thrust::device_vector<vec3>   F1xxxxxx;
+/*extern*/ vec3 *F1_ptr;
+
+
+
+// extern Particle *P_h;
+// extern thrust::device_vector<Particle> P;
+// 
+// extern thrust::device_vector<vec3>   F0;
+// extern thrust::device_vector<Real>   Potential;
+// extern thrust::device_vector<vec3>   F1;
+
+
+void CommitParticles();
+// void InitSCF(int N);
+// void ForceSCF(int N, Real *Potential, Particle *PPPPP, vec3 *F);
+void DriftStep();
+void KickStep();
+void CommitForces();
+int InitilizeIntegratorMemory();
+#define PTR(x) (thrust::raw_pointer_cast((x).data()))
+
+
+int initialize_code() {
+#warning initscf should be here!!! just the problem is that N is requires, so fix it
+    return 0;
+}
+
+int recommit_parameters() {
+    return 0;
+}
+
+int commit_parameters() {
+    return 0;
+}
+
+int new_particle(int *id, double mass, double x, double y, double z, double vx, double vy, double vz, double radius) {
+    Particle p;
+    p.m = mass;
+    p.pos = vec3(x, y, z);
+    p.vel = vec3(vx, vy, vz);
+    PPP.push_back(p);
+    *id = N;
+    N++;
+    return 0;
+}
+
+int commit_particles() {
+//     cerr << "calling commit_particles" << endl;
+    cerr << "we did commit_particles()" << endl;
+    etics::scf::Init(N, 180, 64, 2605, 384);
+#warning hardcoded launch configuration
+    IntegratorObj = Integrator(&PPP[0], N);
+    return 0;
+}
+
+struct CenterMassFunctor {
+    Real ConstantMass;
+    __host__ __device__ CenterMassFunctor(Real _ConstantMass) : ConstantMass(_ConstantMass) {}
+    __host__ __device__ vec3 operator() (const Particle &p) const {return p.pos;}
+};
+
+struct ShiftFunctor {
+    vec3 Shift;
+    __host__ __device__ ShiftFunctor(vec3 _Shift) : Shift(_Shift) {}
+    __host__ __device__ Particle operator() (Particle &p) const {
+        p.pos += Shift;
+        p.CalculateR2();
+        return p;
+    }
+};
+
+int counttt = 0;
+bool FirstStep = true;
+int evolve_model(double t) {
+//     PPPPP = PPP;
+    cerr << "call evolve_model t_end = " << t << " dt = " << t - T  << "****************" << endl;
+
+//     vec3 CenterMass = thrust::transform_reduce(PPPPP.begin(), PPPPP.end(), CenterMassFunctor(mass), vec3(0,0,0), thrust::plus<vec3>());
+//     CenterMass = CenterMass * (1.0/N); //ugly should divide by the total mass
+//     cerr << "CENTER OF MASS " << CenterMass.x << endl;
+//     
+// //     thrust::transform(PPPPP.begin(), PPPPP.end(), PPPPP.begin(), ShiftFunctor(-CenterMass));
+//     
+//     vec3 CenterMass2 = thrust::transform_reduce(PPPPP.begin(), PPPPP.end(), CenterMassFunctor(mass), vec3(0,0,0), thrust::plus<vec3>());
+//     CenterMass2 = CenterMass2 * (1.0/N); //ugly should divide by the total mass
+//     cerr << "CENTER OF MASS after correction " << CenterMass2.x << endl;
+// 
+
+    Step = ConstantStep;
+    while (T <= t) {
+        // Take the drift step.
+        IntegratorObj.DriftStep(Step);
+
+        // Calculate the forces in the new positions.
+//         ForceSCF(N, PTR(PotPotPot), PTR(PPPPP), PTR(F1xxxxxx));
+        IntegratorObj.CalculateGravity();
+
+        // Finish by taking the kick step.
+        // The kick functor also "commits" the predicted forces into the "acc" member.
+        IntegratorObj.KickStep(Step);
+
+        // N particles were implicitly propagated in this iteration.
+
+        // Advance global time.
+        T += Step;
+    }
+// 
+//     vec3 CenterMass3 = thrust::transform_reduce(PPPPP.begin(), PPPPP.end(), CenterMassFunctor(mass), vec3(0,0,0), thrust::plus<vec3>());
+//     CenterMass3 = CenterMass3 * (1.0/N); //ugly should divide by the total mass
+//     cerr << "CENTER OF MASS after evolve " << CenterMass3.x << endl;
+//     
+//     cerr << "done evolve; transform" << endl;
+// //     thrust::transform(PPPPP.begin(), PPPPP.end(), PPPPP.begin(), ShiftFunctor(+CenterMass)); // antishift
+//     
+//     vec3 CenterMass4 = thrust::transform_reduce(PPPPP.begin(), PPPPP.end(), CenterMassFunctor(mass), vec3(0,0,0), thrust::plus<vec3>());
+//     CenterMass4 = CenterMass4 * (1.0/N); //ugly should divide by the total mass
+//     cerr << "CENTER OF MASS after antishift " << CenterMass4.x << endl;
+// 
+//     cerr << "done transform; download to RAM" << endl;
+    IntegratorObj.CopyParticlesToHost(&PPP[0]);
+//     
+//     cerr << "done download; return" << endl;
+    return 0;
+}
+
+int set_begin_time(double time_begin) {
+//     cerr << "called set_begin_time(" << time_begin << endl;
+    return 0;
+}
+
+
+int get_begin_time(double *time_begin) {
+    *time_begin = 0;
+    return 0;
+}
+
+int get_mass(int index_of_the_particle, double *mass) {
+    *mass = PPP[index_of_the_particle].m;
+    return 0;
+}
+
+int get_time(double *time) {
+    *time = T;
+    return 0;
+}
+
+int set_mass(int index_of_the_particle, double mass) {
+//     cerr << "calling set_mass" << endl;
+    PPP[index_of_the_particle].m = mass;
+    return 0;
+}
+
+int get_index_of_first_particle(int *index_of_the_particle) {
+//     cerr << "calling get_index_of_first_particle" << endl;
+    *index_of_the_particle = 0;
+    return 0;
+}
+
+int get_total_radius(double *radius) {
+    return -2;
+}
+
+int get_potential_at_point(double soft, double x, double y, double z, double *phi) {
+    return -2;
+}
+
+int get_total_mass(double *mass) {
+    return -2;
+}
+
+int set_eps2(double epsilon_squared) {
+    return -1;
+}
+
+int get_eps2(double *epsilon_squared) {
+    *epsilon_squared = 0;
+    return -1;
+}
+
+int get_number_of_particles(int *number_of_particles) {
+//     cerr << "calling get_number_of_particles" << endl;
+    *number_of_particles = PPP.size();
+    return 0;
+}
+
+int get_index_of_next_particle(int index_of_the_particle, int *index_of_the_next_particle) {
+    *index_of_the_next_particle = index_of_the_particle + 1;
+    return 0;
+}
+
+int delete_particle(int index_of_the_particle) {
+    return -2;
+}
+
+int get_potential(int index_of_the_particle, double *potential) {
+    return -2;
+}
+
+int synchronize_model() {
+//     cerr << "calling synchronize_model" << endl;
+    return 0;
+}
+
+int set_state(int index_of_the_particle, double mass, double radius, double x, double y, double z, double vx, double vy, double vz) {
+    cerr << "calling set_state" << endl;
+//     cerr << "calling set_state" << endl;
+    PPP[index_of_the_particle].pos = vec3(x, y, z);
+    PPP[index_of_the_particle].vel = vec3(vx, vy, vz);
+    return 0;
+}
+
+int get_state(int index_of_the_particle, double *mass, double *radius, double *x, double *y, double *z, double *vx, double *vy, double *vz) {
+//     cerr << "calling get_state" << endl;
+    Particle p = PPP[index_of_the_particle];
+    *mass = index_of_the_particle;
+    *x = p.pos.x;
+    *y = p.pos.y;
+    *z = p.pos.z;
+    *vx = p.vel.x;
+    *vy = p.vel.y;
+    *vz = p.vel.z;
+    return 0;
+}
+
+int get_time_step(double *time_step) {
+//     cerr << "calling get_time_step" << endl;
+    *time_step = ConstantStep;
+    return 0;
+}
+
+int set_time_step(double time_step) {
+    cerr << "calling set_time_step" << endl;
+    ConstantStep = time_step;
+    return 0;
+}
+
+int get_launch_config(int **launch_config) {
+    return 0;
+}
+
+int set_launch_config(int *launch_config) {
+//     k3gs = launch_config[0];
+//     k3bs = launch_config[1];
+//     k4gs = launch_config[2];
+//     k4bs = launch_config[3];
+    return -2;
+}
+
+int recommit_particles() {
+//     cerr << "calling recommit_particles" << endl;
+#warning put something here
+    cerr << "hhhhhhhhhhhhhhhhhhhhhhhhhhhhhh" << endl;
+    PPPPP = PPP;
+    return -2;
+}
+
+int set_acceleration(int index_of_the_particle, double ax, double ay, double az) {
+    return -2;
+}
+
+int get_center_of_mass_position(double *x, double *y, double *z) {
+//     vec3 CenterMass = thrust::transform_reduce(PPPPP.begin(), PPPPP.end(), CenterMassFunctor(mass), vec3(0,0,0), thrust::plus<vec3>());
+//     CenterMass = CenterMass * (1.0/N); //ugly should divide by the total mass
+//     *x = CenterMass.x;
+//     *y = CenterMass.y;
+//     *z = CenterMass.z;
+    return 0;
+}
+
+int get_center_of_mass_velocity(double *vx, double *vy, double *vz) {
+    return -2;
+}
+
+int get_radius(int index_of_the_particle, double *radius) {
+    *radius = 0;
+    return 0;
+}
+
+int set_radius(int index_of_the_particle, double radius) {
+    // should store the radius somewhere but completely ignored by code
+//     cerr << "calling set_radius" << endl;
+    return 0;
+}
+
+int cleanup_code() {
+    IntegratorObj.~Integrator();
+    cerr << "bye" << endl;
+    return 0;
+}
+
+int get_gravity_at_point(double soft, double x, double y, double z, double *forcex, double *forcey, double *forcez) {
+   return -2;
+}
+
+int get_velocity(int index_of_the_particle, double *vx, double *vy, double *vz) {
+    *vx = PPP[index_of_the_particle].vel.x;
+    *vy = PPP[index_of_the_particle].vel.y;
+    *vz = PPP[index_of_the_particle].vel.z;
+    return 0;
+}
+
+int get_position(int index_of_the_particle, double *x, double *y, double *z) {
+    *x = PPP[index_of_the_particle].pos.x;
+    *y = PPP[index_of_the_particle].pos.y;
+    *z = PPP[index_of_the_particle].pos.z;
+    return 0;
+}
+
+bool already_printed = false;
+
+int set_position(int index_of_the_particle, double x, double y, double z) {
+    if (already_printed == false) {
+        cerr << "calling set_position" << endl;
+        cerr << "---------index_of_the_particle=" << index_of_the_particle << endl;
+        cerr << "--------- x" << PPP[index_of_the_particle].pos.x << "--->" << x << endl;
+        cerr << "--------- y" << PPP[index_of_the_particle].pos.y << "--->" << y << endl;
+        cerr << "--------- z" << PPP[index_of_the_particle].pos.z << "--->" << z << endl;
+        already_printed = true;
+    }
+    PPP[index_of_the_particle].pos = vec3(x, y, z);
+    counttt++;
+    return 0;
+}
+
+int get_acceleration(int index_of_the_particle, double *ax, double *ay, double *az) {
+    return -2;
+}
+
+int set_velocity(int index_of_the_particle, double vx, double vy, double vz) {
+//     cerr << "calling set_velocity" << endl;
+    PPP[index_of_the_particle].vel = vec3(vx, vy, vz);
+    return 0;
+}
+
+
+int get_kinetic_energy(double *kinetic_energy) {
+    *kinetic_energy = IntegratorObj.KineticEnergy();
+    return 0;
+}
+
+int get_potential_energy(double *potential_energy) {
+    *potential_energy = IntegratorObj.PotentialEnergy();
+    return 0;
+}
+
+int update_force_potential_arrays(double tttt) {
+#warning time shouldnt be a parameter to this one
+//     ForceSCF(N, PTR(PotPotPot), PTR(PPPPP), PTR(F0xxxxxx));
+    return 0;
+}
\ No newline at end of file
diff --git a/interface.py b/interface.py
new file mode 100644
index 0000000..4db3881
--- /dev/null
+++ b/interface.py
@@ -0,0 +1,160 @@
+from amuse.community import *
+from amuse.community.interface.gd import GravitationalDynamics
+from amuse.community.interface.gd import GravitationalDynamicsInterface
+
+class EticsInterface(CodeInterface, GravitationalDynamicsInterface, LiteratureReferencesMixIn):
+    """
+        .. [#] Meiron, Y., Li, B., Holley-Bockelmann, K., & Spurzem, R. 2014, ApJ, 792, 98:
+        .. [#] ... "Expansion techniques for collisionless stellar dynamical simulations"
+    """
+
+    include_headers = ['worker_code.h']
+
+    def __init__(self, **keyword_arguments):
+        CodeInterface.__init__(self, name_of_the_worker='etics_worker', **keyword_arguments)
+        LiteratureReferencesMixIn.__init__(self)
+
+    @legacy_function
+    def new_particle():
+        function = LegacyFunctionSpecification()
+        function.can_handle_array = True
+        function.addParameter('index_of_the_particle', dtype='int32', direction=function.OUT)
+        function.addParameter('mass', dtype='float64', direction=function.IN,
+            description = 'The mass of the particle')
+        function.addParameter('x', dtype='float64', direction=function.IN,
+            description = 'The initial position vector of the particle')
+        function.addParameter('y', dtype='float64', direction=function.IN,
+            description = 'The initial position vector of the particle')
+        function.addParameter('z', dtype='float64', direction=function.IN,
+            description = 'The initial position vector of the particle')
+        function.addParameter('vx', dtype='float64', direction=function.IN,
+            description = 'The initial velocity vector of the particle')
+        function.addParameter('vy', dtype='float64', direction=function.IN,
+            description = 'The initial velocity vector of the particle')
+        function.addParameter('vz', dtype='float64', direction=function.IN,
+            description = 'The initial velocity vector of the particle')
+        function.addParameter('radius', dtype='float64', direction=function.IN,
+            description = 'The radius of the particle', default = 0)
+        function.result_type = 'int32'
+        return function
+
+    @legacy_function
+    def set_state():
+        function = LegacyFunctionSpecification()
+        function.can_handle_array = True
+        function.addParameter('index_of_the_particle', dtype='int32', direction=function.IN)
+        function.addParameter('mass', dtype='float64', direction=function.IN,
+                 description = 'The mass of the particle')
+        function.addParameter('radius', dtype='float64', direction=function.IN,
+                 description = 'The radius of the particle')
+        function.addParameter('x', dtype='float64', direction=function.IN,
+                 description = 'The initial position vector of the particle')
+        function.addParameter('y', dtype='float64', direction=function.IN,
+                 description = 'The initial position vector of the particle')
+        function.addParameter('z', dtype='float64', direction=function.IN,
+                 description = 'The initial position vector of the particle')
+        function.addParameter('vx', dtype='float64', direction=function.IN,
+                 description = 'The initial velocity vector of the particle')
+        function.addParameter('vy', dtype='float64', direction=function.IN,
+                 description = 'The initial velocity vector of the particle')
+        function.addParameter('vz', dtype='float64', direction=function.IN,
+                 description = 'The initial velocity vector of the particle')
+        function.result_type = 'int32'
+        return function
+
+    @legacy_function
+    def get_state():
+        function = LegacyFunctionSpecification()
+        function.can_handle_array = True
+        function.addParameter('index_of_the_particle', dtype='int32', direction=function.IN)
+        function.addParameter('mass', dtype='float64', direction=function.OUT,
+                 description = 'The mass of the particle')
+        function.addParameter('radius', dtype='float64', direction=function.OUT,
+                 description = 'The radius of the particle')
+        function.addParameter('x', dtype='float64', direction=function.OUT,
+                 description = 'The initial position vector of the particle')
+        function.addParameter('y', dtype='float64', direction=function.OUT,
+                 description = 'The initial position vector of the particle')
+        function.addParameter('z', dtype='float64', direction=function.OUT,
+                 description = 'The initial position vector of the particle')
+        function.addParameter('vx', dtype='float64', direction=function.OUT,
+                 description = 'The initial velocity vector of the particle')
+        function.addParameter('vy', dtype='float64', direction=function.OUT,
+                 description = 'The initial velocity vector of the particle')
+        function.addParameter('vz', dtype='float64', direction=function.OUT,
+                 description = 'The initial velocity vector of the particle')
+        function.result_type = 'int32'
+        return function
+
+    @legacy_function
+    def evolve_model():
+        function = LegacyFunctionSpecification()
+        function.can_handle_array = True
+        function.addParameter('dt', dtype='float64', direction=function.IN)
+        function.result_type = 'int32'
+        return function
+
+    @legacy_function
+    def get_number_of_particles():
+        function = LegacyFunctionSpecification()
+        function.addParameter('number_of_particles', dtype='int32', direction=function.OUT, description = 'number of particles')
+        function.result_type = 'int32'
+        return function
+
+    @legacy_function
+    def set_time_step():
+        function = LegacyFunctionSpecification()
+        function.addParameter('time_step', dtype='float64', direction=function.IN, description = 'time step')
+        function.result_type = 'int32'
+        return function
+
+    @legacy_function
+    def update_force_potential_arrays():
+        function = LegacyFunctionSpecification()
+        function.can_handle_array = True
+        function.addParameter('dt', dtype='float64', direction=function.IN)
+        function.result_type = 'int32'
+        return function
+
+class Etics(GravitationalDynamics):
+    def __init__(self, convert_nbody = None, **keyword_arguments):
+        legacy_interface = EticsInterface(**keyword_arguments)
+        GravitationalDynamics.__init__(self, legacy_interface, convert_nbody, **keyword_arguments)
+
+    def define_parameters(self, object):
+        object.add_method_parameter(
+            'get_begin_time',
+            'set_begin_time',
+            'begin_time',
+            'model time to start the simulation at',
+            default_value = 0.0 | nbody_system.time
+        )
+        object.add_method_parameter(
+            'get_time_step',
+            'set_time_step',
+            'time_step',
+            'constant timestep for iteration',
+            default_value = 0.001953125 | nbody_system.time
+        )
+
+    def define_methods(self, object):
+        GravitationalDynamics.define_methods(self, object)
+        # Define some shortcuts for better readability.
+        M = nbody_system.mass
+        L = nbody_system.length
+        V = nbody_system.speed
+        T = nbody_system.time
+        object.add_method('new_particle', (M,L,L,L,V,V,V,L), (object.INDEX, object.ERROR_CODE))
+        object.add_method('set_state', (object.INDEX, M,L,L,L,L,V,V,V), (object.ERROR_CODE))
+        object.add_method('get_state', (object.INDEX), (M,L,L,L,L,V,V,V, object.ERROR_CODE))
+        object.add_method('set_time_begin', (T), (object.ERROR_CODE))
+        object.add_method('get_time_begin', (), (T, object.ERROR_CODE))
+        object.add_method('get_number_of_particles', (), (units.none, object.ERROR_CODE))
+        object.add_method('get_time_step', (), (T, object.ERROR_CODE))
+        object.add_method('set_time_step', (T), (object.ERROR_CODE))
+        object.add_method('update_force_potential_arrays', (T), (object.ERROR_CODE))
+
+    def define_particle_sets(self, object):
+        GravitationalDynamics.define_particle_sets(self, object)
+
+
diff --git a/src/Makefile b/src/Makefile
new file mode 100644
index 0000000..a65224e
--- /dev/null
+++ b/src/Makefile
@@ -0,0 +1,72 @@
+ETICS_LMAX   ?= 2
+ETICS_NMAX   ?= 10
+GPUARCH      ?= sm_75
+CUDAHOME     ?= /usr/local/cuda
+OPTIMIZATION ?= 3
+
+#Optional CPPFLAGS: -DETICS_HDF5 -DETICS_BOOST -DETICS_MPI
+#Note that -DETICS_MPI is /not/ needed if ETICS is used as a library
+CPPFLAGS   = -DSCF -DLMAX=$(ETICS_LMAX) -DNMAX=$(ETICS_NMAX)
+
+CXX       ?= g++
+NVCC      ?= $(CUDAHOME)/bin/nvcc
+LIB       += -lm -L$(CUDAHOME)/lib64 -lcudadevrt -lcudart -lcuda
+CXXFLAGS  += -O$(OPTIMIZATION)
+CUDAFLAGS += -arch=$(GPUARCH)
+
+# If -DETICS_HDF5 is used, please specify the following
+#H5HOME   = /home/meiron/local
+#INC     += -I$(H5HOME)/include
+#LIB     += -L$(H5HOME)/lib $(H5HOME)/lib/libhdf5_hl_cpp.a $(H5HOME)/lib/libhdf5_cpp.a $(H5HOME)/lib/libhdf5_hl.a $(H5HOME)/lib/libhdf5.a -lz -ldl -lm
+#LDFLAGS += -Wl,-rpath -Wl,$(H5HOME)/lib
+
+# If -DETICS_BOOST is used, please specify the following
+#BOOSTHOME ?= /home/meiron/local
+#INC += -I$(BOOSTHOME)/include
+
+# If -DETICS_MPI is used, please specify the following
+#MPIHOME  ?= /home/meiron/local
+#INC      += -I$(MPIHOME)/include
+#LIB      += -L$(MPIHOME)/lib -lmpi
+#CXXFLAGS +=  -pthread
+#LDFLAGS  += -Wl,-rpath -Wl,$(MPIHOME) -Wl,--enable-new-dtags
+
+CODELIB = libetics.a
+
+# object files needed for the library and the standalone
+CODEOBJS1 = integrate.o mathaux.o scf.o ic.o
+
+# object files only needed for the standalone
+CODEOBJS2 = io.o main.o
+
+AR = ar ruv
+RANLIB = ranlib
+
+all: $(CODELIB) standalone
+
+clean:
+	$(RM) *.o *.a etics
+
+$(CODELIB): $(CODEOBJS1)
+	$(NVCC) $(CUDAFLAGS) -dlink $(CODEOBJS1) -o dlink.o
+	$(AR) $@ $(CODEOBJS1) dlink.o
+	$(RANLIB) $@
+
+library: $(CODELIB)
+
+standalone: $(CODEOBJS1) $(CODEOBJS2)
+	echo -e "\e[31mWARNING! \e[0mThe standalone program has not been tested for a long time. Use extreme caution."
+	$(NVCC) $(CUDAFLAGS) -dlink $(CODEOBJS1) $(CODEOBJS2) -o dlink.o
+	$(CXX) $(CXXFLAGS) -o etics $(CODEOBJS1) $(CODEOBJS2) dlink.o $(LIB) $(LDFLAGS)
+
+.SUFFIXES: .cu .cpp .o
+
+# only io.cpp should be compiled with all the weird HDF5 flags
+io.o: io.cpp
+	$(CXX) $(CPPFLAGS) $(INC) $(CXXFLAGS) -c -o io.o io.cpp
+
+.cu.o: $<
+	$(NVCC) $(CPPFLAGS) $(INC) -Xcompiler "$(CXXFLAGS)" $(CUDAFLAGS) -dc -o $@ $<
+
+.cpp.o: $<
+	$(CXX) $(CPPFLAGS) $(INC) $(CXXFLAGS) -c -o $@ $<
diff --git a/src/common.hpp b/src/common.hpp
new file mode 100644
index 0000000..2724751
--- /dev/null
+++ b/src/common.hpp
@@ -0,0 +1,117 @@
+#pragma once
+#include <string>
+#include <time.h>
+
+#ifndef __CUDACC__
+    #define __host__
+    #define __device__
+#endif
+
+#if defined ETICS_SINGLE_PRECISION && defined ETICS_DOUBLE_PRECISION
+    #error Contradictory precision flags!
+#endif
+
+#ifndef ETICS_SINGLE_PRECISION
+    #define ETICS_DOUBLE_PRECISION
+    #define Real double
+    #define MPI_ETICS_REAL MPI_DOUBLE
+#else
+    #define Real float
+    #define MPI_ETICS_REAL MPI_FLOAT
+#endif
+
+#if defined MEX && defined SCF
+    #error Contradictory method flags!
+#endif
+
+#if defined MEX && !defined LMAX
+    #error LMAX not defined.
+#endif
+
+#if defined SCF && ((!defined NMAX) || (!defined LMAX))
+    #error Both NMAX and LMAX should be defined!
+#endif
+
+#if defined MEX && defined NMAX
+    #warning NMAX is defined, but will be ignored since the method is MEX!
+#endif
+
+struct vec3 {
+    Real x, y, z;
+
+    __host__ __device__ vec3() : x(0), y(0), z(0) {}
+    __host__ __device__ vec3(Real _x, Real _y, Real _z) : x(_x), y(_y), z(_z) {}
+    __host__ __device__ Real abs2() const {return x*x + y*y + z*z;}
+    __host__ __device__ vec3 operator+ (const vec3& V)   const {return vec3(this->x + V.x, this->y + V.y, this->z + V.z);}
+    __host__ __device__ vec3 operator- (const vec3& V)   const {return vec3(this->x - V.x, this->y - V.y, this->z - V.z);}
+    __host__ __device__ vec3 operator* (const Real& C) const {return vec3(this->x*C, this->y*C, this->z*C);}
+    __host__ __device__ vec3& operator+= (const vec3& V) {this->x += V.x; this->y += V.y; this->z += V.z; return *this;}
+    __host__ __device__ vec3& operator-= (const vec3& V) {this->x -= V.x; this->y -= V.y; this->z -= V.z; return *this;}
+    __host__ __device__ vec3& operator/= (const Real& C) {this->x /= C; this->y /= C; this->z /= C; return *this;}
+    __host__ __device__ vec3 operator- () const {return vec3(-this->x, -this->y, -this->z);}
+    __host__ __device__ vec3 operator+ () const {return vec3(this->x, this->y, this->z);}
+};
+
+
+class Particle {
+    public:
+        int ID;
+        unsigned char Status;
+        Real m;
+        vec3 pos, vel, acc;
+        Real R2;
+
+        __host__ __device__ Particle() {}
+        __host__ __device__ void CalculateR2() {R2 = pos.abs2();}
+        __host__ __device__ bool operator< (const Particle& p) const {return (this->R2 < p.R2);}
+};
+
+#define ETICS_MY_CLOCK CLOCK_MONOTONIC
+class HostTimer {
+    timespec StartTime, StopTime;
+  public:
+    void Start() {
+        clock_gettime(ETICS_MY_CLOCK, &StartTime);
+    }
+    void Stop() {
+        clock_gettime(ETICS_MY_CLOCK, &StopTime);
+    }
+    double Difference() {
+        timespec TimeDiff;
+        if ((StopTime.tv_nsec - StartTime.tv_nsec) < 0) {
+            TimeDiff.tv_sec  = StopTime.tv_sec - StartTime.tv_sec - 1;
+            TimeDiff.tv_nsec = 1000000000 + StopTime.tv_nsec - StartTime.tv_nsec;
+        } else {
+            TimeDiff.tv_sec  = StopTime.tv_sec - StartTime.tv_sec;
+            TimeDiff.tv_nsec = StopTime.tv_nsec - StartTime.tv_nsec;
+        }
+        return (double)TimeDiff.tv_sec + ((double)TimeDiff.tv_nsec)*1.0e-9;
+    }
+};
+
+#ifdef __CUDACC__
+class DeviceTimer {
+    cudaEvent_t StartTime, StopTime;
+  public:
+    DeviceTimer() {
+        cudaEventCreate(&StartTime);
+        cudaEventCreate(&StopTime);
+    }
+    ~DeviceTimer() {
+        cudaEventDestroy(StartTime);
+        cudaEventDestroy(StopTime);
+    }
+    void Start() {
+        cudaEventRecord(StartTime);
+    }
+    void Stop() {
+        cudaEventRecord(StopTime);
+        cudaEventSynchronize(StopTime);
+    }
+    double Difference(){
+        float TimeDiff;
+        cudaEventElapsedTime(&TimeDiff, StartTime, StopTime);
+        return (double)TimeDiff * 1.0e-3;
+    }
+};
+#endif
diff --git a/src/file.ini b/src/file.ini
new file mode 100644
index 0000000..3188564
--- /dev/null
+++ b/src/file.ini
@@ -0,0 +1,33 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; This is an ETICS parameter file
+; Required parameters are:
+;     Filename, N, Tcrit, StepSize, dT1, dT2
+;
+; Special values for Filename:
+;    _nofile_ or _hernquist_   Generates a Hernquist sphere
+;    _plummer_                 Generates a Plummer sphere
+; The _xxx_ can be followed by a integer indicating the random seed, otherwise,
+; the system time is used. The generated initial conditions are only saved at
+; the t=0 snapshot.
+;
+; Optional parameters are:
+;    device (CUDA device ID; by default or if negative, cudaSetDevice is skipped)
+;    Prefix (="", prefix for the snapshot file names)
+;    OutputFormat (="ascii", the output format, either ascii or hdf5)
+
+
+Filename = _hernquist_ 0
+;N        = 1000000
+;Tcrit    = 0.03125
+;StepSize = 0.001953125
+;dT1 = 0.001953125
+;dT2 = 0.25
+
+N        = 1000000
+Tcrit    = 500
+StepSize = 0.03125
+dT1 = 1.0
+dT2 = 5.0
+
+Prefix = /home/ym/tmp/merger
+OutputFormat = ascii
diff --git a/src/ic.cpp b/src/ic.cpp
new file mode 100644
index 0000000..f093713
--- /dev/null
+++ b/src/ic.cpp
@@ -0,0 +1,269 @@
+/**
+ * @file    ic.cpp
+ * @author  Yohai Meiron <ymeiron@pku.edu.cn>
+ * @brief   Functions to generate initial conditions.
+ *
+ * These are functions to generate a Hernquist or a Plummer sphere. In both
+ * cases the model has a total mass of 1 and the total energy is -0.25 Henon
+ * energy units. The Hernquist model generator was adapted from a program by
+ * John Dubinski (1999, original version) from the NEMO toolbox; the accessory
+ * functions are from Numerical Recipes in C (Press et al. 1992). The Plummer
+ * generator is adapted from the gen-plum program by Peter Berczik.
+ */
+
+#include <iostream>
+#include <cstdlib>
+#include <cstdio>
+#include <cmath>
+#include "common.hpp"
+#include "ic.hpp"
+
+// Internal functions
+namespace etics {
+    namespace ic {
+        double E;
+
+        void polint(double xa[], double ya[], int n, double x, double *y, double *dy);
+        double trapzd(double (*func)(double), double a, double b, int n);
+        double qromb(double (*func)(double), double a, double b);
+        double xrandom(double a, double b);
+        double xrandom();
+        double f(double E2), drho2(double u);
+    }
+}
+
+double etics::ic::trapzd(double (*func)(double), double a, double b, int n) {
+    double x, tnm, sum, del;
+    static double s;
+    int it, j;
+
+    if (n == 1) {
+        return (s=0.5*(b-a)*((*func)(a) + (*func)(b)));
+    } else {
+        for (it = 1, j = 1; j < n-1; j++) it <<= 1;
+        tnm = it;
+        del = (b-a)/tnm;
+        x = a + 0.5*del;
+        for (sum = 0, j = 1; j <= it; j++, x += del) sum += (*func)(x);
+        s = 0.5*(s+(b-a)*sum/tnm);
+        return s;
+    }
+}
+
+void etics::ic::polint(double xa[], double ya[], int n, double x, double *y, double *dy) {
+    int ns = 1;
+    double den, dif, dift, ho, hp, w;
+    double *c, *d;
+
+    dif = fabs(x-xa[1]);
+    c = new double[n+1];
+    d = new double[n+1];
+    for (int i = 1; i <= n; i++) {
+        if ((dift=fabs(x-xa[i])) < dif) {
+            ns = i;
+            dif = dift;
+        }
+        c[i] = ya[i];
+        d[i] = ya[i];
+    }
+    *y = ya[ns--];
+    for (int m = 1; m < n; m++) {
+        for (int i = 1; i <= n-m; i++) {
+            ho = xa[i]-x;
+            hp = xa[i+m]-x;
+            w = c[i+1]-d[i];
+            if ((den=ho-hp) == 0.0) {
+                std::cerr << "Error in routine polint" << std::endl;
+                exit(1);
+            }
+            den = w/den;
+            d[i] = hp*den;
+            c[i] = ho*den;
+        }
+        *y += (*dy=(2*ns < (n-m) ? c[ns+1] : d[ns--]));
+    }
+    delete[] c;
+    delete[] d;
+}
+
+#define EPS 1.0e-6
+#define JMAX 20
+#define JMAXP (JMAX+1)
+#define K 5
+double etics::ic::qromb(double (*func)(double), double a, double b) {
+    double ss, dss;
+    double s[JMAXP+1], h[JMAXP+1];
+
+    h[1] = 1.0;
+    for (int j = 1; j <= JMAX; j++) {
+        s[j] = trapzd(func, a, b, j);
+        if (j >= K) {
+            polint(&h[j-K], &s[j-K], K, 0, &ss, &dss);
+            if (fabs(dss) < EPS*fabs(ss)) return ss;
+        }
+        s[j+1] = s[j];
+        h[j+1] = 0.25*h[j];
+    }
+    std::cerr << "Too many steps in routine qromb" << std::endl;
+    exit(1);
+    return 0;
+}
+#undef EPS
+#undef JMAX
+#undef JMAXP
+#undef K
+
+double etics::ic::xrandom(double a, double b) {
+    double Result = ((double)rand())/((double)RAND_MAX);
+    Result *= (b - a);
+    Result += a;
+    return Result;
+}
+
+double etics::ic::xrandom() {
+    return xrandom(0, 1);
+}
+
+/* Distribution function of a Hernquist model */
+double etics::ic::f(double E2) {
+    E = E2;
+    return qromb(drho2, 0.0, 2.0*sqrt(E))*(sqrt(32.)*(M_PI*M_PI*M_PI));
+}
+
+double etics::ic::drho2(double u) {
+    double p = E - 0.25*u*u;
+    double r;
+    if (p <= 0) return 0;
+    else {
+        double p1 = 1/p;
+        double b0 = 0.5*(2-p1);
+        double c0 = (1-p1);
+        r = -b0 + sqrt(b0*b0 - c0);
+    }
+
+    double r2 = r*r, r3 = r2*r;
+
+    double c0 = 1.0 + r;
+    double c02 = c0*c0;
+    double c03 = c02*c0;
+    double c04 = c02*c02;
+    double dp1 = -1/c02;
+    double dp2 = 2/c03;
+
+    double drho1r = -(1+4*r)/(r2*c04);
+    double drho2r =  2.0*(14*r2 + 5*r + 1)/(r3*c04*c0);
+
+    return drho2r/(dp1*dp1) - drho1r/(dp1*dp1*dp1)*dp2;
+}
+
+void etics::ic::hernquist(int N, int Seed, Particle **ParticleList) {
+    using namespace etics::ic;
+    double x, y, z, vx, vy, vz;
+    double fmax, f0, f1, v2, vmax, vmax2, Ep=0, Ek=0;
+
+    srand(Seed);
+
+    *ParticleList = new Particle[N];
+
+    for(int i = 0; i < N; i++) {
+        double eta = xrandom(0.0, 1.0);
+        double radius = sqrt(eta)/(1-sqrt(eta));
+        double phi = xrandom(0.0, 2*M_PI);
+        double cth = xrandom(-1.0, 1.0);
+        double sth = sqrt(1.0 - cth*cth);
+        x = radius*sth*cos(phi);
+        y = radius*sth*sin(phi);
+        z = radius*cth;
+        double psi0 = 1/(1 + radius);
+        Ep += -psi0;
+        vmax2 = 2.0*psi0;
+        vmax = sqrt(vmax2);
+        fmax = f(psi0);
+        f0 = fmax; f1 = 1.1*fmax; // just some initial values
+        while(f1 > f0) {
+            // pick a velocity
+            v2 = 2.0*vmax2;
+            while(v2 > vmax2) { // rejection technique
+                vx = vmax*xrandom(-1.0, 1.0);
+                vy = vmax*xrandom(-1.0, 1.0);
+                vz = vmax*xrandom(-1.0, 1.0);
+                v2 = vx*vx + vy*vy + vz*vz;
+            }
+            f0 = f(psi0 - 0.5*v2);
+            f1 = fmax*xrandom(0.0, 1.0);
+        }
+        Ek += 0.5*v2;
+        Particle p;
+        p.m = 1.0/N;
+        p.pos = vec3(x, y, z);
+        p.vel = vec3(vx, vy, vz);
+        p.ID = i;
+        p.Status = 0;
+        p.CalculateR2();
+        (*ParticleList)[i] = p;
+    }
+    Ep /= 2*N; Ek /= N;
+    double E = Ep + Ek;
+    double RadiusFactor = -4*E; // should be close to 1/3
+    double VelocityFactor = 1/sqrt(-4*E); // should be close to sqrt(3)
+    for(int i = 0; i < N; i++) {
+        (*ParticleList)[i].pos = (*ParticleList)[i].pos * RadiusFactor;
+        (*ParticleList)[i].vel = (*ParticleList)[i].vel * VelocityFactor;
+    }
+}
+
+#define RMAX 10.0
+void etics::ic::plummer(int N, int Seed, Particle **ParticleList) {
+    double X,  Y,  Z, Vx, Vy, Vz,
+           X1, X2, X3, X4, X5, X6, X7,
+           R, Ve, V;
+
+    srand(Seed);
+
+    *ParticleList = new Particle[N];
+
+    int i = 0;
+    while(i < N) {
+        X1 = xrandom(); X2 = xrandom(); X3 = xrandom();
+        R = 1.0/sqrt( (pow(X1,-2.0/3.0) - 1.0) );
+        if (R > RMAX) continue;
+        Z = (1.0 - 2.0*X2)*R;
+        X = sqrt(R*R - Z*Z) * cos(2.0*M_PI*X3);
+        Y = sqrt(R*R - Z*Z) * sin(2.0*M_PI*X3);
+
+        Ve = sqrt(2.0)*pow( (1.0 + R*R), -0.25 );
+
+        X4 = 0.0;
+        X5 = 0.0;
+        while( 0.1*X5 >= X4*X4*pow( (1.0-X4*X4), 3.5) ) {
+            X4 = xrandom();
+            X5 = xrandom();
+        }
+
+        V = Ve*X4;
+
+        X6 = xrandom();
+        X7 = xrandom();
+
+        Vz = (1.0 - 2.0*X6)*V;
+        Vx = sqrt(V*V - Vz*Vz) * cos(2.0*M_PI*X7);
+        Vy = sqrt(V*V - Vz*Vz) * sin(2.0*M_PI*X7);
+
+#define CONV (3.0*M_PI/16.0)
+        X *= CONV; Y *= CONV; Z *= CONV;
+        Vx /= sqrt(CONV); Vy /= sqrt(CONV); Vz /= sqrt(CONV);
+#undef CONV
+
+        Particle p;
+        p.m = 1.0/N;
+        p.pos = vec3(X, Y, Z);
+        p.vel = vec3(Vx, Vy, Vz);
+        p.ID = i;
+        p.Status = 0;
+        p.CalculateR2();
+        (*ParticleList)[i] = p;
+
+        i++;
+    } /* while(i < N) */
+}
+#undef RMAX
diff --git a/src/ic.hpp b/src/ic.hpp
new file mode 100644
index 0000000..f9d121d
--- /dev/null
+++ b/src/ic.hpp
@@ -0,0 +1,7 @@
+#pragma once
+namespace etics {
+    namespace ic {
+        void hernquist(int N, int Seed, Particle **ParticleList);
+        void plummer(int N, int Seed, Particle **ParticleList);
+    }
+}
\ No newline at end of file
diff --git a/src/integrate.cu b/src/integrate.cu
new file mode 100644
index 0000000..a6638d9
--- /dev/null
+++ b/src/integrate.cu
@@ -0,0 +1,127 @@
+#include <thrust/device_vector.h>
+#include <thrust/inner_product.h>
+#include "common.hpp"
+#include "integrate.hpp"
+
+#ifdef MEX
+    #define method mex
+    #include "mex.hpp"
+#elif defined(SCF)
+    #define method scf
+    #include "scf.hpp"
+#elif defined(MULTICENTER)
+    #define method multicenter
+    #include "multicenter.hpp"
+#endif
+
+#define PTR(x) (thrust::raw_pointer_cast((x).data()))
+
+namespace etics
+{
+
+// The 'drift' step is performed using the 'acc' member.
+struct DriftFunctor {
+    Real Step, Step2;
+    __host__ __device__ DriftFunctor(Real _Step) : Step(_Step), Step2(_Step*_Step) {}
+    __host__ __device__ Particle operator() (Particle &p) const {
+        p.pos += p.vel*Step + p.acc*0.5*Step2;
+        p.CalculateR2(); // needed for both MEX and SCF, but not generally needed in leapfrog
+        return p;
+    }
+};
+
+// The 'kick' step is performed using the 'acc' member and also the force F,
+// calculated at the new (predicted) position.
+struct KickFunctor {
+    Real Step;
+    __host__ __device__ KickFunctor(Real _Step) : Step(_Step) {}
+    __host__ __device__ Particle operator() (Particle& p, const vec3& F) const {
+        p.vel += (p.acc + F)*0.5*Step;
+        p.acc = F;
+        return p;
+    }
+};
+
+struct KineticEnergyFunctor {
+    __host__ __device__ Real operator() (const Particle &p) const {return 0.5*p.m*p.vel.abs2();}
+};
+
+struct PotentialEnergyFunctor {
+    __host__ __device__ Real operator() (const Particle &p, const Real &Potential) const {return p.m*Potential;}
+};
+
+Integrator::Integrator() {
+    N = 0;
+    Time = 0;
+}
+
+Integrator::Integrator(Particle *P_h, int _N) {
+    N = _N;
+    Time = 0;
+    P = thrust::device_vector<Particle>(P_h, P_h+N);
+    Potential = thrust::device_vector<Real>(N);
+    Force = thrust::device_vector<vec3>(N);
+    Method = &etics::method::CalculateGravity;
+    CalculateGravity();
+    KickStep(0); // Just to "commit" the forces to the particle list.
+}
+
+Integrator::~Integrator() {
+    N = 0;
+    // Weird way to free Thrust memory.
+    P.clear();           P.shrink_to_fit();
+    Potential.clear();   Potential.shrink_to_fit();
+    Force.clear();       Force.shrink_to_fit();
+}
+
+void Integrator::CalculateGravity() {
+    (*Method)(PTR(P), N, PTR(Potential), PTR(Force));
+}
+
+void Integrator::DriftStep(Real Step) {
+    thrust::transform(P.begin(), P.end(), P.begin(), DriftFunctor(Step));
+}
+
+void Integrator::KickStep(Real Step) {
+    thrust::transform(P.begin(), P.end(), Force.begin(), P.begin(), KickFunctor(Step));
+}
+
+Real Integrator::GetTime() {
+    return Time;
+}
+
+int Integrator::GetN() {
+    return N;
+}
+
+Real Integrator::KineticEnergy() {
+    return thrust::transform_reduce(
+      P.begin(), P.end(),
+      KineticEnergyFunctor(),
+      (Real)0, // It must be clear to the function that this zero is a Real.
+      thrust::plus<Real>()
+    );
+}
+
+Real Integrator::PotentialEnergy() {
+    return 0.5*thrust::inner_product(
+      P.begin(), P.end(),
+      Potential.begin(),
+      (Real)0,
+      thrust::plus<Real>(),
+      PotentialEnergyFunctor()
+    );
+}
+
+void Integrator::CopyParticlesToHost(Particle *P_h) {
+    thrust::copy(P.begin(), P.end(), P_h);
+}
+
+void Integrator::CopyParticlesToHost(Particle **P_h, int *_N) {
+    Particle *LocalList = new Particle[N];
+    thrust::copy(P.begin(), P.end(), LocalList);
+    *P_h = LocalList;
+    *_N = N;
+}
+
+} // namespace etics
diff --git a/src/integrate.hpp b/src/integrate.hpp
new file mode 100644
index 0000000..69c4bab
--- /dev/null
+++ b/src/integrate.hpp
@@ -0,0 +1,27 @@
+#pragma once
+#include <thrust/device_vector.h>
+
+namespace etics {
+    class Integrator {
+      public:
+        Integrator();
+        Integrator(Particle *P_h, int _N);
+        ~Integrator();
+        void CalculateGravity();
+        void DriftStep(Real Step);
+        void KickStep(Real Step);
+        Real GetTime();
+        int GetN();
+        Real KineticEnergy();
+        Real PotentialEnergy();
+        void CopyParticlesToHost(Particle *P_h);
+        void CopyParticlesToHost(Particle **P_h, int *_N);
+      private:
+        int N;
+        double Time;
+        thrust::device_vector<Particle> P;
+        thrust::device_vector<Real> Potential;
+        thrust::device_vector<vec3> Force;
+        void (*Method)(Particle*, int, Real*, vec3*);
+    };
+}
diff --git a/src/io.cpp b/src/io.cpp
new file mode 100644
index 0000000..a4df58a
--- /dev/null
+++ b/src/io.cpp
@@ -0,0 +1,367 @@
+#include <iostream>
+#include <cstring>
+#include <cstdlib>
+#include <cstdio>
+#include <climits>
+#include <fstream>
+#include <sstream>
+#include "common.hpp"
+#include "io.hpp"
+
+#ifdef ETICS_BOOST
+    #include <boost/property_tree/ptree.hpp>
+    #include <boost/property_tree/ini_parser.hpp>
+#endif
+
+using namespace std;
+
+void ReadICsASCII(string Filename, int N, Particle **P_h, int *FileSnapshotNum, Real *FileTime) {
+    *P_h = new Particle[N];
+    string Line;
+    ifstream InputFile(Filename.c_str());
+    int LineNum;
+
+    int i = 0;
+    if (InputFile.is_open()) {
+        getline(InputFile, Line); // First line is the snapshot number.
+        int Count = sscanf(Line.c_str(), "%d", FileSnapshotNum);
+        if (Count != 1) {
+            cerr << "Problem in input file, line 1." << endl;
+            exit(1);
+        }
+        getline(InputFile, Line); // Second line is the expected number of particles.
+        int ExpectedN;
+        Count = sscanf(Line.c_str(), "%d", &ExpectedN); // We ignore it actually.
+        if (Count != 1) {
+            cerr << "Problem in input file, line 2." << endl;
+            exit(1);
+        }
+        getline(InputFile, Line);
+        double T0;
+        Count = sscanf(Line.c_str(), "%lf", &T0);
+        if (Count != 1) {
+            cerr << "Problem in input file, line 3." << endl;
+            exit(1);
+        }
+        *FileTime = (Real)T0;
+        LineNum = 3;
+        while (getline(InputFile, Line))
+        {
+            double m, x, y, z, vx, vy, vz;
+            int id;
+            Particle p;
+            Count = sscanf(Line.c_str(), "%d %lf %lf %lf %lf %lf %lf %lf", &id, &m, &x, &y, &z, &vx, &vy, &vz);
+            if (Count != 8) {
+                cerr << "Problem in input file, line " << LineNum + 1 << "." << endl;
+                exit(1);
+            }
+            p.ID = id;
+            p.m = (Real)m;
+            p.pos = vec3((Real)x, (Real)y, (Real)z);    // This is to ensure proper casting in the case of single prec.
+            p.vel = vec3((Real)vx, (Real)vy, (Real)vz);
+            p.Status = 0;
+            p.CalculateR2();
+            (*P_h)[i] = p;
+            i++;
+            LineNum++;
+            if (i == N) break;
+        }
+        InputFile.close();
+    } else {
+        cerr << "Can't open file" << Filename << "." << endl;
+        exit(1);
+    }
+    if (i < N) {
+        cerr << "Was only able to read " << i << " particles while " << N << " were requested." << endl;
+        exit(1);
+    }
+}
+
+void WriteSnapshotASCII(string Prefix, int SnapNumber, Particle *P_h, int N, Real T) {
+    char S[512];
+    sprintf(S, "%s%04d.dat", Prefix.c_str(), SnapNumber);
+    ofstream SnapshotFile;
+    SnapshotFile.open(S);
+    sprintf(S, "%06d\n", SnapNumber); SnapshotFile << S;
+    sprintf(S, "%06d\n", N); SnapshotFile << S;
+    sprintf(S, "%.16E\n", T); SnapshotFile << S;
+    for (int i = 0; i < N; i++) {
+        sprintf(S, "%06d%14.6e%14.6e%14.6e%14.6e%14.6e%14.6e%14.6e\n", P_h[i].ID, P_h[i].m, P_h[i].pos.x, P_h[i].pos.y, P_h[i].pos.z, P_h[i].vel.x, P_h[i].vel.y, P_h[i].vel.z); SnapshotFile << S;
+    }
+    SnapshotFile.close();
+}
+
+#define THROW_EXCEPTION(str, stat) {cerr << str << endl; exit((stat));}
+void ParseInput(int argc, char *argv[], ParametersStruct *Params) {
+    ParametersStruct P;
+#ifdef ETICS_BOOST
+    if (argc < 2) THROW_EXCEPTION("Usage: etics [FILE]...", 1)
+    ifstream TestFileObject(argv[1]);
+    if (!TestFileObject) THROW_EXCEPTION("Problem reading file...", 1)
+    boost::property_tree::ptree pt;
+    boost::property_tree::ini_parser::read_ini(argv[1], pt); // what if file doesn't exist?
+
+    P.N = pt.get<int>("N", 0);
+    if (P.N <= 0) THROW_EXCEPTION("Could not read number of particles (N) from ini file.", 1)
+
+    P.Tcrit = pt.get<Real>("Tcrit", -1);
+    if (P.Tcrit < 0) THROW_EXCEPTION("Could not read finish time (Tcrit) from ini file.", 1)
+
+    P.dT1 = pt.get<Real>("dT1", 0);
+    if (P.dT1 <= 0) THROW_EXCEPTION("Could output time interval (dT1) from ini file.", 1)
+
+    P.dT2 = pt.get<Real>("dT2", 0);
+    if (P.dT2 <= 0) THROW_EXCEPTION("Could snapshot time interval (dT2) from ini file.", 1)
+
+    P.ConstantStep = pt.get<Real>("StepSize", -1);
+    if (P.ConstantStep < 0) THROW_EXCEPTION("Could not read step size (StepSize) from ini file.", 1)
+
+    P.Filename = pt.get<string>("Filename", "\n");
+    if (P.Filename == "\n") THROW_EXCEPTION("Could not read initial condition file name (Filename) from ini file.", 1)
+
+    P.Prefix = pt.get<string>("Prefix", "");
+    P.OutputFormat = pt.get<string>("OutputFormat", "ascii");
+    P.DeviceID = pt.get<int>("device", -1);
+    if (argc >= 3) { // If there is a argument after the file, it must be either --device or -d
+        int Version1 = strncmp(argv[2], "--device", 8);
+        int Version2 = strncmp(argv[2], "-d", 2);
+        if ((Version1!=0) && (Version2!=0)) THROW_EXCEPTION("Commandline argument after input file must be device number.", 1)
+        char *DeviceIDStr = strchr(argv[2], '=');
+        if (DeviceIDStr!=NULL) DeviceIDStr = DeviceIDStr + 1;
+        else {
+            if (argc <= 3) THROW_EXCEPTION("Device number must follow.", 1)
+            DeviceIDStr = argv[3];
+        }
+        P.DeviceID = atoi(DeviceIDStr);
+        if ((P.DeviceID==0) && (strcmp(DeviceIDStr, "0")!=0)) THROW_EXCEPTION("Error understanding device number.", 1)
+    }
+#else
+    // If no BOOST, we include a file with the parameters and compile it.
+    int N, DeviceID = -1;
+    Real dT1, dT2, Tcrit, StepSize;
+    std::string Filename, Prefix = "", OutputFormat="";
+    #include "noboost.inc"
+    P.N = N;
+    P.Filename = Filename;
+    P.dT1 = dT1;
+    P.dT2 = dT2;
+    P.Tcrit = Tcrit;
+    P.ConstantStep = StepSize;
+    P.DeviceID = DeviceID;
+    P.OutputFormat = OutputFormat;
+#endif
+    P.Seed = INT_MIN;
+    if ((P.Filename[0]=='_') && (P.Filename.rfind("_") > 0)) {
+        int Break = P.Filename.rfind("_") + 1;
+        string SeedStr = P.Filename.substr(Break, P.Filename.length() - Break);
+        int SeedInt;
+        istringstream iss(SeedStr);
+        iss >> ws >> P.Seed >> ws;
+        if(!iss.eof()) THROW_EXCEPTION("Could not understand random seed (in Filename).", 1)
+        P.Filename = P.Filename.substr(0, Break);
+    }
+    if (P.Seed == INT_MIN) P.Seed = (int)time(NULL);
+    *Params = P;
+}
+
+
+// make safety: if single precision, fail to compile
+#ifdef ETICS_HDF5
+#include "H5Cpp.h"
+
+int H5IterNum;
+int *H5IterSnapNumbers;
+string *H5IterGroupNames;
+
+herr_t file_info(hid_t loc_id, const char *name, void *opdata) {
+    int res = sscanf(name, "Step#%d", H5IterSnapNumbers + H5IterNum);
+    if (res != 1) {
+        cerr << "Problem understanding group \"" << name << "\" in HDF5 file" << endl;
+        exit(1);
+    }
+    H5IterGroupNames[H5IterNum] = name;
+    H5IterNum++;
+    return 0;
+}
+
+
+void ReadICsHDF5(string Filename, int N, Particle **P_h, int *FileSnapshotNum, Real *FileTime) {
+    double *Mass, *X, *Y, *Z, *VX, *VY, *VZ;
+    int *ID;
+    Mass  = new double[N];
+    X  = new double[N];
+    Y  = new double[N];
+    Z  = new double[N];
+    VX = new double[N];
+    VY = new double[N];
+    VZ = new double[N];
+    ID = new int[N];
+
+    H5::H5File file;
+    file = H5::H5File(Filename, H5F_ACC_RDONLY);
+
+    H5::Group group = file.openGroup("/");
+    int NumberOfGroups = group.getNumObjs();
+    H5IterSnapNumbers = new int[NumberOfGroups];
+    H5IterGroupNames = new string[NumberOfGroups];
+    H5IterNum = 0;
+    int ret = file.iterateElems("/", NULL, file_info, NULL);
+    int MaxSnapIndex = 0;
+    for (int i=1; i < NumberOfGroups; i++) MaxSnapIndex = (H5IterSnapNumbers[i]>H5IterSnapNumbers[MaxSnapIndex])?i:MaxSnapIndex;
+    string GroupName = H5IterGroupNames[MaxSnapIndex];
+
+
+    group = file.openGroup("/" + GroupName);
+    H5::Attribute attribute = group.openAttribute("Time");
+    attribute.read(H5::PredType::NATIVE_DOUBLE, FileTime);
+    *FileSnapshotNum = H5IterSnapNumbers[MaxSnapIndex];
+
+    H5::DataSet dataset = file.openDataSet("/" + GroupName + "/ID");
+    int NfromFile = dataset.getStorageSize()/sizeof(int);
+    if (NfromFile < N) {
+        cerr << "Was only able to read " << NfromFile << " particles while " << N << " were requested." << endl;
+        exit(1);
+    }
+    dataset.read(ID, H5::PredType::NATIVE_UINT32);
+
+    dataset = file.openDataSet("/" + GroupName + "/Mass");  dataset.read(Mass,  H5::PredType::NATIVE_DOUBLE);
+    dataset = file.openDataSet("/" + GroupName + "/X");     dataset.read(X,  H5::PredType::NATIVE_DOUBLE);
+    dataset = file.openDataSet("/" + GroupName + "/Y");     dataset.read(Y,  H5::PredType::NATIVE_DOUBLE);
+    dataset = file.openDataSet("/" + GroupName + "/Z");     dataset.read(Z,  H5::PredType::NATIVE_DOUBLE);
+    dataset = file.openDataSet("/" + GroupName + "/VX");    dataset.read(VX, H5::PredType::NATIVE_DOUBLE);
+    dataset = file.openDataSet("/" + GroupName + "/VY");    dataset.read(VY, H5::PredType::NATIVE_DOUBLE);
+    dataset = file.openDataSet("/" + GroupName + "/VZ");    dataset.read(VZ, H5::PredType::NATIVE_DOUBLE);
+
+    dataset.close();
+    group.close();
+    file.close();
+
+    *P_h = new Particle[N];
+    for (int i = 0; i < N; i++) {
+        Particle p;
+        p.ID = ID[i];
+        p.m = (Real)Mass[i];
+        p.pos = vec3((Real)X[i], (Real)Y[i], (Real)Z[i]);    // This is to ensure proper casting in the case of single prec.
+        p.vel = vec3((Real)VX[i], (Real)VY[i], (Real)VZ[i]);
+        p.Status = 0;
+        p.CalculateR2();
+        (*P_h)[i] = p;
+    }
+}
+
+bool H5FirstSnapshot = true;
+
+void WriteSnapshotHDF5(string Prefix, int SnapNumber, Particle *P_h, int N, Real T) {
+    string Filename = Prefix + ".h5part";
+
+    H5::H5File file;
+
+    if (H5FirstSnapshot && std::ifstream(Filename.c_str())) {
+        cerr << "File \"" << Filename << "\" already exist!" << endl;
+        exit(1);
+    }
+    H5FirstSnapshot = false;
+
+    if (std::ifstream(Filename.c_str())) {
+        file = H5::H5File(Filename, H5F_ACC_RDWR);
+    }
+    else file = H5::H5File(Filename, H5F_ACC_TRUNC);
+
+    char GroupName[64];
+    sprintf(GroupName, "/Step#%d", SnapNumber);
+
+    H5::Group group;
+    try {
+        H5::Exception::dontPrint();
+        group = H5::Group(file.createGroup(GroupName));
+    }
+    catch (  H5::FileIException error ) {
+        cerr << "Couldn't create group \"" << GroupName << "\" in HDF5 file, maybe it already exists?"  << endl;
+        exit(1);
+    }
+
+    double *Mass, *X, *Y, *Z, *VX, *VY, *VZ, *AX, *AY, *AZ;
+    Mass  = new double[N];
+    X  = new double[N];
+    Y  = new double[N];
+    Z  = new double[N];
+    VX = new double[N];
+    VY = new double[N];
+    VZ = new double[N];
+    AX = new double[N];
+    AY = new double[N];
+    AZ = new double[N];
+    int *ID;
+    ID = new int[N];
+
+    for (int i = 0; i < N; i++) {
+        Particle p = P_h[i];
+        ID[i] = p.ID;
+        Mass[i]  = p.m;
+        X[i]  = p.pos.x;
+        Y[i]  = p.pos.y;
+        Z[i]  = p.pos.z;
+        VX[i] = p.vel.x;
+        VY[i] = p.vel.y;
+        VZ[i] = p.vel.z;
+        AX[i] = p.acc.x;
+        AY[i] = p.acc.y;
+        AZ[i] = p.acc.z;
+    }
+
+    H5::DataSpace attr_dataspace = H5::DataSpace(H5S_SCALAR);
+
+    H5::Attribute attribute = group.createAttribute("Time", H5::PredType::NATIVE_DOUBLE, attr_dataspace);
+    attribute.write( H5::PredType::NATIVE_DOUBLE, &T);
+
+    attribute = group.createAttribute("TotalN", H5::PredType::NATIVE_UINT32, attr_dataspace);
+    attribute.write( H5::PredType::NATIVE_UINT32, &N);
+
+    hsize_t dimsxxx = N;
+    H5::DataSpace dataspacexxx(1, &dimsxxx);
+    H5::DataSet dataset3;
+    dataset3 = H5::DataSet(group.createDataSet("ID", H5::PredType::NATIVE_UINT32, dataspacexxx));
+    dataset3.write(ID, H5::PredType::NATIVE_UINT32);
+    dataset3 = H5::DataSet(group.createDataSet("Mass",  H5::PredType::NATIVE_DOUBLE, dataspacexxx));
+    dataset3.write(Mass, H5::PredType::NATIVE_DOUBLE);
+    dataset3 = H5::DataSet(group.createDataSet("X",  H5::PredType::NATIVE_DOUBLE, dataspacexxx));
+    dataset3.write(X, H5::PredType::NATIVE_DOUBLE);
+    dataset3 = H5::DataSet(group.createDataSet("Y",  H5::PredType::NATIVE_DOUBLE, dataspacexxx));
+    dataset3.write(Y, H5::PredType::NATIVE_DOUBLE);
+    dataset3 = H5::DataSet(group.createDataSet("Z",  H5::PredType::NATIVE_DOUBLE, dataspacexxx));
+    dataset3.write(Z, H5::PredType::NATIVE_DOUBLE);
+    dataset3 = H5::DataSet(group.createDataSet("VX", H5::PredType::NATIVE_DOUBLE, dataspacexxx));
+    dataset3.write(VX, H5::PredType::NATIVE_DOUBLE);
+    dataset3 = H5::DataSet(group.createDataSet("VY", H5::PredType::NATIVE_DOUBLE, dataspacexxx));
+    dataset3.write(VY, H5::PredType::NATIVE_DOUBLE);
+    dataset3 = H5::DataSet(group.createDataSet("VZ", H5::PredType::NATIVE_DOUBLE, dataspacexxx));
+    dataset3.write(VZ, H5::PredType::NATIVE_DOUBLE);
+    dataset3 = H5::DataSet(group.createDataSet("AX", H5::PredType::NATIVE_DOUBLE, dataspacexxx));
+    dataset3.write(AX, H5::PredType::NATIVE_DOUBLE);
+    dataset3 = H5::DataSet(group.createDataSet("AY", H5::PredType::NATIVE_DOUBLE, dataspacexxx));
+    dataset3.write(AY, H5::PredType::NATIVE_DOUBLE);
+    dataset3 = H5::DataSet(group.createDataSet("AZ", H5::PredType::NATIVE_DOUBLE, dataspacexxx));
+    dataset3.write(AZ, H5::PredType::NATIVE_DOUBLE);
+
+    dataset3.close();
+    dataspacexxx.close();
+
+    attr_dataspace.close();
+    attribute.close();
+    group.close();
+    file.close();
+
+    delete [] Mass;
+    delete [] X;
+    delete [] Y;
+    delete [] Z;
+    delete [] VX;
+    delete [] VY;
+    delete [] VZ;
+    delete [] AX;
+    delete [] AY;
+    delete [] AZ;
+    delete [] ID;
+}
+
+#endif
diff --git a/src/io.hpp b/src/io.hpp
new file mode 100644
index 0000000..8b02b5e
--- /dev/null
+++ b/src/io.hpp
@@ -0,0 +1,19 @@
+#pragma once
+
+struct ParametersStruct {
+    int N;
+    std::string Filename;
+    int Seed;
+    Real dT1, dT2, Tcrit, ConstantStep;
+    std::string Prefix;
+    std::string OutputFormat;
+    int DeviceID;
+};
+
+void ReadICsASCII(std::string Filename, int N, Particle **P_h, int *FileSnapshotNum, Real *FileTime);
+void WriteSnapshotASCII(std::string Prefix, int SnapNumber, Particle *hostP, int N, Real T);
+void ParseInput(int argc, char *argv[], ParametersStruct *Params);
+#ifdef ETICS_HDF5
+void ReadICsHDF5(std::string Filename, int N, Particle **P_h, int *FileSnapshotNum, Real *FileTime);
+void WriteSnapshotHDF5(std::string Prefix, int SnapNumber, Particle *hostP, int N, Real T);
+#endif
diff --git a/src/main.cu b/src/main.cu
new file mode 100644
index 0000000..33f3bf0
--- /dev/null
+++ b/src/main.cu
@@ -0,0 +1,347 @@
+/**
+ * @file    main.cu
+ * @author  Yohai Meiron <ymeiron@pku.edu.cn>
+ * @version 1.0
+ */
+
+// If you have CUDA then compile with:
+// nvcc main.cu -lm -O3 -arch=sm_20 -o output
+// Otherwise enable OpenMP and compile with GCC:
+// g++ -x c++ -O3 -o output main.cu -DOMP -fopenmp -lgomp -I/home/ym
+// The -I is the path to the parent directory where thrust is.
+
+#include <iostream>
+#include <fstream>
+#include <cmath>
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#ifdef ETICS_MPI
+#include <mpi.h>
+#endif
+
+#define CUDA
+
+#ifdef OMP
+    #error Sorry, OpenMP is currently disabled.
+    #define THRUST_DEVICE_SYSTEM THRUST_DEVICE_BACKEND_OMP
+    #undef CUDA
+    #define PARALLEL_GET_TID omp_get_thread_num()
+    #define PARALLEL_ADVANCE omp_get_num_threads()
+    #define __global__
+//     #include <complex>
+#else
+    #define PARALLEL_GET_TID threadIdx.x + blockIdx.x * blockDim.x
+    #define PARALLEL_ADVANCE blockDim.x * gridDim.x
+//     #include "cuda_complex.hpp"
+#endif
+// #define complex complex<Real>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/sort.h>
+#include <thrust/partition.h>
+#include <thrust/inner_product.h>
+
+#include "common.hpp"
+#include "io.hpp"
+#include "ic.hpp"
+#include "integrate.hpp"
+
+#ifdef MEX
+    #define method mex
+    #include "mex.hpp"
+#elif defined(SCF)
+    #define method scf
+    #include "scf.hpp"
+#elif defined(MULTICENTER)
+    #define method multicenter
+    #include "multicenter.hpp"
+#endif
+
+using namespace std;
+using namespace etics;
+
+// GLOBAL VARIABLES
+int MyRank, NumProcs;
+Real ConstantStep = 0.001953125;
+Real T, Step, dT1, dT2, Tcrit, FileTime;
+int NSteps = 0, FileSnapshotNum;
+
+struct ReorderingFunctor {
+    __host__ __device__ bool operator() (const Particle &lhs, const Particle &rhs) {
+        return (lhs.ID <= rhs.ID);
+    }
+};
+
+Real CalculateStepSize() {
+    return ConstantStep;
+}
+
+void DisplayInformation(Integrator IntegratorObj) {
+    Real Ek = IntegratorObj.KineticEnergy();
+    Real Ep = IntegratorObj.PotentialEnergy();
+    Real Energy = Ek + Ep;
+    int N=IntegratorObj.GetN();
+
+    Real TotalEnergy;
+    int TotalN;
+
+#ifdef ETICS_MPI
+    MPI_Reduce(&Energy, &TotalEnergy, 1, MPI_ETICS_REAL, MPI_SUM, 0, MPI_COMM_WORLD);
+    MPI_Reduce(&N, &TotalN, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
+#else
+    TotalEnergy = Energy;
+    TotalN = N;
+#endif
+
+    if (MyRank==0) {
+        printf(" TIME =%6.2f  NSTEPS =%6d  ENERGY =%20.16f   N = %d\n", T, NSteps, TotalEnergy, TotalN);
+        fflush(stdout);
+    }
+}
+
+void PrepareSnapshot(Integrator IntegratorObj, Particle **ParticleList, int *CurrentTotalN) {
+    Particle *LocalList;
+    int LocalBufferSize;
+    IntegratorObj.CopyParticlesToHost(&LocalList, &LocalBufferSize);
+    int TotalN = 0;
+
+#ifdef ETICS_MPI
+    LocalBufferSize *= sizeof(Particle);
+    int BufferSizes[NumProcs];
+    MPI_Gather(&LocalBufferSize, 1, MPI_INT, BufferSizes, 1, MPI_INT, 0, MPI_COMM_WORLD);
+    int Displacements[NumProcs];
+    if (MyRank==0) {
+        for (int p = 0; p < NumProcs; p++) TotalN += BufferSizes[p]/sizeof(Particle);
+        Displacements[0] = 0;
+        for (int p = 1; p < NumProcs; p++) Displacements[p] = Displacements[p-1] + BufferSizes[p-1];
+        *ParticleList = new Particle[TotalN];
+    }
+    MPI_Gatherv(LocalList, LocalBufferSize, MPI_BYTE, *ParticleList, BufferSizes, Displacements, MPI_BYTE, 0, MPI_COMM_WORLD);
+#else
+    TotalN = LocalBufferSize/sizeof(Particle);
+    *ParticleList = new Particle[TotalN];
+    std::copy(LocalList, LocalList+TotalN, *ParticleList);
+#endif
+
+#ifdef MEX
+    thrust::sort(*ParticleList, (*ParticleList)+TotalN, ReorderingFunctor());
+#endif
+    *CurrentTotalN = TotalN;
+    free(LocalList);
+//WARNING Honestly I dont get why I use allocate memory inside the fuction, its very confusing and makes it harder to find leaks.
+//Future Yohai (2019): Yeah, it's done both here for **ParticleList and internally for *LocalList in Integrator::CopyParticlesToHost that does it, pretty ugly.
+}
+
+int main(int argc, char *argv[]) {
+#ifdef ETICS_MPI
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &MyRank);
+    MPI_Comm_size(MPI_COMM_WORLD, &NumProcs);
+#endif
+
+    if (MyRank==0) {
+        cerr << "Welcome to ETICS..." << endl;
+#ifdef MEX
+        cerr << "Using method: MEX" << endl;
+        cerr << "LMAX=" << LMAX << endl;
+#elif defined(SCF)
+        cerr << "Using method: SCF" << endl;
+        cerr << "LMAX=" << LMAX << endl;
+        cerr << "NMAX=" << NMAX << endl;
+#endif
+    }
+
+    string Filename;
+    int DeviceID = 0;
+
+    ParametersStruct Params;
+    // Instead of reading the input file with MyRank=0 and broadcast the result, we let every rank read the file. This probably saves ~20 lines of ugly MPI code.
+    ParseInput(argc, argv, &Params);
+    int N = Params.N; // total; will be divided by number of processes
+    Filename = Params.Filename;
+    Tcrit = Params.Tcrit;
+    ConstantStep = Params.ConstantStep;
+    DeviceID = Params.DeviceID;
+    dT1 = Params.dT1;
+    dT2 = Params.dT2;
+
+    if (DeviceID >= 0) {
+        if (cudaSetDevice(DeviceID) != cudaSuccess) {
+            cerr <<  "Problem opening device (ID=" << DeviceID << ")" << endl;
+            exit(1);
+        }
+    } else {
+        cerr << "Skipping call to cudaSetDevice." << endl;
+    }
+
+    // Read an input file and initialize the global particle structure.
+    Particle *FullList;
+    if (MyRank==0) {
+        if ((Filename == "_nofile_") || (Filename == "_hernquist_")) {
+            cout << "Generating a Hernquist sphere..." << endl;
+            etics::ic::hernquist(N, Params.Seed, &FullList);
+            FileSnapshotNum = 0;
+            FileTime = 0;
+            cout << "Done." << endl;
+        } else if (Filename == "_plummer_") {
+            cout << "Generating a Plummer sphere..." << endl;
+            etics::ic::plummer(N, Params.Seed, &FullList);
+            FileSnapshotNum = 0;
+            FileTime = 0;
+            cout << "Done." << endl;
+        }
+        else {
+            string InputFileSuffix = Filename.substr(Filename.find_last_of("."), Filename.length()-Filename.find_last_of("."));
+
+            if ((InputFileSuffix==".h5part") || (InputFileSuffix==".hdf5") || (InputFileSuffix==".h5")) {
+#ifndef ETICS_HDF5
+                cerr << "Compiled without the \"ETICS_HDF5\" flag; cannot read input in this format." << endl;
+                exit(1);
+#else
+                ReadICsHDF5(Filename, N, &FullList, &FileSnapshotNum, &FileTime);
+#endif
+            } else ReadICsASCII(Filename, N, &FullList, &FileSnapshotNum, &FileTime);
+        }
+    }
+
+#ifndef ETICS_HDF5
+    if (Params.OutputFormat == "hdf5") {
+        cerr << "Compiled without the \"ETICS_HDF5\" flag; cannot output in requested format." << endl;
+        exit(1);
+    }
+#endif
+    if (!(Params.OutputFormat == "hdf5") && !(Params.OutputFormat == "ascii")) {
+        cerr << "Requested output format unrecognized." << endl;
+        exit(1);
+    }
+
+#ifdef ETICS_MPI
+    int LocalN = N / NumProcs;
+
+    int Remainder = N - LocalN*NumProcs;
+    if (MyRank==NumProcs-1) LocalN += Remainder;
+    Particle *LocalList = new Particle[LocalN];
+    int BufferSizes[NumProcs];
+    int Displacements[NumProcs];
+    if (MyRank==0) {
+        for (int p = 0; p < NumProcs; p++) BufferSizes[p] = (N / NumProcs)*sizeof(Particle);
+        BufferSizes[NumProcs-1] += Remainder*sizeof(Particle);
+        Displacements[0] = 0;
+        for (int p = 1; p < NumProcs; p++) Displacements[p] = Displacements[p-1] + BufferSizes[p-1];
+    }
+    MPI_Scatterv(FullList, BufferSizes, Displacements, MPI_BYTE, LocalList, LocalN*sizeof(Particle), MPI_BYTE, 0, MPI_COMM_WORLD);
+
+    if (MyRank==0) free(FullList);
+    N = LocalN;
+#else
+    Particle *LocalList = new Particle[N];
+    copy(FullList, FullList+N, LocalList);
+#endif
+
+#ifdef ETICS_MPI
+// This code block is just to ask each MPI process to report, and find if there is a conflict (i.e. multiple processes using the same physical device)
+    cudaDeviceProp DeviceProperties;
+    const int etics_str_len = 256;
+    cudaGetDeviceProperties(&DeviceProperties, 0);
+    char ProcessorName[etics_str_len];
+    int tmp;
+    MPI_Get_processor_name(ProcessorName, &tmp);
+    char UniqueDeviceID[etics_str_len];
+    sprintf(UniqueDeviceID, "%d$$$%s", DeviceProperties.pciBusID, ProcessorName);
+    char Message[etics_str_len];
+    sprintf(Message, "Hello from rank %d (of %d) on %s, using \"%s\" with PCI bus ID %d; this rank has %d particles.\n", MyRank, NumProcs, ProcessorName, DeviceProperties.name, DeviceProperties.pciBusID, LocalN);
+    if (MyRank == 0) {
+        printf("%s", Message);
+        fflush(stdout);
+        for (int Rank = 1; Rank < NumProcs; Rank++) {
+            MPI_Recv(Message, etics_str_len, MPI_CHAR, Rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+            printf("%s", Message);
+            fflush(stdout);
+        }
+    } else {
+        MPI_Send(Message, etics_str_len, MPI_CHAR, 0, 0, MPI_COMM_WORLD);
+    }
+
+    // Here we collect the GPU IDs from all MPI processes and print a warning if one GPU is assigned to more than one process.
+    char *StrBuf;
+    StrBuf = (char*)malloc(NumProcs*etics_str_len*sizeof(char));
+    MPI_Gather( UniqueDeviceID, etics_str_len, MPI_CHAR, StrBuf, etics_str_len, MPI_CHAR, 0, MPI_COMM_WORLD); 
+    if (MyRank == 0) {
+        bool DuplicateFound = false;
+        for (int i = 0; i < NumProcs; i++) {
+            for (int j = i+1; j < NumProcs; j++) {
+                if (strcmp(StrBuf+i*etics_str_len, StrBuf+j*etics_str_len) == 0) {
+                    DuplicateFound = true;
+                    break;
+                }
+            if (DuplicateFound) break;
+            }
+        }
+        if (DuplicateFound) {
+            printf("\x1B[31m!!SEVERE WARNING!!\x1B[0m It seems the same physical GPU device was assigned to multiple processes; check the submission script.\n");
+        }
+    }
+    free(StrBuf);
+#endif
+
+
+    // Now initiate the code
+    method::Init(N, 0, 0, 0, 0);
+
+    // Initiate the integrator
+    Integrator IntegratorObj(LocalList, N);
+
+    // More initializations.
+    Real NextOutput = 0, NextSnapshot = 0;
+    T = FileTime;
+    int SnapNumber = FileSnapshotNum;
+    Step = CalculateStepSize();
+
+    while (T <= Tcrit) {
+        if (T >= NextOutput) {
+            DisplayInformation(IntegratorObj);
+            NextOutput += dT1;
+        }
+        if (T >= NextSnapshot) {
+//             cerr << "TESTING MODE!!! NOT WRITING SNAPSHOTS!!!" << endl;
+            int CurrentTotalN;
+            PrepareSnapshot(IntegratorObj, &FullList, &CurrentTotalN);
+            if (MyRank==0) {
+                if (Params.OutputFormat == "ascii") WriteSnapshotASCII(Params.Prefix, SnapNumber, FullList, CurrentTotalN, T);
+#ifdef ETICS_HDF5
+                else if (Params.OutputFormat == "hdf5") WriteSnapshotHDF5(Params.Prefix, SnapNumber, FullList, CurrentTotalN, T);
+#endif
+                else {cerr << "Error" << endl; exit(1);}
+                free(FullList);
+            }
+            SnapNumber++;
+            NextSnapshot += dT2;
+        }
+
+        // Take the drift step.
+        IntegratorObj.DriftStep(Step);
+
+        // Calculate the forces in the new positions.
+        IntegratorObj.CalculateGravity();
+
+        // Finish by taking the kick step.
+        // The kick functor also "commits" the predicted forces into the "acc" member.
+        IntegratorObj.KickStep(Step);
+
+        // N particles were implicitly propagated in this iteration.
+        NSteps += 1;
+
+        // Advance global time.
+        T += Step;
+
+        // Calculate the next step.
+        Step = CalculateStepSize();
+    }
+    IntegratorObj.~Integrator();
+#ifdef ETICS_MPI
+    MPI_Finalize();
+#endif
+    return 0;
+}
diff --git a/src/mathaux.cu b/src/mathaux.cu
new file mode 100644
index 0000000..c2d6aee
--- /dev/null
+++ b/src/mathaux.cu
@@ -0,0 +1,57 @@
+#include "common.hpp"
+// #include "mathaux.hpp"
+
+double FactorialSCF_tmpname(int x) {
+    return (x <= 1) ? 1 : x*FactorialSCF_tmpname(x - 1);
+}
+
+double GammaPlusHalf(int x) {
+    return pow(0.25, x) * sqrt(M_PI) * FactorialSCF_tmpname(2*x) / FactorialSCF_tmpname(x);
+}
+
+void RadialCoefficients(Real *Result) {
+    for (int n = 0; n <= NMAX; n++) {
+        for (int l = 0; l <= LMAX; l++) {
+            double AnlTilde =  - pow(2.0, 8*l + 6) * FactorialSCF_tmpname(n) * (n + 2*l + 1.5);
+            AnlTilde *= GammaPlusHalf(2*l + 1) * GammaPlusHalf(2*l + 1);
+            AnlTilde /= (4*M_PI * (n*(n + 3 + 4*l)/2 + (l+1)*(2*l+1)) * FactorialSCF_tmpname(n+4*l+2));
+            Result[(LMAX+1)*n+l] = (Real)AnlTilde;
+        }
+    }
+}
+
+__host__ __device__ Real Pl(int l, Real x) {
+    // No range check or anything... The first 10 polynomials are hardcoded
+    switch (l) {
+        case 0  : return 1;
+        case 1  : return x;
+        case 2  : return (1.5*x*x - 0.5);
+        case 3  : return (2.5*x*x*x - 1.5*x);
+        case 4  : {Real x2=x*x; return (4.375*x2*x2  - 3.75*x2 + 0.375);}
+        case 5  : {Real x2=x*x, x3=x2*x; return (7.875*x3*x2 - 8.75*x3 + 1.875*x);}
+        case 6  : {Real x2=x*x, x4=x2*x2; return (14.4375*x4*x2 - 19.6875*x4 + 6.5625*x2 - 0.3125);}
+        case 7  : {Real x2=x*x, x3=x2*x, x5=x3*x2; return (26.8125*x5*x2 - 43.3125*x5 + 19.6875*x3 - 2.1875*x);}
+        case 8  : {Real x2=x*x, x4=x2*x2, x6=x4*x2, x8=x4*x4; return (50.2734375*x8 - 93.84375*x6 + 54.140625*x4 - 9.84375*x2 + 0.2734375);}
+        case 9  : {Real x2=x*x, x3=x2*x, x5=x3*x2, x7=x5*x2, x9=x7*x2;
+                   return (94.9609375*x9 - 201.09375*x7 + 140.765625*x5 - 36.09375*x3 + 2.4609375*x);}
+        case 10 : {Real x2=x*x, x4=x2*x2, x6=x4*x2, x8=x4*x4, x10=x6*x4;
+                   return (180.42578125*x10 - 427.32421875*x8 + 351.9140625*x6 - 117.3046875*x4 + 13.53515625*x2 - 0.24609375);}
+        case 11 : {Real x2=x*x, x3=x2*x, x5=x3*x2, x7=x5*x2, x9=x7*x2, x11=x9*x2;
+                   return (344.44921875*x11 - 902.12890625*x9 + 854.6484375*x7 - 351.9140625*x5 + 58.65234375*x3 - 2.70703125*x);}
+        case 12 : {Real x2=x*x, x4=x2*x2, x6=x4*x2, x8=x4*x4, x10=x6*x4, x12=x6*x6;
+                   return (660.1943359375*x12 - 1894.470703125*x10 + 2029.7900390625*x8 - 997.08984375*x6 + 219.9462890625*x4 - 17.595703125*x2 + 0.2255859375);}
+        case 13 : {Real x2=x*x, x3=x2*x, x5=x3*x2, x7=x5*x2, x9=x7*x2, x11=x9*x2, x13=x9*x2;
+                   return (1269.6044921875*x13 - 3961.166015625*x11 + 4736.1767578125*x9 - 2706.38671875*x7 + 747.8173828125*x5 -87.978515625*x3 + 2.9326171875*x);}
+        case 14 : {Real x2=x*x, x4=x2*x2, x6=x4*x2, x8=x4*x4, x10=x6*x4, x12=x6*x6, x14=x8*x6;
+                   return (2448.52294921875*x14 - 8252.42919921875*x12 + 10893.2065429688*x10 - 7104.26513671875*x8 + 2368.08837890625*x6 - 373.90869140625*x4 + 21.99462890625*x2 - 0.20947265625);}
+        default : return -1e30;
+    }
+}
+
+void AngularCoefficients(Real *Result) {
+    for (int l = 0; l <= LMAX; l++) {
+        for (int m = 0; m <= l; m++) {
+            Result[(l+1)*l/2+m] = (sqrt((2*l+1)/(4*M_PI)) * sqrt(FactorialSCF_tmpname(l-m)/FactorialSCF_tmpname(l+m)));
+        }
+    }
+}
diff --git a/src/mathaux.hpp b/src/mathaux.hpp
new file mode 100644
index 0000000..415a3b3
--- /dev/null
+++ b/src/mathaux.hpp
@@ -0,0 +1,30 @@
+#pragma once
+#include "common.hpp"
+#include <cuComplex.h>
+#ifdef ETICS_DOUBLE_PRECISION
+    #define Complex      cuDoubleComplex
+    #define make_Complex make_cuDoubleComplex
+    #define Complex_add  cuCadd
+    #define Complex_mul  cuCmul
+    #define Complex_imag cuCimag
+    #define Complex_real cuCreal
+    #define Complex_conj cuConj
+#else
+    #define Complex      cuFloatComplex
+    #define make_Complex make_cuFloatComplex
+    #define Complex_add  cuCaddf
+    #define Complex_mul  cuCmulf
+    #define Complex_imag cuCimagf
+    #define Complex_real cuCrealf
+    #define Complex_conj cuConjf
+#endif
+
+#define SQRT_4_PI 3.5449077018110320545963349666822903655950989122447742564276
+
+
+double FactorialSCF_tmpname(int x);
+double GammaPlusHalf(int x);
+void RadialCoefficients(Real *Result);
+#define MAX_HARDCODED_PL 14
+__host__ __device__ Real Pl(int l, Real x);
+void AngularCoefficients(Real *Result);
diff --git a/src/multicenter.cu b/src/multicenter.cu
new file mode 100644
index 0000000..975417b
--- /dev/null
+++ b/src/multicenter.cu
@@ -0,0 +1,131 @@
+#include "common.hpp"
+#include "mathaux.hpp"
+#include "scf.hpp"
+#include "integrate.hpp"
+#include "multicenter.hpp"
+#include <thrust/transform.h>
+#include <thrust/functional.h>
+#include <thrust/execution_policy.h>
+
+etics::scf::scfclass SCFObject1;
+etics::scf::scfclass SCFObject2;
+
+vec3 *F_tmp;
+vec3 *F_tmp2;
+Real *Potential_tmp;
+Real *Potential_tmp2;
+Particle Origins[2];
+
+void etics::multicenter::Init(int Nmax, int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new) { // to be removed!!
+    cudaMalloc((void**)&F_tmp, Nmax*sizeof(vec3));
+    cudaMalloc((void**)&F_tmp2, Nmax*sizeof(vec3));
+    cudaMalloc((void**)&Potential_tmp, Nmax*sizeof(Real));
+    cudaMalloc((void**)&Potential_tmp2, Nmax*sizeof(Real));
+    SCFObject1.Init(Nmax, k3gs_new, k3bs_new, k4gs_new, k4bs_new);
+//     SCFObject1.OriginPos = vec3(5,0,0);
+//     SCFObject1.OriginVel = vec3(0,0.1,0);
+
+    SCFObject2.Init(Nmax, k3gs_new, k3bs_new, k4gs_new, k4bs_new);
+    Particle p;
+    p.m = 0;
+    p.pos = vec3(-6.000000e-01 , 0 , 0);
+    p.vel = vec3(0 , -3.803629e-03 , 0);
+    Origins[0] = p;
+    p.pos = vec3(2.940000e+01 , 0 , 0);
+    p.vel = vec3(0 , 1.863778e-01 , 0);
+    Origins[1] = p;
+//     OriginsIntegrator = etics::Integrator(Origins, 2);
+    
+}
+
+void etics::multicenter::CalculateGravity(Particle *P, int N, Real *Potential, vec3 *F) {
+    Particle PP[2];
+    cudaMemcpy(PP, P+N-2, 2*sizeof(Particle), cudaMemcpyDeviceToHost);
+//     std::cerr << PP[0].pos.x << std::endl;
+//     std::cerr << PP[0].pos.y << std::endl;
+//     std::cerr << PP[0].pos.z << std::endl;
+//     std::cerr << PP[1].pos.x << std::endl;
+//     std::cerr << PP[1].pos.y << std::endl;
+//     std::cerr << PP[1].pos.z << std::endl;
+    
+    
+//     #error all we did last time was to add the offset to the subroutines, now the energy is nan. must be some kind of self force
+    
+    SCFObject1.GetGpuLock();
+    SCFObject1.SendCachePointersToGPU();
+    SCFObject1.LoadParticlesToCache(P, 800000, PP[0].pos);
+    SCFObject1.CalculateCoefficients();
+    SCFObject1.ReleaseGpuLock();
+    
+    
+    SCFObject2.GetGpuLock();
+    SCFObject2.SendCachePointersToGPU();
+    SCFObject2.LoadParticlesToCache(P+800000, 200000-2, PP[1].pos);
+    SCFObject2.CalculateCoefficients();
+    SCFObject2.ReleaseGpuLock();
+    
+    
+    SCFObject1.GetGpuLock();
+    SCFObject1.SendCoeffsToGPU();
+    SCFObject1.SendCachePointersToGPU();
+    SCFObject1.LoadParticlesToCache(P, N-2, PP[0].pos);
+    SCFObject1.CalculateGravityFromCoefficients(Potential_tmp, F_tmp);
+    SCFObject1.ReleaseGpuLock();
+    
+    SCFObject2.GetGpuLock();
+    SCFObject2.SendCoeffsToGPU();
+    SCFObject2.SendCachePointersToGPU();
+    SCFObject2.LoadParticlesToCache(P, N-2, PP[1].pos);
+    SCFObject2.CalculateGravityFromCoefficients(Potential_tmp2, F_tmp2);
+    SCFObject2.ReleaseGpuLock();
+    
+    // now deal just with the central particles
+    SCFObject1.GetGpuLock();
+    SCFObject1.SendCoeffsToGPU();
+    SCFObject1.SendCachePointersToGPU();
+    SCFObject1.LoadParticlesToCache(P+N-1, 1, PP[0].pos); //instead of doing that, you need to "push" it at the end of the list
+    SCFObject1.CalculateGravityFromCoefficients(Potential_tmp+N-1, F_tmp+N-1); // the last particle is the second center, so should feel the first's force
+    SCFObject1.ReleaseGpuLock();
+
+    SCFObject2.GetGpuLock();
+    SCFObject2.SendCoeffsToGPU();
+    SCFObject2.SendCachePointersToGPU();
+    SCFObject2.LoadParticlesToCache(P+N-2, 1, PP[1].pos);
+    SCFObject2.CalculateGravityFromCoefficients(Potential_tmp+N-2, F_tmp+N-2);
+    SCFObject2.ReleaseGpuLock();
+    
+    
+thrust::plus<Real> op;
+thrust::transform(thrust::device, Potential_tmp, Potential_tmp+N-2, Potential_tmp2, Potential_tmp, thrust::plus<Real>());
+thrust::transform(thrust::device, F_tmp, F_tmp+N-2, F_tmp2, F_tmp, thrust::plus<vec3>());
+
+/*    
+    vec3 FFF;
+    Real PPP;
+    SCFObject1.CalculateGravityAtPoint(vec3(100,0.001,0.001), &PPP, &FFF);
+    
+    std::cerr << PPP << std::endl;
+    std::cerr << FFF.x << std::endl;
+    std::cerr << FFF.y << std::endl;
+    std::cerr << FFF.z << std::endl;*/
+
+//     vec3 FFF;
+//     Real PPP;
+//     cudaMemcpy(&PPP, Potential_tmp+N-2, 1*sizeof(Real), cudaMemcpyDeviceToHost);
+//     cudaMemcpy(&FFF, F_tmp+N-2, 1*sizeof(vec3), cudaMemcpyDeviceToHost);
+//     std::cerr << "===" << std::endl;
+//     std::cerr << PPP << std::endl;
+//     std::cerr << FFF.x << std::endl;
+//     std::cerr << FFF.y << std::endl;
+//     std::cerr << FFF.z << std::endl;
+//     cudaMemcpy(&PPP, Potential_tmp+N-1, 1*sizeof(Real), cudaMemcpyDeviceToHost);
+//     cudaMemcpy(&FFF, F_tmp+N-1, 1*sizeof(vec3), cudaMemcpyDeviceToHost);
+//     std::cerr << "+++" << std::endl;
+//     std::cerr << PPP << std::endl;
+//     std::cerr << FFF.x << std::endl;
+//     std::cerr << FFF.y << std::endl;
+//     std::cerr << FFF.z << std::endl;
+
+    cudaMemcpy(Potential, Potential_tmp, N*sizeof(Real), cudaMemcpyDeviceToDevice);
+    cudaMemcpy(F,         F_tmp,         N*sizeof(vec3), cudaMemcpyDeviceToDevice);
+}
\ No newline at end of file
diff --git a/src/multicenter.hpp b/src/multicenter.hpp
new file mode 100644
index 0000000..4723798
--- /dev/null
+++ b/src/multicenter.hpp
@@ -0,0 +1,6 @@
+namespace etics {
+    namespace multicenter {
+        void Init(int Nmax, int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new);
+        void CalculateGravity(Particle *P, int N, Real *Potential, vec3 *F);
+    }
+}
\ No newline at end of file
diff --git a/src/noboost.inc b/src/noboost.inc
new file mode 100644
index 0000000..4735d28
--- /dev/null
+++ b/src/noboost.inc
@@ -0,0 +1,15 @@
+/*******************************************************************************
+* This is an ETICS parameter file used when the BOOST library is not available.
+* When changing a parameter, the code has to be recompiled; if using BOOST, this
+* is not required. The syntax is C++, just fill in the variables used in the
+* regular initialization file. No error messages will be shown if input is bad.
+*******************************************************************************/
+
+Filename     = "_nofile_ 0";
+N            = 1048576;
+Tcrit        = 16;
+StepSize     = 0.001953125;
+dT1          = 0.001953125;
+dT2          = 0.1;
+Prefix       = "test_del_me";
+OutputFormat = "ascii";
diff --git a/src/scf.cu b/src/scf.cu
new file mode 100644
index 0000000..5cbed0a
--- /dev/null
+++ b/src/scf.cu
@@ -0,0 +1,518 @@
+/**
+ * @file
+ * @author  Yohai Meiron <ymeiron@pku.edu.cn>
+ * @brief   Functions to calculate gravitational force using the SCF method.
+ */
+#include "common.hpp"
+#include "mathaux.hpp"
+#include "scf.hpp"
+
+#include <iostream>
+using std::cout;
+using std::cerr;
+using std::endl;
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cmath>
+#include <unistd.h>
+
+// Use the MPI flag only for the standalone program
+#ifdef ETICS_MPI
+#include <mpi.h>
+#endif
+
+namespace etics {
+    namespace scf {
+        scfclass SCFObject;
+
+        // These are truly global variables that are initialized once and can be read by every scfclass object.
+        bool ScfGloballyInitialized = false;
+        __constant__ Real RadCoeff[(NMAX+1)*(LMAX+1)];
+        __constant__ Real AngCoeff[(LMAX+1)*(LMAX+2)/2];
+                     Real RadCoeff_h[(NMAX+1)*(LMAX+1)];
+                     Real AngCoeff_h[(LMAX+1)*(LMAX+2)/2];
+        __constant__ Real LengthScale;
+                     Real LengthScale_h;
+
+        // These are global variables that are read/written by each scfclass object; they must be locked before use.
+        __constant__ Complex A[(NMAX+1)*(LMAX+1)*(LMAX+2)/2];
+        __constant__ CacheStruct Cache;
+
+        // These variables provide the mechanism to lock and release the global constant memory variables above.
+        scfclass *GpuLockOwner = NULL;
+        #define GPU_LOCK_PANIC \
+        { \
+            std::cerr << "Error: GPU lock failed; lock not owned by this object!" << endl; \
+            exit(1); \
+        }
+    }
+}
+
+namespace etics {
+        namespace scf {
+        int blockSizeToDynamicSMemSize(int BlockSize) { // Should be a lambda function
+            return (LMAX+1)*sizeof(Complex)*BlockSize;
+        }
+    }
+}
+
+// Class implementation
+etics::scf::scfclass::scfclass() {
+    N=0;
+    Nmax=0;
+    k3gs=-1; k3bs=-1; k4gs=-1; k4bs=-1;
+}
+
+etics::scf::scfclass::~scfclass() {
+    if (etics::scf::GpuLockOwner == this) ReleaseGpuLock();
+    cudaFree(Cache_h.xi);
+    cudaFree(Cache_h.Phi0l);
+    cudaFree(Cache_h.Wprev1);
+    cudaFree(Cache_h.Wprev2);
+    cudaFree(Cache_h.costheta);
+    cudaFree(Cache_h.sintheta_I);
+    cudaFree(Cache_h.Exponent);
+    cudaFree(Cache_h.mass);
+    free(PartialSum_h);
+    cudaFree(PartialSum);
+}
+
+void etics::scf::scfclass::GetGpuLock() {
+    if (etics::scf::GpuLockOwner == this) return;
+    if (etics::scf::GpuLockOwner == NULL) etics::scf::GpuLockOwner = this;
+    else GPU_LOCK_PANIC;
+}
+
+void etics::scf::scfclass::ReleaseGpuLock() {
+    if (etics::scf::GpuLockOwner == NULL) return;
+    if (etics::scf::GpuLockOwner == this) etics::scf::GpuLockOwner = NULL;
+    else GPU_LOCK_PANIC;
+}
+
+void etics::scf::scfclass::CalculateCoefficients(int n, int l) {
+    int BaseAddress = n*(LMAX+1)*(LMAX+2)/2 + l*(l+1)/2;
+    etics::scf::CalculateCoefficientsPartial<<<k3gs,k3bs,k3bs*sizeof(Complex)*(LMAX+1)>>>(N, n, l, PartialSum);
+    cudaMemcpy(PartialSum_h, PartialSum, k3gs*(l+1)*sizeof(Complex), cudaMemcpyDeviceToHost);
+    for (int m = 0; m <= l; m++)
+        for (int Block=0; Block<k3gs; Block++)
+            A_h[BaseAddress + m] = Complex_add(A_h[BaseAddress + m], PartialSum_h[Block*(l+1) + m]);
+}
+
+void etics::scf::scfclass::CalculateCoefficients() {
+    memset(A_h, 0, (NMAX+1)*(LMAX+1)*(LMAX+2)/2 * sizeof(Complex));
+    for (int l = 0; l <= LMAX; l++) {
+        if (l > 0) etics::scf::CalculatePhi0l<<<128,128>>>(N, l); // wouldn't it make sense just putting it after the n-loop finishes? Probably not becasue then we need to skip at the last iter
+        for (int n = 0; n <= NMAX; n++) {
+            CalculateCoefficients(n, l);
+        }
+    }
+}
+
+void etics::scf::scfclass::GetCoefficients(Complex *A) {
+    std::copy(A_h, A_h+(NMAX+1)*(LMAX+1)*(LMAX+2)/2, A);
+}
+
+
+void etics::scf::scfclass::SendCoeffsToGPU() {
+    if (etics::scf::GpuLockOwner != this) GPU_LOCK_PANIC;
+    cudaMemcpyToSymbol(etics::scf::A, A_h, (NMAX+1)*(LMAX+1)*(LMAX+2)/2 * sizeof(Complex));
+}
+
+void etics::scf::scfclass::SendCachePointersToGPU() {
+    if (etics::scf::GpuLockOwner != this) GPU_LOCK_PANIC;
+    cudaMemcpyToSymbol(etics::scf::Cache, &Cache_h, sizeof(etics::scf::CacheStruct));
+}
+
+void etics::scf::scfclass::LoadParticlesToCache(Particle *P, int N, vec3 Offset) {
+    if (N > Nmax) {
+        fprintf(stderr, "You tried to load too many particles! (%d > %d)", N, Nmax);
+        exit(1);
+    }
+    this->N = N;
+    etics::scf::LoadParticlesToCache<<<128,128>>>(P, N, Offset);
+}
+
+void etics::scf::scfclass::CalculateGravityFromCoefficients(Real *Potential, vec3 *F) {
+    etics::scf::CalculateGravityFromCoefficients<<<(N+k4bs-1)/k4bs,k4bs>>>(N, Potential, F);
+}
+
+void etics::scf::scfclass::CalculateGravityAtPoint(vec3 Pos, Real *Potential, vec3 *F) {
+    static bool first = true;
+    if (first) {
+        printf("WARNING!!! the code has been modified and CalculateGravityAtPoint may not work as planned.\n");
+        first = false;
+    }
+    Real r = sqrt(Pos.x*Pos.x + Pos.y*Pos.y + Pos.z*Pos.z);
+    Real xi = (r-1)/(r+1);
+    Real costheta = Pos.z/r;
+    Real Normal_I = rsqrt(Pos.x*Pos.x + Pos.y*Pos.y);
+    Complex Exponent = make_Complex(Pos.x*Normal_I, Pos.y*Normal_I);
+
+    etics::scf::CalculateGravityTemplate<0>(xi, costheta, Exponent, A_h, F, Potential);
+}
+
+void etics::scf::scfclass::CalculateGravity(Particle *P, int N, Real *Potential, vec3 *F) {
+    GetGpuLock();
+    SendCachePointersToGPU();
+    LoadParticlesToCache(P, N);
+    CalculateCoefficients();
+#ifdef ETICS_MPI
+    Complex ATotal[(NMAX+1)*(LMAX+1)*(LMAX+2)/2];
+    MPI_Allreduce(&A_h, &ATotal, (NMAX+1)*(LMAX+1)*(LMAX+2)/2*2, MPI_ETICS_REAL, MPI_SUM, MPI_COMM_WORLD);
+    std::copy(ATotal, ATotal+(NMAX+1)*(LMAX+1)*(LMAX+2)/2, A_h);
+    // not really need this copy, just calculate the coefficients in some local array, then sum it into a global array (A_h or somthing) and copy it to GPUs
+#endif
+    SendCoeffsToGPU();
+    CalculateGravityFromCoefficients(Potential, F);
+    ReleaseGpuLock();
+}
+
+void etics::scf::scfclass::SetLaunchConfiguration(int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new) {
+    if (k3gs_new != k3gs) {
+        if (PartialSum_h != NULL) free(PartialSum_h);
+        if (PartialSum   != NULL) cudaFree(PartialSum_h);
+        PartialSum_h = (Complex*)malloc(k3gs_new*(LMAX+1)*sizeof(Complex));
+        cudaMalloc((void**)&PartialSum, k3gs_new*(LMAX+1)*sizeof(Complex));
+    }
+    k3gs = k3gs_new;
+    k3bs = k3bs_new;
+    k4gs = k4gs_new;
+    k4bs = k4bs_new;
+}
+
+void etics::scf::scfclass::Init(int Nmax, int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new) {
+    // First, initialize global arrays if not already initialized.
+    if (!etics::scf::ScfGloballyInitialized) etics::scf::GlobalInit();
+
+    // Next, set the launch configuation (either guess it or use what the user specified).
+    if ((k3gs_new<=0) || (k3bs_new<=0) || (k4gs_new<=0) || (k4bs_new<=0)) {
+        int k3gs_new, k3bs_new, k4gs_new, k4bs_new;
+        etics::scf::GuessLaunchConfiguration(Nmax, &k3gs_new, &k3bs_new, &k4gs_new, &k4bs_new);
+        SetLaunchConfiguration(k3gs_new, k3bs_new, k4gs_new, k4bs_new);
+    }
+    else SetLaunchConfiguration(k3gs_new, k3bs_new, k4gs_new, k4bs_new);
+
+    // If Nmax > 0 then initialize the cahche memory.
+    if (Nmax > 0) {
+        this->Nmax = Nmax;
+        cudaMalloc((void**)&Cache_h.xi,         Nmax * sizeof(Real));
+        cudaMalloc((void**)&Cache_h.Phi0l,      Nmax * sizeof(Real));
+        cudaMalloc((void**)&Cache_h.Wprev1,     Nmax * sizeof(Real));
+        cudaMalloc((void**)&Cache_h.Wprev2,     Nmax * sizeof(Real));
+        cudaMalloc((void**)&Cache_h.costheta,   Nmax * sizeof(Real));
+        cudaMalloc((void**)&Cache_h.sintheta_I, Nmax * sizeof(Real));
+        cudaMalloc((void**)&Cache_h.Exponent,   Nmax * sizeof(Complex));
+        cudaMalloc((void**)&Cache_h.mass,       Nmax * sizeof(Real));
+    } else {
+        Cache_h.xi         = NULL;
+        Cache_h.Phi0l      = NULL;
+        Cache_h.Wprev1     = NULL;
+        Cache_h.Wprev2     = NULL;
+        Cache_h.costheta   = NULL;
+        Cache_h.sintheta_I = NULL;
+        Cache_h.Exponent   = NULL;
+        Cache_h.mass       = NULL;
+    }
+}
+
+// Kernels
+
+__global__ void etics::scf::LoadParticlesToCache(Particle *P, int N, vec3 Offset) { // formerly "Kernel1"
+    int i = threadIdx.x + blockIdx.x *  blockDim.x;
+    while (i < N) {
+        vec3 Pos = (P[i].pos - Offset)*etics::scf::LengthScale;
+        Real r = sqrt(Pos.x*Pos.x + Pos.y*Pos.y + Pos.z*Pos.z);
+        Real xi = (r-1)/(r+1);
+        Real costheta = Pos.z/r;
+        Real sintheta_I = rsqrt(1-costheta*costheta);
+
+        Cache.xi[i] = xi;
+        Cache.Phi0l[i] = 0.5 * (1 - xi);
+        Cache.costheta[i] = costheta;
+        Cache.sintheta_I[i] = sintheta_I;
+
+        Real Normal_I = rsqrt(Pos.x*Pos.x + Pos.y*Pos.y);
+        Complex Exponent = make_Complex(Pos.x*Normal_I, -Pos.y*Normal_I);
+        Cache.Exponent[i] = Exponent;
+        Cache.mass[i] = P[i].m;
+
+        i += blockDim.x * gridDim.x;
+    }
+}
+
+__global__ void etics::scf::CalculatePhi0l(int N, int l) { // formerly "Kernel2"
+    int i = threadIdx.x + blockIdx.x *  blockDim.x;
+    while (i < N) {
+        Real xi = Cache.xi[i];
+        Cache.Phi0l[i] *= 0.25*(1-xi*xi);
+
+        i += blockDim.x * gridDim.x;
+    }
+}
+
+__global__ void etics::scf::CalculateCoefficientsPartial(int N, int n, int l, Complex *PartialSum) { // formerly "Kernel3"
+    extern __shared__ Complex ReductionCache[]; // size is determined in kernel launch
+    int tid = threadIdx.x;
+    for (int m = 0; m <= l; m++) ReductionCache[m*blockDim.x+tid] = make_Complex(0, 0);
+    int i = threadIdx.x + blockIdx.x * blockDim.x;
+    while (i < N) {
+        Real xi = Cache.xi[i];
+        Real Wnl;
+        if (n == 0)      Wnl = 1;
+        else if (n == 1) {Wnl = (4*l+3)*xi; Cache.Wprev2[i] = Wnl;}
+        else if (n == 2) {Wnl = -(2*l+1.5)+( 8*l*(l+2) +7.5)*xi*xi; Cache.Wprev1[i] = Wnl;}
+        else {
+            Real Wprev1 = Cache.Wprev1[i];
+            Wnl = (xi*(2*n+4*l+1)*Wprev1 - (n+4*l+1)*Cache.Wprev2[i])/(Real)n;
+            if (n < NMAX) { // Writing is expensive, avoid if possible.
+                Cache.Wprev2[i] = Wprev1;
+                Cache.Wprev1[i] = Wnl;
+            }
+        }
+        Real RadialPart = - Cache.mass[i] * SQRT_4_PI * Cache.Phi0l[i] * Wnl * RadCoeff[(LMAX+1)*n+l];
+        Real costheta = Cache.costheta[i];
+        Real Plm = Pl(l, costheta);
+        ReductionCache[tid] = Complex_add(ReductionCache[tid], make_Complex(RadialPart * Plm * AngCoeff[(l+1)*l/2],0));
+        if (l == 0) {i += blockDim.x * gridDim.x; continue;}
+
+        //////////////////////////////// ugly fix
+        if ((costheta < -0.999) || (costheta > +0.999)) {
+            i += blockDim.x * gridDim.x;
+            continue;
+        }
+        //////////////////////////////// ugly fix
+
+        Real Plm_prev1 = Plm;
+        Real sintheta_I = Cache.sintheta_I[i];
+        Plm = (costheta*Plm - Pl(l-1, costheta))*l*sintheta_I;
+        Complex Exponent = Cache.Exponent[i];
+        Real tmp0 = RadialPart * Plm * AngCoeff[(l+1)*l/2+1];
+        ReductionCache[blockDim.x+tid] = Complex_add(ReductionCache[blockDim.x+tid], make_Complex(tmp0 * Exponent.x, tmp0 * Exponent.y));
+
+        if (l == 1) {i += blockDim.x * gridDim.x; continue;}
+
+        Complex TorodialPart = Exponent;
+        for (int m = 2; m <= l; m++) { // make sure no redundancy at the end of the loop
+            Real Plm_prev2 = Plm_prev1;
+            Plm_prev1 = Plm;
+            Plm = - 2*(m-1)*costheta*sintheta_I*Plm_prev1 - (l+m-1)*(l-m+2)*Plm_prev2;
+            TorodialPart = Complex_mul(TorodialPart, Exponent);
+            tmp0 = RadialPart * Plm * AngCoeff[(l+1)*l/2+m];
+            ReductionCache[m*blockDim.x+tid] = Complex_add(ReductionCache[m*blockDim.x+tid], make_Complex(tmp0 * TorodialPart.x, tmp0 * TorodialPart.y));
+        }
+        i += blockDim.x * gridDim.x;
+    }
+    __syncthreads();
+    for (int m = 0; m <= l; m++) {
+        i = blockDim.x/2;
+        while (i != 0) {
+            if (tid < i)
+                ReductionCache[m*blockDim.x+tid] = Complex_add(ReductionCache[m*blockDim.x+tid], ReductionCache[m*blockDim.x+tid+i]);
+            __syncthreads();
+            i /= 2;
+        }
+        if (tid == 0)
+            PartialSum[blockIdx.x*(l+1) + m] = ReductionCache[m*blockDim.x];
+    }
+}
+
+template<int Mode>
+__host__ __device__ void etics::scf::CalculateGravityTemplate(Real xi, Real costheta, Complex Exponent, Complex *A, vec3 *F, Real *Potential) {
+// it gets A as parameter because it can be either on host or device
+// 0 = both force and potential, 1 = only force, 2 = only pot
+//WARNING !!! This cannot really be a host function because it needs device cahce, angular coefficients which are on device !!!
+#define A(n,l,m) A[n*(LMAX+1)*(LMAX+2)/2 + l*(l+1)/2 + m]
+#ifndef  __CUDA_ARCH__
+    #define AngCoeff AngCoeff_h
+    #define LengthScale LengthScale_h
+#endif
+    Real dPhiLeft;
+    Real dPhiLeftMul;
+    Real dPhiRight;
+    Real dPhiRightAdd;
+    Real dPhi;
+    Real RadialPart2;
+    Real PlmDerivTheta;
+
+    Real Pot = 0;
+    Real Fr = 0, Ftheta = 0, Fphi = 0;
+    Real OneOverXiPlusOne = 1/(1+xi);
+    Real r_I = (1-xi)*OneOverXiPlusOne;
+    Real r = 1/r_I; // It's quite likely we can do everything without r.
+    Real sintheta_I = rsqrt(1-costheta*costheta); // faster than using cachei // You sure??? in K3 it's the opposite
+
+    Complex ExponentTmp[LMAX];
+    ExponentTmp[0] = Exponent;
+    for (int m = 1; m < LMAX; m++) ExponentTmp[m] = Complex_mul(ExponentTmp[m-1],Exponent);
+    if (Mode != 2) {
+        Real xi2 = xi*xi;
+        Real xi3 = xi2*xi;
+        dPhiLeft = -0.25*OneOverXiPlusOne;
+        dPhiLeftMul = 0.25*(1-xi2);
+        dPhiRight = xi3 - xi2 - xi + 1;
+        dPhiRightAdd = 2*(xi3 - 2*xi2 + xi);
+    }
+    Real Phi0l = 1/(1+r);
+    Real tmp1 = Phi0l*Phi0l*r;
+    for (int l = 0; l <= LMAX; l++) {
+        if (Mode != 2) {
+            if (l > 0) {
+                dPhiLeft  *= dPhiLeftMul;
+                dPhiRight += dPhiRightAdd;
+            }
+        }
+        if (Mode != 2) dPhi = dPhiLeft * dPhiRight;
+        for (int n = 0; n <= NMAX; n++) {
+            Real Wnl, Wprev1, Wprev2;
+            if (n == 0)      Wnl = 1;
+            else if (n == 1) {Wnl = (4*l+3)*xi; Wprev2 = Wnl;}
+            else if (n == 2) {Wnl = -(2*l+1.5)+( 8*l*(l+2) +7.5)*xi*xi; Wprev1 = Wnl;}
+            else {
+                Wnl = (xi*(2*n+4*l+1)*Wprev1 - (n+4*l+1)*Wprev2)/(Real)n;
+                Wprev2 = Wprev1;
+                Wprev1 = Wnl;
+            }
+
+            Real Wderiv = 0;
+            if (n == 1) {Wderiv = 4*l + 3;}
+            else if (n > 1) {
+                Wderiv = (-n*xi*Wnl + (n+4*l+2)*Wprev2)/(1-xi*xi);
+            } // From an unknown reason it's faster to have this Block separate from the previous one.
+
+            Real RadialPart  = - SQRT_4_PI * Phi0l * Wnl;
+            if (Mode != 2) RadialPart2 = SQRT_4_PI * (dPhi*Wnl + Phi0l*Wderiv*2/pow(1+r,2));
+
+            Real Plm = Pl(l, costheta);
+            Real tmp2 = Complex_real(A(n,l,0)) * AngCoeff[(l+1)*l/2] * Plm;
+            if (Mode != 1) Pot += RadialPart  * tmp2;
+            if (Mode != 2) Fr  += RadialPart2 * tmp2;
+
+            if (l == 0) continue;
+
+            //////////////////////////////// ugly fix
+            if ((costheta < -0.999) || (costheta > +0.999)) {
+                continue;
+            }
+            //////////////////////////////// ugly fix
+
+            // The Block below is l>=1, m=0.
+            if (Mode != 2) {
+                PlmDerivTheta = (costheta*Plm - Pl(l-1, costheta))*l*sintheta_I; //TODO check if storing Pl(l-1) somewhere makes it faster
+                Ftheta += - PlmDerivTheta * AngCoeff[(l+1)*l/2] * Complex_real(A(n,l,0)) * RadialPart * r_I;
+            }
+
+            // The Block below is l>=1, m=1.
+            if (Mode == 2) PlmDerivTheta = (costheta*Plm - Pl(l-1, costheta))*l*sintheta_I; //TODO see above regarding storing Pl(l-1)
+            Real Plm_prev1 = Plm;
+            Plm = PlmDerivTheta; // PlmDerivTheta equals Plm for m=1.
+            if (Mode != 2) PlmDerivTheta = - Plm*costheta*sintheta_I - l*(l+1)*Plm_prev1;
+            tmp2 = 2 * AngCoeff[(l+1)*l/2+1];
+            Complex tmp3 = Complex_mul(ExponentTmp[0], A(n,l,1));
+            Complex tmp4 = make_Complex(tmp2 * tmp3.x, tmp2 * tmp3.y);
+            Complex tmp5 = make_Complex(Plm *  tmp4.x, Plm *  tmp4.y);
+            Complex tmp6 = make_Complex(RadialPart * tmp5.x, RadialPart * tmp5.y);
+            if (Mode != 1) Pot += Complex_real(tmp6);
+            if (Mode != 2) {
+                Fr +=       RadialPart2 * Complex_real(tmp5);
+                Fphi +=     Complex_imag(tmp6) * sintheta_I * r_I;
+                Ftheta += - RadialPart * PlmDerivTheta *  Complex_real(tmp4) * r_I;
+            }
+
+            if (l == 1) continue;
+
+            for (int m = 2; m <= l; m++) {
+                Real Plm_prev2 = Plm_prev1;
+                Plm_prev1 = Plm;
+                Plm = - 2*(m-1)*costheta*sintheta_I*Plm_prev1 - (l+m-1)*(l-m+2)*Plm_prev2;
+                tmp2 = 2 * AngCoeff[(l+1)*l/2+m];
+                tmp3 = Complex_mul(ExponentTmp[m-1], A(n,l,m));
+                tmp4 = make_Complex(tmp2 * tmp3.x, tmp2 * tmp3.y);
+                tmp5 = make_Complex(Plm *  tmp4.x, Plm *  tmp4.y);
+                tmp6 = make_Complex(RadialPart * tmp5.x, RadialPart * tmp5.y);
+                if (Mode != 1) Pot  += Complex_real(tmp6);
+                if (Mode != 2) {
+                    PlmDerivTheta = - m*Plm*costheta*sintheta_I - (l+m)*(l-m+1)*Plm_prev1;
+                    Fr +=       RadialPart2 * Complex_real(tmp5);
+                    Fphi +=     m * Complex_imag(tmp6) * sintheta_I * r_I;
+                    Ftheta += - RadialPart * PlmDerivTheta *  Complex_real(tmp4) * r_I;
+                }
+            }
+        }
+        Phi0l *= tmp1;
+    }
+
+    if (Mode != 2) {
+        Real sintheta = 1/sintheta_I;
+        Real tanphi = Exponent.y/Exponent.x;
+        Real cosphi = ((Exponent.x >= 0)?(+1):(-1)) * rsqrt(1+tanphi*tanphi); // no simpler way to get sign bit?
+        Real sinphi = tanphi*cosphi;
+        *F = vec3(sintheta*cosphi*Fr + costheta*cosphi*Ftheta - sinphi*Fphi, sintheta*sinphi*Fr + costheta*sinphi*Ftheta + cosphi*Fphi,   costheta*Fr - sintheta*Ftheta);
+        *F = (*F)*LengthScale*LengthScale;
+    }
+    if (Mode != 1) *Potential = Pot*LengthScale;
+#ifndef  __CUDA_ARCH__
+    #undef AngCoeff
+    #undef LengthScale
+#endif
+#undef A
+}
+
+__global__ void etics::scf::CalculateGravityFromCoefficients(int N, Real *Potential, vec3 *F) { // formerly "Kernel4"
+    int i = threadIdx.x + blockIdx.x * blockDim.x;
+    while (i < N) {
+        CalculateGravityTemplate<0>(Cache.xi[i], Cache.costheta[i], Complex_conj(Cache.Exponent[i]), A, &F[i], &Potential[i]);
+        i += blockDim.x * gridDim.x;
+    }
+}
+
+// Global initialization
+void etics::scf::GlobalInit() {
+    int pid = getpid();
+    int devid;
+    cudaGetDevice(&devid);
+    char hostname[256];
+    gethostname(hostname, 256);
+    char output_string[256];
+    sprintf(output_string, "ETICS: pid %05d using GPU device number %d on %s", pid, devid, hostname);
+    std::cout << output_string << std::endl;
+
+    RadialCoefficients(etics::scf::RadCoeff_h);
+    cudaMemcpyToSymbol(etics::scf::RadCoeff, etics::scf::RadCoeff_h, (NMAX+1)*(LMAX+1) * sizeof(Real));
+    AngularCoefficients(etics::scf::AngCoeff_h);
+    cudaMemcpyToSymbol(etics::scf::AngCoeff, etics::scf::AngCoeff_h, (LMAX+1)*(LMAX+2)/2 * sizeof(Real));
+    etics::scf::LengthScale_h = 1;
+    cudaMemcpyToSymbol(etics::scf::LengthScale, &etics::scf::LengthScale_h, sizeof(Real));
+    ScfGloballyInitialized = true;
+}
+
+void etics::scf::SetLengthScale(Real NewLengthScale) {
+    etics::scf::LengthScale_h = NewLengthScale;
+    cudaMemcpyToSymbol(etics::scf::LengthScale, &etics::scf::LengthScale_h, sizeof(Real));
+}
+
+// Old API
+void etics::scf::GuessLaunchConfiguration(int N, int *k3gs_new, int *k3bs_new, int *k4gs_new, int *k4bs_new) {
+    int blockSize;
+    int minGridSize;
+    int gridSize;
+//WARNING setting blockSizeLimit=128 for cudaOccupancyMaxPotentialBlockSizeVariableSMem.
+    cudaOccupancyMaxPotentialBlockSizeVariableSMem(&minGridSize, &blockSize, CalculateCoefficientsPartial, blockSizeToDynamicSMemSize, 128);
+    gridSize = minGridSize;
+    *k3gs_new = gridSize;
+    *k3bs_new = blockSize;
+
+    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, CalculateGravityFromCoefficients, 0, N);
+    gridSize = (N + blockSize - 1) / blockSize;
+    *k4gs_new = gridSize;
+    *k4bs_new = blockSize;
+}
+
+void etics::scf::Init(int N, int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new) { // to be removed!!
+    SCFObject.Init(N, k3gs_new, k3bs_new, k4gs_new, k4bs_new);
+}
+
+void etics::scf::CalculateGravity(Particle *P, int N, Real *Potential, vec3 *F) {
+    SCFObject.CalculateGravity(P, N, Potential, F);
+}
diff --git a/src/scf.hpp b/src/scf.hpp
new file mode 100644
index 0000000..87e902d
--- /dev/null
+++ b/src/scf.hpp
@@ -0,0 +1,61 @@
+#pragma once
+#include "common.hpp"
+#include "mathaux.hpp"
+namespace etics {
+    namespace scf {
+        // CUDA Kernels and device functions cannot be in class space
+        __global__ void LoadParticlesToCache(Particle *P, int N, vec3 Offset=vec3(0,0,0));
+        __global__ void CalculatePhi0l(int N, int l);
+        __global__ void CalculateCoefficientsPartial(int N, int n, int l, Complex *PartialSum);
+        template<int Mode> __host__ __device__ void CalculateGravityTemplate(Real xi, Real costheta, Complex Exponent, Complex *A, vec3 *F, Real *Potential);
+        __global__ void CalculateGravityFromCoefficients(int N, Real *Potential, vec3 *F);
+
+        void CalculateGravity(Particle *P, int N, Real *Potential, vec3 *F);
+        void Init(int Nmax, int k3gs_new=0, int k3bs_new=0, int k4gs_new=0, int k4bs_new=0); // to be removed!
+        void GlobalInit();
+        void GuessLaunchConfiguration(int N, int *k3gs_new, int *k3bs_new, int *k4gs_new, int *k4bs_new);
+        void SetLengthScale(Real NewLengthScale);
+
+        struct CacheStruct {
+            Real *xi;
+            Real *Phi0l;
+            Real *Wprev1;
+            Real *Wprev2;
+            Real *costheta;
+            Real *sintheta_I;
+            Complex *Exponent;
+            Real *mass;
+        };
+
+        class scfclass {
+          public:
+            scfclass();
+            ~scfclass();
+            void SendCachePointersToGPU();
+            void SendCoeffsToGPU();
+            void LoadParticlesToCache(Particle *P, int N, vec3 Offset=vec3(0,0,0));
+            void CalculateCoefficients();
+            void CalculateGravityFromCoefficients(Real *Potential, vec3 *F);
+            void CalculateGravity(Particle *P, int N, Real *Potential, vec3 *F);
+            void CalculateGravityAtPoint(vec3 Pos, Real *Potential, vec3 *F);
+            void Init(int N, int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new);
+            void SetLaunchConfiguration(int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new);
+            void GetCoefficients(Complex *A);
+            void GetGpuLock();
+            void ReleaseGpuLock();
+//WARNING: the following two functions are not implemented but they should be
+            int GetNMAX();
+            int GetLMAX();
+//TODO We'll make them private in the final version, now we need access for debugging
+            int N;
+            int Nmax;
+//WARNING we just hardcode 1024 as the maximum size of the coefficient structure. This is because having NMAX and LMAX here causes problems.
+            Complex A_h[1024];
+            CacheStruct Cache_h;
+            Complex *PartialSum = NULL;
+            Complex *PartialSum_h = NULL;
+            int k3gs, k3bs, k4gs, k4bs;
+            void CalculateCoefficients(int n, int l);
+        };
+    }
+}