added project files

2019-08-25 16:08:44 +08:00 · 2019-08-25 16:08:44 +08:00 · 1718e6287b
commit 1718e6287b
24 changed files with 3045 additions and 0 deletions
--- a/src/scf.cu
+++ b/src/scf.cu
@ -0,0 +1,518 @@
+/**
+ * @file
+ * @author  Yohai Meiron <ymeiron@pku.edu.cn>
+ * @brief   Functions to calculate gravitational force using the SCF method.
+ */
+#include "common.hpp"
+#include "mathaux.hpp"
+#include "scf.hpp"
+
+#include <iostream>
+using std::cout;
+using std::cerr;
+using std::endl;
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cmath>
+#include <unistd.h>
+
+// Use the MPI flag only for the standalone program
+#ifdef ETICS_MPI
+#include <mpi.h>
+#endif
+
+namespace etics {
+    namespace scf {
+        scfclass SCFObject;
+
+        // These are truly global variables that are initialized once and can be read by every scfclass object.
+        bool ScfGloballyInitialized = false;
+        __constant__ Real RadCoeff[(NMAX+1)*(LMAX+1)];
+        __constant__ Real AngCoeff[(LMAX+1)*(LMAX+2)/2];
+                     Real RadCoeff_h[(NMAX+1)*(LMAX+1)];
+                     Real AngCoeff_h[(LMAX+1)*(LMAX+2)/2];
+        __constant__ Real LengthScale;
+                     Real LengthScale_h;
+
+        // These are global variables that are read/written by each scfclass object; they must be locked before use.
+        __constant__ Complex A[(NMAX+1)*(LMAX+1)*(LMAX+2)/2];
+        __constant__ CacheStruct Cache;
+
+        // These variables provide the mechanism to lock and release the global constant memory variables above.
+        scfclass *GpuLockOwner = NULL;
+        #define GPU_LOCK_PANIC \
+        { \
+            std::cerr << "Error: GPU lock failed; lock not owned by this object!" << endl; \
+            exit(1); \
+        }
+    }
+}
+
+namespace etics {
+        namespace scf {
+        int blockSizeToDynamicSMemSize(int BlockSize) { // Should be a lambda function
+            return (LMAX+1)*sizeof(Complex)*BlockSize;
+        }
+    }
+}
+
+// Class implementation
+etics::scf::scfclass::scfclass() {
+    N=0;
+    Nmax=0;
+    k3gs=-1; k3bs=-1; k4gs=-1; k4bs=-1;
+}
+
+etics::scf::scfclass::~scfclass() {
+    if (etics::scf::GpuLockOwner == this) ReleaseGpuLock();
+    cudaFree(Cache_h.xi);
+    cudaFree(Cache_h.Phi0l);
+    cudaFree(Cache_h.Wprev1);
+    cudaFree(Cache_h.Wprev2);
+    cudaFree(Cache_h.costheta);
+    cudaFree(Cache_h.sintheta_I);
+    cudaFree(Cache_h.Exponent);
+    cudaFree(Cache_h.mass);
+    free(PartialSum_h);
+    cudaFree(PartialSum);
+}
+
+void etics::scf::scfclass::GetGpuLock() {
+    if (etics::scf::GpuLockOwner == this) return;
+    if (etics::scf::GpuLockOwner == NULL) etics::scf::GpuLockOwner = this;
+    else GPU_LOCK_PANIC;
+}
+
+void etics::scf::scfclass::ReleaseGpuLock() {
+    if (etics::scf::GpuLockOwner == NULL) return;
+    if (etics::scf::GpuLockOwner == this) etics::scf::GpuLockOwner = NULL;
+    else GPU_LOCK_PANIC;
+}
+
+void etics::scf::scfclass::CalculateCoefficients(int n, int l) {
+    int BaseAddress = n*(LMAX+1)*(LMAX+2)/2 + l*(l+1)/2;
+    etics::scf::CalculateCoefficientsPartial<<<k3gs,k3bs,k3bs*sizeof(Complex)*(LMAX+1)>>>(N, n, l, PartialSum);
+    cudaMemcpy(PartialSum_h, PartialSum, k3gs*(l+1)*sizeof(Complex), cudaMemcpyDeviceToHost);
+    for (int m = 0; m <= l; m++)
+        for (int Block=0; Block<k3gs; Block++)
+            A_h[BaseAddress + m] = Complex_add(A_h[BaseAddress + m], PartialSum_h[Block*(l+1) + m]);
+}
+
+void etics::scf::scfclass::CalculateCoefficients() {
+    memset(A_h, 0, (NMAX+1)*(LMAX+1)*(LMAX+2)/2 * sizeof(Complex));
+    for (int l = 0; l <= LMAX; l++) {
+        if (l > 0) etics::scf::CalculatePhi0l<<<128,128>>>(N, l); // wouldn't it make sense just putting it after the n-loop finishes? Probably not becasue then we need to skip at the last iter
+        for (int n = 0; n <= NMAX; n++) {
+            CalculateCoefficients(n, l);
+        }
+    }
+}
+
+void etics::scf::scfclass::GetCoefficients(Complex *A) {
+    std::copy(A_h, A_h+(NMAX+1)*(LMAX+1)*(LMAX+2)/2, A);
+}
+
+
+void etics::scf::scfclass::SendCoeffsToGPU() {
+    if (etics::scf::GpuLockOwner != this) GPU_LOCK_PANIC;
+    cudaMemcpyToSymbol(etics::scf::A, A_h, (NMAX+1)*(LMAX+1)*(LMAX+2)/2 * sizeof(Complex));
+}
+
+void etics::scf::scfclass::SendCachePointersToGPU() {
+    if (etics::scf::GpuLockOwner != this) GPU_LOCK_PANIC;
+    cudaMemcpyToSymbol(etics::scf::Cache, &Cache_h, sizeof(etics::scf::CacheStruct));
+}
+
+void etics::scf::scfclass::LoadParticlesToCache(Particle *P, int N, vec3 Offset) {
+    if (N > Nmax) {
+        fprintf(stderr, "You tried to load too many particles! (%d > %d)", N, Nmax);
+        exit(1);
+    }
+    this->N = N;
+    etics::scf::LoadParticlesToCache<<<128,128>>>(P, N, Offset);
+}
+
+void etics::scf::scfclass::CalculateGravityFromCoefficients(Real *Potential, vec3 *F) {
+    etics::scf::CalculateGravityFromCoefficients<<<(N+k4bs-1)/k4bs,k4bs>>>(N, Potential, F);
+}
+
+void etics::scf::scfclass::CalculateGravityAtPoint(vec3 Pos, Real *Potential, vec3 *F) {
+    static bool first = true;
+    if (first) {
+        printf("WARNING!!! the code has been modified and CalculateGravityAtPoint may not work as planned.\n");
+        first = false;
+    }
+    Real r = sqrt(Pos.x*Pos.x + Pos.y*Pos.y + Pos.z*Pos.z);
+    Real xi = (r-1)/(r+1);
+    Real costheta = Pos.z/r;
+    Real Normal_I = rsqrt(Pos.x*Pos.x + Pos.y*Pos.y);
+    Complex Exponent = make_Complex(Pos.x*Normal_I, Pos.y*Normal_I);
+
+    etics::scf::CalculateGravityTemplate<0>(xi, costheta, Exponent, A_h, F, Potential);
+}
+
+void etics::scf::scfclass::CalculateGravity(Particle *P, int N, Real *Potential, vec3 *F) {
+    GetGpuLock();
+    SendCachePointersToGPU();
+    LoadParticlesToCache(P, N);
+    CalculateCoefficients();
+#ifdef ETICS_MPI
+    Complex ATotal[(NMAX+1)*(LMAX+1)*(LMAX+2)/2];
+    MPI_Allreduce(&A_h, &ATotal, (NMAX+1)*(LMAX+1)*(LMAX+2)/2*2, MPI_ETICS_REAL, MPI_SUM, MPI_COMM_WORLD);
+    std::copy(ATotal, ATotal+(NMAX+1)*(LMAX+1)*(LMAX+2)/2, A_h);
+    // not really need this copy, just calculate the coefficients in some local array, then sum it into a global array (A_h or somthing) and copy it to GPUs
+#endif
+    SendCoeffsToGPU();
+    CalculateGravityFromCoefficients(Potential, F);
+    ReleaseGpuLock();
+}
+
+void etics::scf::scfclass::SetLaunchConfiguration(int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new) {
+    if (k3gs_new != k3gs) {
+        if (PartialSum_h != NULL) free(PartialSum_h);
+        if (PartialSum   != NULL) cudaFree(PartialSum_h);
+        PartialSum_h = (Complex*)malloc(k3gs_new*(LMAX+1)*sizeof(Complex));
+        cudaMalloc((void**)&PartialSum, k3gs_new*(LMAX+1)*sizeof(Complex));
+    }
+    k3gs = k3gs_new;
+    k3bs = k3bs_new;
+    k4gs = k4gs_new;
+    k4bs = k4bs_new;
+}
+
+void etics::scf::scfclass::Init(int Nmax, int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new) {
+    // First, initialize global arrays if not already initialized.
+    if (!etics::scf::ScfGloballyInitialized) etics::scf::GlobalInit();
+
+    // Next, set the launch configuation (either guess it or use what the user specified).
+    if ((k3gs_new<=0) || (k3bs_new<=0) || (k4gs_new<=0) || (k4bs_new<=0)) {
+        int k3gs_new, k3bs_new, k4gs_new, k4bs_new;
+        etics::scf::GuessLaunchConfiguration(Nmax, &k3gs_new, &k3bs_new, &k4gs_new, &k4bs_new);
+        SetLaunchConfiguration(k3gs_new, k3bs_new, k4gs_new, k4bs_new);
+    }
+    else SetLaunchConfiguration(k3gs_new, k3bs_new, k4gs_new, k4bs_new);
+
+    // If Nmax > 0 then initialize the cahche memory.
+    if (Nmax > 0) {
+        this->Nmax = Nmax;
+        cudaMalloc((void**)&Cache_h.xi,         Nmax * sizeof(Real));
+        cudaMalloc((void**)&Cache_h.Phi0l,      Nmax * sizeof(Real));
+        cudaMalloc((void**)&Cache_h.Wprev1,     Nmax * sizeof(Real));
+        cudaMalloc((void**)&Cache_h.Wprev2,     Nmax * sizeof(Real));
+        cudaMalloc((void**)&Cache_h.costheta,   Nmax * sizeof(Real));
+        cudaMalloc((void**)&Cache_h.sintheta_I, Nmax * sizeof(Real));
+        cudaMalloc((void**)&Cache_h.Exponent,   Nmax * sizeof(Complex));
+        cudaMalloc((void**)&Cache_h.mass,       Nmax * sizeof(Real));
+    } else {
+        Cache_h.xi         = NULL;
+        Cache_h.Phi0l      = NULL;
+        Cache_h.Wprev1     = NULL;
+        Cache_h.Wprev2     = NULL;
+        Cache_h.costheta   = NULL;
+        Cache_h.sintheta_I = NULL;
+        Cache_h.Exponent   = NULL;
+        Cache_h.mass       = NULL;
+    }
+}
+
+// Kernels
+
+__global__ void etics::scf::LoadParticlesToCache(Particle *P, int N, vec3 Offset) { // formerly "Kernel1"
+    int i = threadIdx.x + blockIdx.x *  blockDim.x;
+    while (i < N) {
+        vec3 Pos = (P[i].pos - Offset)*etics::scf::LengthScale;
+        Real r = sqrt(Pos.x*Pos.x + Pos.y*Pos.y + Pos.z*Pos.z);
+        Real xi = (r-1)/(r+1);
+        Real costheta = Pos.z/r;
+        Real sintheta_I = rsqrt(1-costheta*costheta);
+
+        Cache.xi[i] = xi;
+        Cache.Phi0l[i] = 0.5 * (1 - xi);
+        Cache.costheta[i] = costheta;
+        Cache.sintheta_I[i] = sintheta_I;
+
+        Real Normal_I = rsqrt(Pos.x*Pos.x + Pos.y*Pos.y);
+        Complex Exponent = make_Complex(Pos.x*Normal_I, -Pos.y*Normal_I);
+        Cache.Exponent[i] = Exponent;
+        Cache.mass[i] = P[i].m;
+
+        i += blockDim.x * gridDim.x;
+    }
+}
+
+__global__ void etics::scf::CalculatePhi0l(int N, int l) { // formerly "Kernel2"
+    int i = threadIdx.x + blockIdx.x *  blockDim.x;
+    while (i < N) {
+        Real xi = Cache.xi[i];
+        Cache.Phi0l[i] *= 0.25*(1-xi*xi);
+
+        i += blockDim.x * gridDim.x;
+    }
+}
+
+__global__ void etics::scf::CalculateCoefficientsPartial(int N, int n, int l, Complex *PartialSum) { // formerly "Kernel3"
+    extern __shared__ Complex ReductionCache[]; // size is determined in kernel launch
+    int tid = threadIdx.x;
+    for (int m = 0; m <= l; m++) ReductionCache[m*blockDim.x+tid] = make_Complex(0, 0);
+    int i = threadIdx.x + blockIdx.x * blockDim.x;
+    while (i < N) {
+        Real xi = Cache.xi[i];
+        Real Wnl;
+        if (n == 0)      Wnl = 1;
+        else if (n == 1) {Wnl = (4*l+3)*xi; Cache.Wprev2[i] = Wnl;}
+        else if (n == 2) {Wnl = -(2*l+1.5)+( 8*l*(l+2) +7.5)*xi*xi; Cache.Wprev1[i] = Wnl;}
+        else {
+            Real Wprev1 = Cache.Wprev1[i];
+            Wnl = (xi*(2*n+4*l+1)*Wprev1 - (n+4*l+1)*Cache.Wprev2[i])/(Real)n;
+            if (n < NMAX) { // Writing is expensive, avoid if possible.
+                Cache.Wprev2[i] = Wprev1;
+                Cache.Wprev1[i] = Wnl;
+            }
+        }
+        Real RadialPart = - Cache.mass[i] * SQRT_4_PI * Cache.Phi0l[i] * Wnl * RadCoeff[(LMAX+1)*n+l];
+        Real costheta = Cache.costheta[i];
+        Real Plm = Pl(l, costheta);
+        ReductionCache[tid] = Complex_add(ReductionCache[tid], make_Complex(RadialPart * Plm * AngCoeff[(l+1)*l/2],0));
+        if (l == 0) {i += blockDim.x * gridDim.x; continue;}
+
+        //////////////////////////////// ugly fix
+        if ((costheta < -0.999) || (costheta > +0.999)) {
+            i += blockDim.x * gridDim.x;
+            continue;
+        }
+        //////////////////////////////// ugly fix
+
+        Real Plm_prev1 = Plm;
+        Real sintheta_I = Cache.sintheta_I[i];
+        Plm = (costheta*Plm - Pl(l-1, costheta))*l*sintheta_I;
+        Complex Exponent = Cache.Exponent[i];
+        Real tmp0 = RadialPart * Plm * AngCoeff[(l+1)*l/2+1];
+        ReductionCache[blockDim.x+tid] = Complex_add(ReductionCache[blockDim.x+tid], make_Complex(tmp0 * Exponent.x, tmp0 * Exponent.y));
+
+        if (l == 1) {i += blockDim.x * gridDim.x; continue;}
+
+        Complex TorodialPart = Exponent;
+        for (int m = 2; m <= l; m++) { // make sure no redundancy at the end of the loop
+            Real Plm_prev2 = Plm_prev1;
+            Plm_prev1 = Plm;
+            Plm = - 2*(m-1)*costheta*sintheta_I*Plm_prev1 - (l+m-1)*(l-m+2)*Plm_prev2;
+            TorodialPart = Complex_mul(TorodialPart, Exponent);
+            tmp0 = RadialPart * Plm * AngCoeff[(l+1)*l/2+m];
+            ReductionCache[m*blockDim.x+tid] = Complex_add(ReductionCache[m*blockDim.x+tid], make_Complex(tmp0 * TorodialPart.x, tmp0 * TorodialPart.y));
+        }
+        i += blockDim.x * gridDim.x;
+    }
+    __syncthreads();
+    for (int m = 0; m <= l; m++) {
+        i = blockDim.x/2;
+        while (i != 0) {
+            if (tid < i)
+                ReductionCache[m*blockDim.x+tid] = Complex_add(ReductionCache[m*blockDim.x+tid], ReductionCache[m*blockDim.x+tid+i]);
+            __syncthreads();
+            i /= 2;
+        }
+        if (tid == 0)
+            PartialSum[blockIdx.x*(l+1) + m] = ReductionCache[m*blockDim.x];
+    }
+}
+
+template<int Mode>
+__host__ __device__ void etics::scf::CalculateGravityTemplate(Real xi, Real costheta, Complex Exponent, Complex *A, vec3 *F, Real *Potential) {
+// it gets A as parameter because it can be either on host or device
+// 0 = both force and potential, 1 = only force, 2 = only pot
+//WARNING !!! This cannot really be a host function because it needs device cahce, angular coefficients which are on device !!!
+#define A(n,l,m) A[n*(LMAX+1)*(LMAX+2)/2 + l*(l+1)/2 + m]
+#ifndef  __CUDA_ARCH__
+    #define AngCoeff AngCoeff_h
+    #define LengthScale LengthScale_h
+#endif
+    Real dPhiLeft;
+    Real dPhiLeftMul;
+    Real dPhiRight;
+    Real dPhiRightAdd;
+    Real dPhi;
+    Real RadialPart2;
+    Real PlmDerivTheta;
+
+    Real Pot = 0;
+    Real Fr = 0, Ftheta = 0, Fphi = 0;
+    Real OneOverXiPlusOne = 1/(1+xi);
+    Real r_I = (1-xi)*OneOverXiPlusOne;
+    Real r = 1/r_I; // It's quite likely we can do everything without r.
+    Real sintheta_I = rsqrt(1-costheta*costheta); // faster than using cachei // You sure??? in K3 it's the opposite
+
+    Complex ExponentTmp[LMAX];
+    ExponentTmp[0] = Exponent;
+    for (int m = 1; m < LMAX; m++) ExponentTmp[m] = Complex_mul(ExponentTmp[m-1],Exponent);
+    if (Mode != 2) {
+        Real xi2 = xi*xi;
+        Real xi3 = xi2*xi;
+        dPhiLeft = -0.25*OneOverXiPlusOne;
+        dPhiLeftMul = 0.25*(1-xi2);
+        dPhiRight = xi3 - xi2 - xi + 1;
+        dPhiRightAdd = 2*(xi3 - 2*xi2 + xi);
+    }
+    Real Phi0l = 1/(1+r);
+    Real tmp1 = Phi0l*Phi0l*r;
+    for (int l = 0; l <= LMAX; l++) {
+        if (Mode != 2) {
+            if (l > 0) {
+                dPhiLeft  *= dPhiLeftMul;
+                dPhiRight += dPhiRightAdd;
+            }
+        }
+        if (Mode != 2) dPhi = dPhiLeft * dPhiRight;
+        for (int n = 0; n <= NMAX; n++) {
+            Real Wnl, Wprev1, Wprev2;
+            if (n == 0)      Wnl = 1;
+            else if (n == 1) {Wnl = (4*l+3)*xi; Wprev2 = Wnl;}
+            else if (n == 2) {Wnl = -(2*l+1.5)+( 8*l*(l+2) +7.5)*xi*xi; Wprev1 = Wnl;}
+            else {
+                Wnl = (xi*(2*n+4*l+1)*Wprev1 - (n+4*l+1)*Wprev2)/(Real)n;
+                Wprev2 = Wprev1;
+                Wprev1 = Wnl;
+            }
+
+            Real Wderiv = 0;
+            if (n == 1) {Wderiv = 4*l + 3;}
+            else if (n > 1) {
+                Wderiv = (-n*xi*Wnl + (n+4*l+2)*Wprev2)/(1-xi*xi);
+            } // From an unknown reason it's faster to have this Block separate from the previous one.
+
+            Real RadialPart  = - SQRT_4_PI * Phi0l * Wnl;
+            if (Mode != 2) RadialPart2 = SQRT_4_PI * (dPhi*Wnl + Phi0l*Wderiv*2/pow(1+r,2));
+
+            Real Plm = Pl(l, costheta);
+            Real tmp2 = Complex_real(A(n,l,0)) * AngCoeff[(l+1)*l/2] * Plm;
+            if (Mode != 1) Pot += RadialPart  * tmp2;
+            if (Mode != 2) Fr  += RadialPart2 * tmp2;
+
+            if (l == 0) continue;
+
+            //////////////////////////////// ugly fix
+            if ((costheta < -0.999) || (costheta > +0.999)) {
+                continue;
+            }
+            //////////////////////////////// ugly fix
+
+            // The Block below is l>=1, m=0.
+            if (Mode != 2) {
+                PlmDerivTheta = (costheta*Plm - Pl(l-1, costheta))*l*sintheta_I; //TODO check if storing Pl(l-1) somewhere makes it faster
+                Ftheta += - PlmDerivTheta * AngCoeff[(l+1)*l/2] * Complex_real(A(n,l,0)) * RadialPart * r_I;
+            }
+
+            // The Block below is l>=1, m=1.
+            if (Mode == 2) PlmDerivTheta = (costheta*Plm - Pl(l-1, costheta))*l*sintheta_I; //TODO see above regarding storing Pl(l-1)
+            Real Plm_prev1 = Plm;
+            Plm = PlmDerivTheta; // PlmDerivTheta equals Plm for m=1.
+            if (Mode != 2) PlmDerivTheta = - Plm*costheta*sintheta_I - l*(l+1)*Plm_prev1;
+            tmp2 = 2 * AngCoeff[(l+1)*l/2+1];
+            Complex tmp3 = Complex_mul(ExponentTmp[0], A(n,l,1));
+            Complex tmp4 = make_Complex(tmp2 * tmp3.x, tmp2 * tmp3.y);
+            Complex tmp5 = make_Complex(Plm *  tmp4.x, Plm *  tmp4.y);
+            Complex tmp6 = make_Complex(RadialPart * tmp5.x, RadialPart * tmp5.y);
+            if (Mode != 1) Pot += Complex_real(tmp6);
+            if (Mode != 2) {
+                Fr +=       RadialPart2 * Complex_real(tmp5);
+                Fphi +=     Complex_imag(tmp6) * sintheta_I * r_I;
+                Ftheta += - RadialPart * PlmDerivTheta *  Complex_real(tmp4) * r_I;
+            }
+
+            if (l == 1) continue;
+
+            for (int m = 2; m <= l; m++) {
+                Real Plm_prev2 = Plm_prev1;
+                Plm_prev1 = Plm;
+                Plm = - 2*(m-1)*costheta*sintheta_I*Plm_prev1 - (l+m-1)*(l-m+2)*Plm_prev2;
+                tmp2 = 2 * AngCoeff[(l+1)*l/2+m];
+                tmp3 = Complex_mul(ExponentTmp[m-1], A(n,l,m));
+                tmp4 = make_Complex(tmp2 * tmp3.x, tmp2 * tmp3.y);
+                tmp5 = make_Complex(Plm *  tmp4.x, Plm *  tmp4.y);
+                tmp6 = make_Complex(RadialPart * tmp5.x, RadialPart * tmp5.y);
+                if (Mode != 1) Pot  += Complex_real(tmp6);
+                if (Mode != 2) {
+                    PlmDerivTheta = - m*Plm*costheta*sintheta_I - (l+m)*(l-m+1)*Plm_prev1;
+                    Fr +=       RadialPart2 * Complex_real(tmp5);
+                    Fphi +=     m * Complex_imag(tmp6) * sintheta_I * r_I;
+                    Ftheta += - RadialPart * PlmDerivTheta *  Complex_real(tmp4) * r_I;
+                }
+            }
+        }
+        Phi0l *= tmp1;
+    }
+
+    if (Mode != 2) {
+        Real sintheta = 1/sintheta_I;
+        Real tanphi = Exponent.y/Exponent.x;
+        Real cosphi = ((Exponent.x >= 0)?(+1):(-1)) * rsqrt(1+tanphi*tanphi); // no simpler way to get sign bit?
+        Real sinphi = tanphi*cosphi;
+        *F = vec3(sintheta*cosphi*Fr + costheta*cosphi*Ftheta - sinphi*Fphi, sintheta*sinphi*Fr + costheta*sinphi*Ftheta + cosphi*Fphi,   costheta*Fr - sintheta*Ftheta);
+        *F = (*F)*LengthScale*LengthScale;
+    }
+    if (Mode != 1) *Potential = Pot*LengthScale;
+#ifndef  __CUDA_ARCH__
+    #undef AngCoeff
+    #undef LengthScale
+#endif
+#undef A
+}
+
+__global__ void etics::scf::CalculateGravityFromCoefficients(int N, Real *Potential, vec3 *F) { // formerly "Kernel4"
+    int i = threadIdx.x + blockIdx.x * blockDim.x;
+    while (i < N) {
+        CalculateGravityTemplate<0>(Cache.xi[i], Cache.costheta[i], Complex_conj(Cache.Exponent[i]), A, &F[i], &Potential[i]);
+        i += blockDim.x * gridDim.x;
+    }
+}
+
+// Global initialization
+void etics::scf::GlobalInit() {
+    int pid = getpid();
+    int devid;
+    cudaGetDevice(&devid);
+    char hostname[256];
+    gethostname(hostname, 256);
+    char output_string[256];
+    sprintf(output_string, "ETICS: pid %05d using GPU device number %d on %s", pid, devid, hostname);
+    std::cout << output_string << std::endl;
+
+    RadialCoefficients(etics::scf::RadCoeff_h);
+    cudaMemcpyToSymbol(etics::scf::RadCoeff, etics::scf::RadCoeff_h, (NMAX+1)*(LMAX+1) * sizeof(Real));
+    AngularCoefficients(etics::scf::AngCoeff_h);
+    cudaMemcpyToSymbol(etics::scf::AngCoeff, etics::scf::AngCoeff_h, (LMAX+1)*(LMAX+2)/2 * sizeof(Real));
+    etics::scf::LengthScale_h = 1;
+    cudaMemcpyToSymbol(etics::scf::LengthScale, &etics::scf::LengthScale_h, sizeof(Real));
+    ScfGloballyInitialized = true;
+}
+
+void etics::scf::SetLengthScale(Real NewLengthScale) {
+    etics::scf::LengthScale_h = NewLengthScale;
+    cudaMemcpyToSymbol(etics::scf::LengthScale, &etics::scf::LengthScale_h, sizeof(Real));
+}
+
+// Old API
+void etics::scf::GuessLaunchConfiguration(int N, int *k3gs_new, int *k3bs_new, int *k4gs_new, int *k4bs_new) {
+    int blockSize;
+    int minGridSize;
+    int gridSize;
+//WARNING setting blockSizeLimit=128 for cudaOccupancyMaxPotentialBlockSizeVariableSMem.
+    cudaOccupancyMaxPotentialBlockSizeVariableSMem(&minGridSize, &blockSize, CalculateCoefficientsPartial, blockSizeToDynamicSMemSize, 128);
+    gridSize = minGridSize;
+    *k3gs_new = gridSize;
+    *k3bs_new = blockSize;
+
+    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, CalculateGravityFromCoefficients, 0, N);
+    gridSize = (N + blockSize - 1) / blockSize;
+    *k4gs_new = gridSize;
+    *k4bs_new = blockSize;
+}
+
+void etics::scf::Init(int N, int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new) { // to be removed!!
+    SCFObject.Init(N, k3gs_new, k3bs_new, k4gs_new, k4bs_new);
+}
+
+void etics::scf::CalculateGravity(Particle *P, int N, Real *Potential, vec3 *F) {
+    SCFObject.CalculateGravity(P, N, Potential, F);
+}