etics/src/scf.cu

/**
 * @file
 * @author  Yohai Meiron <ymeiron@pku.edu.cn>
 * @brief   Functions to calculate gravitational force using the SCF method.
 */
#include "common.hpp"
#include "mathaux.hpp"
#include "scf.hpp"

#include <iostream>
using std::cout;
using std::cerr;
using std::endl;

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cmath>
#include <unistd.h>

// Use the MPI flag only for the standalone program
#ifdef ETICS_MPI
#include <mpi.h>
#endif

namespace etics {
    namespace scf {
        scfclass SCFObject;

        // These are truly global variables that are initialized once and can be read by every scfclass object.
        bool ScfGloballyInitialized = false;
        __constant__ Real RadCoeff[(NMAX+1)*(LMAX+1)];
        __constant__ Real AngCoeff[(LMAX+1)*(LMAX+2)/2];
                     Real RadCoeff_h[(NMAX+1)*(LMAX+1)];
                     Real AngCoeff_h[(LMAX+1)*(LMAX+2)/2];
        __constant__ Real LengthScale;
                     Real LengthScale_h;

        // These are global variables that are read/written by each scfclass object; they must be locked before use.
        __constant__ Complex A[(NMAX+1)*(LMAX+1)*(LMAX+2)/2];
        __constant__ CacheStruct Cache;

        // These variables provide the mechanism to lock and release the global constant memory variables above.
        scfclass *GpuLockOwner = NULL;
        #define GPU_LOCK_PANIC \
        { \
            std::cerr << "Error: GPU lock failed; lock not owned by this object!" << endl; \
            exit(1); \
        }
    }
}

namespace etics {
        namespace scf {
        int blockSizeToDynamicSMemSize(int BlockSize) { // Should be a lambda function
            return (LMAX+1)*sizeof(Complex)*BlockSize;
        }
    }
}

// Class implementation
etics::scf::scfclass::scfclass() {
    N=0;
    Nmax=0;
    k3gs=-1; k3bs=-1; k4gs=-1; k4bs=-1;
    PartialSum = NULL;
    PartialSum_h = NULL;
}

etics::scf::scfclass::~scfclass() {
    if (etics::scf::GpuLockOwner == this) ReleaseGpuLock();
    cudaFree(Cache_h.xi);
    cudaFree(Cache_h.Phi0l);
    cudaFree(Cache_h.Wprev1);
    cudaFree(Cache_h.Wprev2);
    cudaFree(Cache_h.costheta);
    cudaFree(Cache_h.sintheta_I);
    cudaFree(Cache_h.Exponent);
    cudaFree(Cache_h.mass);
    free(PartialSum_h);
    cudaFree(PartialSum);
}

void etics::scf::scfclass::GetGpuLock() {
    if (etics::scf::GpuLockOwner == this) return;
    if (etics::scf::GpuLockOwner == NULL) etics::scf::GpuLockOwner = this;
    else GPU_LOCK_PANIC;
}

void etics::scf::scfclass::ReleaseGpuLock() {
    if (etics::scf::GpuLockOwner == NULL) return;
    if (etics::scf::GpuLockOwner == this) etics::scf::GpuLockOwner = NULL;
    else GPU_LOCK_PANIC;
}

int etics::scf::scfclass::GetNMAX() {return NMAX;}

int etics::scf::scfclass::GetLMAX() {return LMAX;}

void etics::scf::scfclass::CalculateCoefficients(int n, int l) {
    int BaseAddress = n*(LMAX+1)*(LMAX+2)/2 + l*(l+1)/2;
    etics::scf::CalculateCoefficientsPartial<<<k3gs,k3bs,k3bs*sizeof(Complex)*(LMAX+1)>>>(N, n, l, PartialSum);
    cudaMemcpy(PartialSum_h, PartialSum, k3gs*(l+1)*sizeof(Complex), cudaMemcpyDeviceToHost);
    for (int m = 0; m <= l; m++)
        for (int Block=0; Block<k3gs; Block++)
            A_h[BaseAddress + m] = Complex_add(A_h[BaseAddress + m], PartialSum_h[Block*(l+1) + m]);
}

void etics::scf::scfclass::CalculateCoefficients() {
    memset(A_h, 0, (NMAX+1)*(LMAX+1)*(LMAX+2)/2 * sizeof(Complex));
    for (int l = 0; l <= LMAX; l++) {
        if (l > 0) etics::scf::CalculatePhi0l<<<128,128>>>(N, l); // wouldn't it make sense just putting it after the n-loop finishes? Probably not becasue then we need to skip at the last iter
        for (int n = 0; n <= NMAX; n++) {
            CalculateCoefficients(n, l);
        }
    }
}

void etics::scf::scfclass::GetCoefficients(Complex *A) {
    std::copy(A_h, A_h+(NMAX+1)*(LMAX+1)*(LMAX+2)/2, A);
}


void etics::scf::scfclass::SendCoeffsToGPU() {
    if (etics::scf::GpuLockOwner != this) GPU_LOCK_PANIC;
    cudaMemcpyToSymbol(etics::scf::A, A_h, (NMAX+1)*(LMAX+1)*(LMAX+2)/2 * sizeof(Complex));
}

void etics::scf::scfclass::SendCachePointersToGPU() {
    if (etics::scf::GpuLockOwner != this) GPU_LOCK_PANIC;
    cudaMemcpyToSymbol(etics::scf::Cache, &Cache_h, sizeof(etics::scf::CacheStruct));
}

void etics::scf::scfclass::LoadParticlesToCache(Particle *P, int N, vec3 Offset) {
    if (N > Nmax) {
        fprintf(stderr, "You tried to load too many particles! (%d > %d)", N, Nmax);
        exit(1);
    }
    this->N = N;
    etics::scf::LoadParticlesToCache<<<128,128>>>(P, N, Offset);
}

void etics::scf::scfclass::CalculateGravityFromCoefficients(Real *Potential, vec3 *F) {
    etics::scf::CalculateGravityFromCoefficients<<<(N+k4bs-1)/k4bs,k4bs>>>(N, Potential, F);
}

void etics::scf::scfclass::CalculateGravityAtPoint(vec3 Pos, Real *Potential, vec3 *F) {
    static bool first = true;
    if (first) {
        printf("WARNING!!! the code has been modified and CalculateGravityAtPoint may not work as planned.\n");
        first = false;
    }
    Real r = sqrt(Pos.x*Pos.x + Pos.y*Pos.y + Pos.z*Pos.z);
    Real xi = (r-1)/(r+1);
    Real costheta = Pos.z/r;
    Real Normal_I = rsqrt(Pos.x*Pos.x + Pos.y*Pos.y);
    Complex Exponent = make_Complex(Pos.x*Normal_I, Pos.y*Normal_I);

    etics::scf::CalculateGravityTemplate<0>(xi, costheta, Exponent, A_h, F, Potential);
}

void etics::scf::scfclass::CalculateGravity(Particle *P, int N, Real *Potential, vec3 *F) {
    GetGpuLock();
    SendCachePointersToGPU();
    LoadParticlesToCache(P, N);
    CalculateCoefficients();
#ifdef ETICS_MPI
    Complex ATotal[(NMAX+1)*(LMAX+1)*(LMAX+2)/2];
    MPI_Allreduce(&A_h, &ATotal, (NMAX+1)*(LMAX+1)*(LMAX+2)/2*2, MPI_ETICS_REAL, MPI_SUM, MPI_COMM_WORLD);
    std::copy(ATotal, ATotal+(NMAX+1)*(LMAX+1)*(LMAX+2)/2, A_h);
    // not really need this copy, just calculate the coefficients in some local array, then sum it into a global array (A_h or somthing) and copy it to GPUs
#endif
    SendCoeffsToGPU();
    CalculateGravityFromCoefficients(Potential, F);
    ReleaseGpuLock();
}

void etics::scf::scfclass::SetLaunchConfiguration(int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new) {
    if (k3gs_new != k3gs) {
        if (PartialSum_h != NULL) free(PartialSum_h);
        if (PartialSum   != NULL) cudaFree(PartialSum_h);
        PartialSum_h = (Complex*)malloc(k3gs_new*(LMAX+1)*sizeof(Complex));
        cudaMalloc((void**)&PartialSum, k3gs_new*(LMAX+1)*sizeof(Complex));
    }
    k3gs = k3gs_new;
    k3bs = k3bs_new;
    k4gs = k4gs_new;
    k4bs = k4bs_new;
}

void etics::scf::scfclass::Init(int Nmax, int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new) {
    // First, initialize global arrays if not already initialized.
    if (!etics::scf::ScfGloballyInitialized) etics::scf::GlobalInit();

    // Next, set the launch configuation (either guess it or use what the user specified).
    if ((k3gs_new<=0) || (k3bs_new<=0) || (k4gs_new<=0) || (k4bs_new<=0)) {
        int k3gs_new, k3bs_new, k4gs_new, k4bs_new;
        etics::scf::GuessLaunchConfiguration(Nmax, &k3gs_new, &k3bs_new, &k4gs_new, &k4bs_new);
        SetLaunchConfiguration(k3gs_new, k3bs_new, k4gs_new, k4bs_new);
    }
    else SetLaunchConfiguration(k3gs_new, k3bs_new, k4gs_new, k4bs_new);

    // If Nmax > 0 then initialize the cahche memory.
    if (Nmax > 0) {
        this->Nmax = Nmax;
        cudaMalloc((void**)&Cache_h.xi,         Nmax * sizeof(Real));
        cudaMalloc((void**)&Cache_h.Phi0l,      Nmax * sizeof(Real));
        cudaMalloc((void**)&Cache_h.Wprev1,     Nmax * sizeof(Real));
        cudaMalloc((void**)&Cache_h.Wprev2,     Nmax * sizeof(Real));
        cudaMalloc((void**)&Cache_h.costheta,   Nmax * sizeof(Real));
        cudaMalloc((void**)&Cache_h.sintheta_I, Nmax * sizeof(Real));
        cudaMalloc((void**)&Cache_h.Exponent,   Nmax * sizeof(Complex));
        cudaMalloc((void**)&Cache_h.mass,       Nmax * sizeof(Real));
    } else {
        Cache_h.xi         = NULL;
        Cache_h.Phi0l      = NULL;
        Cache_h.Wprev1     = NULL;
        Cache_h.Wprev2     = NULL;
        Cache_h.costheta   = NULL;
        Cache_h.sintheta_I = NULL;
        Cache_h.Exponent   = NULL;
        Cache_h.mass       = NULL;
    }
}

// Kernels

__global__ void etics::scf::LoadParticlesToCache(Particle *P, int N, vec3 Offset) { // formerly "Kernel1"
    int i = threadIdx.x + blockIdx.x *  blockDim.x;
    while (i < N) {
        vec3 Pos = (P[i].pos - Offset)*etics::scf::LengthScale;
        Real r = sqrt(Pos.x*Pos.x + Pos.y*Pos.y + Pos.z*Pos.z);
        Real xi = (r-1)/(r+1);
        Real costheta = Pos.z/r;
        Real sintheta_I = rsqrt(1-costheta*costheta);

        Cache.xi[i] = xi;
        Cache.Phi0l[i] = 0.5 * (1 - xi);
        Cache.costheta[i] = costheta;
        Cache.sintheta_I[i] = sintheta_I;

        Real Normal_I = rsqrt(Pos.x*Pos.x + Pos.y*Pos.y);
        Complex Exponent = make_Complex(Pos.x*Normal_I, -Pos.y*Normal_I);
        Cache.Exponent[i] = Exponent;
        Cache.mass[i] = P[i].m;

        i += blockDim.x * gridDim.x;
    }
}

__global__ void etics::scf::CalculatePhi0l(int N, int l) { // formerly "Kernel2"
    int i = threadIdx.x + blockIdx.x *  blockDim.x;
    while (i < N) {
        Real xi = Cache.xi[i];
        Cache.Phi0l[i] *= 0.25*(1-xi*xi);

        i += blockDim.x * gridDim.x;
    }
}

__global__ void etics::scf::CalculateCoefficientsPartial(int N, int n, int l, Complex *PartialSum) { // formerly "Kernel3"
    extern __shared__ Complex ReductionCache[]; // size is determined in kernel launch
    int tid = threadIdx.x;
    for (int m = 0; m <= l; m++) ReductionCache[m*blockDim.x+tid] = make_Complex(0, 0);
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    while (i < N) {
        Real xi = Cache.xi[i];
        Real Wnl;
        if (n == 0)      Wnl = 1;
        else if (n == 1) {Wnl = (4*l+3)*xi; Cache.Wprev2[i] = Wnl;}
        else if (n == 2) {Wnl = -(2*l+1.5)+( 8*l*(l+2) +7.5)*xi*xi; Cache.Wprev1[i] = Wnl;}
        else {
            Real Wprev1 = Cache.Wprev1[i];
            Wnl = (xi*(2*n+4*l+1)*Wprev1 - (n+4*l+1)*Cache.Wprev2[i])/(Real)n;
            if (n < NMAX) { // Writing is expensive, avoid if possible.
                Cache.Wprev2[i] = Wprev1;
                Cache.Wprev1[i] = Wnl;
            }
        }
        Real RadialPart = - Cache.mass[i] * SQRT_4_PI * Cache.Phi0l[i] * Wnl * RadCoeff[(LMAX+1)*n+l];
        Real costheta = Cache.costheta[i];
        Real Plm = Pl(l, costheta);
        ReductionCache[tid] = Complex_add(ReductionCache[tid], make_Complex(RadialPart * Plm * AngCoeff[(l+1)*l/2],0));
        if (l == 0) {i += blockDim.x * gridDim.x; continue;}

        //////////////////////////////// ugly fix
        if ((costheta < -0.999) || (costheta > +0.999)) {
            i += blockDim.x * gridDim.x;
            continue;
        }
        //////////////////////////////// ugly fix

        Real Plm_prev1 = Plm;
        Real sintheta_I = Cache.sintheta_I[i];
        Plm = (costheta*Plm - Pl(l-1, costheta))*l*sintheta_I;
        Complex Exponent = Cache.Exponent[i];
        Real tmp0 = RadialPart * Plm * AngCoeff[(l+1)*l/2+1];
        ReductionCache[blockDim.x+tid] = Complex_add(ReductionCache[blockDim.x+tid], make_Complex(tmp0 * Exponent.x, tmp0 * Exponent.y));

        if (l == 1) {i += blockDim.x * gridDim.x; continue;}

        Complex TorodialPart = Exponent;
        for (int m = 2; m <= l; m++) { // make sure no redundancy at the end of the loop
            Real Plm_prev2 = Plm_prev1;
            Plm_prev1 = Plm;
            Plm = - 2*(m-1)*costheta*sintheta_I*Plm_prev1 - (l+m-1)*(l-m+2)*Plm_prev2;
            TorodialPart = Complex_mul(TorodialPart, Exponent);
            tmp0 = RadialPart * Plm * AngCoeff[(l+1)*l/2+m];
            ReductionCache[m*blockDim.x+tid] = Complex_add(ReductionCache[m*blockDim.x+tid], make_Complex(tmp0 * TorodialPart.x, tmp0 * TorodialPart.y));
        }
        i += blockDim.x * gridDim.x;
    }
    __syncthreads();
    for (int m = 0; m <= l; m++) {
        i = blockDim.x/2;
        while (i != 0) {
            if (tid < i)
                ReductionCache[m*blockDim.x+tid] = Complex_add(ReductionCache[m*blockDim.x+tid], ReductionCache[m*blockDim.x+tid+i]);
            __syncthreads();
            i /= 2;
        }
        if (tid == 0)
            PartialSum[blockIdx.x*(l+1) + m] = ReductionCache[m*blockDim.x];
    }
}

template<int Mode>
__host__ __device__ void etics::scf::CalculateGravityTemplate(Real xi, Real costheta, Complex Exponent, Complex *A, vec3 *F, Real *Potential) {
// it gets A as parameter because it can be either on host or device
// 0 = both force and potential, 1 = only force, 2 = only pot
//WARNING !!! This cannot really be a host function because it needs device cahce, angular coefficients which are on device !!!
#define A(n,l,m) A[n*(LMAX+1)*(LMAX+2)/2 + l*(l+1)/2 + m]
#ifndef  __CUDA_ARCH__
    #define AngCoeff AngCoeff_h
    #define LengthScale LengthScale_h
#endif
    Real dPhiLeft;
    Real dPhiLeftMul;
    Real dPhiRight;
    Real dPhiRightAdd;
    Real dPhi;
    Real RadialPart2;
    Real PlmDerivTheta;

    Real Pot = 0;
    Real Fr = 0, Ftheta = 0, Fphi = 0;
    Real OneOverXiPlusOne = 1/(1+xi);
    Real r_I = (1-xi)*OneOverXiPlusOne;
    Real r = 1/r_I; // It's quite likely we can do everything without r.
    Real sintheta_I = rsqrt(1-costheta*costheta); // faster than using cachei // You sure??? in K3 it's the opposite

    Complex ExponentTmp[LMAX];
    ExponentTmp[0] = Exponent;
    for (int m = 1; m < LMAX; m++) ExponentTmp[m] = Complex_mul(ExponentTmp[m-1],Exponent);
    if (Mode != 2) {
        Real xi2 = xi*xi;
        Real xi3 = xi2*xi;
        dPhiLeft = -0.25*OneOverXiPlusOne;
        dPhiLeftMul = 0.25*(1-xi2);
        dPhiRight = xi3 - xi2 - xi + 1;
        dPhiRightAdd = 2*(xi3 - 2*xi2 + xi);
    }
    Real Phi0l = 1/(1+r);
    Real tmp1 = Phi0l*Phi0l*r;
    for (int l = 0; l <= LMAX; l++) {
        if (Mode != 2) {
            if (l > 0) {
                dPhiLeft  *= dPhiLeftMul;
                dPhiRight += dPhiRightAdd;
            }
        }
        if (Mode != 2) dPhi = dPhiLeft * dPhiRight;
        for (int n = 0; n <= NMAX; n++) {
            Real Wnl, Wprev1, Wprev2;
            if (n == 0)      Wnl = 1;
            else if (n == 1) {Wnl = (4*l+3)*xi; Wprev2 = Wnl;}
            else if (n == 2) {Wnl = -(2*l+1.5)+( 8*l*(l+2) +7.5)*xi*xi; Wprev1 = Wnl;}
            else {
                Wnl = (xi*(2*n+4*l+1)*Wprev1 - (n+4*l+1)*Wprev2)/(Real)n;
                Wprev2 = Wprev1;
                Wprev1 = Wnl;
            }

            Real Wderiv = 0;
            if (n == 1) {Wderiv = 4*l + 3;}
            else if (n > 1) {
                Wderiv = (-n*xi*Wnl + (n+4*l+2)*Wprev2)/(1-xi*xi);
            } // From an unknown reason it's faster to have this Block separate from the previous one.

            Real RadialPart  = - SQRT_4_PI * Phi0l * Wnl;
            if (Mode != 2) RadialPart2 = SQRT_4_PI * (dPhi*Wnl + Phi0l*Wderiv*2/pow(1+r,2));

            Real Plm = Pl(l, costheta);
            Real tmp2 = Complex_real(A(n,l,0)) * AngCoeff[(l+1)*l/2] * Plm;
            if (Mode != 1) Pot += RadialPart  * tmp2;
            if (Mode != 2) Fr  += RadialPart2 * tmp2;

            if (l == 0) continue;

            //////////////////////////////// ugly fix
            if ((costheta < -0.999) || (costheta > +0.999)) {
                continue;
            }
            //////////////////////////////// ugly fix

            // The Block below is l>=1, m=0.
            if (Mode != 2) {
                PlmDerivTheta = (costheta*Plm - Pl(l-1, costheta))*l*sintheta_I; //TODO check if storing Pl(l-1) somewhere makes it faster
                Ftheta += - PlmDerivTheta * AngCoeff[(l+1)*l/2] * Complex_real(A(n,l,0)) * RadialPart * r_I;
            }

            // The Block below is l>=1, m=1.
            if (Mode == 2) PlmDerivTheta = (costheta*Plm - Pl(l-1, costheta))*l*sintheta_I; //TODO see above regarding storing Pl(l-1)
            Real Plm_prev1 = Plm;
            Plm = PlmDerivTheta; // PlmDerivTheta equals Plm for m=1.
            if (Mode != 2) PlmDerivTheta = - Plm*costheta*sintheta_I - l*(l+1)*Plm_prev1;
            tmp2 = 2 * AngCoeff[(l+1)*l/2+1];
            Complex tmp3 = Complex_mul(ExponentTmp[0], A(n,l,1));
            Complex tmp4 = make_Complex(tmp2 * tmp3.x, tmp2 * tmp3.y);
            Complex tmp5 = make_Complex(Plm *  tmp4.x, Plm *  tmp4.y);
            Complex tmp6 = make_Complex(RadialPart * tmp5.x, RadialPart * tmp5.y);
            if (Mode != 1) Pot += Complex_real(tmp6);
            if (Mode != 2) {
                Fr +=       RadialPart2 * Complex_real(tmp5);
                Fphi +=     Complex_imag(tmp6) * sintheta_I * r_I;
                Ftheta += - RadialPart * PlmDerivTheta *  Complex_real(tmp4) * r_I;
            }

            if (l == 1) continue;

            for (int m = 2; m <= l; m++) {
                Real Plm_prev2 = Plm_prev1;
                Plm_prev1 = Plm;
                Plm = - 2*(m-1)*costheta*sintheta_I*Plm_prev1 - (l+m-1)*(l-m+2)*Plm_prev2;
                tmp2 = 2 * AngCoeff[(l+1)*l/2+m];
                tmp3 = Complex_mul(ExponentTmp[m-1], A(n,l,m));
                tmp4 = make_Complex(tmp2 * tmp3.x, tmp2 * tmp3.y);
                tmp5 = make_Complex(Plm *  tmp4.x, Plm *  tmp4.y);
                tmp6 = make_Complex(RadialPart * tmp5.x, RadialPart * tmp5.y);
                if (Mode != 1) Pot  += Complex_real(tmp6);
                if (Mode != 2) {
                    PlmDerivTheta = - m*Plm*costheta*sintheta_I - (l+m)*(l-m+1)*Plm_prev1;
                    Fr +=       RadialPart2 * Complex_real(tmp5);
                    Fphi +=     m * Complex_imag(tmp6) * sintheta_I * r_I;
                    Ftheta += - RadialPart * PlmDerivTheta *  Complex_real(tmp4) * r_I;
                }
            }
        }
        Phi0l *= tmp1;
    }

    if (Mode != 2) {
        Real sintheta = 1/sintheta_I;
        Real tanphi = Exponent.y/Exponent.x;
        Real cosphi = ((Exponent.x >= 0)?(+1):(-1)) * rsqrt(1+tanphi*tanphi); // no simpler way to get sign bit?
        Real sinphi = tanphi*cosphi;
        *F = vec3(sintheta*cosphi*Fr + costheta*cosphi*Ftheta - sinphi*Fphi, sintheta*sinphi*Fr + costheta*sinphi*Ftheta + cosphi*Fphi,   costheta*Fr - sintheta*Ftheta);
        *F = (*F)*LengthScale*LengthScale;
    }
    if (Mode != 1) *Potential = Pot*LengthScale;
#ifndef  __CUDA_ARCH__
    #undef AngCoeff
    #undef LengthScale
#endif
#undef A
}

__global__ void etics::scf::CalculateGravityFromCoefficients(int N, Real *Potential, vec3 *F) { // formerly "Kernel4"
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    while (i < N) {
        CalculateGravityTemplate<0>(Cache.xi[i], Cache.costheta[i], Complex_conj(Cache.Exponent[i]), A, &F[i], &Potential[i]);
        i += blockDim.x * gridDim.x;
    }
}

// Global initialization
void etics::scf::GlobalInit() {
    int pid = getpid();
    int devid;
    cudaGetDevice(&devid);
    char hostname[256];
    gethostname(hostname, 256);
    char output_string[256];
    sprintf(output_string, "ETICS: pid %05d using GPU device number %d on %s", pid, devid, hostname);
    std::cout << output_string << std::endl;

    RadialCoefficients(etics::scf::RadCoeff_h);
    cudaMemcpyToSymbol(etics::scf::RadCoeff, etics::scf::RadCoeff_h, (NMAX+1)*(LMAX+1) * sizeof(Real));
    AngularCoefficients(etics::scf::AngCoeff_h);
    cudaMemcpyToSymbol(etics::scf::AngCoeff, etics::scf::AngCoeff_h, (LMAX+1)*(LMAX+2)/2 * sizeof(Real));
    etics::scf::LengthScale_h = 1;
    cudaMemcpyToSymbol(etics::scf::LengthScale, &etics::scf::LengthScale_h, sizeof(Real));
    ScfGloballyInitialized = true;
}

void etics::scf::SetLengthScale(Real NewLengthScale) {
    etics::scf::LengthScale_h = NewLengthScale;
    cudaMemcpyToSymbol(etics::scf::LengthScale, &etics::scf::LengthScale_h, sizeof(Real));
}

// Old API
void etics::scf::GuessLaunchConfiguration(int N, int *k3gs_new, int *k3bs_new, int *k4gs_new, int *k4bs_new) {
    int blockSize;
    int minGridSize;
    int gridSize;
//WARNING setting blockSizeLimit=128 for cudaOccupancyMaxPotentialBlockSizeVariableSMem.
    cudaOccupancyMaxPotentialBlockSizeVariableSMem(&minGridSize, &blockSize, CalculateCoefficientsPartial, blockSizeToDynamicSMemSize, 128);
    gridSize = minGridSize;
    *k3gs_new = gridSize;
    *k3bs_new = blockSize;

    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, CalculateGravityFromCoefficients, 0, N);
    gridSize = (N + blockSize - 1) / blockSize;
    *k4gs_new = gridSize;
    *k4bs_new = blockSize;
}

void etics::scf::Init(int N, int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new) { // to be removed!!
    SCFObject.Init(N, k3gs_new, k3bs_new, k4gs_new, k4bs_new);
}

void etics::scf::CalculateGravity(Particle *P, int N, Real *Potential, vec3 *F) {
    SCFObject.CalculateGravity(P, N, Potential, F);
}