524 lines
20 KiB
Text
524 lines
20 KiB
Text
/**
|
|
* @file
|
|
* @author Yohai Meiron <ymeiron@pku.edu.cn>
|
|
* @brief Functions to calculate gravitational force using the SCF method.
|
|
*/
|
|
#include "common.hpp"
|
|
#include "mathaux.hpp"
|
|
#include "scf.hpp"
|
|
|
|
#include <iostream>
|
|
using std::cout;
|
|
using std::cerr;
|
|
using std::endl;
|
|
|
|
#include <cstdio>
|
|
#include <cstdlib>
|
|
#include <cstring>
|
|
#include <cmath>
|
|
#include <unistd.h>
|
|
|
|
// Use the MPI flag only for the standalone program
|
|
#ifdef ETICS_MPI
|
|
#include <mpi.h>
|
|
#endif
|
|
|
|
namespace etics {
    namespace scf {
        // Single solver instance used by the free-function API at the bottom of this file.
        scfclass SCFObject;

        // These are truly global variables that are initialized once (see GlobalInit) and can be
        // read by every scfclass object. The "_h" arrays are the host-side copies of the
        // constant-memory arrays with the same base name.
        bool ScfGloballyInitialized = false;
        __constant__ Real RadCoeff[(NMAX+1)*(LMAX+1)];
        __constant__ Real AngCoeff[(LMAX+1)*(LMAX+2)/2];
        Real RadCoeff_h[(NMAX+1)*(LMAX+1)];
        Real AngCoeff_h[(LMAX+1)*(LMAX+2)/2];
        __constant__ Real LengthScale;
        Real LengthScale_h;

        // These are global variables that are read/written by each scfclass object; they must be
        // locked (GetGpuLock) before use.
        __constant__ Complex A[(NMAX+1)*(LMAX+1)*(LMAX+2)/2];
        __constant__ CacheStruct Cache;

        // These variables provide the mechanism to lock and release the global constant memory
        // variables above. GpuLockOwner is NULL while nobody holds the lock.
        scfclass *GpuLockOwner = NULL;

        // Abort the program because the GPU lock is held by a different object.
        // Wrapped in do { } while (0) so that "GPU_LOCK_PANIC;" is always exactly one statement,
        // which keeps it safe in every if/else position.
        #define GPU_LOCK_PANIC \
            do { \
                std::cerr << "Error: GPU lock failed; lock not owned by this object!" << std::endl; \
                exit(1); \
            } while (0)
    }
}
|
|
|
|
namespace etics {
    namespace scf {
        // Dynamic shared memory, in bytes, requested for CalculateCoefficientsPartial at a given
        // block size: (LMAX+1) Complex slots per thread. Handed to the occupancy query in
        // GuessLaunchConfiguration as the block-size -> shared-memory callback.
        // (Should be a lambda function.)
        int blockSizeToDynamicSMemSize(int BlockSize) {
            const size_t BytesPerThread = (LMAX+1)*sizeof(Complex);
            return BytesPerThread*BlockSize;
        }
    }
}
|
|
|
|
// Class implementation
|
|
// Construct an empty solver: no particles, no launch configuration and no buffers.
// Init() must be called before the object can be used for a calculation.
etics::scf::scfclass::scfclass() {
    N = 0;
    Nmax = 0;

    // -1 marks the launch configuration as "not set"; SetLaunchConfiguration compares against
    // these values to decide whether the partial-sum buffers must be (re)allocated.
    k3gs = -1; k3bs = -1; k4gs = -1; k4bs = -1;

    PartialSum = NULL;
    PartialSum_h = NULL;

    // The destructor unconditionally passes these pointers to cudaFree. Null them here so that
    // destroying an object on which Init() was never called does not free indeterminate
    // addresses (cudaFree on a null pointer performs no operation).
    Cache_h.xi = NULL;
    Cache_h.Phi0l = NULL;
    Cache_h.Wprev1 = NULL;
    Cache_h.Wprev2 = NULL;
    Cache_h.costheta = NULL;
    Cache_h.sintheta_I = NULL;
    Cache_h.Exponent = NULL;
    Cache_h.mass = NULL;
}
|
|
|
|
// Release the GPU lock (if this object holds it) and free every buffer owned by the object.
etics::scf::scfclass::~scfclass() {
    if (this == etics::scf::GpuLockOwner) ReleaseGpuLock();

    // Device arrays referenced by the cache structure, freed in a fixed order.
    void *CacheArrays[] = {
        Cache_h.xi,
        Cache_h.Phi0l,
        Cache_h.Wprev1,
        Cache_h.Wprev2,
        Cache_h.costheta,
        Cache_h.sintheta_I,
        Cache_h.Exponent,
        Cache_h.mass
    };
    const int CacheArrayCount = sizeof(CacheArrays)/sizeof(CacheArrays[0]);
    for (int k = 0; k < CacheArrayCount; k++) cudaFree(CacheArrays[k]);

    // Host and device buffers holding the per-block partial sums.
    free(PartialSum_h);
    cudaFree(PartialSum);
}
|
|
|
|
// Take ownership of the global constant-memory variables for this object.
// No-op if this object already owns the lock; aborts the program if another object owns it.
void etics::scf::scfclass::GetGpuLock() {
    scfclass *&Owner = etics::scf::GpuLockOwner;
    if (Owner == NULL) {
        Owner = this;
        return;
    }
    if (Owner != this) GPU_LOCK_PANIC;
}
|
|
|
|
// Give up ownership of the global constant-memory variables.
// No-op if nobody holds the lock; aborts the program if another object holds it.
void etics::scf::scfclass::ReleaseGpuLock() {
    scfclass *&Owner = etics::scf::GpuLockOwner;
    if (Owner == this) {
        Owner = NULL;
        return;
    }
    if (Owner != NULL) GPU_LOCK_PANIC;
}
|
|
|
|
// Report NMAX, the compile-time upper bound of the index n in the expansion loops.
int etics::scf::scfclass::GetNMAX() {
    return NMAX;
}
|
|
|
|
// Report LMAX, the compile-time upper bound of the index l in the expansion loops.
int etics::scf::scfclass::GetLMAX() {
    return LMAX;
}
|
|
|
|
// Compute the coefficients for one (n, l) pair and all m in [0, l], adding them into A_h.
// The kernel leaves (l+1) partial sums per block in PartialSum; they are copied back and the
// per-block contributions are accumulated on the host.
void etics::scf::scfclass::CalculateCoefficients(int n, int l) {
    // One Complex slot per thread for each of the LMAX+1 possible values of m.
    const size_t SharedBytes = k3bs*sizeof(Complex)*(LMAX+1);
    etics::scf::CalculateCoefficientsPartial<<<k3gs,k3bs,SharedBytes>>>(N, n, l, PartialSum);

    const int SumsPerBlock = l + 1;
    cudaMemcpy(PartialSum_h, PartialSum, k3gs*SumsPerBlock*sizeof(Complex), cudaMemcpyDeviceToHost);

    // First coefficient of this (n, l) pair inside the packed coefficient array.
    const int FirstIndex = n*(LMAX+1)*(LMAX+2)/2 + l*(l+1)/2;
    Complex *Target = A_h + FirstIndex;

    // Every coefficient receives the blocks in ascending block order.
    for (int Block = 0; Block < k3gs; Block++) {
        const Complex *BlockSums = PartialSum_h + Block*SumsPerBlock;
        for (int m = 0; m <= l; m++) {
            Target[m] = Complex_add(Target[m], BlockSums[m]);
        }
    }
}
|
|
|
|
// Compute the full coefficient set into A_h from the particles currently in the cache.
// A_h is zeroed first because the (n, l) overload accumulates into it.
void etics::scf::scfclass::CalculateCoefficients() {
    const size_t TotalBytes = (NMAX+1)*(LMAX+1)*(LMAX+2)/2 * sizeof(Complex);
    memset(A_h, 0, TotalBytes);

    int l = 0;
    while (l <= LMAX) {
        // The cached Phi0l is advanced before every l except the first one.
        // wouldn't it make sense just putting it after the n-loop finishes? Probably not because then we need to skip at the last iter
        if (l != 0) etics::scf::CalculatePhi0l<<<128,128>>>(N, l);

        int n = 0;
        while (n <= NMAX) {
            CalculateCoefficients(n, l);
            n++;
        }
        l++;
    }
}
|
|
|
|
// Copy the host-side coefficient array A_h into the caller's buffer A, which must have room
// for (NMAX+1)*(LMAX+1)*(LMAX+2)/2 elements.
void etics::scf::scfclass::GetCoefficients(Complex *A) {
    const int CoefficientCount = (NMAX+1)*(LMAX+1)*(LMAX+2)/2;
    for (int k = 0; k < CoefficientCount; k++) {
        A[k] = A_h[k];
    }
}
|
|
|
|
|
|
// Upload the host coefficient array A_h into the constant-memory array etics::scf::A.
// The caller must hold the GPU lock; otherwise the program is aborted.
void etics::scf::scfclass::SendCoeffsToGPU() {
    if (this != etics::scf::GpuLockOwner) GPU_LOCK_PANIC;

    const size_t CoefficientBytes = (NMAX+1)*(LMAX+1)*(LMAX+2)/2 * sizeof(Complex);
    cudaMemcpyToSymbol(etics::scf::A, A_h, CoefficientBytes);
}
|
|
|
|
// Upload this object's cache structure (Cache_h) into the constant-memory variable
// etics::scf::Cache, so that the kernels operate on this object's device arrays.
// The caller must hold the GPU lock; otherwise the program is aborted.
void etics::scf::scfclass::SendCachePointersToGPU() {
    if (this != etics::scf::GpuLockOwner) GPU_LOCK_PANIC;

    const size_t CacheBytes = sizeof(etics::scf::CacheStruct);
    cudaMemcpyToSymbol(etics::scf::Cache, &Cache_h, CacheBytes);
}
|
|
|
|
// Fill the device cache from the first N particles of P, with positions taken relative to
// Offset. Aborts the program when N exceeds the capacity Nmax given to Init().
// P is handed directly to a kernel, so it is presumably a device pointer — TODO confirm.
void etics::scf::scfclass::LoadParticlesToCache(Particle *P, int N, vec3 Offset) {
    const bool FitsInCache = (N <= Nmax);
    if (!FitsInCache) {
        fprintf(stderr, "You tried to load too many particles! (%d > %d)", N, Nmax);
        exit(1);
    }

    // Remember the particle count for the kernels launched later.
    this->N = N;
    etics::scf::LoadParticlesToCache<<<128,128>>>(P, N, Offset);
}
|
|
|
|
// Launch the force/potential kernel for the N cached particles, using blocks of k4bs threads
// and just enough blocks to give every particle its own thread.
void etics::scf::scfclass::CalculateGravityFromCoefficients(Real *Potential, vec3 *F) {
    const int BlockCount = (N+k4bs-1)/k4bs;
    etics::scf::CalculateGravityFromCoefficients<<<BlockCount,k4bs>>>(N, Potential, F);
}
|
|
|
|
// Evaluate potential and force at a single position on the host, using the host coefficient
// array A_h. A warning is printed the first time the function is called.
void etics::scf::scfclass::CalculateGravityAtPoint(vec3 Pos, Real *Potential, vec3 *F) {
    static bool WarningPending = true;
    if (WarningPending) {
        WarningPending = false;
        printf("WARNING!!! the code has been modified and CalculateGravityAtPoint may not work as planned.\n");
    }

    // Same transformed coordinates as the ones the particle cache stores.
    Real Radius = sqrt(Pos.x*Pos.x + Pos.y*Pos.y + Pos.z*Pos.z);
    Real CosTheta = Pos.z/Radius;
    Real Xi = (Radius-1)/(Radius+1);

    // (x, y) scaled to unit length in the xy-plane.
    Real InvRho = rsqrt(Pos.x*Pos.x + Pos.y*Pos.y);
    Complex UnitXY = make_Complex(Pos.x*InvRho, Pos.y*InvRho);

    etics::scf::CalculateGravityTemplate<0>(Xi, CosTheta, UnitXY, A_h, F, Potential);
}
|
|
|
|
// Full force calculation for N particles: load them into the cache, compute the expansion
// coefficients, (with MPI) sum the coefficients over all ranks, and evaluate potential and
// force for every particle. The GPU lock is held for the duration of the call.
// P, Potential and F are handed to kernels, so they are presumably device pointers — TODO confirm.
void etics::scf::scfclass::CalculateGravity(Particle *P, int N, Real *Potential, vec3 *F) {
    GetGpuLock();
    SendCachePointersToGPU();
    LoadParticlesToCache(P, N);
    CalculateCoefficients();
#ifdef ETICS_MPI
    // Sum the locally computed coefficients over all ranks directly inside A_h. Reducing in
    // place avoids a second full-size coefficient array on the stack and the copy back.
    // The count is doubled because each Complex is reduced as two MPI_ETICS_REAL values
    // (assumes Complex is a contiguous pair of Reals — TODO confirm).
    MPI_Allreduce(MPI_IN_PLACE, A_h, (NMAX+1)*(LMAX+1)*(LMAX+2)/2*2, MPI_ETICS_REAL, MPI_SUM, MPI_COMM_WORLD);
#endif
    SendCoeffsToGPU();
    CalculateGravityFromCoefficients(Potential, F);
    ReleaseGpuLock();
}
|
|
|
|
// Store the launch configuration (grid size / block size) of the coefficient kernel (k3*) and
// of the force kernel (k4*). When the coefficient grid size changes, the host and device
// partial-sum buffers are reallocated to hold (LMAX+1) Complex values per block.
void etics::scf::scfclass::SetLaunchConfiguration(int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new) {
    if (k3gs_new != k3gs) {
        if (PartialSum_h != NULL) free(PartialSum_h);
        // Free the DEVICE buffer here. The previous code passed the already-freed host
        // pointer to cudaFree and leaked the device allocation.
        if (PartialSum != NULL) cudaFree(PartialSum);
        PartialSum_h = (Complex*)malloc(k3gs_new*(LMAX+1)*sizeof(Complex));
        cudaMalloc((void**)&PartialSum, k3gs_new*(LMAX+1)*sizeof(Complex));
    }
    k3gs = k3gs_new;
    k3bs = k3bs_new;
    k4gs = k4gs_new;
    k4bs = k4bs_new;
}
|
|
|
|
// Prepare the object for use: run the one-time global initialization if needed, choose the
// launch configuration, and allocate the device cache for up to Nmax particles.
// If any of the four launch parameters is non-positive, all four are replaced by guessed values.
void etics::scf::scfclass::Init(int Nmax, int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new) {
    // First, initialize global arrays if not already initialized.
    if (!etics::scf::ScfGloballyInitialized) etics::scf::GlobalInit();

    // Next, set the launch configuration (either guess it or use what the user specified).
    const bool UserConfigurationComplete = (k3gs_new > 0) && (k3bs_new > 0) && (k4gs_new > 0) && (k4bs_new > 0);
    if (!UserConfigurationComplete) {
        etics::scf::GuessLaunchConfiguration(Nmax, &k3gs_new, &k3bs_new, &k4gs_new, &k4bs_new);
    }
    SetLaunchConfiguration(k3gs_new, k3bs_new, k4gs_new, k4bs_new);

    // Without a positive Nmax there is no cache memory; just null the pointers.
    if (Nmax <= 0) {
        Cache_h.xi = NULL;
        Cache_h.Phi0l = NULL;
        Cache_h.Wprev1 = NULL;
        Cache_h.Wprev2 = NULL;
        Cache_h.costheta = NULL;
        Cache_h.sintheta_I = NULL;
        Cache_h.Exponent = NULL;
        Cache_h.mass = NULL;
        return;
    }

    // If Nmax > 0 then initialize the cache memory: one entry per particle in each array.
    this->Nmax = Nmax;
    const size_t RealArrayBytes = Nmax * sizeof(Real);
    const size_t ComplexArrayBytes = Nmax * sizeof(Complex);
    cudaMalloc((void**)&Cache_h.xi, RealArrayBytes);
    cudaMalloc((void**)&Cache_h.Phi0l, RealArrayBytes);
    cudaMalloc((void**)&Cache_h.Wprev1, RealArrayBytes);
    cudaMalloc((void**)&Cache_h.Wprev2, RealArrayBytes);
    cudaMalloc((void**)&Cache_h.costheta, RealArrayBytes);
    cudaMalloc((void**)&Cache_h.sintheta_I, RealArrayBytes);
    cudaMalloc((void**)&Cache_h.Exponent, ComplexArrayBytes);
    cudaMalloc((void**)&Cache_h.mass, RealArrayBytes);
}
|
|
|
|
// Kernels
|
|
|
|
// Kernel: convert the first N particles of P into the per-particle quantities stored in Cache.
// Positions are shifted by Offset and multiplied by LengthScale first. Each thread starts at
// its global index and advances by the total number of launched threads, so any launch
// configuration covers all N particles.
__global__ void etics::scf::LoadParticlesToCache(Particle *P, int N, vec3 Offset) { // formerly "Kernel1"
    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += blockDim.x * gridDim.x) {
        vec3 Pos = (P[i].pos - Offset)*etics::scf::LengthScale;

        // Radial quantities.
        Real r = sqrt(Pos.x*Pos.x + Pos.y*Pos.y + Pos.z*Pos.z);
        Real xi = (r-1)/(r+1);

        // Polar-angle quantities; sintheta_I holds 1/sin(theta).
        Real costheta = Pos.z/r;
        Real sintheta_I = rsqrt(1-costheta*costheta);

        // (x, -y) scaled to unit length in the xy-plane.
        Real Normal_I = rsqrt(Pos.x*Pos.x + Pos.y*Pos.y);
        Complex Exponent = make_Complex(Pos.x*Normal_I, -Pos.y*Normal_I);

        Cache.xi[i] = xi;
        Cache.Phi0l[i] = 0.5 * (1 - xi); // starting value, advanced later by CalculatePhi0l
        Cache.costheta[i] = costheta;
        Cache.sintheta_I[i] = sintheta_I;
        Cache.Exponent[i] = Exponent;
        Cache.mass[i] = P[i].m;
    }
}
|
|
|
|
// Kernel: advance the cached Phi0l of every particle by one step, multiplying it by
// 0.25*(1 - xi^2). The parameter l is not used by the body.
__global__ void etics::scf::CalculatePhi0l(int N, int l) { // formerly "Kernel2"
    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += blockDim.x * gridDim.x) {
        const Real xi = Cache.xi[i];
        Cache.Phi0l[i] = Cache.Phi0l[i] * (0.25*(1-xi*xi));
    }
}
|
|
|
|
// Kernel: per-block partial sums of the expansion coefficients for one (n, l) pair and every
// m in [0, l].
//
// Launch requirements:
//  - dynamic shared memory for at least (l+1)*blockDim.x Complex values (the host wrapper
//    passes (LMAX+1)*blockDim.x);
//  - PartialSum must hold gridDim.x*(l+1) Complex values; block b writes entries
//    [b*(l+1), b*(l+1)+l];
//  - for a given l the kernel has to be called with n = 0, 1, 2, ... in ascending order,
//    because the recursion for Wnl reads the values that the calls for n-1 and n-2 left in
//    Cache.Wprev1 / Cache.Wprev2.
//
// Any block size is supported; the in-block reduction does not require a power of two.
__global__ void etics::scf::CalculateCoefficientsPartial(int N, int n, int l, Complex *PartialSum) { // formerly "Kernel3"
    extern __shared__ Complex ReductionCache[]; // size is determined in kernel launch
    int tid = threadIdx.x;

    // Slot [m*blockDim.x + tid] is this thread's private accumulator for order m.
    for (int m = 0; m <= l; m++) ReductionCache[m*blockDim.x+tid] = make_Complex(0, 0);

    int i = threadIdx.x + blockIdx.x * blockDim.x;
    while (i < N) {
        Real xi = Cache.xi[i];

        // Radial polynomial Wnl: closed forms for n <= 2, recursion on the cached values otherwise.
        Real Wnl;
        if (n == 0) Wnl = 1;
        else if (n == 1) {Wnl = (4*l+3)*xi; Cache.Wprev2[i] = Wnl;}
        else if (n == 2) {Wnl = -(2*l+1.5)+( 8*l*(l+2) +7.5)*xi*xi; Cache.Wprev1[i] = Wnl;}
        else {
            Real Wprev1 = Cache.Wprev1[i];
            Wnl = (xi*(2*n+4*l+1)*Wprev1 - (n+4*l+1)*Cache.Wprev2[i])/(Real)n;
            if (n < NMAX) { // Writing is expensive, avoid if possible.
                Cache.Wprev2[i] = Wprev1;
                Cache.Wprev1[i] = Wnl;
            }
        }

        Real RadialPart = - Cache.mass[i] * SQRT_4_PI * Cache.Phi0l[i] * Wnl * RadCoeff[(LMAX+1)*n+l];
        Real costheta = Cache.costheta[i];

        // m = 0 contribution.
        Real Plm = Pl(l, costheta);
        ReductionCache[tid] = Complex_add(ReductionCache[tid], make_Complex(RadialPart * Plm * AngCoeff[(l+1)*l/2],0));
        if (l == 0) {i += blockDim.x * gridDim.x; continue;}

        //////////////////////////////// ugly fix
        // Particles very close to the z-axis contribute to m = 0 only.
        if ((costheta < -0.999) || (costheta > +0.999)) {
            i += blockDim.x * gridDim.x;
            continue;
        }
        //////////////////////////////// ugly fix

        // m = 1 contribution.
        Real Plm_prev1 = Plm;
        Real sintheta_I = Cache.sintheta_I[i];
        Plm = (costheta*Plm - Pl(l-1, costheta))*l*sintheta_I;
        Complex Exponent = Cache.Exponent[i];
        Real tmp0 = RadialPart * Plm * AngCoeff[(l+1)*l/2+1];
        ReductionCache[blockDim.x+tid] = Complex_add(ReductionCache[blockDim.x+tid], make_Complex(tmp0 * Exponent.x, tmp0 * Exponent.y));

        if (l == 1) {i += blockDim.x * gridDim.x; continue;}

        // m >= 2 contributions: recursion in m for Plm, repeated multiplication for Exponent^m.
        Complex TorodialPart = Exponent;
        for (int m = 2; m <= l; m++) { // make sure no redundancy at the end of the loop
            Real Plm_prev2 = Plm_prev1;
            Plm_prev1 = Plm;
            Plm = - 2*(m-1)*costheta*sintheta_I*Plm_prev1 - (l+m-1)*(l-m+2)*Plm_prev2;
            TorodialPart = Complex_mul(TorodialPart, Exponent);
            tmp0 = RadialPart * Plm * AngCoeff[(l+1)*l/2+m];
            ReductionCache[m*blockDim.x+tid] = Complex_add(ReductionCache[m*blockDim.x+tid], make_Complex(tmp0 * TorodialPart.x, tmp0 * TorodialPart.y));
        }
        i += blockDim.x * gridDim.x;
    }
    __syncthreads();

    // In-block tree reduction, one order m at a time. Each step folds the upper part of the
    // active range onto the lower part; rounding the half UP keeps the middle element of an
    // odd-sized range alive, so no element is dropped for block sizes such as 96. For
    // power-of-two block sizes the steps are exactly blockDim.x/2, blockDim.x/4, ..., 1.
    // The loop bounds depend only on blockDim.x, so every thread reaches every barrier.
    for (int m = 0; m <= l; m++) {
        int Active = blockDim.x;
        while (Active > 1) {
            int Half = (Active + 1)/2;
            if (tid + Half < Active)
                ReductionCache[m*blockDim.x+tid] = Complex_add(ReductionCache[m*blockDim.x+tid], ReductionCache[m*blockDim.x+tid+Half]);
            __syncthreads();
            Active = Half;
        }
        if (tid == 0)
            PartialSum[blockIdx.x*(l+1) + m] = ReductionCache[m*blockDim.x];
    }
}
|
|
|
|
// Evaluate the expansion at one point.
//
// Template parameter Mode: 0 = both force and potential, 1 = only force, 2 = only potential.
// Parameters:
//   xi, costheta - transformed radial coordinate and cosine of the polar angle of the point;
//   Exponent     - unit-length complex number built from the (x, y) direction of the point;
//   A            - packed coefficient array, indexed through the local macro defined below
//                  (passed as a parameter because it can be either on host or device);
//   F, Potential - outputs; F is written unless Mode == 2, Potential unless Mode == 1.
//
// WARNING !!! This cannot really be a host function because it needs device cache, angular
// coefficients which are on device !!! (In host compilation the names AngCoeff and LengthScale
// are redirected to their host copies by the macros below.)
template<int Mode>
__host__ __device__ void etics::scf::CalculateGravityTemplate(Real xi, Real costheta, Complex Exponent, Complex *A, vec3 *F, Real *Potential) {
#define A(n,l,m) A[n*(LMAX+1)*(LMAX+2)/2 + l*(l+1)/2 + m]
#ifndef __CUDA_ARCH__
#define AngCoeff AngCoeff_h
#define LengthScale LengthScale_h
#endif
    Real dPhiLeft;
    Real dPhiLeftMul;
    Real dPhiRight;
    Real dPhiRightAdd;
    Real dPhi;
    Real RadialPart2;
    Real PlmDerivTheta;

    Real Pot = 0;
    Real Fr = 0, Ftheta = 0, Fphi = 0;
    Real OneOverXiPlusOne = 1/(1+xi);
    Real r_I = (1-xi)*OneOverXiPlusOne;
    Real r = 1/r_I; // It's quite likely we can do everything without r.
    Real sintheta_I = rsqrt(1-costheta*costheta); // faster than using cachei // You sure??? in K3 it's the opposite

    // ExponentTmp[k] = Exponent^(k+1). The array keeps at least one element so that the
    // declaration and the first store stay valid even when LMAX is 0.
    Complex ExponentTmp[(LMAX > 0) ? LMAX : 1];
    ExponentTmp[0] = Exponent;
    for (int m = 1; m < LMAX; m++) ExponentTmp[m] = Complex_mul(ExponentTmp[m-1],Exponent);
    if (Mode != 2) {
        Real xi2 = xi*xi;
        Real xi3 = xi2*xi;
        dPhiLeft = -0.25*OneOverXiPlusOne;
        dPhiLeftMul = 0.25*(1-xi2);
        dPhiRight = xi3 - xi2 - xi + 1;
        dPhiRightAdd = 2*(xi3 - 2*xi2 + xi);
    }
    Real Phi0l = 1/(1+r);
    Real tmp1 = Phi0l*Phi0l*r;
    for (int l = 0; l <= LMAX; l++) {
        if (Mode != 2) {
            if (l > 0) {
                dPhiLeft *= dPhiLeftMul;
                dPhiRight += dPhiRightAdd;
            }
        }
        if (Mode != 2) dPhi = dPhiLeft * dPhiRight;

        // The recursion in n carries the two previous polynomial values from one iteration to
        // the next, so they must live OUTSIDE the n-loop. Declaring them inside the loop body
        // (as before) made every iteration read freshly created, indeterminate objects.
        // They are written at n == 1 and n == 2 before the first read.
        Real Wprev1 = 0, Wprev2 = 0;
        for (int n = 0; n <= NMAX; n++) {
            Real Wnl;
            if (n == 0) Wnl = 1;
            else if (n == 1) {Wnl = (4*l+3)*xi; Wprev2 = Wnl;}
            else if (n == 2) {Wnl = -(2*l+1.5)+( 8*l*(l+2) +7.5)*xi*xi; Wprev1 = Wnl;}
            else {
                Wnl = (xi*(2*n+4*l+1)*Wprev1 - (n+4*l+1)*Wprev2)/(Real)n;
                Wprev2 = Wprev1;
                Wprev1 = Wnl;
            }

            // At this point Wprev2 holds the polynomial value of the previous n (for n >= 2).
            Real Wderiv = 0;
            if (n == 1) {Wderiv = 4*l + 3;}
            else if (n > 1) {
                Wderiv = (-n*xi*Wnl + (n+4*l+2)*Wprev2)/(1-xi*xi);
            } // From an unknown reason it's faster to have this Block separate from the previous one.

            Real RadialPart = - SQRT_4_PI * Phi0l * Wnl;
            if (Mode != 2) RadialPart2 = SQRT_4_PI * (dPhi*Wnl + Phi0l*Wderiv*2/pow(1+r,2));

            // m = 0 term (the only one for l = 0).
            Real Plm = Pl(l, costheta);
            Real tmp2 = Complex_real(A(n,l,0)) * AngCoeff[(l+1)*l/2] * Plm;
            if (Mode != 1) Pot += RadialPart * tmp2;
            if (Mode != 2) Fr += RadialPart2 * tmp2;

            if (l == 0) continue;

            //////////////////////////////// ugly fix
            // Points very close to the z-axis receive the m = 0 terms only.
            if ((costheta < -0.999) || (costheta > +0.999)) {
                continue;
            }
            //////////////////////////////// ugly fix

            // The Block below is l>=1, m=0.
            if (Mode != 2) {
                PlmDerivTheta = (costheta*Plm - Pl(l-1, costheta))*l*sintheta_I; //TODO check if storing Pl(l-1) somewhere makes it faster
                Ftheta += - PlmDerivTheta * AngCoeff[(l+1)*l/2] * Complex_real(A(n,l,0)) * RadialPart * r_I;
            }

            // The Block below is l>=1, m=1.
            if (Mode == 2) PlmDerivTheta = (costheta*Plm - Pl(l-1, costheta))*l*sintheta_I; //TODO see above regarding storing Pl(l-1)
            Real Plm_prev1 = Plm;
            Plm = PlmDerivTheta; // PlmDerivTheta equals Plm for m=1.
            if (Mode != 2) PlmDerivTheta = - Plm*costheta*sintheta_I - l*(l+1)*Plm_prev1;
            tmp2 = 2 * AngCoeff[(l+1)*l/2+1];
            Complex tmp3 = Complex_mul(ExponentTmp[0], A(n,l,1));
            Complex tmp4 = make_Complex(tmp2 * tmp3.x, tmp2 * tmp3.y);
            Complex tmp5 = make_Complex(Plm * tmp4.x, Plm * tmp4.y);
            Complex tmp6 = make_Complex(RadialPart * tmp5.x, RadialPart * tmp5.y);
            if (Mode != 1) Pot += Complex_real(tmp6);
            if (Mode != 2) {
                Fr += RadialPart2 * Complex_real(tmp5);
                Fphi += Complex_imag(tmp6) * sintheta_I * r_I;
                Ftheta += - RadialPart * PlmDerivTheta * Complex_real(tmp4) * r_I;
            }

            if (l == 1) continue;

            // The loop below is l>=2, m>=2.
            for (int m = 2; m <= l; m++) {
                Real Plm_prev2 = Plm_prev1;
                Plm_prev1 = Plm;
                Plm = - 2*(m-1)*costheta*sintheta_I*Plm_prev1 - (l+m-1)*(l-m+2)*Plm_prev2;
                tmp2 = 2 * AngCoeff[(l+1)*l/2+m];
                tmp3 = Complex_mul(ExponentTmp[m-1], A(n,l,m));
                tmp4 = make_Complex(tmp2 * tmp3.x, tmp2 * tmp3.y);
                tmp5 = make_Complex(Plm * tmp4.x, Plm * tmp4.y);
                tmp6 = make_Complex(RadialPart * tmp5.x, RadialPart * tmp5.y);
                if (Mode != 1) Pot += Complex_real(tmp6);
                if (Mode != 2) {
                    PlmDerivTheta = - m*Plm*costheta*sintheta_I - (l+m)*(l-m+1)*Plm_prev1;
                    Fr += RadialPart2 * Complex_real(tmp5);
                    Fphi += m * Complex_imag(tmp6) * sintheta_I * r_I;
                    Ftheta += - RadialPart * PlmDerivTheta * Complex_real(tmp4) * r_I;
                }
            }
        }
        Phi0l *= tmp1;
    }

    if (Mode != 2) {
        // Convert the spherical force components (Fr, Ftheta, Fphi) to Cartesian ones.
        Real sintheta = 1/sintheta_I;
        Real tanphi = Exponent.y/Exponent.x;
        Real cosphi = ((Exponent.x >= 0)?(+1):(-1)) * rsqrt(1+tanphi*tanphi); // no simpler way to get sign bit?
        Real sinphi = tanphi*cosphi;
        *F = vec3(sintheta*cosphi*Fr + costheta*cosphi*Ftheta - sinphi*Fphi, sintheta*sinphi*Fr + costheta*sinphi*Ftheta + cosphi*Fphi, costheta*Fr - sintheta*Ftheta);
        *F = (*F)*LengthScale*LengthScale;
    }
    if (Mode != 1) *Potential = Pot*LengthScale;
#ifndef __CUDA_ARCH__
#undef AngCoeff
#undef LengthScale
#endif
#undef A
}
|
|
|
|
// Kernel: evaluate potential and force for each of the N cached particles, writing the results
// to Potential[i] and F[i]. The cached Exponent is conjugated before it is passed on. Each
// thread starts at its global index and advances by the total number of launched threads.
__global__ void etics::scf::CalculateGravityFromCoefficients(int N, Real *Potential, vec3 *F) { // formerly "Kernel4"
    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += blockDim.x * gridDim.x) {
        CalculateGravityTemplate<0>(Cache.xi[i], Cache.costheta[i], Complex_conj(Cache.Exponent[i]), A, &F[i], &Potential[i]);
    }
}
|
|
|
|
// Global initialization
|
|
// One-time global initialization: report which process uses which GPU, compute the radial and
// angular coefficient tables on the host, upload them to constant memory, and set the length
// scale to 1. Sets ScfGloballyInitialized when done.
void etics::scf::GlobalInit() {
    int pid = getpid();
    int devid = -1; // stays -1 in the message if the device query fails
    cudaGetDevice(&devid);

    // gethostname may leave the buffer unterminated when the name is truncated, and leaves it
    // untouched on failure; start from an empty string and force termination.
    char hostname[256] = "";
    gethostname(hostname, sizeof(hostname));
    hostname[sizeof(hostname)-1] = '\0';

    // The buffer is larger than the longest possible hostname plus the fixed text, and
    // snprintf bounds the write in any case.
    char output_string[512];
    snprintf(output_string, sizeof(output_string), "ETICS: pid %05d using GPU device number %d on %s", pid, devid, hostname);
    std::cout << output_string << std::endl;

    // Host tables first, then their constant-memory copies.
    RadialCoefficients(etics::scf::RadCoeff_h);
    cudaMemcpyToSymbol(etics::scf::RadCoeff, etics::scf::RadCoeff_h, (NMAX+1)*(LMAX+1) * sizeof(Real));
    AngularCoefficients(etics::scf::AngCoeff_h);
    cudaMemcpyToSymbol(etics::scf::AngCoeff, etics::scf::AngCoeff_h, (LMAX+1)*(LMAX+2)/2 * sizeof(Real));

    etics::scf::LengthScale_h = 1;
    cudaMemcpyToSymbol(etics::scf::LengthScale, &etics::scf::LengthScale_h, sizeof(Real));
    ScfGloballyInitialized = true;
}
|
|
|
|
// Change the global length scale: update the host copy and upload it to constant memory.
void etics::scf::SetLengthScale(Real NewLengthScale) {
    Real &HostCopy = etics::scf::LengthScale_h;
    HostCopy = NewLengthScale;
    cudaMemcpyToSymbol(etics::scf::LengthScale, &HostCopy, sizeof(Real));
}
|
|
|
|
// Old API
|
|
// Ask the CUDA occupancy API for a launch configuration of the coefficient kernel (k3*) and of
// the force kernel (k4*) and return the four values through the output pointers.
// For the coefficient kernel the suggested minimum grid size is used as the grid size; for the
// force kernel the grid size is the number of blocks needed to give each of the N particles
// its own thread.
void etics::scf::GuessLaunchConfiguration(int N, int *k3gs_new, int *k3bs_new, int *k4gs_new, int *k4bs_new) {
    int SuggestedBlockSize;
    int SuggestedMinGridSize;

    //WARNING setting blockSizeLimit=128 for cudaOccupancyMaxPotentialBlockSizeVariableSMem.
    cudaOccupancyMaxPotentialBlockSizeVariableSMem(&SuggestedMinGridSize, &SuggestedBlockSize, CalculateCoefficientsPartial, blockSizeToDynamicSMemSize, 128);
    *k3gs_new = SuggestedMinGridSize;
    *k3bs_new = SuggestedBlockSize;

    // No dynamic shared memory; N is passed as the block size limit.
    cudaOccupancyMaxPotentialBlockSize(&SuggestedMinGridSize, &SuggestedBlockSize, CalculateGravityFromCoefficients, 0, N);
    *k4gs_new = (N + SuggestedBlockSize - 1) / SuggestedBlockSize;
    *k4bs_new = SuggestedBlockSize;
}
|
|
|
|
// Old API: initialize the file-level solver object SCFObject with the given arguments.
void etics::scf::Init(int N, int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new) { // to be removed!!
    scfclass &GlobalSolver = etics::scf::SCFObject;
    GlobalSolver.Init(N, k3gs_new, k3bs_new, k4gs_new, k4bs_new);
}
|
|
|
|
// Old API: run the full force calculation on the file-level solver object SCFObject.
void etics::scf::CalculateGravity(Particle *P, int N, Real *Potential, vec3 *F) {
    scfclass &GlobalSolver = etics::scf::SCFObject;
    GlobalSolver.CalculateGravity(P, N, Potential, F);
}
|