/** * @file * @author Yohai Meiron * @brief Functions to calculate gravitational force using the SCF method. */ #include "common.hpp" #include "mathaux.hpp" #include "scf.hpp" #include using std::cout; using std::cerr; using std::endl; #include #include #include #include #include // Use the MPI flag only for the standalone program #ifdef ETICS_MPI #include #endif namespace etics { namespace scf { scfclass SCFObject; // These are truly global variables that are initialized once and can be read by every scfclass object. bool ScfGloballyInitialized = false; __constant__ Real RadCoeff[(NMAX+1)*(LMAX+1)]; __constant__ Real AngCoeff[(LMAX+1)*(LMAX+2)/2]; Real RadCoeff_h[(NMAX+1)*(LMAX+1)]; Real AngCoeff_h[(LMAX+1)*(LMAX+2)/2]; __constant__ Real LengthScale; Real LengthScale_h; // These are global variables that are read/written by each scfclass object; they must be locked before use. __constant__ Complex A[(NMAX+1)*(LMAX+1)*(LMAX+2)/2]; __constant__ CacheStruct Cache; // These variables provide the mechanism to lock and release the global constant memory variables above. scfclass *GpuLockOwner = NULL; #define GPU_LOCK_PANIC \ { \ std::cerr << "Error: GPU lock failed; lock not owned by this object!" << endl; \ exit(1); \ } } } namespace etics { namespace scf { int blockSizeToDynamicSMemSize(int BlockSize) { // Should be a lambda function return (LMAX+1)*sizeof(Complex)*BlockSize; } } } // Class implementation etics::scf::scfclass::scfclass() { N=0; Nmax=0; k3gs=-1; k3bs=-1; k4gs=-1; k4bs=-1; PartialSum = NULL; PartialSum_h = NULL; } etics::scf::scfclass::~scfclass() { if (etics::scf::GpuLockOwner == this) ReleaseGpuLock(); cudaFree(Cache_h.xi); cudaFree(Cache_h.Phi0l); cudaFree(Cache_h.Wprev1); cudaFree(Cache_h.Wprev2); cudaFree(Cache_h.costheta); cudaFree(Cache_h.sintheta_I); cudaFree(Cache_h.Exponent); cudaFree(Cache_h.mass); free(PartialSum_h); cudaFree(PartialSum); } void etics::scf::scfclass::GetGpuLock() { if (etics::scf::GpuLockOwner == this) return; if (etics::scf::GpuLockOwner == NULL) etics::scf::GpuLockOwner = this; else GPU_LOCK_PANIC; } void etics::scf::scfclass::ReleaseGpuLock() { if (etics::scf::GpuLockOwner == NULL) return; if (etics::scf::GpuLockOwner == this) etics::scf::GpuLockOwner = NULL; else GPU_LOCK_PANIC; } int etics::scf::scfclass::GetNMAX() {return NMAX;} int etics::scf::scfclass::GetLMAX() {return LMAX;} void etics::scf::scfclass::CalculateCoefficients(int n, int l) { int BaseAddress = n*(LMAX+1)*(LMAX+2)/2 + l*(l+1)/2; etics::scf::CalculateCoefficientsPartial<<>>(N, n, l, PartialSum); cudaMemcpy(PartialSum_h, PartialSum, k3gs*(l+1)*sizeof(Complex), cudaMemcpyDeviceToHost); for (int m = 0; m <= l; m++) for (int Block=0; Block 0) etics::scf::CalculatePhi0l<<<128,128>>>(N, l); // wouldn't it make sense just putting it after the n-loop finishes? Probably not becasue then we need to skip at the last iter for (int n = 0; n <= NMAX; n++) { CalculateCoefficients(n, l); } } } void etics::scf::scfclass::GetCoefficients(Complex *A) { std::copy(A_h, A_h+(NMAX+1)*(LMAX+1)*(LMAX+2)/2, A); } void etics::scf::scfclass::SendCoeffsToGPU() { if (etics::scf::GpuLockOwner != this) GPU_LOCK_PANIC; cudaMemcpyToSymbol(etics::scf::A, A_h, (NMAX+1)*(LMAX+1)*(LMAX+2)/2 * sizeof(Complex)); } void etics::scf::scfclass::SendCachePointersToGPU() { if (etics::scf::GpuLockOwner != this) GPU_LOCK_PANIC; cudaMemcpyToSymbol(etics::scf::Cache, &Cache_h, sizeof(etics::scf::CacheStruct)); } void etics::scf::scfclass::LoadParticlesToCache(Particle *P, int N, vec3 Offset) { if (N > Nmax) { fprintf(stderr, "You tried to load too many particles! (%d > %d)", N, Nmax); exit(1); } this->N = N; etics::scf::LoadParticlesToCache<<<128,128>>>(P, N, Offset); } void etics::scf::scfclass::CalculateGravityFromCoefficients(Real *Potential, vec3 *F) { etics::scf::CalculateGravityFromCoefficients<<<(N+k4bs-1)/k4bs,k4bs>>>(N, Potential, F); } void etics::scf::scfclass::CalculateGravityAtPoint(vec3 Pos, Real *Potential, vec3 *F) { static bool first = true; if (first) { printf("WARNING!!! the code has been modified and CalculateGravityAtPoint may not work as planned.\n"); first = false; } Real r = sqrt(Pos.x*Pos.x + Pos.y*Pos.y + Pos.z*Pos.z); Real xi = (r-1)/(r+1); Real costheta = Pos.z/r; Real Normal_I = rsqrt(Pos.x*Pos.x + Pos.y*Pos.y); Complex Exponent = make_Complex(Pos.x*Normal_I, Pos.y*Normal_I); etics::scf::CalculateGravityTemplate<0>(xi, costheta, Exponent, A_h, F, Potential); } void etics::scf::scfclass::CalculateGravity(Particle *P, int N, Real *Potential, vec3 *F) { GetGpuLock(); SendCachePointersToGPU(); LoadParticlesToCache(P, N); CalculateCoefficients(); #ifdef ETICS_MPI Complex ATotal[(NMAX+1)*(LMAX+1)*(LMAX+2)/2]; MPI_Allreduce(&A_h, &ATotal, (NMAX+1)*(LMAX+1)*(LMAX+2)/2*2, MPI_ETICS_REAL, MPI_SUM, MPI_COMM_WORLD); std::copy(ATotal, ATotal+(NMAX+1)*(LMAX+1)*(LMAX+2)/2, A_h); // not really need this copy, just calculate the coefficients in some local array, then sum it into a global array (A_h or somthing) and copy it to GPUs #endif SendCoeffsToGPU(); CalculateGravityFromCoefficients(Potential, F); ReleaseGpuLock(); } void etics::scf::scfclass::SetLaunchConfiguration(int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new) { if (k3gs_new != k3gs) { if (PartialSum_h != NULL) free(PartialSum_h); if (PartialSum != NULL) cudaFree(PartialSum_h); PartialSum_h = (Complex*)malloc(k3gs_new*(LMAX+1)*sizeof(Complex)); cudaMalloc((void**)&PartialSum, k3gs_new*(LMAX+1)*sizeof(Complex)); } k3gs = k3gs_new; k3bs = k3bs_new; k4gs = k4gs_new; k4bs = k4bs_new; } void etics::scf::scfclass::Init(int Nmax, int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new) { // First, initialize global arrays if not already initialized. if (!etics::scf::ScfGloballyInitialized) etics::scf::GlobalInit(); // Next, set the launch configuation (either guess it or use what the user specified). if ((k3gs_new<=0) || (k3bs_new<=0) || (k4gs_new<=0) || (k4bs_new<=0)) { int k3gs_new, k3bs_new, k4gs_new, k4bs_new; etics::scf::GuessLaunchConfiguration(Nmax, &k3gs_new, &k3bs_new, &k4gs_new, &k4bs_new); SetLaunchConfiguration(k3gs_new, k3bs_new, k4gs_new, k4bs_new); } else SetLaunchConfiguration(k3gs_new, k3bs_new, k4gs_new, k4bs_new); // If Nmax > 0 then initialize the cahche memory. if (Nmax > 0) { this->Nmax = Nmax; cudaMalloc((void**)&Cache_h.xi, Nmax * sizeof(Real)); cudaMalloc((void**)&Cache_h.Phi0l, Nmax * sizeof(Real)); cudaMalloc((void**)&Cache_h.Wprev1, Nmax * sizeof(Real)); cudaMalloc((void**)&Cache_h.Wprev2, Nmax * sizeof(Real)); cudaMalloc((void**)&Cache_h.costheta, Nmax * sizeof(Real)); cudaMalloc((void**)&Cache_h.sintheta_I, Nmax * sizeof(Real)); cudaMalloc((void**)&Cache_h.Exponent, Nmax * sizeof(Complex)); cudaMalloc((void**)&Cache_h.mass, Nmax * sizeof(Real)); } else { Cache_h.xi = NULL; Cache_h.Phi0l = NULL; Cache_h.Wprev1 = NULL; Cache_h.Wprev2 = NULL; Cache_h.costheta = NULL; Cache_h.sintheta_I = NULL; Cache_h.Exponent = NULL; Cache_h.mass = NULL; } } // Kernels __global__ void etics::scf::LoadParticlesToCache(Particle *P, int N, vec3 Offset) { // formerly "Kernel1" int i = threadIdx.x + blockIdx.x * blockDim.x; while (i < N) { vec3 Pos = (P[i].pos - Offset)*etics::scf::LengthScale; Real r = sqrt(Pos.x*Pos.x + Pos.y*Pos.y + Pos.z*Pos.z); Real xi = (r-1)/(r+1); Real costheta = Pos.z/r; Real sintheta_I = rsqrt(1-costheta*costheta); Cache.xi[i] = xi; Cache.Phi0l[i] = 0.5 * (1 - xi); Cache.costheta[i] = costheta; Cache.sintheta_I[i] = sintheta_I; Real Normal_I = rsqrt(Pos.x*Pos.x + Pos.y*Pos.y); Complex Exponent = make_Complex(Pos.x*Normal_I, -Pos.y*Normal_I); Cache.Exponent[i] = Exponent; Cache.mass[i] = P[i].m; i += blockDim.x * gridDim.x; } } __global__ void etics::scf::CalculatePhi0l(int N, int l) { // formerly "Kernel2" int i = threadIdx.x + blockIdx.x * blockDim.x; while (i < N) { Real xi = Cache.xi[i]; Cache.Phi0l[i] *= 0.25*(1-xi*xi); i += blockDim.x * gridDim.x; } } __global__ void etics::scf::CalculateCoefficientsPartial(int N, int n, int l, Complex *PartialSum) { // formerly "Kernel3" extern __shared__ Complex ReductionCache[]; // size is determined in kernel launch int tid = threadIdx.x; for (int m = 0; m <= l; m++) ReductionCache[m*blockDim.x+tid] = make_Complex(0, 0); int i = threadIdx.x + blockIdx.x * blockDim.x; while (i < N) { Real xi = Cache.xi[i]; Real Wnl; if (n == 0) Wnl = 1; else if (n == 1) {Wnl = (4*l+3)*xi; Cache.Wprev2[i] = Wnl;} else if (n == 2) {Wnl = -(2*l+1.5)+( 8*l*(l+2) +7.5)*xi*xi; Cache.Wprev1[i] = Wnl;} else { Real Wprev1 = Cache.Wprev1[i]; Wnl = (xi*(2*n+4*l+1)*Wprev1 - (n+4*l+1)*Cache.Wprev2[i])/(Real)n; if (n < NMAX) { // Writing is expensive, avoid if possible. Cache.Wprev2[i] = Wprev1; Cache.Wprev1[i] = Wnl; } } Real RadialPart = - Cache.mass[i] * SQRT_4_PI * Cache.Phi0l[i] * Wnl * RadCoeff[(LMAX+1)*n+l]; Real costheta = Cache.costheta[i]; Real Plm = Pl(l, costheta); ReductionCache[tid] = Complex_add(ReductionCache[tid], make_Complex(RadialPart * Plm * AngCoeff[(l+1)*l/2],0)); if (l == 0) {i += blockDim.x * gridDim.x; continue;} //////////////////////////////// ugly fix if ((costheta < -0.999) || (costheta > +0.999)) { i += blockDim.x * gridDim.x; continue; } //////////////////////////////// ugly fix Real Plm_prev1 = Plm; Real sintheta_I = Cache.sintheta_I[i]; Plm = (costheta*Plm - Pl(l-1, costheta))*l*sintheta_I; Complex Exponent = Cache.Exponent[i]; Real tmp0 = RadialPart * Plm * AngCoeff[(l+1)*l/2+1]; ReductionCache[blockDim.x+tid] = Complex_add(ReductionCache[blockDim.x+tid], make_Complex(tmp0 * Exponent.x, tmp0 * Exponent.y)); if (l == 1) {i += blockDim.x * gridDim.x; continue;} Complex TorodialPart = Exponent; for (int m = 2; m <= l; m++) { // make sure no redundancy at the end of the loop Real Plm_prev2 = Plm_prev1; Plm_prev1 = Plm; Plm = - 2*(m-1)*costheta*sintheta_I*Plm_prev1 - (l+m-1)*(l-m+2)*Plm_prev2; TorodialPart = Complex_mul(TorodialPart, Exponent); tmp0 = RadialPart * Plm * AngCoeff[(l+1)*l/2+m]; ReductionCache[m*blockDim.x+tid] = Complex_add(ReductionCache[m*blockDim.x+tid], make_Complex(tmp0 * TorodialPart.x, tmp0 * TorodialPart.y)); } i += blockDim.x * gridDim.x; } __syncthreads(); for (int m = 0; m <= l; m++) { i = blockDim.x/2; while (i != 0) { if (tid < i) ReductionCache[m*blockDim.x+tid] = Complex_add(ReductionCache[m*blockDim.x+tid], ReductionCache[m*blockDim.x+tid+i]); __syncthreads(); i /= 2; } if (tid == 0) PartialSum[blockIdx.x*(l+1) + m] = ReductionCache[m*blockDim.x]; } } template __host__ __device__ void etics::scf::CalculateGravityTemplate(Real xi, Real costheta, Complex Exponent, Complex *A, vec3 *F, Real *Potential) { // it gets A as parameter because it can be either on host or device // 0 = both force and potential, 1 = only force, 2 = only pot //WARNING !!! This cannot really be a host function because it needs device cahce, angular coefficients which are on device !!! #define A(n,l,m) A[n*(LMAX+1)*(LMAX+2)/2 + l*(l+1)/2 + m] #ifndef __CUDA_ARCH__ #define AngCoeff AngCoeff_h #define LengthScale LengthScale_h #endif Real dPhiLeft; Real dPhiLeftMul; Real dPhiRight; Real dPhiRightAdd; Real dPhi; Real RadialPart2; Real PlmDerivTheta; Real Pot = 0; Real Fr = 0, Ftheta = 0, Fphi = 0; Real OneOverXiPlusOne = 1/(1+xi); Real r_I = (1-xi)*OneOverXiPlusOne; Real r = 1/r_I; // It's quite likely we can do everything without r. Real sintheta_I = rsqrt(1-costheta*costheta); // faster than using cachei // You sure??? in K3 it's the opposite Complex ExponentTmp[LMAX]; ExponentTmp[0] = Exponent; for (int m = 1; m < LMAX; m++) ExponentTmp[m] = Complex_mul(ExponentTmp[m-1],Exponent); if (Mode != 2) { Real xi2 = xi*xi; Real xi3 = xi2*xi; dPhiLeft = -0.25*OneOverXiPlusOne; dPhiLeftMul = 0.25*(1-xi2); dPhiRight = xi3 - xi2 - xi + 1; dPhiRightAdd = 2*(xi3 - 2*xi2 + xi); } Real Phi0l = 1/(1+r); Real tmp1 = Phi0l*Phi0l*r; for (int l = 0; l <= LMAX; l++) { if (Mode != 2) { if (l > 0) { dPhiLeft *= dPhiLeftMul; dPhiRight += dPhiRightAdd; } } if (Mode != 2) dPhi = dPhiLeft * dPhiRight; for (int n = 0; n <= NMAX; n++) { Real Wnl, Wprev1, Wprev2; if (n == 0) Wnl = 1; else if (n == 1) {Wnl = (4*l+3)*xi; Wprev2 = Wnl;} else if (n == 2) {Wnl = -(2*l+1.5)+( 8*l*(l+2) +7.5)*xi*xi; Wprev1 = Wnl;} else { Wnl = (xi*(2*n+4*l+1)*Wprev1 - (n+4*l+1)*Wprev2)/(Real)n; Wprev2 = Wprev1; Wprev1 = Wnl; } Real Wderiv = 0; if (n == 1) {Wderiv = 4*l + 3;} else if (n > 1) { Wderiv = (-n*xi*Wnl + (n+4*l+2)*Wprev2)/(1-xi*xi); } // From an unknown reason it's faster to have this Block separate from the previous one. Real RadialPart = - SQRT_4_PI * Phi0l * Wnl; if (Mode != 2) RadialPart2 = SQRT_4_PI * (dPhi*Wnl + Phi0l*Wderiv*2/pow(1+r,2)); Real Plm = Pl(l, costheta); Real tmp2 = Complex_real(A(n,l,0)) * AngCoeff[(l+1)*l/2] * Plm; if (Mode != 1) Pot += RadialPart * tmp2; if (Mode != 2) Fr += RadialPart2 * tmp2; if (l == 0) continue; //////////////////////////////// ugly fix if ((costheta < -0.999) || (costheta > +0.999)) { continue; } //////////////////////////////// ugly fix // The Block below is l>=1, m=0. if (Mode != 2) { PlmDerivTheta = (costheta*Plm - Pl(l-1, costheta))*l*sintheta_I; //TODO check if storing Pl(l-1) somewhere makes it faster Ftheta += - PlmDerivTheta * AngCoeff[(l+1)*l/2] * Complex_real(A(n,l,0)) * RadialPart * r_I; } // The Block below is l>=1, m=1. if (Mode == 2) PlmDerivTheta = (costheta*Plm - Pl(l-1, costheta))*l*sintheta_I; //TODO see above regarding storing Pl(l-1) Real Plm_prev1 = Plm; Plm = PlmDerivTheta; // PlmDerivTheta equals Plm for m=1. if (Mode != 2) PlmDerivTheta = - Plm*costheta*sintheta_I - l*(l+1)*Plm_prev1; tmp2 = 2 * AngCoeff[(l+1)*l/2+1]; Complex tmp3 = Complex_mul(ExponentTmp[0], A(n,l,1)); Complex tmp4 = make_Complex(tmp2 * tmp3.x, tmp2 * tmp3.y); Complex tmp5 = make_Complex(Plm * tmp4.x, Plm * tmp4.y); Complex tmp6 = make_Complex(RadialPart * tmp5.x, RadialPart * tmp5.y); if (Mode != 1) Pot += Complex_real(tmp6); if (Mode != 2) { Fr += RadialPart2 * Complex_real(tmp5); Fphi += Complex_imag(tmp6) * sintheta_I * r_I; Ftheta += - RadialPart * PlmDerivTheta * Complex_real(tmp4) * r_I; } if (l == 1) continue; for (int m = 2; m <= l; m++) { Real Plm_prev2 = Plm_prev1; Plm_prev1 = Plm; Plm = - 2*(m-1)*costheta*sintheta_I*Plm_prev1 - (l+m-1)*(l-m+2)*Plm_prev2; tmp2 = 2 * AngCoeff[(l+1)*l/2+m]; tmp3 = Complex_mul(ExponentTmp[m-1], A(n,l,m)); tmp4 = make_Complex(tmp2 * tmp3.x, tmp2 * tmp3.y); tmp5 = make_Complex(Plm * tmp4.x, Plm * tmp4.y); tmp6 = make_Complex(RadialPart * tmp5.x, RadialPart * tmp5.y); if (Mode != 1) Pot += Complex_real(tmp6); if (Mode != 2) { PlmDerivTheta = - m*Plm*costheta*sintheta_I - (l+m)*(l-m+1)*Plm_prev1; Fr += RadialPart2 * Complex_real(tmp5); Fphi += m * Complex_imag(tmp6) * sintheta_I * r_I; Ftheta += - RadialPart * PlmDerivTheta * Complex_real(tmp4) * r_I; } } } Phi0l *= tmp1; } if (Mode != 2) { Real sintheta = 1/sintheta_I; Real tanphi = Exponent.y/Exponent.x; Real cosphi = ((Exponent.x >= 0)?(+1):(-1)) * rsqrt(1+tanphi*tanphi); // no simpler way to get sign bit? Real sinphi = tanphi*cosphi; *F = vec3(sintheta*cosphi*Fr + costheta*cosphi*Ftheta - sinphi*Fphi, sintheta*sinphi*Fr + costheta*sinphi*Ftheta + cosphi*Fphi, costheta*Fr - sintheta*Ftheta); *F = (*F)*LengthScale*LengthScale; } if (Mode != 1) *Potential = Pot*LengthScale; #ifndef __CUDA_ARCH__ #undef AngCoeff #undef LengthScale #endif #undef A } __global__ void etics::scf::CalculateGravityFromCoefficients(int N, Real *Potential, vec3 *F) { // formerly "Kernel4" int i = threadIdx.x + blockIdx.x * blockDim.x; while (i < N) { CalculateGravityTemplate<0>(Cache.xi[i], Cache.costheta[i], Complex_conj(Cache.Exponent[i]), A, &F[i], &Potential[i]); i += blockDim.x * gridDim.x; } } // Global initialization void etics::scf::GlobalInit() { int pid = getpid(); int devid; cudaGetDevice(&devid); char hostname[256]; gethostname(hostname, 256); char output_string[256]; sprintf(output_string, "ETICS: pid %05d using GPU device number %d on %s", pid, devid, hostname); std::cout << output_string << std::endl; RadialCoefficients(etics::scf::RadCoeff_h); cudaMemcpyToSymbol(etics::scf::RadCoeff, etics::scf::RadCoeff_h, (NMAX+1)*(LMAX+1) * sizeof(Real)); AngularCoefficients(etics::scf::AngCoeff_h); cudaMemcpyToSymbol(etics::scf::AngCoeff, etics::scf::AngCoeff_h, (LMAX+1)*(LMAX+2)/2 * sizeof(Real)); etics::scf::LengthScale_h = 1; cudaMemcpyToSymbol(etics::scf::LengthScale, &etics::scf::LengthScale_h, sizeof(Real)); ScfGloballyInitialized = true; } void etics::scf::SetLengthScale(Real NewLengthScale) { etics::scf::LengthScale_h = NewLengthScale; cudaMemcpyToSymbol(etics::scf::LengthScale, &etics::scf::LengthScale_h, sizeof(Real)); } // Old API void etics::scf::GuessLaunchConfiguration(int N, int *k3gs_new, int *k3bs_new, int *k4gs_new, int *k4bs_new) { int blockSize; int minGridSize; int gridSize; //WARNING setting blockSizeLimit=128 for cudaOccupancyMaxPotentialBlockSizeVariableSMem. cudaOccupancyMaxPotentialBlockSizeVariableSMem(&minGridSize, &blockSize, CalculateCoefficientsPartial, blockSizeToDynamicSMemSize, 128); gridSize = minGridSize; *k3gs_new = gridSize; *k3bs_new = blockSize; cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, CalculateGravityFromCoefficients, 0, N); gridSize = (N + blockSize - 1) / blockSize; *k4gs_new = gridSize; *k4bs_new = blockSize; } void etics::scf::Init(int N, int k3gs_new, int k3bs_new, int k4gs_new, int k4bs_new) { // to be removed!! SCFObject.Init(N, k3gs_new, k3bs_new, k4gs_new, k4bs_new); } void etics::scf::CalculateGravity(Particle *P, int N, Real *Potential, vec3 *F) { SCFObject.CalculateGravity(P, N, Potential, F); }