phigrape/phigrape.cpp

/*****************************************************************************
File Name      : "phi-GRAPE/GPU.c"              // BH (1 || 2) + ACC + EJECT
                :
Contents       : N-body code with integration by individual block time step
                : together with the parallel using of GRAPE6a board's.
                :
                : Added the GPU support via SAPPORO library.
                :
                : Normalization to the physical units!!!
                :
                : External Potential added
                : Plummer-Kuzmin: Bulge, Disk, Halo
                : Kharchenko+Andreas...
                :
                : SC extra POT for Bek SC test runs...
                :
                : Rebuced to the Single BH -> Plummer
                : Andreas+Fazeel...
                :
                : Stellar evolution added
                : Stellar lifetimes: Raiteri, Villata & Navarro (1996)
                : IMS mass loss: van den Hoeg & Groenewegen (1997)
                :
                : STARDESTR_EXT: Tidal disruption of stars by external BH...
                : Chingis, Denis & Maxim...
                :
                : STARDESTR: Tidal disruption of stars by BH...
                : Jose, Li Shuo & Shiyan Zhong
                :
                : STARDISK: Drag force...
                : Chingis, Denis & Maxim...
                :
                : STARDISK: variable hz = HZ*(R/R_crit) up to R_crit...
                : Taras, Andreas...
                :
                : Live BH (1 || 2) + ACC + EJECT...
                : Li Shuo & Shiyan Zhong
                :
                : dt_min for BH (1 || 2)...
                :
                : added the PN calculus for the BBH
                : PN0, PN1, PN2, PN2.5 (coded on the base of
                : Gabor Kupi original routine)
                :
                : added the "name" array...
                :
                : added the GMC's calculus (GMC on CPU; GMC2 on GPU)
                : for Alexey SC runs... and also for Fazeel Zurich runs...
                :
                : CPU_TIMELIMIT added for the Julich MW cluster runs...
                :
Coded by       : Peter Berczik
Version number : 19.04
Last redaction : 2019.04.16 12:55
*****************************************************************************/
#define TIMING

#define ETA_S_CORR   4.0
#define ETA_BH_CORR  4.0

#define DTMAXPOWER  -3.0
#define DTMINPOWER -36.0

#include <algorithm>
#include <math.h>
#include <mpi.h>
#include <numeric>
#include <stdexcept>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <time.h>

#include "black_holes.h"
#include "config.h"
#include "double3.h"
#include "external.h"
#include "grape6.h"
#include "io.h"

#ifdef ETICS
#include "grapite.h"
//#define ACT_DEF_GRAPITE
#endif
const bool act_def_grapite = true;

Config *config;

// These are used in the energy control, could be static but will probably be removed in the end anyway
double CPU_time_real0, CPU_time_user0, CPU_time_syst0;
double CPU_time_real,  CPU_time_user,  CPU_time_syst;

#ifdef TIMING
// TODO clean up here
double CPU_tmp_real0, CPU_tmp_user0, CPU_tmp_syst0;
double CPU_tmp_real,  CPU_tmp_user,  CPU_tmp_syst;

double DT_TOT,
    DT_ACT_DEF1, DT_ACT_DEF2, DT_ACT_DEF3, DT_ACT_PRED,
    DT_ACT_GRAV, DT_EXT_GRAV,
    DT_GMC_GRAV, DT_GMC_GMC_GRAV, DT_EXT_GMC_GRAV,
    DT_ACT_CORR, DT_ACT_LOAD,
    DT_STEVOL, DT_STARDISK, DT_STARDESTR;
double DT_ACT_REDUCE;
#endif

void get_CPU_time(double *time_real, double *time_user, double *time_syst)
{
    struct rusage xxx;
    double sec_u, microsec_u, sec_s, microsec_s;
    struct timeval tv;

    getrusage(RUSAGE_SELF,&xxx);

    sec_u = xxx.ru_utime.tv_sec;
    sec_s = xxx.ru_stime.tv_sec;

    microsec_u = xxx.ru_utime.tv_usec;
    microsec_s = xxx.ru_stime.tv_usec;

    *time_user = sec_u + microsec_u * 1.0E-06;
    *time_syst = sec_s + microsec_s * 1.0E-06;

    gettimeofday(&tv, NULL);
    *time_real = tv.tv_sec + 1.0E-06 * tv.tv_usec;

    *time_user = *time_real;
}

class Calc_self_grav {
public:
    Calc_self_grav(const int N, const int n_loc, const int clusterid, const int npipe, const double eps)
        : g6_calls(0), n_loc(n_loc), clusterid(clusterid), npipe(npipe), eps2(eps*eps)
    {
        h2.assign(N, eps2);
    }
    void operator()(const double t, const int n_act, int ind_act[], const double3 x_act[], const double3 v_act[],
              double pot[], double3 acc[], double3 jrk[])
    {
        g6_set_ti(clusterid, t);
        for (int i=0; i<n_act; i+=npipe) {
            int nn = npipe;
            if (n_act-i < npipe) nn = n_act - i;
            //TODO any way we can clean up this ugly casting?
            g6calc_firsthalf(clusterid, n_loc, nn, ind_act+i, (double(*)[3])x_act+i, (double(*)[3])v_act+i, (double(*)[3])acc+i, (double(*)[3])jrk+i, pot+i, eps2, h2.data());
            g6calc_lasthalf( clusterid, n_loc, nn, ind_act+i, (double(*)[3])x_act+i, (double(*)[3])v_act+i, eps2, h2.data(), (double(*)[3])acc+i, (double(*)[3])jrk+i, pot+i);
            g6_calls++;
        } /* i */
    }
    double g6_calls;
private:
    int n_loc, clusterid, npipe;
    double eps2;
    std::vector<double> h2;
};

void calc_ext_grav(std::vector<External_gravity*> &external_gravity_components, int n, double3 *x, double3 *v, double *pot, double3 *acc, double3* jrk)
// TODO should just be a class that has this pointer array as a member
{
#ifdef TIMING
    get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

    std::fill(pot, pot+n, 0.);
    for (auto component : external_gravity_components) {
        if (component->is_active)
            component->apply(n, x, v, pot, acc, jrk);
    }

#ifdef TIMING
    get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
    DT_EXT_GRAV += (CPU_tmp_user - CPU_tmp_user0);
#endif
}

void energy_contr(const double time_cur, const double timesteps, const double n_act_sum, const double g6_calls, int N, double m[], double3 x[], double3 v[], double pot[], double pot_ext[])
{
     double E_pot = 0;
    for (int i=0; i<N; i++) E_pot += m[i]*pot[i];
    E_pot *= 0.5;

    double E_kin = 0;
    for (int i=0; i<N; i++) E_kin += m[i]*v[i].norm2();
    E_kin *= 0.5;

    double E_pot_ext = 0;
    for (int i=0; i<N; i++) E_pot_ext += m[i]*pot_ext[i];

    double m_tot = 0;
    double3 xcm = {0, 0, 0};
    double3 vcm = {0, 0, 0};
    for (int i=0; i<N; i++) {
        m_tot += m[i];
        xcm += m[i] * x[i];
        vcm += m[i] * v[i];
    }
    xcm /= m_tot;
    vcm /= m_tot;
    double rcm_mod = xcm.norm();
    double vcm_mod = vcm.norm();

    double3 mom = {0, 0, 0};
    for (int i=0; i<N; i++) {
        mom[0] += m[i] * (x[i][1]* v[i][2] - x[i][2]*v[i][1]);
        mom[1] += m[i] * (x[i][2]* v[i][0] - x[i][0]*v[i][2]);
        mom[2] += m[i] * (x[i][0]* v[i][1] - x[i][1]*v[i][0]);
    }

    get_CPU_time(&CPU_time_real, &CPU_time_user, &CPU_time_syst);

    double E_tot = E_pot + E_kin + E_pot_ext;

    static double E_tot_prev;
    if (timesteps == 0.0) E_tot_prev = E_tot;

    double DE_tot = E_tot - E_tot_prev;

    /* This is the only output to screen */
    printf("%.3E %.3E  % .4E %.4E % .4E  % .4E % .4E  %.2E\n",
            time_cur, timesteps,
            E_pot, E_kin, E_pot_ext, E_tot, DE_tot,
            CPU_time_user-CPU_time_user0);

    fflush(stdout);

    auto out = fopen("contr.dat", "a");
    fprintf(out,"%.8E \t %.8E %.8E %.8E \t % .8E % .8E % .8E % .8E % .8E \t % .8E % .8E \t % .8E % .8E % .8E \t %.8E %.8E %.8E \n",
            time_cur, timesteps, n_act_sum, g6_calls,
            E_pot, E_kin, E_pot_ext,
            E_tot, DE_tot,
            rcm_mod, vcm_mod,
            mom[0], mom[1], mom[2],
            CPU_time_real-CPU_time_real0, CPU_time_user-CPU_time_user0, CPU_time_syst-CPU_time_syst0);
    fclose(out);

    E_tot_prev = E_tot;
}

class Active_search {
    // TODO you can add pointers to t and dt at the constructor, no point giving them at get_minimum_time but without the size.
public:
    Active_search(const int myRank, const int n_proc, const int n_loc, const int N)
        : myRank(myRank), n_proc(n_proc), n_loc(n_loc), N(N)
    {
        ind_act_loc = new int[n_loc];
    }
    ~Active_search() { delete[] ind_act_loc; };
    double get_minimum_time(const double t[], const double dt[])
    {
        double min_t_loc, min_t;
        if (act_def_grapite) {
            min_t_loc = grapite_get_minimum_time();
            printf("gggggggggggg  min_t_loc=%.10e\n", min_t_loc);
        } else {
            min_t_loc = t[myRank*n_loc]+dt[myRank*n_loc];
            for (int j=myRank*n_loc+1; j<(myRank+1)*n_loc; j++) {
                double tmp = t[j] + dt[j];
                if (tmp < min_t_loc) min_t_loc = tmp;
            }
        }
        /* Reduce the "global" min_t from min_t_loc "local" on all processors) */
        MPI_Allreduce(&min_t_loc, &min_t, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
        return min_t;
    }
    void get_active_indices(const double min_t, const double t[], const double dt[], int ind_act[], int *n_act)
#warning refrence not pointer
    {
        if (act_def_grapite) {
            int n_act_loc;
            grapite_active_search(min_t, ind_act_loc, &n_act_loc);
            if (myRank > 0)
                for (int i=0; i<n_act_loc; i++)
                    ind_act_loc[i] += myRank*n_loc;
            int n_act_arr[256], displs[256]; // Assuming maximum of 256 processes... seems safe.
            MPI_Allgather(&n_act_loc, 1, MPI_INT, n_act_arr, 1, MPI_INT, MPI_COMM_WORLD);
            *n_act = n_act_arr[0];
            for (int i=1; i<n_proc; i++)
                *n_act += n_act_arr[i];
            displs[0] = 0;
            for (int i=1; i<n_proc; i++)
                displs[i]=displs[i-1]+n_act_arr[i-1];
            MPI_Allgatherv(ind_act_loc, n_act_loc, MPI_INT, ind_act, n_act_arr, displs, MPI_INT, MPI_COMM_WORLD);
        } else {
            *n_act = 0;
            for (int i=0; i<N; i++) {
                if (t[i]+dt[i] == min_t) ind_act[(*n_act)++] = i;
            } /* i */
        }
    }
private:
    int myRank, n_proc, n_loc, N;
    int *ind_act_loc;
};

inline void calc_high_derivatives(const double dt_tmp, const double3 a_old, const double3 a_new, const double3 a1_old, const double3 a1_new, double3& a2, double3& a3)
{
    double dtinv = 1/dt_tmp;
    double dt2inv = dtinv*dtinv;
    double dt3inv = dt2inv*dtinv;

    double3 a0mia1 = a_old-a_new;
    double3 ad04plad12 = 4*a1_old + 2*a1_new;
    double3 ad0plad1 = a1_old + a1_new;

    a2 = -6*a0mia1*dt2inv - ad04plad12*dtinv;
    a3 = 12*a0mia1*dt3inv + 6*ad0plad1*dt2inv;
}

inline void corrector(const double dt_tmp, const double3 a2, const double3 a3, double3& x, double3& v)
{
    double dt3over6 = dt_tmp*dt_tmp*dt_tmp/6.0;
    double dt4over24 = dt3over6*dt_tmp/4.0;
    double dt5over120 = dt4over24*dt_tmp/5.0;

    x += dt4over24*a2 + dt5over120*a3;
    v += dt3over6*a2 + dt4over24*a3;
}

inline double aarseth_step(const double eta, const double dt, const double3 a, const double3 a1, const double3 a2, const double3 a3)
{
    double a1abs = a.norm();
    double adot1abs = a1.norm();
    double3 a2dot1 = a2 + dt*a3;
    double a2dot1abs = a2dot1.norm();
    double a3dot1abs = a3.norm();
    return sqrt(eta*(a1abs*a2dot1abs+adot1abs*adot1abs)/(adot1abs*a3dot1abs+a2dot1abs*a2dot1abs));
}

int main(int argc, char *argv[])
{
    double timesteps=0.0, n_act_sum=0.0;

    double3 xcm, vcm, xdc, vdc; // these should go away

    int    i_bh1, i_bh2;

    double3 x_bbhc, v_bbhc;

    double3 zeros = {0, 0, 0}; // Dummy; can't really be const because of the GRAPE interface.

    /* INIT the rand() !!! */
    srand(19640916);                 /* it is just my birthday :-) */

    /* Init MPI */
    MPI_Init(&argc, &argv);

    /* Define the total number of processors and the Rank of each processors */
    int n_proc, myRank;
    const int rootRank = 0;
    MPI_Comm_size(MPI_COMM_WORLD, &n_proc);
    MPI_Comm_rank(MPI_COMM_WORLD, &myRank);

    /* Define the processors names */
    int name_proc;
    char processor_name[MPI_MAX_PROCESSOR_NAME];
    MPI_Get_processor_name(processor_name, &name_proc);

    /* Print the Rank and the names of processors */
    printf("Rank of the processor %03d on %s \n", myRank, processor_name);

    /* read the input parameters */
    config = new Config("phigrape.conf");

    int diskstep, N;
    double time_cur;
    double *m;
    double3 *x, *v;
    if (is_hdf5(config->input_file_name)) {
#ifndef HAS_HDF5
        fprintf(stderr, "ERROR: input file is in HDF5 format, but the code was compiled without HDF5 support\n");
        return -1;
#endif
        h5_read(config->input_file_name, &diskstep, &N, &time_cur, m, x, v);
    }
    else
        ascii_read(config->input_file_name, diskstep, N, time_cur, &m, &x, &v);

    int *ind  = new int[N];
    std::iota(ind, ind+N, 0);
    double3 *a = new double3[N], *adot = new double3[N];
    double *pot  = new double[N], *pot_ext = new double[N], *t = new double[N], *dt = new double[N];

    /* data for active particles */
    // x_act_new and v_act_new arrays hold the predicted position and velocity of i-particles, which is later corrected before moving into the j-particle memory. The [pot,a,adot]_act_tmp arrays hold the calculation results from each node. The [pot,a,adot]_act_new arrays hold the reduced calculation results from all nodes.
    int n_act, *ind_act = new int[N];
    double *pot_act_new = new double[N], *pot_act_tmp = new double[N], *pot_act_ext = new double[N];
    double3 *x_act_new = new double3[N], *v_act_new    = new double3[N],
            *a_act_tmp = new double3[N], *adot_act_tmp = new double3[N],
            *a_act_new = new double3[N], *adot_act_new = new double3[N];

    double eps = config->eps;
    double eta = config->eta;
    double t_end = config->t_end;
    double dt_disk = config->dt_disk;
    double dt_contr = config->dt_contr;
    double dt_bh = config->dt_bh;

    if (myRank == rootRank) {
        printf("\n");
        printf("Begin the calculation of phi-GRAPE program on %03d processors\n", n_proc);
        printf("\n");
        printf("N       = %07d \t eps      = %.6E \n", N, eps);
        printf("t_beg   = %.6E \t t_end    = %.6E \n", time_cur, t_end);
        printf("dt_disk = %.6E \t dt_contr = %.6E \n", dt_disk, dt_contr);
        printf("dt_bh   = %.6E \n", dt_bh);
        printf("eta     = %.6E \n", eta);
        printf("\n");

        if ((diskstep == 0) && (time_cur == 0)) {
            FILE *out = fopen("contr.dat", "w");
            fclose(out);
#ifdef TIMING
            out = fopen("timing.dat", "w");
            fclose(out);
#endif
            if (config->live_smbh_output && (config->live_smbh_count > 0)) {
                out = fopen("bh.dat", "w");
                fclose(out);
            }
            if ((config->live_smbh_neighbor_output) && (config->live_smbh_count > 0)) {
                out = fopen("bh_neighbors.dat", "w");
                fclose(out);
            }
        }

        get_CPU_time(&CPU_time_real0, &CPU_time_user0, &CPU_time_syst0);

    } /* if (myRank == rootRank) */

    double normalization_mass=1, normalization_length=1, normalization_velocity=1;
    if (config->ext_units_physical) {
        normalization_mass = 1/config->unit_mass;
        normalization_length = 1000/config->unit_length;
        normalization_velocity = 1.52484071426404437233e+01*sqrt(config->unit_length/config->unit_mass);
    }
    Plummer ext_bulge(config->ext_m_bulge*normalization_mass, config->ext_b_bulge*normalization_length);
    Miyamoto_Nagai ext_disk(config->ext_m_disk*normalization_mass, config->ext_a_disk*normalization_length, config->ext_b_disk*normalization_length);
    Plummer ext_halo_plummer(config->ext_m_halo_plummer*normalization_mass, config->ext_b_halo_plummer*normalization_length);
    Logarithmic_halo ext_log_halo(config->ext_log_halo_v*normalization_velocity, config->ext_log_halo_r*normalization_length);
    Dehnen ext_dehnen(config->ext_dehnen_m*normalization_mass, config->ext_dehnen_r*normalization_length, config->ext_dehnen_gamma);

    std::vector<External_gravity*> external_gravity_components;
    external_gravity_components.push_back(&ext_bulge);
    external_gravity_components.push_back(&ext_disk);
    external_gravity_components.push_back(&ext_halo_plummer);
    external_gravity_components.push_back(&ext_log_halo);
    external_gravity_components.push_back(&ext_dehnen);
    bool has_external_gravity = false;
    for (auto component : external_gravity_components) {
        if (component->is_active) {
            has_external_gravity = true;
            break;
        }
    }

    if (has_external_gravity) printf("External Potential: \n\n");
    if (ext_bulge.is_active)        printf("m_bulge = %.4E                        b_bulge = %.4E\n", config->ext_m_bulge*normalization_mass, config->ext_b_bulge*normalization_length);
    if (ext_disk.is_active)         printf("m_disk  = %.4E  a_disk  = %.4E  b_disk  = %.4E\n", config->ext_m_disk*normalization_mass, config->ext_a_disk*normalization_length, config->ext_b_disk*normalization_length);
    if (ext_halo_plummer.is_active) printf("m_halo  = %.4E                        b_halo = %.4E\n", config->ext_m_halo_plummer*normalization_mass, config->ext_b_halo_plummer*normalization_length);
    if (ext_log_halo.is_active)     printf("v_halo = %.6E r_halo = %.6E \n", config->ext_log_halo_v*normalization_velocity, config->ext_log_halo_r*normalization_length);
    if (ext_dehnen.is_active)       printf("m_ext  = %.6E r_ext  = %.6E \t g_ext = %.3E \n", config->ext_dehnen_m*normalization_mass, config->ext_dehnen_r*normalization_length, config->ext_dehnen_gamma);
    printf("\n");
    fflush(stdout);

    double eta_bh = eta/ETA_BH_CORR;

    const double dt_min = pow(2, DTMINPOWER);
    const double dt_max = std::min({dt_disk, dt_contr, pow(2, DTMAXPOWER)});

    double t_disk  = dt_disk*(1+floor(time_cur/dt_disk));
    double t_contr = dt_contr*(1+floor(time_cur/dt_contr));
    double t_bh    = dt_bh*(1+floor(time_cur/dt_bh));

    if (myRank == rootRank) {
        printf("t_disk = %.6E   t_contr = %.6E   t_bh = %.6E \n", t_disk, t_contr, t_bh);
        printf("\n");
        fflush(stdout);
    } /* if (myRank == rootRank) */

    std::fill(t,  t+N,  time_cur);
    std::fill(dt, dt+N, dt_min);

    /* some local settings for G6a boards */
    int clusterid, numGPU;
    if (config->devices_per_node==0) {
        MPI_Comm shmcomm;
        MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm);
        MPI_Comm_size(shmcomm, &numGPU);
        MPI_Comm_rank(shmcomm, &clusterid);
    } else {
        numGPU = config->devices_per_node;
        clusterid = myRank % numGPU;
    }
    printf("Rank of the processor %03d : Number of GPUs %01d : Cluster ID %01d \n", myRank, numGPU, clusterid);
    fflush(stdout);

    /* init the local GRAPEs */
    g6_open(clusterid);
    int npipe = g6_npipes();
    g6_set_tunit(51);
    g6_set_xunit(51);

    int n_loc = N/n_proc;
    Calc_self_grav calc_self_grav(N, n_loc, clusterid, npipe, eps);
    Active_search active_search(myRank, n_proc, n_loc, N);
    Black_hole_physics black_hole_physics;
    if (config->live_smbh_count == 1)
        black_hole_physics = Black_hole_physics(m[0], 0, myRank, rootRank);
    else if (config->live_smbh_count == 2)
        black_hole_physics = Black_hole_physics(m[0], m[1], myRank, rootRank);
    if (config->binary_smbh_pn) {
        black_hole_physics.set_post_newtonian(config->pn_c, config->pn_usage.data());
        if (config->pn_usage[6]) black_hole_physics.set_spins(config->smbh1_spin.data(), config->smbh2_spin.data());
    }
    black_hole_physics.set_softening(eps, config->live_smbh_custom_eps);

    Binary_smbh_influence_sphere_output binary_smbh_influence_sphere_output(config->binary_smbh_influence_radius_factor, N, m, x, v, pot, dt);

    Write_bh_nb_data write_bh_nb_data(config->live_smbh_neighbor_number, config->live_smbh_count, N, m, x, v);

#ifdef ETICS
    grapite_read_particle_tags(N, config->grapite_mask_file_name.c_str(), myRank, n_loc);
    grapite_set_dt_exp(config->dt_scf);
    grapite_set_t_exp(time_cur);
#endif

    /* load the nj particles to the G6 */

    for (int k=0; k<n_loc; k++) {
        int j = k + myRank*n_loc;
        g6_set_j_particle(clusterid, k, ind[j], t[j], dt[j], m[j], zeros, zeros, zeros, v[j], x[j]);
    } /* k */

#ifdef ETICS
    double etics_length_scale;
    if (myRank == rootRank) etics_length_scale = grapite_get_length_scale(N, m, (double(*)[3])x, (double(*)[3])v); // We don't want all ranks to do it, because they need to write a file and might confuse each other
    MPI_Bcast(&etics_length_scale, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    grapite_set_length_scale(etics_length_scale);

    int grapite_cep_index = grapite_get_cep_index();
    if (grapite_cep_index >= 0) {
        grapite_calc_center(N, m, (double(*)[3])x, (double(*)[3])v, xcm, vcm, xdc, vdc);
        x[grapite_cep_index] = xdc;
        v[grapite_cep_index] = vdc;
        grapite_update_cep(time_cur, xdc, vdc, zeros, zeros);
    }
#endif

    /* define the all particles as a active on all the processors for the first time grav calc. */
    calc_self_grav(time_cur, N, ind, x, v, pot_act_tmp, a_act_tmp, adot_act_tmp);

    /* Reduce the "global" vectors from "local" on all processors) */
    // TODO why won't we do the MPI_Allreduce inside the calc_self_grav function, and get rid of these _tmp arrays?
    MPI_Allreduce(pot_act_tmp,  pot,    N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    MPI_Allreduce(a_act_tmp,    a,    3*N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    MPI_Allreduce(adot_act_tmp, adot, 3*N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

    if (config->live_smbh_count == 2) {
        black_hole_physics.set_xv(x[0], x[1], v[0], v[1]);
        if (config->live_smbh_custom_eps >= 0) black_hole_physics.adjust_softening(pot[0], pot[1], a[0], a[1], adot[0], adot[1]);
        if (config->binary_smbh_pn) black_hole_physics.adjust_post_newtonian(dt[0], a[0], a[1], adot[0], adot[1]);
    }

    calc_ext_grav(external_gravity_components, N, x, v, pot_ext, a, adot);

    /* Energy control... */
    if (myRank == rootRank) energy_contr(time_cur, timesteps, n_act_sum, calc_self_grav.g6_calls, N, m, x, v, pot, pot_ext);

#ifdef ETICS
    if (config->etics_dump_coeffs && (diskstep==0)) {
        char out_fname[256];
        sprintf(out_fname, "coeffs.%06d.%02d.dat", 0, myRank);
        grapite_dump(out_fname, 2);
    }

    if (grapite_cep_index >= 0) {
        grapite_calc_center(N, m, (double(*)[3])x, (double(*)[3])v, xcm, vcm, xdc, vdc);
        x[grapite_cep_index] = xdc;
        v[grapite_cep_index] = vdc;
        grapite_update_cep(time_cur, xdc, vdc, a[grapite_cep_index], adot[grapite_cep_index]);
    }
#endif

    /* Define initial timestep for all particles on all nodes */

    for (int j=0; j<N; j++) {
        double a2_mod = a[j].norm2();
        double adot2_mod = adot[j].norm2();

        double dt_tmp, eta_s = eta/ETA_S_CORR;
        if (adot2_mod==0) dt_tmp = eta_s; // That's weird, when will we have such a case?
        else              dt_tmp = eta_s*sqrt(a2_mod/adot2_mod);

        int power = log(dt_tmp)/log(2.0) - 1;

        dt_tmp = pow(2.0, (double)power);

        if (dt_tmp < dt_min) dt_tmp = dt_min;
        if (dt_tmp > dt_max) dt_tmp = dt_max;

        dt[j] = dt_tmp;

        if (config->dt_min_warning && (myRank == 0)) {
            if (dt[j] == dt_min) {
                printf("!!! Warning0: dt = dt_min = %.6E \t ind = %07d \n", dt[j], ind[j]);
                fflush(stdout);
            }
        }
    } /* j */

    if (config->live_smbh_count > 0) {
        double min_dt = *std::min_element(dt, dt+N);
        for (int i=0; i<config->live_smbh_count; i++) dt[i] = min_dt;
    }

    /* load the new values for particles to the local GRAPEs */

    for (int k=0; k<n_loc; k++) {
        int j = k + myRank*n_loc;
        g6_set_j_particle(clusterid, k, ind[j], t[j], dt[j], m[j], zeros, adot[j]*(1./6.), a[j]*0.5, v[j], x[j]);
    } /* k */

    if (myRank == rootRank) {

        /* Write BH data... */
        if (config->live_smbh_output) black_hole_physics.write_bh_data(time_cur, m, x, v, pot, a, adot, dt);

        /* Write BH NB data... */
        if (config->live_smbh_neighbor_output) write_bh_nb_data(time_cur);

    } /* if (myRank == rootRank) */

    /* Get the Starting time on rootRank */

    if (myRank == rootRank) {
        get_CPU_time(&CPU_time_real0, &CPU_time_user0, &CPU_time_syst0);
        get_CPU_time(&CPU_time_real,  &CPU_time_user,  &CPU_time_syst);
    } /* if (myRank == rootRank) */

    timesteps = 0.0; // Why won't those two be long long instead of double + should include the zeroth step
    n_act_sum = 0.0;

#ifdef TIMING
    DT_TOT      = 0.0;

    DT_ACT_DEF1 = 0.0;
    DT_ACT_DEF2 = 0.0;
    DT_ACT_DEF3 = 0.0;
    DT_ACT_PRED = 0.0;
    DT_ACT_GRAV = 0.0;
    DT_EXT_GRAV = 0.0;
    DT_EXT_GMC_GRAV = 0.0;
    DT_GMC_GMC_GRAV = 0.0;
    DT_ACT_CORR = 0.0;
    DT_ACT_LOAD = 0.0;

    DT_STEVOL    = 0.0;
    DT_STARDISK  = 0.0;
    DT_STARDESTR = 0.0;

    DT_ACT_REDUCE = 0.0;
#endif

    /* The main integration loop */

    while (time_cur <= t_end) {

        /* Define the minimal time and the active particles on all the nodes (exclude the ZERO masses!!!) */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

        double min_t = active_search.get_minimum_time(t, dt);
        printf("zzzzzzzzzzzzzzzzz %.10e\n", min_t);

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
        DT_ACT_DEF1 += (CPU_tmp_user - CPU_tmp_user0);
#endif

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

        active_search.get_active_indices(min_t, t, dt, ind_act, &n_act);

// TODO deal with it below
// #ifdef ACT_DEF_GRAPITE
// #error please fix here
// #endif

        static int printouts = 0;
        int n_bh = config->live_smbh_count;
        if (n_bh>0) {
            if (act_def_grapite) {
                int act_def_grapite_bh_count = 0;
                int i_bh[n_bh];
                for (int i=0; i<n_act; i++) {
                    if (ind_act[i]<n_bh) {
                        i_bh[ind_act[i]] = i;
                        if (++act_def_grapite_bh_count == n_bh) break;
                    }
                }
                i_bh1 = i_bh[0];
                if (n_bh == 2) i_bh2 = i_bh[1];
            } else {
                i_bh1 == 0;
                if (n_bh == 2) i_bh2 = 1;
            }
        }
        printf("previously got i_bh1=%d and i_bh2=%d\n", i_bh1, i_bh2);
        for (int i=0; i<n_act; i++) {
            if (ind_act[i]==0) i_bh1=i;
            if (ind_act[i]==1) i_bh2=i;
        }
        printf("now finding    i_bh1=%d and i_bh2=%d\n", i_bh1, i_bh2);
        if (++printouts >= 10) return 0;


// #if defined(ACT_DEF_GRAPITE) && (defined(ADD_BH1) || defined(ADD_BH2))

// #ifdef ADD_BH1
// #define ACT_DEF_GRAPITE_NUMBH 1
// #else
// #define ACT_DEF_GRAPITE_NUMBH 2
// #endif

//         int act_def_grapite_bh_count = 0;
//         for (i=0; i<n_act; i++) {
//             if (ind_act[i]==0) {
//                 i_bh1 = i;
//                 act_def_grapite_bh_count++;
//         }
// #ifdef ADD_BH2
//             else if (ind_act[i]==1) {
//                 i_bh2 = i;
//                 act_def_grapite_bh_count++;
//             }
// #endif
//             if (act_def_grapite_bh_count==config->live_smbh_count) break;
//         }
//         if (i==n_act) {
//             fprintf(stderr, "ERROR: black holes were not found in the active particle list");
//             return -1;
//         }
// #else
//         if (config->live_smbh_count > 0) {
//             i_bh1 = 0;
//             i_bh2 = 1;
//         }
// #endif

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
        DT_ACT_DEF2 += (CPU_tmp_user - CPU_tmp_user0);
#endif

        /* predict the active particles positions etc... on all the nodes */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

        for (int i=0; i<n_act; i++) {
            int j_act = ind_act[i];
            double dt = min_t - t[j_act];
            double dt2half = 0.5*dt*dt;
            double dt3over6 = (1./3.)*dt*dt2half;
            x_act_new[i] = x[j_act] + v[j_act]*dt + a[j_act]*dt2half + adot[j_act]*dt3over6;
            v_act_new[i] = v[j_act] + a[j_act]*dt + adot[j_act]*dt2half;
        } /* i */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
        DT_ACT_PRED += (CPU_tmp_user - CPU_tmp_user0);
#endif

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif
        calc_self_grav(min_t, n_act, ind_act, x_act_new, v_act_new,
                               pot_act_tmp, a_act_tmp, adot_act_tmp);

#ifdef TIMING
    get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
    DT_ACT_GRAV += (CPU_tmp_user - CPU_tmp_user0);
#endif

        /* Reduce the "global" vectors from "local" on all the nodes */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

        MPI_Allreduce(pot_act_tmp,  pot_act_new,    n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
        MPI_Allreduce(a_act_tmp,    a_act_new,    3*n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
        MPI_Allreduce(adot_act_tmp, adot_act_new, 3*n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
        DT_ACT_REDUCE += (CPU_tmp_user - CPU_tmp_user0);
#endif

        if (config->live_smbh_count == 2) {
            black_hole_physics.set_xv(x_act_new[i_bh1], x_act_new[i_bh2], v_act_new[i_bh1], v_act_new[i_bh2]);
            if (config->live_smbh_custom_eps >= 0) black_hole_physics.adjust_softening(pot_act_new[i_bh1], pot_act_new[i_bh2], a_act_new[i_bh1], a_act_new[i_bh2], adot_act_new[i_bh1], adot_act_new[i_bh2]);
            if (config->binary_smbh_pn) black_hole_physics.adjust_post_newtonian(dt[i_bh1], a_act_new[i_bh1], a_act_new[i_bh2], adot_act_new[i_bh1], adot_act_new[i_bh2]);
        }

        calc_ext_grav(external_gravity_components, n_act, x_act_new, v_act_new, pot_act_ext, a_act_new, adot_act_new);

        /* correct the active particles positions etc... on all the nodes */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

        double min_dt = dt_max;
        for (int i=0; i<n_act; i++) {
            // NOTICE looks like we're doing three unrelated things in this loop: (1) correcting positions and velocities (2) calculating new steps, and (3) putting the corrected values from the _act_new back in the _act arrays.
            // After going back to the _act arrays they don't do much before they go back to the main arrays, so this copy seems redundant (the SMBH influence sphere printout needs these values but it should be a function anyway).
            // TODO split this loop into the three tasks it is doing, and remove the redundancy.
            int j_act = ind_act[i];
            double dt_tmp = min_t - t[j_act];

            double3 a2, a3;
            calc_high_derivatives(dt_tmp, a[j_act], a_act_new[i], adot[j_act], adot_act_new[i], a2, a3);

            corrector(dt_tmp, a2, a3, x_act_new[i], v_act_new[i]);

            //TODO make beautiful
            double eta_curr;
            if ((config->live_smbh_count > 0) && (ind_act[i] < config->live_smbh_count)) eta_curr = eta_bh;
            else eta_curr = eta;

            double dt_new = aarseth_step(eta_curr, dt_tmp, a_act_new[i], adot_act_new[i], a2, a3);

            //TODO the below should be moved to a function
            if (dt_new < dt_min) dt_tmp = dt_min;
            if ((dt_new < dt_tmp) && (dt_new > dt_min)) {
                int power = log(dt_new)/log(2.0) - 1;
                dt_tmp = pow(2.0, (double)power); // TODO why is this casting needed here?
            }

            if ((dt_new > 2*dt_tmp) && (fmod(min_t, 2*dt_tmp) == 0) && (2*dt_tmp <= dt_max)) {
                dt_tmp *= 2;
            }

            if (config->dt_min_warning && (myRank == 0)) {
                if (dt_tmp == dt_min) {
                    printf("!!! Warning1: dt_act = dt_min = %.6E \t ind_act = %07d \n", dt_tmp, ind_act[i]);
                    fflush(stdout);
                }
            }
            if (dt_tmp < min_dt) min_dt = dt_tmp;

            x[j_act] = x_act_new[i];
            v[j_act] = v_act_new[i];
            t[j_act] = min_t;
            dt[j_act] = dt_tmp;
            pot[j_act] = pot_act_new[i];
            pot_ext[j_act] = pot_act_ext[i];
            a[j_act] = a_act_new[i];
            adot[j_act] = adot_act_new[i];
        } /* i */

        /* define the min. dt over all the act. part. and set it also for the BH... */

        if (config->live_smbh_count > 0) {
            if (config->live_smbh_count>=1) dt[0] = min_dt;
            if (config->live_smbh_count==2) dt[1] = min_dt;

        }

        if (config->binary_smbh_influence_sphere_output && (myRank == rootRank)) {
            //TODO clean up this mass. We don't want to have all these _act arrays; they are only needed for this lame function.
            binary_smbh_influence_sphere_output(ind_act, n_act, timesteps, time_cur);
        }

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
        DT_ACT_CORR += (CPU_tmp_user - CPU_tmp_user0);
#endif

        /* load the new values for active particles to the local GRAPE's */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

        for (int i=0; i<n_act; i++) {
#ifdef ETICS
            if (ind_act[i] == grapite_cep_index) grapite_update_cep(t[grapite_cep_index], x[grapite_cep_index], v[grapite_cep_index], a[grapite_cep_index], adot[grapite_cep_index]); // All ranks should do it.
#endif
            int cur_rank = ind_act[i]/n_loc;

            if (myRank == cur_rank) {
                int j_act = ind_act[i];
                int address = ind_act[i] - myRank*n_loc;
                g6_set_j_particle(clusterid, address, ind_act[i], t[j_act], dt[j_act], m[j_act], zeros, adot[j_act]*(1./6.), a[j_act]*0.5, v[j_act], x[j_act]);

            } /* if (myRank == cur_rank) */

        } /* i */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
        DT_ACT_LOAD += (CPU_tmp_user - CPU_tmp_user0);
#endif

        /* Current time set to min_t */

        time_cur = min_t;

        timesteps += 1.0;
        n_act_sum += n_act;

        if (time_cur >= t_bh) {
            if (myRank == rootRank) {
                /* Write BH data... */
                if (config->live_smbh_output) black_hole_physics.write_bh_data(time_cur, m, x, v, pot, a, adot, dt);

                /* Write BH NB data... */
                if (config->live_smbh_neighbor_output) write_bh_nb_data(time_cur);

            } /* if (myRank == rootRank) */

            t_bh += dt_bh;
        } /* if (time_cur >= t_bh) */

        if (time_cur >= t_contr) {
            if (myRank == rootRank) {

                energy_contr(time_cur, timesteps, n_act_sum, calc_self_grav.g6_calls, N, m, x, v, pot, pot_ext);

                /* write cont data */
                if (config->output_hdf5) h5_write("data.con", diskstep, N, time_cur, m, x, v, pot, a, adot, 0, true);
                else ascii_write("data.con", diskstep, N, time_cur, m, x, v, 16);

                /* possible OUT for timing !!! */
#ifdef TIMING
                FILE *out = fopen("timing.dat", "a");

                DT_TOT = DT_ACT_DEF1 + DT_ACT_DEF2 + DT_ACT_DEF3 + DT_ACT_PRED +
                         DT_ACT_GRAV + DT_EXT_GRAV + DT_GMC_GRAV +
                         DT_GMC_GMC_GRAV + DT_EXT_GMC_GRAV +
                         DT_ACT_CORR + DT_ACT_LOAD +
                         DT_STEVOL + DT_STARDISK + DT_STARDESTR +
                         DT_ACT_REDUCE;

                fprintf(out,"%.8E \t %.6E \t %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f \t %.3f \t %.8E %.8E %.8E \t %.8E %.8E %.8E \n",
                        time_cur, DT_TOT,
                        100.0*DT_ACT_DEF1/DT_TOT, 100.0*DT_ACT_DEF2/DT_TOT, 100.0*DT_ACT_DEF3/DT_TOT, 100.0*DT_ACT_PRED/DT_TOT,
                        100.0*DT_ACT_GRAV/DT_TOT, 100.0*DT_EXT_GRAV/DT_TOT, 100.0*DT_GMC_GRAV/DT_TOT,
                        100.0*DT_GMC_GMC_GRAV/DT_TOT, 100.0*DT_EXT_GMC_GRAV/DT_TOT,
                        100.0*DT_ACT_CORR/DT_TOT, 100.0*DT_ACT_LOAD/DT_TOT,
                        100.0*DT_STEVOL/DT_TOT, 100.0*DT_STARDISK/DT_TOT, 100.0*DT_STARDESTR/DT_TOT,
                        100.0*DT_ACT_REDUCE/DT_TOT,
                        CPU_time_real-CPU_time_real0, CPU_time_user-CPU_time_user0, CPU_time_syst-CPU_time_syst0,
                        timesteps, n_act_sum, 57.0*N*n_act_sum/(CPU_time_user-CPU_time_user0)/1.0E+09);

                fclose(out);
#endif

            } /* if (myRank == rootRank) */

#ifdef ETICS
            // We are /inside/ a control step, so all particles must be synchronized; we can safely calculate their density centre. The acceleration and jerk currently in the memory are for the predicted position of the CEP, by calling grapite_calc_center we "correct" the position and velocity, but not the gravity at that point.
            if (grapite_cep_index >= 0) {
                grapite_calc_center(N, m, (double(*)[3])x, (double(*)[3])v, xcm, vcm, xdc, vdc);
                x[grapite_cep_index] = xdc;
                v[grapite_cep_index] = vdc;
                grapite_update_cep(time_cur, xdc, vdc, a[grapite_cep_index], adot[grapite_cep_index]);
            }
#endif

            t_contr += dt_contr;
        } /* if (time_cur >= t_contr) */

        if (time_cur >= t_disk) {
            char out_fname[256];
            diskstep++;
            if (myRank == rootRank) {
                sprintf(out_fname, "%06d", diskstep);
                if (config->output_hdf5) h5_write(std::string(out_fname) + ".h5", diskstep, N, time_cur, m, x, v, pot, a, adot, config->output_extra_mode, config->output_hdf5_double_precision);
                else ascii_write(std::string(out_fname) + ".dat", diskstep, N, time_cur, m, x, v, config->output_ascii_precision);
            } /* if (myRank == rootRank) */

#ifdef ETICS
            if (config->etics_dump_coeffs) {
                sprintf(out_fname, "coeffs.%06d.%02d.dat", diskstep, myRank);
                grapite_dump(out_fname, 2);
            }
#endif
            t_disk += dt_disk;
        } /* if (time_cur >= t_disk) */
    } /* while (time_cur < t_end) */

    /* close the local GRAPEs */
    g6_close(clusterid);

    double g6_calls_sum;
    MPI_Reduce(&calc_self_grav.g6_calls, &g6_calls_sum, 1, MPI_DOUBLE, MPI_SUM, rootRank, MPI_COMM_WORLD);

    if (myRank == rootRank) {

        /* Write some output for the timestep annalize... */

        printf("\n");
        printf("timesteps = %.0f   Total sum of integrated part. = %.0f   g6_calls on all nodes = %.0f \n", timesteps, n_act_sum, g6_calls_sum);
        printf("\n");
        printf("Real Speed = %.3f GFlops \n", 57.0*N*n_act_sum/(CPU_time_user-CPU_time_user0)/1.0E+09);
        fflush(stdout);

    } /* if (myRank == rootRank) */

    delete[] m; delete[] x; delete[] v; delete[] ind; delete[] a; delete[] adot; delete[] pot; delete[] pot_ext; delete[] t; delete[] dt; delete[] ind_act; delete[] pot_act_new; delete[] pot_act_tmp; delete[] x_act_new; delete[] v_act_new; delete[] a_act_tmp; delete[] adot_act_tmp; delete[] a_act_new; delete[] adot_act_new; delete[] pot_act_ext;

    /* Finalize the MPI work */
    MPI_Finalize();

    return 0;
}