phigrape/phigrape.cpp

/*****************************************************************************
File Name      : "phi-GRAPE/GPU.c"              // BH (1 || 2) + ACC + EJECT
                :
Contents       : N-body code with integration by individual block time step
                : together with the parallel using of GRAPE6a board's.
                :
                : Added the GPU support via SAPPORO library.
                :
                : Normalization to the physical units!!!
                :
                : External Potential added
                : Plummer-Kuzmin: Bulge, Disk, Halo
                : Kharchenko+Andreas...
                :
                : SC extra POT for Bek SC test runs...
                :
                : Rebuced to the Single BH -> Plummer
                : Andreas+Fazeel...
                :
                : Stellar evolution added
                : Stellar lifetimes: Raiteri, Villata & Navarro (1996)
                : IMS mass loss: van den Hoeg & Groenewegen (1997)
                :
                : STARDESTR_EXT: Tidal disruption of stars by external BH...
                : Chingis, Denis & Maxim...
                :
                : STARDESTR: Tidal disruption of stars by BH...
                : Jose, Li Shuo & Shiyan Zhong
                :
                : STARDISK: Drag force...
                : Chingis, Denis & Maxim...
                :
                : STARDISK: variable hz = HZ*(R/R_crit) up to R_crit...
                : Taras, Andreas...
                :
                : Live BH (1 || 2) + ACC + EJECT...
                : Li Shuo & Shiyan Zhong
                :
                : dt_min for BH (1 || 2)...
                :
                : added the PN calculus for the BBH
                : PN0, PN1, PN2, PN2.5 (coded on the base of
                : Gabor Kupi original routine)
                :
                : added the "name" array...
                :
                : added the GMC's calculus (GMC on CPU; GMC2 on GPU)
                : for Alexey SC runs... and also for Fazeel Zurich runs...
                :
                : CPU_TIMELIMIT added for the Julich MW cluster runs...
                :
Coded by       : Peter Berczik
Version number : 19.04
Last redaction : 2019.04.16 12:55
*****************************************************************************/
#include "double3.h"
#include "config.h"
#include "io.h"
#include "external.h"

Config *config;

#ifdef ETICS
#include "grapite.h"
// why do we need CEP as a compilaion flag... just have it always on when ETICS is on. IF there is no CEP, there should be a graceful skipping of those operations.
//#define ETICS_CEP
#ifndef ETICS_DTSCF
#error "ETICS_DTSCF must be defined"
#endif
#endif

#define TIMING

#define ETA_S_CORR   4.0
#define ETA_BH_CORR  4.0

#define DTMAXPOWER  -3.0
#define DTMINPOWER -36.0

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>
#include <sys/resource.h>

#include <algorithm>
#include <numeric>
#include <stdexcept>

#define G6_NPIPE 2048
#include "grape6.h"

#include <mpi.h>

/* Some "good" functions and constants... */
// TODO replace with inline functions
#define MIN(a,b) (((a)<(b)) ? (a):(b) )
#define SQR(x)   ((x)*(x) )


#define KB         1024
#define MB         (KB*KB)

#define N_MAX      (6*MB)

double L[3]; // needed in pn_bh_spin.c
// Needed for things related to BHs
#include "debug.h"

double CPU_time_real0, CPU_time_user0, CPU_time_syst0;
double CPU_time_real,  CPU_time_user,  CPU_time_syst;

#ifdef TIMING

double CPU_tmp_real0, CPU_tmp_user0, CPU_tmp_syst0;
double CPU_tmp_real,  CPU_tmp_user,  CPU_tmp_syst;

double DT_TOT,
    DT_ACT_DEF1, DT_ACT_DEF2, DT_ACT_DEF3, DT_ACT_PRED,
    DT_ACT_GRAV, DT_EXT_GRAV,
    DT_GMC_GRAV, DT_GMC_GMC_GRAV, DT_EXT_GMC_GRAV,
    DT_ACT_CORR, DT_ACT_LOAD,
    DT_STEVOL, DT_STARDISK, DT_STARDESTR;

double DT_ACT_REDUCE;

#endif

/* some local settings for G6a board's */

int clusterid, ii, nn, numGPU;

int    npipe=G6_NPIPE, index_i[G6_NPIPE];
double h2_i[G6_NPIPE], p_i[G6_NPIPE];
double3 x_i[G6_NPIPE], v_i[G6_NPIPE],
        a_i[G6_NPIPE], jerk_i[G6_NPIPE];
int    new_tunit=51, new_xunit=51;

double ti=0.0;

double eps_BH=0.0;

/* external potential... */

double3 x_bh1, x_bh2, v_bh1, v_bh2;

double pot_bh1, pot_bh2;
double3 a_bh1, adot_bh1, a_bh2, adot_bh2;

#include "n_bh.c"

double3 a_pn1[7], adot_pn1[7], a_pn2[7], adot_pn2[7];

#include "pn_bh_spin.c"

#ifdef ETICS
double t_exp, dt_exp=ETICS_DTSCF; // t_exp is just the initial value
#ifdef ETICS_CEP
int grapite_cep_index;
#endif
#endif

void get_CPU_time(double *time_real, double *time_user, double *time_syst)
{
    struct rusage xxx;
    double sec_u, microsec_u, sec_s, microsec_s;
    struct timeval tv;

    getrusage(RUSAGE_SELF,&xxx);

    sec_u = xxx.ru_utime.tv_sec;
    sec_s = xxx.ru_stime.tv_sec;

    microsec_u = xxx.ru_utime.tv_usec;
    microsec_s = xxx.ru_stime.tv_usec;

    *time_user = sec_u + microsec_u * 1.0E-06;
    *time_syst = sec_s + microsec_s * 1.0E-06;

    gettimeofday(&tv, NULL);
    *time_real = tv.tv_sec + 1.0E-06 * tv.tv_usec;

    *time_user = *time_real;
}

void write_bh_data(double time_cur, double m[], double3 x[], double3 v[], double pot[], double3 a[], double3 adot[], double dt[])
{
    if (config->live_smbh_count == 2) {

        auto out = fopen("bh.dat", "a");

        for (int i=0; i < 2; i++) {
            double3 *a_pn, *adot_pn, a_bh, adot_bh;
            double pot_bh;
            if (i==0) {
                a_pn    = a_pn1;
                adot_pn = adot_pn1;
                pot_bh  = pot_bh1;
                a_bh    = a_bh1;
                adot_bh = adot_bh1;
            } else {
                a_pn    = a_pn2;
                adot_pn = adot_pn2;
                pot_bh  = pot_bh2;
                a_bh    = a_bh2;
                adot_bh = adot_bh2;
            }
            if (config->live_smbh_custom_eps >= 0) {
                fprintf(out, "%.16E \t %.8E \t % .16E % .16E % .16E \t %.16E \t % .16E % .16E % .16E \t %.16E \t % .8E \t % .8E % .8E % .8E \t %.8E \t % .8E % .8E % .8E \t %.8E \t %.8E \t\t % .8E \t % .8E % .8E % .8E \t %.8E \t % .8E % .8E % .8E \t %.8E ",
                        time_cur, m[i],
                        x[i][0], x[i][1], x[i][2], x[i].norm(),
                        v[i][0], v[i][1], v[i][2], v[i].norm(),
                        pot[i],
                        a[i][0],       a[i][1],       a[i][2],       a[i].norm(),
                        adot[i][0],    adot[i][1],    adot[i][2],    adot[i].norm(),
                        dt[i],
                        pot_bh,
                        a_bh[0],       a_bh[1],       a_bh[2],       a_bh.norm(),
                        adot_bh[0],    adot_bh[1],    adot_bh[2],    adot_bh.norm());
                if (config->binary_smbh_pn) {
                    fprintf(out, "\t");
                    for (int pn_idx=0; pn_idx < 7; pn_idx++) {
                        fprintf(out, "\t % .8E % .8E % .8E \t %.8E \t % .8E % .8E % .8E \t %.8E ", a_pn[pn_idx][0], a_pn[pn_idx][1], a_pn[pn_idx][2], a_pn[pn_idx].norm(), adot_pn[pn_idx][0], adot_pn[pn_idx][1], adot_pn[pn_idx][2], adot_pn[pn_idx].norm());
                    }
                }
                fprintf(out, "\n");
            } else {
                fprintf(out,"%.16E \t %.8E \t % .16E % .16E % .16E \t %.16E \t % .16E % .16E % .16E \t %.16E \t % .8E \t % .8E % .8E % .8E \t %.8E \t % .8E % .8E % .8E \t %.8E \t %.8E \n",
                        time_cur, m[i],
                        x[i][0], x[i][1], x[i][2], x[i].norm(),
                        v[i][0], v[i][1], v[i][2], v[i].norm(),
                        pot[i],
                        a[i][0], a[i][1], a[i][2], a[i].norm(),
                        adot[i][0], adot[i][1], adot[i][2], adot[i].norm(),
                        dt[i]);
            }
        }
        fprintf(out, "\n");
        fclose(out);
    } else if (config->live_smbh_count == 1) {
        auto out = fopen("bh.dat", "a");
        double tmp_r = sqrt( SQR(x[0][0]) + SQR(x[0][1]) + SQR(x[0][2]) );
        double tmp_v = sqrt( SQR(v[0][0]) + SQR(v[0][1]) + SQR(v[0][2]) );
        double tmp_a = sqrt( SQR(a[0][0]) + SQR(a[0][1]) + SQR(a[0][2]) );
        double tmp_adot = sqrt( SQR(adot[0][0]) + SQR(adot[0][1]) + SQR(adot[0][2]) );
        fprintf(out,"%.16E \t %.8E \t % .16E % .16E % .16E \t %.16E \t % .16E % .16E % .16E \t %.16E \t % .8E \t % .8E % .8E % .8E \t %.8E \t % .8E % .8E % .8E \t %.8E \t %.8E \n",
                time_cur, m[0],
                x[0][0], x[0][1], x[0][2], tmp_r,
                v[0][0], v[0][1], v[0][2], tmp_v,
                pot[0],
                a[0][0], a[0][1], a[0][2], tmp_a,
                adot[0][0], adot[0][1], adot[0][2], tmp_adot,
                dt[0]);
        fprintf(out,"\n");
        fclose(out);
    }
}

void write_bh_nb_data(double time_cur, int N, double m[], double3 x[], double3 v[])
{
    int nb = config->live_smbh_neighbor_number;
    int    ind_sort[N_MAX];
    double var_sort[N_MAX];
    //TODO you don't want and probably don't need to allocate these here. Maybe just need size nb, or maybe use as private variables.

    auto out = fopen("bh_neighbors.dat", "a");

    /* 1st BH */

    for (int i_bh=0; i_bh < config->live_smbh_count; i_bh++) {
        for (int i=0; i<N; i++) var_sort[i] = (x[i]-x[i_bh]).norm();
        std::iota(ind_sort, ind_sort+N, 0);
        std::partial_sort(ind_sort, ind_sort + nb, ind_sort + N, [&](int i, int j) {return var_sort[i] < var_sort[j];});

        fprintf(out,"%.16E \t %07d \t %.8E \t % .8E % .8E % .8E \t % .8E % .8E % .8E \t",
                time_cur,
                i_bh,
                m[i_bh],
                x[i_bh][0], x[i_bh][1], x[i_bh][2],
                v[i_bh][0], v[i_bh][1], v[i_bh][2]);

        for (int j=1; j < nb; j++) {
            int i = ind_sort[j];
            fprintf(out,"%02d %07d   %.8E   % .8E % .8E % .8E   %.8E   % .8E % .8E % .8E   %.8E \t",
                    j, i,
                    m[i],
                    x[i][0], x[i][1], x[i][2], (x[i]-x[i_bh]).norm(),
                    v[i][0], v[i][1], v[i][2], (v[i]-v[i_bh]).norm());
        }

        fprintf(out,"\n");
    }
    fprintf(out, "\n"); // this is redundant
    fclose(out);
}

void calc_self_grav(double t, double eps2, double &g6_calls, int n_loc,
                    int n_act, int ind_act[],
                    double3 x_act_new[], double3 v_act_new[],
                    double pot_act_tmp[],
                    double3 a_act_tmp[],
                    double3 adot_act_tmp[],
                    double h2_i[])
{
    /* calc the new grav for the active particles */

#ifdef TIMING
    get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

    g6_set_ti(clusterid, t);

    int ni = n_act; // TODO why is this needed?

    /* define the local phi, a, adot for these active particles */

    for (int i=0; i<ni; i+=npipe) {
        nn = npipe;
        if (ni-i < npipe) nn = ni - i;
        for (ii=0; ii<nn; ii++) {
            h2_i[ii]    = eps2; // TODO This should be a global or something
        } /* ii */
        //TODO any way we can clean up this ugly casting?
        g6calc_firsthalf(clusterid, n_loc, nn, ind_act+i, (double(*)[3])x_act_new+i, (double(*)[3])v_act_new+i, (double(*)[3])a_act_tmp+i, (double(*)[3])adot_act_tmp+i, pot_act_tmp+i, eps2, h2_i);
        g6calc_lasthalf( clusterid, n_loc, nn, ind_act+i, (double(*)[3])x_act_new+i, (double(*)[3])v_act_new+i, eps2, h2_i, (double(*)[3])a_act_tmp+i, (double(*)[3])adot_act_tmp+i, pot_act_tmp+i);
        g6_calls++;
    } /* i */

#ifdef TIMING
    get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
    DT_ACT_GRAV += (CPU_tmp_user - CPU_tmp_user0);
#endif
}

void calc_ext_grav(std::vector<External_gravity*> &external_gravity_components, int n_act, double3 *x_act_new, double3 *v_act_new, double *pot_act_ext, double3 *a_act_new, double3* adot_act_new)
{

/* Define the external potential for all active particles on all nodes */

int ni = n_act; // TODO redundant?

    std::fill(pot_act_ext, pot_act_ext+n_act, 0.);

    for (auto component : external_gravity_components) {
        if (component->is_active)
            component->apply(n_act, x_act_new, v_act_new, pot_act_ext, a_act_new, adot_act_new);
    }

#ifdef TIMING
get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

    /* For simple Plummer potential... */

#ifdef TIMING
get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
DT_EXT_GRAV += (CPU_tmp_user - CPU_tmp_user0);
#endif

}

void energy_contr(const double time_cur, const double timesteps, const double n_act_sum, const double g6_calls, double& rcm_sum, double& vcm_sum, double& E_tot_0, double& E_tot_corr_0, double& E_tot_corr_sd_0, int &skip_con, int N, double m[], double3 x[], double3 v[], double pot[], double pot_ext[])
{
    // TODO maybe use static variables here for the previous step energy?
    double E_pot = 0.0;
    for (int i=0; i<N; i++) E_pot += m[i]*pot[i];
    E_pot *= 0.5;

    double E_kin = 0.0;
    for (int i=0; i<N; i++) E_kin += m[i]*( SQR(v[i][0]) + SQR(v[i][1]) + SQR(v[i][2]) );
    E_kin *= 0.5;

    double E_pot_ext = 0.0;

    for (int i=0; i<N; i++) E_pot_ext += m[i]*pot_ext[i];

    if (timesteps == 0.0) { //TODO what is this thing?
        rcm_sum = 0.0;
        vcm_sum = 0.0;
    }

    double mcm     = 0.0;
    double rcm_mod = 0.0;
    double vcm_mod = 0.0;

    double3 xcm = {0, 0, 0};
    double3 vcm = {0, 0, 0};

    for (int i=0; i<N; i++) {
        mcm += m[i];
        xcm += m[i] * x[i];
        vcm += m[i] * v[i];
    } /* i */

    xcm /= mcm;
    vcm /= mcm;

    rcm_mod = xcm.norm();
    vcm_mod = vcm.norm();

    rcm_sum += rcm_mod;
    vcm_sum += vcm_mod;

    double3 mom = {0, 0, 0};

    for (int i=0; i<N; i++) {
        mom[0] += m[i] * ( x[i][1] * v[i][2] - x[i][2] * v[i][1] );
        mom[1] += m[i] * ( x[i][2] * v[i][0] - x[i][0] * v[i][2] );
        mom[2] += m[i] * ( x[i][0] * v[i][1] - x[i][1] * v[i][0] );
    } /* i */

    get_CPU_time(&CPU_time_real, &CPU_time_user, &CPU_time_syst);

    double E_tot = E_pot + E_kin + E_pot_ext;
    double E_tot_corr    = E_tot /* + E_corr */; // TODO what is this?

    if (timesteps == 0.0) {
        /* initialize the E_tot_0 + etc... */

        E_tot_0 = E_tot;
        E_tot_corr_0 = E_tot;
        E_tot_corr_sd_0 = E_tot;

        skip_con = 1;
    } else {
        /* read the E_tot_0 + etc... */
        if (skip_con == 0) {
            auto inp = fopen("contr.con","r");
            double tmp;
            fscanf(inp,"%lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE",
                   &tmp, &tmp, &tmp, &tmp,
                   &tmp, &tmp, &tmp, &tmp, &tmp, &tmp, &tmp,
                   &E_tot_0, &tmp,
                   &E_tot_corr_0, &tmp,
                   &E_tot_corr_sd_0, &tmp,
                   &tmp, &tmp, &tmp, &tmp, &tmp, &tmp, &tmp, &tmp);
            fclose(inp);
            skip_con = 1;
        }
    }
    double DE_tot         = E_tot         - E_tot_0;

    /* This is the only output to screen */
    printf("%.3E %.3E  % .4E %.4E % .4E  % .4E % .4E  %.2E\n",
            time_cur, timesteps,
            E_pot, E_kin, E_pot_ext, E_tot, DE_tot,
            CPU_time_user-CPU_time_user0);

    fflush(stdout);

    auto out = fopen("contr.dat", "a");
    fprintf(out,"%.8E \t %.8E %.8E %.8E \t % .8E % .8E % .8E % .8E % .8E \t % .8E % .8E \t % .8E % .8E % .8E \t %.8E %.8E %.8E \n",
            time_cur, timesteps, n_act_sum, g6_calls,
            E_pot, E_kin, E_pot_ext,
            E_tot, DE_tot,
            rcm_mod, vcm_mod,
            mom[0], mom[1], mom[2],
            CPU_time_real-CPU_time_real0, CPU_time_user-CPU_time_user0, CPU_time_syst-CPU_time_syst0);
    fclose(out);

    out = fopen("contr.con", "w");
    fprintf(out,"%.8E \t %.8E %.8E %.8E \t % .8E % .8E % .8E % .8E % .8E \t % .8E % .8E \t % .8E % .8E % .8E \t %.8E %.8E %.8E \n",
            time_cur, timesteps, n_act_sum, g6_calls,
            E_pot, E_kin, E_pot_ext,
            E_tot, DE_tot,
            rcm_mod, vcm_mod,
            mom[0], mom[1], mom[2],
            CPU_time_real-CPU_time_real0, CPU_time_user-CPU_time_user0, CPU_time_syst-CPU_time_syst0);
    fclose(out);

    E_tot_0 = E_tot;
    E_tot_corr_0 = E_tot_corr;
    E_tot_corr_sd_0 = 0;
}

class Active_search {
    // TODO you can add pointers to t and dt at the constructor, no point giving them at get_minimum_time but without the size.
public:
    Active_search(const int myRank, const int n_proc, const int n_loc, const int N)
        : myRank(myRank), n_proc(n_proc), n_loc(n_loc), N(N)
    {
        ind_act_loc = new int[n_loc];
    }
    ~Active_search() { delete[] ind_act_loc; };
    double get_minimum_time(const double t[], const double dt[])
    {
        double min_t_loc, min_t;
#ifdef ACT_DEF_GRAPITE
        min_t_loc = grapite_get_minimum_time();
#else
        min_t_loc = t[myRank*n_loc]+dt[myRank*n_loc];
        for (int j=myRank*n_loc+1; j<(myRank+1)*n_loc; j++) {
            double tmp = t[j] + dt[j];
            if (tmp < min_t_loc) min_t_loc = tmp;
        }
#endif
        /* Reduce the "global" min_t from min_t_loc "local" on all processors) */
        MPI_Allreduce(&min_t_loc, &min_t, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
        return min_t;
    }
    void get_active_indices(const double min_t, const double t[], const double dt[], int ind_act[], int *n_act)
    {
#ifdef ACT_DEF_GRAPITE
        int n_act_loc;
        grapite_active_search(min_t, ind_act_loc, &n_act_loc);
        if (myRank > 0)
            for (int i=0; i<n_act_loc; i++)
                ind_act_loc[i] += myRank*n_loc;
        int n_act_arr[256], displs[256]; // Assuming maximum of 256 processes... seems safe.
        MPI_Allgather(&n_act_loc, 1, MPI_INT, n_act_arr, 1, MPI_INT, MPI_COMM_WORLD);
        *n_act = n_act_arr[0];
        for (int i=1; i<n_proc; i++)
            *n_act += n_act_arr[i];
        displs[0] = 0;
        for (int i=1; i<n_proc; i++)
            displs[i]=displs[i-1]+n_act_arr[i-1];
        MPI_Allgatherv(ind_act_loc, n_act_loc, MPI_INT, ind_act, n_act_arr, displs, MPI_INT, MPI_COMM_WORLD);
#else
        *n_act = 0;
        for (int i=0; i<N; i++) {
            if (t[i]+dt[i] == min_t) ind_act[(*n_act)++] = i;
        } /* i */
#endif      // ACT_DEF_GRAPITE
    }
private:
    int myRank, n_proc, n_loc, N;
    int *ind_act_loc;
};

class Source_particle_list {
public:
    Source_particle_list(int N)
        : N(N)
    {
        ind  = new int[N];
        m    = new double[N];
        x    = new double3[N];
        v    = new double3[N];
        pot  = new double[N];
        a    = new double3[N];
        adot = new double3[N];
        t    = new double[N];
        dt   = new double[N];
    }
    ~Source_particle_list()
    {
        delete[] ind;
        delete[] m;
        delete[] x;
        delete[] v;
        delete[] pot;
        delete[] a;
        delete[] adot;
        delete[] t;
        delete[] dt;
    }
    int N;
    int *ind;
    double3 *x, *v, *a, *adot;
    double *m, *t, *dt, *pot;
};

inline void calc_high_derivatives(const double dt_tmp, const double3 a_act, const double3 a_act_new, const double3 adot_act, const double3 adot_act_new, double3 *a2, double3 *a3)
{
    double dtinv = 1.0/dt_tmp;
    double dt2inv = dtinv*dtinv;
    double dt3inv = dt2inv*dtinv;

    double3 a0mia1 = a_act-a_act_new;
    double3 ad04plad12 = 4.0*adot_act + 2.0*adot_act_new;
    double3 ad0plad1 = adot_act + adot_act_new;

    *a2 = -6.0*a0mia1*dt2inv - ad04plad12*dtinv;
    *a3 = 12.0*a0mia1*dt3inv + 6.0*ad0plad1*dt2inv;
}

inline void corrector(const double dt_tmp, const double3 a2, const double3 a3, double3* x_act_new, double3* v_act_new)
{
    double dt3over6 = dt_tmp*dt_tmp*dt_tmp/6.0;
    double dt4over24 = dt3over6*dt_tmp/4.0;
    double dt5over120 = dt4over24*dt_tmp/5.0;

    *x_act_new += dt4over24*a2 + dt5over120*a3;
    *v_act_new += dt3over6*a2 + dt4over24*a3;
}

inline double aarseth_step(const double eta, const double dt_tmp, const double3 a_act_new, const double3 adot_act_new, const double3 a2, const double3 a3)
{
    double a1abs = a_act_new.norm();
    double adot1abs = adot_act_new.norm();
    double3 a2dot1 = a2 + dt_tmp*a3;
    double a2dot1abs = a2dot1.norm();
    double a3dot1abs = a3.norm();

    return sqrt(eta*(a1abs*a2dot1abs+adot1abs*adot1abs)/(adot1abs*a3dot1abs+a2dot1abs*a2dot1abs));
}

void binary_smbh_influence_sphere_output(int i_bh1, int i_bh2, int ind_act[], double m_act[], double3 x_act[], double3 v_act[], double pot_act[], double dt_act[], int n_act, double timesteps, double time_cur,  double factor, int inf_event[])
{
    //TODO !!IMPORTANT!! only open the file IF THERE IS ANYTHING TO WRITE!!!
    //TODO inf_event to be static or something?
    auto out = fopen("bbh_inf.dat", "a");

    double m_bh1 = m_act[i_bh1];
    double m_bh2 = m_act[i_bh2];

    double3 x_bh1 = x_act[i_bh1];
    double3 x_bh2 = x_act[i_bh2];

    double3 v_bh1 = v_act[i_bh1];
    double3 v_bh2 = v_act[i_bh2];

    double3 x_bbhc = (m_bh1*x_bh1 + m_bh2*x_bh2)/(m_bh1 + m_bh2);
    double3 v_bbhc = (m_bh1*v_bh1 + m_bh2*v_bh2)/(m_bh1 + m_bh2);

    double DR2 = SQR(x_bh1[0] - x_bh2[0]) + SQR(x_bh1[1] - x_bh2[1]) + SQR(x_bh1[2] - x_bh2[2]);
    double DV2 = SQR(v_bh1[0] - v_bh2[0]) + SQR(v_bh1[1] - v_bh2[1]) + SQR(v_bh1[2] - v_bh2[2]);

    double EB = -(m_bh1 + m_bh2) / sqrt(DR2) + 0.5 * DV2;

    double SEMI_a = -0.5 * (m_bh1 + m_bh2)/EB;
    double SEMI_a2 = SQR(SEMI_a);

    for (int i=2; i<n_act; i++) {

        double tmp_r2 = SQR(x_act[i][0] - x_bbhc[0]) + SQR(x_act[i][1] - x_bbhc[1]) + SQR(x_act[i][2] - x_bbhc[2]);

        int iii = ind_act[i];

        if (tmp_r2 < SEMI_a2*SQR(factor)) {

            if (inf_event[iii] == 0) {

                fprintf(out,"INF1 %.6E %.16E %07d %07d   %.6E % .6E % .6E % .6E % .6E % .6E % .6E   %.6E % .6E % .6E % .6E % .6E % .6E % .6E % .6E  %.6E % .6E % .6E % .6E % .6E % .6E % .6E % .6E   %.6E   %.6E % .6E % .6E % .6E % .6E % .6E % .6E % .6E  %.6E \n",
                    timesteps, time_cur, i, ind_act[i],
                    sqrt(DR2), x_bbhc[0], x_bbhc[1], x_bbhc[2], v_bbhc[0], v_bbhc[1], v_bbhc[2],
                    m_bh1, x_bh1[0], x_bh1[1], x_bh1[2], v_bh1[0], v_bh1[1], v_bh1[2], pot_act[0],
                    m_bh2, x_bh2[0], x_bh2[1], x_bh2[2], v_bh2[0], v_bh2[1], v_bh2[2], pot_act[1],
                    sqrt(tmp_r2),
                    m_act[i], x_act[i][0], x_act[i][1], x_act[i][2], v_act[i][0], v_act[i][1], v_act[i][2], pot_act[i],
                    dt_act[i]);

            inf_event[iii] = 1;
            }

        } else {
            if (inf_event[iii] == 1) {
                fprintf(out,"INF2 %.6E %.16E %07d %07d   %.6E % .6E % .6E % .6E % .6E % .6E % .6E   %.6E % .6E % .6E % .6E % .6E % .6E % .6E % .6E  %.6E % .6E % .6E % .6E % .6E % .6E % .6E % .6E   %.6E   %.6E % .6E % .6E % .6E % .6E % .6E % .6E % .6E  %.6E \n",
                    timesteps, time_cur, i, ind_act[i],
                    sqrt(DR2), x_bbhc[0], x_bbhc[1], x_bbhc[2], v_bbhc[0], v_bbhc[1], v_bbhc[2],
                    m_bh1, x_bh1[0], x_bh1[1], x_bh1[2], v_bh1[0], v_bh1[1], v_bh1[2], pot_act[0],
                    m_bh2, x_bh2[0], x_bh2[1], x_bh2[2], v_bh2[0], v_bh2[1], v_bh2[2], pot_act[1],
                    sqrt(tmp_r2),
                    m_act[i], x_act[i][0], x_act[i][1], x_act[i][2], v_act[i][0], v_act[i][1], v_act[i][2], pot_act[i],
                    dt_act[i]);
            }

            inf_event[iii] = 0;

        } /* if (tmp_r2 < DR2*R_INF2) */

    } /* i */
    fclose(out);

}

int main(int argc, char *argv[])
{
    int name_proc, n_proc=1, myRank=0, rootRank=0, cur_rank,
        nj, diskstep=0, power, jjj, iii,
        skip_con=0, tmp_i;

    double dt_disk, dt_contr, t_disk=0.0, t_contr=0.0,
           dt_bh, t_bh=0.0, dt_bh_tmp,
           t_end, time_cur, dt_min, dt_max, min_t, min_t_loc, dt_new,
           eta_s, eta, eta_bh,
           E_tot_0, E_tot_corr_0, E_tot_corr_sd_0,
           rcm_sum=0.0, vcm_sum=0.0,
           eps=0.0, eps2,
           a2_mod, adot2_mod,
           dt_tmp, dt2half, dt3over6, dt4over24, dt5over120,
           dtinv, dt2inv, dt3inv,
           a1abs, adot1abs, a2dot1abs, a3dot1abs,
           timesteps=0.0, n_act_sum=0.0, n_act_distr[N_MAX], g6_calls=0.0, g6_calls_sum=0.0,
           tmp;

    double3 xcm, vcm, mom,
            xdc, vdc,
            a2, a3, a2dot1;

    char processor_name[MPI_MAX_PROCESSOR_NAME],
         inp_fname[30];

    /* global variables */

    Source_particle_list source_particle_list(N_MAX);

    int N;
    int     *ind  = source_particle_list.ind;
    double  *m    = source_particle_list.m;
    double3 *x    = source_particle_list.x;
    double3 *v    = source_particle_list.v;
    double  *pot  = source_particle_list.pot;
    double3 *a    = source_particle_list.a;
    double3 *adot = source_particle_list.adot;
    double  *t    = source_particle_list.t;
    double  *dt   = source_particle_list.dt;

    /* local variables */
    int n_loc;

    /* data for active particles */

    int    n_act, ind_act[N_MAX];
    double m_act[N_MAX],
           pot_act[N_MAX], t_act[N_MAX], dt_act[N_MAX],
           pot_act_new[N_MAX],
           pot_act_tmp[N_MAX];

    double3 x_act[N_MAX], v_act[N_MAX],
            a_act[N_MAX], adot_act[N_MAX],
            x_act_new[N_MAX], v_act_new[N_MAX],
            a_act_new[N_MAX], adot_act_new[N_MAX],
            a_act_tmp[N_MAX], adot_act_tmp[N_MAX];;

    FILE   *out;

    double C_NB;
    int    usedOrNot[7];

    double pot_ext[N_MAX], pot_act_ext[N_MAX];    // for all EXTPOT

    int    i_bh1, i_bh2;

    double m_bh1, m_bh2;

    int    inf_event[N_MAX];
    double DR2, tmp_r2;
    double3 x_bbhc, v_bbhc;
    double DV2, EB, SEMI_a, SEMI_a2;

    double s_bh1[3] = {0.0, 0.0, 1.0};
    double s_bh2[3] = {0.0, 0.0, 1.0};

    double3 zeros = {0, 0, 0}; // Dummy; can't really be const because of the GRAPE interface.

    /* INIT the rand() !!! */
    srand(19640916);                 /* it is just my birthday :-) */

    /* Init MPI */
    MPI_Init(&argc, &argv);

    /* Define the total number of processors */
    MPI_Comm_size(MPI_COMM_WORLD, &n_proc);

    /* Define of the Rank of each processors */
    MPI_Comm_rank(MPI_COMM_WORLD, &myRank);

    /* Define the processors names */
    MPI_Get_processor_name(processor_name, &name_proc);

    /* Print the Rank and the names of processors */
    printf("Rank of the processor %03d on %s \n", myRank, processor_name);

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    /* read the input parameters to the rootRank */

    config = new Config("phigrape.conf");
    C_NB = config->pn_c;
    std::copy(config->pn_usage.begin(), config->pn_usage.end(), usedOrNot);

    if (is_hdf5(config->input_file_name)) {
#ifndef HAS_HDF5
        fprintf(stderr, "ERROR: input file is in HDF5 format, but the code was compiled without HDF5 support\n")
        return -1;
#endif
        h5_read(config->input_file_name, &diskstep, &N, &time_cur, m, x, v);
    }
    else
        ascii_read(config->input_file_name, &diskstep, &N, &time_cur, m, x, v);
    std::iota(ind, ind+N, 0);

    if (myRank == rootRank) {

        //TODO move it out of (myRank == rootRank) so you don't need to communicate them.
        eps = config->eps;
        t_end = config->t_end;
        dt_disk = config->dt_disk;
        dt_contr = config->dt_contr;
        dt_bh = config->dt_bh;
        eta = config->eta;
        strcpy(inp_fname, config->input_file_name.c_str());

        if (config->binary_smbh_influence_sphere_output) for (int i=0; i<N; i++) inf_event[i] = 0; // WARNING N wasn't set yet!

        /*
        eps      : Plummer softening parameter (can be even 0)
        t_end    : end time of calculation
        dt_disk  : interval of snapshot files output (0xxx.dat)
        dt_contr : interval for the energy control output (contr.dat)
        dt_bh    : interval for BH output (bh.dat & bh_neighbors.dat)
        eta      : parameter for timestep determination
        inp_data : name of the input file (data.inp)
        */

        printf("\n");
        printf("Begin the calculation of phi-GRAPE program on %03d processors\n", n_proc);
        printf("\n");
        printf("N       = %07d \t eps      = %.6E \n", N, eps);
        printf("t_beg   = %.6E \t t_end    = %.6E \n", time_cur, t_end);
        printf("dt_disk = %.6E \t dt_contr = %.6E \n", dt_disk, dt_contr);
        printf("dt_bh   = %.6E \n", dt_bh);
        printf("eta     = %.6E \n", eta);
        printf("\n");

        printf("External Potential: \n");
        printf("\n");

        if ((diskstep == 0) && (time_cur == 0.0)) {
            out = fopen("contr.dat", "w");
            fclose(out);

#ifdef TIMING
            out = fopen("timing.dat", "w");
            fclose(out);
#endif

            if (config->live_smbh_output && (config->live_smbh_count > 0)) {
                out = fopen("bh.dat", "w");
                fclose(out);
            }
            if ((config->live_smbh_neighbor_output) && (config->live_smbh_count > 0)) {
                out = fopen("bh_neighbors.dat", "w");
                fclose(out);
            }
            if (config->binary_smbh_influence_sphere_output) {
                out = fopen("bbh_inf.dat", "w");
                fclose(out);
            }

        } else {   // if (diskstep == 0)

        }        // if (diskstep == 0)

        get_CPU_time(&CPU_time_real0, &CPU_time_user0, &CPU_time_syst0);

    } /* if (myRank == rootRank) */

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    /* Broadcast all useful values to all processors... */
    MPI_Bcast(&N,        1, MPI_INT,    rootRank, MPI_COMM_WORLD);
    MPI_Bcast(&eps,      1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(&eta,      1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(&t_end,    1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(&dt_disk,  1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(&dt_contr, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(&dt_bh,    1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(&time_cur, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    double normalization_mass=1, normalization_length=1, normalization_velocity=1;
    if (config->ext_units_physical) {
        normalization_mass = 1/config->unit_mass;
        normalization_length = 1000/config->unit_length;
        normalization_velocity = 1.52484071426404437233e+01*sqrt(config->unit_length/config->unit_mass);
    }
    Plummer ext_bulge(config->ext_m_bulge*normalization_mass, config->ext_b_bulge*normalization_length);
    Miyamoto_Nagai ext_disk(config->ext_m_disk*normalization_mass, config->ext_a_disk*normalization_length, config->ext_b_disk*normalization_length);
    Plummer ext_halo_plummer(config->ext_m_halo_plummer*normalization_mass, config->ext_b_halo_plummer*normalization_length);
    Logarithmic_halo ext_log_halo(config->ext_log_halo_v*normalization_velocity, config->ext_log_halo_r*normalization_length);
    Dehnen ext_dehnen(config->ext_dehnen_m*normalization_mass, config->ext_dehnen_r*normalization_length, config->ext_dehnen_gamma);

    std::vector<External_gravity*> external_gravity_components;
    external_gravity_components.push_back(&ext_bulge);
    external_gravity_components.push_back(&ext_disk);
    external_gravity_components.push_back(&ext_halo_plummer);
    external_gravity_components.push_back(&ext_log_halo);
    external_gravity_components.push_back(&ext_dehnen);

    if (ext_bulge.is_active)        printf("m_bulge = %.4E                        b_bulge = %.4E\n", config->ext_m_bulge*normalization_mass, config->ext_b_bulge*normalization_length);
    if (ext_disk.is_active)         printf("m_disk  = %.4E  a_disk  = %.4E  b_disk  = %.4E\n", config->ext_m_disk*normalization_mass, config->ext_a_disk*normalization_length, config->ext_b_disk*normalization_length);
    if (ext_halo_plummer.is_active) printf("m_halo  = %.4E                        b_halo = %.4E\n", config->ext_m_halo_plummer*normalization_mass, config->ext_b_halo_plummer*normalization_length);
    if (ext_log_halo.is_active)     printf("v_halo = %.6E r_halo = %.6E \n", config->ext_log_halo_v*normalization_velocity, config->ext_log_halo_r*normalization_length);
    if (ext_dehnen.is_active)       printf("m_ext  = %.6E r_ext  = %.6E \t g_ext = %.3E \n", config->ext_dehnen_m*normalization_mass, config->ext_dehnen_r*normalization_length, config->ext_dehnen_gamma);
    printf("\n");
    fflush(stdout);

    eta_s  = eta/ETA_S_CORR;
    eta_bh = eta/ETA_BH_CORR;

    eps2 = SQR(eps);

    dt_min = 1.0*pow(2.0, DTMINPOWER);
    dt_max = 1.0*pow(2.0, DTMAXPOWER);

    if (dt_disk == dt_contr)
        dt_max = dt_contr;
    else
        dt_max = MIN(dt_disk, dt_contr);

    if (dt_max > 1.0) dt_max = 1.0;

    t_disk  = dt_disk*(1.0+floor(time_cur/dt_disk));
    t_contr = dt_contr*(1.0+floor(time_cur/dt_contr));
    t_bh    = dt_bh*(1.0+floor(time_cur/dt_bh));

    if (myRank == rootRank) {
        printf("t_disk = %.6E   t_contr = %.6E   t_bh = %.6E \n", t_disk, t_contr, t_bh);
        printf("\n");
        fflush(stdout);
    } /* if (myRank == rootRank) */

    std::fill(t,  t+N,  time_cur);
    std::fill(dt, dt+N, dt_min);

    n_loc = N/n_proc;

    Active_search active_search(myRank, n_proc, n_loc, N);

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    /* Broadcast the values of all particles to all processors... */
    MPI_Bcast(ind, N, MPI_INT,    rootRank, MPI_COMM_WORLD);
    MPI_Bcast(m,   N, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(x, 3*N, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(v, 3*N, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    /* init the local GRAPE's */

    if (config->devices_per_node==0) {
        MPI_Comm shmcomm;
        MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm);
        MPI_Comm_size(shmcomm, &numGPU);
        MPI_Comm_rank(shmcomm, &clusterid);
    } else {
        numGPU = config->devices_per_node;
        clusterid = myRank % numGPU;
    }
    printf("Rank of the processor %03d : Number of GPUs %01d : Cluster ID %01d \n", myRank, numGPU, clusterid);
    fflush(stdout);

    g6_open(clusterid);
    npipe = g6_npipes();
    g6_set_tunit(new_tunit);
    g6_set_xunit(new_xunit);

#ifdef ETICS
    grapite_read_particle_tags(N, "grapite.mask", myRank, n_loc);
    grapite_set_dt_exp(dt_exp);
    dt_exp = time_cur; // if we don't have a binary restart then we don't remember the coefficients, and we need to calculate them now.
    grapite_set_t_exp(t_exp);
#endif

    /* load the nj particles to the G6 */

    for (int j=0; j<n_loc; j++) {
        int kkk = j + myRank*n_loc;
        g6_set_j_particle(clusterid, j, ind[kkk], t[kkk], dt[kkk], m[kkk], zeros, zeros, zeros, v[kkk], x[kkk]);
    } /* j */

#ifdef ETICS
    double etics_length_scale;
    if (myRank == rootRank) etics_length_scale = grapite_get_length_scale(N, m, x, v); // We don't want all ranks to do it, because they need to write a file and might confuse each other
    MPI_Bcast(&etics_length_scale, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    grapite_set_length_scale(etics_length_scale);
#endif

#ifdef ETICS_CEP
    // First time only: get the CEP index
    grapite_cep_index = grapite_get_cep_index();

    // First calculate the DC
    grapite_calc_center(N, m, x, v, xcm, vcm, xdc, vdc);

    // Now copy it to the global particle list
    memcpy(x[grapite_cep_index], xdc, 3*sizeof(double));
    memcpy(v[grapite_cep_index], vdc, 3*sizeof(double));

    double zeros[3] = {0., 0., 0.}; // We haven't calculated the force yet!
    grapite_update_cep(time_cur, xdc, vdc, zeros, zeros);
#endif

    /* define the all particles as a active on all the processors for the first time grav calc. */

    n_act = N;

    for (int i=0; i<n_act; i++) {
        ind_act[i]  = ind[i];
        iii         = ind_act[i];
        m_act[i]    = m[iii];

        x_act[i] = x[iii];
        v_act[i] = v[iii];

        t_act[i]  = t[iii];
        dt_act[i] = dt[iii];

    } /* i */

    // NOTE this is where calc_self_grav_zero() used to be.
    calc_self_grav(time_cur, eps2, g6_calls, n_loc,
                   n_act, ind_act,
                   x_act, v_act,
                   pot_act_tmp,
                   a_act_tmp,
                   adot_act_tmp,
                   h2_i);

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    /* Reduce the "global" vectors from "local" on all processors) */
    MPI_Allreduce(pot_act_tmp,  pot,    n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    MPI_Allreduce(a_act_tmp,    a,    3*n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    MPI_Allreduce(adot_act_tmp, adot, 3*n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    if (config->live_smbh_count == 2) {
        i_bh1 = 0;
        i_bh2 = 1;

        if (config->live_smbh_custom_eps >= 0) {

            m_bh1 = m[i_bh1];
            m_bh2 = m[i_bh2];

            x_bh1 = x[i_bh1];
            v_bh1 = v[i_bh1];

            x_bh2 = x[i_bh2];
            v_bh2 = v[i_bh2];

            // calculate and "minus" the BH <-> BH _softened_ pot, acc & jerk

            tmp_i = calc_force_n_BH(m_bh1, x_bh1, v_bh1,
                                    m_bh2, x_bh2, v_bh2,
                                    eps,
                                    &pot_bh1, a_bh1, adot_bh1,
                                    &pot_bh2, a_bh2, adot_bh2);

            pot[i_bh1] -= pot_bh1;
            pot[i_bh2] -= pot_bh2;

            a[i_bh1] -= a_bh1;
            a[i_bh2] -= a_bh2;

            adot[i_bh1] -= adot_bh1;
            adot[i_bh2] -= adot_bh2;

            // calculate and "plus" the new BH <-> BH _unsoftened_ pot, acc, jerk

            tmp_i = calc_force_n_BH(m_bh1, x_bh1, v_bh1,
                                    m_bh2, x_bh2, v_bh2,
                                    config->live_smbh_custom_eps,
                                    &pot_bh1, a_bh1, adot_bh1,
                                    &pot_bh2, a_bh2, adot_bh2);

            pot[i_bh1] += pot_bh1;
            pot[i_bh2] += pot_bh2;

            a[i_bh1] += a_bh1;
            a[i_bh2] += a_bh2;

            adot[i_bh1] += adot_bh1;
            adot[i_bh2] += adot_bh2;
        }

        if (config->binary_smbh_pn) {

            // calculate and "plus" the new BH <-> BH : PN1, PN2, PN2.5, PN3, PN3.5 : acc, jerk

            dt_bh_tmp = dt[0];

            tmp_i = calc_force_pn_BH(m_bh1, x_bh1, v_bh1, s_bh1,
                                    m_bh2, x_bh2, v_bh2, s_bh2,
                                    C_NB,  dt_bh_tmp, usedOrNot,
                                    (double(*)[3])a_pn1, (double(*)[3])adot_pn1,
                                    (double(*)[3])a_pn2, (double(*)[3])adot_pn2, myRank, rootRank);

            a[i_bh1] += a_pn1[1] + a_pn1[2] + a_pn1[3] + a_pn1[4] + a_pn1[5] + a_pn1[6];
            a[i_bh2] += a_pn2[1] + a_pn2[2] + a_pn2[3] + a_pn2[4] + a_pn2[5] + a_pn2[6];

            adot[i_bh1] += adot_pn1[1] + adot_pn1[2] + adot_pn1[3] + adot_pn1[4] + adot_pn1[5] + adot_pn1[6];
            adot[i_bh2] += adot_pn2[1] + adot_pn2[2] + adot_pn2[3] + adot_pn2[4] + adot_pn2[5] + adot_pn2[6];

            if (myRank == rootRank) {
                if (tmp_i == 505) {
                    printf("PN RSDIST: %.8E \t %.8E \n", timesteps, time_cur);
                    fflush(stdout);
                    exit(-1);
                }
            }
        }
    }

    calc_ext_grav(external_gravity_components, n_act, x_act, v_act, pot_ext, a, adot);

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    /* Energy control... */
    if (myRank == rootRank) {
        energy_contr(time_cur, timesteps, n_act_sum, g6_calls, rcm_sum, vcm_sum, E_tot_0, E_tot_corr_0, E_tot_corr_sd_0, skip_con, N, m, x, v, pot, pot_ext);
    } /* if (myRank == rootRank) */

#ifdef ETICS_DUMP
    if (diskstep==0) {
        sprintf(out_fname, "coeffs.%06d.%02d.dat", 0, myRank);
        grapite_dump(out_fname, 2);
    }
#endif

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

#ifdef ETICS_CEP
    // First calculate the DC
    grapite_calc_center(N, m, x, v, xcm, vcm, xdc, vdc);

    // Now copy it to the global particle list
    memcpy(x[grapite_cep_index], xdc, 3*sizeof(double));
    memcpy(v[grapite_cep_index], vdc, 3*sizeof(double));

    grapite_update_cep(time_cur, xdc, vdc, a[grapite_cep_index], adot[grapite_cep_index]);
#endif

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    /* Define initial timestep for all particles on all nodes */

    for (int i=0; i<N; i++) {
        a2_mod = a[i][0]*a[i][0]+a[i][1]*a[i][1]+a[i][2]*a[i][2];
        adot2_mod = adot[i][0]*adot[i][0]+adot[i][1]*adot[i][1]+adot[i][2]*adot[i][2];

        if (adot2_mod == 0)
            dt_tmp = eta_s;
        else
            dt_tmp = eta_s*sqrt(a2_mod/adot2_mod);

        power = log(dt_tmp)/log(2.0) - 1;

        dt_tmp = pow(2.0, (double)power);

        if (dt_tmp < dt_min) dt_tmp = dt_min;
        if (dt_tmp > dt_max) dt_tmp = dt_max;

        dt[i] = dt_tmp;

        if (config->dt_min_warning && (myRank == 0)) {
            if (dt[i] == dt_min) {
                printf("!!! Warning0: dt = dt_min = %.6E \t ind = %07d \n", dt[i], ind[i]);
                fflush(stdout);
            }
        }
    } /* i */

    if (config->live_smbh_count > 0) {
        double min_dt = *std::min_element(dt, dt+N);
        for (int i=0; i<config->live_smbh_count; i++) dt[i] = min_dt;
    }

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    /* load the new values for particles to the local GRAPE's */
    /* load the nj particles to the G6 */

    for (int j=0; j<n_loc; j++) {
        int kkk = j + myRank*n_loc;
        g6_set_j_particle(clusterid, j, ind[kkk], t[kkk], dt[kkk], m[kkk], zeros, adot[kkk]*(1./6.), a[kkk]*0.5, v[kkk], x[kkk]);
    } /* j */

    if (myRank == rootRank) {

        /* Write BH data... */
        if (config->live_smbh_output) write_bh_data(time_cur, m, x, v, pot, a, adot, dt);

        /* Write BH NB data... */
        if (config->live_smbh_neighbor_output) write_bh_nb_data(time_cur, N, m, x, v);

    } /* if (myRank == rootRank) */

    /* Get the Starting time on rootRank */

    if (myRank == rootRank) {
        get_CPU_time(&CPU_time_real0, &CPU_time_user0, &CPU_time_syst0);
        get_CPU_time(&CPU_time_real,  &CPU_time_user,  &CPU_time_syst);
    } /* if (myRank == rootRank) */

    timesteps = 0.0; // Why won't those two be long long instead of double + should include the zeroth step
    n_act_sum = 0.0;

    for (int i=1; i<N+1; i++) { // TODO why like this?
        n_act_distr[i-1] = 0.0;
    }

    g6_calls = 0.0; //TODO this should include the calls at the zeroth step, so move it further up.

#ifdef TIMING
    DT_TOT      = 0.0;

    DT_ACT_DEF1 = 0.0;
    DT_ACT_DEF2 = 0.0;
    DT_ACT_DEF3 = 0.0;
    DT_ACT_PRED = 0.0;
    DT_ACT_GRAV = 0.0;
    DT_EXT_GRAV = 0.0;
    DT_EXT_GMC_GRAV = 0.0;
    DT_GMC_GMC_GRAV = 0.0;
    DT_ACT_CORR = 0.0;
    DT_ACT_LOAD = 0.0;

    DT_STEVOL    = 0.0;
    DT_STARDISK  = 0.0;
    DT_STARDESTR = 0.0;

    DT_ACT_REDUCE = 0.0;
#endif

    /* The main integration loop */

    while (time_cur <= t_end) {

        /* Define the minimal time and the active particles on all the nodes (exclude the ZERO masses!!!) */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

        min_t = active_search.get_minimum_time(t, dt);

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
        DT_ACT_DEF1 += (CPU_tmp_user - CPU_tmp_user0);
#endif

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

        active_search.get_active_indices(min_t, t, dt, ind_act, &n_act);

// TODO deal with it below
#ifdef ACT_DEF_GRAPITE
#error please fix here
#endif
#if defined(ACT_DEF_GRAPITE) && (defined(ADD_BH1) || defined(ADD_BH2))
#ifdef ADD_BH1
#define ACT_DEF_GRAPITE_NUMBH 1
#else
#define ACT_DEF_GRAPITE_NUMBH 2
#endif
        int acf_def_grapite_bh_count = 0;
        for (i=0; i<n_act; i++) {
            if (ind_act[i]==0) {
                i_bh1 = i;
                acf_def_grapite_bh_count++;
        }
#ifdef ADD_BH2
            else if (ind_act[i]==1) {
                i_bh2 = i;
                acf_def_grapite_bh_count++;
            }
#endif
            if (acf_def_grapite_bh_count==ACT_DEF_GRAPITE_NUMBH) break;
        }
        if (i==n_act) {
            fprintf(stderr, "ERROR: black holes were not found in the active particle list");
            exit(1);
        }
#else
        if (config->live_smbh_count > 0) {
            i_bh1 = 0;
            i_bh2 = 1;
        }
#endif

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
        DT_ACT_DEF2 += (CPU_tmp_user - CPU_tmp_user0);
#endif

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif
        for (int i=0; i<n_act; i++) {
            int j_act = ind_act[i];
            m_act[i] = m[j_act];
            x_act[i] = x[j_act];
            v_act[i] = v[j_act];
            t_act[i]  = t[j_act];
            dt_act[i] = dt[j_act];
            // NOTICE Why do we need pot_act and pot_act_ext? Probably redundant.
            pot_act[i] = pot[j_act];
            pot_act_ext[i] = pot_ext[j_act];
            a_act[i] = a[j_act];
            adot_act[i] = adot[j_act];
        } /* i */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
        DT_ACT_DEF3 += (CPU_tmp_user - CPU_tmp_user0);
#endif

        /* predict the active particles positions etc... on all the nodes */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

        for (int i=0; i<n_act; i++) {
            dt_tmp = min_t - t_act[i];
            dt2half = 0.5*dt_tmp*dt_tmp;
            dt3over6 = (1./3.)*dt_tmp*dt2half;
            x_act_new[i] = x_act[i] + v_act[i]*dt_tmp + a_act[i]*dt2half + adot_act[i]*dt3over6;
            v_act_new[i] = v_act[i] + a_act[i]*dt_tmp + adot_act[i]*dt2half;
        } /* i */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
        DT_ACT_PRED += (CPU_tmp_user - CPU_tmp_user0);
#endif

        calc_self_grav(min_t, eps2, g6_calls, n_loc,
                n_act, ind_act,
                x_act_new, v_act_new,
                pot_act_tmp,
                a_act_tmp,
                adot_act_tmp,
                h2_i);

        /* Reduce the "global" vectors from "local" on all the nodes */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

        MPI_Allreduce(pot_act_tmp,  pot_act_new,    n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
        MPI_Allreduce(a_act_tmp,    a_act_new,    3*n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
        MPI_Allreduce(adot_act_tmp, adot_act_new, 3*n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

        /* Wait to all processors to finish his works... */
        MPI_Barrier(MPI_COMM_WORLD);

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
        DT_ACT_REDUCE += (CPU_tmp_user - CPU_tmp_user0);
#endif

        if (config->live_smbh_count == 2) {
            if (config->live_smbh_custom_eps >= 0) {
                m_bh1 = m_act[i_bh1];
                m_bh2 = m_act[i_bh2];

                x_bh1 = x_act_new[i_bh1];
                v_bh1 = v_act_new[i_bh1];

                x_bh2 = x_act_new[i_bh2];
                v_bh2 = v_act_new[i_bh2];

                // calculate and "minus" the BH <-> BH softened pot, acc & jerk

                tmp_i = calc_force_n_BH(m_bh1, x_bh1, v_bh1,
                                        m_bh2, x_bh2, v_bh2,
                                        eps,
                                        &pot_bh1, a_bh1, adot_bh1,
                                        &pot_bh2, a_bh2, adot_bh2);

                pot_act_new[i_bh1] -= pot_bh1;
                pot_act_new[i_bh2] -= pot_bh2;

                a_act_new[i_bh1] -= a_bh1;
                a_act_new[i_bh2] -= a_bh2;

                adot_act_new[i_bh1] -= adot_bh1;
                adot_act_new[i_bh2] -= adot_bh2;

                // calculate and "plus" the new BH <-> BH unsoftened pot, acc, jerk

                tmp_i = calc_force_n_BH(m_bh1, x_bh1, v_bh1,
                                        m_bh2, x_bh2, v_bh2,
                                        config->live_smbh_custom_eps,
                                        &pot_bh1, a_bh1, adot_bh1,
                                        &pot_bh2, a_bh2, adot_bh2);

                pot_act_new[i_bh1] += pot_bh1;
                pot_act_new[i_bh2] += pot_bh2;

                a_act_new[i_bh1] += a_bh1;
                a_act_new[i_bh2] += a_bh2;

                adot_act_new[i_bh1] += adot_bh1;
                adot_act_new[i_bh2] += adot_bh2;
            }

            if (config->binary_smbh_pn) {
                // calculate and "plus" the new BH <-> BH : PN1, PN2, PN2.5, PN3, PN3.5 : acc, jerk

                dt_bh_tmp = dt[0];

                tmp_i = calc_force_pn_BH(m_bh1, x_bh1, v_bh1, s_bh1,
                                        m_bh2, x_bh2, v_bh2, s_bh2,
                                        C_NB,  dt_bh_tmp, usedOrNot,
                                        (double(*)[3])a_pn1, (double(*)[3])adot_pn1,
                                        (double(*)[3])a_pn2, (double(*)[3])adot_pn2, myRank, rootRank);

                a_act_new[i_bh1] += a_pn1[1] + a_pn1[2] + a_pn1[3] + a_pn1[4] + a_pn1[5] + a_pn1[6];
                a_act_new[i_bh2] += a_pn2[1] + a_pn2[2] + a_pn2[3] + a_pn2[4] + a_pn2[5] + a_pn2[6];

                adot_act_new[i_bh1] += adot_pn1[1] + adot_pn1[2] + adot_pn1[3] + adot_pn1[4] + adot_pn1[5] + adot_pn1[6];
                adot_act_new[i_bh2] += adot_pn2[1] + adot_pn2[2] + adot_pn2[3] + adot_pn2[4] + adot_pn2[5] + adot_pn2[6];

                if (myRank == rootRank) {
                    if (tmp_i == 505) {
                        printf("PN RSDIST: TS = %.8E \t t  = %.8E \n", timesteps, time_cur);
                        fflush(stdout);
                        exit(-1);
                    }
                }
            }
        }

        calc_ext_grav(external_gravity_components, n_act, x_act_new, v_act_new, pot_act_ext, a_act_new, adot_act_new);

        /* correct the active particles positions etc... on all the nodes */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

        for (int i=0; i<n_act; i++) {
            // NOTICE looks like we're doing three unrelated things in this loop: (1) correcting positions and velocities (2) calculating new steps, and (3) putting the corrected values from the _act_new back in the _act arrays.
            // After going back to the _act arrays they don't do much before they go back to the main arrays, so this copy seems redundant (the SMBH influence sphere printout needs these values but it should be a function anyway).
            // TODO split this loop into the three tasks it is doing, and remove the redundancy.
            dt_tmp = min_t - t_act[i];

            double3 a2, a3;
            calc_high_derivatives(dt_tmp, a_act[i], a_act_new[i], adot_act[i], adot_act_new[i], &a2, &a3);

            corrector(dt_tmp, a2, a3, &x_act_new[i], &v_act_new[i]);


            //TODO make beautiful
            double eta_curr;
            if ((config->live_smbh_count > 0) && (ind_act[i] < config->live_smbh_count)) eta_curr = eta_bh;
            else eta_curr = eta;

            double dt_new = aarseth_step(eta_curr, dt_tmp, a_act_new[i], adot_act_new[i], a2, a3);

            //TODO the below should be moved to a function
            if (dt_new < dt_min) dt_tmp = dt_min;

            if ((dt_new < dt_tmp) && (dt_new > dt_min)) {
                power = log(dt_new)/log(2.0) - 1;
                dt_tmp = pow(2.0, (double)power); // TODO why is this casting needed here?
            }

            if ((dt_new > 2*dt_tmp) && (fmod(min_t, 2*dt_tmp) == 0) && (2*dt_tmp <= dt_max)) {
                dt_tmp *= 2;
            }

            if (config->dt_min_warning && (myRank == 0)) {
                if (dt_act[i] == dt_min) {
                    printf("!!! Warning1: dt_act = dt_min = %.6E \t ind_act = %07d \n", dt[i], ind_act[i]);
                    fflush(stdout);
                }
            }

            /* BEGIN copy of everything */

            dt_act[i] = dt_tmp;
            t_act[i] = min_t;

            pot_act[i] = pot_act_new[i];

            x_act[i]    =    x_act_new[i];
            v_act[i]    =    v_act_new[i];
            a_act[i]    =    a_act_new[i];
            adot_act[i] = adot_act_new[i];

            /* END copy of everything */


        } /* i */

        /* define the min. dt over all the act. part. and set it also for the BH... */

        if (config->live_smbh_count > 0) {
            double min_dt = *std::min_element(dt_act, dt_act+n_act);
            if (config->live_smbh_count>=1) dt_act[i_bh1] = min_dt;
            if (config->live_smbh_count==2) dt_act[i_bh2] = min_dt;
        }

        if (config->binary_smbh_influence_sphere_output && (myRank == rootRank)) {
            //TODO clean up this mass. We don't want to have all these _act arrays; they are only needed for this lame function.
            binary_smbh_influence_sphere_output(i_bh1, i_bh2, ind_act, m_act, x_act, v_act, pot_act, dt_act, n_act, timesteps, time_cur, config->binary_smbh_influence_radius_factor, inf_event);
        }

        /* Return back the new coordinates + etc... of active part. to the global data... */

        for (int i=0; i<n_act; i++) {

            iii = ind_act[i];

            m[iii] = m_act[i];

            x[iii] = x_act[i];
            v[iii] = v_act[i];

            t[iii] = t_act[i];
            dt[iii] = dt_act[i];

            pot[iii] = pot_act[i];

            pot_ext[iii] = pot_act_ext[i];

            a[iii] = a_act[i];
            adot[iii] = adot_act[i];
        } /* i */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
        DT_ACT_CORR += (CPU_tmp_user - CPU_tmp_user0);
#endif

        /* load the new values for active particles to the local GRAPE's */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

        for (int i=0; i<n_act; i++) { // TODO would be nicer to use i instead of j here
#ifdef ETICS_CEP
            if (ind_act[i] == grapite_cep_index) grapite_update_cep(t_act[i], x_act[i], v_act[i], a_act[i], adot_act[i]); // All ranks should do it.
#endif
            cur_rank = ind_act[i]/n_loc;

            if (myRank == cur_rank) {

                jjj = ind_act[i] - myRank*n_loc;

                g6_set_j_particle(clusterid, jjj, ind_act[i], t_act[i], dt_act[i], m_act[i], zeros, adot_act[i]*(1./6.), a_act[i]*0.5, v_act[i], x_act[i]);

            } /* if (myRank == cur_rank) */

        } /* i */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
        DT_ACT_LOAD += (CPU_tmp_user - CPU_tmp_user0);
#endif

        /* Current time set to min_t */

        time_cur = min_t;

        timesteps += 1.0;
        n_act_sum += n_act;
        n_act_distr[n_act-1]++;

        if (time_cur >= t_bh) {
            if (myRank == rootRank) {
                /* Write BH data... */
                if (config->live_smbh_output) write_bh_data(time_cur, m, x, v, pot, a, adot, dt);

                /* Write BH NB data... */
                if (config->live_smbh_neighbor_output) write_bh_nb_data(time_cur, N, m, x, v);

            } /* if (myRank == rootRank) */

            t_bh += dt_bh;
        } /* if (time_cur >= t_bh) */

        if (time_cur >= t_contr) {
            if (myRank == rootRank) {

                energy_contr(time_cur, timesteps, n_act_sum, g6_calls, rcm_sum, vcm_sum, E_tot_0, E_tot_corr_0, E_tot_corr_sd_0, skip_con, N, m, x, v, pot, pot_ext);

                /* write cont data */

                if (config->output_hdf5) h5_write("data.con", diskstep, N, time_cur, m, x, v, pot, a, adot, 0, true);
                else ascii_write("data.con", diskstep, N, time_cur, m, x, v, 16);

                /* possible OUT for timing !!! */

#ifdef TIMING
                out = fopen("timing.dat", "a");

                DT_TOT = DT_ACT_DEF1 + DT_ACT_DEF2 + DT_ACT_DEF3 + DT_ACT_PRED +
                         DT_ACT_GRAV + DT_EXT_GRAV + DT_GMC_GRAV +
                         DT_GMC_GMC_GRAV + DT_EXT_GMC_GRAV +
                         DT_ACT_CORR + DT_ACT_LOAD +
                         DT_STEVOL + DT_STARDISK + DT_STARDESTR +
                         DT_ACT_REDUCE;

                fprintf(out,"%.8E \t %.6E \t %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f \t %.3f \t %.8E %.8E %.8E \t %.8E %.8E %.8E \n",
                        time_cur, DT_TOT,
                        100.0*DT_ACT_DEF1/DT_TOT, 100.0*DT_ACT_DEF2/DT_TOT, 100.0*DT_ACT_DEF3/DT_TOT, 100.0*DT_ACT_PRED/DT_TOT,
                        100.0*DT_ACT_GRAV/DT_TOT, 100.0*DT_EXT_GRAV/DT_TOT, 100.0*DT_GMC_GRAV/DT_TOT,
                        100.0*DT_GMC_GMC_GRAV/DT_TOT, 100.0*DT_EXT_GMC_GRAV/DT_TOT,
                        100.0*DT_ACT_CORR/DT_TOT, 100.0*DT_ACT_LOAD/DT_TOT,
                        100.0*DT_STEVOL/DT_TOT, 100.0*DT_STARDISK/DT_TOT, 100.0*DT_STARDESTR/DT_TOT,
                        100.0*DT_ACT_REDUCE/DT_TOT,
                        CPU_time_real-CPU_time_real0, CPU_time_user-CPU_time_user0, CPU_time_syst-CPU_time_syst0,
                        timesteps, n_act_sum, 57.0*N*n_act_sum/(CPU_time_user-CPU_time_user0)/1.0E+09);

                fclose(out);
#endif

            } /* if (myRank == rootRank) */

#ifdef ETICS_CEP
            // We are /inside/ a control step, so all particles must be synchronized; we can safely calculate their density centre. The acceleration and jerk currently in the memory are for the predicted position of the CEP, by calling grapite_calc_center we "correct" the position and velocity, but not the gravity at that point.

            // First calculate the DC
            grapite_calc_center(N, m, x, v, xcm, vcm, xdc, vdc);

            // Now copy it to the global particle list
            memcpy(x[grapite_cep_index], xdc, 3*sizeof(double));
            memcpy(v[grapite_cep_index], vdc, 3*sizeof(double));

            // Now copy it to the local particle list for tha appropriate rank
            if (myRank == grapite_cep_index/n_loc) {
                memcpy(x_loc[grapite_cep_index - myRank*n_loc], xdc, 3*sizeof(double));
                memcpy(v_loc[grapite_cep_index - myRank*n_loc], vdc, 3*sizeof(double));
            }
            grapite_update_cep(time_cur, xdc, vdc, a[grapite_cep_index], adot[grapite_cep_index]);
#endif

            t_contr += dt_contr;
        } /* if (time_cur >= t_contr) */

        if (time_cur >= t_disk) {
            if (myRank == rootRank) {
                diskstep++;
                char out_fname[256];
                sprintf(out_fname, "%06d", diskstep);
                if (config->output_hdf5) h5_write(std::string(out_fname) + ".h5", diskstep, N, time_cur, m, x, v, pot, a, adot, config->output_extra_mode, config->output_hdf5_double_precision);
                else ascii_write(std::string(out_fname) + ".dat", diskstep, N, time_cur, m, x, v, config->output_ascii_precision);
            } /* if (myRank == rootRank) */

#ifdef ETICS_DUMP
            sprintf(out_fname, "coeffs.%06d.%02d.dat", diskstep, myRank);
            grapite_dump(out_fname, 2);
#endif

            t_disk += dt_disk;
        } /* if (time_cur >= t_disk) */

    } /* while (time_cur < t_end) */

    /* close the local GRAPEs */

    g6_close(clusterid);

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    MPI_Reduce(&g6_calls, &g6_calls_sum, 1, MPI_DOUBLE, MPI_SUM, rootRank, MPI_COMM_WORLD);

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    if (myRank == rootRank) {

        /* Write some output for the timestep annalize... */

        printf("\n");
        printf("timesteps = %.0f   Total sum of integrated part. = %.0f   g6_calls on all nodes = %.0f \n", timesteps, n_act_sum, g6_calls);
        printf("\n");
        printf("Real Speed = %.3f GFlops \n", 57.0*N*n_act_sum/(CPU_time_user-CPU_time_user0)/1.0E+09);
        fflush(stdout);

    } /* if (myRank == rootRank) */

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    /* Finalize the MPI work */
    MPI_Finalize();

    return 0;
}