phigrape/phigrape.cpp

/*****************************************************************************
File Name      : "phi-GRAPE/GPU.c"              // BH (1 || 2) + ACC + EJECT
                :
Contents       : N-body code with integration by individual block time step
                : together with the parallel using of GRAPE6a board's.
                :
                : Added the GPU support via SAPPORO library.
                :
                : Normalization to the physical units!!!
                :
                : External Potential added
                : Plummer-Kuzmin: Bulge, Disk, Halo
                : Kharchenko+Andreas...
                :
                : SC extra POT for Bek SC test runs...
                :
                : Rebuced to the Single BH -> Plummer
                : Andreas+Fazeel...
                :
                : Stellar evolution added
                : Stellar lifetimes: Raiteri, Villata & Navarro (1996)
                : IMS mass loss: van den Hoeg & Groenewegen (1997)
                :
                : STARDESTR_EXT: Tidal disruption of stars by external BH...
                : Chingis, Denis & Maxim...
                :
                : STARDESTR: Tidal disruption of stars by BH...
                : Jose, Li Shuo & Shiyan Zhong
                :
                : STARDISK: Drag force...
                : Chingis, Denis & Maxim...
                :
                : STARDISK: variable hz = HZ*(R/R_crit) up to R_crit...
                : Taras, Andreas...
                :
                : Live BH (1 || 2) + ACC + EJECT...
                : Li Shuo & Shiyan Zhong
                :
                : dt_min for BH (1 || 2)...
                :
                : added the PN calculus for the BBH
                : PN0, PN1, PN2, PN2.5 (coded on the base of
                : Gabor Kupi original routine)
                :
                : added the "name" array...
                :
                : added the GMC's calculus (GMC on CPU; GMC2 on GPU)
                : for Alexey SC runs... and also for Fazeel Zurich runs...
                :
                : CPU_TIMELIMIT added for the Julich MW cluster runs...
                :
Coded by       : Peter Berczik
Version number : 19.04
Last redaction : 2019.04.16 12:55
*****************************************************************************/
#include "double3.h"
#include "black_holes.h"
#include "config.h"
#include "io.h"
#include "external.h"

Config *config;

#ifdef ETICS
#include "grapite.h"
#endif

#define TIMING

#define ETA_S_CORR   4.0
#define ETA_BH_CORR  4.0

#define DTMAXPOWER  -3.0
#define DTMINPOWER -36.0

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>
#include <sys/resource.h>

#include <algorithm>
#include <numeric>
#include <stdexcept>

#include "grape6.h"

#include <mpi.h>

/* Some "good" functions and constants... */
// TODO replace with inline functions
#define MIN(a,b) (((a)<(b)) ? (a):(b) )
#define SQR(x)   ((x)*(x) )


#define KB         1024
#define MB         (KB*KB)

#define N_MAX      (32*KB)

// double L[3]; // needed in pn_bh_spin.c
// Needed for things related to BHs
#include "debug.h"

double CPU_time_real0, CPU_time_user0, CPU_time_syst0;
double CPU_time_real,  CPU_time_user,  CPU_time_syst;

#ifdef TIMING

double CPU_tmp_real0, CPU_tmp_user0, CPU_tmp_syst0;
double CPU_tmp_real,  CPU_tmp_user,  CPU_tmp_syst;

double DT_TOT,
    DT_ACT_DEF1, DT_ACT_DEF2, DT_ACT_DEF3, DT_ACT_PRED,
    DT_ACT_GRAV, DT_EXT_GRAV,
    DT_GMC_GRAV, DT_GMC_GMC_GRAV, DT_EXT_GMC_GRAV,
    DT_ACT_CORR, DT_ACT_LOAD,
    DT_STEVOL, DT_STARDISK, DT_STARDESTR;

double DT_ACT_REDUCE;

#endif


/* external potential... */

double3 x_bh1, x_bh2, v_bh1, v_bh2;

double pot_bh1, pot_bh2;
double3 a_bh1, adot_bh1, a_bh2, adot_bh2;

//#include "n_bh.c"

//double3 a_pn1[7], adot_pn1[7], a_pn2[7], adot_pn2[7];

//#include "pn_bh_spin.c"

#ifdef ETICS
int grapite_cep_index;
#endif

void get_CPU_time(double *time_real, double *time_user, double *time_syst)
{
    struct rusage xxx;
    double sec_u, microsec_u, sec_s, microsec_s;
    struct timeval tv;

    getrusage(RUSAGE_SELF,&xxx);

    sec_u = xxx.ru_utime.tv_sec;
    sec_s = xxx.ru_stime.tv_sec;

    microsec_u = xxx.ru_utime.tv_usec;
    microsec_s = xxx.ru_stime.tv_usec;

    *time_user = sec_u + microsec_u * 1.0E-06;
    *time_syst = sec_s + microsec_s * 1.0E-06;

    gettimeofday(&tv, NULL);
    *time_real = tv.tv_sec + 1.0E-06 * tv.tv_usec;

    *time_user = *time_real;
}

class Calc_self_grav {
public:
    Calc_self_grav(const int N, const int n_loc, const int clusterid, const int npipe, const double eps)
        : g6_calls(0), n_loc(n_loc), clusterid(clusterid), npipe(npipe), eps2(eps*eps)
    {
        h2.assign(N, eps2);
    }
    void operator()(const double t, const int n_act, int ind_act[], const double3 x_act[], const double3 v_act[],
              double pot[], double3 acc[], double3 jrk[])
    {
        g6_set_ti(clusterid, t);
        for (int i=0; i<n_act; i+=npipe) {
            int nn = npipe;
            if (n_act-i < npipe) nn = n_act - i;
            //TODO any way we can clean up this ugly casting?
            g6calc_firsthalf(clusterid, n_loc, nn, ind_act+i, (double(*)[3])x_act+i, (double(*)[3])v_act+i, (double(*)[3])acc+i, (double(*)[3])jrk+i, pot+i, eps2, h2.data());
            g6calc_lasthalf( clusterid, n_loc, nn, ind_act+i, (double(*)[3])x_act+i, (double(*)[3])v_act+i, eps2, h2.data(), (double(*)[3])acc+i, (double(*)[3])jrk+i, pot+i);
            g6_calls++;
        } /* i */
    }
    double g6_calls;
private:
    int n_loc, clusterid, npipe;
    double eps2;
    std::vector<double> h2;
};

void calc_ext_grav(std::vector<External_gravity*> &external_gravity_components, int n_act, double3 *x_act_new, double3 *v_act_new, double *pot_act_ext, double3 *a_act_new, double3* adot_act_new)
{
#ifdef TIMING
    get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

    /* Define the external potential for all active particles on all nodes */

    std::fill(pot_act_ext, pot_act_ext+n_act, 0.);

    for (auto component : external_gravity_components) {
        if (component->is_active)
            component->apply(n_act, x_act_new, v_act_new, pot_act_ext, a_act_new, adot_act_new);
    }

    /* For simple Plummer potential... */

#ifdef TIMING
    get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
    DT_EXT_GRAV += (CPU_tmp_user - CPU_tmp_user0);
#endif

}

void energy_contr(const double time_cur, const double timesteps, const double n_act_sum, const double g6_calls, double& rcm_sum, double& vcm_sum, double& E_tot_0, double& E_tot_corr_0, double& E_tot_corr_sd_0, int &skip_con, int N, double m[], double3 x[], double3 v[], double pot[], double pot_ext[])
{
    // TODO maybe use static variables here for the previous step energy?
    double E_pot = 0.0;
    for (int i=0; i<N; i++) E_pot += m[i]*pot[i];
    E_pot *= 0.5;

    double E_kin = 0.0;
    for (int i=0; i<N; i++) E_kin += m[i]*( SQR(v[i][0]) + SQR(v[i][1]) + SQR(v[i][2]) );
    E_kin *= 0.5;

    double E_pot_ext = 0.0;

    for (int i=0; i<N; i++) E_pot_ext += m[i]*pot_ext[i];

    if (timesteps == 0.0) { //TODO what is this thing?
        rcm_sum = 0.0;
        vcm_sum = 0.0;
    }

    double mcm     = 0.0;
    double rcm_mod = 0.0;
    double vcm_mod = 0.0;

    double3 xcm = {0, 0, 0};
    double3 vcm = {0, 0, 0};

    for (int i=0; i<N; i++) {
        mcm += m[i];
        xcm += m[i] * x[i];
        vcm += m[i] * v[i];
    } /* i */

    xcm /= mcm;
    vcm /= mcm;

    rcm_mod = xcm.norm();
    vcm_mod = vcm.norm();

    rcm_sum += rcm_mod;
    vcm_sum += vcm_mod;

    double3 mom = {0, 0, 0};

    for (int i=0; i<N; i++) {
        mom[0] += m[i] * ( x[i][1] * v[i][2] - x[i][2] * v[i][1] );
        mom[1] += m[i] * ( x[i][2] * v[i][0] - x[i][0] * v[i][2] );
        mom[2] += m[i] * ( x[i][0] * v[i][1] - x[i][1] * v[i][0] );
    } /* i */

    get_CPU_time(&CPU_time_real, &CPU_time_user, &CPU_time_syst);

    double E_tot = E_pot + E_kin + E_pot_ext;
    double E_tot_corr    = E_tot /* + E_corr */; // TODO what is this?

    if (timesteps == 0.0) {
        /* initialize the E_tot_0 + etc... */

        E_tot_0 = E_tot;
        E_tot_corr_0 = E_tot;
        E_tot_corr_sd_0 = E_tot;

        skip_con = 1;
    } else {
        /* read the E_tot_0 + etc... */
        if (skip_con == 0) {
            auto inp = fopen("contr.con","r");
            double tmp;
            fscanf(inp,"%lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE",
                   &tmp, &tmp, &tmp, &tmp,
                   &tmp, &tmp, &tmp, &tmp, &tmp, &tmp, &tmp,
                   &E_tot_0, &tmp,
                   &E_tot_corr_0, &tmp,
                   &E_tot_corr_sd_0, &tmp,
                   &tmp, &tmp, &tmp, &tmp, &tmp, &tmp, &tmp, &tmp);
            fclose(inp);
            skip_con = 1;
        }
    }
    double DE_tot         = E_tot         - E_tot_0;

    /* This is the only output to screen */
    printf("%.3E %.3E  % .4E %.4E % .4E  % .4E % .4E  %.2E\n",
            time_cur, timesteps,
            E_pot, E_kin, E_pot_ext, E_tot, DE_tot,
            CPU_time_user-CPU_time_user0);

    fflush(stdout);

    auto out = fopen("contr.dat", "a");
    fprintf(out,"%.8E \t %.8E %.8E %.8E \t % .8E % .8E % .8E % .8E % .8E \t % .8E % .8E \t % .8E % .8E % .8E \t %.8E %.8E %.8E \n",
            time_cur, timesteps, n_act_sum, g6_calls,
            E_pot, E_kin, E_pot_ext,
            E_tot, DE_tot,
            rcm_mod, vcm_mod,
            mom[0], mom[1], mom[2],
            CPU_time_real-CPU_time_real0, CPU_time_user-CPU_time_user0, CPU_time_syst-CPU_time_syst0);
    fclose(out);

    out = fopen("contr.con", "w");
    fprintf(out,"%.8E \t %.8E %.8E %.8E \t % .8E % .8E % .8E % .8E % .8E \t % .8E % .8E \t % .8E % .8E % .8E \t %.8E %.8E %.8E \n",
            time_cur, timesteps, n_act_sum, g6_calls,
            E_pot, E_kin, E_pot_ext,
            E_tot, DE_tot,
            rcm_mod, vcm_mod,
            mom[0], mom[1], mom[2],
            CPU_time_real-CPU_time_real0, CPU_time_user-CPU_time_user0, CPU_time_syst-CPU_time_syst0);
    fclose(out);

    E_tot_0 = E_tot;
    E_tot_corr_0 = E_tot_corr;
    E_tot_corr_sd_0 = 0;
}

class Active_search {
    // TODO you can add pointers to t and dt at the constructor, no point giving them at get_minimum_time but without the size.
public:
    Active_search(const int myRank, const int n_proc, const int n_loc, const int N)
        : myRank(myRank), n_proc(n_proc), n_loc(n_loc), N(N)
    {
        ind_act_loc = new int[n_loc];
    }
    ~Active_search() { delete[] ind_act_loc; };
    double get_minimum_time(const double t[], const double dt[])
    {
        double min_t_loc, min_t;
#ifdef ACT_DEF_GRAPITE
        min_t_loc = grapite_get_minimum_time();
#else
        min_t_loc = t[myRank*n_loc]+dt[myRank*n_loc];
        for (int j=myRank*n_loc+1; j<(myRank+1)*n_loc; j++) {
            double tmp = t[j] + dt[j];
            if (tmp < min_t_loc) min_t_loc = tmp;
        }
#endif
        /* Reduce the "global" min_t from min_t_loc "local" on all processors) */
        MPI_Allreduce(&min_t_loc, &min_t, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
        return min_t;
    }
    void get_active_indices(const double min_t, const double t[], const double dt[], int ind_act[], int *n_act)
    {
#ifdef ACT_DEF_GRAPITE
        int n_act_loc;
        grapite_active_search(min_t, ind_act_loc, &n_act_loc);
        if (myRank > 0)
            for (int i=0; i<n_act_loc; i++)
                ind_act_loc[i] += myRank*n_loc;
        int n_act_arr[256], displs[256]; // Assuming maximum of 256 processes... seems safe.
        MPI_Allgather(&n_act_loc, 1, MPI_INT, n_act_arr, 1, MPI_INT, MPI_COMM_WORLD);
        *n_act = n_act_arr[0];
        for (int i=1; i<n_proc; i++)
            *n_act += n_act_arr[i];
        displs[0] = 0;
        for (int i=1; i<n_proc; i++)
            displs[i]=displs[i-1]+n_act_arr[i-1];
        MPI_Allgatherv(ind_act_loc, n_act_loc, MPI_INT, ind_act, n_act_arr, displs, MPI_INT, MPI_COMM_WORLD);
#else
        *n_act = 0;
        for (int i=0; i<N; i++) {
            if (t[i]+dt[i] == min_t) ind_act[(*n_act)++] = i;
        } /* i */
#endif      // ACT_DEF_GRAPITE
    }
private:
    int myRank, n_proc, n_loc, N;
    int *ind_act_loc;
};

class Source_particle_list {
public:
    Source_particle_list(int N)
        : N(N)
    {
        ind  = new int[N];
        m    = new double[N];
        x    = new double3[N];
        v    = new double3[N];
        pot  = new double[N];
        a    = new double3[N];
        adot = new double3[N];
        t    = new double[N];
        dt   = new double[N];
    }
    ~Source_particle_list()
    {
        delete[] ind;
        delete[] m;
        delete[] x;
        delete[] v;
        delete[] pot;
        delete[] a;
        delete[] adot;
        delete[] t;
        delete[] dt;
    }
    int N;
    int *ind;
    double3 *x, *v, *a, *adot;
    double *m, *t, *dt, *pot;
};

inline void calc_high_derivatives(const double dt_tmp, const double3 a_act, const double3 a_act_new, const double3 adot_act, const double3 adot_act_new, double3 *a2, double3 *a3)
{
    double dtinv = 1.0/dt_tmp;
    double dt2inv = dtinv*dtinv;
    double dt3inv = dt2inv*dtinv;

    double3 a0mia1 = a_act-a_act_new;
    double3 ad04plad12 = 4.0*adot_act + 2.0*adot_act_new;
    double3 ad0plad1 = adot_act + adot_act_new;

    *a2 = -6.0*a0mia1*dt2inv - ad04plad12*dtinv;
    *a3 = 12.0*a0mia1*dt3inv + 6.0*ad0plad1*dt2inv;
}

inline void corrector(const double dt_tmp, const double3 a2, const double3 a3, double3* x_act_new, double3* v_act_new)
{
    double dt3over6 = dt_tmp*dt_tmp*dt_tmp/6.0;
    double dt4over24 = dt3over6*dt_tmp/4.0;
    double dt5over120 = dt4over24*dt_tmp/5.0;

    *x_act_new += dt4over24*a2 + dt5over120*a3;
    *v_act_new += dt3over6*a2 + dt4over24*a3;
}

inline double aarseth_step(const double eta, const double dt_tmp, const double3 a_act_new, const double3 adot_act_new, const double3 a2, const double3 a3)
{
    double a1abs = a_act_new.norm();
    double adot1abs = adot_act_new.norm();
    double3 a2dot1 = a2 + dt_tmp*a3;
    double a2dot1abs = a2dot1.norm();
    double a3dot1abs = a3.norm();

    return sqrt(eta*(a1abs*a2dot1abs+adot1abs*adot1abs)/(adot1abs*a3dot1abs+a2dot1abs*a2dot1abs));
}

void binary_smbh_influence_sphere_output(int i_bh1, int i_bh2, int ind_act[], int n_act, double timesteps, double time_cur, double factor, int inf_event[], Source_particle_list *source_particle_list)
{
    //TODO !!IMPORTANT!! only open the file IF THERE IS ANYTHING TO WRITE!!!
    //TODO inf_event to be static or something?
    auto out = fopen("bbh_inf.dat", "a");

    double m_bh1 = source_particle_list->m[0];
    double m_bh2 = source_particle_list->m[1];
    double3 x_bh1 = source_particle_list->x[0];
    double3 x_bh2 = source_particle_list->x[1];
    double3 v_bh1 = source_particle_list->v[0];
    double3 v_bh2 = source_particle_list->v[1];

    double3 x_bbhc = (m_bh1*x_bh1 + m_bh2*x_bh2)/(m_bh1 + m_bh2);
    double3 v_bbhc = (m_bh1*v_bh1 + m_bh2*v_bh2)/(m_bh1 + m_bh2);

    double DR2 = (x_bh1 - x_bh2).norm2();
    double DV2 = (v_bh1 - v_bh2).norm2();
    double EB = -(m_bh1 + m_bh2) / sqrt(DR2) + 0.5 * DV2;
    double SEMI_a = -0.5 * (m_bh1 + m_bh2)/EB;
    double SEMI_a2 = SQR(SEMI_a);

    for (int i=0; i<n_act; i++) {
        int j_act = ind_act[i];
        if (j_act<2) continue;

        double& pot_bh1 = source_particle_list->pot[0];
        double& pot_bh2 = source_particle_list->pot[1];
        double& m_act   = source_particle_list->m[j_act];
        double3& x_act  = source_particle_list->x[j_act];
        double3& v_act  = source_particle_list->v[j_act];
        double& dt_act  = source_particle_list->dt[j_act];
        double& pot_act = source_particle_list->pot[j_act];
        double tmp_r2 = (x_act - x_bbhc).norm2();

        if (tmp_r2 < SEMI_a2*SQR(factor)) {

            if (inf_event[j_act] == 0) {

                fprintf(out,"INF1 %.6E %.16E %07d %07d   %.6E % .6E % .6E % .6E % .6E % .6E % .6E   %.6E % .6E % .6E % .6E % .6E % .6E % .6E % .6E  %.6E % .6E % .6E % .6E % .6E % .6E % .6E % .6E   %.6E   %.6E % .6E % .6E % .6E % .6E % .6E % .6E % .6E  %.6E \n",
                    timesteps, time_cur, i, j_act,
                    sqrt(DR2), x_bbhc[0], x_bbhc[1], x_bbhc[2], v_bbhc[0], v_bbhc[1], v_bbhc[2],
                    m_bh1, x_bh1[0], x_bh1[1], x_bh1[2], v_bh1[0], v_bh1[1], v_bh1[2], pot_bh1,
                    m_bh2, x_bh2[0], x_bh2[1], x_bh2[2], v_bh2[0], v_bh2[1], v_bh2[2], pot_bh2,
                    sqrt(tmp_r2),
                    m_act, x_act[0], x_act[1], x_act[2], v_act[0], v_act[1], v_act[2], pot_act,
                    dt_act);

            inf_event[j_act] = 1;
            }

        } else {
            if (inf_event[j_act] == 1) {
                fprintf(out,"INF2 %.6E %.16E %07d %07d   %.6E % .6E % .6E % .6E % .6E % .6E % .6E   %.6E % .6E % .6E % .6E % .6E % .6E % .6E % .6E  %.6E % .6E % .6E % .6E % .6E % .6E % .6E % .6E   %.6E   %.6E % .6E % .6E % .6E % .6E % .6E % .6E % .6E  %.6E \n",
                    timesteps, time_cur, i, ind_act[i],
                    sqrt(DR2), x_bbhc[0], x_bbhc[1], x_bbhc[2], v_bbhc[0], v_bbhc[1], v_bbhc[2],
                    m_bh1, x_bh1[0], x_bh1[1], x_bh1[2], v_bh1[0], v_bh1[1], v_bh1[2], pot_bh1,
                    m_bh2, x_bh2[0], x_bh2[1], x_bh2[2], v_bh2[0], v_bh2[1], v_bh2[2], pot_bh2,
                    sqrt(tmp_r2),
                    m_act, x_act[0], x_act[1], x_act[2], v_act[0], v_act[1], v_act[2], pot_act,
                    dt_act);
            }

            inf_event[j_act] = 0;

        } /* if (tmp_r2 < DR2*R_INF2) */

    } /* i */
    fclose(out);
}

int main(int argc, char *argv[])
{
    int skip_con=0; // skip_con should just be a static in the energy control function

    double E_tot_0, E_tot_corr_0, E_tot_corr_sd_0,
           rcm_sum=0.0, vcm_sum=0.0,
           timesteps=0.0, n_act_sum=0.0, n_act_distr[N_MAX];

    double3 xcm, vcm, xdc, vdc; // these should go away

    /* data for active particles */
    int    n_act, ind_act[N_MAX];
    double t_act[N_MAX],
           pot_act_new[N_MAX],
           pot_act_tmp[N_MAX];

    // x_act_new and v_act_new arrays hold the predicted position and velocity of i-particles, which is later corrected before moving into the j-particle memory. The _old arrays hold the previous values which are needed to calculate the high derivatives. The _old arrays hold the previous values which are needed to calculate the high derivatives. The [a,adot]_act_tmp arrays hold the calculation results from each node. The [a,adot]_act_new arrays hold the reduced calculation results from all nodes.
    double3 x_act_new[N_MAX], v_act_new[N_MAX],
            a_act_old[N_MAX], adot_act_old[N_MAX],
            a_act_tmp[N_MAX], adot_act_tmp[N_MAX],
            a_act_new[N_MAX], adot_act_new[N_MAX];

    double pot_ext[N_MAX], pot_act_ext[N_MAX];    // for all EXTPOT

    int    i_bh1, i_bh2;

    double m_bh1, m_bh2;

    int    inf_event[N_MAX];
    double3 x_bbhc, v_bbhc;

    int    new_tunit=51, new_xunit=51;

    double3 zeros = {0, 0, 0}; // Dummy; can't really be const because of the GRAPE interface.

    /* INIT the rand() !!! */
    srand(19640916);                 /* it is just my birthday :-) */

    /* Init MPI */
    MPI_Init(&argc, &argv);

    /* Define the total number of processors and the Rank of each processors */
    int n_proc, myRank;
    const int rootRank = 0;
    MPI_Comm_size(MPI_COMM_WORLD, &n_proc);
    MPI_Comm_rank(MPI_COMM_WORLD, &myRank);

    /* Define the processors names */
    int name_proc;
    char processor_name[MPI_MAX_PROCESSOR_NAME];
    MPI_Get_processor_name(processor_name, &name_proc);

    /* Print the Rank and the names of processors */
    printf("Rank of the processor %03d on %s \n", myRank, processor_name);

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    /* read the input parameters */
    config = new Config("phigrape.conf");

    Source_particle_list source_particle_list(N_MAX);

    int     *ind  = source_particle_list.ind;
    double  *m    = source_particle_list.m;
    double3 *x    = source_particle_list.x;
    double3 *v    = source_particle_list.v;
    double  *pot  = source_particle_list.pot;
    double3 *a    = source_particle_list.a;
    double3 *adot = source_particle_list.adot;
    double  *t    = source_particle_list.t;
    double  *dt   = source_particle_list.dt;

    int diskstep, N;
    double time_cur;
    if (is_hdf5(config->input_file_name)) {
#ifndef HAS_HDF5
        fprintf(stderr, "ERROR: input file is in HDF5 format, but the code was compiled without HDF5 support\n");
        return -1;
#endif
        h5_read(config->input_file_name, &diskstep, &N, &time_cur, m, x, v);
    }
    else
        ascii_read(config->input_file_name, &diskstep, &N, &time_cur, m, x, v);
    std::iota(ind, ind+N, 0);

    double eps = config->eps;
    double eta = config->eta;
    double t_end = config->t_end;
    double dt_disk = config->dt_disk;
    double dt_contr = config->dt_contr;
    double dt_bh = config->dt_bh;

    if (myRank == rootRank) {
        if (config->binary_smbh_influence_sphere_output) for (int i=0; i<N; i++) inf_event[i] = 0; // WARNING N wasn't set yet!

        printf("\n");
        printf("Begin the calculation of phi-GRAPE program on %03d processors\n", n_proc);
        printf("\n");
        printf("N       = %07d \t eps      = %.6E \n", N, eps);
        printf("t_beg   = %.6E \t t_end    = %.6E \n", time_cur, t_end);
        printf("dt_disk = %.6E \t dt_contr = %.6E \n", dt_disk, dt_contr);
        printf("dt_bh   = %.6E \n", dt_bh);
        printf("eta     = %.6E \n", eta);
        printf("\n");

        if ((diskstep == 0) && (time_cur == 0)) {
            FILE *out = fopen("contr.dat", "w");
            fclose(out);
#ifdef TIMING
            out = fopen("timing.dat", "w");
            fclose(out);
#endif
            if (config->live_smbh_output && (config->live_smbh_count > 0)) {
                out = fopen("bh.dat", "w");
                fclose(out);
            }
            if ((config->live_smbh_neighbor_output) && (config->live_smbh_count > 0)) {
                out = fopen("bh_neighbors.dat", "w");
                fclose(out);
            }
            if (config->binary_smbh_influence_sphere_output) {
                out = fopen("bbh_inf.dat", "w");
                fclose(out);
            }
        }

        get_CPU_time(&CPU_time_real0, &CPU_time_user0, &CPU_time_syst0);

    } /* if (myRank == rootRank) */

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    double normalization_mass=1, normalization_length=1, normalization_velocity=1;
    if (config->ext_units_physical) {
        normalization_mass = 1/config->unit_mass;
        normalization_length = 1000/config->unit_length;
        normalization_velocity = 1.52484071426404437233e+01*sqrt(config->unit_length/config->unit_mass);
    }
    Plummer ext_bulge(config->ext_m_bulge*normalization_mass, config->ext_b_bulge*normalization_length);
    Miyamoto_Nagai ext_disk(config->ext_m_disk*normalization_mass, config->ext_a_disk*normalization_length, config->ext_b_disk*normalization_length);
    Plummer ext_halo_plummer(config->ext_m_halo_plummer*normalization_mass, config->ext_b_halo_plummer*normalization_length);
    Logarithmic_halo ext_log_halo(config->ext_log_halo_v*normalization_velocity, config->ext_log_halo_r*normalization_length);
    Dehnen ext_dehnen(config->ext_dehnen_m*normalization_mass, config->ext_dehnen_r*normalization_length, config->ext_dehnen_gamma);

    std::vector<External_gravity*> external_gravity_components;
    external_gravity_components.push_back(&ext_bulge);
    external_gravity_components.push_back(&ext_disk);
    external_gravity_components.push_back(&ext_halo_plummer);
    external_gravity_components.push_back(&ext_log_halo);
    external_gravity_components.push_back(&ext_dehnen);
    bool has_external_gravity = false;
    for (auto component : external_gravity_components) {
        if (component->is_active) {
            has_external_gravity = true;
            break;
        }
    }

    if (has_external_gravity) printf("External Potential: \n\n");
    if (ext_bulge.is_active)        printf("m_bulge = %.4E                        b_bulge = %.4E\n", config->ext_m_bulge*normalization_mass, config->ext_b_bulge*normalization_length);
    if (ext_disk.is_active)         printf("m_disk  = %.4E  a_disk  = %.4E  b_disk  = %.4E\n", config->ext_m_disk*normalization_mass, config->ext_a_disk*normalization_length, config->ext_b_disk*normalization_length);
    if (ext_halo_plummer.is_active) printf("m_halo  = %.4E                        b_halo = %.4E\n", config->ext_m_halo_plummer*normalization_mass, config->ext_b_halo_plummer*normalization_length);
    if (ext_log_halo.is_active)     printf("v_halo = %.6E r_halo = %.6E \n", config->ext_log_halo_v*normalization_velocity, config->ext_log_halo_r*normalization_length);
    if (ext_dehnen.is_active)       printf("m_ext  = %.6E r_ext  = %.6E \t g_ext = %.3E \n", config->ext_dehnen_m*normalization_mass, config->ext_dehnen_r*normalization_length, config->ext_dehnen_gamma);
    printf("\n");
    fflush(stdout);

    double eta_bh = eta/ETA_BH_CORR;

    const double dt_min = pow(2, DTMINPOWER);
    const double dt_max = std::min({dt_disk, dt_contr, pow(2, DTMAXPOWER)});

    double t_disk  = dt_disk*(1+floor(time_cur/dt_disk));
    double t_contr = dt_contr*(1+floor(time_cur/dt_contr));
    double t_bh    = dt_bh*(1+floor(time_cur/dt_bh));

    if (myRank == rootRank) {
        printf("t_disk = %.6E   t_contr = %.6E   t_bh = %.6E \n", t_disk, t_contr, t_bh);
        printf("\n");
        fflush(stdout);
    } /* if (myRank == rootRank) */

    std::fill(t,  t+N,  time_cur);
    std::fill(dt, dt+N, dt_min);

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);


    /* some local settings for G6a boards */
    int clusterid, numGPU;
    if (config->devices_per_node==0) {
        MPI_Comm shmcomm;
        MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm);
        MPI_Comm_size(shmcomm, &numGPU);
        MPI_Comm_rank(shmcomm, &clusterid);
    } else {
        numGPU = config->devices_per_node;
        clusterid = myRank % numGPU;
    }
    printf("Rank of the processor %03d : Number of GPUs %01d : Cluster ID %01d \n", myRank, numGPU, clusterid);
    fflush(stdout);

    /* init the local GRAPEs */
    g6_open(clusterid);
    int npipe = g6_npipes();
    g6_set_tunit(new_tunit);
    g6_set_xunit(new_xunit);

    int n_loc = N/n_proc;
    Calc_self_grav calc_self_grav(N, n_loc, clusterid, npipe, eps);
    Active_search active_search(myRank, n_proc, n_loc, N);
    Black_hole_physics black_hole_physics;
    if (config->live_smbh_count == 1)
        black_hole_physics = Black_hole_physics(m[0], 0, myRank, rootRank);
    else if (config->live_smbh_count == 2)
        black_hole_physics = Black_hole_physics(m[0], m[1], myRank, rootRank);
    if (config->binary_smbh_pn) {
        black_hole_physics.set_post_newtonian(config->pn_c, config->pn_usage.data());
        if (config->pn_usage[6]) black_hole_physics.set_spins(config->smbh1_spin.data(), config->smbh2_spin.data());
    }
    black_hole_physics.set_softening(eps, config->live_smbh_custom_eps);

#ifdef ETICS
    grapite_read_particle_tags(N, config->grapite_mask_file_name.c_str(), myRank, n_loc);
    grapite_set_dt_exp(config->dt_scf);
    grapite_set_t_exp(time_cur);
#endif

    /* load the nj particles to the G6 */

    for (int k=0; k<n_loc; k++) {
        int j = k + myRank*n_loc;
        g6_set_j_particle(clusterid, k, ind[j], t[j], dt[j], m[j], zeros, zeros, zeros, v[j], x[j]);
    } /* k */

#ifdef ETICS
    double etics_length_scale;
    if (myRank == rootRank) etics_length_scale = grapite_get_length_scale(N, m, (double(*)[3])x, (double(*)[3])v); // We don't want all ranks to do it, because they need to write a file and might confuse each other
    MPI_Bcast(&etics_length_scale, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    grapite_set_length_scale(etics_length_scale);

    grapite_cep_index = grapite_get_cep_index();
    if (grapite_cep_index >= 0) {
        grapite_calc_center(N, m, (double(*)[3])x, (double(*)[3])v, xcm, vcm, xdc, vdc);
        x[grapite_cep_index] = xdc;
        v[grapite_cep_index] = vdc;
        grapite_update_cep(time_cur, xdc, vdc, zeros, zeros);
    }
#endif

    /* define the all particles as a active on all the processors for the first time grav calc. */

    calc_self_grav(time_cur, N, ind, x, v, pot_act_tmp, a_act_tmp, adot_act_tmp);

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    /* Reduce the "global" vectors from "local" on all processors) */
    MPI_Allreduce(pot_act_tmp,  pot,    N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    MPI_Allreduce(a_act_tmp,    a,    3*N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    MPI_Allreduce(adot_act_tmp, adot, 3*N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    if (config->live_smbh_count == 2) {
        black_hole_physics.set_xv(x[0], x[1], v[0], v[1]);
        if (config->live_smbh_custom_eps >= 0) black_hole_physics.adjust_softening(pot[0], pot[1], a[0], a[1], adot[0], adot[1]);
        if (config->binary_smbh_pn) black_hole_physics.adjust_post_newtonian(dt[0], a[0], a[1], adot[0], adot[1]);
    }

    calc_ext_grav(external_gravity_components, N, x, v, pot_ext, a, adot);

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);


    /* Energy control... */
    if (myRank == rootRank) {
        energy_contr(time_cur, timesteps, n_act_sum, calc_self_grav.g6_calls, rcm_sum, vcm_sum, E_tot_0, E_tot_corr_0, E_tot_corr_sd_0, skip_con, N, m, x, v, pot, pot_ext);
    } /* if (myRank == rootRank) */

#ifdef ETICS
    if (config->etics_dump_coeffs && (diskstep==0)) {
        char out_fname[256];
        sprintf(out_fname, "coeffs.%06d.%02d.dat", 0, myRank);
        grapite_dump(out_fname, 2);
    }

    if (grapite_cep_index >= 0) {
        grapite_calc_center(N, m, (double(*)[3])x, (double(*)[3])v, xcm, vcm, xdc, vdc);
        x[grapite_cep_index] = xdc;
        v[grapite_cep_index] = vdc;
        grapite_update_cep(time_cur, xdc, vdc, a[grapite_cep_index], adot[grapite_cep_index]);
    }
#endif

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    /* Define initial timestep for all particles on all nodes */

    for (int j=0; j<N; j++) {
        double a2_mod = a[j].norm2();
        double adot2_mod = adot[j].norm2();

        double dt_tmp, eta_s = eta/ETA_S_CORR;
        if (adot2_mod==0) dt_tmp = eta_s; // That's weird, when will we have such a case?
        else              dt_tmp = eta_s*sqrt(a2_mod/adot2_mod);

        int power = log(dt_tmp)/log(2.0) - 1;

        dt_tmp = pow(2.0, (double)power);

        if (dt_tmp < dt_min) dt_tmp = dt_min;
        if (dt_tmp > dt_max) dt_tmp = dt_max;

        dt[j] = dt_tmp;

        if (config->dt_min_warning && (myRank == 0)) {
            if (dt[j] == dt_min) {
                printf("!!! Warning0: dt = dt_min = %.6E \t ind = %07d \n", dt[j], ind[j]);
                fflush(stdout);
            }
        }
    } /* j */

    if (config->live_smbh_count > 0) {
        double min_dt = *std::min_element(dt, dt+N);
        for (int i=0; i<config->live_smbh_count; i++) dt[i] = min_dt;
    }

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    /* load the new values for particles to the local GRAPE's */
    /* load the nj particles to the G6 */

    for (int k=0; k<n_loc; k++) {
        int j = k + myRank*n_loc;
        g6_set_j_particle(clusterid, k, ind[j], t[j], dt[j], m[j], zeros, adot[j]*(1./6.), a[j]*0.5, v[j], x[j]);
    } /* k */

    if (myRank == rootRank) {

        /* Write BH data... */
        if (config->live_smbh_output) black_hole_physics.write_bh_data(time_cur, m, x, v, pot, a, adot, dt);

        /* Write BH NB data... */
        if (config->live_smbh_neighbor_output) write_bh_nb_data(config->live_smbh_neighbor_number, config->live_smbh_count, time_cur, N, m, x, v);

    } /* if (myRank == rootRank) */

    /* Get the Starting time on rootRank */

    if (myRank == rootRank) {
        get_CPU_time(&CPU_time_real0, &CPU_time_user0, &CPU_time_syst0);
        get_CPU_time(&CPU_time_real,  &CPU_time_user,  &CPU_time_syst);
    } /* if (myRank == rootRank) */

    timesteps = 0.0; // Why won't those two be long long instead of double + should include the zeroth step
    n_act_sum = 0.0;

    for (int i=1; i<N+1; i++) { // TODO why like this?
        n_act_distr[i-1] = 0.0;
    }

#ifdef TIMING
    DT_TOT      = 0.0;

    DT_ACT_DEF1 = 0.0;
    DT_ACT_DEF2 = 0.0;
    DT_ACT_DEF3 = 0.0;
    DT_ACT_PRED = 0.0;
    DT_ACT_GRAV = 0.0;
    DT_EXT_GRAV = 0.0;
    DT_EXT_GMC_GRAV = 0.0;
    DT_GMC_GMC_GRAV = 0.0;
    DT_ACT_CORR = 0.0;
    DT_ACT_LOAD = 0.0;

    DT_STEVOL    = 0.0;
    DT_STARDISK  = 0.0;
    DT_STARDESTR = 0.0;

    DT_ACT_REDUCE = 0.0;
#endif

    /* The main integration loop */

    while (time_cur <= t_end) {

        /* Define the minimal time and the active particles on all the nodes (exclude the ZERO masses!!!) */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

        double min_t = active_search.get_minimum_time(t, dt);

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
        DT_ACT_DEF1 += (CPU_tmp_user - CPU_tmp_user0);
#endif

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

        active_search.get_active_indices(min_t, t, dt, ind_act, &n_act);

// TODO deal with it below
#ifdef ACT_DEF_GRAPITE
#error please fix here
#endif
#if defined(ACT_DEF_GRAPITE) && (defined(ADD_BH1) || defined(ADD_BH2))
#ifdef ADD_BH1
#define ACT_DEF_GRAPITE_NUMBH 1
#else
#define ACT_DEF_GRAPITE_NUMBH 2
#endif
        int acf_def_grapite_bh_count = 0;
        for (i=0; i<n_act; i++) {
            if (ind_act[i]==0) {
                i_bh1 = i;
                acf_def_grapite_bh_count++;
        }
#ifdef ADD_BH2
            else if (ind_act[i]==1) {
                i_bh2 = i;
                acf_def_grapite_bh_count++;
            }
#endif
            if (acf_def_grapite_bh_count==ACT_DEF_GRAPITE_NUMBH) break;
        }
        if (i==n_act) {
            fprintf(stderr, "ERROR: black holes were not found in the active particle list");
            exit(1);
        }
#else
        if (config->live_smbh_count > 0) {
            i_bh1 = 0;
            i_bh2 = 1;
        }
#endif

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
        DT_ACT_DEF2 += (CPU_tmp_user - CPU_tmp_user0);
#endif

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif
        for (int i=0; i<n_act; i++) {
            int j_act = ind_act[i];
            // NOTICE much of these are just not needed!
            // m_act[i] = m[j_act];
            // x_act[i] = x[j_act];
            // v_act[i] = v[j_act];
            t_act[i]  = t[j_act];
            //dt_act[i] = dt[j_act];
            // NOTICE Why do we need pot_act and pot_act_ext? Probably redundant.
            //pot_act[i] = pot[j_act];
            pot_act_ext[i] = pot_ext[j_act]; // TODO almost certainly redundant
            a_act_old[i] = a[j_act];
            adot_act_old[i] = adot[j_act];
        } /* i */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
        DT_ACT_DEF3 += (CPU_tmp_user - CPU_tmp_user0);
#endif

        /* predict the active particles positions etc... on all the nodes */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

        for (int i=0; i<n_act; i++) {
            int j_act = ind_act[i];
            double dt = min_t - t[j_act];
            double dt2half = 0.5*dt*dt;
            double dt3over6 = (1./3.)*dt*dt2half;
            x_act_new[i] = x[j_act] + v[j_act]*dt + a[j_act]*dt2half + adot[j_act]*dt3over6;
            v_act_new[i] = v[j_act] + a[j_act]*dt + adot[j_act]*dt2half;
        } /* i */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
        DT_ACT_PRED += (CPU_tmp_user - CPU_tmp_user0);
#endif

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif
        calc_self_grav(min_t, n_act, ind_act, x_act_new, v_act_new,
                               pot_act_tmp, a_act_tmp, adot_act_tmp);

#ifdef TIMING
    get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
    DT_ACT_GRAV += (CPU_tmp_user - CPU_tmp_user0);
#endif

        /* Reduce the "global" vectors from "local" on all the nodes */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

        MPI_Allreduce(pot_act_tmp,  pot_act_new,    n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
        MPI_Allreduce(a_act_tmp,    a_act_new,    3*n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
        MPI_Allreduce(adot_act_tmp, adot_act_new, 3*n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

        /* Wait to all processors to finish his works... */
        MPI_Barrier(MPI_COMM_WORLD);

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
        DT_ACT_REDUCE += (CPU_tmp_user - CPU_tmp_user0);
#endif

        if (config->live_smbh_count == 2) {
            black_hole_physics.set_xv(x_act_new[i_bh1], x_act_new[i_bh2], v_act_new[i_bh1], v_act_new[i_bh2]);
            if (config->live_smbh_custom_eps >= 0) black_hole_physics.adjust_softening(pot_act_new[i_bh1], pot_act_new[i_bh2], a_act_new[i_bh1], a_act_new[i_bh2], adot_act_new[i_bh1], adot_act_new[i_bh2]);
            if (config->binary_smbh_pn) black_hole_physics.adjust_post_newtonian(dt[i_bh1], a_act_new[i_bh1], a_act_new[i_bh2], adot_act_new[i_bh1], adot_act_new[i_bh2]);
        }

        calc_ext_grav(external_gravity_components, n_act, x_act_new, v_act_new, pot_act_ext, a_act_new, adot_act_new);

        /* correct the active particles positions etc... on all the nodes */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

        double min_dt = dt_max;
        for (int i=0; i<n_act; i++) {
            // NOTICE looks like we're doing three unrelated things in this loop: (1) correcting positions and velocities (2) calculating new steps, and (3) putting the corrected values from the _act_new back in the _act arrays.
            // After going back to the _act arrays they don't do much before they go back to the main arrays, so this copy seems redundant (the SMBH influence sphere printout needs these values but it should be a function anyway).
            // TODO split this loop into the three tasks it is doing, and remove the redundancy.
            double dt_tmp = min_t - t_act[i];

            double3 a2, a3;
            calc_high_derivatives(dt_tmp, a_act_old[i], a_act_new[i], adot_act_old[i], adot_act_new[i], &a2, &a3);

            corrector(dt_tmp, a2, a3, &x_act_new[i], &v_act_new[i]);

            //TODO make beautiful
            double eta_curr;
            if ((config->live_smbh_count > 0) && (ind_act[i] < config->live_smbh_count)) eta_curr = eta_bh;
            else eta_curr = eta;

            double dt_new = aarseth_step(eta_curr, dt_tmp, a_act_new[i], adot_act_new[i], a2, a3);

            //TODO the below should be moved to a function
            if (dt_new < dt_min) dt_tmp = dt_min;
            if ((dt_new < dt_tmp) && (dt_new > dt_min)) {
                int power = log(dt_new)/log(2.0) - 1;
                dt_tmp = pow(2.0, (double)power); // TODO why is this casting needed here?
            }

            if ((dt_new > 2*dt_tmp) && (fmod(min_t, 2*dt_tmp) == 0) && (2*dt_tmp <= dt_max)) {
                dt_tmp *= 2;
            }

            if (config->dt_min_warning && (myRank == 0)) {
                if (dt_tmp == dt_min) {
                    printf("!!! Warning1: dt_act = dt_min = %.6E \t ind_act = %07d \n", dt_tmp, ind_act[i]);
                    fflush(stdout);
                }
            }
            if (dt_tmp < min_dt) min_dt = dt_tmp;

            int j_act = ind_act[i];
            x[j_act] = x_act_new[i];
            v[j_act] = v_act_new[i];
            t[j_act] = min_t;
            dt[j_act] = dt_tmp;
            pot[j_act] = pot_act_new[i];
            pot_ext[j_act] = pot_act_ext[i]; // ???
            a[j_act] = a_act_new[i];
            adot[j_act] = adot_act_new[i];
        } /* i */

        /* define the min. dt over all the act. part. and set it also for the BH... */

        if (config->live_smbh_count > 0) {
            if (config->live_smbh_count>=1) dt[0] = min_dt;
            if (config->live_smbh_count==2) dt[1] = min_dt;

        }

        if (config->binary_smbh_influence_sphere_output && (myRank == rootRank)) {
            //TODO clean up this mass. We don't want to have all these _act arrays; they are only needed for this lame function.
            binary_smbh_influence_sphere_output(i_bh1, i_bh2, ind_act, n_act, timesteps, time_cur, config->binary_smbh_influence_radius_factor, inf_event, &source_particle_list);
        }

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
        DT_ACT_CORR += (CPU_tmp_user - CPU_tmp_user0);
#endif

        /* load the new values for active particles to the local GRAPE's */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

        for (int i=0; i<n_act; i++) {
#ifdef ETICS
            if (ind_act[i] == grapite_cep_index) grapite_update_cep(t[grapite_cep_index], x[grapite_cep_index], v[grapite_cep_index], a[grapite_cep_index], adot[grapite_cep_index]); // All ranks should do it.
#endif
            int cur_rank = ind_act[i]/n_loc;

            if (myRank == cur_rank) {
                int j_act = ind_act[i];
                int address = ind_act[i] - myRank*n_loc;
                g6_set_j_particle(clusterid, address, ind_act[i], t[j_act], dt[j_act], m[j_act], zeros, adot[j_act]*(1./6.), a[j_act]*0.5, v[j_act], x[j_act]);

            } /* if (myRank == cur_rank) */

        } /* i */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
        DT_ACT_LOAD += (CPU_tmp_user - CPU_tmp_user0);
#endif

        /* Current time set to min_t */

        time_cur = min_t;

        timesteps += 1.0;
        n_act_sum += n_act;
        n_act_distr[n_act-1]++;

        if (time_cur >= t_bh) {
            if (myRank == rootRank) {
                /* Write BH data... */
                if (config->live_smbh_output) black_hole_physics.write_bh_data(time_cur, m, x, v, pot, a, adot, dt);

                /* Write BH NB data... */
                if (config->live_smbh_neighbor_output) write_bh_nb_data(config->live_smbh_neighbor_number, config->live_smbh_count, time_cur, N, m, x, v);

            } /* if (myRank == rootRank) */

            t_bh += dt_bh;
        } /* if (time_cur >= t_bh) */

        if (time_cur >= t_contr) {
            if (myRank == rootRank) {

                energy_contr(time_cur, timesteps, n_act_sum, calc_self_grav.g6_calls, rcm_sum, vcm_sum, E_tot_0, E_tot_corr_0, E_tot_corr_sd_0, skip_con, N, m, x, v, pot, pot_ext);

                /* write cont data */

                if (config->output_hdf5) h5_write("data.con", diskstep, N, time_cur, m, x, v, pot, a, adot, 0, true);
                else ascii_write("data.con", diskstep, N, time_cur, m, x, v, 16);

                /* possible OUT for timing !!! */

#ifdef TIMING
                FILE *out = fopen("timing.dat", "a");

                DT_TOT = DT_ACT_DEF1 + DT_ACT_DEF2 + DT_ACT_DEF3 + DT_ACT_PRED +
                         DT_ACT_GRAV + DT_EXT_GRAV + DT_GMC_GRAV +
                         DT_GMC_GMC_GRAV + DT_EXT_GMC_GRAV +
                         DT_ACT_CORR + DT_ACT_LOAD +
                         DT_STEVOL + DT_STARDISK + DT_STARDESTR +
                         DT_ACT_REDUCE;

                fprintf(out,"%.8E \t %.6E \t %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f \t %.3f \t %.8E %.8E %.8E \t %.8E %.8E %.8E \n",
                        time_cur, DT_TOT,
                        100.0*DT_ACT_DEF1/DT_TOT, 100.0*DT_ACT_DEF2/DT_TOT, 100.0*DT_ACT_DEF3/DT_TOT, 100.0*DT_ACT_PRED/DT_TOT,
                        100.0*DT_ACT_GRAV/DT_TOT, 100.0*DT_EXT_GRAV/DT_TOT, 100.0*DT_GMC_GRAV/DT_TOT,
                        100.0*DT_GMC_GMC_GRAV/DT_TOT, 100.0*DT_EXT_GMC_GRAV/DT_TOT,
                        100.0*DT_ACT_CORR/DT_TOT, 100.0*DT_ACT_LOAD/DT_TOT,
                        100.0*DT_STEVOL/DT_TOT, 100.0*DT_STARDISK/DT_TOT, 100.0*DT_STARDESTR/DT_TOT,
                        100.0*DT_ACT_REDUCE/DT_TOT,
                        CPU_time_real-CPU_time_real0, CPU_time_user-CPU_time_user0, CPU_time_syst-CPU_time_syst0,
                        timesteps, n_act_sum, 57.0*N*n_act_sum/(CPU_time_user-CPU_time_user0)/1.0E+09);

                fclose(out);
#endif

            } /* if (myRank == rootRank) */

#ifdef ETICS
            // We are /inside/ a control step, so all particles must be synchronized; we can safely calculate their density centre. The acceleration and jerk currently in the memory are for the predicted position of the CEP, by calling grapite_calc_center we "correct" the position and velocity, but not the gravity at that point.
            if (grapite_cep_index >= 0) {
                grapite_calc_center(N, m, (double(*)[3])x, (double(*)[3])v, xcm, vcm, xdc, vdc);
                x[grapite_cep_index] = xdc;
                v[grapite_cep_index] = vdc;
                grapite_update_cep(time_cur, xdc, vdc, a[grapite_cep_index], adot[grapite_cep_index]);
            }
#endif

            t_contr += dt_contr;
        } /* if (time_cur >= t_contr) */

        if (time_cur >= t_disk) {
            char out_fname[256];
            diskstep++;
            if (myRank == rootRank) {
                sprintf(out_fname, "%06d", diskstep);
                if (config->output_hdf5) h5_write(std::string(out_fname) + ".h5", diskstep, N, time_cur, m, x, v, pot, a, adot, config->output_extra_mode, config->output_hdf5_double_precision);
                else ascii_write(std::string(out_fname) + ".dat", diskstep, N, time_cur, m, x, v, config->output_ascii_precision);
            } /* if (myRank == rootRank) */

#ifdef ETICS
            if (config->etics_dump_coeffs) {
                sprintf(out_fname, "coeffs.%06d.%02d.dat", diskstep, myRank);
                grapite_dump(out_fname, 2);
            }
#endif
            t_disk += dt_disk;
        } /* if (time_cur >= t_disk) */
    } /* while (time_cur < t_end) */

    /* close the local GRAPEs */
    g6_close(clusterid);

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    double g6_calls_sum;
    MPI_Reduce(&calc_self_grav.g6_calls, &g6_calls_sum, 1, MPI_DOUBLE, MPI_SUM, rootRank, MPI_COMM_WORLD);

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    if (myRank == rootRank) {

        /* Write some output for the timestep annalize... */

        printf("\n");
        printf("timesteps = %.0f   Total sum of integrated part. = %.0f   g6_calls on all nodes = %.0f \n", timesteps, n_act_sum, g6_calls_sum);
        printf("\n");
        printf("Real Speed = %.3f GFlops \n", 57.0*N*n_act_sum/(CPU_time_user-CPU_time_user0)/1.0E+09);
        fflush(stdout);

    } /* if (myRank == rootRank) */

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    /* Finalize the MPI work */
    MPI_Finalize();

    return 0;
}