/*****************************************************************************
File Name      : "phi-GRAPE/GPU.c"              // BH (1 || 2) + ACC + EJECT
                :
Contents       : N-body code with integration by individual block time step
                : together with the parallel using of GRAPE6a board's.
                :
                : Added the GPU support via SAPPORO library.
                :
                : Normalization to the physical units!!!
                :
                : External Potential added
                : Plummer-Kuzmin: Bulge, Disk, Halo
                : Kharchenko+Andreas...
                :
                : SC extra POT for Bek SC test runs...
                :
                : Rebuced to the Single BH -> Plummer
                : Andreas+Fazeel...
                :
                : Stellar evolution added
                : Stellar lifetimes: Raiteri, Villata & Navarro (1996)
                : IMS mass loss: van den Hoeg & Groenewegen (1997)
                :
                : STARDESTR_EXT: Tidal disruption of stars by external BH...
                : Chingis, Denis & Maxim...
                :
                : STARDESTR: Tidal disruption of stars by BH...
                : Jose, Li Shuo & Shiyan Zhong
                :
                : STARDISK: Drag force...
                : Chingis, Denis & Maxim...
                :
                : STARDISK: variable hz = HZ*(R/R_crit) up to R_crit...
                : Taras, Andreas...
                :
                : Live BH (1 || 2) + ACC + EJECT...
                : Li Shuo & Shiyan Zhong
                :
                : dt_min for BH (1 || 2)...
                :
                : added the PN calculus for the BBH
                : PN0, PN1, PN2, PN2.5 (coded on the base of
                : Gabor Kupi original routine)
                :
                : added the "name" array...
                :
                : added the GMC's calculus (GMC on CPU; GMC2 on GPU)
                : for Alexey SC runs... and also for Fazeel Zurich runs...
                :
                : CPU_TIMELIMIT added for the Julich MW cluster runs...
                :
Coded by       : Peter Berczik
Version number : 19.04
Last redaction : 2019.04.16 12:55
*****************************************************************************/
#include "phigrape.h"
Parameters parameters;

// struct Parameters {
//     double eps = 1E-4;
//     double t_end = 1;
//     double dt_disk = 0.125;
//     double dt_contr = 0.125;
//     double dt_bh = 0.125;
//     double eta = 0.01;
//     std::string input_file_name = "data.con";
//     bool   dt_min_warning = true; // DT_MIN_WARNING
//     int    live_smbh_count = 2; // ADD_BH1 or ADD_BH2
//     double live_smbh_custom_eps = 0; // ADD_N_BH
//     bool   live_smbh_output = true; // BH_OUT
//     bool   live_smbh_neighbor_output = true; // BH_OUT_NB // NOTE needs `live_smbh_neighbor_number`
//     int    live_smbh_neighbor_number = 10; // TODO make sure it's smaller than N? or just warn if it's not
//     bool   binary_smbh_pn = true; // ADD_PN_BH
//     bool   binary_smbh_influence_sphere_output = true; // BBH_INF // NOTE needs `binary_smbh_influence_radius_factor`
//     double binary_smbh_influence_radius_factor = 3.162277660168379497918067e+03; // R_INF
// } parameters;

//#define NORM            // Physical normalization

// // // //#define ADD_BH1        // add the Single BH

// // // // #define ADD_BH2            // add the Binary BH's
// // // // #define ADD_N_BH        // eps_BH = 0.0, but added only the Newtonian forces
// // // // #define ADD_PN_BH        // extra - added also the Post-Newton forces
// // // // #define ADD_SPIN_BH        // extra - added the SPIN for the BH's - DEFAULT !!!

// // // // #define BH_OUT           // extra output for BH's (live)
// // // // #define BH_OUT_NB        // extra output for the BH's neighbours (live)

// // // // #define BBH_INF            // BBH influence sphere...
// // // // #define R_INF  3.162277660168379497918067e+03        // Factor for the influence sphere... if ( R < R_INF * DR_BBH )
// // // // #define R_INF2 (R_INF*R_INF)

// // // // #define DT_MIN_WARNING         // dt < dt_min warning !!!

//#define BH_OUT_NB_EXT        // extra output for the BH's neighbours (external)

//#define STARDESTR        // disruption of stars by BH tidal forces

//#define STARDESTR_EXT        // disruption of stars by external BH tidal forces

//#define EXTPOT            // external potential (BH? or galactic?)
//#define EXTPOT_BH        // BH - usually NB units

//#define EXTPOT_GAL        // Galactic B+D+H PK - usually physical units

//#define EXTPOT_GAL_DEH      // Dehnen Galactic - usually physical units
//#define EXTPOT_GAL_LOG      // Log Galactic - usually physical units

//#define EXTPOT_SC        // SC extra POT for Bek test runs...
//#define DATAFILE        eff0.05.tsf10.00.tab
//#define M_SC_DIM        100001

//#define CMCORR        // CM correction in the zero step and in every dt_contr

//#define LIMITS        // for "0" mass particles !!!
//#define R_LIMITS        1.0E+03        // for "0" mass particles !!!

//#define LIMITS_NEW        // for "0" mass particles !!!

//#define LIMITS_ALL_BEG    // for ALL particles at the beginning...
//#define LIMITS_ALL        // for ALL particles
//#define R_LIMITS_ALL        1.0E+03    // for ALL particles

#ifdef ETICS
#include "grapite.h"
// why do we need CEP as a compilaion flag... just have it always on when ETICS is on. IF there is no CEP, there should be a graceful skipping of those operations.
//#define ETICS_CEP
#ifndef ETICS_DTSCF
#error "ETICS_DTSCF must be defined"
#endif
#endif

#define TIMING

#define ETA_S_CORR   4.0
#define ETA_BH_CORR  4.0

#define DTMAXPOWER  -3.0
#define DTMINPOWER -36.0

/*
-3.0     0.125
-4.0   0.0625
-5.0   0.03125
-7.0     ~1e-2
-10.0    ~1e-3
.............
-20.0     ~1e-6
-23.0    ~1e-7
-26.0    ~1e-8
-30.0    ~1e-9
*/

//#define ACT_DEF_LL

#if defined(ACT_DEF_LL) && defined(ACT_DEF_GRAPITE)
#error "Contradicting preprocessor flags!"
#endif

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>
#include <sys/resource.h>

#include <algorithm>
#include <numeric>

#define G6_NPIPE 2048
#include "grape6.h"

#include <mpi.h>

/* Some "good" functions and constants... */
#define SIG(x)   (((x)<0) ? (-1):(1) )
#define ABS(x)   (((x)<0) ? (-x):(x) )
#define MAX(a,b) (((a)>(b)) ? (a):(b) )
#define MIN(a,b) (((a)<(b)) ? (a):(b) )
#define SQR(x)   ((x)*(x) )
#define POW3(x)  ((x)*SQR(x) )

#define Pi          3.14159265358979323846
#define TWOPi       6.283185307179
#define sqrt_TWOPi  2.506628274631

#ifdef NORM
//http://pdg.lbl.gov/2015/reviews/rpp2015-rev-astrophysical-constants.pdf

#define G      6.67388E-11        // (m/s^2) * (m^2/kg)
#define Msol   1.988489E+30        // kg
#define Rsol   6.957E+08        // m
#define AU     149597870700.0        // m
#define pc     3.08567758149E+16    // m
#define Year   31556925.2        // s
#define c_feny 299792458.0        // m/s

#define kpc    (1.0E+03*pc)        // m
#define km     1.0E+03            // km   -> m
#define cm3    1.0E-06            // cm^3 -> m^3
#define Myr    (1.0E+06*Year)        // s
#define Gyr    (1.0E+09*Year)        // s
#define R_gas  8.31447215        // J/(K*mol)
#define k_gas  1.380650424E-23        // J/K
#define N_A    6.022141510E+23      // 1/mol
#define mu     1.6605388628E-27        // kg
#define mp     1.67262163783E-27     // kg
#define me     9.1093821545E-31     // kg

#define pc2    (pc*pc)
#define pc3    (pc*pc*pc)
#define kpc2   (kpc*kpc)
#define kpc3   (kpc*kpc*kpc)
#endif

/*
    1KB =    1024
    2KB =    2048
    4KB =    4096
    8KB =    8192
16KB =   16384
32KB =   32768
64KB =   65536
128KB =  131072
256KB =  262144
512KB =  524288
1024KB = 1048576   ->   1MB
*/

#define KB         1024
#define MB         (KB*KB)

#define N_MAX      (6*MB)
#define N_MAX_loc  (2*MB)

struct double3 {
    double data[3];
    double3() {}
    double3(const double x, const double y, const double z)
    {
        data[0] = x;
        data[1] = y;
        data[2] = z;
    }
    double& operator[](int i) {return data[i];}
    double3& operator=(const double3& a)
    {
        data[0] = a.data[0];
        data[1] = a.data[1];
        data[2] = a.data[2];
        return *this;
    }
    double3& operator+=(const double3& a)
    {
        data[0] += a.data[0];
        data[1] += a.data[1];
        data[2] += a.data[2];
        return *this;
    }
    double3& operator-=(const double3& a)
    {
        data[0] -= a.data[0];
        data[1] -= a.data[1];
        data[2] -= a.data[2];
        return *this;
    }
    double3& operator/=(const double& c)
    {
        data[0] /= c;
        data[1] /= c;
        data[2] /= c;
        return *this;
    }
    double norm()
    {
        return sqrt(data[0]*data[0]+data[1]*data[1]+data[2]*data[2]);
    }
    operator double*() {return data;}
};

double3 operator*(const double& c, const double3& a)
{
    return double3(a.data[0]*c, a.data[1]*c, a.data[2]*c);
}

double3 operator*(const double3& a, const double& c)
{
    return double3(a.data[0]*c, a.data[1]*c, a.data[2]*c);
}

double3 operator/(const double3& a, const double& c)
{
    return double3(a.data[0]/c, a.data[1]/c, a.data[2]/c);
}

double3 operator+(const double3& a, const double3& b)
{
    return double3(a.data[0]+b.data[0], a.data[1]+b.data[1], a.data[2]+b.data[2]);
}

double3 operator-(const double3& a, const double3& b)
{
    return double3(a.data[0]-b.data[0], a.data[1]-b.data[1], a.data[2]-b.data[2]);
}


int    name_proc, n_proc=1, myRank=0, rootRank=0, cur_rank,
    i, j, k, ni, nj, diskstep=0, power, jjj, iii,
    skip_con=0, tmp_i;

double dt_disk, dt_contr, t_disk=0.0, t_contr=0.0,
    dt_bh, t_bh=0.0, dt_bh_tmp,
    t_end, time_cur, dt_min, dt_max, min_t, min_t_loc, dt_new, min_dt,
    eta_s, eta, eta_bh,
    E_pot, E_pot_ext, E_kin, E_tot, E_tot_0, DE_tot,
    E_tot_corr, E_tot_corr_0, DE_tot_corr,
    E_tot_corr_sd, E_tot_corr_sd_0, DE_tot_corr_sd,
    E_corr = 0.0, E_sd = 0.0, t_diss_on = 0.125,
    mcm, rcm_mod, vcm_mod,
    rcm_sum=0.0, vcm_sum=0.0,
    eps=0.0, eps2,
    a2_mod, adot2_mod,
    dt_tmp, dt2half, dt3over6, dt4over24, dt5over120,
    dtinv, dt2inv, dt3inv,
    a0mia1, ad04plad12, ad0plad1,
    a1abs, adot1abs, a2dot1abs, a3dot1abs,
    timesteps=0.0, n_act_sum=0.0, n_act_distr[N_MAX], g6_calls=0.0, g6_calls_sum=0.0,
    tmp, tmp_r, tmp_v, tmp_rv, tmp_cpu,
    tmp_pot, tmp_a, tmp_adot,
    tmp_a_bh, tmp_adot_bh,
    tmp_a_bh_pn0, tmp_a_bh_pn1, tmp_a_bh_pn2, tmp_a_bh_pn2_5, tmp_a_bh_pn3, tmp_a_bh_pn3_5, tmp_a_bh_spin,
    tmp_adot_bh_pn0, tmp_adot_bh_pn1, tmp_adot_bh_pn2, tmp_adot_bh_pn2_5, tmp_adot_bh_pn3, tmp_adot_bh_pn3_5, tmp_adot_bh_spin;


double L[3]; // needed in pn_bh_spin.c
// Needed for things related to BHs
#include "debug.h"
int    ind_sort[N_MAX];
double var_sort[N_MAX];


double3 xcm, vcm, mom,
        xdc, vdc,
        a2, a3, a2dot1;

char   processor_name[MPI_MAX_PROCESSOR_NAME],
    inp_fname[30],
    out_fname[30];

/* global variables */

int    N, N_star, N_bh,
    ind[N_MAX], name[N_MAX];
double m[N_MAX], pot[N_MAX], t[N_MAX], dt[N_MAX];
double3 x[N_MAX], v[N_MAX], a[N_MAX], adot[N_MAX];

/* local variables */

int    n_loc, ind_loc[N_MAX_loc];
double m_loc[N_MAX_loc], pot_loc[N_MAX_loc], t_loc[N_MAX_loc], dt_loc[N_MAX_loc];
double3 x_loc[N_MAX_loc], v_loc[N_MAX_loc],
        a_loc[N_MAX_loc], adot_loc[N_MAX_loc];

/* data for active particles */

int    n_act, ind_act[N_MAX];
double m_act[N_MAX],
       pot_act[N_MAX], t_act[N_MAX], dt_act[N_MAX],
       pot_act_new[N_MAX],
       pot_act_tmp[N_MAX],
       pot_act_tmp_loc[N_MAX];

double3 x_act[N_MAX], v_act[N_MAX],
        a_act[N_MAX], adot_act[N_MAX],
        x_act_new[N_MAX], v_act_new[N_MAX],
        a_act_new[N_MAX], adot_act_new[N_MAX],
        a_act_tmp[N_MAX], adot_act_tmp[N_MAX],
        a_act_tmp_loc[N_MAX], adot_act_tmp_loc[N_MAX];

FILE   *inp, *out;

double CPU_time_real0, CPU_time_user0, CPU_time_syst0;
double CPU_time_real,  CPU_time_user,  CPU_time_syst;

#ifdef TIMING

double CPU_tmp_real0, CPU_tmp_user0, CPU_tmp_syst0;
double CPU_tmp_real,  CPU_tmp_user,  CPU_tmp_syst;

double DT_TOT,
    DT_ACT_DEF1, DT_ACT_DEF2, DT_ACT_DEF3, DT_ACT_PRED,
    DT_ACT_GRAV, DT_EXT_GRAV,
    DT_GMC_GRAV, DT_GMC_GMC_GRAV, DT_EXT_GMC_GRAV,
    DT_ACT_CORR, DT_ACT_LOAD,
    DT_STEVOL, DT_STARDISK, DT_STARDESTR;

double DT_ACT_REDUCE;

#endif

/* some local settings for G6a board's */

int clusterid, ii, nn, numGPU;

int    npipe=G6_NPIPE, index_i[G6_NPIPE];
double h2_i[G6_NPIPE], p_i[G6_NPIPE];
double3 x_i[G6_NPIPE], v_i[G6_NPIPE],
        a_i[G6_NPIPE], jerk_i[G6_NPIPE];
int    new_tunit=51, new_xunit=51;

double ti=0.0;
double3 a2by18, a1by6, aby2;

/* normalization... */

#ifdef NORM
double m_norm, r_norm, v_norm, t_norm;
#endif

double eps_BH=0.0;

/* external potential... */

#ifdef EXTPOT

#ifdef EXTPOT_GAL
double m_bulge, a_bulge, b_bulge,
    m_disk,    a_disk,    b_disk,
    m_halo,    a_halo,    b_halo, // NOTE there are other "halo" variables in EXTPOT_GAL_LOG
    x2_ij, y2_ij, z2_ij, // NOTE this variable exist also in EXTPOT_GAL_LOG
    r_tmp, r2_tmp, z_tmp, z2_tmp;
#endif

#ifdef EXTPOT_GAL_DEH
double m_ext, r_ext, g_ext,
    tmp_r2, tmp_r3, dum, dum2, dum3, dum_g, tmp_g;
#endif

#ifdef EXTPOT_GAL_LOG
double v_halo, r_halo,
    v2_halo, r2_halo, r2_r2_halo,
    x2_ij, y2_ij, z2_ij;
#endif

#ifdef EXTPOT_BH
double  r2, rv_ij,
        m_bh, b_bh, eps_bh; // NOTE different from eps_BH
        // NOTE there are other m_bh and b_bh defined outside this ifdef block
        // NOTE this is a mess. Looks like eps_bh is never used, and the m_bh from outside the block is never used.
#endif

double pot_ext[N_MAX], pot_act_ext[N_MAX];    // for all EXTPOT

#endif

int    i_bh, i_bh1, i_bh2,
    num_bh = 0, num_bh1 = 0, num_bh2 = 0;

double m_bh, m_bh1, m_bh2, b_bh,
    r, r2,
    x_ij, y_ij, z_ij,
    vx_ij, vy_ij, vz_ij, rv_ij;

// #ifdef BBH_INF
int    inf_event[N_MAX];
double DR2, tmp_r2;
double3 x_bbhc, v_bbhc;
double DV2, EB, SEMI_a, SEMI_a2;
// #endif // BBH_INF

// #ifdef ADD_N_BH

// double x_bh1[3], x_bh2[3], v_bh1[3], v_bh2[3];
double3 x_bh1, x_bh2, v_bh1, v_bh2;

// double pot_bh1, a_bh1[3], adot_bh1[3],
//     pot_bh2, a_bh2[3], adot_bh2[3];
double pot_bh1, pot_bh2;
double3 a_bh1, adot_bh1, a_bh2, adot_bh2;


//double eps_BH = 0.0;

#include "n_bh.c"

/*
int calc_force_n_BH(double m1, double xx1[], double vv1[],
                    double m2, double xx2[], double vv2[],
                    double eps_BH,
                    double pot_n1, double a_n1[], double adot_n1[],
                    double pot_n2, double a_n2[], double adot_n2[])
*/

/*
                INPUT

m1         - mass of the 1 BH
xx1[0,1,2] - coordinate of the 1 BH
vv1[0,1,2] - velocity of the 1 BH

m2         - mass of the 2 BH
xx2[0,1,2] - coordinate of the 2 BH
vv2[0,1,2] - velocity of the 2 BH

eps_BH     - force softening, can be even exactly 0.0 !

                OUTPUT

pot_n1          for the 1 BH
a_n1    [0,1,2] for the 1 BH
adot_n1 [0,1,2] for the 1 BH

pot_n2          for the 2 BH
a_n2    [0,1,2] for the 2 BH
adot_n2 [0,1,2] for the 2 BH

return -   0 if everything OK
*/

// #endif // ADD_N_BH

// #ifdef ADD_PN_BH

double C_NB = 477.12;

int    usedOrNot[7] = {1, 1, 1, 1, 0, 0, 0};

// double a_pn1[7][3], adot_pn1[7][3],
//     a_pn2[7][3], adot_pn2[7][3];
double3 a_pn1[7], adot_pn1[7], a_pn2[7], adot_pn2[7];

double s_bh1[3] = {0.0, 0.0, 1.0};
double s_bh2[3] = {0.0, 0.0, 1.0};

#include "pn_bh_spin.c"

/*
int calc_force_pn_BH(double m1, double xx1[], double vv1[], double ss1[],
                    double m2, double xx2[], double vv2[], double ss2[],
                    double CCC_NB, double dt_bh,
                    int usedOrNot[],
                    double a_pn1[][3], double adot_pn1[][3],
                    double a_pn2[][3], double adot_pn2[][3])
*/

/*
                INPUT

m1         - mass of the 1 BH
xx1[0,1,2] - coordinate of the 1 BH
vv1[0,1,2] - velocity of the 1 BH
spin1[0,1,2] - normalized spin of the 1 BH

m2         - mass of the 2 BH
xx2[0,1,2] - coordinate of the 2 BH
vv2[0,1,2] - velocity of the 2 BH
spin2[0,1,2] - normalized spin of the 2 BH

CCC_NB           - Speed of light "c" in internal units
dt_BH         - timestep of the BH's, needed for the SPIN integration

usedOrNot[PN0, PN1, PN2, PN2.5, PN3, PN3.5, SPIN] - different PN term usage: PN1, PN2, PN2.5, PN3, PN3.5, SPIN
        0    1    2    3      4    5      6

                OUTPUT

a_pn1   [0 - PN0; 1 - PN1; 2 - PN2; 3 - PN2.5, 4 - PN3, 5 - PN3.5, 6 - SPIN] [3]    for the 1 BH
adot_pn1[0 - PN0; 1 - PN1; 2 - PN2; 3 - PN2.5, 4 - PN3, 5 - PN3.5, 6 - SPIN] [3]    for the 1 BH

a_pn2   [0 - PN0; 1 - PN1; 2 - PN2; 3 - PN2.5, 4 - PN3, 5 - PN3.5, 6 - SPIN] [3]    for the 2 BH
adot_pn2[0 - PN0; 1 - PN1; 2 - PN2; 3 - PN2.5, 4 - PN3, 5 - PN3.5, 6 - SPIN] [3]    for the 2 BH

return -   0 if everything OK
    - 505 if BH's separation < 4 x (RSwarch1 + RSwarch2)
*/

// #endif // ADD_N_BH

#ifdef ETICS
double t_exp, dt_exp=ETICS_DTSCF; // t_exp is just the initial value
#ifdef ETICS_CEP
int grapite_cep_index;
#endif
#endif

void get_CPU_time(double *time_real, double *time_user, double *time_syst)
{
struct rusage xxx;
double sec_u, microsec_u, sec_s, microsec_s;
struct timeval tv;

getrusage(RUSAGE_SELF,&xxx);

sec_u = xxx.ru_utime.tv_sec;
sec_s = xxx.ru_stime.tv_sec;

microsec_u = xxx.ru_utime.tv_usec;
microsec_s = xxx.ru_stime.tv_usec;

*time_user = sec_u + microsec_u * 1.0E-06;
*time_syst = sec_s + microsec_s * 1.0E-06;

//  *time_real = time(NULL);

gettimeofday(&tv, NULL);
*time_real = tv.tv_sec + 1.0E-06 * tv.tv_usec;

*time_user = *time_real;

}

void read_data()
{
inp = fopen(inp_fname,"r");
fscanf(inp,"%d \n", &diskstep);

fscanf(inp,"%d \n", &N);

fscanf(inp,"%lE \n", &time_cur);

for (i=0; i<N; i++) fscanf(inp,"%d %lE %lE %lE %lE %lE %lE %lE \n", &ind[i], &m[i], &x[i][0], &x[i][1], &x[i][2], &v[i][0], &v[i][1], &v[i][2]);

fclose(inp);
}

void write_snap_data()
{
    sprintf(out_fname, "%06d.dat", diskstep);
    out = fopen(out_fname, "w");
    fprintf(out,"%06d \n", diskstep);
    fprintf(out,"%07d \n", N);
    fprintf(out,"%.10E \n", time_cur);
    for (i=0; i<N; i++) {
        fprintf(out,"%07d \t %.10E \t % .10E % .10E % .10E \t % .10E % .10E % .10E \n",
            ind[i],
            m[i],
            x[i][0], x[i][1], x[i][2],
            v[i][0], v[i][1], v[i][2]);
        }
    fclose(out);
}

void write_cont_data() // virtually identical to write_snap_data()
{
out = fopen("data.con","w");
fprintf(out,"%06d \n", diskstep);

fprintf(out,"%07d \n", N);

fprintf(out,"%.16E \n", time_cur);

for (i=0; i<N; i++)
    {
    fprintf(out,"%07d \t %.10E \t % .10E % .10E % .10E \t % .10E % .10E % .10E \n",
        ind[i],
        m[i],
        x[i][0], x[i][1], x[i][2],
        v[i][0], v[i][1], v[i][2]);
    }

fclose(out);
}

void write_bh_data()
{
    if (parameters.live_smbh_count == 2) {

        out = fopen("bh.dat", "a");

        for (int i=0; i < 2; i++) {
//             double (*a_pn)[3], (*adot_pn)[3], pot_bh, *a_bh, *adot_bh;
            double3 *a_pn, *adot_pn, a_bh, adot_bh;
            double pot_bh;
            if (i==0) {
                a_pn    = a_pn1;
                adot_pn = adot_pn1;
                pot_bh  = pot_bh1;
                a_bh    = a_bh1;
                adot_bh = adot_bh1;
            } else {
                a_pn    = a_pn2;
                adot_pn = adot_pn2;
                pot_bh  = pot_bh2;
                a_bh    = a_bh2;
                adot_bh = adot_bh2;
            }

            tmp_r = sqrt( SQR(x[i][0]) + SQR(x[i][1]) + SQR(x[i][2]) );
            tmp_v = sqrt( SQR(v[i][0]) + SQR(v[i][1]) + SQR(v[i][2]) );

            tmp_a = sqrt( SQR(a[i][0]) + SQR(a[i][1]) + SQR(a[i][2]) );
            tmp_adot = sqrt( SQR(adot[i][0]) + SQR(adot[i][1]) + SQR(adot[i][2]) );

            if (parameters.live_smbh_custom_eps >= 0) {

                tmp_a_bh = sqrt( SQR(a_bh[0]) + SQR(a_bh[1]) + SQR(a_bh[2]) );
                tmp_adot_bh = sqrt( SQR(adot_bh[0]) + SQR(adot_bh[1]) + SQR(adot_bh[2]) );

                if (parameters.binary_smbh_spin) {
                    tmp_a_bh_pn0   = sqrt( SQR(a_pn[0][0]) + SQR(a_pn[0][1]) + SQR(a_pn[0][2]) );
                    tmp_a_bh_pn1   = sqrt( SQR(a_pn[1][0]) + SQR(a_pn[1][1]) + SQR(a_pn[1][2]) );
                    tmp_a_bh_pn2   = sqrt( SQR(a_pn[2][0]) + SQR(a_pn[2][1]) + SQR(a_pn[2][2]) );
                    tmp_a_bh_pn2_5 = sqrt( SQR(a_pn[3][0]) + SQR(a_pn[3][1]) + SQR(a_pn[3][2]) );
                    tmp_a_bh_pn3   = sqrt( SQR(a_pn[4][0]) + SQR(a_pn[4][1]) + SQR(a_pn[4][2]) );
                    tmp_a_bh_pn3_5 = sqrt( SQR(a_pn[5][0]) + SQR(a_pn[5][1]) + SQR(a_pn[5][2]) );
                    tmp_a_bh_spin  = sqrt( SQR(a_pn[6][0]) + SQR(a_pn[6][1]) + SQR(a_pn[6][2]) );

                    tmp_adot_bh_pn0   = sqrt( SQR(adot_pn[0][0]) + SQR(adot_pn[0][1]) + SQR(adot_pn[0][2]) );
                    tmp_adot_bh_pn1   = sqrt( SQR(adot_pn[1][0]) + SQR(adot_pn[1][1]) + SQR(adot_pn[1][2]) );
                    tmp_adot_bh_pn2   = sqrt( SQR(adot_pn[2][0]) + SQR(adot_pn[2][1]) + SQR(adot_pn[2][2]) );
                    tmp_adot_bh_pn2_5 = sqrt( SQR(adot_pn[3][0]) + SQR(adot_pn[3][1]) + SQR(adot_pn[3][2]) );
                    tmp_adot_bh_pn3   = sqrt( SQR(adot_pn[4][0]) + SQR(adot_pn[4][1]) + SQR(adot_pn[4][2]) );
                    tmp_adot_bh_pn3_5 = sqrt( SQR(adot_pn[5][0]) + SQR(adot_pn[5][1]) + SQR(adot_pn[5][2]) );
                    tmp_adot_bh_spin  = sqrt( SQR(adot_pn[6][0]) + SQR(adot_pn[6][1]) + SQR(adot_pn[6][2]) );

                    fprintf(out,"%.16E \t %.8E \t % .16E % .16E % .16E \t %.16E \t % .16E % .16E % .16E \t %.16E \t % .8E \t % .8E % .8E % .8E \t %.8E \t % .8E % .8E % .8E \t %.8E \t %.8E \t\t % .8E \t % .8E % .8E % .8E \t %.8E \t % .8E % .8E % .8E \t %.8E \t\t % .8E % .8E % .8E \t %.8E \t % .8E % .8E % .8E \t %.8E \t % .8E % .8E % .8E \t %.8E \t % .8E % .8E % .8E \t %.8E \t % .8E % .8E % .8E \t %.8E \t % .8E % .8E % .8E \t %.8E \t % .8E % .8E % .8E \t %.8E \t % .8E % .8E % .8E \t %.8E \t % .8E % .8E % .8E \t %.8E \t % .8E % .8E % .8E \t %.8E \t % .8E % .8E % .8E \t %.8E \t % .8E % .8E % .8E \t %.8E \t % .8E % .8E % .8E \t %.8E \t % .8E % .8E % .8E \t %.8E \n",
                            time_cur, m[i],
                            x[i][0], x[i][1], x[i][2], tmp_r,
                            v[i][0], v[i][1], v[i][2], tmp_v,
                            pot[i],
                            a[i][0],       a[i][1],       a[i][2],       tmp_a,
                            adot[i][0],    adot[i][1],    adot[i][2],    tmp_adot,
                            dt[i],
                            pot_bh,
                            a_bh[0],       a_bh[1],       a_bh[2],       tmp_a_bh,
                            adot_bh[0],    adot_bh[1],    adot_bh[2],    tmp_adot_bh,
                            a_pn[0][0],    a_pn[0][1],    a_pn[0][2],    tmp_a_bh_pn0,
                            adot_pn[0][0], adot_pn[0][1], adot_pn[0][2], tmp_adot_bh_pn0,
                            a_pn[1][0],    a_pn[1][1],    a_pn[1][2],    tmp_a_bh_pn1,
                            adot_pn[1][0], adot_pn[1][1], adot_pn[1][2], tmp_adot_bh_pn1,
                            a_pn[2][0],    a_pn[2][1],    a_pn[2][2],    tmp_a_bh_pn2,
                            adot_pn[2][0], adot_pn[2][1], adot_pn[2][2], tmp_adot_bh_pn2,
                            a_pn[3][0],    a_pn[3][1],    a_pn[3][2],    tmp_a_bh_pn2_5,
                            adot_pn[3][0], adot_pn[3][1], adot_pn[3][2], tmp_adot_bh_pn2_5,
                            a_pn[4][0],    a_pn[4][1],    a_pn[4][2],    tmp_a_bh_pn3,
                            adot_pn[4][0], adot_pn[4][1], adot_pn[4][2], tmp_adot_bh_pn3,
                            a_pn[5][0],    a_pn[5][1],    a_pn[5][2],    tmp_a_bh_pn3_5,
                            adot_pn[5][0], adot_pn[5][1], adot_pn[5][2], tmp_adot_bh_pn3_5,
                            a_pn[6][0],    a_pn[6][1],    a_pn[6][2],    tmp_a_bh_spin,
                            adot_pn[6][0], adot_pn[6][1], adot_pn[6][2], tmp_adot_bh_spin);
                } else {
                    fprintf(out,"%.16E \t %.8E \t % .16E % .16E % .16E \t %.16E \t % .16E % .16E % .16E \t %.16E \t % .8E \t % .8E % .8E % .8E \t %.8E \t % .8E % .8E % .8E \t %.8E \t %.8E \t\t % .8E \t % .8E % .8E % .8E \t %.8E \t % .8E % .8E % .8E \t %.8E \n",
                            time_cur, m[i],
                            x[i][0], x[i][1], x[i][2], tmp_r,
                            v[i][0], v[i][1], v[i][2], tmp_v,
                            pot[i],
                            a[i][0], a[i][1], a[i][2], tmp_a,
                            adot[i][0],  adot[i][1],  adot[i][2],  tmp_adot,
                            dt[i],
                            pot_bh,
                            a_bh[0],    a_bh[1],    a_bh[2],    tmp_a_bh,
                            adot_bh[0], adot_bh[1], adot_bh[2], tmp_adot_bh);
                }
            } else {
                fprintf(out,"%.16E \t %.8E \t % .16E % .16E % .16E \t %.16E \t % .16E % .16E % .16E \t %.16E \t % .8E \t % .8E % .8E % .8E \t %.8E \t % .8E % .8E % .8E \t %.8E \t %.8E \n",
                        time_cur, m[i],
                        x[i][0], x[i][1], x[i][2], tmp_r,
                        v[i][0], v[i][1], v[i][2], tmp_v,
                        pot[i],
                        a[i][0], a[i][1], a[i][2], tmp_a,
                        adot[i][0], adot[i][1], adot[i][2], tmp_adot,
                        dt[i]);
            }
        }
        fprintf(out,"\n");
        fclose(out);
    } else if (parameters.live_smbh_count == 1) {
        out = fopen("bh.dat", "a");
        tmp_r = sqrt( SQR(x[0][0]) + SQR(x[0][1]) + SQR(x[0][2]) );
        tmp_v = sqrt( SQR(v[0][0]) + SQR(v[0][1]) + SQR(v[0][2]) );
        tmp_a = sqrt( SQR(a[0][0]) + SQR(a[0][1]) + SQR(a[0][2]) );
        tmp_adot = sqrt( SQR(adot[0][0]) + SQR(adot[0][1]) + SQR(adot[0][2]) );
        fprintf(out,"%.16E \t %.8E \t % .16E % .16E % .16E \t %.16E \t % .16E % .16E % .16E \t %.16E \t % .8E \t % .8E % .8E % .8E \t %.8E \t % .8E % .8E % .8E \t %.8E \t %.8E \n",
                time_cur, m[0],
                x[0][0], x[0][1], x[0][2], tmp_r,
                v[0][0], v[0][1], v[0][2], tmp_v,
                pot[0],
                a[0][0], a[0][1], a[0][2], tmp_a,
                adot[0][0], adot[0][1], adot[0][2], tmp_adot,
                dt[0]);
        fprintf(out,"\n");
        fclose(out);
    }
}

void write_bh_nb_data()
{
    int i_bh, nb = parameters.live_smbh_neighbor_number;
    double tmp, tmp_r, tmp_v;

    out = fopen("bh_neighbors.dat", "a");

    /* 1st BH */

    i_bh = 0;

    for (int i_bh=0; i_bh < parameters.live_smbh_count; i_bh++) {
        for (i=0; i<N; i++) var_sort[i] = (x[i]-x[i_bh]).norm();
        std::iota(ind_sort, ind_sort+N, 0);
        std::partial_sort(ind_sort, ind_sort + nb, ind_sort + N, [&](int i, int j) {return var_sort[i] < var_sort[j];});

        fprintf(out,"%.16E \t %07d \t %.8E \t % .8E % .8E % .8E \t % .8E % .8E % .8E \t",
                time_cur,
                i_bh,
                m[i_bh],
                x[i_bh][0], x[i_bh][1], x[i_bh][2],
                v[i_bh][0], v[i_bh][1], v[i_bh][2]);

        for (int j=1; j < nb; j++) {
            i = ind_sort[j];
            fprintf(out,"%02d %07d   %.8E   % .8E % .8E % .8E   %.8E   % .8E % .8E % .8E   %.8E \t",
                    j, i,
                    m[i],
                    x[i][0], x[i][1], x[i][2], (x[i]-x[i_bh]).norm(),
                    v[i][0], v[i][1], v[i][2], (v[i]-v[i_bh]).norm());
        }

        fprintf(out,"\n");
    }
    fprintf(out, "\n"); // this is redundant
    fclose(out);
}

void calc_ext_grav_zero()
{

#ifdef EXTPOT

/* Define the external potential for all particles on all nodes */

ni = n_act;

for (i=0; i<ni; i++)
    {
    pot_ext[i] = 0.0;
    }

#ifdef EXTPOT_GAL

for (i=0; i<ni; i++)
    {
/*
    x_ij = x_act[i][0]-x_ext;
    y_ij = x_act[i][1]-y_ext;
    z_ij = x_act[i][2]-z_ext;

    vx_ij = v_act[i][0]-vx_ext;
    vy_ij = v_act[i][1]-vy_ext;
    vz_ij = v_act[i][2]-vz_ext;
*/

    x_ij = x_act[i][0];
    y_ij = x_act[i][1];
    z_ij = x_act[i][2];

    vx_ij = v_act[i][0];
    vy_ij = v_act[i][1];
    vz_ij = v_act[i][2];

    x2_ij = SQR(x_ij);
    y2_ij = SQR(y_ij);
    z2_ij = SQR(z_ij);

    /* for BULGE */

    r2 = SQR(b_bulge);

    r2 += x_ij*x_ij + y_ij*y_ij + z_ij*z_ij;   r = sqrt(r2);

    rv_ij = vx_ij*x_ij + vy_ij*y_ij + vz_ij*z_ij;

    tmp = m_bulge / r;

    pot_ext[i] -= tmp;

    tmp /= r2;

    a[i][0] -= tmp * x_ij;
    a[i][1] -= tmp * y_ij;
    a[i][2] -= tmp * z_ij;

    adot[i][0] -= tmp * (vx_ij - 3.0*rv_ij * x_ij/r2);
    adot[i][1] -= tmp * (vy_ij - 3.0*rv_ij * y_ij/r2);
    adot[i][2] -= tmp * (vz_ij - 3.0*rv_ij * z_ij/r2);

    /* for DISK */

    z2_tmp = z2_ij + SQR(b_disk);
    z_tmp  = sqrt(z2_tmp);

    r2_tmp = x2_ij + y2_ij + SQR(z_tmp + a_disk);
    r_tmp  = sqrt(r2_tmp);

    pot_ext[i] -= m_disk / r_tmp;

    tmp = m_disk / (r2_tmp*r_tmp);

    a[i][0] -= tmp * x_ij;
    a[i][1] -= tmp * y_ij;
    a[i][2] -= tmp * z_ij * (z_tmp + a_disk)/z_tmp;

    tmp = m_disk / (z_tmp*r2_tmp*r2_tmp*r_tmp);

    adot[i][0] += tmp * (- vx_ij*z_tmp*r2_tmp
                        + 3.0*x_ij*vx_ij*x_ij*z_tmp
                        + 3.0*x_ij*vy_ij*y_ij*z_tmp
                        + 3.0*x_ij*vz_ij*z_ij*SQR(z_tmp + a_disk) );

    adot[i][1] += tmp * (- vy_ij*z_tmp*r2_tmp
                        + 3.0*y_ij*vx_ij*x_ij*z_tmp
                        + 3.0*y_ij*vy_ij*y_ij*z_tmp
                        + 3.0*y_ij*vz_ij*z_ij*SQR(z_tmp + a_disk) );

    adot[i][2] += tmp * (- vz_ij*(z_tmp + a_disk)*( x2_ij*( z2_tmp*z_tmp + a_disk*SQR(b_disk) ) +
                                                y2_ij*( z2_tmp*z_tmp + a_disk*SQR(b_disk) ) -
                                                ( 2.0*a_disk*(SQR(z_ij) - SQR(b_disk))*z_tmp +
                                                    2.0*SQR(z_ij*z_ij) +
                                                    SQR(b_disk)*SQR(z_ij) -
                                                    SQR(b_disk)*( SQR(a_disk) + SQR(b_disk) ) ) )
                        + 3.0*vx_ij*x_ij*z_ij*z2_tmp*(z_tmp + a_disk)
                        + 3.0*vy_ij*y_ij*z_ij*z2_tmp*(z_tmp + a_disk) ) / z2_tmp ;

    /* for HALO */

    r2 = SQR(b_halo);

    r2 += x_ij*x_ij + y_ij*y_ij + z_ij*z_ij;   r = sqrt(r2);

    rv_ij = vx_ij*x_ij + vy_ij*y_ij + vz_ij*z_ij;

    tmp = m_halo / r;

    pot_ext[i] -= tmp;

    tmp /= r2;

    a[i][0] -= tmp * x_ij;
    a[i][1] -= tmp * y_ij;
    a[i][2] -= tmp * z_ij;

    adot[i][0] -= tmp * (vx_ij - 3.0*rv_ij * x_ij/r2);
    adot[i][1] -= tmp * (vy_ij - 3.0*rv_ij * y_ij/r2);
    adot[i][2] -= tmp * (vz_ij - 3.0*rv_ij * z_ij/r2);

    } /* i */

#endif

#ifdef EXTPOT_GAL_DEH

for (i=0; i<ni; i++)
    {

    x_ij = x_act[i][0];
    y_ij = x_act[i][1];
    z_ij = x_act[i][2];

    vx_ij = v_act[i][0];
    vy_ij = v_act[i][1];
    vz_ij = v_act[i][2];

    tmp_r = sqrt( SQR(x_ij) + SQR(y_ij) + SQR(z_ij) );
    tmp_r2 = SQR(tmp_r);
    tmp_r3 = POW3(tmp_r);

    dum   = tmp_r/(tmp_r + r_ext);
    dum2  = SQR(dum);
    dum3  = POW3(dum);
    dum_g = pow( dum , -g_ext );

    pot_ext[i] -= ((m_ext/r_ext) / (2.0-g_ext) ) * ( 1.0 - dum2 * dum_g );

    tmp = (m_ext/tmp_r3) * dum3 * dum_g;

    a[i][0] -= tmp * x_ij;
    a[i][1] -= tmp * y_ij;
    a[i][2] -= tmp * z_ij;

    rv_ij = ( vx_ij*x_ij + vy_ij*y_ij + vz_ij*z_ij );

    tmp = ( m_ext/POW3(tmp_r+r_ext) ) * dum_g;
    tmp_g = ((r_ext*g_ext + 3.0*tmp_r)/(tmp_r2*(tmp_r+r_ext)) ) * rv_ij;

    adot[i][0] += tmp * ( tmp_g * x_ij - vx_ij );
    adot[i][1] += tmp * ( tmp_g * y_ij - vy_ij );
    adot[i][2] += tmp * ( tmp_g * z_ij - vz_ij );

    } /* i */

#endif

/* For simple Log potential... */

#ifdef EXTPOT_GAL_LOG

for (i=0; i<ni; i++)
    {

    x_ij = x_act[i][0];
    y_ij = x_act[i][1];
    z_ij = x_act[i][2];

    vx_ij = v_act[i][0];
    vy_ij = v_act[i][1];
    vz_ij = v_act[i][2];

    r2 = SQR(x_ij) + SQR(y_ij) + SQR(z_ij);
    rv_ij = 2.0 * ( vx_ij*x_ij + vy_ij*y_ij + vz_ij*z_ij );

    r2_r2_halo = (r2 + r2_halo);

    pot_ext[i] -= 0.5*v2_halo * log(1.0 + r2/r2_halo);

    tmp = v2_halo/r2_r2_halo;

    a[i][0] -= tmp * x_ij;
    a[i][1] -= tmp * y_ij;
    a[i][2] -= tmp * z_ij;

    tmp /= (r2 + r2_halo);

    adot[i][0] += tmp * (rv_ij * x_ij - r2_r2_halo * vx_ij);
    adot[i][1] += tmp * (rv_ij * y_ij - r2_r2_halo * vy_ij);
    adot[i][2] += tmp * (rv_ij * z_ij - r2_r2_halo * vz_ij);

    } /* i */

#endif

/* For simple Plummer potential... */

#ifdef EXTPOT_BH

for (i=0; i<ni; i++)
    {

    x_ij = x_act[i][0];
    y_ij = x_act[i][1];
    z_ij = x_act[i][2];

    vx_ij = v_act[i][0];
    vy_ij = v_act[i][1];
    vz_ij = v_act[i][2];

    r2 = SQR(b_bh);

    r2 += x_ij*x_ij + y_ij*y_ij + z_ij*z_ij;   r = sqrt(r2);

    rv_ij = vx_ij*x_ij + vy_ij*y_ij + vz_ij*z_ij;

    tmp = m_bh / r;

    pot_ext[i] -= tmp;

    tmp /= r2;

    a[i][0] -= tmp * x_ij;
    a[i][1] -= tmp * y_ij;
    a[i][2] -= tmp * z_ij;

    adot[i][0] -= tmp * (vx_ij - 3.0*rv_ij * x_ij/r2);
    adot[i][1] -= tmp * (vy_ij - 3.0*rv_ij * y_ij/r2);
    adot[i][2] -= tmp * (vz_ij - 3.0*rv_ij * z_ij/r2);

    } /* i */

#endif

#endif        // EXTPOT

}

void calc_self_grav(double t)
{
    /* calc the new grav for the active particles */

#ifdef TIMING
    get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

    g6_set_ti(clusterid, t);

    ni = n_act;

    /* define the local phi, a, adot for these active particles */

    for (i=0; i<ni; i+=npipe) {
        nn = npipe;
        if (ni-i < npipe) nn = ni - i;
        for (ii=0; ii<nn; ii++) {
            h2_i[ii]    = eps2; // TODO This should be a global or something
        } /* ii */
        //TODO any way we can clean up this ugly casting?
        g6calc_firsthalf(clusterid, n_loc, nn, ind_act+i, (double(*)[3])x_act_new+i, (double(*)[3])v_act_new+i, (double(*)[3])a_act_tmp+i, (double(*)[3])adot_act_tmp+i, pot_act_tmp+i, eps2, h2_i);
        g6calc_lasthalf( clusterid, n_loc, nn, ind_act+i, (double(*)[3])x_act_new+i, (double(*)[3])v_act_new+i, eps2, h2_i, (double(*)[3])a_act_tmp+i, (double(*)[3])adot_act_tmp+i, pot_act_tmp+i);
        g6_calls++;
    } /* i */

    /* Store the new value of the local partial force etc... */

    for (i=0; i<n_act; i++) {
        iii = ind_act[i];
        pot_act_tmp_loc[iii] = pot_act_tmp[i];
        a_act_tmp_loc[iii] = a_act_tmp[i];
        adot_act_tmp_loc[iii] = adot_act_tmp[i];
    } /* i */

#ifdef TIMING
    get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
    DT_ACT_GRAV += (CPU_tmp_user - CPU_tmp_user0);
#endif
}

void calc_ext_grav()
{

#ifdef EXTPOT

/* Define the external potential for all active particles on all nodes */

ni = n_act;

for (i=0; i<ni; i++)
    {
    pot_act_ext[i] = 0.0;
    }

#ifdef TIMING
get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

#ifdef EXTPOT_GAL

for (i=0; i<ni; i++)
    {
/*
    x_ij = x_act_new[i][0]-x_ext;
    y_ij = x_act_new[i][1]-y_ext;
    z_ij = x_act_new[i][2]-z_ext;

    vx_ij = v_act_new[i][0]-vx_ext;
    vy_ij = v_act_new[i][1]-vy_ext;
    vz_ij = v_act_new[i][2]-vz_ext;
*/

    x_ij = x_act_new[i][0];
    y_ij = x_act_new[i][1];
    z_ij = x_act_new[i][2];

    vx_ij = v_act_new[i][0];
    vy_ij = v_act_new[i][1];
    vz_ij = v_act_new[i][2];

    x2_ij = SQR(x_ij);
    y2_ij = SQR(y_ij);
    z2_ij = SQR(z_ij);

    /* for BULGE */

    r2 = SQR(b_bulge);

    r2 += x_ij*x_ij + y_ij*y_ij + z_ij*z_ij;   r = sqrt(r2);

    rv_ij = vx_ij*x_ij + vy_ij*y_ij + vz_ij*z_ij;

    tmp = m_bulge / r;

    pot_act_ext[i] -= tmp;

    tmp /= r2;

    a_act_new[i][0] -= tmp * x_ij;
    a_act_new[i][1] -= tmp * y_ij;
    a_act_new[i][2] -= tmp * z_ij;

    adot_act_new[i][0] -= tmp * (vx_ij - 3.0*rv_ij * x_ij/r2);
    adot_act_new[i][1] -= tmp * (vy_ij - 3.0*rv_ij * y_ij/r2);
    adot_act_new[i][2] -= tmp * (vz_ij - 3.0*rv_ij * z_ij/r2);

    /* for DISK */

    z2_tmp = z2_ij + SQR(b_disk);
    z_tmp  = sqrt(z2_tmp);

    r2_tmp = x2_ij + y2_ij + SQR(z_tmp + a_disk);
    r_tmp  = sqrt(r2_tmp);

    pot_act_ext[i] -= m_disk / r_tmp;

    tmp = m_disk / (r2_tmp*r_tmp);

    a_act_new[i][0] -= tmp * x_ij;
    a_act_new[i][1] -= tmp * y_ij;
    a_act_new[i][2] -= tmp * z_ij * (z_tmp + a_disk)/z_tmp;

    tmp = m_disk / (z_tmp*r2_tmp*r2_tmp*r_tmp);

    adot_act_new[i][0] += tmp * (- vx_ij*z_tmp*r2_tmp
                        + 3.0*x_ij*vx_ij*x_ij*z_tmp
                        + 3.0*x_ij*vy_ij*y_ij*z_tmp
                        + 3.0*x_ij*vz_ij*z_ij*SQR(z_tmp + a_disk) );

    adot_act_new[i][1] += tmp * (- vy_ij*z_tmp*r2_tmp
                        + 3.0*y_ij*vx_ij*x_ij*z_tmp
                        + 3.0*y_ij*vy_ij*y_ij*z_tmp
                        + 3.0*y_ij*vz_ij*z_ij*SQR(z_tmp + a_disk) );

    adot_act_new[i][2] += tmp * (- vz_ij*(z_tmp + a_disk)*( x2_ij*( z2_tmp*z_tmp + a_disk*SQR(b_disk) ) +
                                                y2_ij*( z2_tmp*z_tmp + a_disk*SQR(b_disk) ) -
                                                ( 2.0*a_disk*(SQR(z_ij) - SQR(b_disk))*z_tmp +
                                                    2.0*SQR(z_ij*z_ij) +
                                                    SQR(b_disk)*SQR(z_ij) -
                                                    SQR(b_disk)*( SQR(a_disk) + SQR(b_disk) ) ) )
                        + 3.0*vx_ij*x_ij*z_ij*z2_tmp*(z_tmp + a_disk)
                        + 3.0*vy_ij*y_ij*z_ij*z2_tmp*(z_tmp + a_disk) ) / z2_tmp ;

    /* for HALO */

    r2 = SQR(b_halo);

    r2 += x_ij*x_ij + y_ij*y_ij + z_ij*z_ij;   r = sqrt(r2);

    rv_ij = vx_ij*x_ij + vy_ij*y_ij + vz_ij*z_ij;

    tmp = m_halo / r;

    pot_act_ext[i] -= tmp;

    tmp /= r2;

    a_act_new[i][0] -= tmp * x_ij;
    a_act_new[i][1] -= tmp * y_ij;
    a_act_new[i][2] -= tmp * z_ij;

    adot_act_new[i][0] -= tmp * (vx_ij - 3.0*rv_ij * x_ij/r2);
    adot_act_new[i][1] -= tmp * (vy_ij - 3.0*rv_ij * y_ij/r2);
    adot_act_new[i][2] -= tmp * (vz_ij - 3.0*rv_ij * z_ij/r2);

    } /* i */

#endif

#ifdef EXTPOT_GAL_DEH

for (i=0; i<ni; i++)
    {

    x_ij = x_act_new[i][0];
    y_ij = x_act_new[i][1];
    z_ij = x_act_new[i][2];

    vx_ij = v_act_new[i][0];
    vy_ij = v_act_new[i][1];
    vz_ij = v_act_new[i][2];

    tmp_r = sqrt( SQR(x_ij) + SQR(y_ij) + SQR(z_ij) );
    tmp_r2 = SQR(tmp_r);
    tmp_r3 = POW3(tmp_r);

    dum   = tmp_r/(tmp_r + r_ext);
    dum2  = SQR(dum);
    dum3  = POW3(dum);
    dum_g = pow( dum , -g_ext );

    pot_act_ext[i] -= ((m_ext/r_ext) / (2.0-g_ext) ) * ( 1.0 - dum2 * dum_g );

    tmp = (m_ext/tmp_r3) * dum3 * dum_g;

    a_act_new[i][0] -= tmp * x_ij;
    a_act_new[i][1] -= tmp * y_ij;
    a_act_new[i][2] -= tmp * z_ij;

    rv_ij = ( vx_ij*x_ij + vy_ij*y_ij + vz_ij*z_ij );

    tmp = ( m_ext/POW3(tmp_r+r_ext) ) * dum_g;
    tmp_g = ((r_ext*g_ext + 3.0*tmp_r)/(tmp_r2*(tmp_r+r_ext)) ) * rv_ij;

    adot_act_new[i][0] += tmp * ( tmp_g * x_ij - vx_ij );
    adot_act_new[i][1] += tmp * ( tmp_g * y_ij - vy_ij );
    adot_act_new[i][2] += tmp * ( tmp_g * z_ij - vz_ij );

    } /* i */

#endif

/* For simple Log potential... */

#ifdef EXTPOT_GAL_LOG

for (i=0; i<ni; i++)
    {

    x_ij = x_act_new[i][0];
    y_ij = x_act_new[i][1];
    z_ij = x_act_new[i][2];

    vx_ij = v_act_new[i][0];
    vy_ij = v_act_new[i][1];
    vz_ij = v_act_new[i][2];

    r2 = SQR(x_ij) + SQR(y_ij) + SQR(z_ij);
    rv_ij = 2.0 * ( vx_ij*x_ij + vy_ij*y_ij + vz_ij*z_ij );

    r2_r2_halo = (r2 + r2_halo);

    pot_act_ext[i] -= 0.5*v2_halo * log(1.0 + r2/r2_halo);

    tmp = v2_halo/r2_r2_halo;

    a_act_new[i][0] -= tmp * x_ij;
    a_act_new[i][1] -= tmp * y_ij;
    a_act_new[i][2] -= tmp * z_ij;

    tmp /= (r2 + r2_halo);

    adot_act_new[i][0] += tmp * (rv_ij * x_ij - r2_r2_halo * vx_ij);
    adot_act_new[i][1] += tmp * (rv_ij * y_ij - r2_r2_halo * vy_ij);
    adot_act_new[i][2] += tmp * (rv_ij * z_ij - r2_r2_halo * vz_ij);

    } /* i */

#endif

    /* For simple Plummer potential... */

#ifdef EXTPOT_BH

for (i=0; i<ni; i++)
    {

    x_ij = x_act_new[i][0];
    y_ij = x_act_new[i][1];
    z_ij = x_act_new[i][2];

    vx_ij = v_act_new[i][0];
    vy_ij = v_act_new[i][1];
    vz_ij = v_act_new[i][2];

    r2 = SQR(b_bh);

    r2 += x_ij*x_ij + y_ij*y_ij + z_ij*z_ij;   r = sqrt(r2);

    rv_ij = vx_ij*x_ij + vy_ij*y_ij + vz_ij*z_ij;

    tmp = m_bh / r;

    pot_act_ext[i] -= tmp;

    tmp /= r2;

    a_act_new[i][0] -= tmp * x_ij;
    a_act_new[i][1] -= tmp * y_ij;
    a_act_new[i][2] -= tmp * z_ij;

    adot_act_new[i][0] -= tmp * (vx_ij - 3.0*rv_ij * x_ij/r2);
    adot_act_new[i][1] -= tmp * (vy_ij - 3.0*rv_ij * y_ij/r2);
    adot_act_new[i][2] -= tmp * (vz_ij - 3.0*rv_ij * z_ij/r2);

    } /* i */

#endif

#ifdef TIMING
get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
DT_EXT_GRAV += (CPU_tmp_user - CPU_tmp_user0);
#endif

#endif        // EXTPOT
}

void energy_contr()
{
    E_pot = 0.0;
    for (i=0; i<N; i++) E_pot += m[i]*pot[i];
    E_pot *= 0.5;

    E_kin = 0.0;
    for (i=0; i<N; i++) E_kin += m[i]*( SQR(v[i][0]) + SQR(v[i][1]) + SQR(v[i][2]) );
    E_kin *= 0.5;

    E_pot_ext = 0.0;
#ifdef EXTPOT
    for (i=0; i<N; i++) E_pot_ext += m[i]*pot_ext[i];
#endif

    if (timesteps == 0.0) {
        rcm_sum = 0.0;
        vcm_sum = 0.0;
    }

    mcm     = 0.0;
    rcm_mod = 0.0;
    vcm_mod = 0.0;

    xcm = {0, 0, 0};
    vcm = {0, 0, 0};

    for (i=0; i<N; i++) {
        mcm += m[i];
        xcm += m[i] * x[i];
        vcm += m[i] * v[i];
    } /* i */

    xcm /= mcm;
    vcm /= mcm;

    rcm_mod = xcm.norm();
    vcm_mod = vcm.norm();

    rcm_sum += rcm_mod;
    vcm_sum += vcm_mod;

    mom = {0, 0, 0};

    for (i=0; i<N; i++) {
        mom[0] += m[i] * ( x[i][1] * v[i][2] - x[i][2] * v[i][1] );
        mom[1] += m[i] * ( x[i][2] * v[i][0] - x[i][0] * v[i][2] );
        mom[2] += m[i] * ( x[i][0] * v[i][1] - x[i][1] * v[i][0] );
    } /* i */

    get_CPU_time(&CPU_time_real, &CPU_time_user, &CPU_time_syst);

    E_tot = E_pot + E_kin + E_pot_ext;

    E_tot_corr    = E_tot + E_corr;
    E_tot_corr_sd = E_tot + E_corr + E_sd;

    if (timesteps == 0.0) {
        /* initialize the E_tot_0 + etc... */

        E_tot_0 = E_tot;
        E_tot_corr_0 = E_tot;
        E_tot_corr_sd_0 = E_tot;

        skip_con = 1;
    } else {
        /* read the E_tot_0 + etc... */
        if (skip_con == 0) {
            inp = fopen("contr.con","r");
            fscanf(inp,"%lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE %lE",
                   &tmp, &tmp, &tmp, &tmp,
                   &tmp, &tmp, &tmp, &tmp, &tmp, &tmp, &tmp,
                   &E_tot_0, &tmp,
                   &E_tot_corr_0, &tmp,
                   &E_tot_corr_sd_0, &tmp,
                   &tmp, &tmp, &tmp, &tmp, &tmp, &tmp, &tmp, &tmp);
            fclose(inp);
            skip_con = 1;
        }
    }
    DE_tot         = E_tot         - E_tot_0;
    DE_tot_corr    = E_tot_corr    - E_tot_corr_0;
    DE_tot_corr_sd = E_tot_corr_sd - E_tot_corr_sd_0;

    printf("%.3E %.3E  % .4E %.4E % .4E  % .4E % .4E  %.2E\n",
            time_cur, timesteps,
            E_pot, E_kin, E_pot_ext, E_tot, DE_tot,
            CPU_time_user-CPU_time_user0);

    fflush(stdout);

    out = fopen("contr.dat", "a");
    fprintf(out,"%.8E \t %.8E %.8E %.8E \t % .8E % .8E % .8E % .8E % .8E \t % .8E % .8E \t % .8E % .8E % .8E \t %.8E %.8E %.8E \n",
            time_cur, timesteps, n_act_sum, g6_calls,
            E_pot, E_kin, E_pot_ext,
            E_tot, DE_tot,
            rcm_mod, vcm_mod,
            mom[0], mom[1], mom[2],
            CPU_time_real-CPU_time_real0, CPU_time_user-CPU_time_user0, CPU_time_syst-CPU_time_syst0);
    fclose(out);

    out = fopen("contr.con","w");
    fprintf(out,"%.8E \t %.8E %.8E %.8E \t % .8E % .8E % .8E % .8E % .8E \t % .8E % .8E \t % .8E % .8E % .8E \t %.8E %.8E %.8E \n",
            time_cur, timesteps, n_act_sum, g6_calls,
            E_pot, E_kin, E_pot_ext,
            E_tot, DE_tot,
            rcm_mod, vcm_mod,
            mom[0], mom[1], mom[2],
            CPU_time_real-CPU_time_real0, CPU_time_user-CPU_time_user0, CPU_time_syst-CPU_time_syst0);
    fclose(out);

    E_tot_0 = E_tot;
    E_tot_corr_0 = E_tot_corr;
    E_tot_corr_sd_0 = E_tot_corr_sd;
}

int main(int argc, char *argv[])
{
    /* INIT the rand() !!! */
    srand(19640916);                 /* it is just my birthday :-) */

    /* Init MPI */
    MPI_Init(&argc, &argv);

    /* Define the total number of processors */
    MPI_Comm_size(MPI_COMM_WORLD, &n_proc);

    /* Define of the Rank of each processors */
    MPI_Comm_rank(MPI_COMM_WORLD, &myRank);

    /* Define the processors names */
    MPI_Get_processor_name(processor_name, &name_proc);

    /* Print the Rank and the names of processors */
    printf("Rank of the processor %03d on %s \n", myRank, processor_name);

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    /* read the input parameters to the rootRank */

    if (myRank == rootRank) {
        inp = fopen("phi-GRAPE.cfg","r");
        fscanf(inp,"%lE %lE %lE %lE %lE %lE %s", &eps, &t_end, &dt_disk, &dt_contr, &dt_bh, &eta, inp_fname);
        fclose(inp);

        if (parameters.binary_smbh_influence_sphere_output) for (i=0; i<N; i++) inf_event[i] = 0;

        /*
        eps      : Plummer softening parameter (can be even 0)
        t_end    : end time of calculation
        dt_disk  : interval of snapshot files output (0xxx.dat)
        dt_contr : interval for the energy control output (contr.dat)
        dt_bh    : interval for BH output (bh.dat & bh_neighbors.dat)
        eta      : parameter for timestep determination
        inp_data : name of the input file (data.inp)
        */

    /* Normalization... */
#ifdef NORM
        inp = fopen("phi-GRAPE.norm","r");
        fscanf(inp,"%lE %lE", &m_norm, &r_norm);
        fclose(inp);

        m_norm *= Msol;
        r_norm *= pc;

        v_norm = sqrt(G*m_norm/r_norm);
        t_norm = (r_norm/v_norm);
#endif        // NORM

        /* Read the data for EXTernal POTential */
#ifdef EXTPOT
        inp = fopen("phi-GRAPE.ext","r");

#ifdef EXTPOT_GAL
        fscanf(inp,"%lE %lE %lE", &m_bulge, &a_bulge, &b_bulge);
        fscanf(inp,"%lE %lE %lE", &m_disk, &a_disk, &b_disk);
        fscanf(inp,"%lE %lE %lE", &m_halo, &a_halo, &b_halo);

        m_bulge *= (Msol/m_norm);  a_bulge *= (kpc/r_norm);  b_bulge *= (kpc/r_norm);
        m_disk  *= (Msol/m_norm);  a_disk  *= (kpc/r_norm);  b_disk  *= (kpc/r_norm);
        m_halo  *= (Msol/m_norm);  a_halo  *= (kpc/r_norm);  b_halo  *= (kpc/r_norm);
#endif

#ifdef EXTPOT_BH
        fscanf(inp,"%lE %lE", &m_bh, &b_bh);
#endif

#ifdef EXTPOT_GAL_DEH
        fscanf(inp,"%lE %lE %lE", &m_ext, &r_ext, &g_ext);
#endif

#ifdef EXTPOT_GAL_LOG
        fscanf(inp,"%lE %lE", &v_halo, &r_halo);
        v_halo *= (km/v_norm);  r_halo *= (pc/r_norm);

        v2_halo = SQR(v_halo);
        r2_halo = SQR(r_halo);
#endif

        fclose(inp);
#endif        // EXTPOT

        /* read the global data for particles to the rootRank */

        read_data();

        /* possible coordinate & velocity limits for ALL particles !!! */

        printf("\n");
        printf("Begin the calculation of phi-GRAPE program on %03d processors\n", n_proc);
        printf("\n");
        printf("N       = %07d \t eps      = %.6E \n", N, eps);
        printf("t_beg   = %.6E \t t_end    = %.6E \n", time_cur, t_end);
        printf("dt_disk = %.6E \t dt_contr = %.6E \n", dt_disk, dt_contr);
        printf("dt_bh   = %.6E \n", dt_bh);
        printf("eta     = %.6E \n", eta);
        printf("\n");

#ifdef NORM
        printf("Normalization: \n");
        printf("\n");
        printf("m_norm = %.4E [Msol]  r_norm = %.4E [pc] \n", m_norm/Msol, r_norm/pc);
        printf("v_morm = %.4E [km/s]  t_morm = %.4E [Myr] \n", v_norm/km, t_norm/Myr);
        printf("\n");
#endif

#ifdef EXTPOT
        printf("External Potential: \n");
        printf("\n");

#ifdef EXTPOT_GAL
        printf("m_bulge = %.4E [Msol]  a_bulge = %.4E  b_bulge = %.4E [kpc] \n", m_bulge*m_norm/Msol, a_bulge*r_norm/kpc, b_bulge*r_norm/kpc);
        printf("m_disk  = %.4E [Msol]  a_disk  = %.4E  b_disk  = %.4E [kpc] \n", m_disk*m_norm/Msol,  a_disk*r_norm/kpc,  b_disk*r_norm/kpc);
        printf("m_halo  = %.4E [Msol]  a_halo  = %.4E  b_halo  = %.4E [kpc] \n", m_halo*m_norm/Msol,  a_halo*r_norm/kpc,  b_halo*r_norm/kpc);
#endif

#ifdef EXTPOT_BH
        printf("m_bh = %.4E  b_bh = %.4E \n", m_bh, b_bh);
#endif

#ifdef EXTPOT_GAL_DEH
        printf("m_ext  = %.6E r_ext  = %.6E \t g_ext = %.3E \n", m_ext, r_ext, g_ext);
#endif

#ifdef EXTPOT_GAL_LOG
        printf("v_halo = %.6E r_halo = %.6E \n", v_halo, r_halo);
#endif

        printf("\n");
#endif

        fflush(stdout);

        if ((diskstep == 0) && (time_cur == 0.0)) {
            out = fopen("contr.dat", "w");
            fclose(out);

#ifdef TIMING
            out = fopen("timing.dat", "w");
            fclose(out);
#endif

            if (parameters.live_smbh_output && (parameters.live_smbh_count > 0)) {
                out = fopen("bh.dat", "w");
                fclose(out);
            }
            if ((parameters.live_smbh_neighbor_output) && (parameters.live_smbh_count > 0)) {
                out = fopen("bh_neighbors.dat", "w");
                fclose(out);
            }
            if (parameters.binary_smbh_influence_sphere_output) {
                out = fopen("bbh_inf.dat", "w");
                fclose(out);
            }

        } else {   // if (diskstep == 0)

        }        // if (diskstep == 0)

        get_CPU_time(&CPU_time_real0, &CPU_time_user0, &CPU_time_syst0);

    } /* if (myRank == rootRank) */

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    /* Broadcast all useful values to all processors... */
    MPI_Bcast(&N,        1, MPI_INT,    rootRank, MPI_COMM_WORLD);
    MPI_Bcast(&eps,      1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(&eta,      1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(&t_end,    1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(&dt_disk,  1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(&dt_contr, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(&dt_bh,    1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(&time_cur, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);

#ifdef NORM
    MPI_Bcast(&m_norm, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(&r_norm, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(&v_norm, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(&t_norm, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
#endif        // NORM

#ifdef EXTPOT

#ifdef EXTPOT_GAL
    MPI_Bcast(&m_bulge, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(&a_bulge, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(&b_bulge, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);

    MPI_Bcast(&m_disk, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(&a_disk, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(&b_disk, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);

    MPI_Bcast(&m_halo, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(&a_halo, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(&b_halo, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);

#endif

#ifdef EXTPOT_BH
    MPI_Bcast(&m_bh, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(&b_bh, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
#endif

#ifdef EXTPOT_GAL_DEH
    MPI_Bcast(&m_ext, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(&r_ext, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(&g_ext, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
#endif

#ifdef EXTPOT_GAL_LOG
    MPI_Bcast(&v_halo, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(&r_halo, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);

    MPI_Bcast(&v2_halo, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(&r2_halo, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
#endif

#endif        // EXTPOT

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    eta_s  = eta/ETA_S_CORR;
    eta_bh = eta/ETA_BH_CORR;

    eps2 = SQR(eps);

    dt_min = 1.0*pow(2.0, DTMINPOWER);
    dt_max = 1.0*pow(2.0, DTMAXPOWER);

    if (dt_disk == dt_contr)
        dt_max = dt_contr;
    else
        dt_max = MIN(dt_disk, dt_contr);

    if (dt_max > 1.0) dt_max = 1.0;

    t_disk  = dt_disk*(1.0+floor(time_cur/dt_disk));
    t_contr = dt_contr*(1.0+floor(time_cur/dt_contr));
    t_bh    = dt_bh*(1.0+floor(time_cur/dt_bh));

    if (myRank == rootRank) {
        printf("t_disk = %.6E   t_contr = %.6E   t_bh = %.6E \n", t_disk, t_contr, t_bh);
        printf("\n");
        fflush(stdout);
    } /* if (myRank == rootRank) */

    for (i=0; i<N; i++) {
        t[i] = time_cur;
        dt[i] = dt_min;
    }

    n_loc = N/n_proc;

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    /* Broadcast the values of all particles to all processors... */
    MPI_Bcast(ind, N, MPI_INT,    rootRank, MPI_COMM_WORLD);
    MPI_Bcast(m,   N, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(x, 3*N, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Bcast(v, 3*N, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    /* Scatter the "local" vectors from "global" */
    MPI_Scatter(ind, n_loc, MPI_INT,    ind_loc, n_loc, MPI_INT,    rootRank, MPI_COMM_WORLD);
    MPI_Scatter(m,   n_loc, MPI_DOUBLE, m_loc,   n_loc, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Scatter(t,   n_loc, MPI_DOUBLE, t_loc,   n_loc, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Scatter(dt,  n_loc, MPI_DOUBLE, dt_loc,  n_loc, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Scatter(x, 3*n_loc, MPI_DOUBLE, x_loc, 3*n_loc, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Scatter(v, 3*n_loc, MPI_DOUBLE, v_loc, 3*n_loc, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    /* init the local GRAPE's */

#ifdef MPI_OVERRIDE
    numGPU = 1;
    clusterid = myRank % numGPU;
#else
    MPI_Comm shmcomm;
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm);
    MPI_Comm_size(shmcomm, &numGPU);
    MPI_Comm_rank(shmcomm, &clusterid);
#endif
    printf("Rank of the processor %03d : Number of GPUs %01d : Cluster ID %01d \n", myRank, numGPU, clusterid);
    fflush(stdout);

    g6_open(clusterid);
    npipe = g6_npipes();
    g6_set_tunit(new_tunit);
    g6_set_xunit(new_xunit);

#ifdef ETICS
    grapite_read_particle_tags(N, "grapite.mask", myRank, n_loc);
    grapite_set_dt_exp(dt_exp);
    dt_exp = time_cur; // if we don't have a binary restart then we don't remember the coefficients, and we need to calculate them now.
    grapite_set_t_exp(t_exp);
#endif

    /* initial load the particles to the local GRAPE's */

    nj=n_loc;

    /* load the nj particles to the G6 */

    for (j=0; j<nj; j++) {
        a2by18 = {0, 0, 0};
        a1by6  = {0, 0, 0};
        aby2   = {0, 0, 0};
        g6_set_j_particle(clusterid, j, ind_loc[j], t_loc[j], dt_loc[j], m_loc[j], a2by18, a1by6, aby2, v_loc[j], x_loc[j]);

    } /* j */

#ifdef ETICS
    double etics_length_scale;
    if (myRank == rootRank) etics_length_scale = grapite_get_length_scale(N, m, x, v); // We don't want all ranks to do it, because they need to write a file and might confuse each other
    MPI_Bcast(&etics_length_scale, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    grapite_set_length_scale(etics_length_scale);
#endif

#ifdef ETICS_CEP
    // First time only: get the CEP index
    grapite_cep_index = grapite_get_cep_index();

    // First calculate the DC
    grapite_calc_center(N, m, x, v, xcm, vcm, xdc, vdc);

    // Now copy it to the global particle list
    memcpy(x[grapite_cep_index], xdc, 3*sizeof(double));
    memcpy(v[grapite_cep_index], vdc, 3*sizeof(double));

    // Now copy it to the local particle list for tha appropriate rank
    if (myRank == grapite_cep_index/n_loc) {
        memcpy(x_loc[grapite_cep_index - myRank*n_loc], xdc, 3*sizeof(double));
        memcpy(v_loc[grapite_cep_index - myRank*n_loc], vdc, 3*sizeof(double));
    }

    double zeros[3] = {0., 0., 0.}; // We haven't calculated the force yet!
    grapite_update_cep(time_cur, xdc, vdc, zeros, zeros);
#endif

    /* define the all particles as a active on all the processors for the first time grav calc. */

    n_act = N;

    for (i=0; i<n_act; i++) {
        ind_act[i]  = ind[i];
        iii         = ind_act[i];
        m_act[i]    = m[iii];

        x_act[i] = x[iii];
        v_act[i] = v[iii];

        t_act[i]  = t[iii];
        dt_act[i] = dt[iii];

    } /* i */

    // NOTE this is where calc_self_grav_zero() used to be. Hopefully the copying from *_act to *_act_new is just a temporary measure.
    std::copy(&x_act[0][0], &x_act[0][0]+3*N, &x_act_new[0][0]);
    std::copy(&v_act[0][0], &v_act[0][0]+3*N, &v_act_new[0][0]);
    calc_self_grav(time_cur);

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    /* Reduce the "global" vectors from "local" on all processors) */
    MPI_Allreduce(pot_act_tmp,  pot,    n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    MPI_Allreduce(a_act_tmp,    a,    3*n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    MPI_Allreduce(adot_act_tmp, adot, 3*n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    if (parameters.live_smbh_count == 2) {
        i_bh1 = 0;
        i_bh2 = 1;

        if (parameters.live_smbh_custom_eps >= 0) {

            m_bh1 = m[i_bh1];
            m_bh2 = m[i_bh2];

            for (k=0;k<3;k++) {
                x_bh1[k] = x[i_bh1][k];
                v_bh1[k] = v[i_bh1][k];

                x_bh2[k] = x[i_bh2][k];
                v_bh2[k] = v[i_bh2][k];
            }

            // calculate and "minus" the BH <-> BH _softened_ pot, acc & jerk

            tmp_i = calc_force_n_BH(m_bh1, x_bh1, v_bh1,
                                    m_bh2, x_bh2, v_bh2,
                                    eps,
                                    &pot_bh1, a_bh1, adot_bh1,
                                    &pot_bh2, a_bh2, adot_bh2);

            pot[i_bh1] -= pot_bh1;
            pot[i_bh2] -= pot_bh2;

            for (k=0;k<3;k++) {
                a[i_bh1][k] -= a_bh1[k];
                a[i_bh2][k] -= a_bh2[k];

                adot[i_bh1][k] -= adot_bh1[k];
                adot[i_bh2][k] -= adot_bh2[k];
            }

            // calculate and "plus" the new BH <-> BH _unsoftened_ pot, acc, jerk

            tmp_i = calc_force_n_BH(m_bh1, x_bh1, v_bh1,
                                    m_bh2, x_bh2, v_bh2,
                                    parameters.live_smbh_custom_eps,
                                    &pot_bh1, a_bh1, adot_bh1,
                                    &pot_bh2, a_bh2, adot_bh2);

            pot[i_bh1] += pot_bh1;
            pot[i_bh2] += pot_bh2;

            for (k=0;k<3;k++) {
                a[i_bh1][k] += a_bh1[k];
                a[i_bh2][k] += a_bh2[k];

                adot[i_bh1][k] += adot_bh1[k];
                adot[i_bh2][k] += adot_bh2[k];
            }
        }

        if (parameters.binary_smbh_pn) {

            // calculate and "plus" the new BH <-> BH : PN1, PN2, PN2.5, PN3, PN3.5 : acc, jerk

            dt_bh_tmp = dt[0];

            tmp_i = calc_force_pn_BH(m_bh1, x_bh1, v_bh1, s_bh1,
                                    m_bh2, x_bh2, v_bh2, s_bh2,
                                    C_NB,  dt_bh_tmp, usedOrNot,
                                    (double(*)[3])a_pn1, (double(*)[3])adot_pn1,
                                    (double(*)[3])a_pn2, (double(*)[3])adot_pn2);

            for (k=0;k<3;k++) {
                a[i_bh1][k] += a_pn1[1][k] + a_pn1[2][k] + a_pn1[3][k] + a_pn1[4][k] + a_pn1[5][k] + a_pn1[6][k];
                a[i_bh2][k] += a_pn2[1][k] + a_pn2[2][k] + a_pn2[3][k] + a_pn2[4][k] + a_pn2[5][k] + a_pn2[6][k];

                adot[i_bh1][k] += adot_pn1[1][k] + adot_pn1[2][k] + adot_pn1[3][k] + adot_pn1[4][k] + adot_pn1[5][k] + adot_pn1[6][k];
                adot[i_bh2][k] += adot_pn2[1][k] + adot_pn2[2][k] + adot_pn2[3][k] + adot_pn2[4][k] + adot_pn2[5][k] + adot_pn2[6][k];
            }

            if (myRank == rootRank) {
                if (tmp_i == 505) {
                    printf("PN RSDIST: %.8E \t %.8E \n", timesteps, time_cur);
                    fflush(stdout);
                    exit(-1);
                }
            }
        }
    }

#ifdef EXTPOT
    calc_ext_grav_zero();
#endif

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    /* Energy control... */
    if (myRank == rootRank) {
        energy_contr();
    } /* if (myRank == rootRank) */

#ifdef ETICS_DUMP
    if (diskstep==0) {
        sprintf(out_fname, "coeffs.%06d.%02d.dat", 0, myRank);
        grapite_dump(out_fname, 2);
    }
#endif

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    /* Scatter the "local" vectors from "global" */
    MPI_Scatter(pot,    n_loc, MPI_DOUBLE, pot_loc,    n_loc, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Scatter(a,    3*n_loc, MPI_DOUBLE, a_loc,    3*n_loc, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    MPI_Scatter(adot, 3*n_loc, MPI_DOUBLE, adot_loc, 3*n_loc, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);

#ifdef ETICS_CEP
    // First calculate the DC
    grapite_calc_center(N, m, x, v, xcm, vcm, xdc, vdc);

    // Now copy it to the global particle list
    memcpy(x[grapite_cep_index], xdc, 3*sizeof(double));
    memcpy(v[grapite_cep_index], vdc, 3*sizeof(double));

    // Now copy it to the local particle list for tha appropriate rank
    if (myRank == grapite_cep_index/n_loc) {
        memcpy(x_loc[grapite_cep_index - myRank*n_loc], xdc, 3*sizeof(double));
        memcpy(v_loc[grapite_cep_index - myRank*n_loc], vdc, 3*sizeof(double));
    }

    grapite_update_cep(time_cur, xdc, vdc, a[grapite_cep_index], adot[grapite_cep_index]);
#endif

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    /* Define initial timestep for all particles on all nodes */

    for (i=0; i<N; i++) {
        a2_mod = a[i][0]*a[i][0]+a[i][1]*a[i][1]+a[i][2]*a[i][2];
        adot2_mod = adot[i][0]*adot[i][0]+adot[i][1]*adot[i][1]+adot[i][2]*adot[i][2];

        if (adot2_mod == 0)
            dt_tmp = eta_s;
        else
            dt_tmp = eta_s*sqrt(a2_mod/adot2_mod);

        power = log(dt_tmp)/log(2.0) - 1;

        dt_tmp = pow(2.0, (double)power);

        if (dt_tmp < dt_min) dt_tmp = dt_min;
        if (dt_tmp > dt_max) dt_tmp = dt_max;

        dt[i] = dt_tmp;

        if (parameters.dt_min_warning && (myRank == 0)) {
            if (dt[i] == dt_min) {
                printf("!!! Warning0: dt = dt_min = %.6E \t ind = %07d \n", dt[i], ind[i]);
                fflush(stdout);
            }
        }

    } /* i */

    if (parameters.live_smbh_count > 0) {
        double min_dt = *std::min_element(dt, dt+N);
        for (int i=0; i<parameters.live_smbh_count; i++) dt[i] = min_dt;
    }

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    /* Scatter the "local" vectors from "global" */
    MPI_Scatter(dt,  n_loc, MPI_DOUBLE, dt_loc,  n_loc, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    /* load the new values for particles to the local GRAPE's */

    nj=n_loc;

    /* load the nj particles to the G6 */

    for (j=0; j<nj; j++) {
        a2by18 = {0, 0, 0};
        a1by6 = adot_loc[j]*(1./6.);
        aby2 = a_loc[j]*0.5;
        g6_set_j_particle(clusterid, j, ind_loc[j], t_loc[j], dt_loc[j], m_loc[j], a2by18, a1by6, aby2, v_loc[j], x_loc[j]);
    } /* j */

    if (myRank == rootRank) {

        /* Write BH data... */
        if (parameters.live_smbh_output) write_bh_data();

        /* Write BH NB data... */
        if (parameters.live_smbh_neighbor_output) write_bh_nb_data();

    } /* if (myRank == rootRank) */

    /* Get the Starting time on rootRank */

    if (myRank == rootRank) {
        get_CPU_time(&CPU_time_real0, &CPU_time_user0, &CPU_time_syst0);
        get_CPU_time(&CPU_time_real,  &CPU_time_user,  &CPU_time_syst);
        tmp_cpu = CPU_time_real-CPU_time_real0;
    } /* if (myRank == rootRank) */

    timesteps = 0.0;
    n_act_sum = 0.0;

    for (i=1; i<N+1; i++) {
        n_act_distr[i-1] = 0.0;
    }

    g6_calls = 0.0;

#ifdef TIMING
    DT_TOT      = 0.0;

    DT_ACT_DEF1 = 0.0;
    DT_ACT_DEF2 = 0.0;
    DT_ACT_DEF3 = 0.0;
    DT_ACT_PRED = 0.0;
    DT_ACT_GRAV = 0.0;
    DT_EXT_GRAV = 0.0;
    DT_EXT_GMC_GRAV = 0.0;
    DT_GMC_GMC_GRAV = 0.0;
    DT_ACT_CORR = 0.0;
    DT_ACT_LOAD = 0.0;

    DT_STEVOL    = 0.0;
    DT_STARDISK  = 0.0;
    DT_STARDESTR = 0.0;

    DT_ACT_REDUCE = 0.0;
#endif

    /* The main integration loop */

    while (time_cur <= t_end) {

        /* Define the minimal time and the active particles on all the nodes (exclude the ZERO masses!!!) */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

#ifdef ACT_DEF_GRAPITE
        min_t_loc = grapite_get_minimum_time();
#else
        min_t_loc = t[0]+dt[0];

        for (j=0; j<n_loc; j++) {
            jjj = j + myRank*n_loc;
            tmp = t[jjj] + dt[jjj];
            if (tmp < min_t_loc) min_t_loc = tmp;
        }
#endif

        /* Wait to all processors to finish his works... */
        MPI_Barrier(MPI_COMM_WORLD);

        /* Reduce the "global" min_t from min_t_loc "local" on all processors) */
        MPI_Allreduce(&min_t_loc, &min_t, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);

        /* Wait to all processors to finish his works... */
        MPI_Barrier(MPI_COMM_WORLD);

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
        DT_ACT_DEF1 += (CPU_tmp_user - CPU_tmp_user0);
#endif

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

#ifdef ACT_DEF_GRAPITE
        int ind_act_loc[N_MAX], n_act_loc;
        grapite_active_search(min_t, ind_act_loc, &n_act_loc);
        if (myRank > 0)
            for (int i=0; i<n_act_loc; i++)
                ind_act_loc[i] += myRank*n_loc;
        int n_act_arr[256], displs[256]; // Assuming maximum of 256 processes... seems safe.
        MPI_Allgather(&n_act_loc, 1, MPI_INT, n_act_arr, 1, MPI_INT, MPI_COMM_WORLD);
        n_act = n_act_arr[0];
        for (int i=1; i<n_proc; i++)
            n_act += n_act_arr[i];
        displs[0] = 0;
        for (int i=1; i<n_proc; i++)
            displs[i]=displs[i-1]+n_act_arr[i-1];
        MPI_Allgatherv(ind_act_loc, n_act_loc, MPI_INT, ind_act, n_act_arr, displs, MPI_INT, MPI_COMM_WORLD);
#else
        n_act = 0;

        for (i=0; i<N; i++) {
            if (t[i]+dt[i] == min_t ) {
                ind_act[n_act] = i;
                n_act++;
            }
        } /* i */
#endif      // ACT_DEF_GRAPITE

// TODO deal with it below
#ifdef ACT_DEF_GRAPITE
#error please fix here
#endif
#if defined(ACT_DEF_GRAPITE) && (defined(ADD_BH1) || defined(ADD_BH2))
#ifdef ADD_BH1
#define ACT_DEF_GRAPITE_NUMBH 1
#else
#define ACT_DEF_GRAPITE_NUMBH 2
#endif
        int acf_def_grapite_bh_count = 0;
        for (i=0; i<n_act; i++) {
            if (ind_act[i]==0) {
                i_bh1 = i;
                acf_def_grapite_bh_count++;
        }
#ifdef ADD_BH2
            else if (ind_act[i]==1) {
                i_bh2 = i;
                acf_def_grapite_bh_count++;
            }
#endif
            if (acf_def_grapite_bh_count==ACT_DEF_GRAPITE_NUMBH) break;
        }
        if (i==n_act) {
            fprintf(stderr, "ERROR: black holes were not found in the active particle list");
            exit(1);
        }
#else
        if (parameters.live_smbh_count > 0) {
            i_bh1 = 0;
            i_bh2 = 1;
        }
#endif

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
        DT_ACT_DEF2 += (CPU_tmp_user - CPU_tmp_user0);
#endif

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif
        for (i=0; i<n_act; i++) {
            iii = ind_act[i];

            m_act[i] = m[iii];

            x_act[i] = x[iii];
            v_act[i] = v[iii];

            t_act[i]  = t[iii];
            dt_act[i] = dt[iii];

            pot_act[i] = pot[iii];

#ifdef EXTPOT
            pot_act_ext[i] = pot_ext[iii];
#endif

            a_act[i] = a[iii];
            adot_act[i] = adot[iii];

        } /* i */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
        DT_ACT_DEF3 += (CPU_tmp_user - CPU_tmp_user0);
#endif

        /* predict the active particles positions etc... on all the nodes */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

        for (i=0; i<n_act; i++) {
            dt_tmp = min_t - t_act[i];
            dt2half = 0.5*dt_tmp*dt_tmp;
            dt3over6 = (1./3.)*dt_tmp*dt2half;
            x_act_new[i] = x_act[i] + v_act[i]*dt_tmp + a_act[i]*dt2half + adot_act[i]*dt3over6;
            v_act_new[i] = v_act[i] + a_act[i]*dt_tmp + adot_act[i]*dt2half;
        } /* i */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
        DT_ACT_PRED += (CPU_tmp_user - CPU_tmp_user0);
#endif

        calc_self_grav(min_t);

        /* Reduce the "global" vectors from "local" on all the nodes */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

        MPI_Allreduce(pot_act_tmp,  pot_act_new,    n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

        MPI_Allreduce(a_act_tmp,    a_act_new,    3*n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
        MPI_Allreduce(adot_act_tmp, adot_act_new, 3*n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

        /* Wait to all processors to finish his works... */
        MPI_Barrier(MPI_COMM_WORLD);

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
        DT_ACT_REDUCE += (CPU_tmp_user - CPU_tmp_user0);
#endif

        if (parameters.live_smbh_count == 2) {
            if (parameters.live_smbh_custom_eps >= 0) {
                m_bh1 = m_act[i_bh1];
                m_bh2 = m_act[i_bh2];

                x_bh1 = x_act_new[i_bh1];
                v_bh1 = v_act_new[i_bh1];

                x_bh2 = x_act_new[i_bh2];
                v_bh2 = v_act_new[i_bh2];

                // calculate and "minus" the BH <-> BH softened pot, acc & jerk

                tmp_i = calc_force_n_BH(m_bh1, x_bh1, v_bh1,
                                        m_bh2, x_bh2, v_bh2,
                                        eps,
                                        &pot_bh1, a_bh1, adot_bh1,
                                        &pot_bh2, a_bh2, adot_bh2);

                pot_act_new[i_bh1] -= pot_bh1;
                pot_act_new[i_bh2] -= pot_bh2;

                a_act_new[i_bh1] -= a_bh1;
                a_act_new[i_bh2] -= a_bh2;

                adot_act_new[i_bh1] -= adot_bh1;
                adot_act_new[i_bh2] -= adot_bh2;

                // calculate and "plus" the new BH <-> BH unsoftened pot, acc, jerk

                tmp_i = calc_force_n_BH(m_bh1, x_bh1, v_bh1,
                                        m_bh2, x_bh2, v_bh2,
                                        parameters.live_smbh_custom_eps,
                                        &pot_bh1, a_bh1, adot_bh1,
                                        &pot_bh2, a_bh2, adot_bh2);

                pot_act_new[i_bh1] += pot_bh1;
                pot_act_new[i_bh2] += pot_bh2;

                a_act_new[i_bh1] += a_bh1;
                a_act_new[i_bh2] += a_bh2;

                adot_act_new[i_bh1] += adot_bh1;
                adot_act_new[i_bh2] += adot_bh2;
            }

            if (parameters.binary_smbh_pn) {
                // calculate and "plus" the new BH <-> BH : PN1, PN2, PN2.5, PN3, PN3.5 : acc, jerk

                dt_bh_tmp = dt[0];

                tmp_i = calc_force_pn_BH(m_bh1, x_bh1, v_bh1, s_bh1,
                                        m_bh2, x_bh2, v_bh2, s_bh2,
                                        C_NB,  dt_bh_tmp, usedOrNot,
                                        (double(*)[3])a_pn1, (double(*)[3])adot_pn1,
                                        (double(*)[3])a_pn2, (double(*)[3])adot_pn2);

                a_act_new[i_bh1] += a_pn1[1] + a_pn1[2] + a_pn1[3] + a_pn1[4] + a_pn1[5] + a_pn1[6];
                a_act_new[i_bh2] += a_pn2[1] + a_pn2[2] + a_pn2[3] + a_pn2[4] + a_pn2[5] + a_pn2[6];

                adot_act_new[i_bh1] += adot_pn1[1] + adot_pn1[2] + adot_pn1[3] + adot_pn1[4] + adot_pn1[5] + adot_pn1[6];
                adot_act_new[i_bh2] += adot_pn2[1] + adot_pn2[2] + adot_pn2[3] + adot_pn2[4] + adot_pn2[5] + adot_pn2[6];

                if (myRank == rootRank) {
                    if (tmp_i == 505) {
                        printf("PN RSDIST: TS = %.8E \t t  = %.8E \n", timesteps, time_cur);
                        fflush(stdout);
                        exit(-1);
                    }
                }
            }
        }

#ifdef EXTPOT
        calc_ext_grav();
#endif

        /* correct the active particles positions etc... on all the nodes */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

        for (i=0; i<n_act; i++) {
            dt_tmp = min_t - t_act[i];

            dt3over6 = dt_tmp*dt_tmp*dt_tmp/6.0;
            dt4over24 = dt3over6*dt_tmp/4.0;
            dt5over120 = dt4over24*dt_tmp/5.0;

            dtinv = 1.0/dt_tmp;
            dt2inv = dtinv*dtinv;
            dt3inv = dt2inv*dtinv;

            double3 a0mia1 = a_act[i]-a_act_new[i];
            double3 ad04plad12 = 4.0*adot_act[i] + 2.0*adot_act_new[i];
            double3 ad0plad1 = adot_act[i] + adot_act_new[i];

            a2 = -6.0*a0mia1*dt2inv - ad04plad12*dtinv;
            a3 = 12.0*a0mia1*dt3inv + 6.0*ad0plad1*dt2inv;

            x_act_new[i] += dt4over24*a2 + dt5over120*a3;
            v_act_new[i] += dt3over6*a2 + dt4over24*a3;

            a1abs = sqrt(a_act_new[i][0]*a_act_new[i][0]+a_act_new[i][1]*a_act_new[i][1]+a_act_new[i][2]*a_act_new[i][2]);
            adot1abs = sqrt(adot_act_new[i][0]*adot_act_new[i][0]+adot_act_new[i][1]*adot_act_new[i][1]+adot_act_new[i][2]*adot_act_new[i][2]);

            a2dot1 = a2 + dt_tmp*a3;

            a2dot1abs = a2dot1.norm();
            a3dot1abs = a3.norm();

            if ((parameters.live_smbh_count > 0) && (ind_act[i] < parameters.live_smbh_count))
                dt_new = sqrt(eta_bh*(a1abs*a2dot1abs+adot1abs*adot1abs)/(adot1abs*a3dot1abs+a2dot1abs*a2dot1abs));
            else
                dt_new = sqrt(eta*(a1abs*a2dot1abs+adot1abs*adot1abs)/(adot1abs*a3dot1abs+a2dot1abs*a2dot1abs));

            if (dt_new < dt_min) dt_tmp = dt_min;

            if ((dt_new < dt_tmp) && (dt_new > dt_min)) {
                power = log(dt_new)/log(2.0) - 1;

                dt_tmp = pow(2.0, (double)power);
            }

            if ((dt_new > 2.0*dt_tmp) && (fmod(min_t, 2.0*dt_tmp) == 0.0) && (2.0*dt_tmp <= dt_max)) {
                dt_tmp *= 2.0;
            }

            dt_act[i] = dt_tmp;

            t_act[i] = min_t;

            pot_act[i] = pot_act_new[i];

            x_act[i]    =    x_act_new[i];
            v_act[i]    =    v_act_new[i];
            a_act[i]    =    a_act_new[i];
            adot_act[i] = adot_act_new[i];

            if (parameters.dt_min_warning && (myRank == 0)) {
                if (dt_act[i] == dt_min) {
                    printf("!!! Warning1: dt_act = dt_min = %.6E \t ind_act = %07d \n", dt[i], ind_act[i]);
                    fflush(stdout);
                }
            }

        } /* i */

        /* define the min. dt over all the act. part. and set it also for the BH... */

        if (parameters.live_smbh_count > 0) {
            double min_dt = *std::min_element(dt_act, dt_act+n_act);
            if (parameters.live_smbh_count>=1) dt_act[i_bh1] = min_dt;
            if (parameters.live_smbh_count==2) dt_act[i_bh2] = min_dt;
        }

        if (parameters.binary_smbh_influence_sphere_output) {
            if (myRank == rootRank) {

                out = fopen("bbh_inf.dat", "a");

                m_bh1 = m_act[i_bh1];
                m_bh2 = m_act[i_bh2];

                x_bh1 = x_act[i_bh1];
                x_bh2 = x_act[i_bh2];

                v_bh1 = v_act[i_bh1];
                v_bh2 = v_act[i_bh2];

                x_bbhc = (m_bh1*x_bh1 + m_bh2*x_bh2)/(m_bh1 + m_bh2);
                v_bbhc = (m_bh1*v_bh1 + m_bh2*v_bh2)/(m_bh1 + m_bh2);

                DR2 = SQR(x_bh1[0] - x_bh2[0]) + SQR(x_bh1[1] - x_bh2[1]) + SQR(x_bh1[2] - x_bh2[2]);
                DV2 = SQR(v_bh1[0] - v_bh2[0]) + SQR(v_bh1[1] - v_bh2[1]) + SQR(v_bh1[2] - v_bh2[2]);

                EB = -(m_bh1 + m_bh2) / sqrt(DR2) + 0.5 * DV2;

                SEMI_a = -0.5 * (m_bh1 + m_bh2)/EB;
                SEMI_a2 = SQR(SEMI_a);

                for (i=2; i<n_act; i++) {

                    tmp_r2 = SQR(x_act[i][0] - x_bbhc[0]) + SQR(x_act[i][1] - x_bbhc[1]) + SQR(x_act[i][2] - x_bbhc[2]);

                    iii = ind_act[i];

                    if (tmp_r2 < SEMI_a2*SQR(parameters.binary_smbh_influence_radius_factor)) {

                        if (inf_event[iii] == 0) {

                            fprintf(out,"INF1 %.6E %.16E %07d %07d   %.6E % .6E % .6E % .6E % .6E % .6E % .6E   %.6E % .6E % .6E % .6E % .6E % .6E % .6E % .6E  %.6E % .6E % .6E % .6E % .6E % .6E % .6E % .6E   %.6E   %.6E % .6E % .6E % .6E % .6E % .6E % .6E % .6E  %.6E \n",
                                timesteps, time_cur, i, ind_act[i],
                                sqrt(DR2), x_bbhc[0], x_bbhc[1], x_bbhc[2], v_bbhc[0], v_bbhc[1], v_bbhc[2],
                                m_bh1, x_bh1[0], x_bh1[1], x_bh1[2], v_bh1[0], v_bh1[1], v_bh1[2], pot_act[0],
                                m_bh2, x_bh2[0], x_bh2[1], x_bh2[2], v_bh2[0], v_bh2[1], v_bh2[2], pot_act[1],
                                sqrt(tmp_r2),
                                m_act[i], x_act[i][0], x_act[i][1], x_act[i][2], v_act[i][0], v_act[i][1], v_act[i][2], pot_act[i],
                                dt_act[i]);

                        inf_event[iii] = 1;
                        }

                    } else {
                        if (inf_event[iii] == 1) {
                            fprintf(out,"INF2 %.6E %.16E %07d %07d   %.6E % .6E % .6E % .6E % .6E % .6E % .6E   %.6E % .6E % .6E % .6E % .6E % .6E % .6E % .6E  %.6E % .6E % .6E % .6E % .6E % .6E % .6E % .6E   %.6E   %.6E % .6E % .6E % .6E % .6E % .6E % .6E % .6E  %.6E \n",
                                timesteps, time_cur, i, ind_act[i],
                                sqrt(DR2), x_bbhc[0], x_bbhc[1], x_bbhc[2], v_bbhc[0], v_bbhc[1], v_bbhc[2],
                                m_bh1, x_bh1[0], x_bh1[1], x_bh1[2], v_bh1[0], v_bh1[1], v_bh1[2], pot_act[0],
                                m_bh2, x_bh2[0], x_bh2[1], x_bh2[2], v_bh2[0], v_bh2[1], v_bh2[2], pot_act[1],
                                sqrt(tmp_r2),
                                m_act[i], x_act[i][0], x_act[i][1], x_act[i][2], v_act[i][0], v_act[i][1], v_act[i][2], pot_act[i],
                                dt_act[i]);
                        }

                        inf_event[iii] = 0;

                    } /* if (tmp_r2 < DR2*R_INF2) */

                } /* i */

                fclose(out);

            } /* if (myRank == rootRank) */
        }

        /* Return back the new coordinates + etc... of active part. to the global data... */

        for (i=0; i<n_act; i++) {

            iii = ind_act[i];

            m[iii] = m_act[i];

            x[iii] = x_act[i];
            v[iii] = v_act[i];

            t[iii] = t_act[i];
            dt[iii] = dt_act[i];

            pot[iii] = pot_act[i];

#ifdef EXTPOT
            pot_ext[iii] = pot_act_ext[i];
#endif

            a[iii] = a_act[i];
            adot[iii] = adot_act[i];
        } /* i */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
        DT_ACT_CORR += (CPU_tmp_user - CPU_tmp_user0);
#endif

        /* load the new values for active particles to the local GRAPE's */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
#endif

        for (j=0; j<n_act; j++) {
#ifdef ETICS_CEP
            if (ind_act[j] == grapite_cep_index) grapite_update_cep(t_act[j], x_act[j], v_act[j], a_act[j], adot_act[j]); // All ranks should do it.
#endif
            cur_rank = ind_act[j]/n_loc;

            if (myRank == cur_rank) {

                jjj = ind_act[j] - myRank*n_loc;

                a2by18 = {0, 0, 0};
                a1by6 = adot_act[j]*(1./6.);
                aby2 = a_act[j]*0.5;

                g6_set_j_particle(clusterid, jjj, ind_act[j], t_act[j], dt_act[j], m_act[j], a2by18, a1by6, aby2, v_act[j], x_act[j]);

            } /* if (myRank == cur_rank) */

        } /* j */

#ifdef TIMING
        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
        DT_ACT_LOAD += (CPU_tmp_user - CPU_tmp_user0);
#endif

        /* Current time set to min_t */

        time_cur = min_t;

        timesteps += 1.0;
        n_act_sum += n_act;
        n_act_distr[n_act-1]++;

        if (time_cur >= t_bh) {
            if (myRank == rootRank) {
                /* Write BH data... */
                if (parameters.live_smbh_output) write_bh_data();

                /* Write BH NB data... */
                if (parameters.live_smbh_neighbor_output) write_bh_nb_data();

            } /* if (myRank == rootRank) */

            t_bh += dt_bh;
        } /* if (time_cur >= t_bh) */

        if (time_cur >= t_contr) {
            if (myRank == rootRank) {

                energy_contr();

                /* write cont data */

                write_cont_data();

                /* possible OUT for timing !!! */

#ifdef TIMING
                out = fopen("timing.dat", "a");

                DT_TOT = DT_ACT_DEF1 + DT_ACT_DEF2 + DT_ACT_DEF3 + DT_ACT_PRED +
                         DT_ACT_GRAV + DT_EXT_GRAV + DT_GMC_GRAV +
                         DT_GMC_GMC_GRAV + DT_EXT_GMC_GRAV +
                         DT_ACT_CORR + DT_ACT_LOAD +
                         DT_STEVOL + DT_STARDISK + DT_STARDESTR +
                         DT_ACT_REDUCE;

                fprintf(out,"%.8E \t %.6E \t %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f \t %.3f \t %.8E %.8E %.8E \t %.8E %.8E %.8E \n",
                        time_cur, DT_TOT,
                        100.0*DT_ACT_DEF1/DT_TOT, 100.0*DT_ACT_DEF2/DT_TOT, 100.0*DT_ACT_DEF3/DT_TOT, 100.0*DT_ACT_PRED/DT_TOT,
                        100.0*DT_ACT_GRAV/DT_TOT, 100.0*DT_EXT_GRAV/DT_TOT, 100.0*DT_GMC_GRAV/DT_TOT,
                        100.0*DT_GMC_GMC_GRAV/DT_TOT, 100.0*DT_EXT_GMC_GRAV/DT_TOT,
                        100.0*DT_ACT_CORR/DT_TOT, 100.0*DT_ACT_LOAD/DT_TOT,
                        100.0*DT_STEVOL/DT_TOT, 100.0*DT_STARDISK/DT_TOT, 100.0*DT_STARDESTR/DT_TOT,
                        100.0*DT_ACT_REDUCE/DT_TOT,
                        CPU_time_real-CPU_time_real0, CPU_time_user-CPU_time_user0, CPU_time_syst-CPU_time_syst0,
                        timesteps, n_act_sum, 57.0*N*n_act_sum/(CPU_time_user-CPU_time_user0)/1.0E+09);

                fclose(out);
#endif

            } /* if (myRank == rootRank) */

#ifdef ETICS_CEP
            // We are /inside/ a control step, so all particles must be synchronized; we can safely calculate their density centre. The acceleration and jerk currently in the memory are for the predicted position of the CEP, by calling grapite_calc_center we "correct" the position and velocity, but not the gravity at that point.

            // First calculate the DC
            grapite_calc_center(N, m, x, v, xcm, vcm, xdc, vdc);

            // Now copy it to the global particle list
            memcpy(x[grapite_cep_index], xdc, 3*sizeof(double));
            memcpy(v[grapite_cep_index], vdc, 3*sizeof(double));

            // Now copy it to the local particle list for tha appropriate rank
            if (myRank == grapite_cep_index/n_loc) {
                memcpy(x_loc[grapite_cep_index - myRank*n_loc], xdc, 3*sizeof(double));
                memcpy(v_loc[grapite_cep_index - myRank*n_loc], vdc, 3*sizeof(double));
            }
            grapite_update_cep(time_cur, xdc, vdc, a[grapite_cep_index], adot[grapite_cep_index]);
#endif

            t_contr += dt_contr;
        } /* if (time_cur >= t_contr) */

        if (time_cur >= t_disk) {
            if (myRank == rootRank) {
                diskstep++;
                write_snap_data();
            } /* if (myRank == rootRank) */

#ifdef ETICS_DUMP
            sprintf(out_fname, "coeffs.%06d.%02d.dat", diskstep, myRank);
            grapite_dump(out_fname, 2);
#endif

            t_disk += dt_disk;
        } /* if (time_cur >= t_disk) */

    } /* while (time_cur < t_end) */

    /* close the local GRAPEs */

    g6_close(clusterid);

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    MPI_Reduce(&g6_calls, &g6_calls_sum, 1, MPI_DOUBLE, MPI_SUM, rootRank, MPI_COMM_WORLD);

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    if (myRank == rootRank) {

        /* Write some output for the timestep annalize... */

        printf("\n");
        printf("timesteps = %.0f   Total sum of integrated part. = %.0f   g6_calls on all nodes = %.0f \n", timesteps, n_act_sum, g6_calls);
        printf("\n");
        printf("Real Speed = %.3f GFlops \n", 57.0*N*n_act_sum/(CPU_time_user-CPU_time_user0)/1.0E+09);
        fflush(stdout);

    } /* if (myRank == rootRank) */

    /* Wait to all processors to finish his works... */
    MPI_Barrier(MPI_COMM_WORLD);

    /* Finalize the MPI work */
    MPI_Finalize();

    return 0;

}