New timing, moved MPI reduction into function call, started converting some of the pointers into std::vectors
parent 1a438449a8
commit 329dd2ca4d
5 changed files with 88 additions and 312 deletions
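Note: most of the pointer-to-vector changes in the hunks below follow one pattern: an array that used to be allocated with new[] becomes a std::vector, and .data() is passed wherever a C-style interface (GRAPE/SAPPORO, MPI, HDF5) still expects a raw pointer. A minimal, self-contained sketch of that pattern (legacy_sum and the values here are illustrative only, not from this repository):

#include <cstdio>
#include <numeric>
#include <vector>

// Stand-in for a C-style interface (MPI, GRAPE, HDF5) that expects raw pointers.
void legacy_sum(const double *values, int n, double *result)
{
    *result = 0.0;
    for (int i = 0; i < n; i++) *result += values[i];
}

int main()
{
    const int N = 8;

    // Before: double *pot = new double[N]; ...; delete[] pot;
    // After:  the vector owns the storage and releases it automatically.
    std::vector<double> pot(N);
    std::iota(pot.begin(), pot.end(), 1.0);   // fill with 1, 2, ..., N

    double total = 0.0;
    legacy_sum(pot.data(), static_cast<int>(pot.size()), &total); // .data() bridges to the C API
    std::printf("total = %g\n", total);
    return 0;
}

Because the vector frees its own storage, pot, ind, ind_act and pot_act_new drop out of the delete[] list at the end of phigrape.cpp, and the *_act_tmp scratch arrays disappear altogether.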
@@ -99,7 +99,7 @@ void Black_hole_physics::adjust_post_newtonian(
     jrk2 += jrk2_corr;
 }
 
-void Black_hole_physics::write_bh_data(double time_cur, double m[], double3 x[], double3 v[], double pot[], double3 a[], double3 adot[], double dt[])
+void Black_hole_physics::write_bh_data(double time_cur, double m[], double3 x[], double3 v[], const std::vector<double>& pot, double3 a[], double3 adot[], double dt[])
 {
     // This function logs data on the black hole(s). It uses both external data
     // (the arguments to this function) and optionall internal data to this
@@ -195,7 +195,7 @@ void Write_bh_nb_data::operator()(double time_cur)
     fflush(out);
 }
 
-void Binary_smbh_influence_sphere_output::operator()(int ind_act[], int n_act, double timesteps, double time_cur)
+void Binary_smbh_influence_sphere_output::operator()(const std::vector<int>& ind_act, int n_act, double timesteps, double time_cur)
 {
     double m_bh1 = m[0];
     double m_bh2 = m[1];
@@ -216,13 +216,13 @@ void Binary_smbh_influence_sphere_output::operator()(int ind_act[], int n_act, d
     for (int i=0; i<n_act; i++) {
         int j_act = ind_act[i];
         if (j_act<2) continue;
-        double& pot_bh1 = pot[0];
-        double& pot_bh2 = pot[1];
-        double& m_act = m[j_act];
-        double3& x_act = x[j_act];
-        double3& v_act = v[j_act];
-        double& dt_act = dt[j_act];
-        double& pot_act = pot[j_act];
+        const double& pot_bh1 = pot[0];
+        const double& pot_bh2 = pot[1];
+        const double& m_act = m[j_act];
+        const double3& x_act = x[j_act];
+        const double3& v_act = v[j_act];
+        const double& dt_act = dt[j_act];
+        const double& pot_act = pot[j_act];
         double tmp_r2 = (x_act - x_bbhc).norm2();
         if (tmp_r2 < SEMI_a2*factor*factor) {
             if (inf_event[j_act] == 0) {
@@ -46,7 +46,7 @@ public:
         const double dt_bh, // pn_usage should be const
         double3& acc1, double3& acc2,
         double3& jrk1, double3& jrk2);
-    void write_bh_data(double time_cur, double m[], double3 x[], double3 v[], double pot[], double3 a[], double3 adot[], double dt[]);
+    void write_bh_data(double time_cur, double m[], double3 x[], double3 v[], const std::vector<double>& pot, double3 a[], double3 adot[], double dt[]);
 public: //TODO make private
     double m1, m2;
     int count;
@@ -85,7 +85,7 @@ private:
 
 class Binary_smbh_influence_sphere_output {
 public:
-    Binary_smbh_influence_sphere_output(double factor, int N, double *m, double3 *x, double3 *v, double *pot, double *dt)
+    Binary_smbh_influence_sphere_output(double factor, int N, double *m, double3 *x, double3 *v, const std::vector<double>& pot, double *dt)
     : factor(factor), m(m), x(x), v(v), pot(pot), dt(dt)
     {
         inf_event.assign(N, 0);
@@ -96,10 +96,11 @@ public:
     {
         fclose(out);
     }
-    void operator()(int ind_act[], int n_act, double timesteps, double time_cur);
+    void operator()(const std::vector<int>& ind_act, int n_act, double timesteps, double time_cur);
 private:
     double factor;
-    double *m, *pot, *dt;
+    const std::vector<double>& pot;
+    double *m, /**pot,*/ *dt;
     double3 *x, *v;
     std::vector<int> inf_event;
     FILE *out;
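Note: Binary_smbh_influence_sphere_output now keeps pot as a const std::vector<double>& member. A reference member only observes the caller's vector, so nothing is copied and the vector passed to the constructor must outlive the object. A small standalone sketch of that idiom (Potential_logger is a made-up name, not from the code):

#include <cstdio>
#include <vector>

// Hypothetical observer class mirroring the pattern above: it holds a
// reference to a vector it does not own; the vector must outlive the object.
class Potential_logger {
public:
    explicit Potential_logger(const std::vector<double>& pot) : pot(pot) {}
    void print_first() const
    {
        if (!pot.empty()) std::printf("pot[0] = %g\n", pot[0]);
    }
private:
    const std::vector<double>& pot; // reference member: no copy, read-only access
};

int main()
{
    std::vector<double> pot = {-1.5, -0.7, -0.2};
    Potential_logger logger(pot); // 'pot' outlives 'logger' in this scope
    logger.print_first();
    return 0;
}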
io.cpp (4 changes)
@@ -146,7 +146,7 @@ void h5_read(const std::string file_name, int *step_num, int *N, double *t, doub
 #endif
 }
 
-void h5_write(const std::string file_name, const int step_num, const int N, const double t, const double *m, const double3 *x, const double3 *v, const double *pot, const double3 *acc, const double3 *jrk, const int extra_mode=0, const bool use_double_precision=true)
+void h5_write(const std::string file_name, const int step_num, const int N, const double t, const double *m, const double3 *x, const double3 *v, const std::vector<double>& pot, const double3 *acc, const double3 *jrk, const int extra_mode=0, const bool use_double_precision=true)
 {
 #ifdef HAS_HDF5
     hid_t file_id, group_id, attribute_id, dataspace_id;
@@ -181,7 +181,7 @@ void h5_write(const std::string file_name, const int step_num, const int N, cons
     bool write_pot = (extra_mode ) & 1;
     bool write_acc = (extra_mode >> 1) & 1;
     bool write_jrk = (extra_mode >> 2) & 1;
-    if (write_pot) write_dataset("POT", 1, (double*)pot);
+    if (write_pot) write_dataset("POT", 1, (double*)pot.data());
     if (write_acc) write_dataset("ACC", 2, (double*)acc);
     if (write_jrk) write_dataset("JRK", 2, (double*)jrk);
io.h (2 changes)
@@ -12,5 +12,5 @@ void ascii_write(const std::string file_name, const int step_num, const int N, c
 void h5_read(const std::string file_name, int *step_num, int *N, double *t, double m[], double3 x[], double3 v[]);
 // In case the code is compiled without HDF5 support, the implementation of this function just throws an error
 
-void h5_write(const std::string file_name, const int step_num, const int N, const double t, const double *m, const double3 *x, const double3 *v, const double *pot, const double3 *acc, const double3 *jrk, const int write_mode=0, const bool use_double_precision=true);
+void h5_write(const std::string file_name, const int step_num, const int N, const double t, const double *m, const double3 *x, const double3 *v, const std::vector<double>& pot, const double3 *acc, const double3 *jrk, const int write_mode=0, const bool use_double_precision=true);
 // In case the code is compiled without HDF5 support, the implementation of this function just throws an error
phigrape.cpp (367 changes)
@@ -1,60 +1,3 @@
-/*****************************************************************************
-File Name      : "phi-GRAPE/GPU.c" // BH (1 || 2) + ACC + EJECT
-               :
-Contents       : N-body code with integration by individual block time step
-               : together with the parallel using of GRAPE6a board's.
-               :
-               : Added the GPU support via SAPPORO library.
-               :
-               : Normalization to the physical units!!!
-               :
-               : External Potential added
-               : Plummer-Kuzmin: Bulge, Disk, Halo
-               : Kharchenko+Andreas...
-               :
-               : SC extra POT for Bek SC test runs...
-               :
-               : Rebuced to the Single BH -> Plummer
-               : Andreas+Fazeel...
-               :
-               : Stellar evolution added
-               : Stellar lifetimes: Raiteri, Villata & Navarro (1996)
-               : IMS mass loss: van den Hoeg & Groenewegen (1997)
-               :
-               : STARDESTR_EXT: Tidal disruption of stars by external BH...
-               : Chingis, Denis & Maxim...
-               :
-               : STARDESTR: Tidal disruption of stars by BH...
-               : Jose, Li Shuo & Shiyan Zhong
-               :
-               : STARDISK: Drag force...
-               : Chingis, Denis & Maxim...
-               :
-               : STARDISK: variable hz = HZ*(R/R_crit) up to R_crit...
-               : Taras, Andreas...
-               :
-               : Live BH (1 || 2) + ACC + EJECT...
-               : Li Shuo & Shiyan Zhong
-               :
-               : dt_min for BH (1 || 2)...
-               :
-               : added the PN calculus for the BBH
-               : PN0, PN1, PN2, PN2.5 (coded on the base of
-               : Gabor Kupi original routine)
-               :
-               : added the "name" array...
-               :
-               : added the GMC's calculus (GMC on CPU; GMC2 on GPU)
-               : for Alexey SC runs... and also for Fazeel Zurich runs...
-               :
-               : CPU_TIMELIMIT added for the Julich MW cluster runs...
-               :
-Coded by       : Peter Berczik
-Version number : 19.04
-Last redaction : 2019.04.16 12:55
-*****************************************************************************/
-#define TIMING
-
 #define ETA_S_CORR 4.0
 #define ETA_BH_CORR 4.0
 
@@ -62,6 +5,7 @@ Last redaction : 2019.04.16 12:55
 #define DTMINPOWER -36.0
 
 #include <algorithm>
+#include <chrono>
 #include <math.h>
 #include <mpi.h>
 #include <numeric>
@@ -85,47 +29,24 @@ Last redaction : 2019.04.16 12:55
 #endif
 
 Config *config;
+//chrono::steady_clock::time_point walltime_start;
 
-// These are used in the energy control, could be static but will probably be removed in the end anyway
-double CPU_time_real0, CPU_time_user0, CPU_time_syst0;
-double CPU_time_real, CPU_time_user, CPU_time_syst;
-
-#ifdef TIMING
-// TODO clean up here
-double CPU_tmp_real0, CPU_tmp_user0, CPU_tmp_syst0;
-double CPU_tmp_real, CPU_tmp_user, CPU_tmp_syst;
-
-double DT_TOT,
-       DT_ACT_DEF1, DT_ACT_DEF2, DT_ACT_DEF3, DT_ACT_PRED,
-       DT_ACT_GRAV, DT_EXT_GRAV,
-       DT_GMC_GRAV, DT_GMC_GMC_GRAV, DT_EXT_GMC_GRAV,
-       DT_ACT_CORR, DT_ACT_LOAD,
-       DT_STEVOL, DT_STARDISK, DT_STARDESTR;
-double DT_ACT_REDUCE;
-#endif
-
-void get_CPU_time(double *time_real, double *time_user, double *time_syst)
-{
-    struct rusage xxx;
-    double sec_u, microsec_u, sec_s, microsec_s;
-    struct timeval tv;
-
-    getrusage(RUSAGE_SELF,&xxx);
-
-    sec_u = xxx.ru_utime.tv_sec;
-    sec_s = xxx.ru_stime.tv_sec;
-
-    microsec_u = xxx.ru_utime.tv_usec;
-    microsec_s = xxx.ru_stime.tv_usec;
-
-    *time_user = sec_u + microsec_u * 1.0E-06;
-    *time_syst = sec_s + microsec_s * 1.0E-06;
-
-    gettimeofday(&tv, NULL);
-    *time_real = tv.tv_sec + 1.0E-06 * tv.tv_usec;
-
-    *time_user = *time_real;
+namespace std::chrono {
+struct Timer {
+    void start()
+    {
+        t_start = steady_clock::now();
+    }
+    void stop()
+    {
+        t_stop = steady_clock::now();
+        time = duration_cast<nanoseconds>(t_stop - t_start).count()*1E-9;
+    }
+    double time; // seconds
+    steady_clock::time_point t_start, t_stop;
+};
 }
+std::chrono::Timer timer;
 
 class Calc_self_grav {
 public:
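Note: the replacement Timer measures wall-clock time with std::chrono::steady_clock instead of getrusage()/gettimeofday(). The same struct works unchanged outside of namespace std::chrono (the standard reserves std and its sub-namespaces for the implementation, so a project namespace is the safer spelling). A self-contained sketch of that variant, with util as a made-up namespace name, not the committed code:

#include <chrono>
#include <cstdio>
#include <thread>

namespace util {
struct Timer {
    void start() { t_start = std::chrono::steady_clock::now(); }
    void stop()
    {
        t_stop = std::chrono::steady_clock::now();
        time = std::chrono::duration<double>(t_stop - t_start).count(); // seconds
    }
    double time = 0.0; // elapsed seconds after stop()
    std::chrono::steady_clock::time_point t_start, t_stop;
};
}

int main()
{
    util::Timer timer;
    timer.start();
    std::this_thread::sleep_for(std::chrono::milliseconds(50)); // stand-in for work
    timer.stop();
    std::printf("elapsed = %.3f s\n", timer.time);
    return 0;
}

Usage mirrors the committed code: timer.start() at the top of main(), timer.stop() before reading timer.time.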
@@ -133,49 +54,49 @@ public:
     : g6_calls(0), n_loc(n_loc), clusterid(clusterid), npipe(npipe), eps2(eps*eps)
     {
         h2.assign(N, eps2);
+        pot_loc.resize(N);
+        acc_loc.resize(N);
+        jrk_loc.resize(N);
     }
-    void operator()(const double t, const int n_act, int ind_act[], const double3 x_act[], const double3 v_act[],
-                    double pot[], double3 acc[], double3 jrk[])
+    void operator()(const double t, const int n_act, std::vector<int>& ind_act, const double3 x_act[], const double3 v_act[],
+                    std::vector<double>& pot, double3 acc[], double3 jrk[])
     {
         g6_set_ti(clusterid, t);
         for (int i=0; i<n_act; i+=npipe) {
             int nn = npipe;
             if (n_act-i < npipe) nn = n_act - i;
             //TODO any way we can clean up this ugly casting?
-            g6calc_firsthalf(clusterid, n_loc, nn, ind_act+i, (double(*)[3])x_act+i, (double(*)[3])v_act+i, (double(*)[3])acc+i, (double(*)[3])jrk+i, pot+i, eps2, h2.data());
-            g6calc_lasthalf( clusterid, n_loc, nn, ind_act+i, (double(*)[3])x_act+i, (double(*)[3])v_act+i, eps2, h2.data(), (double(*)[3])acc+i, (double(*)[3])jrk+i, pot+i);
+            g6calc_firsthalf(clusterid, n_loc, nn, ind_act.data()+i, (double(*)[3])&x_act[i], (double(*)[3])&v_act[i], (double(*)[3])&acc_loc[i], (double(*)[3])&jrk_loc[i], &pot_loc[i], eps2, h2.data());
+            g6calc_lasthalf( clusterid, n_loc, nn, ind_act.data()+i, (double(*)[3])&x_act[i], (double(*)[3])&v_act[i], eps2, h2.data(), (double(*)[3])&acc_loc[i], (double(*)[3])&jrk_loc[i], &pot_loc[i]);
             g6_calls++;
         } /* i */
+        /* Reduce the "global" vectors from "local" on all the nodes */
+        MPI_Allreduce(pot_loc.data(), pot.data(), n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+        MPI_Allreduce(acc_loc.data(), acc, 3*n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+        MPI_Allreduce(jrk_loc.data(), jrk, 3*n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
     }
     double g6_calls;
 private:
     int n_loc, clusterid, npipe;
     double eps2;
     std::vector<double> h2;
+    std::vector<double> pot_loc; // the _loc variables are for this node only.
+    std::vector<double3> acc_loc, jrk_loc;
 };
 
 void calc_ext_grav(std::vector<External_gravity*> &external_gravity_components, int n, double3 *x, double3 *v, double *pot, double3 *acc, double3* jrk)
 // TODO should just be a class that has this pointer array as a member
 {
-#ifdef TIMING
-    get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
-#endif
 
     std::fill(pot, pot+n, 0.);
     for (auto component : external_gravity_components) {
         if (component->is_active)
             component->apply(n, x, v, pot, acc, jrk);
     }
-
-#ifdef TIMING
-    get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
-    DT_EXT_GRAV += (CPU_tmp_user - CPU_tmp_user0);
-#endif
 }
 
-void energy_contr(const double time_cur, const double timesteps, const double n_act_sum, const double g6_calls, int N, double m[], double3 x[], double3 v[], double pot[], double pot_ext[])
+void energy_contr(const double time_cur, const double timesteps, const double n_act_sum, const double g6_calls, int N, double m[], double3 x[], double3 v[], const std::vector<double>& pot, double pot_ext[])
 {
     double E_pot = 0;
     for (int i=0; i<N; i++) E_pot += m[i]*pot[i];
     E_pot *= 0.5;
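Note: with the MPI_Allreduce calls moved inside Calc_self_grav::operator(), each rank fills its *_loc buffers and the caller receives the already-reduced pot/acc/jrk, so main() no longer needs the *_act_tmp arrays. A minimal standalone sketch of that shape (compute_and_reduce and the fill values are invented; only the MPI calls themselves are real API):

#include <cstdio>
#include <mpi.h>
#include <vector>

// Each rank computes a partial result into a local buffer, then the function
// itself reduces into the caller's vector, so the caller never sees the _loc copy.
void compute_and_reduce(int rank, std::vector<double>& global)
{
    std::vector<double> local(global.size());
    for (std::size_t i = 0; i < local.size(); i++)
        local[i] = static_cast<double>(rank + 1); // stand-in for a real partial sum
    MPI_Allreduce(local.data(), global.data(), static_cast<int>(global.size()),
                  MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
}

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);
    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    std::vector<double> pot(4, 0.0);
    compute_and_reduce(rank, pot);

    if (rank == 0) std::printf("pot[0] = %g\n", pot[0]); // sum over ranks of (rank+1)
    MPI_Finalize();
    return 0;
}

Run with several ranks, each element ends up as the sum of the per-rank partial values, which is what the reduction inside the member function now delivers to the integrator.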
@@ -206,7 +127,7 @@ void energy_contr(const double time_cur, const double timesteps, const double n_
         mom[2] += m[i] * (x[i][0]* v[i][1] - x[i][1]*v[i][0]);
     }
 
-    get_CPU_time(&CPU_time_real, &CPU_time_user, &CPU_time_syst);
+    timer.stop();
 
     double E_tot = E_pot + E_kin + E_pot_ext;
 
@@ -219,18 +140,18 @@ void energy_contr(const double time_cur, const double timesteps, const double n_
     printf("%.3E %.3E % .4E %.4E % .4E % .4E % .4E %.2E\n",
            time_cur, timesteps,
            E_pot, E_kin, E_pot_ext, E_tot, DE_tot,
-           CPU_time_user-CPU_time_user0);
+           timer.time);
 
     fflush(stdout);
 
     auto out = fopen("contr.dat", "a");
-    fprintf(out,"%.8E \t %.8E %.8E %.8E \t % .8E % .8E % .8E % .8E % .8E \t % .8E % .8E \t % .8E % .8E % .8E \t %.8E %.8E %.8E \n",
+    fprintf(out,"%.8E \t %.8E %.8E %.8E \t % .8E % .8E % .8E % .8E % .8E \t % .8E % .8E \t % .8E % .8E % .8E \t %.8E\n",
            time_cur, timesteps, n_act_sum, g6_calls,
            E_pot, E_kin, E_pot_ext,
            E_tot, DE_tot,
            rcm_mod, vcm_mod,
            mom[0], mom[1], mom[2],
-           CPU_time_real-CPU_time_real0, CPU_time_user-CPU_time_user0, CPU_time_syst-CPU_time_syst0);
+           timer.time);
     fclose(out);
 
     E_tot_prev = E_tot;
@@ -330,14 +251,13 @@ inline double aarseth_step(const double eta, const double dt, const double3 a, c
     return sqrt(eta*(a1abs*a2dot1abs+adot1abs*adot1abs)/(adot1abs*a3dot1abs+a2dot1abs*a2dot1abs));
 }
 
 int main(int argc, char *argv[])
 {
+    timer.start();
 
     double timesteps=0.0, n_act_sum=0.0;
 
-    double3 xcm, vcm, xdc, vdc; // these should go away
-
-    double3 x_bbhc, v_bbhc;
 
     double3 zeros = {0, 0, 0}; // Dummy; can't really be const because of the GRAPE interface.
 
     /* INIT the rand() !!! */
@@ -365,6 +285,7 @@ int main(int argc, char *argv[])
 
     int diskstep, N;
     double time_cur;
+    // The memory for m, x, and v is allocated inside h5_read or ascii_read
     double *m;
     double3 *x, *v;
     if (is_hdf5(config->input_file_name)) {
@@ -377,18 +298,19 @@ int main(int argc, char *argv[])
     else
         ascii_read(config->input_file_name, diskstep, N, time_cur, &m, &x, &v);
 
-    int *ind = new int[N];
-    std::iota(ind, ind+N, 0);
+    std::vector<int> ind(N);
+    std::iota(begin(ind), end(ind), 0);
     double3 *a = new double3[N], *adot = new double3[N];
-    double *pot = new double[N], *pot_ext = new double[N], *t = new double[N], *dt = new double[N];
+    std::vector<double> pot(N);
+    double *pot_ext = new double[N], *t = new double[N], *dt = new double[N];
 
     /* data for active particles */
-    // x_act_new and v_act_new arrays hold the predicted position and velocity of i-particles, which is later corrected before moving into the j-particle memory. The [pot,a,adot]_act_tmp arrays hold the calculation results from each node. The [pot,a,adot]_act_new arrays hold the reduced calculation results from all nodes.
-    int n_act, *ind_act = new int[N];
-    double *pot_act_new = new double[N], *pot_act_tmp = new double[N], *pot_act_ext = new double[N];
-    double3 *x_act_new = new double3[N], *v_act_new = new double3[N],
-            *a_act_tmp = new double3[N], *adot_act_tmp = new double3[N],
+    int n_act;
+    std::vector<int> ind_act(N);
+    std::vector<double> pot_act_new(N);
+    double *pot_act_ext = new double[N];
+    double3 *x_act_new = new double3[N], *v_act_new = new double3[N],
             *a_act_new = new double3[N], *adot_act_new = new double3[N];
 
     double eps = config->eps;
     double eta = config->eta;
@@ -411,10 +333,6 @@ int main(int argc, char *argv[])
     if ((diskstep == 0) && (time_cur == 0)) {
         FILE *out = fopen("contr.dat", "w");
         fclose(out);
-#ifdef TIMING
-        out = fopen("timing.dat", "w");
-        fclose(out);
-#endif
         if (config->live_smbh_output && (config->live_smbh_count > 0)) {
             out = fopen("bh.dat", "w");
             fclose(out);
@@ -424,9 +342,6 @@ int main(int argc, char *argv[])
             fclose(out);
         }
     }
-
-    get_CPU_time(&CPU_time_real0, &CPU_time_user0, &CPU_time_syst0);
-
     } /* if (myRank == rootRank) */
 
     double normalization_mass=1, normalization_length=1, normalization_velocity=1;
@@ -531,7 +446,6 @@ int main(int argc, char *argv[])
 #endif
 
     /* load the nj particles to the G6 */
-
     for (int k=0; k<n_loc; k++) {
         int j = k + myRank*n_loc;
         g6_set_j_particle(clusterid, k, ind[j], t[j], dt[j], m[j], zeros, zeros, zeros, v[j], x[j]);
@@ -545,6 +459,7 @@ int main(int argc, char *argv[])
 
     int grapite_cep_index = grapite_get_cep_index();
     if (grapite_cep_index >= 0) {
+        double3 xcm, vcm, xdc, vdc;
         grapite_calc_center(N, m, (double(*)[3])x, (double(*)[3])v, xcm, vcm, xdc, vdc);
         x[grapite_cep_index] = xdc;
         v[grapite_cep_index] = vdc;
@@ -553,13 +468,7 @@ int main(int argc, char *argv[])
 #endif
 
     /* define the all particles as a active on all the processors for the first time grav calc. */
-    calc_self_grav(time_cur, N, ind, x, v, pot_act_tmp, a_act_tmp, adot_act_tmp);
-
-    /* Reduce the "global" vectors from "local" on all processors) */
-    // TODO why won't we do the MPI_Allreduce inside the calc_self_grav function, and get rid of these _tmp arrays?
-    MPI_Allreduce(pot_act_tmp, pot, N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    MPI_Allreduce(a_act_tmp, a, 3*N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    MPI_Allreduce(adot_act_tmp, adot, 3*N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    calc_self_grav(time_cur, N, ind, x, v, pot, a, adot);
 
     if (config->live_smbh_count == 2) {
         black_hole_physics.set_xv(x[0], x[1], v[0], v[1]);
@@ -580,6 +489,7 @@ int main(int argc, char *argv[])
     }
 
     if (grapite_cep_index >= 0) {
+        double3 xcm, vcm, xdc, vdc;
         grapite_calc_center(N, m, (double(*)[3])x, (double(*)[3])v, xcm, vcm, xdc, vdc);
         x[grapite_cep_index] = xdc;
         v[grapite_cep_index] = vdc;
@@ -588,7 +498,6 @@ int main(int argc, char *argv[])
 #endif
 
     /* Define initial timestep for all particles on all nodes */
-
     for (int j=0; j<N; j++) {
         double a2_mod = a[j].norm2();
         double adot2_mod = adot[j].norm2();
@@ -620,14 +529,12 @@ int main(int argc, char *argv[])
     }
 
     /* load the new values for particles to the local GRAPEs */
-
     for (int k=0; k<n_loc; k++) {
         int j = k + myRank*n_loc;
         g6_set_j_particle(clusterid, k, ind[j], t[j], dt[j], m[j], zeros, adot[j]*(1./6.), a[j]*0.5, v[j], x[j]);
     } /* k */
 
     if (myRank == rootRank) {
-
         /* Write BH data... */
         if (config->live_smbh_output) black_hole_physics.write_bh_data(time_cur, m, x, v, pot, a, adot, dt);
 
@@ -636,60 +543,19 @@ int main(int argc, char *argv[])
     } /* if (myRank == rootRank) */
 
-    /* Get the Starting time on rootRank */
-
-    if (myRank == rootRank) {
-        get_CPU_time(&CPU_time_real0, &CPU_time_user0, &CPU_time_syst0);
-        get_CPU_time(&CPU_time_real, &CPU_time_user, &CPU_time_syst);
-    } /* if (myRank == rootRank) */
-
     timesteps = 0.0; // Why won't those two be long long instead of double + should include the zeroth step
     n_act_sum = 0.0;
 
-#ifdef TIMING
-    DT_TOT = 0.0;
-
-    DT_ACT_DEF1 = 0.0;
-    DT_ACT_DEF2 = 0.0;
-    DT_ACT_DEF3 = 0.0;
-    DT_ACT_PRED = 0.0;
-    DT_ACT_GRAV = 0.0;
-    DT_EXT_GRAV = 0.0;
-    DT_EXT_GMC_GRAV = 0.0;
-    DT_GMC_GMC_GRAV = 0.0;
-    DT_ACT_CORR = 0.0;
-    DT_ACT_LOAD = 0.0;
-
-    DT_STEVOL = 0.0;
-    DT_STARDISK = 0.0;
-    DT_STARDESTR = 0.0;
-
-    DT_ACT_REDUCE = 0.0;
-#endif
-
     /* The main integration loop */
 
     while (time_cur <= t_end) {
 
-        /* Define the minimal time and the active particles on all the nodes (exclude the ZERO masses!!!) */
+        /* Define the minimal time and the active particles on all the nodes */
 
-#ifdef TIMING
-        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
-#endif
-
         double min_t = active_search.get_minimum_time(t, dt);
 
-#ifdef TIMING
-        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
-        DT_ACT_DEF1 += (CPU_tmp_user - CPU_tmp_user0);
-#endif
-
-#ifdef TIMING
-        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
-#endif
-
-        active_search.get_active_indices(min_t, t, dt, ind_act, n_act);
+        /* Get indices of all particles that will be active in this bunch */
+        active_search.get_active_indices(min_t, t, dt, ind_act.data(), n_act);
 
+        /* Find the BH(s) indices in the active list */
         int i_bh1=0, i_bh2=1;
 #ifdef ETICS
         int n_bh = config->live_smbh_count;
@@ -707,18 +573,7 @@ int main(int argc, char *argv[])
         }
 #endif
 
-#ifdef TIMING
-        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
-        DT_ACT_DEF2 += (CPU_tmp_user - CPU_tmp_user0);
-#endif
-
         /* predict the active particles positions etc... on all the nodes */
-
-#ifdef TIMING
-        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
-#endif
-
         for (int i=0; i<n_act; i++) {
             int j_act = ind_act[i];
             double dt = min_t - t[j_act];
@@ -728,36 +583,8 @@ int main(int argc, char *argv[])
             v_act_new[i] = v[j_act] + a[j_act]*dt + adot[j_act]*dt2half;
         } /* i */
 
-#ifdef TIMING
-        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
-        DT_ACT_PRED += (CPU_tmp_user - CPU_tmp_user0);
-#endif
-
-#ifdef TIMING
-        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
-#endif
-        calc_self_grav(min_t, n_act, ind_act, x_act_new, v_act_new,
-                       pot_act_tmp, a_act_tmp, adot_act_tmp);
-
-#ifdef TIMING
-        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
-        DT_ACT_GRAV += (CPU_tmp_user - CPU_tmp_user0);
-#endif
-
-        /* Reduce the "global" vectors from "local" on all the nodes */
-
-#ifdef TIMING
-        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
-#endif
-
-        MPI_Allreduce(pot_act_tmp, pot_act_new, n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-        MPI_Allreduce(a_act_tmp, a_act_new, 3*n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-        MPI_Allreduce(adot_act_tmp, adot_act_new, 3*n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-
-#ifdef TIMING
-        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
-        DT_ACT_REDUCE += (CPU_tmp_user - CPU_tmp_user0);
-#endif
+        /* Calculate gravity on active particles */
+        calc_self_grav(min_t, n_act, ind_act, x_act_new, v_act_new, pot_act_new, a_act_new, adot_act_new);
 
         if (config->live_smbh_count == 2) {
             black_hole_physics.set_xv(x_act_new[i_bh1], x_act_new[i_bh2], v_act_new[i_bh1], v_act_new[i_bh2]);
@@ -765,19 +592,12 @@ int main(int argc, char *argv[])
             if (config->binary_smbh_pn) black_hole_physics.adjust_post_newtonian(dt[i_bh1], a_act_new[i_bh1], a_act_new[i_bh2], adot_act_new[i_bh1], adot_act_new[i_bh2]);
         }
 
+        /* Calculate gravity on active particles due to external forces */
         calc_ext_grav(external_gravity_components, n_act, x_act_new, v_act_new, pot_act_ext, a_act_new, adot_act_new);
 
         /* correct the active particles positions etc... on all the nodes */
-
-#ifdef TIMING
-        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
-#endif
-
         double min_dt = dt_max;
         for (int i=0; i<n_act; i++) {
-            // NOTICE looks like we're doing three unrelated things in this loop: (1) correcting positions and velocities (2) calculating new steps, and (3) putting the corrected values from the _act_new back in the _act arrays.
-            // After going back to the _act arrays they don't do much before they go back to the main arrays, so this copy seems redundant (the SMBH influence sphere printout needs these values but it should be a function anyway).
-            // TODO split this loop into the three tasks it is doing, and remove the redundancy.
             int j_act = ind_act[i];
             double dt_tmp = min_t - t[j_act];
@@ -823,11 +643,9 @@ int main(int argc, char *argv[])
         } /* i */
 
         /* define the min. dt over all the act. part. and set it also for the BH... */
-
         if (config->live_smbh_count > 0) {
             if (config->live_smbh_count>=1) dt[0] = min_dt;
             if (config->live_smbh_count==2) dt[1] = min_dt;
-
         }
 
         if (config->binary_smbh_influence_sphere_output && (myRank == rootRank)) {
@@ -835,41 +653,22 @@ int main(int argc, char *argv[])
             binary_smbh_influence_sphere_output(ind_act, n_act, timesteps, time_cur);
         }
 
-#ifdef TIMING
-        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
-        DT_ACT_CORR += (CPU_tmp_user - CPU_tmp_user0);
-#endif
-
         /* load the new values for active particles to the local GRAPE's */
-
-#ifdef TIMING
-        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
-#endif
-
         for (int i=0; i<n_act; i++) {
 #ifdef ETICS
             if (ind_act[i] == grapite_cep_index) grapite_update_cep(t[grapite_cep_index], x[grapite_cep_index], v[grapite_cep_index], a[grapite_cep_index], adot[grapite_cep_index]); // All ranks should do it.
 #endif
             int cur_rank = ind_act[i]/n_loc;
 
             if (myRank == cur_rank) {
                 int j_act = ind_act[i];
                 int address = ind_act[i] - myRank*n_loc;
                 g6_set_j_particle(clusterid, address, ind_act[i], t[j_act], dt[j_act], m[j_act], zeros, adot[j_act]*(1./6.), a[j_act]*0.5, v[j_act], x[j_act]);
 
             } /* if (myRank == cur_rank) */
 
         } /* i */
 
-#ifdef TIMING
-        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
-        DT_ACT_LOAD += (CPU_tmp_user - CPU_tmp_user0);
-#endif
-
         /* Current time set to min_t */
-
         time_cur = min_t;
 
         timesteps += 1.0;
         n_act_sum += n_act;
@@ -888,43 +687,21 @@ int main(int argc, char *argv[])
 
         if (time_cur >= t_contr) {
             if (myRank == rootRank) {
 
                 energy_contr(time_cur, timesteps, n_act_sum, calc_self_grav.g6_calls, N, m, x, v, pot, pot_ext);
 
                 /* write cont data */
                 if (config->output_hdf5) h5_write("data.con", diskstep, N, time_cur, m, x, v, pot, a, adot, 0, true);
                 else ascii_write("data.con", diskstep, N, time_cur, m, x, v, 16);
 
-                /* possible OUT for timing !!! */
-#ifdef TIMING
-                FILE *out = fopen("timing.dat", "a");
-
-                DT_TOT = DT_ACT_DEF1 + DT_ACT_DEF2 + DT_ACT_DEF3 + DT_ACT_PRED +
-                         DT_ACT_GRAV + DT_EXT_GRAV + DT_GMC_GRAV +
-                         DT_GMC_GMC_GRAV + DT_EXT_GMC_GRAV +
-                         DT_ACT_CORR + DT_ACT_LOAD +
-                         DT_STEVOL + DT_STARDISK + DT_STARDESTR +
-                         DT_ACT_REDUCE;
-
-                fprintf(out,"%.8E \t %.6E \t %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f \t %.3f \t %.8E %.8E %.8E \t %.8E %.8E %.8E \n",
-                        time_cur, DT_TOT,
-                        100.0*DT_ACT_DEF1/DT_TOT, 100.0*DT_ACT_DEF2/DT_TOT, 100.0*DT_ACT_DEF3/DT_TOT, 100.0*DT_ACT_PRED/DT_TOT,
-                        100.0*DT_ACT_GRAV/DT_TOT, 100.0*DT_EXT_GRAV/DT_TOT, 100.0*DT_GMC_GRAV/DT_TOT,
-                        100.0*DT_GMC_GMC_GRAV/DT_TOT, 100.0*DT_EXT_GMC_GRAV/DT_TOT,
-                        100.0*DT_ACT_CORR/DT_TOT, 100.0*DT_ACT_LOAD/DT_TOT,
-                        100.0*DT_STEVOL/DT_TOT, 100.0*DT_STARDISK/DT_TOT, 100.0*DT_STARDESTR/DT_TOT,
-                        100.0*DT_ACT_REDUCE/DT_TOT,
-                        CPU_time_real-CPU_time_real0, CPU_time_user-CPU_time_user0, CPU_time_syst-CPU_time_syst0,
-                        timesteps, n_act_sum, 57.0*N*n_act_sum/(CPU_time_user-CPU_time_user0)/1.0E+09);
-
-                fclose(out);
-#endif
-
             } /* if (myRank == rootRank) */
 
 #ifdef ETICS
-            // We are /inside/ a control step, so all particles must be synchronized; we can safely calculate their density centre. The acceleration and jerk currently in the memory are for the predicted position of the CEP, by calling grapite_calc_center we "correct" the position and velocity, but not the gravity at that point.
+            // We are /inside/ a control step, so all particles must be
+            // synchronized; we can safely calculate their density centre. The
+            // acceleration and jerk currently in the memory are for the
+            // predicted position of the CEP, by calling grapite_calc_center we
+            // "correct" the position and velocity, but not the gravity at that
+            // point.
             if (grapite_cep_index >= 0) {
+                double3 xcm, vcm, xdc, vdc;
                 grapite_calc_center(N, m, (double(*)[3])x, (double(*)[3])v, xcm, vcm, xdc, vdc);
                 x[grapite_cep_index] = xdc;
                 v[grapite_cep_index] = vdc;
@@ -955,24 +732,22 @@ int main(int argc, char *argv[])
     } /* while (time_cur < t_end) */
 
     /* close the local GRAPEs */
+    timer.stop();
     g6_close(clusterid);
 
     double g6_calls_sum;
     MPI_Reduce(&calc_self_grav.g6_calls, &g6_calls_sum, 1, MPI_DOUBLE, MPI_SUM, rootRank, MPI_COMM_WORLD);
 
     if (myRank == rootRank) {
 
        /* Write some output for the timestep annalize... */
 
        printf("\n");
        printf("timesteps = %.0f Total sum of integrated part. = %.0f g6_calls on all nodes = %.0f \n", timesteps, n_act_sum, g6_calls_sum);
        printf("\n");
-       printf("Real Speed = %.3f GFlops \n", 57.0*N*n_act_sum/(CPU_time_user-CPU_time_user0)/1.0E+09);
+       printf("Real Speed = %.3f GFlops \n", 57.0*N*n_act_sum/(timer.time)/1.0E+09);
        fflush(stdout);
 
     } /* if (myRank == rootRank) */
 
-    delete[] m; delete[] x; delete[] v; delete[] ind; delete[] a; delete[] adot; delete[] pot; delete[] pot_ext; delete[] t; delete[] dt; delete[] ind_act; delete[] pot_act_new; delete[] pot_act_tmp; delete[] x_act_new; delete[] v_act_new; delete[] a_act_tmp; delete[] adot_act_tmp; delete[] a_act_new; delete[] adot_act_new; delete[] pot_act_ext;
+    delete config;
+    delete[] m; delete[] x; delete[] v; delete[] a; delete[] adot; delete[] pot_ext; delete[] t; delete[] dt; delete[] x_act_new; delete[] v_act_new; delete[] a_act_new; delete[] adot_act_new; delete[] pot_act_ext;
 
     /* Finalize the MPI work */
     MPI_Finalize();