diff --git a/black_holes.cpp b/black_holes.cpp
index 0db9a13..4f5ffce 100644
--- a/black_holes.cpp
+++ b/black_holes.cpp
@@ -99,7 +99,7 @@ void Black_hole_physics::adjust_post_newtonian(
     jrk2 += jrk2_corr;
 }
 
-void Black_hole_physics::write_bh_data(double time_cur, double m[], double3 x[], double3 v[], double pot[], double3 a[], double3 adot[], double dt[])
+void Black_hole_physics::write_bh_data(double time_cur, double m[], double3 x[], double3 v[], const std::vector<double>& pot, double3 a[], double3 adot[], double dt[])
 {
     // This function logs data on the black hole(s). It uses both external data
     // (the arguments to this function) and optionall internal data to this
@@ -195,7 +195,7 @@ void Write_bh_nb_data::operator()(double time_cur)
     fflush(out);
 }
 
-void Binary_smbh_influence_sphere_output::operator()(int ind_act[], int n_act, double timesteps, double time_cur)
+void Binary_smbh_influence_sphere_output::operator()(const std::vector<int>& ind_act, int n_act, double timesteps, double time_cur)
 {
     double m_bh1  = m[0];
     double m_bh2  = m[1];
@@ -216,13 +216,13 @@ void Binary_smbh_influence_sphere_output::operator()(int ind_act[], int n_act, d
     for (int i=0; i<n_act; i++) {
         int j_act = ind_act[i];
         if (j_act<2) continue;
-        double& pot_bh1 = pot[0];
-        double& pot_bh2 = pot[1];
-        double& m_act   = m[j_act];
-        double3& x_act  = x[j_act];
-        double3& v_act  = v[j_act];
-        double& dt_act  = dt[j_act];
-        double& pot_act = pot[j_act];
+        const double& pot_bh1 = pot[0];
+        const double& pot_bh2 = pot[1];
+        const double& m_act   = m[j_act];
+        const double3& x_act  = x[j_act];
+        const double3& v_act  = v[j_act];
+        const double& dt_act  = dt[j_act];
+        const double& pot_act = pot[j_act];
         double tmp_r2 = (x_act - x_bbhc).norm2();
         if (tmp_r2 < SEMI_a2*factor*factor) {
             if (inf_event[j_act] == 0) {
diff --git a/black_holes.h b/black_holes.h
index ad9788f..3d15efa 100644
--- a/black_holes.h
+++ b/black_holes.h
@@ -46,7 +46,7 @@ public:
         const double dt_bh, // pn_usage should be const
         double3& acc1, double3& acc2,
         double3& jrk1, double3& jrk2);
-    void write_bh_data(double time_cur, double m[], double3 x[], double3 v[], double pot[], double3 a[], double3 adot[], double dt[]);
+    void write_bh_data(double time_cur, double m[], double3 x[], double3 v[], const std::vector<double>& pot, double3 a[], double3 adot[], double dt[]);
 public: //TODO make private
     double m1, m2;
     int count;
@@ -85,7 +85,7 @@ private:
 
 class Binary_smbh_influence_sphere_output {
 public:
-    Binary_smbh_influence_sphere_output(double factor, int N, double *m, double3 *x, double3 *v, double *pot, double *dt)
+    Binary_smbh_influence_sphere_output(double factor, int N, double *m, double3 *x, double3 *v, const std::vector<double>& pot, double *dt)
         : factor(factor), m(m), x(x), v(v), pot(pot), dt(dt)
     {
         inf_event.assign(N, 0);
@@ -96,10 +96,11 @@ public:
     {
         fclose(out);
     }
-    void operator()(int ind_act[], int n_act, double timesteps, double time_cur);
+    void operator()(const std::vector<int>& ind_act, int n_act, double timesteps, double time_cur);
 private:
     double factor;
-    double *m, *pot, *dt;
+    const std::vector<double>& pot;
+    double *m, /**pot,*/ *dt;
     double3 *x, *v;
     std::vector<int> inf_event;
     FILE *out;
diff --git a/io.cpp b/io.cpp
index 2ca64c8..3a1b086 100644
--- a/io.cpp
+++ b/io.cpp
@@ -146,7 +146,7 @@ void h5_read(const std::string file_name, int *step_num, int *N, double *t, doub
 #endif
 }
 
-void h5_write(const std::string file_name, const int step_num, const int N, const double t, const double *m, const double3 *x, const double3 *v, const double *pot, const double3 *acc, const double3 *jrk, const int extra_mode=0, const bool use_double_precision=true)
+void h5_write(const std::string file_name, const int step_num, const int N, const double t, const double *m, const double3 *x, const double3 *v, const std::vector<double>& pot, const double3 *acc, const double3 *jrk, const int extra_mode=0, const bool use_double_precision=true)
 {
 #ifdef HAS_HDF5
     hid_t file_id, group_id, attribute_id, dataspace_id;
@@ -181,7 +181,7 @@ void h5_write(const std::string file_name, const int step_num, const int N, cons
     bool write_pot = (extra_mode     ) & 1;
     bool write_acc = (extra_mode >> 1) & 1;
     bool write_jrk = (extra_mode >> 2) & 1;
-    if (write_pot) write_dataset("POT", 1, (double*)pot);
+    if (write_pot) write_dataset("POT", 1, (double*)pot.data());
     if (write_acc) write_dataset("ACC", 2, (double*)acc);
     if (write_jrk) write_dataset("JRK", 2, (double*)jrk);
 
diff --git a/io.h b/io.h
index 770a0f5..8e8d2ad 100644
--- a/io.h
+++ b/io.h
@@ -12,5 +12,5 @@ void ascii_write(const std::string file_name, const int step_num, const int N, c
 void h5_read(const std::string file_name, int *step_num, int *N, double *t, double m[], double3 x[], double3 v[]);
 // In case the code is compiled without HDF5 support, the implementation of this function just throws an error
 
-void h5_write(const std::string file_name, const int step_num, const int N, const double t, const double *m, const double3 *x, const double3 *v, const double *pot, const double3 *acc, const double3 *jrk, const int write_mode=0, const bool use_double_precision=true);
+void h5_write(const std::string file_name, const int step_num, const int N, const double t, const double *m, const double3 *x, const double3 *v, const std::vector<double>& pot, const double3 *acc, const double3 *jrk, const int write_mode=0, const bool use_double_precision=true);
 // In case the code is compiled without HDF5 support, the implementation of this function just throws an error
diff --git a/phigrape.cpp b/phigrape.cpp
index 548468e..f8cc761 100644
--- a/phigrape.cpp
+++ b/phigrape.cpp
@@ -1,60 +1,3 @@
-/*****************************************************************************
-File Name      : "phi-GRAPE/GPU.c"              // BH (1 || 2) + ACC + EJECT
-                :
-Contents       : N-body code with integration by individual block time step
-                : together with the parallel using of GRAPE6a board's.
-                :
-                : Added the GPU support via SAPPORO library.
-                :
-                : Normalization to the physical units!!!
-                :
-                : External Potential added
-                : Plummer-Kuzmin: Bulge, Disk, Halo
-                : Kharchenko+Andreas...
-                :
-                : SC extra POT for Bek SC test runs...
-                :
-                : Rebuced to the Single BH -> Plummer
-                : Andreas+Fazeel...
-                :
-                : Stellar evolution added
-                : Stellar lifetimes: Raiteri, Villata & Navarro (1996)
-                : IMS mass loss: van den Hoeg & Groenewegen (1997)
-                :
-                : STARDESTR_EXT: Tidal disruption of stars by external BH...
-                : Chingis, Denis & Maxim...
-                :
-                : STARDESTR: Tidal disruption of stars by BH...
-                : Jose, Li Shuo & Shiyan Zhong
-                :
-                : STARDISK: Drag force...
-                : Chingis, Denis & Maxim...
-                :
-                : STARDISK: variable hz = HZ*(R/R_crit) up to R_crit...
-                : Taras, Andreas...
-                :
-                : Live BH (1 || 2) + ACC + EJECT...
-                : Li Shuo & Shiyan Zhong
-                :
-                : dt_min for BH (1 || 2)...
-                :
-                : added the PN calculus for the BBH
-                : PN0, PN1, PN2, PN2.5 (coded on the base of
-                : Gabor Kupi original routine)
-                :
-                : added the "name" array...
-                :
-                : added the GMC's calculus (GMC on CPU; GMC2 on GPU)
-                : for Alexey SC runs... and also for Fazeel Zurich runs...
-                :
-                : CPU_TIMELIMIT added for the Julich MW cluster runs...
-                :
-Coded by       : Peter Berczik
-Version number : 19.04
-Last redaction : 2019.04.16 12:55
-*****************************************************************************/
-#define TIMING
-
 #define ETA_S_CORR   4.0
 #define ETA_BH_CORR  4.0
 
@@ -62,6 +5,7 @@ Last redaction : 2019.04.16 12:55
 #define DTMINPOWER -36.0
 
 #include <algorithm>
+#include <chrono>
 #include <math.h>
 #include <mpi.h>
 #include <numeric>
@@ -85,47 +29,24 @@ Last redaction : 2019.04.16 12:55
 #endif
 
 Config *config;
+//chrono::steady_clock::time_point walltime_start;
 
-// These are used in the energy control, could be static but will probably be removed in the end anyway
-double CPU_time_real0, CPU_time_user0, CPU_time_syst0;
-double CPU_time_real,  CPU_time_user,  CPU_time_syst;
-
-#ifdef TIMING
-// TODO clean up here
-double CPU_tmp_real0, CPU_tmp_user0, CPU_tmp_syst0;
-double CPU_tmp_real,  CPU_tmp_user,  CPU_tmp_syst;
-
-double DT_TOT,
-    DT_ACT_DEF1, DT_ACT_DEF2, DT_ACT_DEF3, DT_ACT_PRED,
-    DT_ACT_GRAV, DT_EXT_GRAV,
-    DT_GMC_GRAV, DT_GMC_GMC_GRAV, DT_EXT_GMC_GRAV,
-    DT_ACT_CORR, DT_ACT_LOAD,
-    DT_STEVOL, DT_STARDISK, DT_STARDESTR;
-double DT_ACT_REDUCE;
-#endif
-
-void get_CPU_time(double *time_real, double *time_user, double *time_syst)
-{
-    struct rusage xxx;
-    double sec_u, microsec_u, sec_s, microsec_s;
-    struct timeval tv;
-
-    getrusage(RUSAGE_SELF,&xxx);
-
-    sec_u = xxx.ru_utime.tv_sec;
-    sec_s = xxx.ru_stime.tv_sec;
-
-    microsec_u = xxx.ru_utime.tv_usec;
-    microsec_s = xxx.ru_stime.tv_usec;
-
-    *time_user = sec_u + microsec_u * 1.0E-06;
-    *time_syst = sec_s + microsec_s * 1.0E-06;
-
-    gettimeofday(&tv, NULL);
-    *time_real = tv.tv_sec + 1.0E-06 * tv.tv_usec;
-
-    *time_user = *time_real;
+namespace std::chrono {
+struct Timer {
+    void start()
+    {
+        t_start = steady_clock::now();
+    }
+    void stop()
+    {
+        t_stop = steady_clock::now();
+        time = duration_cast<nanoseconds>(t_stop - t_start).count()*1E-9;
+    }
+    double time; // seconds
+    steady_clock::time_point t_start, t_stop;
+};
 }
+std::chrono::Timer timer;
 
 class Calc_self_grav {
 public:
@@ -133,49 +54,49 @@ public:
         : g6_calls(0), n_loc(n_loc), clusterid(clusterid), npipe(npipe), eps2(eps*eps)
     {
         h2.assign(N, eps2); 
+        pot_loc.resize(N);
+        acc_loc.resize(N);
+        jrk_loc.resize(N);
     }
-    void operator()(const double t, const int n_act, int ind_act[], const double3 x_act[], const double3 v_act[],
-              double pot[], double3 acc[], double3 jrk[])
+    void operator()(const double t, const int n_act, std::vector<int>& ind_act, const double3 x_act[], const double3 v_act[],
+              std::vector<double>& pot, double3 acc[], double3 jrk[])
     {
         g6_set_ti(clusterid, t);
         for (int i=0; i<n_act; i+=npipe) {
             int nn = npipe;
             if (n_act-i < npipe) nn = n_act - i;
             //TODO any way we can clean up this ugly casting?
-            g6calc_firsthalf(clusterid, n_loc, nn, ind_act+i, (double(*)[3])x_act+i, (double(*)[3])v_act+i, (double(*)[3])acc+i, (double(*)[3])jrk+i, pot+i, eps2, h2.data());
-            g6calc_lasthalf( clusterid, n_loc, nn, ind_act+i, (double(*)[3])x_act+i, (double(*)[3])v_act+i, eps2, h2.data(), (double(*)[3])acc+i, (double(*)[3])jrk+i, pot+i);
+            g6calc_firsthalf(clusterid, n_loc, nn, ind_act.data()+i, (double(*)[3])&x_act[i], (double(*)[3])&v_act[i], (double(*)[3])&acc_loc[i], (double(*)[3])&jrk_loc[i], &pot_loc[i], eps2, h2.data());
+            g6calc_lasthalf( clusterid, n_loc, nn, ind_act.data()+i, (double(*)[3])&x_act[i], (double(*)[3])&v_act[i], eps2, h2.data(), (double(*)[3])&acc_loc[i], (double(*)[3])&jrk_loc[i], &pot_loc[i]);
             g6_calls++;
         } /* i */
+        /* Reduce the "global" vectors from "local" on all the nodes */
+        MPI_Allreduce(pot_loc.data(), pot.data(), n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+        MPI_Allreduce(acc_loc.data(), acc, 3*n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+        MPI_Allreduce(jrk_loc.data(), jrk, 3*n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
     }
     double g6_calls;
 private:
     int n_loc, clusterid, npipe;
     double eps2;
     std::vector<double> h2;
+    std::vector<double> pot_loc; // the _loc variables are for this node only.
+    std::vector<double3> acc_loc, jrk_loc;
 };
 
 void calc_ext_grav(std::vector<External_gravity*> &external_gravity_components, int n, double3 *x, double3 *v, double *pot, double3 *acc, double3* jrk)
 // TODO should just be a class that has this pointer array as a member
 {
-#ifdef TIMING
-    get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
-#endif
-
     std::fill(pot, pot+n, 0.);
     for (auto component : external_gravity_components) {
         if (component->is_active)
             component->apply(n, x, v, pot, acc, jrk);
     }
-
-#ifdef TIMING
-    get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
-    DT_EXT_GRAV += (CPU_tmp_user - CPU_tmp_user0);
-#endif
 }
 
-void energy_contr(const double time_cur, const double timesteps, const double n_act_sum, const double g6_calls, int N, double m[], double3 x[], double3 v[], double pot[], double pot_ext[])
+void energy_contr(const double time_cur, const double timesteps, const double n_act_sum, const double g6_calls, int N, double m[], double3 x[], double3 v[], const std::vector<double>& pot, double pot_ext[])
 {
-     double E_pot = 0;
+    double E_pot = 0;
     for (int i=0; i<N; i++) E_pot += m[i]*pot[i];
     E_pot *= 0.5;
 
@@ -206,7 +127,7 @@ void energy_contr(const double time_cur, const double timesteps, const double n_
         mom[2] += m[i] * (x[i][0]* v[i][1] - x[i][1]*v[i][0]);
     }
 
-    get_CPU_time(&CPU_time_real, &CPU_time_user, &CPU_time_syst);
+    timer.stop();
 
     double E_tot = E_pot + E_kin + E_pot_ext;
     
@@ -219,18 +140,18 @@ void energy_contr(const double time_cur, const double timesteps, const double n_
     printf("%.3E %.3E  % .4E %.4E % .4E  % .4E % .4E  %.2E\n",
             time_cur, timesteps,
             E_pot, E_kin, E_pot_ext, E_tot, DE_tot,
-            CPU_time_user-CPU_time_user0);
+            timer.time);
 
     fflush(stdout);
 
     auto out = fopen("contr.dat", "a");
-    fprintf(out,"%.8E \t %.8E %.8E %.8E \t % .8E % .8E % .8E % .8E % .8E \t % .8E % .8E \t % .8E % .8E % .8E \t %.8E %.8E %.8E \n",
+    fprintf(out,"%.8E \t %.8E %.8E %.8E \t % .8E % .8E % .8E % .8E % .8E \t % .8E % .8E \t % .8E % .8E % .8E \t %.8E\n",
             time_cur, timesteps, n_act_sum, g6_calls,
             E_pot, E_kin, E_pot_ext,
             E_tot, DE_tot,
             rcm_mod, vcm_mod,
             mom[0], mom[1], mom[2],
-            CPU_time_real-CPU_time_real0, CPU_time_user-CPU_time_user0, CPU_time_syst-CPU_time_syst0);
+            timer.time);
     fclose(out);
 
     E_tot_prev = E_tot;
@@ -330,14 +251,13 @@ inline double aarseth_step(const double eta, const double dt, const double3 a, c
     return sqrt(eta*(a1abs*a2dot1abs+adot1abs*adot1abs)/(adot1abs*a3dot1abs+a2dot1abs*a2dot1abs));
 }
 
+
 int main(int argc, char *argv[])
 {
+    timer.start();
+
     double timesteps=0.0, n_act_sum=0.0;
 
-    double3 xcm, vcm, xdc, vdc; // these should go away
-
-    double3 x_bbhc, v_bbhc;
-
     double3 zeros = {0, 0, 0}; // Dummy; can't really be const because of the GRAPE interface.
 
     /* INIT the rand() !!! */
@@ -365,6 +285,7 @@ int main(int argc, char *argv[])
 
     int diskstep, N;
     double time_cur;
+    // The memory for m, x, and v is allocated inside h5_read or ascii_read
     double *m;
     double3 *x, *v;
     if (is_hdf5(config->input_file_name)) {
@@ -377,18 +298,19 @@ int main(int argc, char *argv[])
     else
         ascii_read(config->input_file_name, diskstep, N, time_cur, &m, &x, &v);
 
-    int *ind  = new int[N];
-    std::iota(ind, ind+N, 0);
+    std::vector<int> ind(N);
+    std::iota(begin(ind), end(ind), 0);
     double3 *a = new double3[N], *adot = new double3[N];
-    double *pot  = new double[N], *pot_ext = new double[N], *t = new double[N], *dt = new double[N];
+    std::vector<double> pot(N);
+    double *pot_ext = new double[N], *t = new double[N], *dt = new double[N];
 
     /* data for active particles */
-    // x_act_new and v_act_new arrays hold the predicted position and velocity of i-particles, which is later corrected before moving into the j-particle memory. The [pot,a,adot]_act_tmp arrays hold the calculation results from each node. The [pot,a,adot]_act_new arrays hold the reduced calculation results from all nodes.
-    int n_act, *ind_act = new int[N];
-    double *pot_act_new = new double[N], *pot_act_tmp = new double[N], *pot_act_ext = new double[N];
-    double3 *x_act_new = new double3[N], *v_act_new    = new double3[N],
-            *a_act_tmp = new double3[N], *adot_act_tmp = new double3[N],
-            *a_act_new = new double3[N], *adot_act_new = new double3[N];
+    int n_act;
+    std::vector<int> ind_act(N);
+    std::vector<double> pot_act_new(N);
+    double  *pot_act_ext = new double[N];
+    double3 *x_act_new = new double3[N],  *v_act_new    = new double3[N],
+            *a_act_new = new double3[N],  *adot_act_new = new double3[N];
 
     double eps = config->eps;
     double eta = config->eta;
@@ -411,10 +333,6 @@ int main(int argc, char *argv[])
         if ((diskstep == 0) && (time_cur == 0)) {
             FILE *out = fopen("contr.dat", "w");
             fclose(out);
-#ifdef TIMING
-            out = fopen("timing.dat", "w");
-            fclose(out);
-#endif
             if (config->live_smbh_output && (config->live_smbh_count > 0)) {
                 out = fopen("bh.dat", "w");
                 fclose(out);
@@ -424,9 +342,6 @@ int main(int argc, char *argv[])
                 fclose(out);
             }
         }
-
-        get_CPU_time(&CPU_time_real0, &CPU_time_user0, &CPU_time_syst0);
-
     } /* if (myRank == rootRank) */
 
     double normalization_mass=1, normalization_length=1, normalization_velocity=1;
@@ -531,7 +446,6 @@ int main(int argc, char *argv[])
 #endif
 
     /* load the nj particles to the G6 */
-
     for (int k=0; k<n_loc; k++) {
         int j = k + myRank*n_loc;
         g6_set_j_particle(clusterid, k, ind[j], t[j], dt[j], m[j], zeros, zeros, zeros, v[j], x[j]);
@@ -545,6 +459,7 @@ int main(int argc, char *argv[])
 
     int grapite_cep_index = grapite_get_cep_index();
     if (grapite_cep_index >= 0) {
+        double3 xcm, vcm, xdc, vdc;
         grapite_calc_center(N, m, (double(*)[3])x, (double(*)[3])v, xcm, vcm, xdc, vdc);
         x[grapite_cep_index] = xdc;
         v[grapite_cep_index] = vdc;
@@ -553,13 +468,7 @@ int main(int argc, char *argv[])
 #endif
 
     /* define the all particles as a active on all the processors for the first time grav calc. */
-    calc_self_grav(time_cur, N, ind, x, v, pot_act_tmp, a_act_tmp, adot_act_tmp);
-
-    /* Reduce the "global" vectors from "local" on all processors) */
-    // TODO why won't we do the MPI_Allreduce inside the calc_self_grav function, and get rid of these _tmp arrays?
-    MPI_Allreduce(pot_act_tmp,  pot,    N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    MPI_Allreduce(a_act_tmp,    a,    3*N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    MPI_Allreduce(adot_act_tmp, adot, 3*N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    calc_self_grav(time_cur, N, ind, x, v, pot, a, adot);
 
     if (config->live_smbh_count == 2) {
         black_hole_physics.set_xv(x[0], x[1], v[0], v[1]);
@@ -580,6 +489,7 @@ int main(int argc, char *argv[])
     }
 
     if (grapite_cep_index >= 0) {
+        double3 xcm, vcm, xdc, vdc;
         grapite_calc_center(N, m, (double(*)[3])x, (double(*)[3])v, xcm, vcm, xdc, vdc);
         x[grapite_cep_index] = xdc;
         v[grapite_cep_index] = vdc;
@@ -588,7 +498,6 @@ int main(int argc, char *argv[])
 #endif
 
     /* Define initial timestep for all particles on all nodes */
-
     for (int j=0; j<N; j++) {
         double a2_mod = a[j].norm2();
         double adot2_mod = adot[j].norm2();
@@ -620,14 +529,12 @@ int main(int argc, char *argv[])
     }
 
     /* load the new values for particles to the local GRAPEs */
-
     for (int k=0; k<n_loc; k++) {
         int j = k + myRank*n_loc;
         g6_set_j_particle(clusterid, k, ind[j], t[j], dt[j], m[j], zeros, adot[j]*(1./6.), a[j]*0.5, v[j], x[j]);
     } /* k */
 
     if (myRank == rootRank) {
-
         /* Write BH data... */
         if (config->live_smbh_output) black_hole_physics.write_bh_data(time_cur, m, x, v, pot, a, adot, dt);
 
@@ -636,60 +543,19 @@ int main(int argc, char *argv[])
 
     } /* if (myRank == rootRank) */
 
-    /* Get the Starting time on rootRank */
-
-    if (myRank == rootRank) {
-        get_CPU_time(&CPU_time_real0, &CPU_time_user0, &CPU_time_syst0);
-        get_CPU_time(&CPU_time_real,  &CPU_time_user,  &CPU_time_syst);
-    } /* if (myRank == rootRank) */
-
     timesteps = 0.0; // Why won't those two be long long instead of double + should include the zeroth step
     n_act_sum = 0.0;
 
-#ifdef TIMING
-    DT_TOT      = 0.0;
-
-    DT_ACT_DEF1 = 0.0;
-    DT_ACT_DEF2 = 0.0;
-    DT_ACT_DEF3 = 0.0;
-    DT_ACT_PRED = 0.0;
-    DT_ACT_GRAV = 0.0;
-    DT_EXT_GRAV = 0.0;
-    DT_EXT_GMC_GRAV = 0.0;
-    DT_GMC_GMC_GRAV = 0.0;
-    DT_ACT_CORR = 0.0;
-    DT_ACT_LOAD = 0.0;
-
-    DT_STEVOL    = 0.0;
-    DT_STARDISK  = 0.0;
-    DT_STARDESTR = 0.0;
-
-    DT_ACT_REDUCE = 0.0;
-#endif
-
     /* The main integration loop */
-
     while (time_cur <= t_end) {
 
-        /* Define the minimal time and the active particles on all the nodes (exclude the ZERO masses!!!) */
-
-#ifdef TIMING
-        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
-#endif
-
+        /* Define the minimal time and the active particles on all the nodes */
         double min_t = active_search.get_minimum_time(t, dt);
 
-#ifdef TIMING
-        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
-        DT_ACT_DEF1 += (CPU_tmp_user - CPU_tmp_user0);
-#endif
-
-#ifdef TIMING
-        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
-#endif
-
-        active_search.get_active_indices(min_t, t, dt, ind_act, n_act);
+        /* Get indices of all particles that will be active in this bunch */
+        active_search.get_active_indices(min_t, t, dt, ind_act.data(), n_act);
 
+        /* Find the BH(s) indices in the active list */
         int i_bh1=0, i_bh2=1;
 #ifdef ETICS
         int n_bh = config->live_smbh_count;
@@ -707,18 +573,7 @@ int main(int argc, char *argv[])
         }
 #endif
 
-
-#ifdef TIMING
-        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
-        DT_ACT_DEF2 += (CPU_tmp_user - CPU_tmp_user0);
-#endif
-
         /* predict the active particles positions etc... on all the nodes */
-
-#ifdef TIMING
-        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
-#endif
-
         for (int i=0; i<n_act; i++) {
             int j_act = ind_act[i];
             double dt = min_t - t[j_act];
@@ -728,36 +583,8 @@ int main(int argc, char *argv[])
             v_act_new[i] = v[j_act] + a[j_act]*dt + adot[j_act]*dt2half;
         } /* i */
 
-#ifdef TIMING
-        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
-        DT_ACT_PRED += (CPU_tmp_user - CPU_tmp_user0);
-#endif
-
-#ifdef TIMING
-        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
-#endif
-        calc_self_grav(min_t, n_act, ind_act, x_act_new, v_act_new,
-                               pot_act_tmp, a_act_tmp, adot_act_tmp);
-
-#ifdef TIMING
-    get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
-    DT_ACT_GRAV += (CPU_tmp_user - CPU_tmp_user0);
-#endif
-
-        /* Reduce the "global" vectors from "local" on all the nodes */
-
-#ifdef TIMING
-        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
-#endif
-
-        MPI_Allreduce(pot_act_tmp,  pot_act_new,    n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-        MPI_Allreduce(a_act_tmp,    a_act_new,    3*n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-        MPI_Allreduce(adot_act_tmp, adot_act_new, 3*n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-
-#ifdef TIMING
-        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
-        DT_ACT_REDUCE += (CPU_tmp_user - CPU_tmp_user0);
-#endif
+        /* Calculate gravity on active particles */
+        calc_self_grav(min_t, n_act, ind_act, x_act_new, v_act_new, pot_act_new, a_act_new, adot_act_new);
 
         if (config->live_smbh_count == 2) {
             black_hole_physics.set_xv(x_act_new[i_bh1], x_act_new[i_bh2], v_act_new[i_bh1], v_act_new[i_bh2]);
@@ -765,19 +592,12 @@ int main(int argc, char *argv[])
             if (config->binary_smbh_pn) black_hole_physics.adjust_post_newtonian(dt[i_bh1], a_act_new[i_bh1], a_act_new[i_bh2], adot_act_new[i_bh1], adot_act_new[i_bh2]);
         }
 
+        /* Calculate gravity on active particles due to external forces */
         calc_ext_grav(external_gravity_components, n_act, x_act_new, v_act_new, pot_act_ext, a_act_new, adot_act_new);
 
         /* correct the active particles positions etc... on all the nodes */
-
-#ifdef TIMING
-        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
-#endif
-
         double min_dt = dt_max;
         for (int i=0; i<n_act; i++) {
-            // NOTICE looks like we're doing three unrelated things in this loop: (1) correcting positions and velocities (2) calculating new steps, and (3) putting the corrected values from the _act_new back in the _act arrays.
-            // After going back to the _act arrays they don't do much before they go back to the main arrays, so this copy seems redundant (the SMBH influence sphere printout needs these values but it should be a function anyway).
-            // TODO split this loop into the three tasks it is doing, and remove the redundancy.
             int j_act = ind_act[i];
             double dt_tmp = min_t - t[j_act];
 
@@ -823,11 +643,9 @@ int main(int argc, char *argv[])
         } /* i */
 
         /* define the min. dt over all the act. part. and set it also for the BH... */
-
         if (config->live_smbh_count > 0) {
             if (config->live_smbh_count>=1) dt[0] = min_dt;
             if (config->live_smbh_count==2) dt[1] = min_dt;
-            
         }
 
         if (config->binary_smbh_influence_sphere_output && (myRank == rootRank)) {
@@ -835,41 +653,22 @@ int main(int argc, char *argv[])
             binary_smbh_influence_sphere_output(ind_act, n_act, timesteps, time_cur);
         }
 
-#ifdef TIMING
-        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
-        DT_ACT_CORR += (CPU_tmp_user - CPU_tmp_user0);
-#endif
-
         /* load the new values for active particles to the local GRAPE's */
-
-#ifdef TIMING
-        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
-#endif
-
         for (int i=0; i<n_act; i++) {
 #ifdef ETICS
             if (ind_act[i] == grapite_cep_index) grapite_update_cep(t[grapite_cep_index], x[grapite_cep_index], v[grapite_cep_index], a[grapite_cep_index], adot[grapite_cep_index]); // All ranks should do it.
 #endif
             int cur_rank = ind_act[i]/n_loc;
-
             if (myRank == cur_rank) {
                 int j_act = ind_act[i];
                 int address = ind_act[i] - myRank*n_loc;
                 g6_set_j_particle(clusterid, address, ind_act[i], t[j_act], dt[j_act], m[j_act], zeros, adot[j_act]*(1./6.), a[j_act]*0.5, v[j_act], x[j_act]);
 
             } /* if (myRank == cur_rank) */
-
         } /* i */
 
-#ifdef TIMING
-        get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
-        DT_ACT_LOAD += (CPU_tmp_user - CPU_tmp_user0);
-#endif
-
         /* Current time set to min_t */
-
         time_cur = min_t;
-
         timesteps += 1.0;
         n_act_sum += n_act;
 
@@ -888,43 +687,21 @@ int main(int argc, char *argv[])
 
         if (time_cur >= t_contr) {
             if (myRank == rootRank) {
-
                 energy_contr(time_cur, timesteps, n_act_sum, calc_self_grav.g6_calls, N, m, x, v, pot, pot_ext);
-
                 /* write cont data */
                 if (config->output_hdf5) h5_write("data.con", diskstep, N, time_cur, m, x, v, pot, a, adot, 0, true);
                 else ascii_write("data.con", diskstep, N, time_cur, m, x, v, 16);
-
-                /* possible OUT for timing !!! */
-#ifdef TIMING
-                FILE *out = fopen("timing.dat", "a");
-
-                DT_TOT = DT_ACT_DEF1 + DT_ACT_DEF2 + DT_ACT_DEF3 + DT_ACT_PRED +
-                         DT_ACT_GRAV + DT_EXT_GRAV + DT_GMC_GRAV +
-                         DT_GMC_GMC_GRAV + DT_EXT_GMC_GRAV +
-                         DT_ACT_CORR + DT_ACT_LOAD +
-                         DT_STEVOL + DT_STARDISK + DT_STARDESTR +
-                         DT_ACT_REDUCE;
-
-                fprintf(out,"%.8E \t %.6E \t %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f \t %.3f \t %.8E %.8E %.8E \t %.8E %.8E %.8E \n",
-                        time_cur, DT_TOT,
-                        100.0*DT_ACT_DEF1/DT_TOT, 100.0*DT_ACT_DEF2/DT_TOT, 100.0*DT_ACT_DEF3/DT_TOT, 100.0*DT_ACT_PRED/DT_TOT,
-                        100.0*DT_ACT_GRAV/DT_TOT, 100.0*DT_EXT_GRAV/DT_TOT, 100.0*DT_GMC_GRAV/DT_TOT,
-                        100.0*DT_GMC_GMC_GRAV/DT_TOT, 100.0*DT_EXT_GMC_GRAV/DT_TOT,
-                        100.0*DT_ACT_CORR/DT_TOT, 100.0*DT_ACT_LOAD/DT_TOT,
-                        100.0*DT_STEVOL/DT_TOT, 100.0*DT_STARDISK/DT_TOT, 100.0*DT_STARDESTR/DT_TOT,
-                        100.0*DT_ACT_REDUCE/DT_TOT,
-                        CPU_time_real-CPU_time_real0, CPU_time_user-CPU_time_user0, CPU_time_syst-CPU_time_syst0,
-                        timesteps, n_act_sum, 57.0*N*n_act_sum/(CPU_time_user-CPU_time_user0)/1.0E+09);
-
-                fclose(out);
-#endif
-
             } /* if (myRank == rootRank) */
 
 #ifdef ETICS
-            // We are /inside/ a control step, so all particles must be synchronized; we can safely calculate their density centre. The acceleration and jerk currently in the memory are for the predicted position of the CEP, by calling grapite_calc_center we "correct" the position and velocity, but not the gravity at that point.
+            // We are /inside/ a control step, so all particles must be
+            // synchronized; we can safely calculate their density centre. The
+            // acceleration and jerk currently in the memory are for the
+            // predicted position of the CEP, by calling grapite_calc_center we
+            // "correct" the position and velocity, but not the gravity at that
+            // point.
             if (grapite_cep_index >= 0) {
+                double3 xcm, vcm, xdc, vdc;
                 grapite_calc_center(N, m, (double(*)[3])x, (double(*)[3])v, xcm, vcm, xdc, vdc);
                 x[grapite_cep_index] = xdc;
                 v[grapite_cep_index] = vdc;
@@ -955,24 +732,22 @@ int main(int argc, char *argv[])
     } /* while (time_cur < t_end) */
 
     /* close the local GRAPEs */
+    timer.stop();
     g6_close(clusterid);
 
     double g6_calls_sum;
     MPI_Reduce(&calc_self_grav.g6_calls, &g6_calls_sum, 1, MPI_DOUBLE, MPI_SUM, rootRank, MPI_COMM_WORLD);
-
     if (myRank == rootRank) {
-
         /* Write some output for the timestep annalize... */
-
         printf("\n");
         printf("timesteps = %.0f   Total sum of integrated part. = %.0f   g6_calls on all nodes = %.0f \n", timesteps, n_act_sum, g6_calls_sum);
         printf("\n");
-        printf("Real Speed = %.3f GFlops \n", 57.0*N*n_act_sum/(CPU_time_user-CPU_time_user0)/1.0E+09);
+        printf("Real Speed = %.3f GFlops \n", 57.0*N*n_act_sum/(timer.time)/1.0E+09);
         fflush(stdout);
-
     } /* if (myRank == rootRank) */
 
-    delete[] m; delete[] x; delete[] v; delete[] ind; delete[] a; delete[] adot; delete[] pot; delete[] pot_ext; delete[] t; delete[] dt; delete[] ind_act; delete[] pot_act_new; delete[] pot_act_tmp; delete[] x_act_new; delete[] v_act_new; delete[] a_act_tmp; delete[] adot_act_tmp; delete[] a_act_new; delete[] adot_act_new; delete[] pot_act_ext;
+    delete config;
+    delete[] m; delete[] x; delete[] v; delete[] a; delete[] adot; delete[] pot_ext; delete[] t; delete[] dt; delete[] x_act_new; delete[] v_act_new; delete[] a_act_new; delete[] adot_act_new; delete[] pot_act_ext;
 
     /* Finalize the MPI work */
     MPI_Finalize();