diff --git a/phigrape.cpp b/phigrape.cpp
index d8bbd04..8e43da3 100644
--- a/phigrape.cpp
+++ b/phigrape.cpp
@@ -123,17 +123,7 @@ double DT_ACT_REDUCE;
 
 #endif
 
-/* some local settings for G6a board's */
 
-int clusterid, ii, nn, numGPU;
-
-int    npipe=G6_NPIPE, index_i[G6_NPIPE];
-double h2_i[G6_NPIPE], p_i[G6_NPIPE];
-double3 x_i[G6_NPIPE], v_i[G6_NPIPE],
-        a_i[G6_NPIPE], jerk_i[G6_NPIPE];
-int    new_tunit=51, new_xunit=51;
-
-double ti=0.0;
 
 /* external potential... */
 
@@ -286,43 +276,32 @@ void write_bh_nb_data(double time_cur, int N, double m[], double3 x[], double3 v
     fclose(out);
 }
 
-void calc_self_grav(double t, double eps2, double &g6_calls, int n_loc,
-                    int n_act, int ind_act[],
-                    double3 x_act_new[], double3 v_act_new[],
-                    double pot_act_tmp[],
-                    double3 a_act_tmp[],
-                    double3 adot_act_tmp[],
-                    double h2_i[])
-{
-    /* calc the new grav for the active particles */
-
-#ifdef TIMING
-    get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
-#endif
-
-    g6_set_ti(clusterid, t);
-
-    int ni = n_act; // TODO why is this needed?
-
-    /* define the local phi, a, adot for these active particles */
-
-    for (int i=0; i<ni; i+=npipe) {
-        nn = npipe;
-        if (ni-i < npipe) nn = ni - i;
-        for (ii=0; ii<nn; ii++) {
-            h2_i[ii]    = eps2; // TODO This should be a global or something
-        } /* ii */
-        //TODO any way we can clean up this ugly casting?
-        g6calc_firsthalf(clusterid, n_loc, nn, ind_act+i, (double(*)[3])x_act_new+i, (double(*)[3])v_act_new+i, (double(*)[3])a_act_tmp+i, (double(*)[3])adot_act_tmp+i, pot_act_tmp+i, eps2, h2_i);
-        g6calc_lasthalf( clusterid, n_loc, nn, ind_act+i, (double(*)[3])x_act_new+i, (double(*)[3])v_act_new+i, eps2, h2_i, (double(*)[3])a_act_tmp+i, (double(*)[3])adot_act_tmp+i, pot_act_tmp+i);
-        g6_calls++;
-    } /* i */
-
-#ifdef TIMING
-    get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
-    DT_ACT_GRAV += (CPU_tmp_user - CPU_tmp_user0);
-#endif
-}
+class Calc_self_grav {
+public:
+    Calc_self_grav(const int N, const int n_loc, const int clusterid, const int npipe, const double eps)
+        : g6_calls(0), n_loc(n_loc), clusterid(clusterid), npipe(npipe), eps2(eps*eps)
+    {
+        h2.assign(N, eps2); 
+    }
+    void operator()(const double t, const int n_act, int ind_act[], const double3 x_act[], const double3 v_act[],
+              double pot[], double3 acc[], double3 jrk[])
+    {
+        g6_set_ti(clusterid, t);
+        for (int i=0; i<n_act; i+=npipe) {
+            int nn = npipe;
+            if (n_act-i < npipe) nn = n_act - i;
+            //TODO any way we can clean up this ugly casting?
+            g6calc_firsthalf(clusterid, n_loc, nn, ind_act+i, (double(*)[3])x_act+i, (double(*)[3])v_act+i, (double(*)[3])acc+i, (double(*)[3])jrk+i, pot+i, eps2, h2.data());
+            g6calc_lasthalf( clusterid, n_loc, nn, ind_act+i, (double(*)[3])x_act+i, (double(*)[3])v_act+i, eps2, h2.data(), (double(*)[3])acc+i, (double(*)[3])jrk+i, pot+i);
+            g6_calls++;
+        } /* i */
+    }
+    double g6_calls;
+private:
+    int n_loc, clusterid, npipe;
+    double eps2;
+    std::vector<double> h2;
+};
 
 void calc_ext_grav(std::vector<External_gravity*> &external_gravity_components, int n_act, double3 *x_act_new, double3 *v_act_new, double *pot_act_ext, double3 *a_act_new, double3* adot_act_new)
 {
@@ -705,15 +684,15 @@ int main(int argc, char *argv[])
         skip_con=0, tmp_i;
 
     double dt_disk, dt_contr, t_disk=0.0, t_contr=0.0,
-           dt_bh, t_bh=0.0, dt_bh_tmp,
+           dt_bh, t_bh=0.0,
            t_end, time_cur, dt_min, dt_max, min_t, min_t_loc,
            eta_s, eta, eta_bh,
            E_tot_0, E_tot_corr_0, E_tot_corr_sd_0,
            rcm_sum=0.0, vcm_sum=0.0,
-           eps=0.0, eps2,
+           eps,
            a2_mod, adot2_mod,
            dt_tmp, dt2half, dt3over6,
-           timesteps=0.0, n_act_sum=0.0, n_act_distr[N_MAX], g6_calls=0.0, g6_calls_sum=0.0;
+           timesteps=0.0, n_act_sum=0.0, n_act_distr[N_MAX];
 
     double3 xcm, vcm, mom,
             xdc, vdc,
@@ -743,13 +722,11 @@ int main(int argc, char *argv[])
     /* data for active particles */
 
     int    n_act, ind_act[N_MAX];
-    double m_act[N_MAX],
-           pot_act[N_MAX], t_act[N_MAX], dt_act[N_MAX],
+    double t_act[N_MAX],
            pot_act_new[N_MAX],
            pot_act_tmp[N_MAX];
 
-    double3 x_act[N_MAX], v_act[N_MAX],
-            a_act[N_MAX], adot_act[N_MAX],
+    double3 a_act[N_MAX], adot_act[N_MAX],
             x_act_new[N_MAX], v_act_new[N_MAX],
             a_act_new[N_MAX], adot_act_new[N_MAX],
             a_act_tmp[N_MAX], adot_act_tmp[N_MAX];;
@@ -765,6 +742,12 @@ int main(int argc, char *argv[])
     int    inf_event[N_MAX];
     double3 x_bbhc, v_bbhc;
 
+    /* some local settings for G6a board's */
+
+    int clusterid, numGPU, npipe=G6_NPIPE;
+
+    int    new_tunit=51, new_xunit=51;
+
     double3 zeros = {0, 0, 0}; // Dummy; can't really be const because of the GRAPE interface.
 
     /* INIT the rand() !!! */
@@ -801,31 +784,19 @@ int main(int argc, char *argv[])
     }
     else
         ascii_read(config->input_file_name, &diskstep, &N, &time_cur, m, x, v);
+    
     std::iota(ind, ind+N, 0);
 
+    eps = config->eps;
+    eta = config->eta;
+    t_end = config->t_end;
+    dt_disk = config->dt_disk;
+    dt_contr = config->dt_contr;
+    dt_bh = config->dt_bh;
+
     if (myRank == rootRank) {
-
-        //TODO move it out of (myRank == rootRank) so you don't need to communicate them.
-        eps = config->eps;
-        t_end = config->t_end;
-        dt_disk = config->dt_disk;
-        dt_contr = config->dt_contr;
-        dt_bh = config->dt_bh;
-        eta = config->eta;
-        strcpy(inp_fname, config->input_file_name.c_str());
-
         if (config->binary_smbh_influence_sphere_output) for (int i=0; i<N; i++) inf_event[i] = 0; // WARNING N wasn't set yet!
 
-        /*
-        eps      : Plummer softening parameter (can be even 0)
-        t_end    : end time of calculation
-        dt_disk  : interval of snapshot files output (0xxx.dat)
-        dt_contr : interval for the energy control output (contr.dat)
-        dt_bh    : interval for BH output (bh.dat & bh_neighbors.dat)
-        eta      : parameter for timestep determination
-        inp_data : name of the input file (data.inp)
-        */
-
         printf("\n");
         printf("Begin the calculation of phi-GRAPE program on %03d processors\n", n_proc);
         printf("\n");
@@ -872,19 +843,6 @@ int main(int argc, char *argv[])
     /* Wait to all processors to finish his works... */
     MPI_Barrier(MPI_COMM_WORLD);
 
-    /* Broadcast all useful values to all processors... */
-    MPI_Bcast(&N,        1, MPI_INT,    rootRank, MPI_COMM_WORLD);
-    MPI_Bcast(&eps,      1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
-    MPI_Bcast(&eta,      1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
-    MPI_Bcast(&t_end,    1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
-    MPI_Bcast(&dt_disk,  1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
-    MPI_Bcast(&dt_contr, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
-    MPI_Bcast(&dt_bh,    1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
-    MPI_Bcast(&time_cur, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
-
-    /* Wait to all processors to finish his works... */
-    MPI_Barrier(MPI_COMM_WORLD);
-
     double normalization_mass=1, normalization_length=1, normalization_velocity=1;
     if (config->ext_units_physical) {
         normalization_mass = 1/config->unit_mass;
@@ -915,8 +873,6 @@ int main(int argc, char *argv[])
     eta_s  = eta/ETA_S_CORR;
     eta_bh = eta/ETA_BH_CORR;
 
-    eps2 = SQR(eps);
-
     dt_min = 1.0*pow(2.0, DTMINPOWER);
     dt_max = 1.0*pow(2.0, DTMAXPOWER);
 
@@ -942,25 +898,13 @@ int main(int argc, char *argv[])
 
     n_loc = N/n_proc;
 
+    Calc_self_grav calc_self_grav(N, n_loc, clusterid, npipe, eps);
+
     Active_search active_search(myRank, n_proc, n_loc, N);
 
     /* Wait to all processors to finish his works... */
     MPI_Barrier(MPI_COMM_WORLD);
 
-    /* Broadcast the values of all particles to all processors... */
-    MPI_Bcast(ind, N, MPI_INT,    rootRank, MPI_COMM_WORLD);
-    MPI_Bcast(m,   N, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
-    MPI_Bcast(x, 3*N, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
-    MPI_Bcast(v, 3*N, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
-
-    /* Wait to all processors to finish his works... */
-    MPI_Barrier(MPI_COMM_WORLD);
-
-    /* Wait to all processors to finish his works... */
-    MPI_Barrier(MPI_COMM_WORLD);
-
-    /* init the local GRAPE's */
-
     if (config->devices_per_node==0) {
         MPI_Comm shmcomm;
         MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm);
@@ -973,6 +917,7 @@ int main(int argc, char *argv[])
     printf("Rank of the processor %03d : Number of GPUs %01d : Cluster ID %01d \n", myRank, numGPU, clusterid);
     fflush(stdout);
 
+    /* init the local GRAPEs */
     g6_open(clusterid);
     npipe = g6_npipes();
     g6_set_tunit(new_tunit);
@@ -1011,26 +956,14 @@ int main(int argc, char *argv[])
     n_act = N;
 
     for (int i=0; i<n_act; i++) {
-        ind_act[i]  = ind[i];
+        ind_act[i]  = ind[i]; // isn't it just i?
         iii         = ind_act[i];
-        m_act[i]    = m[iii];
-
-        x_act[i] = x[iii];
-        v_act[i] = v[iii];
 
         t_act[i]  = t[iii];
-        dt_act[i] = dt[iii];
-
     } /* i */
 
-    // NOTE this is where calc_self_grav_zero() used to be.
-    calc_self_grav(time_cur, eps2, g6_calls, n_loc,
-                   n_act, ind_act,
-                   x_act, v_act,
-                   pot_act_tmp,
-                   a_act_tmp,
-                   adot_act_tmp,
-                   h2_i);
+    calc_self_grav(time_cur, n_act, ind_act, x, v,
+                           pot_act_tmp, a_act_tmp, adot_act_tmp);
 
     /* Wait to all processors to finish his works... */
     MPI_Barrier(MPI_COMM_WORLD);
@@ -1119,7 +1052,7 @@ int main(int argc, char *argv[])
         }
     }
 
-    calc_ext_grav(external_gravity_components, n_act, x_act, v_act, pot_ext, a, adot);
+    calc_ext_grav(external_gravity_components, n_act, x, v, pot_ext, a, adot);
 
     /* Wait to all processors to finish his works... */
     MPI_Barrier(MPI_COMM_WORLD);
@@ -1129,7 +1062,7 @@ int main(int argc, char *argv[])
 
     /* Energy control... */
     if (myRank == rootRank) {
-        energy_contr(time_cur, timesteps, n_act_sum, g6_calls, rcm_sum, vcm_sum, E_tot_0, E_tot_corr_0, E_tot_corr_sd_0, skip_con, N, m, x, v, pot, pot_ext);
+        energy_contr(time_cur, timesteps, n_act_sum, calc_self_grav.g6_calls, rcm_sum, vcm_sum, E_tot_0, E_tot_corr_0, E_tot_corr_sd_0, skip_con, N, m, x, v, pot, pot_ext);
     } /* if (myRank == rootRank) */
 
 #ifdef ETICS
@@ -1216,8 +1149,6 @@ int main(int argc, char *argv[])
         n_act_distr[i-1] = 0.0;
     }
 
-    g6_calls = 0.0; //TODO this should include the calls at the zeroth step, so move it further up.
-
 #ifdef TIMING
     DT_TOT      = 0.0;
 
@@ -1345,13 +1276,16 @@ int main(int argc, char *argv[])
         DT_ACT_PRED += (CPU_tmp_user - CPU_tmp_user0);
 #endif
 
-        calc_self_grav(min_t, eps2, g6_calls, n_loc,
-                n_act, ind_act,
-                x_act_new, v_act_new,
-                pot_act_tmp,
-                a_act_tmp,
-                adot_act_tmp,
-                h2_i);
+#ifdef TIMING
+        get_CPU_time(&CPU_tmp_real0, &CPU_tmp_user0, &CPU_tmp_syst0);
+#endif
+        calc_self_grav(min_t, n_act, ind_act, x_act_new, v_act_new,
+                               pot_act_tmp, a_act_tmp, adot_act_tmp);
+
+#ifdef TIMING
+    get_CPU_time(&CPU_tmp_real, &CPU_tmp_user, &CPU_tmp_syst);
+    DT_ACT_GRAV += (CPU_tmp_user - CPU_tmp_user0);
+#endif
 
         /* Reduce the "global" vectors from "local" on all the nodes */
 
@@ -1527,7 +1461,7 @@ int main(int argc, char *argv[])
         if (time_cur >= t_contr) {
             if (myRank == rootRank) {
 
-                energy_contr(time_cur, timesteps, n_act_sum, g6_calls, rcm_sum, vcm_sum, E_tot_0, E_tot_corr_0, E_tot_corr_sd_0, skip_con, N, m, x, v, pot, pot_ext);
+                energy_contr(time_cur, timesteps, n_act_sum, calc_self_grav.g6_calls, rcm_sum, vcm_sum, E_tot_0, E_tot_corr_0, E_tot_corr_sd_0, skip_con, N, m, x, v, pot, pot_ext);
 
                 /* write cont data */
 
@@ -1600,7 +1534,8 @@ int main(int argc, char *argv[])
     /* Wait to all processors to finish his works... */
     MPI_Barrier(MPI_COMM_WORLD);
 
-    MPI_Reduce(&g6_calls, &g6_calls_sum, 1, MPI_DOUBLE, MPI_SUM, rootRank, MPI_COMM_WORLD);
+    double g6_calls_sum;
+    MPI_Reduce(&calc_self_grav.g6_calls, &g6_calls_sum, 1, MPI_DOUBLE, MPI_SUM, rootRank, MPI_COMM_WORLD);
 
     /* Wait to all processors to finish his works... */
     MPI_Barrier(MPI_COMM_WORLD);
@@ -1610,7 +1545,7 @@ int main(int argc, char *argv[])
         /* Write some output for the timestep annalize... */
 
         printf("\n");
-        printf("timesteps = %.0f   Total sum of integrated part. = %.0f   g6_calls on all nodes = %.0f \n", timesteps, n_act_sum, g6_calls);
+        printf("timesteps = %.0f   Total sum of integrated part. = %.0f   g6_calls on all nodes = %.0f \n", timesteps, n_act_sum, g6_calls_sum);
         printf("\n");
         printf("Real Speed = %.3f GFlops \n", 57.0*N*n_act_sum/(CPU_time_user-CPU_time_user0)/1.0E+09);
         fflush(stdout);