// phigrape/phigrape.cpp

#include <algorithm>
#include <chrono>
#include <mpi.h>
#include <numeric>

#include "black_holes.h"
#include "config.h"
#include "double3.h"
#include "external.h"
#include "grape6.h"
#include "io.h"

#ifdef ETICS
#include "grapite.h"
#endif

// NOTE(review): the C++ standard does not allow programs to declare new types
// inside namespace std; Timer should eventually move to a project namespace.
// The name is kept here so that existing users of std::chrono::Timer still work.
namespace std::chrono {
    /**
     * Simple stopwatch based on steady_clock.
     *
     * start() stores the current instant in t_start; stop() stores the current
     * instant in t_stop and writes the elapsed interval, in seconds, to `time`.
     * `time` is only meaningful after stop() has been called.
     */
    struct Timer {
        /* Remember the instant at which the measurement begins. */
        void start() { t_start = steady_clock::now(); }

        /* Remember the end instant and refresh the elapsed time. */
        void stop()
        {
            t_stop = steady_clock::now();
            const auto elapsed_ns = duration_cast<nanoseconds>(t_stop - t_start);
            time = elapsed_ns.count()*1E-9;
        }

        double time; // seconds
        steady_clock::time_point t_start, t_stop;
    };
}
/* Process-wide stopwatch: started in main(), stopped/read in energy_contr() and at the end of main(). */
std::chrono::Timer timer;

class Calc_self_grav {
public:
    Calc_self_grav(const int N, const int n_loc, const int clusterid, const int npipe, const double eps)
        : g6_calls(0), n_loc(n_loc), clusterid(clusterid), npipe(npipe), eps2(eps*eps)
    {
        h2.assign(N, eps2);
        pot_loc.resize(N);
        acc_loc.resize(N);
        jrk_loc.resize(N);
    }
    void operator()(const double t, const int n_act, std::vector<int> &ind_act, std::vector<double3> &x_act, std::vector<double3> &v_act,
              std::vector<double>& pot, std::vector<double3> &acc, std::vector<double3> &jrk)
    {
        g6_set_ti(clusterid, t);
        for (int i=0; i<n_act; i+=npipe) {
            int nn = npipe;
            if (n_act-i < npipe) nn = n_act - i;
            g6calc_firsthalf(clusterid, n_loc, nn, ind_act.data()+i, (double(*)[3])&x_act[i], (double(*)[3])&v_act[i], (double(*)[3])&acc_loc[i], (double(*)[3])&jrk_loc[i], &pot_loc[i], eps2, h2.data());
            g6calc_lasthalf( clusterid, n_loc, nn, ind_act.data()+i, (double(*)[3])&x_act[i], (double(*)[3])&v_act[i], eps2, h2.data(), (double(*)[3])&acc_loc[i], (double(*)[3])&jrk_loc[i], &pot_loc[i]);
            g6_calls++;
        } /* i */
        /* Reduce the "global" vectors from "local" on all the nodes */
        MPI_Allreduce(pot_loc.data(), pot.data(), n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
        MPI_Allreduce(acc_loc.data(), acc.data(), 3*n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
        MPI_Allreduce(jrk_loc.data(), jrk.data(), 3*n_act, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    }
    double g6_calls;
private:
    int n_loc, clusterid, npipe;
    double eps2;
    std::vector<double> h2;
    std::vector<double> pot_loc; // the _loc variables are for this node only.
    std::vector<double3> acc_loc, jrk_loc;
};

class Calc_ext_grav {
public:
    void add_component(External_gravity &component)
    {
        components.push_back(&component);
        if (component.is_active) any_active = true;
    }
    void operator()(int n, const std::vector<double3> &x, const std::vector<double3> &v, std::vector<double> &pot, std::vector<double3> &acc, std::vector<double3> &jrk)
    {
        for (auto component : components) {
            if (component->is_active)
                component->apply(n, x, v, pot, acc, jrk);
        }
    }
    void print_info()
    {
        for (auto component : components) {
                component->print_info();
        }
        fflush(stdout);
    }
    bool any_active = false;
private:
    std::vector<External_gravity*> components;
};

/**
 * Energy control output.
 *
 * Computes, over the first N particles:
 *   - E_pot      = 0.5 * sum(m * pot)
 *   - E_kin      = 0.5 * sum(m * |v|^2)
 *   - E_pot_ext  = sum(m * pot_ext)
 *   - the norms of the mass-weighted mean position and velocity
 *   - the three components of sum(m * (x cross v))
 *
 * It then stops the global `timer`, prints one summary line to stdout and
 * appends one record to "contr.dat". The total energy of the previous call is
 * kept in a function-local static so that the change DE_tot can be reported;
 * it is reset whenever timesteps == 0.
 *
 * timesteps, n_act_sum and g6_calls are only passed through to the output
 * (apart from the timesteps == 0 reset).
 *
 * If "contr.dat" cannot be opened a warning is written to stderr and the file
 * record is skipped; the screen output is still produced.
 */
void energy_contr(const double time_cur, const double timesteps, const double n_act_sum, const double g6_calls, int N, const std::vector<double> &m, const std::vector<double3> &x, const std::vector<double3> &v, const std::vector<double> &pot, const std::vector<double> &pot_ext)
{
    double E_pot = 0;
    for (int i=0; i<N; i++) E_pot += m[i]*pot[i];
    E_pot *= 0.5;

    double E_kin = 0;
    for (int i=0; i<N; i++) E_kin += m[i]*v[i].norm2();
    E_kin *= 0.5;

    /* No factor 0.5 here, unlike the self-gravity potential energy above. */
    double E_pot_ext = 0;
    for (int i=0; i<N; i++) E_pot_ext += m[i]*pot_ext[i];

    /* Mass-weighted mean position and velocity. */
    double m_tot = 0;
    double3 xcm = {0, 0, 0};
    double3 vcm = {0, 0, 0};
    for (int i=0; i<N; i++) {
        m_tot += m[i];
        xcm += m[i] * x[i];
        vcm += m[i] * v[i];
    }
    xcm /= m_tot;
    vcm /= m_tot;
    double rcm_mod = xcm.norm();
    double vcm_mod = vcm.norm();

    /* sum of m * (x cross v), component by component */
    double3 mom = {0, 0, 0};
    for (int i=0; i<N; i++) {
        mom[0] += m[i] * (x[i][1]* v[i][2] - x[i][2]*v[i][1]);
        mom[1] += m[i] * (x[i][2]* v[i][0] - x[i][0]*v[i][2]);
        mom[2] += m[i] * (x[i][0]* v[i][1] - x[i][1]*v[i][0]);
    }

    /* Refresh timer.time so that the elapsed wall-clock time can be reported below. */
    timer.stop();

    double E_tot = E_pot + E_kin + E_pot_ext;

    static double E_tot_prev;
    if (timesteps == 0.0) E_tot_prev = E_tot;

    double DE_tot = E_tot - E_tot_prev;

    /* This is the only output to screen */
    printf("%.3E %.3E  % .4E %.4E % .4E  % .4E % .4E  %.2E\n",
            time_cur, timesteps,
            E_pot, E_kin, E_pot_ext, E_tot, DE_tot,
            timer.time);

    fflush(stdout);

    auto out = fopen("contr.dat", "a");
    if (out == nullptr) {
        /* fopen returns a null pointer on failure; writing through it would crash. */
        fprintf(stderr, "WARNING: cannot open contr.dat for appending; control record skipped\n");
    } else {
        fprintf(out,"%.8E \t %.8E %.8E %.8E \t % .8E % .8E % .8E % .8E % .8E \t % .8E % .8E \t % .8E % .8E % .8E \t %.8E\n",
                time_cur, timesteps, n_act_sum, g6_calls,
                E_pot, E_kin, E_pot_ext,
                E_tot, DE_tot,
                rcm_mod, vcm_mod,
                mom[0], mom[1], mom[2],
                timer.time);
        fclose(out);
    }

    E_tot_prev = E_tot;
}

/**
 * Finds the next block time and the particles that become active at it.
 *
 * Each rank is responsible for the contiguous index range
 * [myRank*n_loc, (myRank+1)*n_loc). When the code is built with ETICS and
 * grapite_active_search_flag is set, the search is delegated to the
 * grapite_* routines; otherwise it is done on the host arrays.
 */
class Active_search {
    // TODO you can add pointers to t and dt at the constructor, no point giving them at get_minimum_time but without the size.
public:
    Active_search(const int myRank, const int n_proc, const int n_loc, const int N, bool grapite_active_search_flag)
        : myRank(myRank), n_proc(n_proc), n_loc(n_loc), N(N), grapite_active_search_flag(grapite_active_search_flag)
    {
        ind_act_loc.resize(n_loc);
        /* One entry per MPI rank; sized from n_proc so any process count is safe. */
        n_act_per_rank.resize(n_proc);
        displs.resize(n_proc);
    }

    /**
     * Return the global minimum of t[j] + dt[j].
     *
     * Every rank scans its own index range (or asks grapite), then the local
     * minima are combined with an MPI_MIN all-reduce, so all ranks return the
     * same value.
     */
    double get_minimum_time(const std::vector<double> &t, const std::vector<double> &dt)
    {
        double min_t_loc, min_t;
#ifdef ETICS
        if (grapite_active_search_flag) {
            min_t_loc = grapite_get_minimum_time();
        } else
#endif
        {
            min_t_loc = t[myRank*n_loc]+dt[myRank*n_loc];
            for (int j=myRank*n_loc+1; j<(myRank+1)*n_loc; j++) {
                double tmp = t[j] + dt[j];
                if (tmp < min_t_loc) min_t_loc = tmp;
            }
        }
        /* Reduce the "global" min_t from min_t_loc "local" on all processors) */
        MPI_Allreduce(&min_t_loc, &min_t, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
        return min_t;
    }

    /**
     * Fill ind_act with the global indices of all particles whose next time
     * t + dt equals min_t, and store their number in n_act.
     *
     * ind_act must be large enough to hold all active indices. In the host
     * branch the indices come out in ascending order; in the grapite branch
     * they are the concatenation of the per-rank lists.
     */
    void get_active_indices(const double min_t, const std::vector<double> &t, const std::vector<double> &dt, std::vector<int> &ind_act, int &n_act)
    {
#ifdef ETICS
        if (grapite_active_search_flag) {
            int n_act_loc;
            grapite_active_search(min_t, ind_act_loc.data(), &n_act_loc);
            /* Convert rank-local indices to global ones. */
            if (myRank > 0)
                for (int i=0; i<n_act_loc; i++)
                    ind_act_loc[i] += myRank*n_loc;
            MPI_Allgather(&n_act_loc, 1, MPI_INT, n_act_per_rank.data(), 1, MPI_INT, MPI_COMM_WORLD);
            /* displs[i] is the number of active particles on ranks before i; n_act ends up as the total. */
            n_act = 0;
            for (int i=0; i<n_proc; i++) {
                displs[i] = n_act;
                n_act += n_act_per_rank[i];
            }
            MPI_Allgatherv(ind_act_loc.data(), n_act_loc, MPI_INT, ind_act.data(), n_act_per_rank.data(), displs.data(), MPI_INT, MPI_COMM_WORLD);
        } else
#endif
        {
            n_act = 0;
            /* Exact comparison is intended: min_t was produced from the very same t[i]+dt[i] sums. */
            for (int i=0; i<N; i++) {
                if (t[i]+dt[i] == min_t) ind_act[n_act++] = i;
            } /* i */
        }
    }
private:
    int myRank, n_proc, n_loc, N;
    std::vector<int> ind_act_loc;    // scratch: active indices found on this rank
    std::vector<int> n_act_per_rank; // scratch: active count of every rank (grapite branch)
    std::vector<int> displs;         // scratch: receive offsets for MPI_Allgatherv (grapite branch)
    bool grapite_active_search_flag;
};

inline void calc_high_derivatives(const double dt_tmp, const double3 &a_old, const double3 &a_new, const double3 &a1_old, const double3 &a1_new, double3 &a2, double3 &a3)
{
    double dtinv = 1/dt_tmp;
    double dt2inv = dtinv*dtinv;
    double dt3inv = dt2inv*dtinv;

    double3 a0mia1 = a_old-a_new;
    double3 ad04plad12 = 4*a1_old + 2*a1_new;
    double3 ad0plad1 = a1_old + a1_new;

    a2 = -6*a0mia1*dt2inv - ad04plad12*dtinv;
    a3 = 12*a0mia1*dt3inv + 6*ad0plad1*dt2inv;
}

/**
 * Add the higher-order correction terms to a predicted position and velocity:
 *
 *   x += dt^4/24 * a2 + dt^5/120 * a3
 *   v += dt^3/6  * a2 + dt^4/24  * a3
 *
 * x and v are updated in place.
 */
inline void corrector(const double dt_tmp, const double3 &a2, const double3 &a3, double3 &x, double3 &v)
{
    /* Each coefficient is built from the previous one. */
    const double c3 = dt_tmp*dt_tmp*dt_tmp/6.0; // dt^3/6
    const double c4 = c3*dt_tmp/4.0;            // dt^4/24
    const double c5 = c4*dt_tmp/5.0;            // dt^5/120

    x += c4*a2 + c5*a3;
    v += c3*a2 + c4*a3;
}

inline double aarseth_step(const double eta, const double dt, const double3 &a, const double3 &a1, const double3 &a2, const double3 &a3)
{
    double a1abs = a.norm();
    double adot1abs = a1.norm();
    double3 a2dot1 = a2 + dt*a3;
    double a2dot1abs = a2dot1.norm();
    double a3dot1abs = a3.norm();
    return sqrt(eta*(a1abs*a2dot1abs+adot1abs*adot1abs)/(adot1abs*a3dot1abs+a2dot1abs*a2dot1abs));
}

/**
 * Turn a desired time step into an allowed block time step.
 *
 * dt      - desired (unconstrained) step
 * dt_prev - step the particle used so far; returned unchanged unless one of
 *           the rules below applies
 * min_t   - current block time, used to decide whether the step may be doubled
 * dt_min  - smallest allowed step
 * dt_max  - largest allowed step
 *
 * Rules, applied in this order:
 *   1. dt below dt_min: the step is clamped to dt_min.
 *   2. dt_min < dt < dt_prev: the step shrinks to 2^power, where power is
 *      log2(dt) - 1 truncated toward zero.
 *   3. dt more than twice the current result: the result is doubled once,
 *      provided min_t is a multiple of the doubled step and the doubled step
 *      does not exceed dt_max.
 */
inline double blockize_step(double dt, double dt_prev, double min_t, double dt_min, double dt_max)
{
    double dt_new = dt_prev;
    /* Clamp the result itself; assigning to dt_prev here had no effect, so a
       too-small dt silently kept the previous (larger) step. */
    if (dt < dt_min) dt_new = dt_min;
    if ((dt < dt_prev) && (dt > dt_min)) {
        int power = log(dt)/M_LN2 - 1;
        dt_new = pow(2.0, power);
    }
    if ((dt > 2*dt_new) && (fmod(min_t, 2*dt_new) == 0) && (2*dt_new <= dt_max)) dt_new *= 2;
    return dt_new;
}

/**
 * Predict position and velocity of every active particle at time min_t.
 *
 * For each of the first n_act entries of ind_act, the particle's state is
 * extrapolated over h = min_t - t[particle] with a Taylor series using a and
 * adot:
 *
 *   x_new = x + v h + a h^2/2 + adot h^3/6
 *   v_new = v + a h + adot h^2/2
 *
 * Results are stored by position in the active list (not by particle index).
 */
inline void predictor(double min_t, const int n_act, const std::vector<int> &ind_act, const std::vector<double> &t, const std::vector<double3> &x, const std::vector<double3> &v, const std::vector<double3> &a, const std::vector<double3> &adot, std::vector<double3> &x_act_new, std::vector<double3> &v_act_new)
{
    for (int slot=0; slot<n_act; slot++) {
        const int p = ind_act[slot];               // global particle index
        const double h = min_t - t[p];             // time since the particle's last update
        const double h2_half = 0.5*h*h;            // h^2/2
        const double h3_sixth = (1./3.)*h*h2_half; // h^3/6

        x_act_new[slot] = x[p] + v[p]*h + a[p]*h2_half + adot[p]*h3_sixth;
        v_act_new[slot] = v[p] + a[p]*h + adot[p]*h2_half;
    }
}

int main(int argc, char *argv[])
{
    timer.start();

    /* INIT the rand() !!! */
    srand(19640916);                 /* it is just my birthday :-) */

    /* Init MPI */
    MPI_Init(&argc, &argv);

    /* Define the total number of processors and the Rank of each processors */
    int n_proc, myRank;
    const int rootRank = 0;
    MPI_Comm_size(MPI_COMM_WORLD, &n_proc);
    MPI_Comm_rank(MPI_COMM_WORLD, &myRank);

    /* Define the processors names */
    int name_proc;
    char processor_name[MPI_MAX_PROCESSOR_NAME];
    MPI_Get_processor_name(processor_name, &name_proc);

    /* Print the Rank and the names of processors */
    printf("Rank of the processor %03d on %s \n", myRank, processor_name);

    const Config config("phigrape.conf");
    Input_data input_data;
    if (is_hdf5(config.input_file_name)) {
#ifndef HAS_HDF5
        fprintf(stderr, "ERROR: input file is in HDF5 format, but the code was compiled without HDF5 support\n");
        return -1;
#endif
        input_data = h5_read(config.input_file_name);
    }
    else
        input_data = ascii_read(config.input_file_name);

    int N = input_data.N;
    int diskstep = input_data.step_num;
    double time_cur = input_data.t;
    auto &m = input_data.m;
    auto &x = input_data.x;
    auto &v = input_data.v;

    double t_disk  = config.dt_disk*(1+floor(time_cur/config.dt_disk));
    double t_contr = config.dt_contr*(1+floor(time_cur/config.dt_contr));
    double t_bh    = config.dt_bh*(1+floor(time_cur/config.dt_bh));

    if (myRank == rootRank) {
        printf("\n");
        printf("Begin the calculation of phi-GRAPE program on %03d processors\n", n_proc);
        printf("\n");
        printf("N       = %07d \t eps      = %.6E\n", N, config.eps);
        printf("t_beg   = %.6E \t t_end    = %.6E\n", time_cur, config.t_end);
        printf("dt_disk = %.6E \t dt_contr = %.6E\n", config.dt_disk, config.dt_contr);
        printf("dt_bh   = %.6E \n", config.dt_bh);
        printf("eta     = %.6E\n\n", config.eta);
        printf("t_disk = %.6E   t_contr = %.6E   t_bh = %.6E\n\n", t_disk, t_contr, t_bh);

        if ((diskstep == 0) && (time_cur == 0)) {
            FILE *out = fopen("contr.dat", "w");
            fclose(out);
            if (config.live_smbh_output && (config.live_smbh_count > 0)) {
                out = fopen("bh.dat", "w");
                fclose(out);
            }
            if ((config.live_smbh_neighbor_output) && (config.live_smbh_count > 0)) {
                out = fopen("bh_neighbors.dat", "w");
                fclose(out);
            }
        }
    } /* if (myRank == rootRank) */

    double normalization_mass=1, normalization_length=1, normalization_velocity=1;
    if (config.ext_units_physical) {
        normalization_mass = 1/config.unit_mass;
        normalization_length = 1000/config.unit_length;
        normalization_velocity = 1.52484071426404437233e+01*sqrt(config.unit_length/config.unit_mass);
    }
    Calc_ext_grav calc_ext_grav;
    Plummer ext_bulge(config.ext_m_bulge*normalization_mass, config.ext_b_bulge*normalization_length);
    ext_bulge.set_name("bulge");
    calc_ext_grav.add_component(ext_bulge);
    Miyamoto_Nagai ext_disk(config.ext_m_disk*normalization_mass, config.ext_a_disk*normalization_length, config.ext_b_disk*normalization_length);
    calc_ext_grav.add_component(ext_disk);
    Plummer ext_halo_plummer(config.ext_m_halo_plummer*normalization_mass, config.ext_b_halo_plummer*normalization_length);
    ext_halo_plummer.set_name("halo");
    calc_ext_grav.add_component(ext_halo_plummer);
    Logarithmic_halo ext_log_halo(config.ext_log_halo_v*normalization_velocity, config.ext_log_halo_r*normalization_length);
    calc_ext_grav.add_component(ext_log_halo);
    Dehnen ext_dehnen(config.ext_dehnen_m*normalization_mass, config.ext_dehnen_r*normalization_length, config.ext_dehnen_gamma);
    calc_ext_grav.add_component(ext_dehnen);
    if (myRank == rootRank) calc_ext_grav.print_info();

    /* some local settings for G6a boards */
    int clusterid, numGPU;
    if (config.devices_per_node==0) {
        MPI_Comm shmcomm;
        MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm);
        MPI_Comm_size(shmcomm, &numGPU);
        MPI_Comm_rank(shmcomm, &clusterid);
    } else {
        numGPU = config.devices_per_node;
        clusterid = myRank % numGPU;
    }
    printf("Rank of the processor %03d : Number of GPUs %01d : Cluster ID %01d \n", myRank, numGPU, clusterid);
    fflush(stdout);

    /* init the local GRAPEs */
    g6_open(clusterid);
    int npipe = g6_npipes();
    g6_set_tunit(51);
    g6_set_xunit(51);

    bool grapite_active_search_flag = false;
#ifdef ETICS
    grapite_set_dev_exec_threshold(config.grapite_dev_exec_threshold);
    grapite_active_search_flag = config.grapite_active_search;
#endif

    int n_loc = N/n_proc;
#ifdef ETICS
    grapite_read_particle_tags(N, config.grapite_mask_file_name.c_str(), myRank, n_loc);
    grapite_set_dt_exp(config.dt_scf);
    grapite_set_t_exp(time_cur);
#endif

    const double dt_min = pow(2, config.dt_min_power);
    std::vector<int> ind(N);
    std::iota(begin(ind), end(ind), 0);
    /* load the nj particles to the G6 */
    double3 zeros = {0, 0, 0}; // Dummy; can't really be const because of the GRAPE interface.
    for (int k=0; k<n_loc; k++) {
        int j = k + myRank*n_loc;
        g6_set_j_particle(clusterid, k, ind[j], time_cur, dt_min, m[j], zeros, zeros, zeros, v[j], x[j]);
    } /* k */

#ifdef ETICS
    double etics_length_scale;
    if (myRank == rootRank) etics_length_scale = grapite_get_length_scale(N, m.data(), (double(*)[3])x.data(), (double(*)[3])v.data()); // We don't want all ranks to do it, because they need to write a file and might confuse each other
    MPI_Bcast(&etics_length_scale, 1, MPI_DOUBLE, rootRank, MPI_COMM_WORLD);
    grapite_set_length_scale(etics_length_scale);

    int grapite_cep_index = grapite_get_cep_index();
    if (grapite_cep_index >= 0) {
        double3 xcm, vcm, xdc, vdc;
        grapite_calc_center(N, m.data(), (double(*)[3])x.data(), (double(*)[3])v.data(), xcm, vcm, xdc, vdc);
        x[grapite_cep_index] = xdc;
        v[grapite_cep_index] = vdc;
        grapite_update_cep(time_cur, xdc, vdc, zeros, zeros);
    }

    if (config.grapite_smbh_star_eps >= 0) grapite_set_eps_bh(config.grapite_smbh_star_eps);
#endif

    std::vector<double3> a(N), adot(N);
    std::vector<double> pot(N);

    /* define the all particles as a active on all the processors for the first time grav calc. */
    Calc_self_grav calc_self_grav(N, n_loc, clusterid, npipe, config.eps);
    calc_self_grav(time_cur, N, ind, x, v, pot, a, adot);

    Black_hole_physics black_hole_physics;
    std::vector<Particle_ref> smbh_list;
    if (config.live_smbh_count >= 1)
        black_hole_physics = Black_hole_physics(config.live_smbh_count, myRank, rootRank);
    else if (config.live_smbh_count >= 2) {
        if (config.live_smbh_custom_eps >= 0) {
#ifdef ETICS
            double eps = (config.grapite_smbh_star_eps >= 0)?config.grapite_smbh_star_eps:config.eps;
#else
            double eps = config.eps;
#endif
            black_hole_physics.set_softening(eps, config.live_smbh_custom_eps);
            for (int i = 0; i < config.live_smbh_count; i++)
                smbh_list.emplace_back(m[i], x[i], v[i], pot[i], a[i], adot[i]);
            black_hole_physics.adjust_softening(smbh_list);
        }
    }
    if (config.binary_smbh_pn) {
        throw std::runtime_error("This is the triple+ SMBH version, it cannot do PN yet!");
        #if 0
        black_hole_physics.set_post_newtonian(config.pn_c, config.pn_usage.data());
        if (config.pn_usage[6]) black_hole_physics.set_spins(config.smbh1_spin.data(), config.smbh2_spin.data());
        black_hole_physics.adjust_post_newtonian(dt_min, a[0], a[1], adot[0], adot[1]);
        #endif
    }

    std::vector<double> pot_ext(N, 0.);
    calc_ext_grav(N, x, v, pot_ext, a, adot);

    double timesteps=0, n_act_sum=0;
    /* Energy control... */
    if (myRank == rootRank) energy_contr(time_cur, timesteps, n_act_sum, calc_self_grav.g6_calls, N, m, x, v, pot, pot_ext);

#ifdef ETICS
    if (config.etics_dump_coeffs && (diskstep==0)) {
        char out_fname[256];
        sprintf(out_fname, "coeffs.%06d.%02d.dat", 0, myRank);
        grapite_dump(out_fname, 2);
    }

    if (grapite_cep_index >= 0) {
        double3 xcm, vcm, xdc, vdc;
        grapite_calc_center(N, m.data(), (double(*)[3])x.data(), (double(*)[3])v.data(), xcm, vcm, xdc, vdc);
        x[grapite_cep_index] = xdc;
        v[grapite_cep_index] = vdc;
        grapite_update_cep(time_cur, xdc, vdc, a[grapite_cep_index], adot[grapite_cep_index]);
    }
#endif

    const double dt_max = std::min({config.dt_disk, config.dt_contr, pow(2, config.dt_max_power)});
    std::vector<double> dt(N);
    /* Define initial timestep for all particles on all nodes */
    for (int j=0; j<N; j++) {
        double a2_mod = a[j].norm2();
        double adot2_mod = adot[j].norm2();

        double dt_tmp, eta_s = config.eta/config.eta_s_corr;
        if (adot2_mod==0) dt_tmp = eta_s; // That's weird, when will we have such a case?
        else              dt_tmp = eta_s*sqrt(a2_mod/adot2_mod);

        int power = log(dt_tmp)/log(2.0) - 1;

        dt_tmp = pow(2.0, (double)power);

        if (dt_tmp < dt_min) dt_tmp = dt_min;
        if (dt_tmp > dt_max) dt_tmp = dt_max;

        dt[j] = dt_tmp;

        if (config.dt_min_warning && (myRank == 0)) {
            if (dt[j] == dt_min) {
                printf("!!! Warning0: dt = dt_min = %.6E \t ind = %07d \n", dt[j], ind[j]);
                fflush(stdout);
            }
        }
    } /* j */

    if (config.live_smbh_count > 0) {
        double min_dt = *std::min_element(begin(dt), end(dt));
        for (int i=0; i<config.live_smbh_count; i++) dt[i] = min_dt;
    }

    /* load the new values for particles to the local GRAPEs */
    for (int k=0; k<n_loc; k++) {
        int j = k + myRank*n_loc;
        g6_set_j_particle(clusterid, k, ind[j], time_cur, dt[j], m[j], zeros, adot[j]*(1./6.), a[j]*0.5, v[j], x[j]);
    } /* k */

    timesteps = 0.0; // Why won't those two be long long instead of double + should include the zeroth step
    n_act_sum = 0.0;


    std::vector<int> ind_act(N);
    std::vector<double3> x_act_new(N), v_act_new(N), a_act_new(N), adot_act_new(N);
    std::vector<double> t(N, time_cur), pot_act_new(N);
    std::vector<double> pot_act_ext(N, 0.);

    // Functors for the main integration loop
    Active_search active_search(myRank, n_proc, n_loc, N, grapite_active_search_flag);
    Binary_smbh_influence_sphere_output binary_smbh_influence_sphere_output(config.binary_smbh_influence_radius_factor, N, m, x, v, pot, dt);
    Write_bh_nb_data write_bh_nb_data(config.live_smbh_neighbor_number, config.live_smbh_count, N, m, x, v);
    if (myRank == rootRank) {
        if (config.live_smbh_output) black_hole_physics.write_bh_data(time_cur, config.live_smbh_count, m, x, v, pot, a, adot, dt);
        if (config.live_smbh_neighbor_output) write_bh_nb_data(time_cur);
    } /* if (myRank == rootRank) */

    /* The main integration loop */
    while (time_cur <= config.t_end) {

        /* Define the minimal time and the active particles on all the nodes */
        double min_t = active_search.get_minimum_time(t, dt);

        /* Get indices of all particles that will be active in this bunch */
        int n_act;
        active_search.get_active_indices(min_t, t, dt, ind_act, n_act);

        /* Find the BH(s) indices in the active list */
        smbh_list.clear();
#ifdef ETICS
        /* Unlike with the simple active search, with GPU accelerated GRAPite
        active search, the list of active indices is not sorted. */
        int n_bh = config.live_smbh_count;
        if (config.grapite_active_search && (n_bh>0)) {
            int act_def_grapite_bh_count = 0;
            for (int i=0; i<n_act; i++) {
                if (ind_act[i]<n_bh) {
                    smbh_list.emplace_back(m[ind_act[i]], x_act_new[i], v_act_new[i], pot_act_new[i], a_act_new[i], adot_act_new[i]);
                    if (act_def_grapite_bh_count++ == n_bh) break;
                }
            }
        }
#else
        for (int i = 0; i < config.live_smbh_count; i++)
            smbh_list.emplace_back(m[ind_act[i]], x_act_new[i], v_act_new[i], pot_act_new[i], a_act_new[i], adot_act_new[i]);
#endif

        /* predict the active particles positions etc... on all the nodes */
        predictor(min_t, n_act, ind_act, t, x, v, a, adot, x_act_new, v_act_new);

        /* Calculate gravity on active particles */
        calc_self_grav(min_t, n_act, ind_act, x_act_new, v_act_new, pot_act_new, a_act_new, adot_act_new);

        if (config.live_smbh_count >= 2) {
            if (config.live_smbh_custom_eps >= 0) black_hole_physics.adjust_softening(smbh_list);
            #if 0
            if (config.binary_smbh_pn) black_hole_physics.adjust_post_newtonian(dt[i_bh1], a_act_new[i_bh1], a_act_new[i_bh2], adot_act_new[i_bh1], adot_act_new[i_bh2]);
            #endif
        }

        /* Calculate gravity on active particles due to external forces */
        if (calc_ext_grav.any_active) {
            std::fill_n(begin(pot_act_ext), n_act, 0);
            calc_ext_grav(n_act, x_act_new, v_act_new, pot_act_ext, a_act_new, adot_act_new);
        }

        /* correct the active particles positions etc... on all the nodes */
        double min_dt = dt_max; // notice that min_dt is not the same as dt_min; this one is to store the minimum timestep among currently active particles
        for (int i=0; i<n_act; i++) {
            int j_act = ind_act[i];
            double dt_cur = dt[j_act];

            double3 a2, a3;
            calc_high_derivatives(dt_cur, a[j_act], a_act_new[i], adot[j_act], adot_act_new[i], a2, a3);

            corrector(dt_cur, a2, a3, x_act_new[i], v_act_new[i]);

            //TODO make beautiful
            double eta_curr;
            if ((config.live_smbh_count > 0) && (ind_act[i] < config.live_smbh_count)) eta_curr = config.eta/config.eta_bh_corr;
            else eta_curr = config.eta;

            double dt_new = aarseth_step(eta_curr, dt_cur, a_act_new[i], adot_act_new[i], a2, a3);

            dt_new = blockize_step(dt_new, dt_cur, min_t, dt_min, dt_max);

            if (config.dt_min_warning && (myRank == 0)) {
                if (dt_new == dt_min) {
                    printf("!!! Warning1: dt_act = dt_min = %.6E \t ind_act = %07d time_cur=%.16E\n", dt_cur, ind_act[i], time_cur);
                    fflush(stdout);
                }
            }
            if (dt_new < min_dt) min_dt = dt_new;

            x[j_act] = x_act_new[i];
            v[j_act] = v_act_new[i];
            t[j_act] = min_t;
            dt[j_act] = dt_new;
            pot[j_act] = pot_act_new[i];
            pot_ext[j_act] = pot_act_ext[i];
            a[j_act] = a_act_new[i];
            adot[j_act] = adot_act_new[i];
        } /* i */

        /* define the min. dt over all the act. part. and set it also for the BH... */
        for (int i=0; i < config.live_smbh_count; i++) dt[i] = min_dt;

        if (config.binary_smbh_influence_sphere_output && (myRank == rootRank))
            binary_smbh_influence_sphere_output(ind_act, n_act, timesteps, time_cur);

        /* load the new values for active particles to the local GRAPE's */
        for (int i=0; i<n_act; i++) {
#ifdef ETICS
            if (ind_act[i] == grapite_cep_index) grapite_update_cep(t[grapite_cep_index], x[grapite_cep_index], v[grapite_cep_index], a[grapite_cep_index], adot[grapite_cep_index]); // All ranks should do it.
#endif
            int cur_rank = ind_act[i]/n_loc;
            if (myRank == cur_rank) {
                int j_act = ind_act[i];
                int address = ind_act[i] - myRank*n_loc;
                g6_set_j_particle(clusterid, address, ind_act[i], t[j_act], dt[j_act], m[j_act], zeros, adot[j_act]*(1./6.), a[j_act]*0.5, v[j_act], x[j_act]);
            } /* if (myRank == cur_rank) */
        } /* i */

        /* Current time set to min_t */
        time_cur = min_t;
        timesteps += 1.0;
        n_act_sum += n_act;

        if (time_cur >= t_bh) {
            if (myRank == rootRank) {
                /* Write BH data... */
                if (config.live_smbh_output) black_hole_physics.write_bh_data(time_cur, config.live_smbh_count, m, x, v, pot, a, adot, dt);

                /* Write BH NB data... */
                if (config.live_smbh_neighbor_output) write_bh_nb_data(time_cur);

            } /* if (myRank == rootRank) */

            t_bh += config.dt_bh;
        } /* if (time_cur >= t_bh) */

        if (time_cur >= t_contr) {
            if (myRank == rootRank) {
                energy_contr(time_cur, timesteps, n_act_sum, calc_self_grav.g6_calls, N, m, x, v, pot, pot_ext);
                /* write cont data */
                if (config.output_hdf5) h5_write("data.con", diskstep, N, time_cur, m, x, v, pot, a, adot, 0, true);
                else ascii_write("data.con", diskstep, N, time_cur, m, x, v, 16);
            } /* if (myRank == rootRank) */

#ifdef ETICS
            // We are /inside/ a control step, so all particles must be
            // synchronized; we can safely calculate their density centre. The
            // acceleration and jerk currently in the memory are for the
            // predicted position of the CEP, by calling grapite_calc_center we
            // "correct" the position and velocity, but not the gravity at that
            // point.
            if (grapite_cep_index >= 0) {
                double3 xcm, vcm, xdc, vdc;
                grapite_calc_center(N, m.data(), (double(*)[3])x.data(), (double(*)[3])v.data(), xcm, vcm, xdc, vdc);
                x[grapite_cep_index] = xdc;
                v[grapite_cep_index] = vdc;
                grapite_update_cep(time_cur, xdc, vdc, a[grapite_cep_index], adot[grapite_cep_index]);
            }
#endif

            t_contr += config.dt_contr;
        } /* if (time_cur >= t_contr) */

        if (time_cur >= t_disk) {
            char out_fname[256];
            diskstep++;
            if (myRank == rootRank) {
                sprintf(out_fname, "%06d", diskstep);
                if (config.output_hdf5) h5_write(std::string(out_fname) + ".h5", diskstep, N, time_cur, m, x, v, pot, a, adot, config.output_extra_mode, config.output_hdf5_double_precision);
                else ascii_write(std::string(out_fname) + ".dat", diskstep, N, time_cur, m, x, v, config.output_ascii_precision);
            } /* if (myRank == rootRank) */

#ifdef ETICS
            if (config.etics_dump_coeffs) {
                sprintf(out_fname, "coeffs.%06d.%02d.dat", diskstep, myRank);
                grapite_dump(out_fname, 2);
            }
#endif
            t_disk += config.dt_disk;
        } /* if (time_cur >= t_disk) */
    } /* while (time_cur < t_end) */

    /* close the local GRAPEs */
    timer.stop();
    g6_close(clusterid);

    double g6_calls_sum;
    MPI_Reduce(&calc_self_grav.g6_calls, &g6_calls_sum, 1, MPI_DOUBLE, MPI_SUM, rootRank, MPI_COMM_WORLD);
    if (myRank == rootRank) {
        /* Write some output for the timestep annalize... */
        printf("\n");
        printf("timesteps = %.0f   Total sum of integrated part. = %.0f   g6_calls on all nodes = %.0f \n", timesteps, n_act_sum, g6_calls_sum);
        printf("\n");
        printf("Real Speed = %.3f GFlops \n", 57.0*N*n_act_sum/(timer.time)/1.0E+09);
        fflush(stdout);
    } /* if (myRank == rootRank) */

    /* Finalize the MPI work */
    MPI_Finalize();
}