Skip to content
Snippets Groups Projects
Commit f3a2b330 authored by Stéphane Del Pino
Browse files

Manage execution stats for checkpoint/resume

parent 3ba4830b
No related branches found
No related tags found
1 merge request: !199 Integrate checkpointing
......@@ -103,10 +103,19 @@ void
ExecutionStatManager::_printElapseTime() const
{
  // Prints the elapsed time of the current run on std::cout and, when this is
  // not the first run (m_run_number > 1), the cumulative elapsed time obtained
  // by adding m_previous_cumulative_elapse_time.

  // Read the timer once so that every printed value refers to the same instant.
  const double elapse_time = m_instance->m_elapse_time.seconds();

  // The "Execution:" header is emitted exactly once.
  std::cout << "Execution: " << rang::style::bold << elapse_time << 's' << rang::style::reset;
  if (elapse_time > 60) {
    // Beyond one minute, also show the _prettyPrintTime() rendering.
    std::cout << " [" << rang::style::bold << this->_prettyPrintTime(elapse_time) << rang::style::reset << ']';
  }

  if (m_run_number > 1) {
    const double cumulative_elapse_time = elapse_time + m_previous_cumulative_elapse_time;
    std::cout << " (Run number " << m_run_number << ").\n - Cumulative execution time: " << rang::style::bold
              << cumulative_elapse_time << 's' << rang::style::reset;
    if (cumulative_elapse_time > 60) {
      std::cout << " [" << rang::style::bold << this->_prettyPrintTime(cumulative_elapse_time) << rang::style::reset
                << ']';
    }
  }

  std::cout << '\n';
}
......@@ -117,15 +126,25 @@ ExecutionStatManager::_printTotalCPUTime() const
getrusage(RUSAGE_SELF, &u);
const double total_cpu_time =
u.ru_utime.tv_sec + u.ru_stime.tv_sec + (u.ru_utime.tv_usec + u.ru_stime.tv_usec) * 1E-6;
parallel::allReduceSum(u.ru_utime.tv_sec + u.ru_stime.tv_sec + (u.ru_utime.tv_usec + u.ru_stime.tv_usec) * 1E-6);
std::cout << "Total CPU: " << rang::style::bold << parallel::allReduceSum(total_cpu_time) << 's'
<< rang::style::reset;
std::cout << "Total CPU: " << rang::style::bold << total_cpu_time << 's' << rang::style::reset;
std::cout << " (" << parallel::allReduceSum(Kokkos::DefaultHostExecutionSpace::concurrency()) << " threads over "
<< parallel::size() << " processes)";
if (total_cpu_time > 60) {
std::cout << " [" << _prettyPrintTime(total_cpu_time) << ']';
}
if (m_run_number > 1) {
const double cumulative_total_cpu_time = total_cpu_time + m_previous_cumulative_total_cpu_time;
std::cout << "\n - Cumulative total CPU: " << rang::style::bold << cumulative_total_cpu_time << 's'
<< rang::style::reset;
if (cumulative_total_cpu_time > 60) {
std::cout << " [" << rang::style::bold << this->_prettyPrintTime(cumulative_total_cpu_time) << rang::style::reset
<< ']';
}
}
std::cout << '\n';
}
......@@ -142,6 +161,24 @@ ExecutionStatManager::printInfo()
}
}
// Returns the elapsed time, in seconds, accumulated over all runs: the value
// stored for previous runs plus the elapsed time of the current run.
double
ExecutionStatManager::getCumulativeElapseTime() const
{
  const double current_run_elapse_time = m_elapse_time.seconds();
  return m_previous_cumulative_elapse_time + current_run_elapse_time;
}
// Returns the total CPU time, in seconds, accumulated over all runs: the value
// stored for previous runs plus the CPU time (user + system) of the current
// run, summed with parallel::allReduceSum.
double
ExecutionStatManager::getCumulativeTotalCPUTime() const
{
  rusage usage;
  getrusage(RUSAGE_SELF, &usage);

  // User and system times are reported as (seconds, microseconds) pairs.
  const auto whole_seconds = usage.ru_utime.tv_sec + usage.ru_stime.tv_sec;
  const auto micro_seconds = usage.ru_utime.tv_usec + usage.ru_stime.tv_usec;
  const double local_cpu_time = whole_seconds + micro_seconds * 1E-6;

  const double current_run_cpu_time = parallel::allReduceSum(local_cpu_time);
  return m_previous_cumulative_total_cpu_time + current_run_cpu_time;
}
void
ExecutionStatManager::create()
{
......
......@@ -13,6 +13,10 @@ class ExecutionStatManager
bool m_do_print = true;
int m_exit_code = 0;
size_t m_run_number = 1;
double m_previous_cumulative_elapse_time = 0;
double m_previous_cumulative_total_cpu_time = 0;
std::string _prettyPrintTime(double seconds) const;
void _printMaxResidentMemory() const;
......@@ -25,6 +29,24 @@ class ExecutionStatManager
~ExecutionStatManager() = default;
public:
double getCumulativeElapseTime() const;
double getCumulativeTotalCPUTime() const;
// Stores the elapsed time (in seconds) accumulated by previous runs; it is
// added to the current run's elapsed time by getCumulativeElapseTime().
PUGS_INLINE void
setPreviousCumulativeElapseTime(double cumulative_elapse_time)
{
  this->m_previous_cumulative_elapse_time = cumulative_elapse_time;
}
// Stores the total CPU time (in seconds) accumulated by previous runs; it is
// added to the current run's CPU time by getCumulativeTotalCPUTime().
PUGS_INLINE void
setPreviousCumulativeTotalCPUTime(double cumulative_cpu_time)
{
  this->m_previous_cumulative_total_cpu_time = cumulative_cpu_time;
}
PUGS_INLINE
bool
doPrint() const
......@@ -39,6 +61,20 @@ class ExecutionStatManager
m_do_print = do_print;
}
// Returns the stored run number (m_run_number, initialized to 1).
PUGS_INLINE size_t
runNumber() const
{
  return this->m_run_number;
}
// Overrides the stored run number (m_run_number).
PUGS_INLINE void
setRunNumber(size_t run_number)
{
  this->m_run_number = run_number;
}
PUGS_INLINE
int
exitCode() const
......
......@@ -15,6 +15,7 @@
#include <language/utils/ASTCheckpointsInfo.hpp>
#include <utils/Exceptions.hpp>
#include <utils/ExecutionStatManager.hpp>
#include <utils/checkpointing/ResumingManager.hpp>
#ifdef PUGS_HAS_HDF5
......@@ -63,13 +64,21 @@ checkpoint()
checkpoint.createAttribute("data.pgs", ASTExecutionStack::getInstance().fileContent());
{
HighFive::Group random_seed = checkpoint.createGroup("singleton/random_seed");
random_seed.createAttribute("current_seed", RandomEngine::instance().getCurrentSeed());
HighFive::Group global_variables_group = checkpoint.createGroup("singleton/global_variables");
global_variables_group.createAttribute("connectivity_id", GlobalVariableManager::instance().getConnectivityId());
global_variables_group.createAttribute("mesh_id", GlobalVariableManager::instance().getMeshId());
}
{
HighFive::Group global_variables = checkpoint.createGroup("singleton/global_variables");
global_variables.createAttribute("connectivity_id", GlobalVariableManager::instance().getConnectivityId());
global_variables.createAttribute("mesh_id", GlobalVariableManager::instance().getMeshId());
HighFive::Group random_seed_group = checkpoint.createGroup("singleton/random_seed");
random_seed_group.createAttribute("current_seed", RandomEngine::instance().getCurrentSeed());
}
{
  // Save the execution statistics (run number and cumulative timings) as
  // attributes of the "singleton/execution_info" group.
  const auto& stat_manager = ExecutionStatManager::getInstance();

  HighFive::Group execution_info_group = checkpoint.createGroup("singleton/execution_info");
  execution_info_group.createAttribute("run_number", stat_manager.runNumber());
  execution_info_group.createAttribute("cumulative_elapse_time", stat_manager.getCumulativeElapseTime());
  execution_info_group.createAttribute("cumulative_total_cpu_time", stat_manager.getCumulativeTotalCPUTime());
}
{
std::cout << rang::fgB::magenta << "Checkpoint DualConnectivityManager NIY" << rang::fg::reset << '\n';
......
......@@ -18,6 +18,7 @@
#ifdef PUGS_HAS_HDF5
#include <mesh/Connectivity.hpp>
#include <utils/ExecutionStatManager.hpp>
#include <utils/RandomEngine.hpp>
#include <utils/checkpointing/ResumeUtils.hpp>
#include <utils/checkpointing/ResumingData.hpp>
......@@ -50,10 +51,21 @@ resume()
<< checkpoint.getAttribute("name").read<std::string>() << rang::fg::reset << "]\n";
{
HighFive::Group random_seed = checkpoint.getGroup("singleton/random_seed");
RandomEngine::instance().setRandomSeed(random_seed.getAttribute("current_seed").read<uint64_t>());
HighFive::Group random_seed_group = checkpoint.getGroup("singleton/random_seed");
RandomEngine::instance().setRandomSeed(random_seed_group.getAttribute("current_seed").read<uint64_t>());
}
{
  // Read back the execution statistics stored in the "singleton/execution_info"
  // group and hand them to the ExecutionStatManager.
  HighFive::Group execution_info_group = checkpoint.getGroup("singleton/execution_info");

  const size_t run_number = execution_info_group.getAttribute("run_number").read<size_t>();
  const double cumulative_elapse_time = execution_info_group.getAttribute("cumulative_elapse_time").read<double>();
  const double cumulative_total_cpu_time =
    execution_info_group.getAttribute("cumulative_total_cpu_time").read<double>();

  // The current execution is the run following the one that wrote the checkpoint.
  ExecutionStatManager::getInstance().setRunNumber(run_number + 1);
  ExecutionStatManager::getInstance().setPreviousCumulativeElapseTime(cumulative_elapse_time);
  ExecutionStatManager::getInstance().setPreviousCumulativeTotalCPUTime(cumulative_total_cpu_time);
}
{
std::cout << rang::fgB::magenta << "Resume DualConnectivityManager NIY" << rang::fg::reset << '\n';
std::cout << rang::fgB::magenta << "Resume DualMeshManager NIY" << rang::fg::reset << '\n';
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment