// Checkpoint.cpp
#include <utils/checkpointing/Checkpoint.hpp>
#include <utils/pugs_config.hpp>
#ifdef PUGS_HAS_HDF5
#include <utils/HighFivePugsUtils.hpp>
#include <language/ast/ASTExecutionStack.hpp>
#include <language/utils/SymbolTable.hpp>
#include <iostream>
#include <map>
#endif // PUGS_HAS_HDF5
#include <language/utils/ASTCheckpointsInfo.hpp>
#include <utils/Exceptions.hpp>
#include <utils/ExecutionStatManager.hpp>
#include <utils/checkpointing/ResumingManager.hpp>
#ifdef PUGS_HAS_HDF5
#include <algebra/LinearSolverOptions.hpp>
#include <language/utils/ASTNodeDataTypeTraits.hpp>
#include <language/utils/CheckpointResumeRepository.hpp>
#include <language/utils/DataHandler.hpp>
#include <mesh/MeshVariant.hpp>
#include <utils/GlobalVariableManager.hpp>
#include <utils/RandomEngine.hpp>
#include <utils/checkpointing/LinearSolverOptionsHFType.hpp>
void
checkpoint()
{
try {
auto create_props = HighFive::FileCreateProps{};
create_props.add(HighFive::FileSpaceStrategy(H5F_FSPACE_STRATEGY_FSM_AGGR, true, 0));
HighFive::FileAccessProps fapl;
fapl.add(HighFive::MPIOFileAccess{MPI_COMM_WORLD, MPI_INFO_NULL});
fapl.add(HighFive::MPIOCollectiveMetadata{});
uint64_t& checkpoint_number = ResumingManager::getInstance().checkpointNumber();
const auto file_openmode = (checkpoint_number == 0) ? HighFive::File::Truncate : HighFive::File::ReadWrite;
HighFive::File file("checkpoint.h5", file_openmode, create_props, fapl);
std::string checkpoint_name = "checkpoint_" + std::to_string(checkpoint_number);
HighFive::Group checkpoint = file.createGroup(checkpoint_name);
uint64_t checkpoint_id =
ASTCheckpointsInfo::getInstance().getCheckpointId((ASTExecutionStack::getInstance().currentNode()));
std::string time = [] {
std::ostringstream os;
auto t = std::time(nullptr);
os << std::put_time(std::localtime(&t), "%c");
return os.str();
}();
checkpoint.createAttribute("creation_date", time);
checkpoint.createAttribute("name", checkpoint_name);
checkpoint.createAttribute("id", checkpoint_id);
checkpoint.createAttribute("data.pgs", ASTExecutionStack::getInstance().fileContent());
{
HighFive::Group global_variables_group = checkpoint.createGroup("singleton/global_variables");
global_variables_group.createAttribute("connectivity_id", GlobalVariableManager::instance().getConnectivityId());
global_variables_group.createAttribute("mesh_id", GlobalVariableManager::instance().getMeshId());
}
{
HighFive::Group random_seed_group = checkpoint.createGroup("singleton/random_seed");
random_seed_group.createAttribute("current_seed", RandomEngine::instance().getCurrentSeed());
}
{
HighFive::Group execution_info_group = checkpoint.createGroup("singleton/execution_info");
execution_info_group.createAttribute("run_number", ExecutionStatManager::getInstance().runNumber());
execution_info_group.createAttribute("cumulative_elapse_time",
ExecutionStatManager::getInstance().getCumulativeElapseTime());
execution_info_group.createAttribute("cumulative_total_cpu_time",
ExecutionStatManager::getInstance().getCumulativeTotalCPUTime());
}
{
HighFive::Group linear_solver_options_default_group =
checkpoint.createGroup("singleton/linear_solver_options_default");
const LinearSolverOptions& default_options = LinearSolverOptions::default_options;
linear_solver_options_default_group.createAttribute("epsilon", default_options.epsilon());
linear_solver_options_default_group.createAttribute("maximum_iteration", default_options.maximumIteration());
linear_solver_options_default_group.createAttribute("verbose", default_options.verbose());
linear_solver_options_default_group.createAttribute("library", default_options.library());
linear_solver_options_default_group.createAttribute("method", default_options.method());
linear_solver_options_default_group.createAttribute("precond", default_options.precond());
}
{
std::cout << rang::fgB::magenta << "Checkpoint DualConnectivityManager NIY" << rang::fg::reset << '\n';
std::cout << rang::fgB::magenta << "Checkpoint DualMeshManager NIY" << rang::fg::reset << '\n';
}
std::shared_ptr<const SymbolTable> p_symbol_table = ASTExecutionStack::getInstance().currentNode().m_symbol_table;
auto symbol_table_group = checkpoint;
size_t symbol_table_id = 0;
while (p_symbol_table.use_count() > 0) {
symbol_table_group = symbol_table_group.createGroup("symbol table");
const SymbolTable& symbol_table = *p_symbol_table;
const auto& symbol_list = symbol_table.symbolList();
for (auto& symbol : symbol_list) {
switch (symbol.attributes().dataType()) {
case ASTNodeDataType::builtin_function_t:
case ASTNodeDataType::type_name_id_t: {
break;
}
case ASTNodeDataType::function_t: {
HighFive::Group function_group = checkpoint.createGroup("functions/" + symbol.name());
function_group.createAttribute("id", std::get<size_t>(symbol.attributes().value()));
function_group.createAttribute("symbol_table_id", symbol_table_id);
break;
}
default: {
if ((symbol_table.has(symbol.name(), ASTExecutionStack::getInstance().currentNode().begin())) and
(not symbol.attributes().isModuleVariable())) {
std::visit(
[&](auto&& data) {
using DataT = std::decay_t<decltype(data)>;
if constexpr (std::is_same_v<DataT, std::monostate>) {
} else if constexpr ((std::is_arithmetic_v<DataT>) or (std::is_same_v<DataT, std::string>) or
(is_tiny_vector_v<DataT>) or (is_tiny_matrix_v<DataT>)) {
symbol_table_group.createAttribute(symbol.name(), data);
} else if constexpr (std::is_same_v<DataT, EmbeddedData>) {
CheckpointResumeRepository::instance().checkpoint(symbol.attributes().dataType(), symbol.name(), data,
file, checkpoint, symbol_table_group);
} else if constexpr (is_std_vector_v<DataT>) {
using value_type = typename DataT::value_type;
if constexpr ((std::is_arithmetic_v<value_type>) or (std::is_same_v<value_type, std::string>) or
(is_tiny_vector_v<value_type>) or (is_tiny_matrix_v<value_type>)) {
symbol_table_group.createAttribute(symbol.name(), data);
} else if constexpr (std::is_same_v<value_type, EmbeddedData>) {
symbol_table_group.createGroup("embedded/" + symbol.name())
.createAttribute("type", dataTypeName(symbol.attributes().dataType()));
for (size_t i = 0; i < data.size(); ++i) {
CheckpointResumeRepository::instance().checkpoint(symbol.attributes().dataType().contentType(),
symbol.name() + "/" + std::to_string(i),
data[i], file, checkpoint, symbol_table_group);
}
} else {
throw UnexpectedError("unexpected data type");
}
} else {
throw UnexpectedError("unexpected data type");
}
},
symbol.attributes().value());
}
}
}
}
p_symbol_table = symbol_table.parentTable();
++symbol_table_id;
}
if (file.exist("last_checkpoint")) {
file.unlink("last_checkpoint");
}
file.createHardLink("last_checkpoint", checkpoint);
if (file.exist("resuming_checkpoint")) {
file.unlink("resuming_checkpoint");
}
file.createHardLink("resuming_checkpoint", checkpoint);
if (file.hasAttribute("checkpoint_number")) {
file.deleteAttribute("checkpoint_number");
}
file.createAttribute("checkpoint_number", checkpoint_number);
++checkpoint_number;
}
catch (HighFive::Exception& e) {
throw NormalError(e.what());
}
}
#else // PUGS_HAS_HDF5
/**
 * Fallback compiled when PUGS_HAS_HDF5 is not defined: no checkpoint can be
 * written, so the call always fails.
 *
 * @throws NormalError unconditionally.
 */
void
checkpoint()
{
  const char* const reason = "checkpoint/resume mechanism requires HDF5";
  throw NormalError(reason);
}
#endif // PUGS_HAS_HDF5