Skip to content
Snippets Groups Projects
Commit 85b25599 authored by Stéphane Del Pino's avatar Stéphane Del Pino
Browse files

Add new executable `pugs_checkpoint`

It allows simple manipulation of checkpoint files:
- get list of available checkpoints
- change resuming checkpoint
- get the script that was used to produce a given checkpoint
parent b4746f6d
No related branches found
No related tags found
1 merge request!199Integrate checkpointing
...@@ -521,7 +521,7 @@ if("${CMAKE_BUILD_TYPE}" STREQUAL "Coverage") ...@@ -521,7 +521,7 @@ if("${CMAKE_BUILD_TYPE}" STREQUAL "Coverage")
COMMAND ${FASTCOV} --gcov "${GCOV_BIN}" COMMAND ${FASTCOV} --gcov "${GCOV_BIN}"
--include "${PUGS_SOURCE_DIR}/src" --include "${PUGS_SOURCE_DIR}/src"
--exclude "${PUGS_SOURCE_DIR}/src/main.cpp" "${PUGS_SOURCE_DIR}/src/utils/BacktraceManager.*" "${PUGS_SOURCE_DIR}/src/utils/FPEManager.*" "${PUGS_SOURCE_DIR}/src/utils/SignalManager.*" --exclude "${PUGS_SOURCE_DIR}/src/main.cpp" "${PUGS_SOURCE_DIR}/src/utils/BacktraceManager.*" "${PUGS_SOURCE_DIR}/src/utils/FPEManager.*" "${PUGS_SOURCE_DIR}/src/utils/SignalManager.*" "${PUGS_SOURCE_DIR}/src/utils/checkpointing/pugs_checkpoint_main.cpp"
--lcov -o coverage.info -n --lcov -o coverage.info -n
COMMAND ${LCOV} --gcov "${GCOV_BIN}" --list coverage.info COMMAND ${LCOV} --gcov "${GCOV_BIN}" --list coverage.info
...@@ -623,6 +623,39 @@ target_link_libraries( ...@@ -623,6 +623,39 @@ target_link_libraries(
stdc++fs stdc++fs
) )
# Checkpoint management tool
# Builds the standalone `pugs_checkpoint` executable from its own main file
# (the checkpointing helpers it calls come from the linked libraries below).
add_executable(
pugs_checkpoint
src/utils/checkpointing/pugs_checkpoint_main.cpp
)
# Link the tool against the Pugs internal libraries first, then against the
# external dependencies (Kokkos, PETSc, SLEPc, ParMETIS, MPI, OpenMP, HighFive).
# NOTE(review): this list presumably mirrors the one of the main `pugs` target;
# confirm that both lists are kept in sync when a library is added or removed.
target_link_libraries(
pugs_checkpoint
PugsCheckpointing
PugsUtils
PugsMesh
PugsOutput
PugsLanguage
PugsLanguageAST
PugsLanguageModules
PugsLanguageUtils
PugsScheme
PugsDev
PugsAnalysis
PugsAlgebra
Kokkos::kokkos
${PETSC_LIBRARIES}
${SLEPC_LIBRARIES}
${PARMETIS_LIBRARIES}
${MPI_CXX_LINK_FLAGS} ${MPI_CXX_LIBRARIES}
${KOKKOS_CXX_FLAGS}
${OPENMP_LINK_FLAGS}
${PUGS_STD_LINK_FLAGS}
${HIGHFIVE_TARGET}
stdc++fs
)
# -------------------- Documentation -------------------- # -------------------- Documentation --------------------
include(PugsDoc) include(PugsDoc)
...@@ -634,6 +667,7 @@ include(PugsDoxygen) ...@@ -634,6 +667,7 @@ include(PugsDoxygen)
# -------------------- Installation --------------------- # -------------------- Installation ---------------------
install(TARGETS install(TARGETS
pugs pugs
pugs_checkpoint
PugsMesh PugsMesh
PugsAlgebra PugsAlgebra
PugsAnalysis PugsAnalysis
......
...@@ -13,7 +13,6 @@ ...@@ -13,7 +13,6 @@
#include <utils/RevisionInfo.hpp> #include <utils/RevisionInfo.hpp>
#include <utils/SLEPcWrapper.hpp> #include <utils/SLEPcWrapper.hpp>
#include <utils/SignalManager.hpp> #include <utils/SignalManager.hpp>
#include <utils/checkpointing/PrintCheckpointInfo.hpp>
#include <utils/checkpointing/ResumingManager.hpp> #include <utils/checkpointing/ResumingManager.hpp>
#include <utils/pugs_build_info.hpp> #include <utils/pugs_build_info.hpp>
...@@ -91,8 +90,6 @@ initialize(int& argc, char* argv[]) ...@@ -91,8 +90,6 @@ initialize(int& argc, char* argv[])
bool enable_signals = true; bool enable_signals = true;
int nb_threads = -1; int nb_threads = -1;
bool print_checkpoint_info = false;
ParallelChecker::Mode pc_mode = ParallelChecker::Mode::automatic; ParallelChecker::Mode pc_mode = ParallelChecker::Mode::automatic;
std::string pc_filename = ParallelChecker::instance().filename(); std::string pc_filename = ParallelChecker::instance().filename();
...@@ -105,8 +102,6 @@ initialize(int& argc, char* argv[]) ...@@ -105,8 +102,6 @@ initialize(int& argc, char* argv[])
bool is_resuming = false; bool is_resuming = false;
app.add_flag("--resume", is_resuming, "Resume at checkpoint"); app.add_flag("--resume", is_resuming, "Resume at checkpoint");
app.add_flag("--print-checkpoint-info", print_checkpoint_info, "Print checkpoint info and exit");
app.set_version_flag("-v,--version", []() { app.set_version_flag("-v,--version", []() {
ConsoleManager::init(true); ConsoleManager::init(true);
std::stringstream os; std::stringstream os;
...@@ -225,19 +220,12 @@ initialize(int& argc, char* argv[]) ...@@ -225,19 +220,12 @@ initialize(int& argc, char* argv[])
std::cout << "-------------------------------------------------------\n"; std::cout << "-------------------------------------------------------\n";
} }
if (print_checkpoint_info) {
printCheckpointInfo(filename);
finalize();
std::exit(0);
}
return filename; return filename;
} }
// LCOV_EXCL_STOP // LCOV_EXCL_STOP
// LCOV_EXCL_START // LCOV_EXCL_START
// This function cannot be unit-tested: run once when pugs stops // This function cannot be unit-tested: run once when pugs stops
void void
......
...@@ -5,9 +5,11 @@ set(checkpointing_SOURCES) ...@@ -5,9 +5,11 @@ set(checkpointing_SOURCES)
list(APPEND checkpointing_SOURCES list(APPEND checkpointing_SOURCES
Checkpoint.cpp Checkpoint.cpp
PrintCheckpointInfo.cpp PrintCheckpointInfo.cpp
PrintScriptFrom.cpp
Resume.cpp Resume.cpp
ResumingManager.cpp ResumingManager.cpp
ResumingUtils.cpp ResumingUtils.cpp
SetResumeFrom.cpp
) )
if(PUGS_HAS_HDF5) if(PUGS_HAS_HDF5)
......
#include <utils/checkpointing/PrintScriptFrom.hpp>
#include <utils/pugs_config.hpp>
#ifdef PUGS_HAS_HDF5
#include <optional>
#include <rang.hpp>
#include <utils/Exceptions.hpp>
#include <utils/HighFivePugsUtils.hpp>
void
printScriptFrom(const std::string& filename, const uint64_t& checkpoint_number)
{
try {
HighFive::File file(filename, HighFive::File::ReadWrite);
const std::string checkpoint_name = "checkpoint_" + std::to_string(checkpoint_number);
if (not file.exist(checkpoint_name)) {
std::ostringstream error_msg;
error_msg << "cannot find checkpoint " << rang::fgB::magenta << checkpoint_number << rang::fg::reset << " in "
<< rang::fgB::yellow << filename << rang::fg::reset;
throw NormalError(error_msg.str());
}
HighFive::Group checkpoint = file.getGroup(checkpoint_name);
std::cout << checkpoint.getAttribute("data.pgs").read<std::string>();
}
catch (HighFive::Exception& e) {
throw NormalError(e.what());
}
}
#else // PUGS_HAS_HDF5
void
printScriptFrom(const std::string&, const uint64_t&)
{
std::cerr << rang::fgB::red << "error: " << rang::fg::reset << "printing checkpoint's script requires HDF5\n";
}
#endif // PUGS_HAS_HDF5
#ifndef PRINT_SCRIPT_FROM_HPP
#define PRINT_SCRIPT_FROM_HPP
#include <cstdint>
#include <string>
// Prints to standard output the script that was stored with checkpoint number
// `checkpoint_number` in the checkpoint file `filename`.
// In HDF5-enabled builds, a NormalError is thrown if the checkpoint cannot be
// found or if the file cannot be read; without HDF5 an error message is
// printed to standard error instead.
void printScriptFrom(const std::string& filename, const uint64_t& checkpoint_number);
#endif // PRINT_SCRIPT_FROM_HPP
#include <utils/checkpointing/SetResumeFrom.hpp>
#include <utils/pugs_config.hpp>
#ifdef PUGS_HAS_HDF5
#include <optional>
#include <rang.hpp>
#include <utils/Exceptions.hpp>
#include <utils/HighFivePugsUtils.hpp>
void
setResumeFrom(const std::string& filename, const uint64_t& checkpoint_number)
{
try {
HighFive::File file(filename, HighFive::File::ReadWrite);
const std::string checkpoint_name = "checkpoint_" + std::to_string(checkpoint_number);
if (not file.exist(checkpoint_name)) {
std::ostringstream error_msg;
error_msg << "cannot find checkpoint " << rang::fgB::magenta << checkpoint_number << rang::fg::reset << " in "
<< rang::fgB::yellow << filename << rang::fg::reset;
throw NormalError(error_msg.str());
}
HighFive::Group checkpoint = file.getGroup(checkpoint_name);
if (file.exist("resuming_checkpoint")) {
file.unlink("resuming_checkpoint");
}
file.createHardLink("resuming_checkpoint", checkpoint);
std::cout << "Resuming checkpoint " << rang::style::bold << "successfully" << rang::style::reset << " set to "
<< rang::fgB::yellow << checkpoint_number << rang::fg::reset << '\n';
}
catch (HighFive::Exception& e) {
throw NormalError(e.what());
}
}
#else // PUGS_HAS_HDF5
void
setResumeFrom(const std::string&, const uint64_t&)
{
std::cerr << rang::fgB::red << "error: " << rang::fg::reset << "setting resuming checkpoint requires HDF5\n";
}
#endif // PUGS_HAS_HDF5
#ifndef SET_RESUME_FROM_HPP
#define SET_RESUME_FROM_HPP
#include <cstdint>
#include <string>
// Makes checkpoint number `checkpoint_number` of the checkpoint file
// `filename` the one referenced by the file's "resuming_checkpoint" link.
// In HDF5-enabled builds, a NormalError is thrown if the checkpoint cannot be
// found or if the file cannot be updated; without HDF5 an error message is
// printed to standard error instead.
void setResumeFrom(const std::string& filename, const uint64_t& checkpoint_number);
#endif // SET_RESUME_FROM_HPP
#include <CLI/CLI.hpp>
#include <rang.hpp>
#include <utils/ConsoleManager.hpp>
#include <utils/Messenger.hpp>
#include <utils/checkpointing/PrintCheckpointInfo.hpp>
#include <utils/checkpointing/PrintScriptFrom.hpp>
#include <utils/checkpointing/SetResumeFrom.hpp>
int
main(int argc, char* argv[])
{
std::string filename;
CLI::App app{"pugs_checkpoint help"};
app.description("A collection of simple tools to manage checkpoint/resume files for pugs.");
app.add_option("filename", filename, "pugs checkpoint file (HDF5)")->check(CLI::ExistingFile)->required();
bool print_info = false;
auto info_flag = app.add_flag("--info", print_info, "Print checkpoints info");
uint64_t checkpoint_number = 0;
auto resume_from =
app.add_option("--resume-from", checkpoint_number, "Use the given checkpoint number for next resume")
->excludes(info_flag);
auto get_script_resume_from =
app.add_option("--get-script-from", checkpoint_number, "Print script file used for a given checkpoint number")
->excludes(info_flag)
->excludes(resume_from);
std::atexit([]() { std::cout << rang::style::reset; });
try {
app.parse(argc, argv);
}
catch (const CLI::ParseError& e) {
std::exit(app.exit(e, std::cout, std::cerr));
}
parallel::Messenger::create(argc, argv);
ConsoleManager::init(true);
try {
if (*info_flag) {
printCheckpointInfo(filename);
} else if (*resume_from) {
setResumeFrom(filename, checkpoint_number);
} else if (*get_script_resume_from) {
printScriptFrom(filename, checkpoint_number);
}
}
catch (const std::runtime_error& e) {
std::cerr << e.what() << '\n';
parallel::Messenger::destroy();
std::exit(1);
}
parallel::Messenger::destroy();
return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment