diff --git a/CMakeLists.txt b/CMakeLists.txt index 8deb3a4925634b7a5a8b65e60924e198381d03d4..54f37abc8c7e2ded5d9472187475f464811ffd3d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -521,7 +521,7 @@ if("${CMAKE_BUILD_TYPE}" STREQUAL "Coverage") COMMAND ${FASTCOV} --gcov "${GCOV_BIN}" --include "${PUGS_SOURCE_DIR}/src" - --exclude "${PUGS_SOURCE_DIR}/src/main.cpp" "${PUGS_SOURCE_DIR}/src/utils/BacktraceManager.*" "${PUGS_SOURCE_DIR}/src/utils/FPEManager.*" "${PUGS_SOURCE_DIR}/src/utils/SignalManager.*" + --exclude "${PUGS_SOURCE_DIR}/src/main.cpp" "${PUGS_SOURCE_DIR}/src/utils/BacktraceManager.*" "${PUGS_SOURCE_DIR}/src/utils/FPEManager.*" "${PUGS_SOURCE_DIR}/src/utils/SignalManager.*" "${PUGS_SOURCE_DIR}/src/utils/checkpointing/pugs_checkpoint_main.cpp" --lcov -o coverage.info -n COMMAND ${LCOV} --gcov "${GCOV_BIN}" --list coverage.info @@ -623,6 +623,39 @@ target_link_libraries( stdc++fs ) +# Checkpoint management tool +add_executable( + pugs_checkpoint + src/utils/checkpointing/pugs_checkpoint_main.cpp + ) + +target_link_libraries( + pugs_checkpoint + PugsCheckpointing + PugsUtils + PugsMesh + PugsOutput + PugsLanguage + PugsLanguageAST + PugsLanguageModules + PugsLanguageUtils + PugsScheme + PugsDev + PugsAnalysis + PugsAlgebra + Kokkos::kokkos + ${PETSC_LIBRARIES} + ${SLEPC_LIBRARIES} + ${PARMETIS_LIBRARIES} + ${MPI_CXX_LINK_FLAGS} ${MPI_CXX_LIBRARIES} + ${KOKKOS_CXX_FLAGS} + ${OPENMP_LINK_FLAGS} + ${PUGS_STD_LINK_FLAGS} + ${HIGHFIVE_TARGET} + stdc++fs + ) + + # -------------------- Documentation -------------------- include(PugsDoc) @@ -634,6 +667,7 @@ include(PugsDoxygen) # -------------------- Installation --------------------- install(TARGETS pugs + pugs_checkpoint PugsMesh PugsAlgebra PugsAnalysis diff --git a/src/utils/PugsUtils.cpp b/src/utils/PugsUtils.cpp index ed6f6803de0ace073192f155fbef1883bfe8e7a4..f9b6c94985d4e273aa409c7e6aa50e27ef58d01d 100644 --- a/src/utils/PugsUtils.cpp +++ b/src/utils/PugsUtils.cpp @@ -13,7 +13,6 @@ #include 
<utils/RevisionInfo.hpp> #include <utils/SLEPcWrapper.hpp> #include <utils/SignalManager.hpp> -#include <utils/checkpointing/PrintCheckpointInfo.hpp> #include <utils/checkpointing/ResumingManager.hpp> #include <utils/pugs_build_info.hpp> @@ -91,8 +90,6 @@ initialize(int& argc, char* argv[]) bool enable_signals = true; int nb_threads = -1; - bool print_checkpoint_info = false; - ParallelChecker::Mode pc_mode = ParallelChecker::Mode::automatic; std::string pc_filename = ParallelChecker::instance().filename(); @@ -105,8 +102,6 @@ initialize(int& argc, char* argv[]) bool is_resuming = false; app.add_flag("--resume", is_resuming, "Resume at checkpoint"); - app.add_flag("--print-checkpoint-info", print_checkpoint_info, "Print checkpoint info and exit"); - app.set_version_flag("-v,--version", []() { ConsoleManager::init(true); std::stringstream os; @@ -225,19 +220,12 @@ initialize(int& argc, char* argv[]) std::cout << "-------------------------------------------------------\n"; } - if (print_checkpoint_info) { - printCheckpointInfo(filename); - finalize(); - std::exit(0); - } - return filename; } // LCOV_EXCL_STOP // LCOV_EXCL_START - // This function cannot be unit-tested: run once when pugs stops void diff --git a/src/utils/checkpointing/CMakeLists.txt b/src/utils/checkpointing/CMakeLists.txt index 2c493e227ed375531d8a443a58bcdcf3cf33b82a..b051975e8538df40a2b9b8a8d7f0486874cda938 100644 --- a/src/utils/checkpointing/CMakeLists.txt +++ b/src/utils/checkpointing/CMakeLists.txt @@ -5,9 +5,11 @@ set(checkpointing_SOURCES) list(APPEND checkpointing_SOURCES Checkpoint.cpp PrintCheckpointInfo.cpp + PrintScriptFrom.cpp Resume.cpp ResumingManager.cpp ResumingUtils.cpp + SetResumeFrom.cpp ) if(PUGS_HAS_HDF5) diff --git a/src/utils/checkpointing/PrintScriptFrom.cpp b/src/utils/checkpointing/PrintScriptFrom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4f94012d1f4be84c4062ea55a9ecc92dab35c3ff --- /dev/null +++ b/src/utils/checkpointing/PrintScriptFrom.cpp 
@@ -0,0 +1,53 @@
+#include <utils/checkpointing/PrintScriptFrom.hpp>
+
+#include <utils/pugs_config.hpp>
+
+#include <iostream>
+#include <rang.hpp>
+
+#ifdef PUGS_HAS_HDF5
+
+#include <sstream>
+#include <utils/Exceptions.hpp>
+#include <utils/HighFivePugsUtils.hpp>
+
+// Prints to standard output the "data.pgs" attribute (the script) of
+// the group checkpoint_<checkpoint_number> found in the HDF5 file
+// `filename`.
+//
+// Throws NormalError if this group does not exist in the file, or if
+// HighFive raises an exception.
+void
+printScriptFrom(const std::string& filename, const uint64_t& checkpoint_number)
+{
+  try {
+    // The file is only read: do not ask for write access
+    HighFive::File file(filename, HighFive::File::ReadOnly);
+    const std::string checkpoint_name = "checkpoint_" + std::to_string(checkpoint_number);
+
+    if (not file.exist(checkpoint_name)) {
+      std::ostringstream error_msg;
+      error_msg << "cannot find checkpoint " << rang::fgB::magenta << checkpoint_number << rang::fg::reset << " in "
+                << rang::fgB::yellow << filename << rang::fg::reset;
+      throw NormalError(error_msg.str());
+    }
+
+    HighFive::Group checkpoint = file.getGroup(checkpoint_name);
+    std::cout << checkpoint.getAttribute("data.pgs").read<std::string>();
+  }
+  catch (const HighFive::Exception& e) {
+    throw NormalError(e.what());
+  }
+}
+
+#else   // PUGS_HAS_HDF5
+
+// Variant used when pugs is built without HDF5: only reports on
+// standard error that the feature is unavailable.
+void
+printScriptFrom(const std::string&, const uint64_t&)
+{
+  std::cerr << rang::fgB::red << "error: " << rang::fg::reset << "printing checkpoint's script requires HDF5\n";
+}
+
+#endif   // PUGS_HAS_HDF5
diff --git a/src/utils/checkpointing/PrintScriptFrom.hpp b/src/utils/checkpointing/PrintScriptFrom.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c3807c7af000280698406050ffa3320effb05eb6
--- /dev/null
+++ b/src/utils/checkpointing/PrintScriptFrom.hpp
@@ -0,0 +1,9 @@
+#ifndef PRINT_SCRIPT_FROM_HPP
+#define PRINT_SCRIPT_FROM_HPP
+
+#include <cstdint>
+#include <string>
+
+void printScriptFrom(const std::string& filename, const uint64_t& checkpoint_number);
+
+#endif   // PRINT_SCRIPT_FROM_HPP
diff --git a/src/utils/checkpointing/SetResumeFrom.cpp b/src/utils/checkpointing/SetResumeFrom.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..566b7505d54ed184752ae868863c69ad1ebb0926
--- /dev/null
+++ 
b/src/utils/checkpointing/SetResumeFrom.cpp
@@ -0,0 +1,58 @@
+#include <utils/checkpointing/SetResumeFrom.hpp>
+
+#include <utils/pugs_config.hpp>
+
+#include <iostream>
+#include <rang.hpp>
+
+#ifdef PUGS_HAS_HDF5
+
+#include <sstream>
+#include <utils/Exceptions.hpp>
+#include <utils/HighFivePugsUtils.hpp>
+
+// Makes "resuming_checkpoint" a hard link to the group
+// checkpoint_<checkpoint_number> of the HDF5 file `filename`, and
+// reports the change on standard output.
+//
+// Throws NormalError if this group does not exist in the file, or if
+// HighFive raises an exception.
+void
+setResumeFrom(const std::string& filename, const uint64_t& checkpoint_number)
+{
+  try {
+    HighFive::File file(filename, HighFive::File::ReadWrite);
+    const std::string checkpoint_name = "checkpoint_" + std::to_string(checkpoint_number);
+
+    if (not file.exist(checkpoint_name)) {
+      std::ostringstream error_msg;
+      error_msg << "cannot find checkpoint " << rang::fgB::magenta << checkpoint_number << rang::fg::reset << " in "
+                << rang::fgB::yellow << filename << rang::fg::reset;
+      throw NormalError(error_msg.str());
+    }
+
+    HighFive::Group checkpoint = file.getGroup(checkpoint_name);
+    // Drop the previous link (if any) before creating the new one
+    if (file.exist("resuming_checkpoint")) {
+      file.unlink("resuming_checkpoint");
+    }
+    file.createHardLink("resuming_checkpoint", checkpoint);
+    std::cout << "Resuming checkpoint " << rang::style::bold << "successfully" << rang::style::reset << " set to "
+              << rang::fgB::yellow << checkpoint_number << rang::fg::reset << '\n';
+  }
+  catch (const HighFive::Exception& e) {
+    throw NormalError(e.what());
+  }
+}
+
+#else   // PUGS_HAS_HDF5
+
+// Variant used when pugs is built without HDF5: only reports on
+// standard error that the feature is unavailable.
+void
+setResumeFrom(const std::string&, const uint64_t&)
+{
+  std::cerr << rang::fgB::red << "error: " << rang::fg::reset << "setting resuming checkpoint requires HDF5\n";
+}
+
+#endif   // PUGS_HAS_HDF5
diff --git a/src/utils/checkpointing/SetResumeFrom.hpp b/src/utils/checkpointing/SetResumeFrom.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c8e44e6e4319832d9018a1569f003bed509ab5d2
--- /dev/null
+++ b/src/utils/checkpointing/SetResumeFrom.hpp
@@ -0,0 +1,9 @@
+#ifndef SET_RESUME_FROM_HPP
+#define SET_RESUME_FROM_HPP
+
+#include <cstdint>
+#include <string>
+
+void setResumeFrom(const std::string& filename, const uint64_t& checkpoint_number);
+
+#endif   // SET_RESUME_FROM_HPP
diff --git a/src/utils/checkpointing/pugs_checkpoint_main.cpp b/src/utils/checkpointing/pugs_checkpoint_main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..982b11159ef7e68e4d437dc77aaa5c4c5b36b32a
--- /dev/null
+++ b/src/utils/checkpointing/pugs_checkpoint_main.cpp
@@ -0,0 +1,79 @@
+#include <CLI/CLI.hpp>
+#include <rang.hpp>
+#include <utils/ConsoleManager.hpp>
+#include <utils/Messenger.hpp>
+#include <utils/checkpointing/PrintCheckpointInfo.hpp>
+#include <utils/checkpointing/PrintScriptFrom.hpp>
+#include <utils/checkpointing/SetResumeFrom.hpp>
+
+#include <cstdint>
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+#include <string>
+
+// Command-line tool to manage pugs checkpoint files.
+//
+// Parses the command line, then performs at most one of the actions
+// --info, --resume-from or --get-script-from on the given file. If no
+// action is requested, the help message is printed.
+//
+// Returns 0 on success and 1 if the selected action throws.
+int
+main(int argc, char* argv[])
+{
+  std::string filename;
+
+  CLI::App app{"A collection of simple tools to manage checkpoint/resume files for pugs."};
+  app.add_option("filename", filename, "pugs checkpoint file (HDF5)")->check(CLI::ExistingFile)->required();
+
+  bool print_info = false;
+  auto info_flag  = app.add_flag("--info", print_info, "Print checkpoints info");
+
+  // --resume-from and --get-script-from exclude each other, so they can
+  // safely store their argument in the same variable
+  uint64_t checkpoint_number = 0;
+  auto resume_from =
+    app.add_option("--resume-from", checkpoint_number, "Use the given checkpoint number for next resume")
+      ->excludes(info_flag);
+
+  auto get_script_from =
+    app.add_option("--get-script-from", checkpoint_number, "Print script file used for a given checkpoint number")
+      ->excludes(info_flag)
+      ->excludes(resume_from);
+
+  // Reset the terminal style on normal program termination
+  std::atexit([]() { std::cout << rang::style::reset; });
+  try {
+    app.parse(argc, argv);
+  }
+  catch (const CLI::ParseError& e) {
+    std::exit(app.exit(e, std::cout, std::cerr));
+  }
+
+  parallel::Messenger::create(argc, argv);
+  ConsoleManager::init(true);
+
+  int exit_code = 0;
+  try {
+    if (*info_flag) {
+      printCheckpointInfo(filename);
+    } else if (*resume_from) {
+      setResumeFrom(filename, checkpoint_number);
+    } else if (*get_script_from) {
+      printScriptFrom(filename, checkpoint_number);
+    } else {
+      // No action requested: print usage instead of silently doing nothing
+      std::cout << app.help();
+    }
+  }
+  catch (const std::exception& e) {
+    // Do not exit here so that the messenger is always destroyed below
+    std::cerr << e.what() << '\n';
+    exit_code = 1;
+  }
+
+  parallel::Messenger::destroy();
+
+  return exit_code;
+}