From 7f4c625d3690347ea6e28774bf11fa198cb3b406 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Del=20Pino?= <stephane.delpino44@gmail.com> Date: Wed, 16 Oct 2024 16:14:54 +0200 Subject: [PATCH] Prepare use of slurm library for timeout checkpointing [ci-skip] --- src/language/modules/CoreModule.cpp | 28 ++-------------- src/utils/CMakeLists.txt | 3 +- src/utils/Stop.cpp | 50 +++++++++++++++++++++++++++++ src/utils/Stop.hpp | 6 ++++ 4 files changed, 60 insertions(+), 27 deletions(-) create mode 100644 src/utils/Stop.cpp create mode 100644 src/utils/Stop.hpp diff --git a/src/language/modules/CoreModule.cpp b/src/language/modules/CoreModule.cpp index 85dfdcd94..d7b331a1d 100644 --- a/src/language/modules/CoreModule.cpp +++ b/src/language/modules/CoreModule.cpp @@ -31,10 +31,10 @@ #include <language/utils/UnaryOperatorRegisterForRn.hpp> #include <language/utils/UnaryOperatorRegisterForRnxn.hpp> #include <language/utils/UnaryOperatorRegisterForZ.hpp> -#include <utils/ExecutionStatManager.hpp> #include <utils/Messenger.hpp> #include <utils/PugsUtils.hpp> #include <utils/RandomEngine.hpp> +#include <utils/Stop.hpp> #include <utils/checkpointing/Checkpoint.hpp> #include <utils/checkpointing/ReadOStream.hpp> @@ -146,31 +146,7 @@ CoreModule::CoreModule() : BuiltinModule(true) )); - this->_addBuiltinFunction("stop", - std::function( - - []() -> bool { - bool has_stop_file = false; - - if (parallel::rank() == 0) { - std::filesystem::path stop_file("stop"); - if (std::filesystem::exists(stop_file)) { - const double elapse_time = ExecutionStatManager::getInstance().getElapseTime(); - - const double stop_file_age = std::chrono::duration_cast<std::chrono::seconds>( - std::filesystem::file_time_type::clock::now() - - std::filesystem::last_write_time(stop_file)) - .count(); - - has_stop_file = elapse_time > stop_file_age; - } - } - parallel::broadcast(has_stop_file, 0); - - return has_stop_file; - } - - )); + this->_addBuiltinFunction("stop", std::function([]() -> bool { return stop(); })); this->_addNameValue("cout", ast_node_data_type_from<std::shared_ptr<const OStream>>, EmbeddedData{std::make_shared<DataHandler<const OStream>>(std::make_shared<OStream>(std::cout))}); diff --git a/src/utils/CMakeLists.txt b/src/utils/CMakeLists.txt index e2ab407c3..13b0db808 100644 --- a/src/utils/CMakeLists.txt +++ b/src/utils/CMakeLists.txt @@ -22,7 +22,8 @@ add_library( RevisionInfo.cpp SignalManager.cpp SLEPcWrapper.cpp - Socket.cpp) + Socket.cpp + Stop.cpp) if(${MPI_FOUND}) target_include_directories(PugsUtils PRIVATE ${PARMETIS_INCLUDE_DIR}) diff --git a/src/utils/Stop.cpp b/src/utils/Stop.cpp new file mode 100644 index 000000000..61e4b2ad0 --- /dev/null +++ b/src/utils/Stop.cpp @@ -0,0 +1,50 @@ +#include <utils/Stop.hpp> + +#include <utils/ExecutionStatManager.hpp> +#include <utils/Messenger.hpp> +#include <utils/pugs_config.hpp> + +#include <filesystem> +#include <iostream> + +#ifdef PUGS_HAS_SLURM +#include <slurm/slurm.h> +#endif // PUGS_HAS_SLURM + +bool +stop() +{ + bool must_stop = false; + + if (parallel::rank() == 0) { + std::filesystem::path stop_file("stop"); + if (std::filesystem::exists(stop_file)) { + const double elapse_time = ExecutionStatManager::getInstance().getElapseTime(); + + const double stop_file_age = + std::chrono::duration_cast<std::chrono::seconds>(std::filesystem::file_time_type::clock::now() - + std::filesystem::last_write_time(stop_file)) + .count(); + + must_stop = elapse_time > stop_file_age; + } + +#ifdef PUGS_HAS_SLURM + char* env = getenv("SLURM_JOB_ID"); + if (env != nullptr) { + slurm_init(nullptr); + int slurm_job_id = std::atoi(env); + + if (slurm_get_rem_time(slurm_job_id) < 150) { + must_stop = true; + } + + slurm_fini(); + } +#endif // PUGS_HAS_SLURM + } + + parallel::broadcast(must_stop, 0); + + return must_stop; +} diff --git a/src/utils/Stop.hpp b/src/utils/Stop.hpp new file mode 100644 index 000000000..5ba32eb64 --- /dev/null +++ b/src/utils/Stop.hpp @@ -0,0 +1,6 @@ +#ifndef STOP_HPP +#define STOP_HPP + +bool stop(); + +#endif // STOP_HPP -- GitLab