Skip to content
Snippets Groups Projects
Commit 7f4c625d authored by Stéphane Del Pino's avatar Stéphane Del Pino
Browse files

Prepare use of slurm library for timeout checkpointing [ci-skip]

parent 16d445be
No related branches found
No related tags found
1 merge request: !199 Integrate checkpointing
...@@ -31,10 +31,10 @@ ...@@ -31,10 +31,10 @@
#include <language/utils/UnaryOperatorRegisterForRn.hpp> #include <language/utils/UnaryOperatorRegisterForRn.hpp>
#include <language/utils/UnaryOperatorRegisterForRnxn.hpp> #include <language/utils/UnaryOperatorRegisterForRnxn.hpp>
#include <language/utils/UnaryOperatorRegisterForZ.hpp> #include <language/utils/UnaryOperatorRegisterForZ.hpp>
#include <utils/ExecutionStatManager.hpp>
#include <utils/Messenger.hpp> #include <utils/Messenger.hpp>
#include <utils/PugsUtils.hpp> #include <utils/PugsUtils.hpp>
#include <utils/RandomEngine.hpp> #include <utils/RandomEngine.hpp>
#include <utils/Stop.hpp>
#include <utils/checkpointing/Checkpoint.hpp> #include <utils/checkpointing/Checkpoint.hpp>
#include <utils/checkpointing/ReadOStream.hpp> #include <utils/checkpointing/ReadOStream.hpp>
...@@ -146,31 +146,7 @@ CoreModule::CoreModule() : BuiltinModule(true) ...@@ -146,31 +146,7 @@ CoreModule::CoreModule() : BuiltinModule(true)
)); ));
this->_addBuiltinFunction("stop", this->_addBuiltinFunction("stop", std::function([]() -> bool { return stop(); }));
std::function(
[]() -> bool {
bool has_stop_file = false;
if (parallel::rank() == 0) {
std::filesystem::path stop_file("stop");
if (std::filesystem::exists(stop_file)) {
const double elapse_time = ExecutionStatManager::getInstance().getElapseTime();
const double stop_file_age = std::chrono::duration_cast<std::chrono::seconds>(
std::filesystem::file_time_type::clock::now() -
std::filesystem::last_write_time(stop_file))
.count();
has_stop_file = elapse_time > stop_file_age;
}
}
parallel::broadcast(has_stop_file, 0);
return has_stop_file;
}
));
this->_addNameValue("cout", ast_node_data_type_from<std::shared_ptr<const OStream>>, this->_addNameValue("cout", ast_node_data_type_from<std::shared_ptr<const OStream>>,
EmbeddedData{std::make_shared<DataHandler<const OStream>>(std::make_shared<OStream>(std::cout))}); EmbeddedData{std::make_shared<DataHandler<const OStream>>(std::make_shared<OStream>(std::cout))});
......
...@@ -22,7 +22,8 @@ add_library( ...@@ -22,7 +22,8 @@ add_library(
RevisionInfo.cpp RevisionInfo.cpp
SignalManager.cpp SignalManager.cpp
SLEPcWrapper.cpp SLEPcWrapper.cpp
Socket.cpp) Socket.cpp
Stop.cpp)
if(${MPI_FOUND}) if(${MPI_FOUND})
target_include_directories(PugsUtils PRIVATE ${PARMETIS_INCLUDE_DIR}) target_include_directories(PugsUtils PRIVATE ${PARMETIS_INCLUDE_DIR})
......
#include <utils/Stop.hpp>

#include <utils/ExecutionStatManager.hpp>
#include <utils/Messenger.hpp>
#include <utils/pugs_config.hpp>

#include <chrono>
#include <cstdlib>
#include <filesystem>
#include <iostream>
#include <system_error>
#ifdef PUGS_HAS_SLURM
#include <slurm/slurm.h>
#endif // PUGS_HAS_SLURM
/**
 * Tells whether the running computation has been asked to stop.
 *
 * The decision is taken on rank 0 only and then broadcast, so every rank
 * gets the same answer. A stop is requested when either:
 *  - a file named "stop" exists (relative path) and the elapse time
 *    reported by ExecutionStatManager is greater than the age of that
 *    file, i.e. the file was written after the point the elapse time is
 *    measured from (presumably the start of the run -- to be confirmed
 *    against ExecutionStatManager);
 *  - the code is built with PUGS_HAS_SLURM, SLURM_JOB_ID is set in the
 *    environment, and Slurm reports that less than 150 seconds remain
 *    for the job.
 *
 * @return true if the computation should stop, false otherwise.
 */
bool
stop()
{
  bool must_stop = false;

  if (parallel::rank() == 0) {
    const std::filesystem::path stop_file("stop");

    // A single non-throwing query replaces the exists()/last_write_time()
    // pair: a missing file (or one removed concurrently) just sets `ec`
    // instead of raising std::filesystem::filesystem_error.
    std::error_code ec;
    const std::filesystem::file_time_type stop_file_time = std::filesystem::last_write_time(stop_file, ec);
    if (!ec) {
      const double elapse_time = ExecutionStatManager::getInstance().getElapseTime();
      // Age of the stop file, truncated to whole seconds.
      const double stop_file_age =
        std::chrono::duration_cast<std::chrono::seconds>(std::filesystem::file_time_type::clock::now() -
                                                         stop_file_time)
          .count();
      // Only a stop file younger than the elapse time triggers the stop.
      must_stop = elapse_time > stop_file_age;
    }

#ifdef PUGS_HAS_SLURM
    const char* env = std::getenv("SLURM_JOB_ID");
    if (env != nullptr) {
      // Remaining-time threshold (seconds) below which a stop is requested.
      constexpr long min_remaining_time = 150;

      slurm_init(nullptr);
      const int slurm_job_id    = std::atoi(env);
      const long remaining_time = slurm_get_rem_time(slurm_job_id);
      // slurm_get_rem_time() returns -1 on error: a failed query must not
      // be mistaken for an imminent timeout.
      if ((remaining_time >= 0) && (remaining_time < min_remaining_time)) {
        must_stop = true;
      }
      slurm_fini();
    }
#endif   // PUGS_HAS_SLURM
  }

  // Share rank 0's decision with all ranks.
  parallel::broadcast(must_stop, 0);
  return must_stop;
}
#ifndef STOP_HPP
#define STOP_HPP
/**
 * Tells whether the computation has been asked to stop.
 *
 * The answer is computed on rank 0 and broadcast to the other ranks.
 * It is true when a file named "stop" exists and is younger than the
 * elapse time reported by ExecutionStatManager, or, when built with
 * PUGS_HAS_SLURM and SLURM_JOB_ID is set, when the remaining time
 * reported by Slurm for the job is below 150 (seconds, per the Slurm API).
 *
 * @return true if the computation should stop, false otherwise.
 */
bool stop();
#endif // STOP_HPP
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment