From 7f4c625d3690347ea6e28774bf11fa198cb3b406 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20Del=20Pino?= <stephane.delpino44@gmail.com>
Date: Wed, 16 Oct 2024 16:14:54 +0200
Subject: [PATCH] Prepare use of slurm library for timeout checkpointing
 [ci-skip]

---
 src/language/modules/CoreModule.cpp | 28 ++--------------
 src/utils/CMakeLists.txt            |  3 +-
 src/utils/Stop.cpp                  | 50 +++++++++++++++++++++++++++++
 src/utils/Stop.hpp                  |  6 ++++
 4 files changed, 60 insertions(+), 27 deletions(-)
 create mode 100644 src/utils/Stop.cpp
 create mode 100644 src/utils/Stop.hpp

diff --git a/src/language/modules/CoreModule.cpp b/src/language/modules/CoreModule.cpp
index 85dfdcd94..d7b331a1d 100644
--- a/src/language/modules/CoreModule.cpp
+++ b/src/language/modules/CoreModule.cpp
@@ -31,10 +31,10 @@
 #include <language/utils/UnaryOperatorRegisterForRn.hpp>
 #include <language/utils/UnaryOperatorRegisterForRnxn.hpp>
 #include <language/utils/UnaryOperatorRegisterForZ.hpp>
-#include <utils/ExecutionStatManager.hpp>
 #include <utils/Messenger.hpp>
 #include <utils/PugsUtils.hpp>
 #include <utils/RandomEngine.hpp>
+#include <utils/Stop.hpp>
 
 #include <utils/checkpointing/Checkpoint.hpp>
 #include <utils/checkpointing/ReadOStream.hpp>
@@ -146,31 +146,7 @@ CoreModule::CoreModule() : BuiltinModule(true)
 
                                                      ));
 
-  this->_addBuiltinFunction("stop",
-                            std::function(
-
-                              []() -> bool {
-                                bool has_stop_file = false;
-
-                                if (parallel::rank() == 0) {
-                                  std::filesystem::path stop_file("stop");
-                                  if (std::filesystem::exists(stop_file)) {
-                                    const double elapse_time = ExecutionStatManager::getInstance().getElapseTime();
-
-                                    const double stop_file_age = std::chrono::duration_cast<std::chrono::seconds>(
-                                                                   std::filesystem::file_time_type::clock::now() -
-                                                                   std::filesystem::last_write_time(stop_file))
-                                                                   .count();
-
-                                    has_stop_file = elapse_time > stop_file_age;
-                                  }
-                                }
-                                parallel::broadcast(has_stop_file, 0);
-
-                                return has_stop_file;
-                              }
-
-                              ));
+  this->_addBuiltinFunction("stop", std::function([]() -> bool { return stop(); }));
 
   this->_addNameValue("cout", ast_node_data_type_from<std::shared_ptr<const OStream>>,
                       EmbeddedData{std::make_shared<DataHandler<const OStream>>(std::make_shared<OStream>(std::cout))});
diff --git a/src/utils/CMakeLists.txt b/src/utils/CMakeLists.txt
index e2ab407c3..13b0db808 100644
--- a/src/utils/CMakeLists.txt
+++ b/src/utils/CMakeLists.txt
@@ -22,7 +22,8 @@ add_library(
   RevisionInfo.cpp
   SignalManager.cpp
   SLEPcWrapper.cpp
-  Socket.cpp)
+  Socket.cpp
+  Stop.cpp)
 
 if(${MPI_FOUND})
   target_include_directories(PugsUtils PRIVATE ${PARMETIS_INCLUDE_DIR})
diff --git a/src/utils/Stop.cpp b/src/utils/Stop.cpp
new file mode 100644
index 000000000..61e4b2ad0
--- /dev/null
+++ b/src/utils/Stop.cpp
@@ -0,0 +1,76 @@
+#include <utils/Stop.hpp>
+
+#include <utils/ExecutionStatManager.hpp>
+#include <utils/Messenger.hpp>
+#include <utils/pugs_config.hpp>
+
+#include <chrono>
+#include <cstdlib>
+#include <filesystem>
+#include <iostream>
+#include <system_error>
+
+#ifdef PUGS_HAS_SLURM
+#include <slurm/slurm.h>
+#endif   // PUGS_HAS_SLURM
+
+// Tells whether the computation has been asked to stop.
+//
+// The conditions are evaluated on rank 0 only and the result is broadcast, so
+// every rank gets the same answer. All ranks must therefore call this function.
+//
+// A stop is requested if either
+//  - a file named "stop" can be found (relative path) and its age (whole
+//    seconds since its last write) is smaller than the elapse time reported
+//    by ExecutionStatManager, or
+//  - the code is built with PUGS_HAS_SLURM, SLURM_JOB_ID is set and the job
+//    has less than 150 seconds left.
+bool
+stop()
+{
+  bool must_stop = false;
+
+  if (parallel::rank() == 0) {
+    const std::filesystem::path stop_file("stop");
+
+    // Query the timestamp directly with the non-throwing overload: a missing
+    // or unreadable file simply means "no stop request". An exception here
+    // would make rank 0 skip the broadcast that the other ranks enter below.
+    std::error_code ec;
+    const auto stop_file_time = std::filesystem::last_write_time(stop_file, ec);
+    if (!ec) {
+      const double elapse_time = ExecutionStatManager::getInstance().getElapseTime();
+
+      const double stop_file_age =
+        std::chrono::duration_cast<std::chrono::seconds>(std::filesystem::file_time_type::clock::now() -
+                                                         stop_file_time)
+          .count();
+
+      must_stop = elapse_time > stop_file_age;
+    }
+
+#ifdef PUGS_HAS_SLURM
+    const char* env = std::getenv("SLURM_JOB_ID");
+    if (env != nullptr) {
+      // remaining job time (in seconds) under which a stop is requested
+      constexpr long min_remaining_time = 150;
+
+      slurm_init(nullptr);
+      const int slurm_job_id = std::atoi(env);
+
+      // slurm_get_rem_time returns -1 on error: a failed query must not be
+      // mistaken for an imminent timeout
+      const long remaining_time = slurm_get_rem_time(slurm_job_id);
+      if ((remaining_time >= 0) && (remaining_time < min_remaining_time)) {
+        must_stop = true;
+      }
+
+      slurm_fini();
+    }
+#endif   // PUGS_HAS_SLURM
+  }
+
+  parallel::broadcast(must_stop, 0);
+
+  return must_stop;
+}
diff --git a/src/utils/Stop.hpp b/src/utils/Stop.hpp
new file mode 100644
index 000000000..5ba32eb64
--- /dev/null
+++ b/src/utils/Stop.hpp
@@ -0,0 +1,6 @@
+#ifndef STOP_HPP
+#define STOP_HPP
+
+bool stop();   // collective call: true on all ranks when rank 0 detects a stop request
+
+#endif   // STOP_HPP
-- 
GitLab