From 82d750676e323b9fa68ee460b04188ab7142979f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20Del=20Pino?= <stephane.delpino44@gmail.com>
Date: Fri, 5 May 2023 19:17:15 +0200
Subject: [PATCH] Add a first version of a parallel checking tool

This tool checks if chosen data are synchronized and have same values
comparing two runs

By now only ItemValue are checked
---
 src/main.cpp                 |   3 +
 src/mesh/ParallelChecker.cpp |  21 ++
 src/mesh/ParallelChecker.hpp | 361 +++++++++++++++++++++++++++++++++++
 3 files changed, 385 insertions(+)
 create mode 100644 src/mesh/ParallelChecker.cpp
 create mode 100644 src/mesh/ParallelChecker.hpp

diff --git a/src/main.cpp b/src/main.cpp
index 7e03cdc80..05f01f290 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -3,6 +3,7 @@
 #include <mesh/DualConnectivityManager.hpp>
 #include <mesh/DualMeshManager.hpp>
 #include <mesh/MeshDataManager.hpp>
+#include <mesh/ParallelChecker.hpp>
 #include <mesh/SynchronizerManager.hpp>
 #include <utils/PugsUtils.hpp>
 #include <utils/RandomEngine.hpp>
@@ -18,9 +19,11 @@ main(int argc, char* argv[])
   MeshDataManager::create();
   DualConnectivityManager::create();
   DualMeshManager::create();
+  parallel::ParallelChecker::create();
 
   parser(filename);
 
+  parallel::ParallelChecker::destroy();
   DualMeshManager::destroy();
   DualConnectivityManager::destroy();
   MeshDataManager::destroy();
diff --git a/src/mesh/ParallelChecker.cpp b/src/mesh/ParallelChecker.cpp
new file mode 100644
index 000000000..4d329e1ca
--- /dev/null
+++ b/src/mesh/ParallelChecker.cpp
@@ -0,0 +1,21 @@
+#include <mesh/ParallelChecker.hpp>
+
+namespace parallel
+{
+ParallelChecker* ParallelChecker::m_instance = nullptr;
+
+void
+ParallelChecker::create()
+{
+  Assert(ParallelChecker::m_instance == nullptr, "ParallelChecker has already been created");
+  ParallelChecker::m_instance = new ParallelChecker;
+}
+
+void
+ParallelChecker::destroy()
+{
+  Assert(ParallelChecker::m_instance != nullptr, "ParallelChecker has already been destroyed");
+  delete ParallelChecker::m_instance;
+}
+
+}   // namespace parallel
diff --git a/src/mesh/ParallelChecker.hpp b/src/mesh/ParallelChecker.hpp
new file mode 100644
index 000000000..0c82d8750
--- /dev/null
+++ b/src/mesh/ParallelChecker.hpp
@@ -0,0 +1,361 @@
+#ifndef PARALLEL_CHECKER_HPP
+#define PARALLEL_CHECKER_HPP
+
+#include <mesh/Connectivity.hpp>
+#include <mesh/ItemValue.hpp>
+#include <utils/HDF5.hpp>
+#include <utils/Messenger.hpp>
+
+#include <experimental/source_location>
+
+#include <fstream>
+#include <utils/Demangle.hpp>
+
+namespace parallel
+{
+#ifdef PUGS_HAS_HDF5
+
+template <typename DataType, ItemType item_type, typename ConnectivityPtr>
+void check(const ItemValue<DataType, item_type, ConnectivityPtr>& item_value,
+           const std::string& name,
+           const std::experimental::source_location& source_location = std::experimental::source_location::current());
+
+class ParallelChecker
+{
+ private:
+  static ParallelChecker* m_instance;
+
+  size_t m_tag = 0;
+
+  std::string m_filename = "parallel_checker.h5";
+
+  ParallelChecker() = default;
+
+  void
+  _printHeader(const std::string& name, const std::experimental::source_location& source_location) const
+  {
+    std::cout << rang::fg::cyan << " | " << rang::fgB::cyan << "parallel checker" << rang::fg::cyan << " for \""
+              << rang::fgB::magenta << name << rang::fg::cyan << "\" tag " << rang::fgB::blue << m_tag
+              << rang::fg::reset << '\n';
+    std::cout << rang::fg::cyan << " | from " << rang::fgB::blue << source_location.file_name() << rang::fg::reset
+              << ':' << rang::style::bold << source_location.line() << rang::style::reset << '\n';
+  }
+
+ public:
+  static void create();
+  static void destroy();
+
+  static ParallelChecker&
+  instance()
+  {
+    return *m_instance;
+  }
+
+  template <typename DataType, ItemType item_type, typename ConnectivityPtr>
+  friend void check(const ItemValue<DataType, item_type, ConnectivityPtr>&,
+                    const std::string&,
+                    const std::experimental::source_location&);
+
+ private:
+  template <typename DataType, ItemType item_type, typename ConnectivityPtr>
+  void
+  write(const ItemValue<DataType, item_type, ConnectivityPtr>& item_value,
+        const std::string& name,
+        const std::experimental::source_location& source_location)
+  {
+    this->_printHeader(name, source_location);
+
+    auto file_id = [&] {
+      if (m_tag == 0) {
+        return HDF5::create(m_filename);
+      } else {
+        return HDF5::openFileRW(m_filename);
+      }
+    }();
+
+    auto values_group_id = HDF5::createOrOpenGroup(file_id, "/values");
+    auto group_id        = HDF5::createOrOpenSubGroup(values_group_id, std::to_string(m_tag));
+
+    HDF5::writeAttribute(group_id, "filename", std::string{source_location.file_name()});
+    HDF5::writeAttribute(group_id, "function", source_location.function_name());
+    HDF5::writeAttribute(group_id, "line", static_cast<size_t>(source_location.line()));
+    HDF5::writeAttribute(group_id, "name", name);
+
+    std::shared_ptr<const IConnectivity> i_connectivity = item_value.connectivity_ptr();
+    HDF5::writeAttribute(group_id, "dimension", static_cast<size_t>(i_connectivity->dimension()));
+    HDF5::writeAttribute(group_id, "item_type", itemName(item_type));
+    HDF5::writeAttribute(group_id, "data_type", demangle<DataType>());
+
+    HDF5::write(group_id, name, item_value.arrayView());
+
+    switch (i_connectivity->dimension()) {
+    case 1: {
+      const Connectivity<1>& connectivity = dynamic_cast<const Connectivity<1>&>(*i_connectivity);
+      HDF5::write(group_id, "numbers", connectivity.number<item_type>().arrayView());
+      break;
+    }
+    case 2: {
+      const Connectivity<2>& connectivity = dynamic_cast<const Connectivity<2>&>(*i_connectivity);
+      HDF5::write(group_id, "numbers", connectivity.number<item_type>().arrayView());
+      break;
+    }
+    case 3: {
+      const Connectivity<3>& connectivity = dynamic_cast<const Connectivity<3>&>(*i_connectivity);
+      HDF5::write(group_id, "numbers", connectivity.number<item_type>().arrayView());
+      break;
+    }
+    default: {
+      throw UnexpectedError("unexpected connectivity dimension");
+    }
+    }
+
+    ++m_tag;
+
+    HDF5::close(file_id);
+
+    std::cout << rang::fg::cyan << " | writing " << rang::fgB::green << "success" << rang::fg::reset << '\n';
+  }
+
+  template <typename DataType, ItemType item_type, typename ConnectivityPtr>
+  void
+  compare(const ItemValue<DataType, item_type, ConnectivityPtr>& item_value,
+          const std::string& name,
+          const std::experimental::source_location& source_location)
+  {
+    this->_printHeader(name, source_location);
+
+    auto file_id = HDF5::openFileRO(m_filename);
+
+    auto values_group_id = HDF5::openGroup(file_id, "/values");
+    auto group_id        = HDF5::openSubGroup(values_group_id, std::to_string(m_tag));
+
+    const std::string reference_name          = HDF5::readAttribute<std::string>(group_id, "name");
+    const std::string reference_file_name     = HDF5::readAttribute<std::string>(group_id, "filename");
+    const std::string reference_function_name = HDF5::readAttribute<std::string>(group_id, "function");
+    const size_t reference_line_number        = HDF5::readAttribute<size_t>(group_id, "line");
+    const size_t reference_dimension          = HDF5::readAttribute<size_t>(group_id, "dimension");
+    const std::string reference_item_type     = HDF5::readAttribute<std::string>(group_id, "item_type");
+    const std::string reference_data_type     = HDF5::readAttribute<std::string>(group_id, "data_type");
+
+    std::shared_ptr<const IConnectivity> i_connectivity = item_value.connectivity_ptr();
+
+    bool is_comparable = true;
+    if (i_connectivity->dimension() != reference_dimension) {
+      std::cout << rang::fg::cyan << " | " << rang::fgB::red << "different support dimensions: reference ("
+                << rang::fgB::yellow << reference_dimension << rang::fgB::red << ") / target (" << rang::fgB::yellow
+                << i_connectivity->dimension() << rang::fg::reset << ")\n";
+      is_comparable = false;
+    }
+    if (itemName(item_type) != reference_item_type) {
+      std::cout << rang::fg::cyan << " | " << rang::fgB::red << "different item types: reference (" << rang::fgB::yellow
+                << reference_item_type << rang::fgB::red << ") / target (" << rang::fgB::yellow << itemName(item_type)
+                << rang::fg::reset << ")\n";
+      is_comparable = false;
+    }
+    if (demangle<DataType>() != reference_data_type) {
+      std::cout << rang::fg::cyan << " | " << rang::fgB::red << "different data types: reference (" << rang::fgB::yellow
+                << reference_data_type << rang::fgB::red << ") / target (" << rang::fgB::yellow << demangle<DataType>()
+                << rang::fg::reset << ")\n";
+      is_comparable = false;
+    }
+    if (name != reference_name) {
+      // Just warn for different labels (maybe useful for some kind of
+      // debugging...)
+      std::cout << rang::fg::cyan << " | " << rang::fgB::magenta << "different names: reference (" << rang::fgB::yellow
+                << reference_name << rang::fgB::magenta << ") / target (" << rang::fgB::yellow << name
+                << rang::fg::reset << ")\n";
+      std::cout << rang::fg::cyan << " | " << rang::fgB::magenta << "reference from " << rang::fgB::blue
+                << reference_file_name << rang::fg::reset << ':' << rang::style::bold << reference_line_number
+                << rang::style::reset << '\n';
+      std::cout << rang::fg::cyan << " | " << rang::fgB::magenta << "reference function " << rang::fgB::blue
+                << reference_function_name << rang::fg::reset << '\n';
+      std::cout << rang::fg::cyan << " | " << rang::fgB::magenta << "target function " << rang::fgB::blue
+                << source_location.function_name() << rang::fg::reset << '\n';
+    }
+
+    if (not parallel::allReduceAnd(is_comparable)) {
+      throw NormalError("cannot compare data");
+    }
+
+    Array<const int> reference_item_numbers = HDF5::readArray<int>(group_id, "numbers");
+    Array<const DataType> reference_item_value =
+      HDF5::readArray<std::remove_const_t<DataType> >(group_id, reference_name);
+
+    Array<const int> item_numbers = [&] {
+      switch (i_connectivity->dimension()) {
+      case 1: {
+        const Connectivity<1>& connectivity = dynamic_cast<const Connectivity<1>&>(*i_connectivity);
+        return connectivity.number<item_type>().arrayView();
+      }
+      case 2: {
+        const Connectivity<2>& connectivity = dynamic_cast<const Connectivity<2>&>(*i_connectivity);
+        return connectivity.number<item_type>().arrayView();
+      }
+      case 3: {
+        const Connectivity<3>& connectivity = dynamic_cast<const Connectivity<3>&>(*i_connectivity);
+        return connectivity.number<item_type>().arrayView();
+      }
+      default: {
+        throw UnexpectedError("unexpected connectivity dimension");
+      }
+      }
+    }();
+
+    using ItemId = ItemIdT<item_type>;
+
+    std::unordered_map<int, ItemId> item_number_to_item_id_map;
+
+    for (ItemId item_id = 0; item_id < item_numbers.size(); ++item_id) {
+      const auto& [iterator, success] =
+        item_number_to_item_id_map.insert(std::make_pair(item_numbers[item_id], item_id));
+
+      if (not success) {
+        throw UnexpectedError("item numbers have duplicate values");
+      }
+    }
+
+    Assert(item_number_to_item_id_map.size() == item_numbers.size());
+
+    Array<int> index_in_reference(item_numbers.size());
+    index_in_reference.fill(-1);
+    for (size_t i = 0; i < reference_item_numbers.size(); ++i) {
+      const auto& i_number_to_item_id = item_number_to_item_id_map.find(reference_item_numbers[i]);
+      if (i_number_to_item_id != item_number_to_item_id_map.end()) {
+        index_in_reference[i_number_to_item_id->second] = i;
+      }
+    }
+
+    if (parallel::allReduceMin(min(index_in_reference)) < 0) {
+      throw NormalError("some item numbers are not defined in reference");
+    }
+
+    Array<const int> owner = [&] {
+      switch (i_connectivity->dimension()) {
+      case 1: {
+        const Connectivity<1>& connectivity = dynamic_cast<const Connectivity<1>&>(*i_connectivity);
+        return connectivity.owner<item_type>().arrayView();
+      }
+      case 2: {
+        const Connectivity<2>& connectivity = dynamic_cast<const Connectivity<2>&>(*i_connectivity);
+        return connectivity.owner<item_type>().arrayView();
+      }
+      case 3: {
+        const Connectivity<3>& connectivity = dynamic_cast<const Connectivity<3>&>(*i_connectivity);
+        return connectivity.owner<item_type>().arrayView();
+      }
+      default: {
+        throw UnexpectedError("unexpected connectivity dimension");
+      }
+      }
+    }();
+
+    bool has_own_differences = false;
+    bool is_same             = true;
+
+    for (ItemId item_id = 0; item_id < item_value.numberOfItems(); ++item_id) {
+      if (reference_item_value[index_in_reference[item_id]] != item_value[item_id]) {
+        is_same = false;
+        if (static_cast<size_t>(owner[item_id]) == parallel::rank()) {
+          has_own_differences = true;
+        }
+      }
+    }
+
+    is_same             = parallel::allReduceAnd(is_same);
+    has_own_differences = parallel::allReduceOr(has_own_differences);
+
+    if (is_same) {
+      std::cout << rang::fg::cyan << " | compare: " << rang::fgB::green << "success" << rang::fg::reset << '\n';
+    } else {
+      if (has_own_differences) {
+        std::cout << rang::fg::cyan << " | compare: " << rang::fgB::red << "failed!" << rang::fg::reset;
+      } else {
+        std::cout << rang::fg::cyan << " | compare: " << rang::fgB::yellow << "not synchronized" << rang::fg::reset;
+      }
+      std::cout << rang::fg::cyan << " [see \"" << rang::fgB::blue << "parallel_differences_" << m_tag << "_*"
+                << rang::fg::cyan << "\" files for details]" << rang::fg::reset << '\n';
+
+      {
+        std::ofstream fout(std::string{"parallel_differences_"} + stringify(m_tag) + std::string{"_"} +
+                           stringify(parallel::rank()));
+
+        fout.precision(15);
+        for (ItemId item_id = 0; item_id < item_value.numberOfItems(); ++item_id) {
+          if (reference_item_value[index_in_reference[item_id]] != item_value[item_id]) {
+            const bool is_own_difference = (parallel::rank() == static_cast<size_t>(owner[item_id]));
+            if (is_own_difference) {
+              fout << rang::fgB::red << "[ own ]" << rang::fg::reset;
+            } else {
+              fout << rang::fgB::yellow << "[ghost]" << rang::fg::reset;
+            }
+            fout << " rank=" << parallel::rank() << " owner=" << owner[item_id] << " item_id=" << item_id
+                 << " number=" << item_numbers[item_id]
+                 << " reference=" << reference_item_value[index_in_reference[item_id]]
+                 << " target=" << item_value[item_id]
+                 << " difference=" << reference_item_value[index_in_reference[item_id]] - item_value[item_id] << '\n';
+            if (static_cast<size_t>(owner[item_id]) == parallel::rank()) {
+              has_own_differences = true;
+            }
+          }
+        }
+      }
+
+      if (parallel::allReduceAnd(has_own_differences)) {
+        throw NormalError("calculations differ!");
+      }
+    }
+
+    HDF5::close(file_id);
+    ++m_tag;
+  }
+};
+
+template <typename DataType, ItemType item_type, typename ConnectivityPtr>
+void
+check(const ItemValue<DataType, item_type, ConnectivityPtr>& item_value,
+      const std::string& name,
+      const std::experimental::source_location& source_location)
+{
+  const bool write_mode = (parallel::size() == 1);
+
+  std::cout << '\n';
+  if (write_mode) {
+    ParallelChecker::instance().write(item_value, name, source_location);
+  } else {
+    ParallelChecker::instance().compare(item_value, name, source_location);
+  }
+  std::cout << '\n';
+}
+
+#else   // PUGS_HAS_HDF5
+
+template <typename DataType, ItemType item_type, typename ConnectivityPtr>
+void
+check(const ItemValue<DataType, item_type, ConnectivityPtr>&,
+      const std::string&,
+      const std::experimental::source_location& = std::experimental::source_location::current())
+{
+  throw UnexpectedError("parallel checker cannot be used without HDF5 support");
+}
+
+class ParallelChecker
+{
+ private:
+  static ParallelChecker* m_instance;
+
+ public:
+  static void create();
+  static void destroy();
+
+  static ParallelChecker&
+  instance()
+  {
+    return *m_instance;
+  }
+};
+
+#endif   // PUGS_HAS_HDF5
+
+}   // namespace parallel
+
+#endif   // PARALLEL_CHECKER_HPP
-- 
GitLab