diff --git a/CMakeLists.txt b/CMakeLists.txt
index f249c9025ec9119d11afa4554be4d8a8a417164d..ab43ba3dd8bd0b0c0073f614c3e3c1636740d8da 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -741,6 +741,12 @@ else()
   message(" gnuplot: not found!")
 endif()
 
+if (GMSH)
+  message(" gmsh: ${GMSH}")
+else()
+  message(" gmsh: not found!")
+endif()
+
 if (PYGMENTIZE)
   message(" pygmentize: ${PYGMENTIZE}")
 else()
@@ -753,7 +759,7 @@ else()
   message(" pdflatex: not found!")
 endif()
 
-if (NOT EMACS OR NOT GNUPLOT_FOUND)
+if (NOT EMACS OR NOT GNUPLOT_FOUND OR NOT GMSH)
   message(" ** Cannot build documentation: missing ")
 elseif(NOT LATEX_PDFLATEX_FOUND OR NOT PYGMENTIZE)
   message(" ** Cannot build pdf documentation: missing")
@@ -764,6 +770,9 @@ endif()
 if (NOT GNUPLOT_FOUND)
   message("    - gnuplot")
 endif()
+if (NOT GMSH)
+  message("    - gmsh")
+endif()
 if (NOT LATEX_PDFLATEX_FOUND)
   message("    - pdflatex")
 endif()
diff --git a/README.md b/README.md
index 62d6b43773cb0164a29e8a3716a818c5ec219c7d..36e460a851a53d40b4ab442460594091fa22b20f 100644
--- a/README.md
+++ b/README.md
@@ -86,7 +86,7 @@ apt install slepc-dev
 
 ### User documentation
 
-To build documentation one requires `emacs` and `gnuplot`,
+To build documentation one requires `emacs`, `gmsh` and `gnuplot`,
 additionally since examples results are generated, the documentation
 can only be produced after the compilation of `pugs` itself.
 
@@ -94,6 +94,10 @@ To install `emacs` on Debian-like systems
 ```shell
 apt install emacs
 ```
+To install `gmsh` on Debian-like systems
+```shell
+apt install gmsh
+```
 To install `gnuplot` one can either use
 ```shell
 apt install gnuplot-nox
@@ -106,6 +110,7 @@ apt install gnuplot-x11
 > When building the documentation for the first time, a local `emacs`
 > configuration is generated. *This requires an internet connection.*
 
+
 These packages are enough to build the html documentation. To build
 the pdf documentation one requires a few more packages: `pdflatex`
 (actually a fresh texlive installation is probably necessary) and `pygmentize`
diff --git a/cmake/PugsDoc.cmake b/cmake/PugsDoc.cmake
index 1f8701ccc20f6416b529908f3893d971e86c7452..b067a5cc828ec0c245e8d51c0bda46a96dcce702 100644
--- a/cmake/PugsDoc.cmake
+++ b/cmake/PugsDoc.cmake
@@ -12,10 +12,13 @@ find_program(PYGMENTIZE pygmentize)
 # check for gnuplot
 find_package(Gnuplot)
 
+# check for gmsh
+find_program(GMSH NAMES gmsh)
+
 add_custom_target(userdoc)
 add_custom_target(doc DEPENDS userdoc)
 
-if (EMACS AND GNUPLOT_FOUND)
+if (EMACS AND GNUPLOT_FOUND AND GMSH)
 
   add_custom_command(
     OUTPUT "${PUGS_BINARY_DIR}/doc"
@@ -129,6 +132,13 @@ else()
       COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --red --bold "gnuplot missing")
     add_dependencies(userdoc userdoc-missing-gnuplot)
   endif()
+
+  if (NOT GMSH)
+    add_custom_target(userdoc-missing-gmsh
+      COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --no-newline "Cannot build documentation: "
+      COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --red --bold "gmsh missing")
+    add_dependencies(userdoc userdoc-missing-gmsh)
+  endif()
 endif()
 
 add_dependencies(doc userdoc)
diff --git a/src/language/utils/OFStream.cpp b/src/language/utils/OFStream.cpp
index 39b7cc8ecedee3ff76e3d6d54790d25f90d01569..ca7c77fad8a69fd79ae2b512635acbb7f03277fd 100644
--- a/src/language/utils/OFStream.cpp
+++ b/src/language/utils/OFStream.cpp
@@ -1,10 +1,12 @@
 #include <language/utils/OFStream.hpp>
 
+#include <utils/Filesystem.hpp>
 #include <utils/Messenger.hpp>
 
 OFStream::OFStream(const std::string& filename)
 {
   if (parallel::rank() == 0) {
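+    // create the parent directory of the output file if it does not exist yet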
+    createDirectoryIfNeeded(filename);
     m_fstream.open(filename);
     if (m_fstream.is_open()) {
       m_ostream = &m_fstream;
diff --git a/src/mesh/ItemArrayUtils.hpp b/src/mesh/ItemArrayUtils.hpp
index b1c3ba8cdad569f73f91029c24fb1333e05037ee..1bb8d929f12a2f834535a70166e43a130fb73e40 100644
--- a/src/mesh/ItemArrayUtils.hpp
+++ b/src/mesh/ItemArrayUtils.hpp
@@ -5,6 +5,7 @@
 
 #include <mesh/Connectivity.hpp>
 #include <mesh/ItemArray.hpp>
+#include <mesh/ItemValueUtils.hpp>
 #include <mesh/Synchronizer.hpp>
 #include <mesh/SynchronizerManager.hpp>
 
@@ -12,7 +13,7 @@
 
 template <typename DataType, ItemType item_type, typename ConnectivityPtr>
 std::remove_const_t<DataType>
-min(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
+min(const ItemArray<DataType, item_type, ConnectivityPtr>& item_array)
 {
   using ItemArrayType   = ItemArray<DataType, item_type, ConnectivityPtr>;
   using ItemIsOwnedType = ItemValue<const bool, item_type>;
@@ -25,7 +26,7 @@ min(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
   class ItemArrayMin
   {
    private:
-    const ItemArrayType& m_item_value;
+    const ItemArrayType& m_item_array;
     const ItemIsOwnedType m_is_owned;
 
    public:
@@ -33,7 +34,7 @@ min(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
     operator data_type()
     {
       data_type reduced_value;
-      parallel_reduce(m_item_value.numberOfItems(), *this, reduced_value);
+      parallel_reduce(m_item_array.numberOfItems(), *this, reduced_value);
       return reduced_value;
     }
 
@@ -42,8 +43,8 @@ min(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
     operator()(const index_type& i_item, data_type& data) const
     {
       if (m_is_owned[i_item]) {
-        auto array = m_item_value[i_item];
-        for (size_t i = 0; i < m_item_value.sizeOfArrays(); ++i) {
+        auto array = m_item_array[i_item];
+        for (size_t i = 0; i < m_item_array.sizeOfArrays(); ++i) {
           if (array[i] < data) {
             data = array[i];
           }
@@ -69,8 +70,8 @@ min(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
     }
 
     PUGS_INLINE
-    ItemArrayMin(const ItemArrayType& item_value)
-      : m_item_value(item_value), m_is_owned([&](const IConnectivity& connectivity) {
+    ItemArrayMin(const ItemArrayType& item_array)
+      : m_item_array(item_array), m_is_owned([&](const IConnectivity& connectivity) {
           Assert((connectivity.dimension() > 0) and (connectivity.dimension() <= 3),
                  "unexpected connectivity dimension");
 
@@ -96,7 +97,7 @@ min(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
           }
             // LCOV_EXCL_STOP
           }
-        }(*item_value.connectivity_ptr()))
+        }(*item_array.connectivity_ptr()))
     {
       ;
     }
@@ -105,13 +106,13 @@ min(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
     ~ItemArrayMin() = default;
   };
 
-  const DataType local_min = ItemArrayMin{item_value};
+  const DataType local_min = ItemArrayMin{item_array};
   return parallel::allReduceMin(local_min);
 }
 
 template <typename DataType, ItemType item_type, typename ConnectivityPtr>
 std::remove_const_t<DataType>
-max(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
+max(const ItemArray<DataType, item_type, ConnectivityPtr>& item_array)
 {
   using ItemArrayType   = ItemArray<DataType, item_type, ConnectivityPtr>;
   using ItemIsOwnedType = ItemValue<const bool, item_type>;
@@ -124,7 +125,7 @@ max(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
   class ItemArrayMax
   {
    private:
-    const ItemArrayType& m_item_value;
+    const ItemArrayType& m_item_array;
     const ItemIsOwnedType m_is_owned;
 
    public:
@@ -132,7 +133,7 @@ max(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
     operator data_type()
     {
       data_type reduced_value;
-      parallel_reduce(m_item_value.numberOfItems(), *this, reduced_value);
+      parallel_reduce(m_item_array.numberOfItems(), *this, reduced_value);
       return reduced_value;
     }
 
@@ -141,8 +142,8 @@ max(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
     operator()(const index_type& i_item, data_type& data) const
     {
       if (m_is_owned[i_item]) {
-        auto array = m_item_value[i_item];
-        for (size_t i = 0; i < m_item_value.sizeOfArrays(); ++i) {
+        auto array = m_item_array[i_item];
+        for (size_t i = 0; i < m_item_array.sizeOfArrays(); ++i) {
           if (array[i] > data) {
             data = array[i];
           }
@@ -168,8 +169,8 @@ max(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
     }
 
     PUGS_INLINE
-    ItemArrayMax(const ItemArrayType& item_value)
-      : m_item_value(item_value), m_is_owned([&](const IConnectivity& connectivity) {
+    ItemArrayMax(const ItemArrayType& item_array)
+      : m_item_array(item_array), m_is_owned([&](const IConnectivity& connectivity) {
           Assert((connectivity.dimension() > 0) and (connectivity.dimension() <= 3),
                  "unexpected connectivity dimension");
 
@@ -195,7 +196,7 @@ max(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
           }
             // LCOV_EXCL_STOP
           }
-        }(*item_value.connectivity_ptr()))
+        }(*item_array.connectivity_ptr()))
     {
       ;
     }
@@ -204,106 +205,34 @@ max(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
     ~ItemArrayMax() = default;
   };
 
-  const DataType local_max = ItemArrayMax{item_value};
+  const DataType local_max = ItemArrayMax{item_array};
   return parallel::allReduceMax(local_max);
 }
 
 template <typename DataType, ItemType item_type, typename ConnectivityPtr>
 std::remove_const_t<DataType>
-sum(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
+sum(const ItemArray<DataType, item_type, ConnectivityPtr>& item_array)
 {
-  using ItemArrayType   = ItemArray<DataType, item_type, ConnectivityPtr>;
-  using ItemIsOwnedType = ItemValue<const bool, item_type>;
-  using data_type       = std::remove_const_t<typename ItemArrayType::data_type>;
-  using index_type      = typename ItemArrayType::index_type;
+  using ItemArrayType = ItemArray<DataType, item_type, ConnectivityPtr>;
+  using data_type     = std::remove_const_t<typename ItemArrayType::data_type>;
+  using index_type    = typename ItemArrayType::index_type;
 
   static_assert(not std::is_same_v<data_type, bool>, "sum cannot be called on boolean data");
 
-  class ItemArraySum
-  {
-   private:
-    const ItemArrayType& m_item_value;
-    const ItemIsOwnedType m_is_owned;
-
-   public:
-    PUGS_INLINE
-    operator data_type()
-    {
-      data_type reduced_value;
-      parallel_reduce(m_item_value.numberOfItems(), *this, reduced_value);
-      return reduced_value;
-    }
-
-    PUGS_INLINE
-    void
-    operator()(const index_type& i_item, data_type& data) const
-    {
-      if (m_is_owned[i_item]) {
-        auto array = m_item_value[i_item];
-        for (size_t i = 0; i < m_item_value.sizeOfArrays(); ++i) {
-          data += array[i];
-        }
-      }
-    }
+  ItemValue<data_type, item_type> item_sum{*item_array.connectivity_ptr()};
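+  // the per-item row sums are stored in an ItemValue so that ownership
+  // handling and the parallel reduction are delegated to sum(ItemValue)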
 
-    PUGS_INLINE
-    void
-    join(data_type& dst, const data_type& src) const
-    {
-      dst += src;
-    }
+  parallel_for(
+    item_array.numberOfItems(), PUGS_LAMBDA(index_type item_id) {
+      auto row          = item_array[item_id];
+      data_type row_sum = row[0];
 
-    PUGS_INLINE
-    void
-    init(data_type& value) const
-    {
-      if constexpr (std::is_arithmetic_v<data_type>) {
-        value = 0;
-      } else {
-        static_assert(is_tiny_vector_v<data_type> or is_tiny_matrix_v<data_type>, "invalid data type");
-        value = zero;
+      for (size_t i = 1; i < row.size(); ++i) {
+        row_sum += row[i];
       }
-    }
-
-    PUGS_INLINE
-    ItemArraySum(const ItemArrayType& item_value)
-      : m_item_value(item_value), m_is_owned([&](const IConnectivity& connectivity) {
-          Assert((connectivity.dimension() > 0) and (connectivity.dimension() <= 3),
-                 "unexpected connectivity dimension");
-
-          switch (connectivity.dimension()) {
-          case 1: {
-            const auto& connectivity_1d = static_cast<const Connectivity1D&>(connectivity);
-            return connectivity_1d.isOwned<item_type>();
-            break;
-          }
-          case 2: {
-            const auto& connectivity_2d = static_cast<const Connectivity2D&>(connectivity);
-            return connectivity_2d.isOwned<item_type>();
-            break;
-          }
-          case 3: {
-            const auto& connectivity_3d = static_cast<const Connectivity3D&>(connectivity);
-            return connectivity_3d.isOwned<item_type>();
-            break;
-          }
-            // LCOV_EXCL_START
-          default: {
-            throw UnexpectedError("unexpected dimension");
-          }
-            // LCOV_EXCL_STOP
-          }
-        }(*item_value.connectivity_ptr()))
-    {
-      ;
-    }
-
-    PUGS_INLINE
-    ~ItemArraySum() = default;
-  };
+      item_sum[item_id] = row_sum;
+    });
 
-  const DataType local_sum = ItemArraySum{item_value};
-  return parallel::allReduceSum(local_sum);
+  return sum(item_sum);
 }
 
 template <typename DataType, ItemType item_type, typename ConnectivityPtr>
diff --git a/src/mesh/ItemValueUtils.hpp b/src/mesh/ItemValueUtils.hpp
index 9ef745b550e42add842ac1c822cf65b07e34dc2b..e64f2e1bdc0bc1e934aac22258461947c1c0ffd3 100644
--- a/src/mesh/ItemValueUtils.hpp
+++ b/src/mesh/ItemValueUtils.hpp
@@ -8,6 +8,7 @@
 #include <mesh/Synchronizer.hpp>
 #include <mesh/SynchronizerManager.hpp>
 #include <utils/PugsTraits.hpp>
+#include <utils/ReproducibleSumMPI.hpp>
 
 #include <iostream>
 
@@ -210,6 +211,33 @@ sum(const ItemValue<DataType, item_type, ConnectivityPtr>& item_value)
 
   static_assert(not std::is_same_v<data_type, bool>, "sum cannot be called on boolean data");
 
+  auto get_is_owned = [](const IConnectivity& connectivity) -> ItemIsOwnedType {
+    Assert((connectivity.dimension() > 0) and (connectivity.dimension() <= 3), "unexpected connectivity dimension");
+
+    switch (connectivity.dimension()) {
+    case 1: {
+      const auto& connectivity_1d = static_cast<const Connectivity1D&>(connectivity);
+      return connectivity_1d.isOwned<item_type>();
+      break;
+    }
+    case 2: {
+      const auto& connectivity_2d = static_cast<const Connectivity2D&>(connectivity);
+      return connectivity_2d.isOwned<item_type>();
+      break;
+    }
+    case 3: {
+      const auto& connectivity_3d = static_cast<const Connectivity3D&>(connectivity);
+      return connectivity_3d.isOwned<item_type>();
+      break;
+    }
+      // LCOV_EXCL_START
+    default: {
+      throw UnexpectedError("unexpected dimension");
+    }
+      // LCOV_EXCL_STOP
+    }
+  };
+
   class ItemValueSum
   {
    private:
@@ -254,34 +282,8 @@ sum(const ItemValue<DataType, item_type, ConnectivityPtr>& item_value)
     }
 
     PUGS_INLINE
-    ItemValueSum(const ItemValueType& item_value)
-      : m_item_value(item_value), m_is_owned([&](const IConnectivity& connectivity) {
-          Assert((connectivity.dimension() > 0) and (connectivity.dimension() <= 3),
-                 "unexpected connectivity dimension");
-
-          switch (connectivity.dimension()) {
-          case 1: {
-            const auto& connectivity_1d = static_cast<const Connectivity1D&>(connectivity);
-            return connectivity_1d.isOwned<item_type>();
-            break;
-          }
-          case 2: {
-            const auto& connectivity_2d = static_cast<const Connectivity2D&>(connectivity);
-            return connectivity_2d.isOwned<item_type>();
-            break;
-          }
-          case 3: {
-            const auto& connectivity_3d = static_cast<const Connectivity3D&>(connectivity);
-            return connectivity_3d.isOwned<item_type>();
-            break;
-          }
-            // LCOV_EXCL_START
-          default: {
-            throw UnexpectedError("unexpected dimension");
-          }
-            // LCOV_EXCL_STOP
-          }
-        }(*item_value.connectivity_ptr()))
+    ItemValueSum(const ItemValueType& item_value, const ItemIsOwnedType& is_owned)
+      : m_item_value{item_value}, m_is_owned{is_owned}
     {
       ;
     }
@@ -290,8 +292,34 @@ sum(const ItemValue<DataType, item_type, ConnectivityPtr>& item_value)
     ~ItemValueSum() = default;
   };
 
-  const DataType local_sum = ItemValueSum{item_value};
-  return parallel::allReduceSum(local_sum);
+  auto is_owned = get_is_owned(*item_value.connectivity_ptr());
+
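+  // when reproducible sums are enabled, floating-point data (scalars,
+  // TinyVector or TinyMatrix of floating-point values) is accumulated with the
+  // reproducible summation algorithm; other types use the standard reduction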
+  if (ReproducibleSumManager::reproducibleSums()) {
+    if constexpr (std::is_floating_point_v<data_type>) {
+      ReproducibleScalarSum r_sum(item_value.arrayView(), is_owned.arrayView());
+      using ReproducibleSumType = decltype(r_sum);
+      return ReproducibleSumType::getValue(allReduceReproducibleSum(r_sum));
+    } else if constexpr (not std::is_arithmetic_v<data_type>) {
+      if constexpr (is_tiny_vector_v<data_type> and std::is_floating_point_v<typename data_type::data_type>) {
+        ReproducibleTinyVectorSum r_sum(item_value.arrayView(), is_owned.arrayView());
+        using ReproducibleSumType = decltype(r_sum);
+        return ReproducibleSumType::getValue(allReduceReproducibleSum(r_sum));
+      } else if constexpr (is_tiny_matrix_v<data_type> and std::is_floating_point_v<typename data_type::data_type>) {
+        ReproducibleTinyMatrixSum r_sum(item_value.arrayView(), is_owned.arrayView());
+        using ReproducibleSumType = decltype(r_sum);
+        return ReproducibleSumType::getValue(allReduceReproducibleSum(r_sum));
+      } else {
+        const DataType local_sum = ItemValueSum{item_value, is_owned};
+        return parallel::allReduceSum(local_sum);
+      }
+    } else {
+      const DataType local_sum = ItemValueSum{item_value, is_owned};
+      return parallel::allReduceSum(local_sum);
+    }
+  } else {
+    const DataType local_sum = ItemValueSum{item_value, is_owned};
+    return parallel::allReduceSum(local_sum);
+  }
 }
 
 template <typename DataType, ItemType item_type, typename ConnectivityPtr>
diff --git a/src/mesh/MeshFlatNodeBoundary.cpp b/src/mesh/MeshFlatNodeBoundary.cpp
index b7de42715eff88510119b54507ed32323365c817..459a827c84ead76ed23acdf28cbd8525923694da 100644
--- a/src/mesh/MeshFlatNodeBoundary.cpp
+++ b/src/mesh/MeshFlatNodeBoundary.cpp
@@ -87,57 +87,108 @@ MeshFlatNodeBoundary<2>::_getNormal(const Mesh<Connectivity<2>>& mesh)
 
 template <>
 TinyVector<3, double>
-MeshFlatNodeBoundary<3>::_getNormal(const Mesh<Connectivity<3>>& mesh)
+MeshFlatNodeBoundary<3>::_getFarestNode(const Mesh<Connectivity<3>>& mesh, const Rd& x0, const Rd& x1)
 {
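+  // returns the position of the boundary node farthest from the line (x0, x1);
+  // ties are broken using the global node number so that the result does not
+  // depend on the parallel partitioning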
-  using R3 = TinyVector<3, double>;
-
-  std::array<R3, 6> bounds = this->_getBounds(mesh);
+  const NodeValue<const Rd>& xr = mesh.xr();
+  const auto node_number        = mesh.connectivity().nodeNumber();
 
-  const R3& xmin = bounds[0];
-  const R3& ymin = bounds[1];
-  const R3& zmin = bounds[2];
-  const R3& xmax = bounds[3];
-  const R3& ymax = bounds[4];
-  const R3& zmax = bounds[5];
+  using NodeNumberType = std::remove_const_t<typename decltype(node_number)::data_type>;
 
-  const R3 u = xmax - xmin;
-  const R3 v = ymax - ymin;
-  const R3 w = zmax - zmin;
+  Rd t = x1 - x0;
+  t *= 1. / l2Norm(t);
 
-  const R3 uv        = crossProduct(u, v);
-  const double uv_l2 = dot(uv, uv);
+  double farest_distance       = 0;
+  Rd farest_x                  = zero;
+  NodeNumberType farest_number = std::numeric_limits<NodeNumberType>::max();
 
-  R3 normal        = uv;
-  double normal_l2 = uv_l2;
+  auto node_list = this->m_ref_node_list.list();
 
-  const R3 uw        = crossProduct(u, w);
-  const double uw_l2 = dot(uw, uw);
+  for (size_t i_node = 0; i_node < node_list.size(); ++i_node) {
+    const NodeId& node_id = node_list[i_node];
+    const Rd& x           = xr[node_id];
+    const double distance = l2Norm(crossProduct(t, x - x0));
 
-  if (uw_l2 > uv_l2) {
-    normal    = uw;
-    normal_l2 = uw_l2;
+    if ((distance > farest_distance) or ((distance == farest_distance) and (node_number[node_id] < farest_number))) {
+      farest_distance = distance;
+      farest_number   = node_number[node_id];
+      farest_x        = x;
+    }
   }
 
-  const R3 vw        = crossProduct(v, w);
-  const double vw_l2 = dot(vw, vw);
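+  // in parallel, gather the candidates of all ranks and keep the globally
+  // farthest node, still breaking ties with the smallest global node number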
+  if (parallel::size() > 1) {
+    Array<double> farest_distance_array       = parallel::allGather(farest_distance);
+    Array<Rd> farest_x_array                  = parallel::allGather(farest_x);
+    Array<NodeNumberType> farest_number_array = parallel::allGather(farest_number);
 
-  if (vw_l2 > normal_l2) {
-    normal    = vw;
-    normal_l2 = vw_l2;
+    Assert(farest_distance_array.size() == farest_x_array.size());
+    Assert(farest_distance_array.size() == farest_number_array.size());
+
+    for (size_t i = 0; i < farest_distance_array.size(); ++i) {
+      if ((farest_distance_array[i] > farest_distance) or
+          ((farest_distance_array[i] == farest_distance) and (farest_number_array[i] < farest_number))) {
+        farest_distance = farest_distance_array[i];
+        farest_number   = farest_number_array[i];
+        farest_x        = farest_x_array[i];
+      }
+    }
   }
 
-  if (normal_l2 == 0) {
+  return farest_x;
+}
+
+template <>
+TinyVector<3, double>
+MeshFlatNodeBoundary<3>::_getNormal(const Mesh<Connectivity<3>>& mesh)
+{
+  using R3 = TinyVector<3, double>;
+
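+  // among the six extremal points returned by _getBounds, keep the two that
+  // are farthest apart: they define the diagonal (x0, x1) from which the
+  // normal is built together with the farthest node x2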
+  std::array<R3, 2> diagonal = [](const std::array<R3, 6>& bounds) {
+    size_t max_i      = 0;
+    size_t max_j      = 0;
+    double max_length = 0;
+
+    for (size_t i = 0; i < bounds.size(); ++i) {
+      for (size_t j = i + 1; j < bounds.size(); ++j) {
+        double length = l2Norm(bounds[i] - bounds[j]);
+        if (length > max_length) {
+          max_i      = i;
+          max_j      = j;
+          max_length = length;
+        }
+      }
+    }
+
+    return std::array<R3, 2>{bounds[max_i], bounds[max_j]};
+  }(this->_getBounds(mesh));
+
+  const R3& x0 = diagonal[0];
+  const R3& x1 = diagonal[1];
+
+  if (x0 == x1) {
     std::ostringstream ost;
     ost << "invalid boundary \"" << rang::fgB::yellow << m_ref_node_list.refId() << rang::style::reset
         << "\": unable to compute normal";
     throw NormalError(ost.str());
   }
 
-  const double length = sqrt(normal_l2);
+  const R3 x2 = this->_getFarestNode(mesh, x0, x1);
+
+  const R3 u = x1 - x0;
+  const R3 v = x2 - x0;
+
+  R3 normal                = crossProduct(u, v);
+  const double normal_norm = l2Norm(normal);
+
+  if (normal_norm == 0) {
+    std::ostringstream ost;
+    ost << "invalid boundary \"" << rang::fgB::yellow << m_ref_node_list.refId() << rang::style::reset
+        << "\": unable to compute normal";
+    throw NormalError(ost.str());
+  }
 
-  normal *= 1. / length;
+  normal *= (1. / normal_norm);
 
-  this->_checkBoundaryIsFlat(normal, 1. / 6. * (xmin + xmax + ymin + ymax + zmin + zmax), length, mesh);
+  this->_checkBoundaryIsFlat(normal, 1. / 3. * (x0 + x1 + x2), normal_norm, mesh);
 
   return normal;
 }
diff --git a/src/mesh/MeshFlatNodeBoundary.hpp b/src/mesh/MeshFlatNodeBoundary.hpp
index 8486f02c7377fe2987560d29f2ee7cc45c1b44cf..54a10ae1c1412005440cd9e6cc49973c017265e0 100644
--- a/src/mesh/MeshFlatNodeBoundary.hpp
+++ b/src/mesh/MeshFlatNodeBoundary.hpp
@@ -13,21 +13,26 @@ class [[nodiscard]] MeshFlatNodeBoundary final
  private:
   const Rd m_outgoing_normal;
 
+  Rd _getFarestNode(const Mesh<Connectivity<Dimension>>& mesh, const Rd& x0, const Rd& x1);
+
   Rd _getNormal(const Mesh<Connectivity<Dimension>>& mesh);
 
-  void _checkBoundaryIsFlat(const TinyVector<Dimension, double>& normal, const TinyVector<Dimension, double>& origin,
-                            const double length, const Mesh<Connectivity<Dimension>>& mesh) const;
+  void _checkBoundaryIsFlat(const TinyVector<Dimension, double>& normal,
+                            const TinyVector<Dimension, double>& origin,
+                            const double length,
+                            const Mesh<Connectivity<Dimension>>& mesh) const;
 
   Rd _getOutgoingNormal(const Mesh<Connectivity<Dimension>>& mesh);
 
  public:
-  const Rd& outgoingNormal() const
+  const Rd&
+  outgoingNormal() const
   {
     return m_outgoing_normal;
   }
 
   MeshFlatNodeBoundary& operator=(const MeshFlatNodeBoundary&) = default;
-  MeshFlatNodeBoundary& operator=(MeshFlatNodeBoundary&&) = default;
+  MeshFlatNodeBoundary& operator=(MeshFlatNodeBoundary&&)      = default;
 
   template <size_t MeshDimension>
   friend MeshFlatNodeBoundary<MeshDimension> getMeshFlatNodeBoundary(const Mesh<Connectivity<MeshDimension>>& mesh,
@@ -47,7 +52,7 @@ class [[nodiscard]] MeshFlatNodeBoundary final
  public:
   MeshFlatNodeBoundary()                            = default;
   MeshFlatNodeBoundary(const MeshFlatNodeBoundary&) = default;
-  MeshFlatNodeBoundary(MeshFlatNodeBoundary &&)     = default;
+  MeshFlatNodeBoundary(MeshFlatNodeBoundary&&)      = default;
   ~MeshFlatNodeBoundary()                           = default;
 };
 
diff --git a/src/mesh/SubItemArrayPerItemUtils.hpp b/src/mesh/SubItemArrayPerItemUtils.hpp
index d2553590ab0f0dad219cdfa82491c05a70b5edbe..aec08ef845cc80bafd20ec8fa1a3b9bcc94415de 100644
--- a/src/mesh/SubItemArrayPerItemUtils.hpp
+++ b/src/mesh/SubItemArrayPerItemUtils.hpp
@@ -4,13 +4,12 @@
 #include <utils/Messenger.hpp>
 
 #include <mesh/Connectivity.hpp>
+#include <mesh/ItemValueUtils.hpp>
 #include <mesh/SubItemArrayPerItem.hpp>
 #include <mesh/Synchronizer.hpp>
 #include <mesh/SynchronizerManager.hpp>
 #include <utils/PugsTraits.hpp>
 
-#include <iostream>
-
 template <typename DataType, typename ItemOfItem, typename ConnectivityPtr>
 std::remove_const_t<DataType>
 min(const SubItemArrayPerItem<DataType, ItemOfItem, ConnectivityPtr>& sub_item_array_per_item)
@@ -34,9 +33,9 @@ min(const SubItemArrayPerItem<DataType, ItemOfItem, ConnectivityPtr>& sub_item_a
     PUGS_INLINE
     operator data_type()
     {
-      data_type reduced_array;
-      parallel_reduce(m_sub_item_array_per_item.numberOfItems(), *this, reduced_array);
-      return reduced_array;
+      data_type reduced_value;
+      parallel_reduce(m_sub_item_array_per_item.numberOfItems(), *this, reduced_value);
+      return reduced_value;
     }
 
     PUGS_INLINE
@@ -67,9 +66,9 @@ min(const SubItemArrayPerItem<DataType, ItemOfItem, ConnectivityPtr>& sub_item_a
 
     PUGS_INLINE
     void
-    init(data_type& array) const
+    init(data_type& value) const
     {
-      array = std::numeric_limits<data_type>::max();
+      value = std::numeric_limits<data_type>::max();
     }
 
     PUGS_INLINE
@@ -136,9 +135,9 @@ max(const SubItemArrayPerItem<DataType, ItemOfItem, ConnectivityPtr>& sub_item_a
     PUGS_INLINE
     operator data_type()
     {
-      data_type reduced_array;
-      parallel_reduce(m_sub_item_array_per_item.numberOfItems(), *this, reduced_array);
-      return reduced_array;
+      data_type reduced_value;
+      parallel_reduce(m_sub_item_array_per_item.numberOfItems(), *this, reduced_value);
+      return reduced_value;
     }
 
     PUGS_INLINE
@@ -169,9 +168,9 @@ max(const SubItemArrayPerItem<DataType, ItemOfItem, ConnectivityPtr>& sub_item_a
 
     PUGS_INLINE
     void
-    init(data_type& array) const
+    init(data_type& value) const
     {
-      array = std::numeric_limits<data_type>::min();
+      value = std::numeric_limits<data_type>::min();
     }
 
     PUGS_INLINE
@@ -217,103 +216,38 @@ max(const SubItemArrayPerItem<DataType, ItemOfItem, ConnectivityPtr>& sub_item_a
 
 template <typename DataType, typename ItemOfItem, typename ConnectivityPtr>
 std::remove_const_t<DataType>
-sum(const SubItemArrayPerItem<DataType, ItemOfItem, ConnectivityPtr>& item_array)
+sum(const SubItemArrayPerItem<DataType, ItemOfItem, ConnectivityPtr>& sub_item_array_per_item)
 {
   using SubItemArrayPerItemType = SubItemArrayPerItem<DataType, ItemOfItem, ConnectivityPtr>;
   constexpr ItemType item_type  = ItemOfItem::item_type;
-  using ItemIsOwnedType         = ItemValue<const bool, item_type>;
   using data_type               = std::remove_const_t<typename SubItemArrayPerItemType::data_type>;
   using index_type              = typename SubItemArrayPerItemType::index_type;
 
   static_assert(not std::is_same_v<data_type, bool>, "sum cannot be called on boolean data");
 
-  class SubItemArrayPerItemSum
-  {
-   private:
-    const SubItemArrayPerItemType& m_sub_item_array_per_item;
-    const ItemIsOwnedType m_is_owned;
-
-   public:
-    PUGS_INLINE
-    operator data_type()
-    {
-      data_type reduced_array;
-      parallel_reduce(m_sub_item_array_per_item.numberOfItems(), *this, reduced_array);
-      return reduced_array;
-    }
-
-    PUGS_INLINE
-    void
-    operator()(const index_type& item_id, data_type& data) const
-    {
-      if (m_is_owned[item_id]) {
-        auto sub_item_table = m_sub_item_array_per_item.itemTable(item_id);
-        for (size_t i = 0; i < sub_item_table.numberOfRows(); ++i) {
-          for (size_t j = 0; j < sub_item_table.numberOfColumns(); ++j) {
-            data += sub_item_table(i, j);
-          }
+  ItemValue<data_type, item_type> item_sum{*sub_item_array_per_item.connectivity_ptr()};
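+  // the per-item sums of the sub-item tables are stored in an ItemValue;
+  // ownership handling and the parallel reduction are delegated to sum(ItemValue)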
+  parallel_for(
+    sub_item_array_per_item.numberOfItems(), PUGS_LAMBDA(index_type item_id) {
+      auto sub_item_table          = sub_item_array_per_item.itemTable(item_id);
+      data_type sub_item_table_sum = [] {
+        if constexpr (std::is_arithmetic_v<data_type>) {
+          return 0;
+        } else {
+          static_assert(is_tiny_vector_v<data_type> or is_tiny_matrix_v<data_type>, "invalid data type");
+          return zero;
         }
-      }
-    }
+      }();
 
-    PUGS_INLINE
-    void
-    join(data_type& dst, const data_type& src) const
-    {
-      dst += src;
-    }
-
-    PUGS_INLINE
-    void
-    init(data_type& array) const
-    {
-      if constexpr (std::is_arithmetic_v<data_type>) {
-        array = 0;
-      } else {
-        static_assert(is_tiny_vector_v<data_type> or is_tiny_matrix_v<data_type>, "invalid data type");
-        array = zero;
+      for (size_t i = 0; i < sub_item_table.numberOfRows(); ++i) {
+        for (size_t j = 0; j < sub_item_table.numberOfColumns(); ++j) {
+          sub_item_table_sum += sub_item_table(i, j);
+        }
       }
-    }
 
-    PUGS_INLINE
-    SubItemArrayPerItemSum(const SubItemArrayPerItemType& item_array)
-      : m_sub_item_array_per_item(item_array), m_is_owned([&](const IConnectivity& connectivity) {
-          Assert((connectivity.dimension() > 0) and (connectivity.dimension() <= 3),
-                 "unexpected connectivity dimension");
-
-          switch (connectivity.dimension()) {
-          case 1: {
-            const auto& connectivity_1d = static_cast<const Connectivity1D&>(connectivity);
-            return connectivity_1d.isOwned<item_type>();
-            break;
-          }
-          case 2: {
-            const auto& connectivity_2d = static_cast<const Connectivity2D&>(connectivity);
-            return connectivity_2d.isOwned<item_type>();
-            break;
-          }
-          case 3: {
-            const auto& connectivity_3d = static_cast<const Connectivity3D&>(connectivity);
-            return connectivity_3d.isOwned<item_type>();
-            break;
-          }
-            // LCOV_EXCL_START
-          default: {
-            throw UnexpectedError("unexpected dimension");
-          }
-            // LCOV_EXCL_STOP
-          }
-        }(*item_array.connectivity_ptr()))
-    {
-      ;
-    }
-
-    PUGS_INLINE
-    ~SubItemArrayPerItemSum() = default;
-  };
+      item_sum[item_id] = sub_item_table_sum;
+    });
 
-  const DataType local_sum = SubItemArrayPerItemSum{item_array};
-  return parallel::allReduceSum(local_sum);
+  return sum(item_sum);
 }
 
 template <typename DataType, typename ItemOfItem, typename ConnectivityPtr>
diff --git a/src/mesh/SubItemValuePerItemUtils.hpp b/src/mesh/SubItemValuePerItemUtils.hpp
index 304acae23a31e610695a2b309e72e7d8035d01a5..575c39c5238877bb064883c54b7d08792547000b 100644
--- a/src/mesh/SubItemValuePerItemUtils.hpp
+++ b/src/mesh/SubItemValuePerItemUtils.hpp
@@ -4,13 +4,12 @@
 #include <utils/Messenger.hpp>
 
 #include <mesh/Connectivity.hpp>
+#include <mesh/ItemValueUtils.hpp>
 #include <mesh/SubItemValuePerItem.hpp>
 #include <mesh/Synchronizer.hpp>
 #include <mesh/SynchronizerManager.hpp>
 #include <utils/PugsTraits.hpp>
 
-#include <iostream>
-
 template <typename DataType, typename ItemOfItem, typename ConnectivityPtr>
 std::remove_const_t<DataType>
 min(const SubItemValuePerItem<DataType, ItemOfItem, ConnectivityPtr>& sub_item_value_per_item)
@@ -213,101 +212,28 @@ max(const SubItemValuePerItem<DataType, ItemOfItem, ConnectivityPtr>& sub_item_v
 
 template <typename DataType, typename ItemOfItem, typename ConnectivityPtr>
 std::remove_const_t<DataType>
-sum(const SubItemValuePerItem<DataType, ItemOfItem, ConnectivityPtr>& item_value)
+sum(const SubItemValuePerItem<DataType, ItemOfItem, ConnectivityPtr>& sub_item_value_per_item)
 {
   using SubItemValuePerItemType = SubItemValuePerItem<DataType, ItemOfItem, ConnectivityPtr>;
   constexpr ItemType item_type  = ItemOfItem::item_type;
-  using ItemIsOwnedType         = ItemValue<const bool, item_type>;
   using data_type               = std::remove_const_t<typename SubItemValuePerItemType::data_type>;
   using index_type              = typename SubItemValuePerItemType::index_type;
 
   static_assert(not std::is_same_v<data_type, bool>, "sum cannot be called on boolean data");
 
-  class SubItemValuePerItemSum
-  {
-   private:
-    const SubItemValuePerItemType& m_sub_item_value_per_item;
-    const ItemIsOwnedType m_is_owned;
-
-   public:
-    PUGS_INLINE
-    operator data_type()
-    {
-      data_type reduced_value;
-      parallel_reduce(m_sub_item_value_per_item.numberOfItems(), *this, reduced_value);
-      return reduced_value;
-    }
-
-    PUGS_INLINE
-    void
-    operator()(const index_type& item_id, data_type& data) const
-    {
-      if (m_is_owned[item_id]) {
-        auto sub_item_array = m_sub_item_value_per_item.itemArray(item_id);
-        for (size_t i = 0; i < sub_item_array.size(); ++i) {
-          data += sub_item_array[i];
-        }
-      }
-    }
-
-    PUGS_INLINE
-    void
-    join(data_type& dst, const data_type& src) const
-    {
-      dst += src;
-    }
-
-    PUGS_INLINE
-    void
-    init(data_type& value) const
-    {
-      if constexpr (std::is_arithmetic_v<data_type>) {
-        value = 0;
-      } else {
-        static_assert(is_tiny_vector_v<data_type> or is_tiny_matrix_v<data_type>, "invalid data type");
-        value = zero;
+  ItemValue<data_type, item_type> item_sum{*sub_item_value_per_item.connectivity_ptr()};
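+  // the per-item sums of the sub-item values are stored in an ItemValue;
+  // ownership handling and the parallel reduction are delegated to sum(ItemValue)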
+  parallel_for(
+    sub_item_value_per_item.numberOfItems(), PUGS_LAMBDA(index_type item_id) {
+      auto sub_item_array          = sub_item_value_per_item.itemArray(item_id);
+      data_type sub_item_array_sum = sub_item_array[0];
+      for (size_t i = 1; i < sub_item_array.size(); ++i) {
+        sub_item_array_sum += sub_item_array[i];
       }
-    }
-
-    PUGS_INLINE
-    SubItemValuePerItemSum(const SubItemValuePerItemType& item_value)
-      : m_sub_item_value_per_item(item_value), m_is_owned([&](const IConnectivity& connectivity) {
-          Assert((connectivity.dimension() > 0) and (connectivity.dimension() <= 3),
-                 "unexpected connectivity dimension");
-
-          switch (connectivity.dimension()) {
-          case 1: {
-            const auto& connectivity_1d = static_cast<const Connectivity1D&>(connectivity);
-            return connectivity_1d.isOwned<item_type>();
-            break;
-          }
-          case 2: {
-            const auto& connectivity_2d = static_cast<const Connectivity2D&>(connectivity);
-            return connectivity_2d.isOwned<item_type>();
-            break;
-          }
-          case 3: {
-            const auto& connectivity_3d = static_cast<const Connectivity3D&>(connectivity);
-            return connectivity_3d.isOwned<item_type>();
-            break;
-          }
-            // LCOV_EXCL_START
-          default: {
-            throw UnexpectedError("unexpected dimension");
-          }
-            // LCOV_EXCL_STOP
-          }
-        }(*item_value.connectivity_ptr()))
-    {
-      ;
-    }
 
-    PUGS_INLINE
-    ~SubItemValuePerItemSum() = default;
-  };
+      item_sum[item_id] = sub_item_array_sum;
+    });
 
-  const DataType local_sum = SubItemValuePerItemSum{item_value};
-  return parallel::allReduceSum(local_sum);
+  return sum(item_sum);
 }
 
 template <typename DataType, typename ItemOfItem, typename ConnectivityPtr>
diff --git a/src/output/GnuplotWriter.cpp b/src/output/GnuplotWriter.cpp
index 50969799eeb4250b28d21722e8476ab86a88d9eb..8ccf7767180b30524ba54f7c4f085c3ad35783d2 100644
--- a/src/output/GnuplotWriter.cpp
+++ b/src/output/GnuplotWriter.cpp
@@ -5,6 +5,7 @@
 #include <mesh/Mesh.hpp>
 #include <mesh/MeshData.hpp>
 #include <mesh/MeshDataManager.hpp>
+#include <utils/Filesystem.hpp>
 #include <utils/Messenger.hpp>
 #include <utils/PugsTraits.hpp>
 #include <utils/RevisionInfo.hpp>
@@ -243,8 +244,16 @@ GnuplotWriter::_write(const MeshType& mesh,
                       const OutputNamedItemDataSet& output_named_item_data_set,
                       std::optional<double> time) const
 {
+  createDirectoryIfNeeded(_getFilename());
+
   if (parallel::rank() == 0) {
     std::ofstream fout{_getFilename()};
+    if (not fout) {
+      std::ostringstream error_msg;
+      error_msg << "cannot create file \"" << rang::fgB::yellow << _getFilename() << rang::fg::reset << '"';
+      throw NormalError(error_msg.str());
+    }
+
     fout.precision(15);
     fout.setf(std::ios_base::scientific);
 
@@ -286,6 +295,12 @@ GnuplotWriter::_write(const MeshType& mesh,
   for (size_t i_rank = 0; i_rank < parallel::size(); ++i_rank) {
     if (i_rank == parallel::rank()) {
       std::ofstream fout(_getFilename(), std::ios_base::app);
+      if (not fout) {
+        std::ostringstream error_msg;
+        error_msg << "cannot open file \"" << rang::fgB::yellow << _getFilename() << rang::fg::reset << '"';
+        throw NormalError(error_msg.str());
+      }
+
       fout.precision(15);
       fout.setf(std::ios_base::scientific);
 
diff --git a/src/output/GnuplotWriter1D.cpp b/src/output/GnuplotWriter1D.cpp
index a3156904843bdeb77be718de6a10614a3333afae..3e0740446e961505eb01e36eee5ba168e0f47dcc 100644
--- a/src/output/GnuplotWriter1D.cpp
+++ b/src/output/GnuplotWriter1D.cpp
@@ -5,6 +5,7 @@
 #include <mesh/Mesh.hpp>
 #include <mesh/MeshData.hpp>
 #include <mesh/MeshDataManager.hpp>
+#include <utils/Filesystem.hpp>
 #include <utils/Messenger.hpp>
 #include <utils/PugsTraits.hpp>
 #include <utils/RevisionInfo.hpp>
@@ -322,6 +323,12 @@ GnuplotWriter1D::_write(const MeshType& mesh,
 
   if (parallel::rank() == 0) {
     fout.open(_getFilename());
+    if (not fout) {
+      std::ostringstream error_msg;
+      error_msg << "cannot create file \"" << rang::fgB::yellow << _getFilename() << rang::fg::reset << '"';
+      throw NormalError(error_msg.str());
+    }
+
     fout.precision(15);
     fout.setf(std::ios_base::scientific);
     fout << _getDateAndVersionComment();
diff --git a/src/output/VTKWriter.cpp b/src/output/VTKWriter.cpp
index 0a40c7265e05a17d31e59bd41eeb10cb63935821..6ec6295e44b54bef9ea21c110120fdd873f71488 100644
--- a/src/output/VTKWriter.cpp
+++ b/src/output/VTKWriter.cpp
@@ -4,6 +4,7 @@
 #include <mesh/Mesh.hpp>
 #include <mesh/MeshData.hpp>
 #include <mesh/MeshDataManager.hpp>
+#include <utils/Filesystem.hpp>
 #include <utils/Messenger.hpp>
 #include <utils/RevisionInfo.hpp>
 #include <utils/Stringify.hpp>
@@ -364,8 +365,16 @@ VTKWriter::_write(const MeshType& mesh,
   output_named_item_data_set.add(NamedItemData{"cell_number", mesh.connectivity().cellNumber()});
   output_named_item_data_set.add(NamedItemData{"node_number", mesh.connectivity().nodeNumber()});
 
+  createDirectoryIfNeeded(m_base_filename);
+
   if (parallel::rank() == 0) {   // write PVTK file
     std::ofstream fout(_getFilenamePVTU());
+    if (not fout) {
+      std::ostringstream error_msg;
+      error_msg << "cannot create file \"" << rang::fgB::yellow << _getFilenamePVTU() << rang::fg::reset << '"';
+      throw NormalError(error_msg.str());
+    }
+
     fout << "<?xml version=\"1.0\"?>\n";
     fout << _getDateAndVersionComment();
     fout << "<VTKFile type=\"PUnstructuredGrid\">\n";
@@ -424,7 +433,7 @@ VTKWriter::_write(const MeshType& mesh,
     fout << "</PCellData>\n";
 
     for (size_t i_rank = 0; i_rank < parallel::size(); ++i_rank) {
-      fout << "<Piece Source=\"" << _getFilenameVTU(i_rank) << "\"/>\n";
+      fout << "<Piece Source=" << std::filesystem::path{_getFilenameVTU(i_rank)}.filename() << "/>\n";
     }
     fout << "</PUnstructuredGrid>\n";
     fout << "</VTKFile>\n";
@@ -435,6 +444,13 @@ VTKWriter::_write(const MeshType& mesh,
 
     // write VTK files
     std::ofstream fout(_getFilenameVTU(parallel::rank()));
+    if (not fout) {
+      std::ostringstream error_msg;
+      error_msg << "cannot create file \"" << rang::fgB::yellow << _getFilenameVTU(parallel::rank()) << rang::fg::reset
+                << '"';
+      throw NormalError(error_msg.str());
+    }
+
     fout << "<?xml version=\"1.0\"?>\n";
     fout << _getDateAndVersionComment();
     fout << "<VTKFile type=\"UnstructuredGrid\"";
@@ -632,7 +648,13 @@ VTKWriter::_write(const MeshType& mesh,
   }
 
   if (parallel::rank() == 0) {   // write PVD file
-    std::ofstream fout(m_base_filename + ".pvd");
+    const std::string pvd_filename = m_base_filename + ".pvd";
+    std::ofstream fout(pvd_filename);
+    if (not fout) {
+      std::ostringstream error_msg;
+      error_msg << "cannot create file \"" << rang::fgB::yellow << pvd_filename << rang::fg::reset << '"';
+      throw NormalError(error_msg.str());
+    }
 
     fout << "<?xml version=\"1.0\"?>\n";
     fout << _getDateAndVersionComment();
@@ -645,11 +667,13 @@ VTKWriter::_write(const MeshType& mesh,
         sout << m_base_filename;
         sout << '.' << std::setfill('0') << std::setw(4) << i_time << ".pvtu";
 
-        fout << "<DataSet timestep=\"" << m_period_manager->savedTime(i_time) << "\" file=\"" << sout.str() << "\"/>\n";
+        fout << "<DataSet timestep=\"" << m_period_manager->savedTime(i_time)
+             << "\" file=" << std::filesystem::path{sout.str()}.filename() << "/>\n";
       }
-      fout << "<DataSet timestep=\"" << *time << "\" file=\"" << _getFilenamePVTU() << "\"/>\n";
+      fout << "<DataSet timestep=\"" << *time << "\" file=" << std::filesystem::path{_getFilenamePVTU()}.filename()
+           << "/>\n";
     } else {
-      fout << "<DataSet file=\"" << _getFilenamePVTU() << "\"/>\n";
+      fout << "<DataSet file=" << std::filesystem::path{_getFilenamePVTU()}.filename() << "/>\n";
     }
 
     fout << "</Collection>\n";
diff --git a/src/scheme/DiscreteFunctionP0.hpp b/src/scheme/DiscreteFunctionP0.hpp
index 6587a19f2c77d1f063f38765b4e768d92798f7a4..5a49b9db3d7f6ecd6b589f8a58703ec02d311f6b 100644
--- a/src/scheme/DiscreteFunctionP0.hpp
+++ b/src/scheme/DiscreteFunctionP0.hpp
@@ -708,79 +708,19 @@ class DiscreteFunctionP0
     return sum(f.m_cell_values);
   }
 
- private:
-  class Integrator
-  {
-   private:
-    const DiscreteFunctionP0 m_function;
-    CellValue<const double> m_cell_volume;
-    CellValue<const bool> m_cell_is_owned;
-
-   public:
-    PUGS_INLINE
-    operator std::decay_t<data_type>()
-    {
-      std::decay_t<data_type> reduced_value;
-      if constexpr (std::is_arithmetic_v<std::decay_t<data_type>>) {
-        reduced_value = 0;
-      } else {
-        static_assert(is_tiny_vector_v<std::decay_t<data_type>> or is_tiny_matrix_v<std::decay_t<data_type>>,
-                      "invalid data type");
-        reduced_value = zero;
-      }
-      parallel_reduce(m_cell_volume.numberOfItems(), *this, reduced_value);
-      return reduced_value;
-    }
-
-    PUGS_INLINE
-    void
-    operator()(const CellId& cell_id, std::decay_t<data_type>& data) const
-    {
-      if (m_cell_is_owned[cell_id]) {
-        data += m_cell_volume[cell_id] * m_function[cell_id];
-      }
-    }
-
-    PUGS_INLINE
-    void
-    join(std::decay_t<data_type>& dst, const std::decay_t<data_type>& src) const
-    {
-      dst += src;
-    }
-
-    PUGS_INLINE
-    void
-    init(std::decay_t<data_type>& value) const
-    {
-      if constexpr (std::is_arithmetic_v<data_type>) {
-        value = 0;
-      } else {
-        static_assert(is_tiny_vector_v<std::decay_t<data_type>> or is_tiny_matrix_v<std::decay_t<data_type>>,
-                      "invalid data type");
-        value = zero;
-      }
-    }
-
-    PUGS_INLINE
-    Integrator(const DiscreteFunctionP0& function) : m_function(function)
-    {
-      const Mesh<Connectivity<Dimension>>& mesh = *function.m_mesh;
-      m_cell_volume                             = MeshDataManager::instance().getMeshData(mesh).Vj();
-      m_cell_is_owned                           = mesh.connectivity().cellIsOwned();
-    }
-
-    PUGS_INLINE
-    ~Integrator() = default;
-  };
-
- public:
   PUGS_INLINE friend DataType
   integrate(const DiscreteFunctionP0& f)
   {
     Assert(f.m_cell_values.isBuilt());
-    const DataType integral = Integrator{f};
 
-    return parallel::allReduceSum(integral);
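+    // the integral is obtained by summing cell_volume * f over the cells; the
+    // (possibly reproducible) parallel reduction is delegated to sum(CellValue)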
+    const Mesh<Connectivity<Dimension>>& mesh = *f.m_mesh;
+    CellValue<const double> cell_volume       = MeshDataManager::instance().getMeshData(mesh).Vj();
+
+    CellValue<std::remove_const_t<DataType>> f_v{mesh.connectivity()};
+    parallel_for(
+      mesh.numberOfCells(), PUGS_LAMBDA(CellId cell_id) { f_v[cell_id] = cell_volume[cell_id] * f[cell_id]; });
+
+    return sum(f_v);
   }
 
   DiscreteFunctionP0(const std::shared_ptr<const MeshType>& mesh) : m_mesh{mesh}, m_cell_values{mesh->connectivity()} {}
diff --git a/src/scheme/FluxingAdvectionSolver.cpp b/src/scheme/FluxingAdvectionSolver.cpp
index 264ecef042956e6b77230e00f49cd321b50dcc6a..754c6e3ac6448a96e5e05c0a7f60967243b4798f 100644
--- a/src/scheme/FluxingAdvectionSolver.cpp
+++ b/src/scheme/FluxingAdvectionSolver.cpp
@@ -1,5 +1,8 @@
 #include <scheme/FluxingAdvectionSolver.hpp>
 
+#include <analysis/GaussQuadratureDescriptor.hpp>
+#include <analysis/QuadratureManager.hpp>
+#include <geometry/PrismTransformation.hpp>
 #include <language/utils/EvaluateAtPoints.hpp>
 #include <language/utils/InterpolateItemArray.hpp>
 #include <language/utils/InterpolateItemValue.hpp>
@@ -322,6 +325,10 @@ template <>
 FaceValue<double>
 FluxingAdvectionSolver<3>::_computeAlgebraicFluxingVolume()
 {
+  // because of the z component of the Jacobian determinant, degree 3
+  // polynomials must be integrated exactly
+  const QuadratureFormula<3>& gauss = QuadratureManager::instance().getPrismFormula(GaussQuadratureDescriptor(3));
+
   const auto face_to_node_matrix = m_old_mesh->connectivity().faceToNodeMatrix();
   FaceValue<double> algebraic_fluxing_volume(m_new_mesh->connectivity());
   auto old_xr = m_old_mesh->xr();
@@ -329,31 +336,10 @@ FluxingAdvectionSolver<3>::_computeAlgebraicFluxingVolume()
   parallel_for(
     m_new_mesh->numberOfFaces(), PUGS_LAMBDA(FaceId face_id) {
       const auto& face_to_node = face_to_node_matrix[face_id];
-      if (face_to_node.size() == 4) {
-        const Rd& x0 = old_xr[face_to_node[0]];
-        const Rd& x1 = old_xr[face_to_node[1]];
-        const Rd& x2 = old_xr[face_to_node[2]];
-        const Rd& x3 = old_xr[face_to_node[3]];
-
-        const Rd& x4 = new_xr[face_to_node[0]];
-        const Rd& x5 = new_xr[face_to_node[1]];
-        const Rd& x6 = new_xr[face_to_node[2]];
-        const Rd& x7 = new_xr[face_to_node[3]];
-
-        const Rd& a1 = x6 - x1;
-        const Rd& a2 = x6 - x3;
-        const Rd& a3 = x6 - x4;
-
-        const Rd& b1 = x7 - x0;
-        const Rd& b2 = x5 - x0;
-        const Rd& b3 = x2 - x0;
 
-        TinyMatrix<3> M1(a1 + b1, a2, a3);
-        TinyMatrix<3> M2(b1, a2 + b2, a3);
-        TinyMatrix<3> M3(a1, b2, a3 + b3);
+      const size_t face_nb_node = face_to_node.size();
 
-        algebraic_fluxing_volume[face_id] = (det(M1) + det(M2) + det(M3)) / 12;
-      } else if (face_to_node.size() == 3) {
+      if (face_nb_node == 3) {
         const Rd& x0 = old_xr[face_to_node[0]];
         const Rd& x1 = old_xr[face_to_node[1]];
         const Rd& x2 = old_xr[face_to_node[2]];
@@ -362,21 +348,40 @@ FluxingAdvectionSolver<3>::_computeAlgebraicFluxingVolume()
         const Rd& x4 = new_xr[face_to_node[1]];
         const Rd& x5 = new_xr[face_to_node[2]];
 
-        const Rd& a1 = x5 - x1;
-        const Rd& a2 = x5 - x2;
-        const Rd& a3 = x5 - x3;
+        double volume = 0;
 
-        const Rd& b1 = x5 - x0;
-        const Rd& b2 = x4 - x0;
-        const Rd& b3 = x2 - x0;
-
-        TinyMatrix<3> M1(a1 + b1, a2, a3);
-        TinyMatrix<3> M2(b1, a2 + b2, a3);
-        TinyMatrix<3> M3(a1, b2, a3 + b3);
+        PrismTransformation T(x0, x1, x2, x3, x4, x5);
+        for (size_t i = 0; i < gauss.numberOfPoints(); ++i) {
+          volume += gauss.weight(i) * T.jacobianDeterminant(gauss.point(i));
+        }
 
-        algebraic_fluxing_volume[face_id] = (det(M1) + det(M2) + det(M3)) / 12;
+        algebraic_fluxing_volume[face_id] = volume;
       } else {
-        throw NotImplementedError("Not implemented for non quad faces");
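+        // general polygonal face: split it into triangles around the face
+        // barycenter and accumulate the volume swept by each triangle pair,
+        // integrated with the prism quadrature formula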
+        Rd xg_old = zero;
+        for (size_t i_node = 0; i_node < face_nb_node; ++i_node) {
+          xg_old += old_xr[face_to_node[i_node]];
+        }
+        xg_old *= 1. / face_nb_node;
+        Rd xg_new = zero;
+        for (size_t i_node = 0; i_node < face_nb_node; ++i_node) {
+          xg_new += new_xr[face_to_node[i_node]];
+        }
+        xg_new *= 1. / face_nb_node;
+
+        double volume = 0;
+        for (size_t i_node = 0; i_node < face_nb_node; ++i_node) {
+          PrismTransformation T(xg_old,                                              //
+                                old_xr[face_to_node[i_node]],                        //
+                                old_xr[face_to_node[(i_node + 1) % face_nb_node]],   //
+                                xg_new,                                              //
+                                new_xr[face_to_node[i_node]],                        //
+                                new_xr[face_to_node[(i_node + 1) % face_nb_node]]);
+          for (size_t i = 0; i < gauss.numberOfPoints(); ++i) {
+            volume += gauss.weight(i) * T.jacobianDeterminant(gauss.point(i));
+          }
+        }
+
+        algebraic_fluxing_volume[face_id] = volume;
       }
     });
 
diff --git a/src/utils/Array.hpp b/src/utils/Array.hpp
index 4943d4c08d983f67d1550262e34aa80030d49bf1..2f0146e2b9d08c231eaadee7e7f50eeea439ddaf 100644
--- a/src/utils/Array.hpp
+++ b/src/utils/Array.hpp
@@ -6,6 +6,8 @@
 #include <utils/PugsAssert.hpp>
 #include <utils/PugsMacros.hpp>
 #include <utils/PugsUtils.hpp>
+#include <utils/ReproducibleSumManager.hpp>
+#include <utils/ReproducibleSumUtils.hpp>
 #include <utils/Types.hpp>
 
 #include <algorithm>
@@ -373,8 +375,9 @@ template <typename DataType>
 [[nodiscard]] std::remove_const_t<DataType>
 sum(const Array<DataType>& array)
 {
-  using ArrayType  = Array<DataType>;
-  using data_type  = std::remove_const_t<DataType>;
+  using ArrayType = Array<DataType>;
+  using data_type = std::remove_const_t<DataType>;
+
   using index_type = typename ArrayType::index_type;
 
   static_assert(not std::is_same_v<data_type, bool>, "sum cannot be called on boolean data");
@@ -429,7 +432,26 @@ sum(const Array<DataType>& array)
     ~ArraySum() = default;
   };
 
-  return ArraySum(array);
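+  // when reproducible sums are enabled, floating-point scalars, TinyVector and
+  // TinyMatrix data use the reproducible summation algorithm; other data types
+  // fall back to the standard parallel reduction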
+  if (ReproducibleSumManager::reproducibleSums()) {
+    if constexpr (std::is_floating_point_v<data_type>) {
+      ReproducibleScalarSum r_sum(array);
+      return r_sum.getSum();
+    } else if constexpr (not std::is_arithmetic_v<data_type>) {
+      if constexpr (is_tiny_vector_v<data_type> and std::is_floating_point_v<typename data_type::data_type>) {
+        ReproducibleTinyVectorSum r_sum(array);
+        return r_sum.getSum();
+      } else if constexpr (is_tiny_matrix_v<data_type> and std::is_floating_point_v<typename data_type::data_type>) {
+        ReproducibleTinyMatrixSum r_sum(array);
+        return r_sum.getSum();
+      } else {
+        return ArraySum(array);
+      }
+    } else {
+      return ArraySum(array);
+    }
+  } else {
+    return ArraySum(array);
+  }
 }
 
 #endif   // ARRAY_HPP
diff --git a/src/utils/CMakeLists.txt b/src/utils/CMakeLists.txt
index 7c6c99fcb8edee03b3cbed4687275eef3f98a527..9284d3f038b599a073649b63c249b8144ec1bef5 100644
--- a/src/utils/CMakeLists.txt
+++ b/src/utils/CMakeLists.txt
@@ -4,6 +4,7 @@ add_library(
   PugsUtils
   BuildInfo.cpp
   BacktraceManager.cpp
+  CommunicatorManager.cpp
   ConsoleManager.cpp
   Demangle.cpp
   Exceptions.cpp
@@ -14,6 +15,7 @@ add_library(
   PETScWrapper.cpp
   PugsUtils.cpp
   RandomEngine.cpp
+  ReproducibleSumManager.cpp
   RevisionInfo.cpp
   SignalManager.cpp
   SLEPcWrapper.cpp
diff --git a/src/utils/CommunicatorManager.cpp b/src/utils/CommunicatorManager.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..511257c4bb10f941ca592d2500e55b32ef45ec7d
--- /dev/null
+++ b/src/utils/CommunicatorManager.cpp
@@ -0,0 +1,27 @@
+#include <utils/CommunicatorManager.hpp>
+
+#include <utils/PugsAssert.hpp>
+
+std::optional<int> CommunicatorManager::s_split_color;
+
+bool
+CommunicatorManager::hasSplitColor()
+{
+  return s_split_color.has_value();
+}
+
+int
+CommunicatorManager::splitColor()
+{
+  Assert(s_split_color.has_value(), "split color has not been defined");
+
+  return s_split_color.value();
+}
+
+void
+CommunicatorManager::setSplitColor(int split_color)
+{
+  Assert(not s_split_color.has_value(), "split color has already been defined");
+
+  s_split_color = split_color;
+}
diff --git a/src/utils/CommunicatorManager.hpp b/src/utils/CommunicatorManager.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..df0ea8dba61b7ab51f9dd5819a9a20c9eca684ec
--- /dev/null
+++ b/src/utils/CommunicatorManager.hpp
@@ -0,0 +1,17 @@
+#ifndef COMMUNICATOR_MANAGER_HPP
+#define COMMUNICATOR_MANAGER_HPP
+
+#include <optional>
+
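+// Holds an optional MPI "split color". When set before the parallel
+// environment is built, the Messenger splits MPI_COMM_WORLD accordingly and
+// pugs runs on the resulting sub-communicator.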
+class CommunicatorManager
+{
+ private:
+  static std::optional<int> s_split_color;
+
+ public:
+  static bool hasSplitColor();
+  static int splitColor();
+  static void setSplitColor(int split_color);
+};
+
+#endif   // COMMUNICATOR_MANAGER_HPP
diff --git a/src/utils/Filesystem.hpp b/src/utils/Filesystem.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..63af52b2e432f1baf55d2cbfd895485a408f7e10
--- /dev/null
+++ b/src/utils/Filesystem.hpp
@@ -0,0 +1,23 @@
+#ifndef FILESYSTEM_HPP
+#define FILESYSTEM_HPP
+
+#include <utils/Exceptions.hpp>
+#include <utils/PugsMacros.hpp>
+
+#include <filesystem>
+
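+// Creates the parent directory of the given file name (and any missing
+// ancestor) so that the file can then be opened for writing.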
+PUGS_INLINE void
+createDirectoryIfNeeded(const std::string& filename)
+{
+  std::filesystem::path path = std::filesystem::path{filename}.parent_path();
+  if (not path.empty()) {
+    try {
+      std::filesystem::create_directories(path);
+    }
+    catch (std::filesystem::filesystem_error& e) {
+      throw NormalError(e.what());
+    }
+  }
+}
+
+#endif   // FILESYSTEM_HPP
diff --git a/src/utils/Messenger.cpp b/src/utils/Messenger.cpp
index e3fff49d02f94649f641f5a273f9f607cf27edd9..05dfa43e5a22f5223fd6c1e8291302c5366f5015 100644
--- a/src/utils/Messenger.cpp
+++ b/src/utils/Messenger.cpp
@@ -1,5 +1,7 @@
 #include <utils/Messenger.hpp>
 
+#include <utils/CommunicatorManager.hpp>
+
 #include <iostream>
 
 namespace parallel
@@ -31,13 +33,34 @@ Messenger::Messenger([[maybe_unused]] int& argc, [[maybe_unused]] char* argv[])
 #ifdef PUGS_HAS_MPI
   MPI_Init(&argc, &argv);
 
-  m_rank = []() {
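+  // when a split color has been set (see CommunicatorManager), pugs runs on a
+  // sub-communicator of MPI_COMM_WORLD obtained through MPI_Comm_split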
+  if (CommunicatorManager::hasSplitColor()) {
+    int key = 0;
+
+    auto res = MPI_Comm_split(MPI_COMM_WORLD, CommunicatorManager::splitColor(), key, &m_pugs_comm_world);
+    if (res) {
+      MPI_Abort(MPI_COMM_WORLD, res);
+    }
+  }
+
+  m_rank = [&]() {
+    int rank;
+    MPI_Comm_rank(m_pugs_comm_world, &rank);
+    return rank;
+  }();
+
+  m_size = [&]() {
+    int size = 0;
+    MPI_Comm_size(m_pugs_comm_world, &size);
+    return size;
+  }();
+
+  m_global_rank = [&]() {
     int rank;
     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
     return rank;
   }();
 
-  m_size = []() {
+  m_global_size = [&]() {
     int size = 0;
     MPI_Comm_size(MPI_COMM_WORLD, &size);
     return size;
@@ -64,7 +87,7 @@ void
 Messenger::barrier() const
 {
 #ifdef PUGS_HAS_MPI
-  MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Barrier(m_pugs_comm_world);
 #endif   // PUGS_HAS_MPI
 }
 
diff --git a/src/utils/Messenger.hpp b/src/utils/Messenger.hpp
index 416dbdf1d65e8b028b9ed39fff2a2acb4ebcb2d5..21e17abf24600869b619a66415dc51714ec2cb70 100644
--- a/src/utils/Messenger.hpp
+++ b/src/utils/Messenger.hpp
@@ -23,7 +23,7 @@ namespace parallel
 {
 class Messenger
 {
- private:
+ public:
   struct helper
   {
 #ifdef PUGS_HAS_MPI
@@ -87,9 +87,16 @@ class Messenger
   static Messenger* m_instance;
   Messenger(int& argc, char* argv[]);
 
+#ifdef PUGS_HAS_MPI
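+  // Communicator used for pugs communications: MPI_COMM_WORLD by default,
+  // or the sub-communicator obtained by MPI_Comm_split when a split color is set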
+  MPI_Comm m_pugs_comm_world = MPI_COMM_WORLD;
+#endif   // PUGS_HAS_MPI
   size_t m_rank{0};
   size_t m_size{1};
 
+  // Rank and size of the process within the whole MPI_COMM_WORLD
+  size_t m_global_rank{0};
+  size_t m_global_size{1};
+
   template <typename DataType>
   void
   _gather(const DataType& data, Array<DataType> gather, size_t rank) const
@@ -102,7 +109,7 @@ class Messenger
 
     auto gather_address = (gather.size() > 0) ? &(gather[0]) : nullptr;
 
-    MPI_Gather(&data, 1, mpi_datatype, gather_address, 1, mpi_datatype, rank, MPI_COMM_WORLD);
+    MPI_Gather(&data, 1, mpi_datatype, gather_address, 1, mpi_datatype, rank, m_pugs_comm_world);
 #else    // PUGS_HAS_MPI
     gather[0] = data;
 #endif   // PUGS_HAS_MPI
@@ -131,7 +138,7 @@ class Messenger
     auto gather_address = (gather_array.size() > 0) ? &(gather_array[0]) : nullptr;
 
     MPI_Gather(data_address, data_array.size(), mpi_datatype, gather_address, data_array.size(), mpi_datatype, rank,
-               MPI_COMM_WORLD);
+               m_pugs_comm_world);
 #else    // PUGS_HAS_MPI
     copy_to(data_array, gather_array);
 #endif   // PUGS_HAS_MPI
@@ -172,7 +179,7 @@ class Messenger
     auto gather_address    = (gather_array.size() > 0) ? &(gather_array[0]) : nullptr;
 
     MPI_Gatherv(data_address, data_array.size(), mpi_datatype, gather_address, sizes_address, positions_address,
-                mpi_datatype, rank, MPI_COMM_WORLD);
+                mpi_datatype, rank, m_pugs_comm_world);
 #else    // PUGS_HAS_MPI
     copy_to(data_array, gather_array);
 #endif   // PUGS_HAS_MPI
@@ -188,7 +195,7 @@ class Messenger
 #ifdef PUGS_HAS_MPI
     MPI_Datatype mpi_datatype = Messenger::helper::mpiType<DataType>();
 
-    MPI_Allgather(&data, 1, mpi_datatype, &(gather[0]), 1, mpi_datatype, MPI_COMM_WORLD);
+    MPI_Allgather(&data, 1, mpi_datatype, &(gather[0]), 1, mpi_datatype, m_pugs_comm_world);
 #else    // PUGS_HAS_MPI
     gather[0] = data;
 #endif   // PUGS_HAS_MPI
@@ -217,7 +224,7 @@ class Messenger
     auto gather_address = (gather_array.size() > 0) ? &(gather_array[0]) : nullptr;
 
     MPI_Allgather(data_address, data_array.size(), mpi_datatype, gather_address, data_array.size(), mpi_datatype,
-                  MPI_COMM_WORLD);
+                  m_pugs_comm_world);
 #else    // PUGS_HAS_MPI
     copy_to(data_array, gather_array);
 #endif   // PUGS_HAS_MPI
@@ -257,7 +264,7 @@ class Messenger
     auto gather_address    = (gather_array.size() > 0) ? &(gather_array[0]) : nullptr;
 
     MPI_Allgatherv(data_address, data_array.size(), mpi_datatype, gather_address, sizes_address, positions_address,
-                   mpi_datatype, MPI_COMM_WORLD);
+                   mpi_datatype, m_pugs_comm_world);
 #else    // PUGS_HAS_MPI
     copy_to(data_array, gather_array);
 #endif   // PUGS_HAS_MPI
@@ -273,7 +280,7 @@ class Messenger
 #ifdef PUGS_HAS_MPI
     MPI_Datatype mpi_datatype = Messenger::helper::mpiType<DataType>();
 
-    MPI_Bcast(&data, 1, mpi_datatype, root_rank, MPI_COMM_WORLD);
+    MPI_Bcast(&data, 1, mpi_datatype, root_rank, m_pugs_comm_world);
 #endif   // PUGS_HAS_MPI
   }
 
@@ -289,7 +296,7 @@ class Messenger
     auto array_address = (array.size() > 0) ? &(array[0]) : nullptr;
 
     MPI_Datatype mpi_datatype = Messenger::helper::mpiType<DataType>();
-    MPI_Bcast(array_address, array.size(), mpi_datatype, root_rank, MPI_COMM_WORLD);
+    MPI_Bcast(array_address, array.size(), mpi_datatype, root_rank, m_pugs_comm_world);
 #endif   // PUGS_HAS_MPI
   }
 
@@ -318,7 +325,7 @@ class Messenger
     auto sent_address = (sent_array.size() > 0) ? &(sent_array[0]) : nullptr;
     auto recv_address = (recv_array.size() > 0) ? &(recv_array[0]) : nullptr;
 
-    MPI_Alltoall(sent_address, count, mpi_datatype, recv_address, count, mpi_datatype, MPI_COMM_WORLD);
+    MPI_Alltoall(sent_address, count, mpi_datatype, recv_address, count, mpi_datatype, m_pugs_comm_world);
 #else    // PUGS_HAS_MPI
     copy_to(sent_array, recv_array);
 #endif   // PUGS_HAS_MPI
@@ -348,7 +355,7 @@ class Messenger
       const auto sent_array = sent_array_list[i_send];
       if (sent_array.size() > 0) {
         MPI_Request request;
-        MPI_Isend(&(sent_array[0]), sent_array.size(), mpi_datatype, i_send, 0, MPI_COMM_WORLD, &request);
+        MPI_Isend(&(sent_array[0]), sent_array.size(), mpi_datatype, i_send, 0, m_pugs_comm_world, &request);
         request_list.push_back(request);
       }
     }
@@ -357,7 +364,7 @@ class Messenger
       auto recv_array = recv_array_list[i_recv];
       if (recv_array.size() > 0) {
         MPI_Request request;
-        MPI_Irecv(&(recv_array[0]), recv_array.size(), mpi_datatype, i_recv, 0, MPI_COMM_WORLD, &request);
+        MPI_Irecv(&(recv_array[0]), recv_array.size(), mpi_datatype, i_recv, 0, m_pugs_comm_world, &request);
         request_list.push_back(request);
       }
     }
@@ -424,6 +431,33 @@ class Messenger
     return m_size;
   }
 
+  // The global rank is the rank within the whole MPI_COMM_WORLD; for
+  // ordinary parallel code, rank() is generally the one to use
+  PUGS_INLINE
+  const size_t&
+  globalRank() const
+  {
+    return m_global_rank;
+  }
+
+  // The global size is the size of the whole MPI_COMM_WORLD; for
+  // ordinary parallel code, size() is generally the one to use
+  PUGS_INLINE
+  const size_t&
+  globalSize() const
+  {
+    return m_global_size;
+  }
+
+#ifdef PUGS_HAS_MPI
+  PUGS_INLINE
+  const MPI_Comm&
+  comm() const
+  {
+    return m_pugs_comm_world;
+  }
+#endif   // PUGS_HAS_MPI
+
   void barrier() const;
 
   template <typename DataType>
@@ -438,7 +472,7 @@ class Messenger
     MPI_Datatype mpi_datatype = Messenger::helper::mpiType<DataType>();
 
     DataType min_data = data;
-    MPI_Allreduce(&data, &min_data, 1, mpi_datatype, MPI_MIN, MPI_COMM_WORLD);
+    MPI_Allreduce(&data, &min_data, 1, mpi_datatype, MPI_MIN, m_pugs_comm_world);
 
     return min_data;
 #else    // PUGS_HAS_MPI
@@ -456,7 +490,7 @@ class Messenger
     MPI_Datatype mpi_datatype = Messenger::helper::mpiType<DataType>();
 
     DataType max_data = data;
-    MPI_Allreduce(&data, &max_data, 1, mpi_datatype, MPI_LAND, MPI_COMM_WORLD);
+    MPI_Allreduce(&data, &max_data, 1, mpi_datatype, MPI_LAND, m_pugs_comm_world);
 
     return max_data;
 #else    // PUGS_HAS_MPI
@@ -474,7 +508,7 @@ class Messenger
     MPI_Datatype mpi_datatype = Messenger::helper::mpiType<DataType>();
 
     DataType max_data = data;
-    MPI_Allreduce(&data, &max_data, 1, mpi_datatype, MPI_LOR, MPI_COMM_WORLD);
+    MPI_Allreduce(&data, &max_data, 1, mpi_datatype, MPI_LOR, m_pugs_comm_world);
 
     return max_data;
 #else    // PUGS_HAS_MPI
@@ -494,7 +528,7 @@ class Messenger
     MPI_Datatype mpi_datatype = Messenger::helper::mpiType<DataType>();
 
     DataType max_data = data;
-    MPI_Allreduce(&data, &max_data, 1, mpi_datatype, MPI_MAX, MPI_COMM_WORLD);
+    MPI_Allreduce(&data, &max_data, 1, mpi_datatype, MPI_MAX, m_pugs_comm_world);
 
     return max_data;
 #else    // PUGS_HAS_MPI
@@ -514,18 +548,20 @@ class Messenger
       MPI_Datatype mpi_datatype = Messenger::helper::mpiType<DataType>();
 
       DataType data_sum = data;
-      MPI_Allreduce(&data, &data_sum, 1, mpi_datatype, MPI_SUM, MPI_COMM_WORLD);
+      MPI_Allreduce(&data, &data_sum, 1, mpi_datatype, MPI_SUM, m_pugs_comm_world);
 
       return data_sum;
-    } else if (is_trivially_castable<DataType>) {
+    } else if constexpr (is_trivially_castable<DataType>) {
       using InnerDataType = typename DataType::data_type;
 
       MPI_Datatype mpi_datatype = Messenger::helper::mpiType<InnerDataType>();
       const int size            = sizeof(DataType) / sizeof(InnerDataType);
       DataType data_sum         = data;
-      MPI_Allreduce(&data, &data_sum, size, mpi_datatype, MPI_SUM, MPI_COMM_WORLD);
+      MPI_Allreduce(&data, &data_sum, size, mpi_datatype, MPI_SUM, m_pugs_comm_world);
 
       return data_sum;
+    } else {
+      throw UnexpectedError("invalid data type for reduce sum");
     }
 #else    // PUGS_HAS_MPI
     return data;
diff --git a/src/utils/PETScWrapper.cpp b/src/utils/PETScWrapper.cpp
index 18b16aa1435c2f27dc4d850c0727ddb4e259acb0..fb7fc932a2b5c3013d3831fb22bbbaa997d09bb0 100644
--- a/src/utils/PETScWrapper.cpp
+++ b/src/utils/PETScWrapper.cpp
@@ -1,5 +1,6 @@
 #include <utils/PETScWrapper.hpp>
 
+#include <utils/Messenger.hpp>
 #include <utils/pugs_config.hpp>
 
 #ifdef PUGS_HAS_PETSC
@@ -12,6 +13,7 @@ void
 initialize([[maybe_unused]] int& argc, [[maybe_unused]] char* argv[])
 {
 #ifdef PUGS_HAS_PETSC
+  PETSC_COMM_WORLD = parallel::Messenger::getInstance().comm();
   PetscOptionsSetValue(NULL, "-no_signal_handler", "true");
   PetscOptionsSetValue(NULL, "-fp_trap", "false");
   PetscInitialize(&argc, &argv, 0, 0);
diff --git a/src/utils/Partitioner.cpp b/src/utils/Partitioner.cpp
index 347b07da6bffdf62d0218ca5231ed9e03a9c698f..e63a4b48ccfdd57952d77797bbbee3344187352e 100644
--- a/src/utils/Partitioner.cpp
+++ b/src/utils/Partitioner.cpp
@@ -31,7 +31,8 @@ Partitioner::partition(const CRSGraph& graph)
   Array<int> part(0);
 
   MPI_Group world_group;
-  MPI_Comm_group(MPI_COMM_WORLD, &world_group);
+
+  MPI_Comm_group(parallel::Messenger::getInstance().comm(), &world_group);
 
   MPI_Group mesh_group;
   std::vector<int> group_ranks = [&]() {
@@ -49,7 +50,7 @@ Partitioner::partition(const CRSGraph& graph)
   MPI_Group_incl(world_group, group_ranks.size(), &(group_ranks[0]), &mesh_group);
 
   MPI_Comm parmetis_comm;
-  MPI_Comm_create_group(MPI_COMM_WORLD, mesh_group, 1, &parmetis_comm);
+  MPI_Comm_create_group(parallel::Messenger::getInstance().comm(), mesh_group, 1, &parmetis_comm);
 
   int local_number_of_nodes = graph.numberOfNodes();
 
diff --git a/src/utils/PugsUtils.cpp b/src/utils/PugsUtils.cpp
index a684a71a98abe11a8899d2c0e15c3ac515adb6cc..c265c5f906dc69fecb73f4e70b7877a735c1b39c 100644
--- a/src/utils/PugsUtils.cpp
+++ b/src/utils/PugsUtils.cpp
@@ -2,10 +2,12 @@
 
 #include <utils/BacktraceManager.hpp>
 #include <utils/BuildInfo.hpp>
+#include <utils/CommunicatorManager.hpp>
 #include <utils/ConsoleManager.hpp>
 #include <utils/FPEManager.hpp>
 #include <utils/Messenger.hpp>
 #include <utils/PETScWrapper.hpp>
+#include <utils/ReproducibleSumManager.hpp>
 #include <utils/RevisionInfo.hpp>
 #include <utils/SLEPcWrapper.hpp>
 #include <utils/SignalManager.hpp>
@@ -81,7 +83,6 @@ setDefaultOMPEnvironment()
 std::string
 initialize(int& argc, char* argv[])
 {
-  parallel::Messenger::create(argc, argv);
   bool enable_fpe     = true;
   bool enable_signals = true;
   int nb_threads      = -1;
@@ -118,20 +119,36 @@ initialize(int& argc, char* argv[])
     bool pause_on_error = false;
     app.add_flag("-p,--pause-on-error", pause_on_error, "Pause for debugging on unexpected error [default: false]");
 
+    bool reproducible_sums = true;
+    app.add_flag("--reproducible-sums,!--no-reproducible-sums", show_preamble,
+                 "Special treatment of array sums to ensure reproducibility [default: true]");
+
+    int mpi_split_color = -1;
+    app.add_option("--mpi-split-color", mpi_split_color, "Sets the MPI split color value (for MPMD applications)")
+      ->check(CLI::Range(0, std::numeric_limits<decltype(mpi_split_color)>::max()));
+
     std::atexit([]() { std::cout << rang::style::reset; });
     try {
       app.parse(argc, argv);
     }
     catch (const CLI::ParseError& e) {
+      // Trick to avoid repeating the error message on every process in parallel
+      parallel::Messenger::create(argc, argv);
       parallel::Messenger::destroy();
       std::exit(app.exit(e, std::cout, std::cerr));
     }
 
+    if (app.count("--mpi-split-color") > 0) {
+      CommunicatorManager::setSplitColor(mpi_split_color);
+    }
+
     BacktraceManager::setShow(show_backtrace);
     ConsoleManager::setShowPreamble(show_preamble);
     ConsoleManager::init(enable_color);
     SignalManager::setPauseForDebug(pause_on_error);
+    ReproducibleSumManager::setReproducibleSums(reproducible_sums);
   }
+  parallel::Messenger::create(argc, argv);
 
   PETScWrapper::initialize(argc, argv);
   SLEPcWrapper::initialize(argc, argv);
@@ -155,7 +172,14 @@ initialize(int& argc, char* argv[])
 
     std::cout << rang::style::bold;
 #ifdef PUGS_HAS_MPI
-    std::cout << "MPI number of ranks " << parallel::size() << '\n';
+    if (CommunicatorManager::hasSplitColor()) {
+      std::cout << "MPI number of global ranks " << parallel::Messenger::getInstance().globalSize() << '\n';
+      std::cout << "MPI local pugs communication world\n";
+      std::cout << " - number of local ranks  " << parallel::size() << '\n';
+      std::cout << " - local comm world color " << CommunicatorManager::splitColor() << '\n';
+    } else {
+      std::cout << "MPI number of ranks " << parallel::size() << '\n';
+    }
 #else    // PUGS_HAS_MPI
     std::cout << "Sequential build\n";
 #endif   // PUGS_HAS_MPI
diff --git a/src/utils/ReproducibleSumMPI.hpp b/src/utils/ReproducibleSumMPI.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d4b80eb5d32384ba14cf49ade607cbf66fcf5632
--- /dev/null
+++ b/src/utils/ReproducibleSumMPI.hpp
@@ -0,0 +1,124 @@
+#ifndef REPRODUCIBLE_SUM_MPI_HPP
+#define REPRODUCIBLE_SUM_MPI_HPP
+
+#include <utils/Messenger.hpp>
+#include <utils/ReproducibleSumUtils.hpp>
+#include <utils/pugs_config.hpp>
+
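+// Usage sketch (using only names defined here and in ReproducibleSumUtils.hpp):
+//   ReproducibleScalarSum local_sum(array);            // bin the local contribution
+//   auto bin   = allReduceReproducibleSum(local_sum);  // merge bins across the ranks
+//   auto value = decltype(local_sum)::getValue(bin);   // same result whatever the order or partition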
+#ifdef PUGS_HAS_MPI
+
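+// Custom MPI reduction operator: merges two reproducible-sum bins with
+// RSumType::addBinTo so that the MPI_Allreduce below remains order-independent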
+template <typename RSumType>
+struct ReproducibleMPIReducer
+{
+  static void
+  all_reduce_sum_embedder(void* local_value, void* sum_value, int* length, MPI_Datatype*)
+  {
+    Assert(*length == 1, "reduction must be performed on a single bin");
+    using Bin        = typename RSumType::Bin;
+    Bin* p_local_bin = reinterpret_cast<Bin*>(local_value);
+    Bin* p_sum_bin   = reinterpret_cast<Bin*>(sum_value);
+
+    RSumType::addBinTo(*p_local_bin, *p_sum_bin);
+  }
+};
+
+template <typename ArrayT, typename BoolArrayT>
+auto
+allReduceReproducibleSum(const ReproducibleScalarSum<ArrayT, BoolArrayT>& s)
+{
+  using DataType = std::decay_t<typename ArrayT::data_type>;
+  using RSumType = std::decay_t<decltype(s)>;
+  using BinType  = typename RSumType::Bin;
+
+  BinType local_bin = s.getSummationBin();
+
+  MPI_Datatype mpi_bin_type;
+  MPI_Type_contiguous(sizeof(BinType) / sizeof(DataType), parallel::Messenger::helper::mpiType<DataType>(),
+                      &mpi_bin_type);
+  MPI_Type_commit(&mpi_bin_type);
+
+  MPI_Op mpi_bin_sum_op;
+  MPI_Op_create(ReproducibleMPIReducer<RSumType>::all_reduce_sum_embedder, 1, &mpi_bin_sum_op);
+
+  BinType sum_bin = zero;
+  MPI_Allreduce(&local_bin, &sum_bin, 1, mpi_bin_type, mpi_bin_sum_op, parallel::Messenger::getInstance().comm());
+
+  return sum_bin;
+}
+
+template <typename ArrayT, typename BoolArrayT>
+auto
+allReduceReproducibleSum(const ReproducibleTinyVectorSum<ArrayT, BoolArrayT>& s)
+{
+  using TinyVectorType = std::decay_t<typename ArrayT::data_type>;
+  using DataType       = typename TinyVectorType::data_type;
+  using RSumType       = std::decay_t<decltype(s)>;
+  using BinType        = typename RSumType::Bin;
+
+  BinType local_bin = s.getSummationBin();
+
+  MPI_Datatype mpi_bin_type;
+  MPI_Type_contiguous(sizeof(BinType) / sizeof(DataType), parallel::Messenger::helper::mpiType<DataType>(),
+                      &mpi_bin_type);
+  MPI_Type_commit(&mpi_bin_type);
+
+  MPI_Op mpi_bin_sum_op;
+  MPI_Op_create(ReproducibleMPIReducer<RSumType>::all_reduce_sum_embedder, 1, &mpi_bin_sum_op);
+
+  BinType sum_bin = zero;
+  MPI_Allreduce(&local_bin, &sum_bin, 1, mpi_bin_type, mpi_bin_sum_op, parallel::Messenger::getInstance().comm());
+
+  return sum_bin;
+}
+
+template <typename ArrayT, typename BoolArrayT>
+auto
+allReduceReproducibleSum(const ReproducibleTinyMatrixSum<ArrayT, BoolArrayT>& s)
+{
+  using TinyVectorType = std::decay_t<typename ArrayT::data_type>;
+  using DataType       = typename TinyVectorType::data_type;
+  using RSumType       = std::decay_t<decltype(s)>;
+  using BinType        = typename RSumType::Bin;
+
+  BinType local_bin = s.getSummationBin();
+
+  MPI_Datatype mpi_bin_type;
+  MPI_Type_contiguous(sizeof(BinType) / sizeof(DataType), parallel::Messenger::helper::mpiType<DataType>(),
+                      &mpi_bin_type);
+  MPI_Type_commit(&mpi_bin_type);
+
+  MPI_Op mpi_bin_sum_op;
+  MPI_Op_create(ReproducibleMPIReducer<RSumType>::all_reduce_sum_embedder, 1, &mpi_bin_sum_op);
+
+  BinType sum_bin = zero;
+  MPI_Allreduce(&local_bin, &sum_bin, 1, mpi_bin_type, mpi_bin_sum_op, parallel::Messenger::getInstance().comm());
+
+  return sum_bin;
+}
+
+#else   // PUGS_HAS_MPI
+
+template <typename ArrayT, typename BoolArrayT>
+auto
+allReduceReproducibleSum(const ReproducibleScalarSum<ArrayT, BoolArrayT>& s)
+{
+  return s.getSummationBin();
+}
+
+template <typename ArrayT, typename BoolArrayT>
+auto
+allReduceReproducibleSum(const ReproducibleTinyVectorSum<ArrayT, BoolArrayT>& s)
+{
+  return s.getSummationBin();
+}
+
+template <typename ArrayT, typename BoolArrayT>
+auto
+allReduceReproducibleSum(const ReproducibleTinyMatrixSum<ArrayT, BoolArrayT>& s)
+{
+  return s.getSummationBin();
+}
+
+#endif   // PUGS_HAS_MPI
+
+#endif   // REPRODUCIBLE_SUM_MPI_HPP
diff --git a/src/utils/ReproducibleSumManager.cpp b/src/utils/ReproducibleSumManager.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b0f7f7d545e36346b7d0c3280024b81198cdce9c
--- /dev/null
+++ b/src/utils/ReproducibleSumManager.cpp
@@ -0,0 +1,15 @@
+#include <utils/ReproducibleSumManager.hpp>
+
+bool ReproducibleSumManager::s_reproducible_sums = true;
+
+bool
+ReproducibleSumManager::reproducibleSums()
+{
+  return s_reproducible_sums;
+}
+
+void
+ReproducibleSumManager::setReproducibleSums(bool reproducible_sum)
+{
+  s_reproducible_sums = reproducible_sum;
+}
diff --git a/src/utils/ReproducibleSumManager.hpp b/src/utils/ReproducibleSumManager.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..40e380fd6747a714906fbc15fc85667f551c8586
--- /dev/null
+++ b/src/utils/ReproducibleSumManager.hpp
@@ -0,0 +1,14 @@
+#ifndef REPRODUCIBLE_SUM_MANAGER_HPP
+#define REPRODUCIBLE_SUM_MANAGER_HPP
+
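+// Global switch (set from the command line) telling whether array sums
+// should use the reproducible, order-independent summation algorithm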
+class ReproducibleSumManager
+{
+ private:
+  static bool s_reproducible_sums;
+
+ public:
+  static void setReproducibleSums(bool);
+  static bool reproducibleSums();
+};
+
+#endif   // REPRODUCIBLE_SUM_MANAGER_HPP
diff --git a/src/utils/ReproducibleSumUtils.hpp b/src/utils/ReproducibleSumUtils.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f2d52aa20122452e96130c0c29b6d12ccc32a4aa
--- /dev/null
+++ b/src/utils/ReproducibleSumUtils.hpp
@@ -0,0 +1,878 @@
+#ifndef REPRODUCIBLE_SUM_UTILS_HPP
+#define REPRODUCIBLE_SUM_UTILS_HPP
+
+#include <utils/PugsUtils.hpp>
+#include <utils/Types.hpp>
+
+template <typename DataType>
+class Array;
+
+namespace reproducible_sum_utils
+{
+template <size_t NumberOfBits>
+struct IntegerFromBitSize
+{
+};
+
+template <>
+struct IntegerFromBitSize<8>
+{
+  using integer_t = std::int8_t;
+};
+
+template <>
+struct IntegerFromBitSize<16>
+{
+  using integer_t = std::int16_t;
+};
+
+template <>
+struct IntegerFromBitSize<32>
+{
+  using integer_t = std::int32_t;
+};
+
+template <>
+struct IntegerFromBitSize<64>
+{
+  using integer_t = std::int64_t;
+};
+
+template <typename DataType>
+struct IntegerType
+{
+  using integer_t = typename IntegerFromBitSize<sizeof(DataType) * 8>::integer_t;
+};
+
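+// ulp(x): unit in the last place of x, i.e. 2^(exponent(x) - digits);
+// returns the smallest denormal for x == 0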
+template <typename DataType>
+DataType
+ulp(const DataType& x) noexcept(NO_ASSERT)
+{
+  static_assert(std::is_floating_point_v<DataType>, "expecting floating point value");
+
+  if (x == 0) {
+    return std::numeric_limits<DataType>::denorm_min();
+  }
+
+  return std::pow(DataType{2}, std::ilogb(std::abs(x)) - std::numeric_limits<DataType>::digits);
+}
+
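+// ufp(x): unit in the first place of x, i.e. 2^exponent(x),
+// the largest power of two not exceeding |x| (for nonzero x)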
+template <typename DataType>
+DataType
+ufp(const DataType& x) noexcept(NO_ASSERT)
+{
+  static_assert(std::is_floating_point_v<DataType>, "expecting floating point value");
+
+  return std::pow(DataType{2}, std::ilogb(std::abs(x)));
+}
+
+// Useful bits per bin
+template <typename DataType>
+constexpr inline size_t bin_size = 0;
+template <>
+constexpr inline size_t bin_size<double> = 40;
+template <>
+constexpr inline size_t bin_size<float> = 12;
+
+// IEEE 754 epsilon
+template <typename DataType>
+constexpr inline double bin_epsilon = 0;
+template <>
+constexpr inline double bin_epsilon<double> = std::numeric_limits<double>::epsilon();
+template <>
+constexpr inline double bin_epsilon<float> = std::numeric_limits<float>::epsilon();
+
+// number of bins: using more bins improves precision
+template <typename DataType>
+constexpr inline size_t bin_number = 0;
+
+template <>
+constexpr inline size_t bin_number<double> = 3;
+template <>
+constexpr inline size_t bin_number<float> = 4;
+
+// max local sum size to avoid overflow
+template <typename DataType>
+constexpr inline size_t bin_max_size = 0;
+
+template <>
+constexpr inline size_t bin_max_size<double> = 2048;
+template <>
+constexpr inline size_t bin_max_size<float> = 1024;
+
+struct NoMask
+{
+  PUGS_INLINE bool
+  operator[](size_t) const
+  {
+    return true;
+  }
+};
+
+}   // namespace reproducible_sum_utils
+
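+// Order-independent floating point summation of an array of scalars.
+// Each value is spread over K bins of W useful bits (plus integer carries),
+// so the final sum does not depend on the order in which elements are added;
+// bins from different blocks (or MPI ranks) can be merged with addBinTo and
+// converted back to a floating point value with getValue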
+template <typename ArrayT, typename MaskType = reproducible_sum_utils::NoMask>
+class ReproducibleScalarSum
+{
+ public:
+  using DataType = std::decay_t<typename ArrayT::data_type>;
+  static_assert(std::is_floating_point_v<DataType>);
+
+  static constexpr size_t K     = reproducible_sum_utils::bin_number<DataType>;
+  static constexpr DataType eps = reproducible_sum_utils::bin_epsilon<DataType>;
+  static constexpr size_t W     = reproducible_sum_utils::bin_size<DataType>;
+
+  struct Bin
+  {
+    std::array<DataType, K> S;   // sum
+    std::array<DataType, K> C;   // carry
+
+    Bin& operator=(const Bin&) = default;
+    Bin& operator=(Bin&&)      = default;
+
+    Bin(Bin&&)      = default;
+    Bin(const Bin&) = default;
+
+    Bin(ZeroType)
+    {
+      for (size_t k = 0; k < K; ++k) {
+        S[k] = 0.75 * eps * std::pow(DataType{2}, (K - k - 1) * W);
+        C[k] = 0;
+      }
+    }
+
+    Bin() = default;
+
+    ~Bin() = default;
+  };
+
+ private:
+  Bin m_summation_bin;
+
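+  // Moves the bin window up by g bins (g*W bits): the g highest-order bins
+  // are reinitialized for larger magnitudes, previous contents slide down
+  // and the g lowest-order bins are dropped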
+  PUGS_INLINE
+  static void
+  _shift(const size_t g, Bin& bin) noexcept(NO_ASSERT)
+  {
+    using namespace reproducible_sum_utils;
+
+    for (size_t k = K - 1; k >= g; --k) {
+      bin.S[k] = bin.S[k - g];
+      bin.C[k] = bin.C[k - g];
+    }
+    for (size_t k = 0; k < std::min(K, g); ++k) {
+      bin.S[k] = 1.5 * std::pow(DataType{2}, g * W) * ufp(bin.S[k]);
+      bin.C[k] = 0;
+    }
+  }
+
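+  // Enlarges the bin window (via _shift) when the local maximum m is too
+  // large to be absorbed by the current leading bin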
+  PUGS_INLINE static void
+  _update(const DataType& m, Bin& bin) noexcept(NO_ASSERT)
+  {
+    Assert(m > 0);
+
+    using namespace reproducible_sum_utils;
+    if (m >= std::pow(DataType{2}, W - 1.) * ulp(bin.S[0])) {
+      const size_t shift = 1 + std::floor(std::log2(m / (std::pow(DataType{2}, W - 1.) * ulp(bin.S[0]))) / W);
+
+      _shift(shift, bin);
+    }
+  }
+
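+  // Splits x against the bin slice S: S absorbs the high-order part of x
+  // (with its last bit forced to 1) and x keeps the remainder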
+  PUGS_INLINE
+  void static _split2(DataType& S, DataType& x) noexcept(NO_ASSERT)
+  {
+    using namespace reproducible_sum_utils;
+    union
+    {
+      DataType as_DataType;
+      typename IntegerType<DataType>::integer_t as_integer;
+    } x_bar;
+
+    x_bar.as_DataType = x;
+    x_bar.as_integer |= 0x1;
+
+    const DataType S0 = S;
+    S += x_bar.as_DataType;
+    x -= S - S0;
+  }
+
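+  // Brings each S[k] back into [1.5, 1.75)*ufp(S[k]) by transferring
+  // quarter-ufp units to or from the integer carry C[k]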
+  PUGS_INLINE static void
+  _renormalize(Bin& bin) noexcept(NO_ASSERT)
+  {
+    using namespace reproducible_sum_utils;
+
+    for (size_t k = 0; k < K; ++k) {
+      if (bin.S[k] >= 1.75 * ufp(bin.S[k])) {
+        bin.S[k] -= 0.25 * ufp(bin.S[k]);
+        bin.C[k] += 1;
+      } else if (bin.S[k] < 1.25 * ufp(bin.S[k])) {
+        bin.S[k] += 0.5 * ufp(bin.S[k]);
+        bin.C[k] -= 2;
+      } else if (bin.S[k] < 1.5 * ufp(bin.S[k])) {
+        bin.S[k] += 0.25 * ufp(bin.S[k]);
+        bin.C[k] -= 1;
+      }
+    }
+  }
+
+ public:
+  static void
+  addBinTo(Bin& bin, Bin& bin_sum)
+  {
+    using namespace reproducible_sum_utils;
+
+    DataType ulp_bin = ulp(bin.S[0]);
+    DataType ulp_sum = ulp(bin_sum.S[0]);
+
+    if (ulp_bin < ulp_sum) {
+      const size_t shift = std::floor(std::log2(ulp_sum / ulp_bin) / W);
+      if (shift > 0) {
+        _shift(shift, bin);
+      }
+    } else if (ulp_bin > ulp_sum) {
+      const size_t shift = std::floor(std::log2(ulp_bin / ulp_sum) / W);
+      if (shift > 0) {
+        _shift(shift, bin_sum);
+      }
+    }
+
+    for (size_t k = 0; k < K; ++k) {
+      bin_sum.S[k] += bin.S[k] - 1.5 * ufp(bin.S[k]);
+      bin_sum.C[k] += bin.C[k];
+    }
+
+    _renormalize(bin_sum);
+  }
+
+  PUGS_INLINE
+  static DataType
+  getValue(const Bin& bin) noexcept(NO_ASSERT)
+  {
+    using namespace reproducible_sum_utils;
+
+    DataType value = 0;
+    for (size_t k = 0; k < K; ++k) {
+      value += 0.25 * (bin.C[k] - 6) * ufp(bin.S[k]) + bin.S[k];
+    }
+    return value;
+  }
+
+  PUGS_INLINE Bin
+  getSummationBin() const noexcept(NO_ASSERT)
+  {
+    return m_summation_bin;
+  }
+
+  PUGS_INLINE DataType
+  getSum() const noexcept(NO_ASSERT)
+  {
+    return getValue(m_summation_bin);
+  }
+
+  ReproducibleScalarSum(const ArrayT& array, const MaskType mask = reproducible_sum_utils::NoMask{})
+  {
+    if constexpr (not std::is_same_v<MaskType, reproducible_sum_utils::NoMask>) {
+      static_assert(std::is_same_v<std::decay_t<typename MaskType::data_type>, bool>,
+                    "when provided, mask must be an array of bool");
+    }
+
+    using TeamPolicyT = Kokkos::TeamPolicy<Kokkos::IndexType<int>>;
+    using TeamMemberT = TeamPolicyT::member_type;
+
+    int nx = reproducible_sum_utils::bin_max_size<DataType>;
+    int ny = std::max(array.size() / nx, 1ul);
+
+    const TeamPolicyT policy(ny, Kokkos::AUTO());
+
+    Array<DataType> thread_sum(policy.team_size() * policy.league_size());
+
+    Array<Bin> bin_by_thread(policy.team_size() * policy.league_size());
+    bin_by_thread.fill(zero);
+
+    Array<DataType> local_max(policy.team_size() * policy.league_size());
+    local_max.fill(0);
+
+    parallel_for(
+      policy, PUGS_LAMBDA(const TeamMemberT& member) {
+        const int i_team   = member.league_rank();
+        const int i_thread = member.team_rank();
+
+        const int thread_id = i_team * member.team_size() + i_thread;
+
+        const int league_start = nx * i_team;
+        const int block_size   = [&] {
+          int size = nx;
+          if (i_team == member.league_size() - 1) {
+            size = array.size() - league_start;
+          }
+          return size;
+        }();
+
+        parallel_for(
+          Kokkos::TeamThreadRange(member, block_size), PUGS_LAMBDA(int i) {
+            if (mask[league_start + i]) {
+              DataType& m        = local_max[thread_id];
+              DataType abs_value = std::abs(array[league_start + i]);
+              if (abs_value > m) {
+                m = abs_value;
+              }
+            }
+          });
+
+        _update(local_max[thread_id], bin_by_thread[thread_id]);
+
+        parallel_for(
+          Kokkos::TeamThreadRange(member, block_size), PUGS_LAMBDA(int i) {
+            if (mask[league_start + i]) {
+              DataType x = array[nx * i_team + i];
+              for (size_t k = 0; k < K; ++k) {
+                _split2(bin_by_thread[thread_id].S[k], x);
+              }
+            }
+          });
+
+        _renormalize(bin_by_thread[thread_id]);
+      });
+
+    m_summation_bin = bin_by_thread[0];
+    for (size_t i = 1; i < bin_by_thread.size(); ++i) {
+      addBinTo(bin_by_thread[i], m_summation_bin);
+    }
+  }
+
+  ~ReproducibleScalarSum() = default;
+};
+
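+// Component-wise counterpart of ReproducibleScalarSum for arrays of TinyVector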
+template <typename ArrayT, typename MaskType = reproducible_sum_utils::NoMask>
+class ReproducibleTinyVectorSum
+{
+ public:
+  using TinyVectorType = std::decay_t<typename ArrayT::data_type>;
+  static_assert(is_tiny_vector_v<TinyVectorType>);
+
+  using DataType = std::decay_t<typename TinyVectorType::data_type>;
+  static_assert(std::is_floating_point_v<DataType>);
+
+  static constexpr size_t K     = reproducible_sum_utils::bin_number<DataType>;
+  static constexpr DataType eps = reproducible_sum_utils::bin_epsilon<DataType>;
+  static constexpr size_t W     = reproducible_sum_utils::bin_size<DataType>;
+
+  struct Bin
+  {
+    std::array<TinyVectorType, K> S;   // sum
+    std::array<TinyVectorType, K> C;   // carry
+
+    Bin& operator=(const Bin&) = default;
+    Bin& operator=(Bin&&)      = default;
+
+    Bin(Bin&&)      = default;
+    Bin(const Bin&) = default;
+
+    Bin(ZeroType)
+    {
+      for (size_t k = 0; k < K; ++k) {
+        const DataType init_value = 0.75 * eps * std::pow(DataType{2}, (K - k - 1) * W);
+        for (size_t i_component = 0; i_component < TinyVectorType::Dimension; ++i_component) {
+          S[k][i_component] = init_value;
+        }
+        C[k] = zero;
+      }
+    }
+
+    Bin() = default;
+
+    ~Bin() = default;
+  };
+
+ private:
+  Bin m_summation_bin;
+
+  PUGS_INLINE
+  static void
+  _shift(const size_t g, Bin& bin, const size_t& i_component) noexcept(NO_ASSERT)
+  {
+    using namespace reproducible_sum_utils;
+
+    for (size_t k = K - 1; k >= g; --k) {
+      bin.S[k][i_component] = bin.S[k - g][i_component];
+      bin.C[k][i_component] = bin.C[k - g][i_component];
+    }
+    for (size_t k = 0; k < std::min(K, g); ++k) {
+      bin.S[k][i_component] = 1.5 * std::pow(DataType{2}, g * W) * ufp(bin.S[k][i_component]);
+      bin.C[k][i_component] = 0;
+    }
+  }
+
+  PUGS_INLINE static void
+  _update(const TinyVectorType& m, Bin& bin) noexcept(NO_ASSERT)
+  {
+    using namespace reproducible_sum_utils;
+    for (size_t i_component = 0; i_component < TinyVectorType::Dimension; ++i_component) {
+      if (m[i_component] >= std::pow(DataType{2}, W - 1.) * ulp(bin.S[0][i_component])) {
+        const size_t shift =
+          1 + std::floor(std::log2(m[i_component] / (std::pow(DataType{2}, W - 1.) * ulp(bin.S[0][i_component]))) / W);
+
+        _shift(shift, bin, i_component);
+      }
+    }
+  }
+
+  PUGS_INLINE
+  void static _split2(TinyVectorType& S, TinyVectorType& x) noexcept(NO_ASSERT)
+  {
+    using namespace reproducible_sum_utils;
+    union
+    {
+      DataType as_DataType;
+      typename IntegerType<DataType>::integer_t as_integer;
+    } x_bar;
+
+    for (size_t i_component = 0; i_component < TinyVectorType::Dimension; ++i_component) {
+      x_bar.as_DataType = x[i_component];
+      x_bar.as_integer |= 0x1;
+
+      DataType& S_i = S[i_component];
+      DataType& x_i = x[i_component];
+
+      const DataType S0 = S_i;
+      S_i += x_bar.as_DataType;
+      x_i -= S_i - S0;
+    }
+  }
+
+  PUGS_INLINE static void
+  _renormalize(Bin& bin) noexcept(NO_ASSERT)
+  {
+    using namespace reproducible_sum_utils;
+
+    for (size_t k = 0; k < K; ++k) {
+      TinyVectorType& S_k = bin.S[k];
+      TinyVectorType& C_k = bin.C[k];
+      for (size_t i_component = 0; i_component < TinyVectorType::Dimension; ++i_component) {
+        DataType& Sk_i = S_k[i_component];
+        DataType& Ck_i = C_k[i_component];
+
+        if (Sk_i >= 1.75 * ufp(Sk_i)) {
+          Sk_i -= 0.25 * ufp(Sk_i);
+          Ck_i += 1;
+        } else if (Sk_i < 1.25 * ufp(Sk_i)) {
+          Sk_i += 0.5 * ufp(Sk_i);
+          Ck_i -= 2;
+        } else if (Sk_i < 1.5 * ufp(Sk_i)) {
+          Sk_i += 0.25 * ufp(Sk_i);
+          Ck_i -= 1;
+        }
+      }
+    }
+  }
+
+ public:
+  static void
+  addBinTo(Bin& bin, Bin& bin_sum)
+  {
+    using namespace reproducible_sum_utils;
+
+    for (size_t i_component = 0; i_component < TinyVectorType::Dimension; ++i_component) {
+      DataType ulp_bin = ulp(bin.S[0][i_component]);
+      DataType ulp_sum = ulp(bin_sum.S[0][i_component]);
+
+      if (ulp_bin < ulp_sum) {
+        const size_t shift = std::floor(std::log2(ulp_sum / ulp_bin) / W);
+        if (shift > 0) {
+          _shift(shift, bin, i_component);
+        }
+      } else if (ulp_bin > ulp_sum) {
+        const size_t shift = std::floor(std::log2(ulp_bin / ulp_sum) / W);
+        if (shift > 0) {
+          _shift(shift, bin_sum, i_component);
+        }
+      }
+
+      for (size_t k = 0; k < K; ++k) {
+        bin_sum.S[k][i_component] += bin.S[k][i_component] - 1.5 * ufp(bin.S[k][i_component]);
+        bin_sum.C[k][i_component] += bin.C[k][i_component];
+      }
+    }
+
+    _renormalize(bin_sum);
+  }
+
+  PUGS_INLINE
+  static TinyVectorType
+  getValue(const Bin& bin) noexcept(NO_ASSERT)
+  {
+    using namespace reproducible_sum_utils;
+
+    TinyVectorType value = zero;
+    for (size_t i_component = 0; i_component < TinyVectorType::Dimension; ++i_component) {
+      for (size_t k = 0; k < K; ++k) {
+        value[i_component] += 0.25 * (bin.C[k][i_component] - 6) * ufp(bin.S[k][i_component]) + bin.S[k][i_component];
+      }
+    }
+    return value;
+  }
+
+  PUGS_INLINE Bin
+  getSummationBin() const noexcept(NO_ASSERT)
+  {
+    return m_summation_bin;
+  }
+
+  PUGS_INLINE TinyVectorType
+  getSum() const noexcept(NO_ASSERT)
+  {
+    return getValue(m_summation_bin);
+  }
+
+  ReproducibleTinyVectorSum(const ArrayT& array, const MaskType mask = reproducible_sum_utils::NoMask{})
+  {
+    if constexpr (not std::is_same_v<MaskType, reproducible_sum_utils::NoMask>) {
+      static_assert(std::is_same_v<std::decay_t<typename MaskType::data_type>, bool>,
+                    "when provided, mask must be an array of bool");
+    }
+
+    using TeamPolicyT = Kokkos::TeamPolicy<Kokkos::IndexType<int>>;
+    using TeamMemberT = TeamPolicyT::member_type;
+
+    int nx = reproducible_sum_utils::bin_max_size<DataType>;
+    int ny = std::max(array.size() / nx, 1ul);
+
+    const TeamPolicyT policy(ny, Kokkos::AUTO());
+
+    Array<TinyVectorType> thread_sum(policy.team_size() * policy.league_size());
+
+    Array<Bin> bin_by_thread(policy.team_size() * policy.league_size());
+    bin_by_thread.fill(zero);
+
+    Array<TinyVectorType> local_max(policy.team_size() * policy.league_size());
+    local_max.fill(zero);
+
+    parallel_for(
+      policy, PUGS_LAMBDA(const TeamMemberT& member) {
+        const int i_team   = member.league_rank();
+        const int i_thread = member.team_rank();
+
+        const int thread_id = i_team * member.team_size() + i_thread;
+
+        const int league_start = nx * i_team;
+        const int block_size   = [&] {
+          int size = nx;
+          if (i_team == member.league_size() - 1) {
+            size = array.size() - league_start;
+          }
+          return size;
+        }();
+
+        parallel_for(
+          Kokkos::TeamThreadRange(member, block_size), PUGS_LAMBDA(int i) {
+            if (mask[league_start + i]) {
+              for (size_t i_component = 0; i_component < TinyVectorType::Dimension; ++i_component) {
+                DataType& m        = local_max[thread_id][i_component];
+                DataType abs_value = std::abs(array[league_start + i][i_component]);
+                if (abs_value > m) {
+                  m = abs_value;
+                }
+              }
+            }
+          });
+
+        _update(local_max[thread_id], bin_by_thread[thread_id]);
+
+        parallel_for(
+          Kokkos::TeamThreadRange(member, block_size), PUGS_LAMBDA(int i) {
+            if (mask[league_start + i]) {
+              TinyVectorType x = array[nx * i_team + i];
+              for (size_t k = 0; k < K; ++k) {
+                _split2(bin_by_thread[thread_id].S[k], x);
+              }
+            }
+          });
+
+        _renormalize(bin_by_thread[thread_id]);
+      });
+
+    m_summation_bin = bin_by_thread[0];
+    for (size_t i = 1; i < bin_by_thread.size(); ++i) {
+      addBinTo(bin_by_thread[i], m_summation_bin);
+    }
+  }
+
+  ~ReproducibleTinyVectorSum() = default;
+};
+
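+// Component-wise counterpart of ReproducibleScalarSum for arrays of TinyMatrix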
+template <typename ArrayT, typename MaskType = reproducible_sum_utils::NoMask>
+class ReproducibleTinyMatrixSum
+{
+ public:
+  using TinyMatrixType = std::decay_t<typename ArrayT::data_type>;
+  static_assert(is_tiny_matrix_v<TinyMatrixType>);
+
+  using DataType = std::decay_t<typename TinyMatrixType::data_type>;
+  static_assert(std::is_floating_point_v<DataType>);
+
+  static constexpr size_t K     = reproducible_sum_utils::bin_number<DataType>;
+  static constexpr DataType eps = reproducible_sum_utils::bin_epsilon<DataType>;
+  static constexpr size_t W     = reproducible_sum_utils::bin_size<DataType>;
+
+  struct Bin
+  {
+    std::array<TinyMatrixType, K> S;   // sum
+    std::array<TinyMatrixType, K> C;   // carry
+
+    Bin& operator=(const Bin&) = default;
+    Bin& operator=(Bin&&)      = default;
+
+    Bin(Bin&&)      = default;
+    Bin(const Bin&) = default;
+
+    Bin(ZeroType)
+    {
+      for (size_t k = 0; k < K; ++k) {
+        const DataType init_value = 0.75 * eps * std::pow(DataType{2}, (K - k - 1) * W);
+        for (size_t i_component = 0; i_component < TinyMatrixType::NumberOfRows; ++i_component) {
+          for (size_t j_component = 0; j_component < TinyMatrixType::NumberOfColumns; ++j_component) {
+            S[k](i_component, j_component) = init_value;
+          }
+        }
+        C[k] = zero;
+      }
+    }
+
+    Bin()  = default;
+    ~Bin() = default;
+  };
+
+ private:
+  Bin m_summation_bin;
+
+  PUGS_INLINE
+  static void
+  _shift(const size_t g, Bin& bin, const size_t i_component, const size_t j_component) noexcept(NO_ASSERT)
+  {
+    using namespace reproducible_sum_utils;
+
+    for (size_t k = K - 1; k >= g; --k) {
+      bin.S[k](i_component, j_component) = bin.S[k - g](i_component, j_component);
+      bin.C[k](i_component, j_component) = bin.C[k - g](i_component, j_component);
+    }
+    for (size_t k = 0; k < std::min(K, g); ++k) {
+      bin.S[k](i_component, j_component) = 1.5 * std::pow(DataType{2}, g * W) * ufp(bin.S[k](i_component, j_component));
+      bin.C[k](i_component, j_component) = 0;
+    }
+  }
+
+  PUGS_INLINE static void
+  _update(const TinyMatrixType& m, Bin& bin) noexcept(NO_ASSERT)
+  {
+    using namespace reproducible_sum_utils;
+    for (size_t i_component = 0; i_component < TinyMatrixType::NumberOfRows; ++i_component) {
+      for (size_t j_component = 0; j_component < TinyMatrixType::NumberOfColumns; ++j_component) {
+        if (m(i_component, j_component) >= std::pow(DataType{2}, W - 1.) * ulp(bin.S[0](i_component, j_component))) {
+          const size_t shift =
+            1 + std::floor(std::log2(m(i_component, j_component) /
+                                     (std::pow(DataType{2}, W - 1.) * ulp(bin.S[0](i_component, j_component)))) /
+                           W);
+
+          _shift(shift, bin, i_component, j_component);
+        }
+      }
+    }
+  }
+
+  PUGS_INLINE
+  void static _split2(TinyMatrixType& S, TinyMatrixType& x) noexcept(NO_ASSERT)
+  {
+    using namespace reproducible_sum_utils;
+    union
+    {
+      DataType as_DataType;
+      typename IntegerType<DataType>::integer_t as_integer;
+    } x_bar;
+
+    for (size_t i_component = 0; i_component < TinyMatrixType::NumberOfRows; ++i_component) {
+      for (size_t j_component = 0; j_component < TinyMatrixType::NumberOfColumns; ++j_component) {
+        DataType& S_ij = S(i_component, j_component);
+        DataType& x_ij = x(i_component, j_component);
+
+        x_bar.as_DataType = x_ij;
+        x_bar.as_integer |= 0x1;
+
+        const DataType S0 = S_ij;
+        S_ij += x_bar.as_DataType;
+        x_ij -= S_ij - S0;
+      }
+    }
+  }
+
+  PUGS_INLINE static void
+  _renormalize(Bin& bin) noexcept(NO_ASSERT)
+  {
+    using namespace reproducible_sum_utils;
+
+    for (size_t k = 0; k < K; ++k) {
+      TinyMatrixType& S_k = bin.S[k];
+      TinyMatrixType& C_k = bin.C[k];
+      for (size_t i_component = 0; i_component < TinyMatrixType::NumberOfRows; ++i_component) {
+        for (size_t j_component = 0; j_component < TinyMatrixType::NumberOfColumns; ++j_component) {
+          DataType& Sk_ij = S_k(i_component, j_component);
+          DataType& Ck_ij = C_k(i_component, j_component);
+
+          if (Sk_ij >= 1.75 * ufp(Sk_ij)) {
+            Sk_ij -= 0.25 * ufp(Sk_ij);
+            Ck_ij += 1;
+          } else if (Sk_ij < 1.25 * ufp(Sk_ij)) {
+            Sk_ij += 0.5 * ufp(Sk_ij);
+            Ck_ij -= 2;
+          } else if (Sk_ij < 1.5 * ufp(Sk_ij)) {
+            Sk_ij += 0.25 * ufp(Sk_ij);
+            Ck_ij -= 1;
+          }
+        }
+      }
+    }
+  }
+
+ public:
+  static void
+  addBinTo(Bin& bin, Bin& bin_sum)
+  {
+    using namespace reproducible_sum_utils;
+
+    for (size_t i_component = 0; i_component < TinyMatrixType::NumberOfRows; ++i_component) {
+      for (size_t j_component = 0; j_component < TinyMatrixType::NumberOfColumns; ++j_component) {
+        DataType ulp_bin = ulp(bin.S[0](i_component, j_component));
+        DataType ulp_sum = ulp(bin_sum.S[0](i_component, j_component));
+
+        if (ulp_bin < ulp_sum) {
+          const size_t shift = std::floor(std::log2(ulp_sum / ulp_bin) / W);
+          if (shift > 0) {
+            _shift(shift, bin, i_component, j_component);
+          }
+        } else if (ulp_bin > ulp_sum) {
+          const size_t shift = std::floor(std::log2(ulp_bin / ulp_sum) / W);
+          if (shift > 0) {
+            _shift(shift, bin_sum, i_component, j_component);
+          }
+        }
+
+        for (size_t k = 0; k < K; ++k) {
+          bin_sum.S[k](i_component, j_component) +=
+            bin.S[k](i_component, j_component) - 1.5 * ufp(bin.S[k](i_component, j_component));
+          bin_sum.C[k](i_component, j_component) += bin.C[k](i_component, j_component);
+        }
+      }
+    }
+
+    _renormalize(bin_sum);
+  }
+
+  PUGS_INLINE
+  static TinyMatrixType
+  getValue(const Bin& bin) noexcept(NO_ASSERT)
+  {
+    using namespace reproducible_sum_utils;
+
+    TinyMatrixType value = zero;
+    for (size_t i_component = 0; i_component < TinyMatrixType::NumberOfRows; ++i_component) {
+      for (size_t j_component = 0; j_component < TinyMatrixType::NumberOfColumns; ++j_component) {
+        for (size_t k = 0; k < K; ++k) {
+          value(i_component, j_component) +=
+            0.25 * (bin.C[k](i_component, j_component) - 6) * ufp(bin.S[k](i_component, j_component)) +
+            bin.S[k](i_component, j_component);
+        }
+      }
+    }
+    return value;
+  }
+
+  PUGS_INLINE Bin
+  getSummationBin() const noexcept(NO_ASSERT)
+  {
+    return m_summation_bin;
+  }
+
+  PUGS_INLINE TinyMatrixType
+  getSum() const noexcept(NO_ASSERT)
+  {
+    return getValue(m_summation_bin);
+  }
+
+  ReproducibleTinyMatrixSum(const ArrayT& array, const MaskType mask = reproducible_sum_utils::NoMask{})
+  {
+    if constexpr (not std::is_same_v<MaskType, reproducible_sum_utils::NoMask>) {
+      static_assert(std::is_same_v<std::decay_t<typename MaskType::data_type>, bool>,
+                    "when provided, mask must be an array of bool");
+    }
+
+    using TeamPolicyT = Kokkos::TeamPolicy<Kokkos::IndexType<int>>;
+    using TeamMemberT = TeamPolicyT::member_type;
+
+    int nx = reproducible_sum_utils::bin_max_size<DataType>;
+    int ny = std::max(array.size() / nx, 1ul);
+
+    const TeamPolicyT policy(ny, Kokkos::AUTO());
+
+    Array<TinyMatrixType> thread_sum(policy.team_size() * policy.league_size());
+
+    Array<Bin> bin_by_thread(policy.team_size() * policy.league_size());
+    bin_by_thread.fill(zero);
+
+    Array<TinyMatrixType> local_max(policy.team_size() * policy.league_size());
+    local_max.fill(zero);
+
+    parallel_for(
+      policy, PUGS_LAMBDA(const TeamMemberT& member) {
+        const int i_team   = member.league_rank();
+        const int i_thread = member.team_rank();
+
+        const int thread_id = i_team * member.team_size() + i_thread;
+
+        const int league_start = nx * i_team;
+        const int block_size   = [&] {
+          int size = nx;
+          if (i_team == member.league_size() - 1) {
+            size = array.size() - league_start;
+          }
+          return size;
+        }();
+
+        parallel_for(
+          Kokkos::TeamThreadRange(member, block_size), PUGS_LAMBDA(int i) {
+            if (mask[league_start + i]) {
+              for (size_t i_component = 0; i_component < TinyMatrixType::NumberOfRows; ++i_component) {
+                for (size_t j_component = 0; j_component < TinyMatrixType::NumberOfColumns; ++j_component) {
+                  DataType& m        = local_max[thread_id](i_component, j_component);
+                  DataType abs_value = std::abs(array[league_start + i](i_component, j_component));
+                  if (abs_value > m) {
+                    m = abs_value;
+                  }
+                }
+              }
+            }
+          });
+
+        _update(local_max[thread_id], bin_by_thread[thread_id]);
+
+        parallel_for(
+          Kokkos::TeamThreadRange(member, block_size), PUGS_LAMBDA(int i) {
+            if (mask[league_start + i]) {
+              TinyMatrixType x = array[nx * i_team + i];
+              for (size_t k = 0; k < K; ++k) {
+                _split2(bin_by_thread[thread_id].S[k], x);
+              }
+            }
+          });
+
+        _renormalize(bin_by_thread[thread_id]);
+      });
+
+    m_summation_bin = bin_by_thread[0];
+    for (size_t i = 1; i < bin_by_thread.size(); ++i) {
+      addBinTo(bin_by_thread[i], m_summation_bin);
+    }
+  }
+
+  ~ReproducibleTinyMatrixSum() = default;
+};
+
+#endif   // REPRODUCIBLE_SUM_UTILS_HPP
diff --git a/tests/test_Array.cpp b/tests/test_Array.cpp
index f7d121df0ab11e229bae06f505e8f12361fe00da..7efc346e29693e2ee6302dfd4498ac2a9cb51b04 100644
--- a/tests/test_Array.cpp
+++ b/tests/test_Array.cpp
@@ -14,6 +14,10 @@
 #include <valarray>
 #include <vector>
 
+#include <utils/ReproducibleSumUtils.hpp>
+#include <utils/SmallArray.hpp>
+#include <utils/Timer.hpp>
+
 // Instantiate to ensure full coverage is performed
 template class Array<int>;
 
@@ -319,6 +323,837 @@ TEST_CASE("Array", "[utils]")
     }
   }
 
+  SECTION("reproducible floating point sum")
+  {
+    auto direct_sum = [](auto array) {
+      auto sum = array[0];
+      for (size_t i = 1; i < array.size(); ++i) {
+        sum += array[i];
+      }
+      return sum;
+    };
+
+    SECTION("reproducible double sum")
+    {
+      Array<double> array(10'000);
+
+      for (size_t i = 0; i < array.size(); ++i) {
+        array[i] = ((i < (array.size() / 10)) * 1E25 + 1E10) * ((i + 1) % 1'000) * std::sin(3 * i + 1);
+      }
+
+      const auto sum_before_shuffle = sum(array);
+
+      std::random_shuffle(&(array[0]), &(array[0]) + array.size());
+
+      const auto sum_after_shuffle = sum(array);
+
+      REQUIRE(sum_before_shuffle == sum_after_shuffle);
+
+      REQUIRE(sum_before_shuffle == Catch::Approx(direct_sum(array)));
+    }
+
+    SECTION("reproducible float sum")
+    {
+      Array<float> array(10'000);
+
+      for (size_t i = 0; i < array.size(); ++i) {
+        array[i] = ((i < (array.size() / 10)) * 1E25 + 1E10) * ((i + 1) % 1'000) * std::sin(3 * i + 1);
+      }
+
+      const auto sum_before_shuffle = sum(array);
+
+      std::random_shuffle(&(array[0]), &(array[0]) + array.size());
+
+      const auto sum_after_shuffle = sum(array);
+
+      REQUIRE(sum_before_shuffle == sum_after_shuffle);
+
+      REQUIRE(sum_before_shuffle == Catch::Approx(direct_sum(array)));
+    }
+
+    SECTION("reproducible TinyVector<3,double> sum")
+    {
+      Array<TinyVector<3, double>> array(10'000);
+
+      for (size_t i = 0; i < array.size(); ++i) {
+        array[i] =
+          TinyVector<3, double>(((i < (array.size() / 10)) * 1E25 + 1E10) * ((i + 1) % 1'000) * std::sin(3 * i + 1),
+                                17 * i - 3,   //
+                                1);
+      }
+
+      const auto sum_before_shuffle = sum(array);
+
+      std::random_shuffle(&(array[0]), &(array[0]) + array.size());
+
+      ReproducibleTinyVectorSum s0(array);
+
+      const auto sum_after_shuffle = sum(array);
+
+      REQUIRE(sum_before_shuffle == sum_after_shuffle);
+
+      auto naive_sum = direct_sum(array);
+      REQUIRE(sum_before_shuffle[0] == Catch::Approx(naive_sum[0]));
+      REQUIRE(sum_before_shuffle[1] == Catch::Approx(naive_sum[1]));
+      REQUIRE(sum_before_shuffle[2] == Catch::Approx(naive_sum[2]));
+    }
+
+    SECTION("reproducible TinyVector<3,float> sum")
+    {
+      Array<TinyVector<3, float>> array(10'000);
+
+      for (size_t i = 0; i < array.size(); ++i) {
+        array[i] =
+          TinyVector<3, float>(((i < (array.size() / 10)) * 1E25 + 1E10) * ((i + 1) % 1'000) * std::sin(3 * i + 1),
+                               17 * i - 3,   //
+                               1);
+      }
+
+      const auto sum_before_shuffle = sum(array);
+
+      std::random_shuffle(&(array[0]), &(array[0]) + array.size());
+
+      ReproducibleTinyVectorSum s0(array);
+
+      const auto sum_after_shuffle = sum(array);
+
+      REQUIRE(sum_before_shuffle == sum_after_shuffle);
+
+      auto naive_sum = direct_sum(array);
+      REQUIRE(sum_before_shuffle[0] == Catch::Approx(naive_sum[0]));
+      REQUIRE(sum_before_shuffle[1] == Catch::Approx(naive_sum[1]));
+      REQUIRE(sum_before_shuffle[2] == Catch::Approx(naive_sum[2]));
+    }
+
+    SECTION("reproducible TinyMatrix<2, 3> sum")
+    {
+      Array<TinyMatrix<2, 3, double>> array(10'000);
+
+      for (size_t i = 0; i < array.size(); ++i) {
+        array[i] = TinyMatrix<2, 3, double>{((i < (array.size() / 1000)) * 1E25 + 1E10) * ((i + 1) % 1'000) *
+                                              std::sin(3 * i + 1),
+                                            17 * i - 3,            //
+                                            1,                     //
+                                            std::cos(7 * i) * i,   //
+                                            (2 - 3 * i) * 1E-30,   //
+                                            (i * 100 + 1000)};
+      }
+
+      const auto sum_before_shuffle = sum(array);
+
+      std::random_shuffle(&(array[0]), &(array[0]) + array.size());
+
+      const auto sum_after_shuffle = sum(array);
+
+      REQUIRE(sum_before_shuffle == sum_after_shuffle);
+
+      auto naive_sum = direct_sum(array);
+      REQUIRE(sum_before_shuffle(0, 0) == Catch::Approx(naive_sum(0, 0)));
+      REQUIRE(sum_before_shuffle(0, 1) == Catch::Approx(naive_sum(0, 1)));
+      REQUIRE(sum_before_shuffle(0, 2) == Catch::Approx(naive_sum(0, 2)));
+      REQUIRE(sum_before_shuffle(1, 0) == Catch::Approx(naive_sum(1, 0)));
+      REQUIRE(sum_before_shuffle(1, 1) == Catch::Approx(naive_sum(1, 1)));
+      REQUIRE(sum_before_shuffle(1, 2) == Catch::Approx(naive_sum(1, 2)));
+    }
+
+    SECTION("reproducible TinyMatrix<2, 3, float> sum")
+    {
+      Array<TinyMatrix<2, 3, float>> array(10'000);
+
+      for (size_t i = 0; i < array.size(); ++i) {
+        array[i] =
+          TinyMatrix<2, 3, float>(((i < (array.size() / 1000)) * 1E25 + 1E10) * ((i + 1) % 1'000) * std::sin(3 * i + 1),
+                                  17 * i - 3,            //
+                                  1,                     //
+                                  std::cos(7 * i) * i,   //
+                                  (2 - 3 * i) * 1E-30,   //
+                                  (i * 100 + 1000));
+      }
+
+      const auto sum_before_shuffle = sum(array);
+
+      std::random_shuffle(&(array[0]), &(array[0]) + array.size());
+
+      const auto sum_after_shuffle = sum(array);
+
+      REQUIRE(sum_before_shuffle == sum_after_shuffle);
+
+      auto naive_sum = direct_sum(array);
+      REQUIRE(sum_before_shuffle(0, 0) == Catch::Approx(naive_sum(0, 0)));
+      REQUIRE(sum_before_shuffle(0, 1) == Catch::Approx(naive_sum(0, 1)));
+      REQUIRE(sum_before_shuffle(0, 2) == Catch::Approx(naive_sum(0, 2)));
+      REQUIRE(sum_before_shuffle(1, 0) == Catch::Approx(naive_sum(1, 0)).epsilon(1E-4));
+      REQUIRE(sum_before_shuffle(1, 1) == Catch::Approx(naive_sum(1, 1)).margin(1E-6));
+      REQUIRE(sum_before_shuffle(1, 2) == Catch::Approx(naive_sum(1, 2)));
+    }
+
+    SECTION("scalar bin summation")
+    {
+      SECTION("small arrays")
+      {
+        Array<double> array1(122);
+        for (size_t i = 0; i < array1.size(); ++i) {
+          array1[i] = ((i < (array1.size() / 10)) * 1E25 + 1E10) * ((i + 1) % 1'000) * std::sin(3 * i + 1);
+        }
+
+        Array<double> array2(357);
+        for (size_t i = 0; i < array2.size(); ++i) {
+          array2[i] = ((i < (array2.size() / 10)) * 1E25 + 1E10) * ((i + 1) % 1'000) * std::sin(3 * i + 1);
+        }
+
+        Array<double> array3(283);
+        for (size_t i = 0; i < array3.size(); ++i) {
+          array3[i] = ((i < (array3.size() / 10)) * 1E25 + 1E10) * ((i + 1) % 1'000) * std::sin(3 * i + 1);
+        }
+
+        Array<double> array(array1.size() + array2.size() + array3.size());
+        {
+          size_t I = 0;
+          for (size_t i = 0; i < array1.size(); ++i) {
+            array[I++] = array1[i];
+          }
+
+          for (size_t i = 0; i < array2.size(); ++i) {
+            array[I++] = array2[i];
+          }
+
+          for (size_t i = 0; i < array3.size(); ++i) {
+            array[I++] = array3[i];
+          }
+        }
+
+        ReproducibleScalarSum s1(array1);
+        ReproducibleScalarSum s2(array2);
+        ReproducibleScalarSum s3(array3);
+        using RSumT = decltype(s1);
+
+        RSumT::Bin bin1 = s1.getSummationBin();
+        RSumT::Bin bin2 = s2.getSummationBin();
+        RSumT::Bin bin3 = s3.getSummationBin();
+
+        {
+          RSumT::Bin bin_sum = zero;
+
+          RSumT::addBinTo(bin1, bin_sum);
+          RSumT::addBinTo(bin2, bin_sum);
+          RSumT::addBinTo(bin3, bin_sum);
+
+          REQUIRE(sum(array) == RSumT::getValue(bin_sum));
+        }
+
+        {
+          RSumT::Bin bin_sum = zero;
+
+          RSumT::addBinTo(bin3, bin_sum);
+          RSumT::addBinTo(bin2, bin_sum);
+          RSumT::addBinTo(bin1, bin_sum);
+
+          REQUIRE(sum(array) == RSumT::getValue(bin_sum));
+        }
+      }
+
+      SECTION("small and large arrays")
+      {
+        Array<double> array1(122);
+        for (size_t i = 0; i < array1.size(); ++i) {
+          array1[i] = ((i < (array1.size() / 100)) * 1E25 + 1E10) * ((i + 1) % 1'000) * std::sin(3 * i + 1);
+        }
+
+        Array<double> array2(7357);
+        for (size_t i = 0; i < array2.size(); ++i) {
+          array2[i] = ((i < (array2.size() / 1000)) * 1E25 + 1E10) * ((i + 1) % 1'000) * std::sin(3 * i + 1);
+        }
+
+        Array<double> array3(283);
+        for (size_t i = 0; i < array3.size(); ++i) {
+          array3[i] = ((i < (array3.size() / 30)) * 1E25 + 1E10) * ((i + 1) % 1'000) * std::sin(3 * i + 1);
+        }
+
+        Array<double> array(array1.size() + array2.size() + array3.size());
+        {
+          size_t I = 0;
+          for (size_t i = 0; i < array1.size(); ++i) {
+            array[I++] = array1[i];
+          }
+
+          for (size_t i = 0; i < array2.size(); ++i) {
+            array[I++] = array2[i];
+          }
+
+          for (size_t i = 0; i < array3.size(); ++i) {
+            array[I++] = array3[i];
+          }
+        }
+
+        ReproducibleScalarSum s1(array1);
+        ReproducibleScalarSum s2(array2);
+        ReproducibleScalarSum s3(array3);
+        using RSumT = decltype(s1);
+
+        RSumT::Bin bin1 = s1.getSummationBin();
+        RSumT::Bin bin2 = s2.getSummationBin();
+        RSumT::Bin bin3 = s3.getSummationBin();
+
+        {
+          RSumT::Bin bin_sum = zero;
+
+          RSumT::addBinTo(bin1, bin_sum);
+          RSumT::addBinTo(bin2, bin_sum);
+          RSumT::addBinTo(bin3, bin_sum);
+
+          REQUIRE(sum(array) == RSumT::getValue(bin_sum));
+        }
+
+        {
+          RSumT::Bin bin_sum = zero;
+
+          RSumT::addBinTo(bin3, bin_sum);
+          RSumT::addBinTo(bin2, bin_sum);
+          RSumT::addBinTo(bin1, bin_sum);
+
+          REQUIRE(sum(array) == RSumT::getValue(bin_sum));
+        }
+      }
+
+      SECTION("large arrays")
+      {
+        Array<double> array1(5122);
+        for (size_t i = 0; i < array1.size(); ++i) {
+          array1[i] = ((i < (array1.size() / 100)) * 1E25 + 1E10) * ((i + 1) % 1'000) * std::sin(3 * i + 1);
+        }
+
+        Array<double> array2(7357);
+        for (size_t i = 0; i < array2.size(); ++i) {
+          array2[i] = ((i < (array2.size() / 1000)) * 1E25 + 1E10) * ((i + 1) % 1'000) * std::sin(3 * i + 1);
+        }
+
+        Array<double> array3(6283);
+        for (size_t i = 0; i < array3.size(); ++i) {
+          array3[i] = ((i < (array3.size() / 300)) * 1E25 + 1E10) * ((i + 1) % 1'000) * std::sin(3 * i + 1);
+        }
+
+        Array<double> array(array1.size() + array2.size() + array3.size());
+        {
+          size_t I = 0;
+          for (size_t i = 0; i < array1.size(); ++i) {
+            array[I++] = array1[i];
+          }
+
+          for (size_t i = 0; i < array2.size(); ++i) {
+            array[I++] = array2[i];
+          }
+
+          for (size_t i = 0; i < array3.size(); ++i) {
+            array[I++] = array3[i];
+          }
+        }
+
+        ReproducibleScalarSum s1(array1);
+        ReproducibleScalarSum s2(array2);
+        ReproducibleScalarSum s3(array3);
+        using RSumT = decltype(s1);
+
+        RSumT::Bin bin1 = s1.getSummationBin();
+        RSumT::Bin bin2 = s2.getSummationBin();
+        RSumT::Bin bin3 = s3.getSummationBin();
+
+        {
+          RSumT::Bin bin_sum = zero;
+
+          RSumT::addBinTo(bin1, bin_sum);
+          RSumT::addBinTo(bin2, bin_sum);
+          RSumT::addBinTo(bin3, bin_sum);
+
+          REQUIRE(sum(array) == RSumT::getValue(bin_sum));
+        }
+
+        {
+          RSumT::Bin bin_sum = zero;
+
+          RSumT::addBinTo(bin3, bin_sum);
+          RSumT::addBinTo(bin2, bin_sum);
+          RSumT::addBinTo(bin1, bin_sum);
+
+          REQUIRE(sum(array) == RSumT::getValue(bin_sum));
+        }
+      }
+    }
+
+    SECTION("TinyVector bin summation")
+    {
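+      // Same bin-merging checks as for scalars, now on arrays of TinyVector<3, double>.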
+      SECTION("small arrays")
+      {
+        Array<TinyVector<3, double>> array1(122);
+        for (size_t i = 0; i < array1.size(); ++i) {
+          array1[i] =
+            TinyVector<3, double>(((i < (array1.size() / 10)) * 1E25 + 1E20) * ((i + 1) % 1'000) * std::sin(3 * i + 1),
+                                  ((i < (array1.size() / 50)) * 1E15 + 1E10) * ((i + 1) % 100) * std::cos(3 * i + 1),
+                                  ((i < (array1.size() / 20)) * 1E7 + 1E10) * ((i + 1) % 10) * std::tan(3 * i + 1));
+        }
+
+        Array<TinyVector<3, double>> array2(357);
+        for (size_t i = 0; i < array2.size(); ++i) {
+          array2[i] =
+            TinyVector<3, double>(((i < (array2.size() / 10)) * 1E25 + 1E20) * ((i + 1) % 1'000) * std::sin(3 * i + 1),
+                                  ((i < (array2.size() / 50)) * 1E15 + 1E10) * ((i + 1) % 100) * std::cos(3 * i + 1),
+                                  ((i < (array2.size() / 20)) * 1E7 + 1E10) * ((i + 1) % 10) * std::tan(3 * i + 1));
+        }
+
+        Array<TinyVector<3, double>> array3(283);
+        for (size_t i = 0; i < array3.size(); ++i) {
+          array3[i] =
+            TinyVector<3, double>(((i < (array3.size() / 10)) * 1E25 + 1E20) * ((i + 1) % 1'000) * std::sin(3 * i + 1),
+                                  ((i < (array3.size() / 50)) * 1E15 + 1E10) * ((i + 1) % 100) * std::cos(3 * i + 1),
+                                  ((i < (array3.size() / 20)) * 1E7 + 1E10) * ((i + 1) % 10) * std::tan(3 * i + 1));
+        }
+
+        Array<TinyVector<3, double>> array(array1.size() + array2.size() + array3.size());
+        {
+          size_t I = 0;
+          for (size_t i = 0; i < array1.size(); ++i) {
+            array[I++] = array1[i];
+          }
+
+          for (size_t i = 0; i < array2.size(); ++i) {
+            array[I++] = array2[i];
+          }
+
+          for (size_t i = 0; i < array3.size(); ++i) {
+            array[I++] = array3[i];
+          }
+        }
+
+        ReproducibleTinyVectorSum s1(array1);
+        ReproducibleTinyVectorSum s2(array2);
+        ReproducibleTinyVectorSum s3(array3);
+        using RSumT = decltype(s1);
+
+        RSumT::Bin bin1 = s1.getSummationBin();
+        RSumT::Bin bin2 = s2.getSummationBin();
+        RSumT::Bin bin3 = s3.getSummationBin();
+
+        {
+          RSumT::Bin bin_sum = zero;
+
+          RSumT::addBinTo(bin1, bin_sum);
+          RSumT::addBinTo(bin2, bin_sum);
+          RSumT::addBinTo(bin3, bin_sum);
+
+          REQUIRE(sum(array) == RSumT::getValue(bin_sum));
+        }
+
+        {
+          RSumT::Bin bin_sum = zero;
+
+          RSumT::addBinTo(bin3, bin_sum);
+          RSumT::addBinTo(bin2, bin_sum);
+          RSumT::addBinTo(bin1, bin_sum);
+
+          REQUIRE(sum(array) == RSumT::getValue(bin_sum));
+        }
+      }
+
+      SECTION("small and large arrays")
+      {
+        Array<TinyVector<3, double>> array1(122);
+        for (size_t i = 0; i < array1.size(); ++i) {
+          array1[i] =
+            TinyVector<3, double>(((i < (array1.size() / 10)) * 1E25 + 1E20) * ((i + 1) % 1'000) * std::sin(3 * i + 1),
+                                  ((i < (array1.size() / 50)) * 1E15 + 1E10) * ((i + 1) % 100) * std::cos(3 * i + 1),
+                                  ((i < (array1.size() / 20)) * 1E7 + 1E10) * ((i + 1) % 10) * std::tan(3 * i + 1));
+        }
+
+        Array<TinyVector<3, double>> array2(7357);
+        for (size_t i = 0; i < array2.size(); ++i) {
+          array2[i] =
+            TinyVector<3, double>(((i < (array2.size() / 10)) * 1E25 + 1E20) * ((i + 1) % 1'000) * std::sin(3 * i + 1),
+                                  ((i < (array2.size() / 50)) * 1E15 + 1E10) * ((i + 1) % 100) * std::cos(3 * i + 1),
+                                  ((i < (array2.size() / 20)) * 1E7 + 1E10) * ((i + 1) % 10) * std::tan(3 * i + 1));
+        }
+
+        Array<TinyVector<3, double>> array3(283);
+        for (size_t i = 0; i < array3.size(); ++i) {
+          array3[i] =
+            TinyVector<3, double>(((i < (array3.size() / 10)) * 1E25 + 1E20) * ((i + 1) % 1'000) * std::sin(3 * i + 1),
+                                  ((i < (array3.size() / 50)) * 1E15 + 1E10) * ((i + 1) % 100) * std::cos(3 * i + 1),
+                                  ((i < (array3.size() / 20)) * 1E7 + 1E10) * ((i + 1) % 10) * std::tan(3 * i + 1));
+        }
+
+        Array<TinyVector<3, double>> array(array1.size() + array2.size() + array3.size());
+        {
+          size_t I = 0;
+          for (size_t i = 0; i < array1.size(); ++i) {
+            array[I++] = array1[i];
+          }
+
+          for (size_t i = 0; i < array2.size(); ++i) {
+            array[I++] = array2[i];
+          }
+
+          for (size_t i = 0; i < array3.size(); ++i) {
+            array[I++] = array3[i];
+          }
+        }
+
+        ReproducibleTinyVectorSum s1(array1);
+        ReproducibleTinyVectorSum s2(array2);
+        ReproducibleTinyVectorSum s3(array3);
+        using RSumT = decltype(s1);
+
+        RSumT::Bin bin1 = s1.getSummationBin();
+        RSumT::Bin bin2 = s2.getSummationBin();
+        RSumT::Bin bin3 = s3.getSummationBin();
+
+        {
+          RSumT::Bin bin_sum = zero;
+
+          RSumT::addBinTo(bin1, bin_sum);
+          RSumT::addBinTo(bin2, bin_sum);
+          RSumT::addBinTo(bin3, bin_sum);
+
+          REQUIRE(sum(array) == RSumT::getValue(bin_sum));
+        }
+
+        {
+          RSumT::Bin bin_sum = zero;
+
+          RSumT::addBinTo(bin3, bin_sum);
+          RSumT::addBinTo(bin2, bin_sum);
+          RSumT::addBinTo(bin1, bin_sum);
+
+          REQUIRE(sum(array) == RSumT::getValue(bin_sum));
+        }
+      }
+
+      SECTION("large arrays")
+      {
+        Array<TinyVector<3, double>> array1(5122);
+        for (size_t i = 0; i < array1.size(); ++i) {
+          array1[i] =
+            TinyVector<3, double>(((i < (array1.size() / 10)) * 1E25 + 1E20) * ((i + 1) % 1'000) * std::sin(3 * i + 1),
+                                  ((i < (array1.size() / 50)) * 1E15 + 1E10) * ((i + 1) % 100) * std::cos(3 * i + 1),
+                                  ((i < (array1.size() / 20)) * 1E7 + 1E10) * ((i + 1) % 10) * std::tan(3 * i + 1));
+        }
+
+        Array<TinyVector<3, double>> array2(7357);
+        for (size_t i = 0; i < array2.size(); ++i) {
+          array2[i] =
+            TinyVector<3, double>(((i < (array2.size() / 10)) * 1E25 + 1E20) * ((i + 1) % 1'000) * std::sin(3 * i + 1),
+                                  ((i < (array2.size() / 50)) * 1E15 + 1E10) * ((i + 1) % 100) * std::cos(3 * i + 1),
+                                  ((i < (array2.size() / 20)) * 1E7 + 1E10) * ((i + 1) % 10) * std::tan(3 * i + 1));
+        }
+
+        Array<TinyVector<3, double>> array3(4283);
+        for (size_t i = 0; i < array3.size(); ++i) {
+          array3[i] =
+            TinyVector<3, double>(((i < (array3.size() / 10)) * 1E25 + 1E20) * ((i + 1) % 1'000) * std::sin(3 * i + 1),
+                                  ((i < (array3.size() / 50)) * 1E15 + 1E10) * ((i + 1) % 100) * std::cos(3 * i + 1),
+                                  ((i < (array3.size() / 20)) * 1E7 + 1E10) * ((i + 1) % 10) * std::tan(3 * i + 1));
+        }
+
+        Array<TinyVector<3, double>> array(array1.size() + array2.size() + array3.size());
+        {
+          size_t I = 0;
+          for (size_t i = 0; i < array1.size(); ++i) {
+            array[I++] = array1[i];
+          }
+
+          for (size_t i = 0; i < array2.size(); ++i) {
+            array[I++] = array2[i];
+          }
+
+          for (size_t i = 0; i < array3.size(); ++i) {
+            array[I++] = array3[i];
+          }
+        }
+
+        ReproducibleTinyVectorSum s1(array1);
+        ReproducibleTinyVectorSum s2(array2);
+        ReproducibleTinyVectorSum s3(array3);
+        using RSumT = decltype(s1);
+
+        RSumT::Bin bin1 = s1.getSummationBin();
+        RSumT::Bin bin2 = s2.getSummationBin();
+        RSumT::Bin bin3 = s3.getSummationBin();
+
+        {
+          RSumT::Bin bin_sum = zero;
+
+          RSumT::addBinTo(bin1, bin_sum);
+          RSumT::addBinTo(bin2, bin_sum);
+          RSumT::addBinTo(bin3, bin_sum);
+
+          REQUIRE(sum(array) == RSumT::getValue(bin_sum));
+        }
+
+        {
+          RSumT::Bin bin_sum = zero;
+
+          RSumT::addBinTo(bin3, bin_sum);
+          RSumT::addBinTo(bin2, bin_sum);
+          RSumT::addBinTo(bin1, bin_sum);
+
+          REQUIRE(sum(array) == RSumT::getValue(bin_sum));
+        }
+      }
+    }
+
+    SECTION("TinyMatrix bin summation")
+    {
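+      // Same bin-merging checks again, this time on arrays of TinyMatrix<2, 3, double>.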
+      SECTION("small arrays")
+      {
+        Array<TinyMatrix<2, 3, double>> array1(122);
+        for (size_t i = 0; i < array1.size(); ++i) {
+          array1[i] =
+            TinyMatrix<2, 3, double>(((i < (array1.size() / 10)) * 1E25 + 1E20) * ((i + 1) % 1'000) *
+                                       std::sin(3 * i + 1),
+                                     ((i < (array1.size() / 50)) * 1E15 + 1E10) * ((i + 1) % 100) * std::cos(3 * i + 1),
+                                     ((i < (array1.size() / 20)) * 1E7 + 1E10) * ((i + 1) % 10) * std::tan(3 * i + 1),
+                                     ((i < (array1.size() / 30)) * 1E6 + 1E11) * ((i + 1) % 100) * std::cos(4 * i + 1),
+                                     ((i < (array1.size() / 70)) * 1E8 + 1E13) * ((i + 1) % 100) * std::sin(4 * i + 1),
+                                     ((i < (array1.size() / 11)) * 1E8 + 1E9) * ((i + 1) % 100) * std::cos(4 * i + 1));
+        }
+
+        Array<TinyMatrix<2, 3, double>> array2(357);
+        for (size_t i = 0; i < array2.size(); ++i) {
+          array2[i] =
+            TinyMatrix<2, 3, double>(((i < (array2.size() / 20)) * 1E25 + 1E20) * ((i + 1) % 1'000) *
+                                       std::sin(3 * i + 1),
+                                     ((i < (array2.size() / 70)) * 1E15 + 1E10) * ((i + 1) % 100) * std::cos(3 * i + 1),
+                                     ((i < (array2.size() / 32)) * 1E7 + 1E10) * ((i + 1) % 10) * std::tan(3 * i + 1),
+                                     ((i < (array2.size() / 45)) * 1E6 + 1E11) * ((i + 1) % 100) * std::cos(4 * i + 1),
+                                     ((i < (array2.size() / 66)) * 1E8 + 1E13) * ((i + 1) % 100) * std::sin(4 * i + 1),
+                                     ((i < (array2.size() / 13)) * 1E8 + 1E9) * ((i + 1) % 100) * std::cos(4 * i + 1));
+        }
+
+        Array<TinyMatrix<2, 3, double>> array3(283);
+        for (size_t i = 0; i < array3.size(); ++i) {
+          array3[i] =
+            TinyMatrix<2, 3, double>(((i < (array3.size() / 23)) * 1E15 + 1E17) * ((i + 1) % 1'000) *
+                                       std::sin(3 * i + 1),
+                                     ((i < (array3.size() / 67)) * 1E15 + 1E10) * ((i + 1) % 100) * std::cos(3 * i + 1),
+                                     ((i < (array3.size() / 62)) * 1E7 + 1E10) * ((i + 1) % 10) * std::tan(3 * i + 1),
+                                     ((i < (array3.size() / 35)) * 1E8 + 1E11) * ((i + 1) % 100) * std::cos(4 * i + 1),
+                                     ((i < (array3.size() / 63)) * 1E7 + 1E12) * ((i + 1) % 100) * std::sin(4 * i + 1),
+                                     ((i < (array3.size() / 17)) * 1E9 + 1E8) * ((i + 1) % 100) * std::cos(4 * i + 1));
+        }
+
+        Array<TinyMatrix<2, 3, double>> array(array1.size() + array2.size() + array3.size());
+        {
+          size_t I = 0;
+          for (size_t i = 0; i < array1.size(); ++i) {
+            array[I++] = array1[i];
+          }
+
+          for (size_t i = 0; i < array2.size(); ++i) {
+            array[I++] = array2[i];
+          }
+
+          for (size_t i = 0; i < array3.size(); ++i) {
+            array[I++] = array3[i];
+          }
+        }
+
+        ReproducibleTinyMatrixSum s1(array1);
+        ReproducibleTinyMatrixSum s2(array2);
+        ReproducibleTinyMatrixSum s3(array3);
+        using RSumT = decltype(s1);
+
+        RSumT::Bin bin1 = s1.getSummationBin();
+        RSumT::Bin bin2 = s2.getSummationBin();
+        RSumT::Bin bin3 = s3.getSummationBin();
+
+        {
+          RSumT::Bin bin_sum = zero;
+
+          RSumT::addBinTo(bin1, bin_sum);
+          RSumT::addBinTo(bin2, bin_sum);
+          RSumT::addBinTo(bin3, bin_sum);
+
+          REQUIRE(sum(array) == RSumT::getValue(bin_sum));
+        }
+
+        {
+          RSumT::Bin bin_sum = zero;
+
+          RSumT::addBinTo(bin3, bin_sum);
+          RSumT::addBinTo(bin2, bin_sum);
+          RSumT::addBinTo(bin1, bin_sum);
+
+          REQUIRE(sum(array) == RSumT::getValue(bin_sum));
+        }
+      }
+
+      SECTION("small and large arrays")
+      {
+        Array<TinyMatrix<2, 3, double>> array1(122);
+        for (size_t i = 0; i < array1.size(); ++i) {
+          array1[i] =
+            TinyMatrix<2, 3, double>(((i < (array1.size() / 10)) * 1E25 + 1E20) * ((i + 1) % 1'000) *
+                                       std::sin(3 * i + 1),
+                                     ((i < (array1.size() / 50)) * 1E15 + 1E10) * ((i + 1) % 100) * std::cos(3 * i + 1),
+                                     ((i < (array1.size() / 20)) * 1E7 + 1E10) * ((i + 1) % 10) * std::tan(3 * i + 1),
+                                     ((i < (array1.size() / 30)) * 1E6 + 1E11) * ((i + 1) % 100) * std::cos(4 * i + 1),
+                                     ((i < (array1.size() / 70)) * 1E8 + 1E13) * ((i + 1) % 100) * std::sin(4 * i + 1),
+                                     ((i < (array1.size() / 11)) * 1E8 + 1E9) * ((i + 1) % 100) * std::cos(4 * i + 1));
+        }
+
+        Array<TinyMatrix<2, 3, double>> array2(7357);
+        for (size_t i = 0; i < array2.size(); ++i) {
+          array2[i] =
+            TinyMatrix<2, 3, double>(((i < (array2.size() / 20)) * 1E25 + 1E20) * ((i + 1) % 1'000) *
+                                       std::sin(3 * i + 1),
+                                     ((i < (array2.size() / 70)) * 1E15 + 1E10) * ((i + 1) % 100) * std::cos(3 * i + 1),
+                                     ((i < (array2.size() / 32)) * 1E7 + 1E10) * ((i + 1) % 10) * std::tan(3 * i + 1),
+                                     ((i < (array2.size() / 45)) * 1E6 + 1E11) * ((i + 1) % 100) * std::cos(4 * i + 1),
+                                     ((i < (array2.size() / 66)) * 1E8 + 1E13) * ((i + 1) % 100) * std::sin(4 * i + 1),
+                                     ((i < (array2.size() / 13)) * 1E8 + 1E9) * ((i + 1) % 100) * std::cos(4 * i + 1));
+        }
+
+        Array<TinyMatrix<2, 3, double>> array3(283);
+        for (size_t i = 0; i < array3.size(); ++i) {
+          array3[i] =
+            TinyMatrix<2, 3, double>(((i < (array3.size() / 23)) * 1E15 + 1E17) * ((i + 1) % 1'000) *
+                                       std::sin(3 * i + 1),
+                                     ((i < (array3.size() / 67)) * 1E15 + 1E10) * ((i + 1) % 100) * std::cos(3 * i + 1),
+                                     ((i < (array3.size() / 62)) * 1E7 + 1E10) * ((i + 1) % 10) * std::tan(3 * i + 1),
+                                     ((i < (array3.size() / 35)) * 1E8 + 1E11) * ((i + 1) % 100) * std::cos(4 * i + 1),
+                                     ((i < (array3.size() / 63)) * 1E7 + 1E12) * ((i + 1) % 100) * std::sin(4 * i + 1),
+                                     ((i < (array3.size() / 17)) * 1E9 + 1E8) * ((i + 1) % 100) * std::cos(4 * i + 1));
+        }
+
+        Array<TinyMatrix<2, 3, double>> array(array1.size() + array2.size() + array3.size());
+        {
+          size_t I = 0;
+          for (size_t i = 0; i < array1.size(); ++i) {
+            array[I++] = array1[i];
+          }
+
+          for (size_t i = 0; i < array2.size(); ++i) {
+            array[I++] = array2[i];
+          }
+
+          for (size_t i = 0; i < array3.size(); ++i) {
+            array[I++] = array3[i];
+          }
+        }
+
+        ReproducibleTinyMatrixSum s1(array1);
+        ReproducibleTinyMatrixSum s2(array2);
+        ReproducibleTinyMatrixSum s3(array3);
+        using RSumT = decltype(s1);
+
+        RSumT::Bin bin1 = s1.getSummationBin();
+        RSumT::Bin bin2 = s2.getSummationBin();
+        RSumT::Bin bin3 = s3.getSummationBin();
+
+        {
+          RSumT::Bin bin_sum = zero;
+
+          RSumT::addBinTo(bin1, bin_sum);
+          RSumT::addBinTo(bin2, bin_sum);
+          RSumT::addBinTo(bin3, bin_sum);
+
+          REQUIRE(sum(array) == RSumT::getValue(bin_sum));
+        }
+
+        {
+          RSumT::Bin bin_sum = zero;
+
+          RSumT::addBinTo(bin3, bin_sum);
+          RSumT::addBinTo(bin2, bin_sum);
+          RSumT::addBinTo(bin1, bin_sum);
+
+          REQUIRE(sum(array) == RSumT::getValue(bin_sum));
+        }
+      }
+
+      SECTION("large arrays")
+      {
+        Array<TinyMatrix<2, 3, double>> array1(3762);
+        for (size_t i = 0; i < array1.size(); ++i) {
+          array1[i] =
+            TinyMatrix<2, 3, double>(((i < (array1.size() / 10)) * 1E25 + 1E20) * ((i + 1) % 1'000) *
+                                       std::sin(3 * i + 1),
+                                     ((i < (array1.size() / 50)) * 1E15 + 1E10) * ((i + 1) % 100) * std::cos(3 * i + 1),
+                                     ((i < (array1.size() / 20)) * 1E7 + 1E10) * ((i + 1) % 10) * std::tan(3 * i + 1),
+                                     ((i < (array1.size() / 30)) * 1E6 + 1E11) * ((i + 1) % 100) * std::cos(4 * i + 1),
+                                     ((i < (array1.size() / 70)) * 1E8 + 1E13) * ((i + 1) % 100) * std::sin(4 * i + 1),
+                                     ((i < (array1.size() / 11)) * 1E8 + 1E9) * ((i + 1) % 100) * std::cos(4 * i + 1));
+        }
+
+        Array<TinyMatrix<2, 3, double>> array2(7357);
+        for (size_t i = 0; i < array2.size(); ++i) {
+          array2[i] =
+            TinyMatrix<2, 3, double>(((i < (array2.size() / 20)) * 1E25 + 1E20) * ((i + 1) % 1'000) *
+                                       std::sin(3 * i + 1),
+                                     ((i < (array2.size() / 70)) * 1E15 + 1E10) * ((i + 1) % 100) * std::cos(3 * i + 1),
+                                     ((i < (array2.size() / 32)) * 1E7 + 1E10) * ((i + 1) % 10) * std::tan(3 * i + 1),
+                                     ((i < (array2.size() / 45)) * 1E6 + 1E11) * ((i + 1) % 100) * std::cos(4 * i + 1),
+                                     ((i < (array2.size() / 66)) * 1E8 + 1E13) * ((i + 1) % 100) * std::sin(4 * i + 1),
+                                     ((i < (array2.size() / 13)) * 1E8 + 1E9) * ((i + 1) % 100) * std::cos(4 * i + 1));
+        }
+
+        Array<TinyMatrix<2, 3, double>> array3(6283);
+        for (size_t i = 0; i < array3.size(); ++i) {
+          array3[i] =
+            TinyMatrix<2, 3, double>(((i < (array3.size() / 23)) * 1E15 + 1E17) * ((i + 1) % 1'000) *
+                                       std::sin(3 * i + 1),
+                                     ((i < (array3.size() / 67)) * 1E15 + 1E10) * ((i + 1) % 100) * std::cos(3 * i + 1),
+                                     ((i < (array3.size() / 62)) * 1E7 + 1E10) * ((i + 1) % 10) * std::tan(3 * i + 1),
+                                     ((i < (array3.size() / 35)) * 1E8 + 1E11) * ((i + 1) % 100) * std::cos(4 * i + 1),
+                                     ((i < (array3.size() / 63)) * 1E7 + 1E12) * ((i + 1) % 100) * std::sin(4 * i + 1),
+                                     ((i < (array3.size() / 17)) * 1E9 + 1E8) * ((i + 1) % 100) * std::cos(4 * i + 1));
+        }
+
+        Array<TinyMatrix<2, 3, double>> array(array1.size() + array2.size() + array3.size());
+        {
+          size_t I = 0;
+          for (size_t i = 0; i < array1.size(); ++i) {
+            array[I++] = array1[i];
+          }
+
+          for (size_t i = 0; i < array2.size(); ++i) {
+            array[I++] = array2[i];
+          }
+
+          for (size_t i = 0; i < array3.size(); ++i) {
+            array[I++] = array3[i];
+          }
+        }
+
+        ReproducibleTinyMatrixSum s1(array1);
+        ReproducibleTinyMatrixSum s2(array2);
+        ReproducibleTinyMatrixSum s3(array3);
+        using RSumT = decltype(s1);
+
+        RSumT::Bin bin1 = s1.getSummationBin();
+        RSumT::Bin bin2 = s2.getSummationBin();
+        RSumT::Bin bin3 = s3.getSummationBin();
+
+        {
+          RSumT::Bin bin_sum = zero;
+
+          RSumT::addBinTo(bin1, bin_sum);
+          RSumT::addBinTo(bin2, bin_sum);
+          RSumT::addBinTo(bin3, bin_sum);
+
+          REQUIRE(sum(array) == RSumT::getValue(bin_sum));
+        }
+
+        {
+          RSumT::Bin bin_sum = zero;
+
+          RSumT::addBinTo(bin3, bin_sum);
+          RSumT::addBinTo(bin2, bin_sum);
+          RSumT::addBinTo(bin1, bin_sum);
+
+          REQUIRE(sum(array) == RSumT::getValue(bin_sum));
+        }
+      }
+    }
+  }
+
   SECTION("checking for subArrayView")
   {
     Array<int> array{10};
diff --git a/tests/test_DiscreteFunctionP0.cpp b/tests/test_DiscreteFunctionP0.cpp
index 39b91c4de355c1a96feaa1f1b5e2a59bf9c494d8..d0f71e984d516ec243de289f399d882663c9c499 100644
--- a/tests/test_DiscreteFunctionP0.cpp
+++ b/tests/test_DiscreteFunctionP0.cpp
@@ -2787,7 +2787,7 @@ TEST_CASE("DiscreteFunctionP0", "[scheme]")
                 cell_value[cell_id] = cell_volume[cell_id] * positive_function[cell_id];
               });
 
-            REQUIRE(integrate(positive_function) == Catch::Approx(sum(cell_value)));
+            REQUIRE(integrate(positive_function) == sum(cell_value));
           }
 
           SECTION("integrate vector")
@@ -2806,8 +2806,8 @@ TEST_CASE("DiscreteFunctionP0", "[scheme]")
               mesh->numberOfCells(),
               PUGS_LAMBDA(const CellId cell_id) { cell_value[cell_id] = cell_volume[cell_id] * uh[cell_id]; });
 
-            REQUIRE(integrate(uh)[0] == Catch::Approx(sum(cell_value)[0]));
-            REQUIRE(integrate(uh)[1] == Catch::Approx(sum(cell_value)[1]));
+            REQUIRE(integrate(uh)[0] == sum(cell_value)[0]);
+            REQUIRE(integrate(uh)[1] == sum(cell_value)[1]);
           }
 
           SECTION("integrate matrix")
@@ -2826,10 +2826,10 @@ TEST_CASE("DiscreteFunctionP0", "[scheme]")
               mesh->numberOfCells(),
               PUGS_LAMBDA(const CellId cell_id) { cell_value[cell_id] = cell_volume[cell_id] * uh[cell_id]; });
 
-            REQUIRE(integrate(uh)(0, 0) == Catch::Approx(sum(cell_value)(0, 0)));
-            REQUIRE(integrate(uh)(0, 1) == Catch::Approx(sum(cell_value)(0, 1)));
-            REQUIRE(integrate(uh)(1, 0) == Catch::Approx(sum(cell_value)(1, 0)));
-            REQUIRE(integrate(uh)(1, 1) == Catch::Approx(sum(cell_value)(1, 1)));
+            REQUIRE(integrate(uh)(0, 0) == sum(cell_value)(0, 0));
+            REQUIRE(integrate(uh)(0, 1) == sum(cell_value)(0, 1));
+            REQUIRE(integrate(uh)(1, 0) == sum(cell_value)(1, 0));
+            REQUIRE(integrate(uh)(1, 1) == sum(cell_value)(1, 1));
           }
         }
       }
@@ -3131,7 +3131,7 @@ TEST_CASE("DiscreteFunctionP0", "[scheme]")
                 cell_value[cell_id] = cell_volume[cell_id] * positive_function[cell_id];
               });
 
-            REQUIRE(integrate(positive_function) == Catch::Approx(sum(cell_value)));
+            REQUIRE(integrate(positive_function) == sum(cell_value));
           }
 
           SECTION("integrate vector")
@@ -3150,8 +3150,8 @@ TEST_CASE("DiscreteFunctionP0", "[scheme]")
               mesh->numberOfCells(),
               PUGS_LAMBDA(const CellId cell_id) { cell_value[cell_id] = cell_volume[cell_id] * uh[cell_id]; });
 
-            REQUIRE(integrate(uh)[0] == Catch::Approx(sum(cell_value)[0]));
-            REQUIRE(integrate(uh)[1] == Catch::Approx(sum(cell_value)[1]));
+            REQUIRE(integrate(uh)[0] == sum(cell_value)[0]);
+            REQUIRE(integrate(uh)[1] == sum(cell_value)[1]);
           }
 
           SECTION("integrate matrix")
@@ -3170,10 +3170,10 @@ TEST_CASE("DiscreteFunctionP0", "[scheme]")
               mesh->numberOfCells(),
               PUGS_LAMBDA(const CellId cell_id) { cell_value[cell_id] = cell_volume[cell_id] * uh[cell_id]; });
 
-            REQUIRE(integrate(uh)(0, 0) == Catch::Approx(sum(cell_value)(0, 0)));
-            REQUIRE(integrate(uh)(0, 1) == Catch::Approx(sum(cell_value)(0, 1)));
-            REQUIRE(integrate(uh)(1, 0) == Catch::Approx(sum(cell_value)(1, 0)));
-            REQUIRE(integrate(uh)(1, 1) == Catch::Approx(sum(cell_value)(1, 1)));
+            REQUIRE(integrate(uh)(0, 0) == sum(cell_value)(0, 0));
+            REQUIRE(integrate(uh)(0, 1) == sum(cell_value)(0, 1));
+            REQUIRE(integrate(uh)(1, 0) == sum(cell_value)(1, 0));
+            REQUIRE(integrate(uh)(1, 1) == sum(cell_value)(1, 1));
           }
         }
       }
@@ -3563,7 +3563,7 @@ TEST_CASE("DiscreteFunctionP0", "[scheme]")
                 cell_value[cell_id] = cell_volume[cell_id] * positive_function[cell_id];
               });
 
-            REQUIRE(integrate(positive_function) == Catch::Approx(sum(cell_value)));
+            REQUIRE(integrate(positive_function) == sum(cell_value));
           }
 
           SECTION("integrate vector")
@@ -3582,8 +3582,8 @@ TEST_CASE("DiscreteFunctionP0", "[scheme]")
               mesh->numberOfCells(),
               PUGS_LAMBDA(const CellId cell_id) { cell_value[cell_id] = cell_volume[cell_id] * uh[cell_id]; });
 
-            REQUIRE(integrate(uh)[0] == Catch::Approx(sum(cell_value)[0]));
-            REQUIRE(integrate(uh)[1] == Catch::Approx(sum(cell_value)[1]));
+            REQUIRE(integrate(uh)[0] == sum(cell_value)[0]);
+            REQUIRE(integrate(uh)[1] == sum(cell_value)[1]);
           }
 
           SECTION("integrate matrix")
@@ -3602,10 +3602,10 @@ TEST_CASE("DiscreteFunctionP0", "[scheme]")
               mesh->numberOfCells(),
               PUGS_LAMBDA(const CellId cell_id) { cell_value[cell_id] = cell_volume[cell_id] * uh[cell_id]; });
 
-            REQUIRE(integrate(uh)(0, 0) == Catch::Approx(sum(cell_value)(0, 0)));
-            REQUIRE(integrate(uh)(0, 1) == Catch::Approx(sum(cell_value)(0, 1)));
-            REQUIRE(integrate(uh)(1, 0) == Catch::Approx(sum(cell_value)(1, 0)));
-            REQUIRE(integrate(uh)(1, 1) == Catch::Approx(sum(cell_value)(1, 1)));
+            REQUIRE(integrate(uh)(0, 0) == sum(cell_value)(0, 0));
+            REQUIRE(integrate(uh)(0, 1) == sum(cell_value)(0, 1));
+            REQUIRE(integrate(uh)(1, 0) == sum(cell_value)(1, 0));
+            REQUIRE(integrate(uh)(1, 1) == sum(cell_value)(1, 1));
           }
         }
       }
diff --git a/tests/test_ItemArrayUtils.cpp b/tests/test_ItemArrayUtils.cpp
index 8c832d16c1dac5ed7a84828303f68bceaf25c3d8..346345eae1134da16e7e75b7c1d6f553ac14ef6d 100644
--- a/tests/test_ItemArrayUtils.cpp
+++ b/tests/test_ItemArrayUtils.cpp
@@ -8,6 +8,8 @@
 #include <mesh/ItemArray.hpp>
 #include <mesh/ItemArrayUtils.hpp>
 #include <mesh/Mesh.hpp>
+#include <mesh/MeshData.hpp>
+#include <mesh/MeshDataManager.hpp>
 #include <utils/Messenger.hpp>
 
 // clazy:excludeall=non-pod-global-static
@@ -1055,26 +1057,31 @@ TEST_CASE("ItemArrayUtils", "[mesh]")
       std::array mesh_list = MeshDataBaseForTests::get().all2DMeshes();
 
       for (const auto& named_mesh : mesh_list) {
-        SECTION(named_mesh.name() + " for size_t data")
+        SECTION(named_mesh.name() + " for double data")
         {
           auto mesh_2d = named_mesh.mesh();
 
           const Connectivity<2>& connectivity = mesh_2d->connectivity();
 
-          FaceArray<size_t> face_array{connectivity, 3};
-          face_array.fill(2);
+          auto ll = MeshDataManager::instance().getMeshData(*mesh_2d).ll();
+
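+          // Fill the three entries of each face sub-array with distinct functions of ll.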
+          FaceArray<double> face_array{connectivity, 3};
+          parallel_for(
+            connectivity.numberOfFaces(), PUGS_LAMBDA(FaceId face_id) {
+              face_array[face_id][0] = ll[face_id];
+              face_array[face_id][1] = (3 + ll[face_id]) * ll[face_id];
+              face_array[face_id][2] = 2 * ll[face_id] - 3;
+            });
 
           auto face_is_owned = connectivity.faceIsOwned();
 
-          const size_t global_number_of_faces = [&] {
-            size_t number_of_faces = 0;
-            for (FaceId face_id = 0; face_id < face_is_owned.numberOfItems(); ++face_id) {
-              number_of_faces += face_is_owned[face_id];
-            }
-            return parallel::allReduceSum(number_of_faces);
-          }();
+          FaceValue<double> face_sum{connectivity};
+          parallel_for(
+            connectivity.numberOfFaces(), PUGS_LAMBDA(FaceId face_id) {
+              face_sum[face_id] = face_array[face_id][0] + face_array[face_id][1] + face_array[face_id][2];
+            });
 
-          REQUIRE(sum(face_array) == 2 * face_array.sizeOfArrays() * global_number_of_faces);
+          REQUIRE(sum(face_array) == sum(face_sum));
         }
 
         SECTION(named_mesh.name() + " for N^2 data")
diff --git a/tests/test_ItemValueUtils.cpp b/tests/test_ItemValueUtils.cpp
index 755392515ae3078421375f71af8727aa8bff144e..e9df51a05bc9b5a411a186ad742c2309c93ad9d4 100644
--- a/tests/test_ItemValueUtils.cpp
+++ b/tests/test_ItemValueUtils.cpp
@@ -8,6 +8,8 @@
 #include <mesh/ItemValue.hpp>
 #include <mesh/ItemValueUtils.hpp>
 #include <mesh/Mesh.hpp>
+#include <mesh/MeshData.hpp>
+#include <mesh/MeshDataManager.hpp>
 #include <utils/Messenger.hpp>
 
 // clazy:excludeall=non-pod-global-static
@@ -240,26 +242,60 @@ TEST_CASE("ItemValueUtils", "[mesh]")
       std::array mesh_list = MeshDataBaseForTests::get().all1DMeshes();
 
       for (const auto& named_mesh : mesh_list) {
-        SECTION(named_mesh.name())
+        SECTION(named_mesh.name() + " for size_t data")
         {
           auto mesh_1d = named_mesh.mesh();
 
           const Connectivity<1>& connectivity = mesh_1d->connectivity();
 
+          auto Vj = MeshDataManager::instance().getMeshData(*mesh_1d).Vj();
+
           CellValue<size_t> cell_value{connectivity};
-          cell_value.fill(5);
+
+          parallel_for(
+            connectivity.numberOfCells(),
+            PUGS_LAMBDA(CellId cell_id) { cell_value[cell_id] = std::ceil(100 * Vj[cell_id]); });
 
           auto cell_is_owned = connectivity.cellIsOwned();
 
-          const size_t global_number_of_cells = [&] {
-            size_t number_of_cells = 0;
+          const size_t global_sum = [&] {
+            size_t local_sum = 0;
             for (CellId cell_id = 0; cell_id < cell_is_owned.numberOfItems(); ++cell_id) {
-              number_of_cells += cell_is_owned[cell_id];
+              local_sum += cell_is_owned[cell_id] * cell_value[cell_id];
             }
-            return parallel::allReduceSum(number_of_cells);
+            return parallel::allReduceSum(local_sum);
           }();
 
-          REQUIRE(sum(cell_value) == 5 * global_number_of_cells);
+          REQUIRE(sum(cell_value) == global_sum);
+        }
+
+        SECTION(named_mesh.name() + " for double data")
+        {
+          auto mesh_1d = named_mesh.mesh();
+
+          const Connectivity<1>& connectivity = mesh_1d->connectivity();
+
+          auto Vj = MeshDataManager::instance().getMeshData(*mesh_1d).Vj();
+
+          auto cell_is_owned = connectivity.cellIsOwned();
+
+          size_t nb_owned_cells = 0;
+          for (CellId cell_id = 0; cell_id < connectivity.numberOfCells(); ++cell_id) {
+            nb_owned_cells += cell_is_owned[cell_id];
+          }
+
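+          // Gather the owned cell volumes from all ranks so that the distributed
+          // sum(Vj) can be compared with a sum over the gathered array.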
+          Array<double> local_Vj(nb_owned_cells);
+          {
+            size_t index = 0;
+            for (CellId cell_id = 0; cell_id < connectivity.numberOfCells(); ++cell_id) {
+              if (cell_is_owned[cell_id]) {
+                local_Vj[index++] = Vj[cell_id];
+              }
+            }
+          }
+          Array<double> global_Vj = parallel::allGatherVariable(local_Vj);
+
+          REQUIRE(sum(Vj) == sum(global_Vj));
         }
       }
     }
@@ -275,20 +311,54 @@ TEST_CASE("ItemValueUtils", "[mesh]")
 
           const Connectivity<2>& connectivity = mesh_2d->connectivity();
 
+          auto ll = MeshDataManager::instance().getMeshData(*mesh_2d).ll();
+
           FaceValue<size_t> face_value{connectivity};
-          face_value.fill(2);
+
+          parallel_for(
+            connectivity.numberOfFaces(),
+            PUGS_LAMBDA(FaceId face_id) { face_value[face_id] = std::ceil(100 * ll[face_id]); });
 
           auto face_is_owned = connectivity.faceIsOwned();
 
-          const size_t global_number_of_faces = [&] {
-            size_t number_of_faces = 0;
+          const size_t global_sum = [&] {
+            size_t local_sum = 0;
             for (FaceId face_id = 0; face_id < face_is_owned.numberOfItems(); ++face_id) {
-              number_of_faces += face_is_owned[face_id];
+              local_sum += face_is_owned[face_id] * face_value[face_id];
             }
-            return parallel::allReduceSum(number_of_faces);
+            return parallel::allReduceSum(local_sum);
           }();
 
-          REQUIRE(sum(face_value) == 2 * global_number_of_faces);
+          REQUIRE(sum(face_value) == global_sum);
+        }
+
+        SECTION(named_mesh.name() + "for double data")
+        {
+          auto mesh_2d = named_mesh.mesh();
+
+          const Connectivity<2>& connectivity = mesh_2d->connectivity();
+
+          auto Vj = MeshDataManager::instance().getMeshData(*mesh_2d).Vj();
+
+          auto cell_is_owned = connectivity.cellIsOwned();
+
+          size_t nb_owned_cells = 0;
+          for (CellId cell_id = 0; cell_id < connectivity.numberOfCells(); ++cell_id) {
+            nb_owned_cells += cell_is_owned[cell_id];
+          }
+
+          Array<double> local_Vj(nb_owned_cells);
+          {
+            size_t index = 0;
+            for (CellId cell_id = 0; cell_id < connectivity.numberOfCells(); ++cell_id) {
+              if (cell_is_owned[cell_id]) {
+                local_Vj[index++] = Vj[cell_id];
+              }
+            }
+          }
+          Array<double> global_Vj = parallel::allGatherVariable(local_Vj);
+
+          REQUIRE(sum(Vj) == sum(global_Vj));
         }
 
         SECTION(named_mesh.name() + "for N^3 data")
@@ -297,22 +367,66 @@ TEST_CASE("ItemValueUtils", "[mesh]")
 
           const Connectivity<2>& connectivity = mesh_2d->connectivity();
 
+          auto ll = MeshDataManager::instance().getMeshData(*mesh_2d).ll();
+
           using N3 = TinyVector<3, size_t>;
           FaceValue<N3> face_value{connectivity};
-          const N3 data(2, 1, 4);
-          face_value.fill(data);
+
+          parallel_for(
+            connectivity.numberOfFaces(), PUGS_LAMBDA(FaceId face_id) {
+              face_value[face_id] = N3(std::ceil(100 * ll[face_id]),        //
+                                       std::ceil(200 * ll[face_id] - 40),   //
+                                       std::ceil(17.3 * ll[face_id] + 12.9));
+            });
 
           auto face_is_owned = connectivity.faceIsOwned();
 
-          const size_t global_number_of_faces = [&] {
-            size_t number_of_faces = 0;
+          const N3 global_sum = [=] {
+            N3 local_sum = zero;
             for (FaceId face_id = 0; face_id < face_is_owned.numberOfItems(); ++face_id) {
-              number_of_faces += face_is_owned[face_id];
+              local_sum += face_is_owned[face_id] * face_value[face_id];
             }
-            return parallel::allReduceSum(number_of_faces);
+            return parallel::allReduceSum(local_sum);
           }();
 
-          REQUIRE(sum(face_value) == global_number_of_faces * data);
+          REQUIRE(sum(face_value) == global_sum);
+        }
+
+        SECTION(named_mesh.name() + "for R2 data")
+        {
+          auto mesh_2d = named_mesh.mesh();
+
+          const Connectivity<2>& connectivity = mesh_2d->connectivity();
+
+          auto ll = MeshDataManager::instance().getMeshData(*mesh_2d).ll();
+
+          using R2x3 = TinyVector<2, double>;
+          FaceValue<R2x3> face_value{connectivity};
+
+          auto face_is_owned = connectivity.faceIsOwned();
+
+          size_t nb_owned_faces = 0;
+          for (FaceId face_id = 0; face_id < connectivity.numberOfFaces(); ++face_id) {
+            nb_owned_faces += face_is_owned[face_id];
+          }
+
+          parallel_for(
+            connectivity.numberOfFaces(), PUGS_LAMBDA(FaceId face_id) {
+              face_value[face_id] = R2{ll[face_id], 1. / ll[face_id]};
+            });
+
+          Array<R2x3> local_array(nb_owned_faces);
+          {
+            size_t index = 0;
+            for (FaceId face_id = 0; face_id < connectivity.numberOfFaces(); ++face_id) {
+              if (face_is_owned[face_id]) {
+                local_array[index++] = face_value[face_id];
+              }
+            }
+          }
+          Array global_array = parallel::allGatherVariable(local_array);
+
+          REQUIRE(sum(face_value) == sum(global_array));
         }
       }
     }
@@ -369,6 +483,48 @@ TEST_CASE("ItemValueUtils", "[mesh]")
 
           REQUIRE(sum(node_value) == global_number_of_nodes * data);
         }
+
+        SECTION(named_mesh.name() + "for R^2x3 data")
+        {
+          auto mesh_3d = named_mesh.mesh();
+
+          const Connectivity<3>& connectivity = mesh_3d->connectivity();
+
+          auto xr = mesh_3d->xr();
+
+          using R2x3 = TinyMatrix<2, 3, double>;
+          NodeValue<R2x3> node_value{connectivity};
+
+          auto node_is_owned = connectivity.nodeIsOwned();
+
+          size_t nb_owned_nodes = 0;
+          for (NodeId node_id = 0; node_id < connectivity.numberOfNodes(); ++node_id) {
+            nb_owned_nodes += node_is_owned[node_id];
+          }
+
+          parallel_for(
+            connectivity.numberOfNodes(), PUGS_LAMBDA(NodeId node_id) {
+              node_value[node_id] = R2x3{xr[node_id][0],
+                                         l2Norm(xr[node_id]),
+                                         3.178 * xr[node_id][0] - 3 * xr[node_id][2],
+                                         xr[node_id][1],
+                                         xr[node_id][2] + xr[node_id][0],
+                                         xr[node_id][0] * xr[node_id][1] * xr[node_id][2]};
+            });
+
+          Array<R2x3> local_array(nb_owned_nodes);
+          {
+            size_t index = 0;
+            for (NodeId node_id = 0; node_id < connectivity.numberOfNodes(); ++node_id) {
+              if (node_is_owned[node_id]) {
+                local_array[index++] = node_value[node_id];
+              }
+            }
+          }
+          Array global_array = parallel::allGatherVariable(local_array);
+
+          REQUIRE(sum(node_value) == sum(global_array));
+        }
       }
     }
   }
diff --git a/tests/test_MeshFlatNodeBoundary.cpp b/tests/test_MeshFlatNodeBoundary.cpp
index 9b9169f73c4306f9591ba0a9c0f5c1c622f7ef69..760f5564226f77fa53ec8de15f3f85552f02cbc4 100644
--- a/tests/test_MeshFlatNodeBoundary.cpp
+++ b/tests/test_MeshFlatNodeBoundary.cpp
@@ -1133,6 +1133,211 @@ TEST_CASE("MeshFlatNodeBoundary", "[mesh]")
     }
   }
 
+  SECTION("rotated diamond")
+  {
+    SECTION("2D")
+    {
+      static constexpr size_t Dimension = 2;
+
+      using ConnectivityType = Connectivity<Dimension>;
+      using MeshType         = Mesh<ConnectivityType>;
+
+      using R2 = TinyVector<2>;
+
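+      // T applies a small shear to the Cartesian mesh; the boundary normals are
+      // recomputed on the sheared mesh and compared against the expected unit vectors below.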
+      auto T = [](const R2& x) -> R2 { return R2{x[0] + 0.1 * x[1], x[1] + 0.1 * x[0]}; };
+
+      SECTION("cartesian 2d")
+      {
+        std::shared_ptr p_mesh = MeshDataBaseForTests::get().cartesian2DMesh();
+
+        const ConnectivityType& connectivity = p_mesh->connectivity();
+
+        auto xr = p_mesh->xr();
+
+        NodeValue<R2> rotated_xr{connectivity};
+
+        parallel_for(
+          connectivity.numberOfNodes(), PUGS_LAMBDA(const NodeId node_id) { rotated_xr[node_id] = T(xr[node_id]); });
+
+        MeshType mesh{p_mesh->shared_connectivity(), rotated_xr};
+
+        {
+          const std::set<size_t> tag_set = {0, 1, 2, 3};
+
+          for (auto tag : tag_set) {
+            NumberedBoundaryDescriptor numbered_boundary_descriptor(tag);
+            const auto& node_boundary = getMeshFlatNodeBoundary(mesh, numbered_boundary_descriptor);
+
+            auto node_list = get_node_list_from_tag(tag, connectivity);
+            REQUIRE(is_same(node_boundary.nodeList(), node_list));
+
+            R2 normal = zero;
+
+            switch (tag) {
+            case 0: {
+              normal = 1. / std::sqrt(1.01) * R2{-1, 0.1};
+              break;
+            }
+            case 1: {
+              normal = 1. / std::sqrt(1.01) * R2{1, -0.1};
+              break;
+            }
+            case 2: {
+              normal = 1. / std::sqrt(1.01) * R2{0.1, -1};
+              break;
+            }
+            case 3: {
+              normal = 1. / std::sqrt(1.01) * R2{-0.1, 1};
+              break;
+            }
+            default: {
+              FAIL("unexpected tag number");
+            }
+            }
+            REQUIRE(l2Norm(node_boundary.outgoingNormal() - normal) == Catch::Approx(0).margin(1E-13));
+          }
+        }
+
+        {
+          const std::set<std::string> name_set = {"XMIN", "XMAX", "YMIN", "YMAX"};
+
+          for (const auto& name : name_set) {
+            NamedBoundaryDescriptor named_boundary_descriptor(name);
+            const auto& node_boundary = getMeshFlatNodeBoundary(mesh, named_boundary_descriptor);
+
+            auto node_list = get_node_list_from_name(name, connectivity);
+            REQUIRE(is_same(node_boundary.nodeList(), node_list));
+
+            R2 normal = zero;
+
+            if (name == "XMIN") {
+              normal = 1. / std::sqrt(1.01) * R2{-1, 0.1};
+            } else if (name == "XMAX") {
+              normal = 1. / std::sqrt(1.01) * R2{1, -0.1};
+            } else if (name == "YMIN") {
+              normal = 1. / std::sqrt(1.01) * R2{0.1, -1};
+            } else if (name == "YMAX") {
+              normal = 1. / std::sqrt(1.01) * R2{-0.1, 1};
+            } else {
+              FAIL("unexpected name: " + name);
+            }
+
+            REQUIRE(l2Norm(node_boundary.outgoingNormal() - normal) == Catch::Approx(0).margin(1E-13));
+          }
+        }
+      }
+    }
+
+    SECTION("3D")
+    {
+      static constexpr size_t Dimension = 3;
+
+      using ConnectivityType = Connectivity<Dimension>;
+      using MeshType         = Mesh<ConnectivityType>;
+
+      using R3 = TinyVector<3>;
+
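+      // Same idea in 3D: the hard-coded normals are the outward unit normals of the
+      // sheared boundary planes.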
+      auto T = [](const R3& x) -> R3 {
+        return R3{x[0] + 0.1 * x[1] + 0.2 * x[2], x[1] + 0.1 * x[0] + 0.1 * x[2], x[2] + 0.1 * x[0]};
+      };
+
+      SECTION("cartesian 3d")
+      {
+        std::shared_ptr p_mesh = MeshDataBaseForTests::get().cartesian3DMesh();
+
+        const ConnectivityType& connectivity = p_mesh->connectivity();
+
+        auto xr = p_mesh->xr();
+
+        NodeValue<R3> rotated_xr{connectivity};
+
+        parallel_for(
+          connectivity.numberOfNodes(), PUGS_LAMBDA(const NodeId node_id) { rotated_xr[node_id] = T(xr[node_id]); });
+
+        MeshType mesh{p_mesh->shared_connectivity(), rotated_xr};
+
+        {
+          const std::set<size_t> tag_set = {0, 1, 2, 3, 4, 5};
+
+          for (auto tag : tag_set) {
+            NumberedBoundaryDescriptor numbered_boundary_descriptor(tag);
+            const auto& node_boundary = getMeshFlatNodeBoundary(mesh, numbered_boundary_descriptor);
+
+            auto node_list = get_node_list_from_tag(tag, connectivity);
+            REQUIRE(is_same(node_boundary.nodeList(), node_list));
+
+            R3 normal = zero;
+
+            switch (tag) {
+            case 0: {
+              normal = R3{-0.977717523265611, 0.0977717523265611, 0.185766329420466};
+              break;
+            }
+            case 1: {
+              normal = R3{0.977717523265611, -0.0977717523265612, -0.185766329420466};
+              break;
+            }
+            case 2: {
+              normal = R3{0.0911512175788074, -0.992535480302569, 0.0810233045144955};
+              break;
+            }
+            case 3: {
+              normal = R3{-0.0911512175788074, 0.992535480302569, -0.0810233045144955};
+              break;
+            }
+            case 4: {
+              normal = R3{0.100493631166705, -0.0100493631166705, -0.994886948550377};
+              break;
+            }
+            case 5: {
+              normal = R3{-0.100493631166705, 0.0100493631166705, 0.994886948550377};
+              break;
+            }
+            default: {
+              FAIL("unexpected tag number");
+            }
+            }
+
+            REQUIRE(l2Norm(node_boundary.outgoingNormal() - normal) == Catch::Approx(0).margin(1E-13));
+          }
+        }
+
+        {
+          const std::set<std::string> name_set = {"XMIN", "XMAX", "YMIN", "YMAX", "ZMIN", "ZMAX"};
+
+          for (const auto& name : name_set) {
+            NamedBoundaryDescriptor named_boundary_descriptor(name);
+            const auto& node_boundary = getMeshFlatNodeBoundary(mesh, named_boundary_descriptor);
+
+            auto node_list = get_node_list_from_name(name, connectivity);
+
+            REQUIRE(is_same(node_boundary.nodeList(), node_list));
+
+            R3 normal = zero;
+
+            if (name == "XMIN") {
+              normal = R3{-0.977717523265611, 0.0977717523265611, 0.185766329420466};
+            } else if (name == "XMAX") {
+              normal = R3{0.977717523265611, -0.0977717523265612, -0.185766329420466};
+            } else if (name == "YMIN") {
+              normal = R3{0.0911512175788074, -0.992535480302569, 0.0810233045144955};
+            } else if (name == "YMAX") {
+              normal = R3{-0.0911512175788074, 0.992535480302569, -0.0810233045144955};
+            } else if (name == "ZMIN") {
+              normal = R3{0.100493631166705, -0.0100493631166705, -0.994886948550377};
+            } else if (name == "ZMAX") {
+              normal = R3{-0.100493631166705, 0.0100493631166705, 0.994886948550377};
+            } else {
+              FAIL("unexpected name: " + name);
+            }
+
+            REQUIRE(l2Norm(node_boundary.outgoingNormal() - normal) == Catch::Approx(0).margin(1E-13));
+          }
+        }
+      }
+    }
+  }
+
   SECTION("curved mesh")
   {
     SECTION("2D")
diff --git a/tests/test_OFStream.cpp b/tests/test_OFStream.cpp
index fbb5818dbc70451ff212c820edacfa595963e961..6ca7d98df0a6f36bf30add2b891a659beba757c2 100644
--- a/tests/test_OFStream.cpp
+++ b/tests/test_OFStream.cpp
@@ -13,7 +13,8 @@ TEST_CASE("OFStream", "[language]")
 {
   SECTION("ofstream")
   {
-    const std::string basename = std::filesystem::path{PUGS_BINARY_DIR}.append("tests").append("ofstream_");
+    const std::string basename =
+      std::filesystem::path{PUGS_BINARY_DIR}.append("tests").append("ofstream_dir").append("ofstream_");
     const std::string filename = basename + stringify(parallel::rank());
 
     // Ensures that the file is closed after this line
@@ -47,13 +48,4 @@ TEST_CASE("OFStream", "[language]")
 
     REQUIRE(not std::filesystem::exists(filename));
   }
-
-  SECTION("invalid filename")
-  {
-    if (parallel::rank() == 0) {
-      const std::string filename = "badpath/invalidpath/ofstream";
-
-      REQUIRE_THROWS_WITH(std::make_shared<OFStream>(filename), "error: cannot create file " + filename);
-    }
-  }
 }
diff --git a/tests/test_SubItemArrayPerItemUtils.cpp b/tests/test_SubItemArrayPerItemUtils.cpp
index e534ec34600ef526336249a4b7d1cfc794721827..d36829250ce4a5e9b88c7015504c1ffb68362f96 100644
--- a/tests/test_SubItemArrayPerItemUtils.cpp
+++ b/tests/test_SubItemArrayPerItemUtils.cpp
@@ -6,6 +6,8 @@
 #include <algebra/TinyVector.hpp>
 #include <mesh/Connectivity.hpp>
 #include <mesh/Mesh.hpp>
+#include <mesh/MeshData.hpp>
+#include <mesh/MeshDataManager.hpp>
 #include <mesh/SubItemArrayPerItem.hpp>
 #include <mesh/SubItemArrayPerItemUtils.hpp>
 #include <utils/Messenger.hpp>
@@ -368,6 +370,43 @@ TEST_CASE("SubItemArrayPerItemUtils", "[mesh]")
           REQUIRE(sum(node_array_per_face) ==
                   global_number_of_nodes_per_faces * node_array_per_face.sizeOfArrays() * data);
         }
+
+        SECTION(named_mesh.name() + " for float data")
+        {
+          auto mesh_2d = named_mesh.mesh();
+
+          const Connectivity<2>& connectivity = mesh_2d->connectivity();
+
+          EdgeArrayPerCell<float> edge_array_per_cell{connectivity, 3};
+          edge_array_per_cell.fill(1E10);
+
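+          // Only owned cells are overwritten below; ghost entries keep the 1E10 fill value.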
+          auto cell_is_owned = connectivity.cellIsOwned();
+          parallel_for(
+            connectivity.numberOfCells(), PUGS_LAMBDA(CellId cell_id) {
+              if (cell_is_owned[cell_id]) {
+                auto cell_table = edge_array_per_cell.itemTable(cell_id);
+                for (size_t i = 0; i < cell_table.numberOfRows(); ++i) {
+                  for (size_t j = 0; j < cell_table.numberOfColumns(); ++j) {
+                    cell_table(i, j) = 10 + parallel::rank() - i - j + 1.5;
+                  }
+                }
+              }
+            });
+
+          CellValue<float> cell_sum(connectivity);
+          cell_sum.fill(0);
+          parallel_for(
+            connectivity.numberOfCells(), PUGS_LAMBDA(CellId cell_id) {
+              auto cell_table = edge_array_per_cell.itemTable(cell_id);
+              for (size_t i = 0; i < cell_table.numberOfRows(); ++i) {
+                for (size_t j = 0; j < cell_table.numberOfColumns(); ++j) {
+                  cell_sum[cell_id] += cell_table(i, j);
+                }
+              }
+            });
+
+          REQUIRE(sum(edge_array_per_cell) == sum(cell_sum));
+        }
       }
     }
 
@@ -405,10 +444,10 @@ TEST_CASE("SubItemArrayPerItemUtils", "[mesh]")
 
           const Connectivity<3>& connectivity = mesh_3d->connectivity();
 
-          using N2x3 = TinyMatrix<2, 3, size_t>;
-          EdgeArrayPerFace<N2x3> edge_array_per_face{connectivity, 3};
+          using R2x3 = TinyMatrix<2, 3, size_t>;
+          EdgeArrayPerFace<R2x3> edge_array_per_face{connectivity, 3};
 
-          const N2x3 data(1, 3, 4, 6, 2, 5);
+          const R2x3 data(1, 3, 4, 6, 2, 5);
           edge_array_per_face.fill(data);
 
           auto face_is_owned = connectivity.faceIsOwned();
@@ -424,6 +463,46 @@ TEST_CASE("SubItemArrayPerItemUtils", "[mesh]")
           REQUIRE(sum(edge_array_per_face) ==
                   global_number_of_edges_per_faces * edge_array_per_face.sizeOfArrays() * data);
         }
+
+        SECTION(named_mesh.name() + " for R^2x3 data")
+        {
+          auto mesh_3d = named_mesh.mesh();
+
+          const Connectivity<3>& connectivity = mesh_3d->connectivity();
+
+          using R2x3 = TinyMatrix<2, 3, double>;
+          EdgeArrayPerFace<R2x3> edge_array_per_face{connectivity, 3};
+
+          auto Nl = MeshDataManager::instance().getMeshData(*mesh_3d).Nl();
+
+          parallel_for(
+            mesh_3d->numberOfFaces(), PUGS_LAMBDA(FaceId face_id) {
+              for (size_t i_edge = 0; i_edge < edge_array_per_face.numberOfSubArrays(face_id); ++i_edge) {
+                for (size_t i = 0; i < edge_array_per_face[face_id][i_edge].size(); ++i) {
+                  R2x3 value((2.3 * i_edge + 1.7) * Nl[face_id][0],                    //
+                             (3. - i_edge) * Nl[face_id][1],                           //
+                             (2 * i_edge + 1.3) * Nl[face_id][2],                      //
+                             (i_edge + 1.7) * Nl[face_id][0] + 3.2 * Nl[face_id][1],   //
+                             Nl[face_id][2] + 3 * i_edge,                              //
+                             7.13 * i_edge * i_edge * l2Norm(Nl[face_id]));
+                  edge_array_per_face[face_id][i_edge][i] = (i + 1.3) * value;
+                }
+              }
+            });
+
+          FaceValue<R2x3> face_value{connectivity};
+          face_value.fill(zero);
+          parallel_for(
+            mesh_3d->numberOfFaces(), PUGS_LAMBDA(FaceId face_id) {
+              for (size_t i_edge = 0; i_edge < edge_array_per_face.numberOfSubArrays(face_id); ++i_edge) {
+                for (size_t i = 0; i < edge_array_per_face[face_id][i_edge].size(); ++i) {
+                  face_value[face_id] += edge_array_per_face[face_id][i_edge][i];
+                }
+              }
+            });
+
+          REQUIRE(sum(edge_array_per_face) == sum(face_value));
+        }
       }
     }
   }
diff --git a/tests/test_SubItemValuePerItemUtils.cpp b/tests/test_SubItemValuePerItemUtils.cpp
index aca9a14498e447b1cd6ffd21411f45e847b34ac5..353d7a9500a2cba3384ae22b8863f020d1c07361 100644
--- a/tests/test_SubItemValuePerItemUtils.cpp
+++ b/tests/test_SubItemValuePerItemUtils.cpp
@@ -6,6 +6,8 @@
 #include <algebra/TinyVector.hpp>
 #include <mesh/Connectivity.hpp>
 #include <mesh/Mesh.hpp>
+#include <mesh/MeshData.hpp>
+#include <mesh/MeshDataManager.hpp>
 #include <mesh/SubItemValuePerItem.hpp>
 #include <mesh/SubItemValuePerItemUtils.hpp>
 #include <utils/Messenger.hpp>
@@ -325,29 +327,34 @@ TEST_CASE("SubItemValuePerItemUtils", "[mesh]")
           REQUIRE(sum(node_value_per_face) == 2 * global_number_of_nodes_per_faces);
         }
 
-        SECTION(named_mesh.name() + " for N^2 data")
+        SECTION(named_mesh.name() + " for R^2 data")
         {
           auto mesh_2d = named_mesh.mesh();
 
           const Connectivity<2>& connectivity = mesh_2d->connectivity();
 
-          using N2 = TinyVector<2, size_t>;
-          NodeValuePerFace<N2> node_value_per_face{connectivity};
+          using R2 = TinyVector<2, double>;
+          NodeValuePerFace<R2> node_value_per_face{connectivity};
 
-          const N2 data(3, 2);
-          node_value_per_face.fill(data);
+          auto Nl = MeshDataManager::instance().getMeshData(*mesh_2d).Nl();
 
-          auto face_is_owned = connectivity.faceIsOwned();
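+          // Fill each per-face node value with a multiple of Nl and accumulate the
+          // same contributions per face for the comparison below.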
+          parallel_for(
+            mesh_2d->numberOfFaces(), PUGS_LAMBDA(FaceId face_id) {
+              for (size_t i_node = 0; i_node < node_value_per_face.numberOfSubValues(face_id); ++i_node) {
+                node_value_per_face[face_id][i_node] = (i_node + 1) * Nl[face_id];
+              }
+            });
 
-          const size_t global_number_of_nodes_per_faces = [&] {
-            size_t number_of_nodes_per_faces = 0;
-            for (FaceId face_id = 0; face_id < face_is_owned.numberOfItems(); ++face_id) {
-              number_of_nodes_per_faces += face_is_owned[face_id] * node_value_per_face.numberOfSubValues(face_id);
-            }
-            return parallel::allReduceSum(number_of_nodes_per_faces);
-          }();
+          FaceValue<R2> face_value{connectivity};
+          parallel_for(
+            mesh_2d->numberOfFaces(), PUGS_LAMBDA(FaceId face_id) {
+              face_value[face_id] = node_value_per_face[face_id][0];
+              for (size_t i_node = 1; i_node < node_value_per_face.numberOfSubValues(face_id); ++i_node) {
+                face_value[face_id] += node_value_per_face[face_id][i_node];
+              }
+            });
 
-          REQUIRE(sum(node_value_per_face) == global_number_of_nodes_per_faces * data);
+          REQUIRE(sum(node_value_per_face) == sum(face_value));
         }
       }
     }
@@ -403,6 +410,42 @@ TEST_CASE("SubItemValuePerItemUtils", "[mesh]")
 
           REQUIRE(sum(edge_value_per_face) == global_number_of_edges_per_faces * data);
         }
+
+        SECTION(named_mesh.name() + " for R^3x2 float")
+        {
+          auto mesh_3d = named_mesh.mesh();
+
+          const Connectivity<3>& connectivity = mesh_3d->connectivity();
+
+          auto xr = mesh_3d->xr();
+
+          using R3x2 = TinyMatrix<3, 2, float>;
+          EdgeValuePerNode<R3x2> edge_value_per_node{connectivity};
+
+          parallel_for(
+            mesh_3d->numberOfNodes(), PUGS_LAMBDA(NodeId node_id) {
+              for (size_t i_edge = 0; i_edge < edge_value_per_node.numberOfSubValues(node_id); ++i_edge) {
+                R3x2 value((2.3 * i_edge + 1.7) * xr[node_id][0],                    //
+                           (3. - i_edge) * xr[node_id][1],                           //
+                           (2 * i_edge + 1.3) * xr[node_id][2],                      //
+                           (i_edge + 1.7) * xr[node_id][0] + 3.2 * xr[node_id][1],   //
+                           xr[node_id][2] + 3 * i_edge,                              //
+                           7.13 * i_edge * i_edge * l2Norm(xr[node_id]));
+                edge_value_per_node[node_id][i_edge] = value;
+              }
+            });
+
+          NodeValue<R3x2> node_value{connectivity};
+          parallel_for(
+            mesh_3d->numberOfNodes(), PUGS_LAMBDA(NodeId node_id) {
+              node_value[node_id] = edge_value_per_node[node_id][0];
+              for (size_t i_edge = 1; i_edge < edge_value_per_node.numberOfSubValues(node_id); ++i_edge) {
+                node_value[node_id] += edge_value_per_node[node_id][i_edge];
+              }
+            });
+
+          REQUIRE(sum(edge_value_per_node) == sum(node_value));
+        }
       }
     }
   }
diff --git a/tests/test_main.cpp b/tests/test_main.cpp
index 0e334f92c8a73ca526e905c1c5d6e82236f2d1ea..084d9b9453cb683586ce8f9a6d7b72678bcd3aaa 100644
--- a/tests/test_main.cpp
+++ b/tests/test_main.cpp
@@ -21,7 +21,7 @@ int
 main(int argc, char* argv[])
 {
   parallel::Messenger::create(argc, argv);
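+  // Use at most half of the hardware threads (at least one) for the test run.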
-  const int nb_threads = std::max(std::thread::hardware_concurrency(), 1u);
+  const int nb_threads = std::max(std::thread::hardware_concurrency() / 2, 1u);
 
   {
     Kokkos::InitializationSettings args;