diff --git a/src/mesh/ItemArrayUtils.hpp b/src/mesh/ItemArrayUtils.hpp
index b1c3ba8cdad569f73f91029c24fb1333e05037ee..1bb8d929f12a2f834535a70166e43a130fb73e40 100644
--- a/src/mesh/ItemArrayUtils.hpp
+++ b/src/mesh/ItemArrayUtils.hpp
@@ -5,6 +5,7 @@
 
 #include <mesh/Connectivity.hpp>
 #include <mesh/ItemArray.hpp>
+#include <mesh/ItemValueUtils.hpp>
 #include <mesh/Synchronizer.hpp>
 #include <mesh/SynchronizerManager.hpp>
 
@@ -12,7 +13,7 @@
 
 template <typename DataType, ItemType item_type, typename ConnectivityPtr>
 std::remove_const_t<DataType>
-min(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
+min(const ItemArray<DataType, item_type, ConnectivityPtr>& item_array)
 {
   using ItemArrayType   = ItemArray<DataType, item_type, ConnectivityPtr>;
   using ItemIsOwnedType = ItemValue<const bool, item_type>;
@@ -25,7 +26,7 @@ min(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
   class ItemArrayMin
   {
    private:
-    const ItemArrayType& m_item_value;
+    const ItemArrayType& m_item_array;
     const ItemIsOwnedType m_is_owned;
 
    public:
@@ -33,7 +34,7 @@ min(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
     operator data_type()
     {
       data_type reduced_value;
-      parallel_reduce(m_item_value.numberOfItems(), *this, reduced_value);
+      parallel_reduce(m_item_array.numberOfItems(), *this, reduced_value);
       return reduced_value;
     }
 
@@ -42,8 +43,8 @@ min(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
     operator()(const index_type& i_item, data_type& data) const
     {
       if (m_is_owned[i_item]) {
-        auto array = m_item_value[i_item];
-        for (size_t i = 0; i < m_item_value.sizeOfArrays(); ++i) {
+        auto array = m_item_array[i_item];
+        for (size_t i = 0; i < m_item_array.sizeOfArrays(); ++i) {
           if (array[i] < data) {
             data = array[i];
           }
@@ -69,8 +70,8 @@ min(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
     }
 
     PUGS_INLINE
-    ItemArrayMin(const ItemArrayType& item_value)
-      : m_item_value(item_value), m_is_owned([&](const IConnectivity& connectivity) {
+    ItemArrayMin(const ItemArrayType& item_array)
+      : m_item_array(item_array), m_is_owned([&](const IConnectivity& connectivity) {
           Assert((connectivity.dimension() > 0) and (connectivity.dimension() <= 3),
                  "unexpected connectivity dimension");
 
@@ -96,7 +97,7 @@ min(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
           }
             // LCOV_EXCL_STOP
           }
-        }(*item_value.connectivity_ptr()))
+        }(*item_array.connectivity_ptr()))
     {
       ;
     }
@@ -105,13 +106,13 @@ min(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
     ~ItemArrayMin() = default;
   };
 
-  const DataType local_min = ItemArrayMin{item_value};
+  const DataType local_min = ItemArrayMin{item_array};
   return parallel::allReduceMin(local_min);
 }
 
 template <typename DataType, ItemType item_type, typename ConnectivityPtr>
 std::remove_const_t<DataType>
-max(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
+max(const ItemArray<DataType, item_type, ConnectivityPtr>& item_array)
 {
   using ItemArrayType   = ItemArray<DataType, item_type, ConnectivityPtr>;
   using ItemIsOwnedType = ItemValue<const bool, item_type>;
@@ -124,7 +125,7 @@ max(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
   class ItemArrayMax
   {
    private:
-    const ItemArrayType& m_item_value;
+    const ItemArrayType& m_item_array;
     const ItemIsOwnedType m_is_owned;
 
    public:
@@ -132,7 +133,7 @@ max(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
     operator data_type()
     {
       data_type reduced_value;
-      parallel_reduce(m_item_value.numberOfItems(), *this, reduced_value);
+      parallel_reduce(m_item_array.numberOfItems(), *this, reduced_value);
       return reduced_value;
     }
 
@@ -141,8 +142,8 @@ max(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
     operator()(const index_type& i_item, data_type& data) const
     {
       if (m_is_owned[i_item]) {
-        auto array = m_item_value[i_item];
-        for (size_t i = 0; i < m_item_value.sizeOfArrays(); ++i) {
+        auto array = m_item_array[i_item];
+        for (size_t i = 0; i < m_item_array.sizeOfArrays(); ++i) {
           if (array[i] > data) {
             data = array[i];
           }
@@ -168,8 +169,8 @@ max(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
     }
 
     PUGS_INLINE
-    ItemArrayMax(const ItemArrayType& item_value)
-      : m_item_value(item_value), m_is_owned([&](const IConnectivity& connectivity) {
+    ItemArrayMax(const ItemArrayType& item_array)
+      : m_item_array(item_array), m_is_owned([&](const IConnectivity& connectivity) {
           Assert((connectivity.dimension() > 0) and (connectivity.dimension() <= 3),
                  "unexpected connectivity dimension");
 
@@ -195,7 +196,7 @@ max(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
           }
             // LCOV_EXCL_STOP
           }
-        }(*item_value.connectivity_ptr()))
+        }(*item_array.connectivity_ptr()))
     {
       ;
     }
@@ -204,106 +205,34 @@ max(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
     ~ItemArrayMax() = default;
   };
 
-  const DataType local_max = ItemArrayMax{item_value};
+  const DataType local_max = ItemArrayMax{item_array};
   return parallel::allReduceMax(local_max);
 }
 
 template <typename DataType, ItemType item_type, typename ConnectivityPtr>
 std::remove_const_t<DataType>
-sum(const ItemArray<DataType, item_type, ConnectivityPtr>& item_value)
+sum(const ItemArray<DataType, item_type, ConnectivityPtr>& item_array)
 {
-  using ItemArrayType   = ItemArray<DataType, item_type, ConnectivityPtr>;
-  using ItemIsOwnedType = ItemValue<const bool, item_type>;
-  using data_type       = std::remove_const_t<typename ItemArrayType::data_type>;
-  using index_type      = typename ItemArrayType::index_type;
+  using ItemArrayType = ItemArray<DataType, item_type, ConnectivityPtr>;
+  using data_type     = std::remove_const_t<typename ItemArrayType::data_type>;
+  using index_type    = typename ItemArrayType::index_type;
 
   static_assert(not std::is_same_v<data_type, bool>, "sum cannot be called on boolean data");
 
-  class ItemArraySum
-  {
-   private:
-    const ItemArrayType& m_item_value;
-    const ItemIsOwnedType m_is_owned;
-
-   public:
-    PUGS_INLINE
-    operator data_type()
-    {
-      data_type reduced_value;
-      parallel_reduce(m_item_value.numberOfItems(), *this, reduced_value);
-      return reduced_value;
-    }
-
-    PUGS_INLINE
-    void
-    operator()(const index_type& i_item, data_type& data) const
-    {
-      if (m_is_owned[i_item]) {
-        auto array = m_item_value[i_item];
-        for (size_t i = 0; i < m_item_value.sizeOfArrays(); ++i) {
-          data += array[i];
-        }
-      }
-    }
+  ItemValue<data_type, item_type> item_sum{*item_array.connectivity_ptr()};
 
-    PUGS_INLINE
-    void
-    join(data_type& dst, const data_type& src) const
-    {
-      dst += src;
-    }
+  parallel_for(
+    item_array.numberOfItems(), PUGS_LAMBDA(index_type item_id) {
+      auto row          = item_array[item_id];
+      data_type row_sum = row[0];
 
-    PUGS_INLINE
-    void
-    init(data_type& value) const
-    {
-      if constexpr (std::is_arithmetic_v<data_type>) {
-        value = 0;
-      } else {
-        static_assert(is_tiny_vector_v<data_type> or is_tiny_matrix_v<data_type>, "invalid data type");
-        value = zero;
+      for (size_t i = 1; i < row.size(); ++i) {
+        row_sum += row[i];
       }
-    }
-
-    PUGS_INLINE
-    ItemArraySum(const ItemArrayType& item_value)
-      : m_item_value(item_value), m_is_owned([&](const IConnectivity& connectivity) {
-          Assert((connectivity.dimension() > 0) and (connectivity.dimension() <= 3),
-                 "unexpected connectivity dimension");
-
-          switch (connectivity.dimension()) {
-          case 1: {
-            const auto& connectivity_1d = static_cast<const Connectivity1D&>(connectivity);
-            return connectivity_1d.isOwned<item_type>();
-            break;
-          }
-          case 2: {
-            const auto& connectivity_2d = static_cast<const Connectivity2D&>(connectivity);
-            return connectivity_2d.isOwned<item_type>();
-            break;
-          }
-          case 3: {
-            const auto& connectivity_3d = static_cast<const Connectivity3D&>(connectivity);
-            return connectivity_3d.isOwned<item_type>();
-            break;
-          }
-            // LCOV_EXCL_START
-          default: {
-            throw UnexpectedError("unexpected dimension");
-          }
-            // LCOV_EXCL_STOP
-          }
-        }(*item_value.connectivity_ptr()))
-    {
-      ;
-    }
-
-    PUGS_INLINE
-    ~ItemArraySum() = default;
-  };
+      item_sum[item_id] = row_sum;
+    });
 
-  const DataType local_sum = ItemArraySum{item_value};
-  return parallel::allReduceSum(local_sum);
+  return sum(item_sum);
 }
 
 template <typename DataType, ItemType item_type, typename ConnectivityPtr>
diff --git a/src/mesh/ItemValueUtils.hpp b/src/mesh/ItemValueUtils.hpp
index 9ef745b550e42add842ac1c822cf65b07e34dc2b..48f94728a91c705d61fcfe537084ade59818f3f1 100644
--- a/src/mesh/ItemValueUtils.hpp
+++ b/src/mesh/ItemValueUtils.hpp
@@ -8,6 +8,7 @@
 #include <mesh/Synchronizer.hpp>
 #include <mesh/SynchronizerManager.hpp>
 #include <utils/PugsTraits.hpp>
+#include <utils/ReproductibleSumMPI.hpp>
 
 #include <iostream>
 
@@ -210,6 +211,33 @@ sum(const ItemValue<DataType, item_type, ConnectivityPtr>& item_value)
 
   static_assert(not std::is_same_v<data_type, bool>, "sum cannot be called on boolean data");
 
+  auto get_is_owned = [](const IConnectivity& connectivity) -> ItemIsOwnedType {
+    Assert((connectivity.dimension() > 0) and (connectivity.dimension() <= 3), "unexpected connectivity dimension");
+
+    switch (connectivity.dimension()) {
+    case 1: {
+      const auto& connectivity_1d = static_cast<const Connectivity1D&>(connectivity);
+      return connectivity_1d.isOwned<item_type>();
+      break;
+    }
+    case 2: {
+      const auto& connectivity_2d = static_cast<const Connectivity2D&>(connectivity);
+      return connectivity_2d.isOwned<item_type>();
+      break;
+    }
+    case 3: {
+      const auto& connectivity_3d = static_cast<const Connectivity3D&>(connectivity);
+      return connectivity_3d.isOwned<item_type>();
+      break;
+    }
+      // LCOV_EXCL_START
+    default: {
+      throw UnexpectedError("unexpected dimension");
+    }
+      // LCOV_EXCL_STOP
+    }
+  };
+
   class ItemValueSum
   {
    private:
@@ -254,34 +282,8 @@ sum(const ItemValue<DataType, item_type, ConnectivityPtr>& item_value)
     }
 
     PUGS_INLINE
-    ItemValueSum(const ItemValueType& item_value)
-      : m_item_value(item_value), m_is_owned([&](const IConnectivity& connectivity) {
-          Assert((connectivity.dimension() > 0) and (connectivity.dimension() <= 3),
-                 "unexpected connectivity dimension");
-
-          switch (connectivity.dimension()) {
-          case 1: {
-            const auto& connectivity_1d = static_cast<const Connectivity1D&>(connectivity);
-            return connectivity_1d.isOwned<item_type>();
-            break;
-          }
-          case 2: {
-            const auto& connectivity_2d = static_cast<const Connectivity2D&>(connectivity);
-            return connectivity_2d.isOwned<item_type>();
-            break;
-          }
-          case 3: {
-            const auto& connectivity_3d = static_cast<const Connectivity3D&>(connectivity);
-            return connectivity_3d.isOwned<item_type>();
-            break;
-          }
-            // LCOV_EXCL_START
-          default: {
-            throw UnexpectedError("unexpected dimension");
-          }
-            // LCOV_EXCL_STOP
-          }
-        }(*item_value.connectivity_ptr()))
+    ItemValueSum(const ItemValueType& item_value, const ItemIsOwnedType& is_owned)
+      : m_item_value{item_value}, m_is_owned{is_owned}
     {
       ;
     }
@@ -290,8 +292,34 @@ sum(const ItemValue<DataType, item_type, ConnectivityPtr>& item_value)
     ~ItemValueSum() = default;
   };
 
-  const DataType local_sum = ItemValueSum{item_value};
-  return parallel::allReduceSum(local_sum);
+  auto is_owned = get_is_owned(*item_value.connectivity_ptr());
+
+  if (ReproductibleSumManager::reproductibleSums()) {
+    if constexpr (std::is_floating_point_v<data_type>) {
+      ReproductibleScalarSum r_sum(item_value.arrayView(), is_owned.arrayView());
+      using ReproductibleSumType = decltype(r_sum);
+      return ReproductibleSumType::getValue(allReduceReproductibleSum(r_sum));
+    } else if constexpr (not std::is_arithmetic_v<data_type>) {
+      if constexpr (is_tiny_vector_v<data_type> and std::is_floating_point_v<typename data_type::data_type>) {
+        ReproductibleTinyVectorSum r_sum(item_value.arrayView(), is_owned.arrayView());
+        using ReproductibleSumType = decltype(r_sum);
+        return ReproductibleSumType::getValue(allReduceReproductibleSum(r_sum));
+      } else if constexpr (is_tiny_matrix_v<data_type> and std::is_floating_point_v<typename data_type::data_type>) {
+        ReproductibleTinyMatrixSum r_sum(item_value.arrayView(), is_owned.arrayView());
+        using ReproductibleSumType = decltype(r_sum);
+        return ReproductibleSumType::getValue(allReduceReproductibleSum(r_sum));
+      } else {
+        const DataType local_sum = ItemValueSum{item_value, is_owned};
+        return parallel::allReduceSum(local_sum);
+      }
+    } else {
+      const DataType local_sum = ItemValueSum{item_value, is_owned};
+      return parallel::allReduceSum(local_sum);
+    }
+  } else {
+    const DataType local_sum = ItemValueSum{item_value, is_owned};
+    return parallel::allReduceSum(local_sum);
+  }
 }
 
 template <typename DataType, ItemType item_type, typename ConnectivityPtr>
diff --git a/src/mesh/SubItemArrayPerItemUtils.hpp b/src/mesh/SubItemArrayPerItemUtils.hpp
index d2553590ab0f0dad219cdfa82491c05a70b5edbe..aec08ef845cc80bafd20ec8fa1a3b9bcc94415de 100644
--- a/src/mesh/SubItemArrayPerItemUtils.hpp
+++ b/src/mesh/SubItemArrayPerItemUtils.hpp
@@ -4,13 +4,12 @@
 #include <utils/Messenger.hpp>
 
 #include <mesh/Connectivity.hpp>
+#include <mesh/ItemValueUtils.hpp>
 #include <mesh/SubItemArrayPerItem.hpp>
 #include <mesh/Synchronizer.hpp>
 #include <mesh/SynchronizerManager.hpp>
 #include <utils/PugsTraits.hpp>
 
-#include <iostream>
-
 template <typename DataType, typename ItemOfItem, typename ConnectivityPtr>
 std::remove_const_t<DataType>
 min(const SubItemArrayPerItem<DataType, ItemOfItem, ConnectivityPtr>& sub_item_array_per_item)
@@ -34,9 +33,9 @@ min(const SubItemArrayPerItem<DataType, ItemOfItem, ConnectivityPtr>& sub_item_a
     PUGS_INLINE
     operator data_type()
     {
-      data_type reduced_array;
-      parallel_reduce(m_sub_item_array_per_item.numberOfItems(), *this, reduced_array);
-      return reduced_array;
+      data_type reduced_value;
+      parallel_reduce(m_sub_item_array_per_item.numberOfItems(), *this, reduced_value);
+      return reduced_value;
     }
 
     PUGS_INLINE
@@ -67,9 +66,9 @@ min(const SubItemArrayPerItem<DataType, ItemOfItem, ConnectivityPtr>& sub_item_a
 
     PUGS_INLINE
     void
-    init(data_type& array) const
+    init(data_type& value) const
     {
-      array = std::numeric_limits<data_type>::max();
+      value = std::numeric_limits<data_type>::max();
     }
 
     PUGS_INLINE
@@ -136,9 +135,9 @@ max(const SubItemArrayPerItem<DataType, ItemOfItem, ConnectivityPtr>& sub_item_a
     PUGS_INLINE
     operator data_type()
     {
-      data_type reduced_array;
-      parallel_reduce(m_sub_item_array_per_item.numberOfItems(), *this, reduced_array);
-      return reduced_array;
+      data_type reduced_value;
+      parallel_reduce(m_sub_item_array_per_item.numberOfItems(), *this, reduced_value);
+      return reduced_value;
     }
 
     PUGS_INLINE
@@ -169,9 +168,9 @@ max(const SubItemArrayPerItem<DataType, ItemOfItem, ConnectivityPtr>& sub_item_a
 
     PUGS_INLINE
     void
-    init(data_type& array) const
+    init(data_type& value) const
     {
-      array = std::numeric_limits<data_type>::min();
+      value = std::numeric_limits<data_type>::min();
     }
 
     PUGS_INLINE
@@ -217,103 +216,38 @@ max(const SubItemArrayPerItem<DataType, ItemOfItem, ConnectivityPtr>& sub_item_a
 
 template <typename DataType, typename ItemOfItem, typename ConnectivityPtr>
 std::remove_const_t<DataType>
-sum(const SubItemArrayPerItem<DataType, ItemOfItem, ConnectivityPtr>& item_array)
+sum(const SubItemArrayPerItem<DataType, ItemOfItem, ConnectivityPtr>& sub_item_array_per_item)
 {
   using SubItemArrayPerItemType = SubItemArrayPerItem<DataType, ItemOfItem, ConnectivityPtr>;
   constexpr ItemType item_type  = ItemOfItem::item_type;
-  using ItemIsOwnedType         = ItemValue<const bool, item_type>;
   using data_type               = std::remove_const_t<typename SubItemArrayPerItemType::data_type>;
   using index_type              = typename SubItemArrayPerItemType::index_type;
 
   static_assert(not std::is_same_v<data_type, bool>, "sum cannot be called on boolean data");
 
-  class SubItemArrayPerItemSum
-  {
-   private:
-    const SubItemArrayPerItemType& m_sub_item_array_per_item;
-    const ItemIsOwnedType m_is_owned;
-
-   public:
-    PUGS_INLINE
-    operator data_type()
-    {
-      data_type reduced_array;
-      parallel_reduce(m_sub_item_array_per_item.numberOfItems(), *this, reduced_array);
-      return reduced_array;
-    }
-
-    PUGS_INLINE
-    void
-    operator()(const index_type& item_id, data_type& data) const
-    {
-      if (m_is_owned[item_id]) {
-        auto sub_item_table = m_sub_item_array_per_item.itemTable(item_id);
-        for (size_t i = 0; i < sub_item_table.numberOfRows(); ++i) {
-          for (size_t j = 0; j < sub_item_table.numberOfColumns(); ++j) {
-            data += sub_item_table(i, j);
-          }
+  ItemValue<data_type, item_type> item_sum{*sub_item_array_per_item.connectivity_ptr()};
+  parallel_for(
+    sub_item_array_per_item.numberOfItems(), PUGS_LAMBDA(index_type item_id) {
+      auto sub_item_table          = sub_item_array_per_item.itemTable(item_id);
+      data_type sub_item_table_sum = [] {
+        if constexpr (std::is_arithmetic_v<data_type>) {
+          return 0;
+        } else {
+          static_assert(is_tiny_vector_v<data_type> or is_tiny_matrix_v<data_type>, "invalid data type");
+          return zero;
         }
-      }
-    }
+      }();
 
-    PUGS_INLINE
-    void
-    join(data_type& dst, const data_type& src) const
-    {
-      dst += src;
-    }
-
-    PUGS_INLINE
-    void
-    init(data_type& array) const
-    {
-      if constexpr (std::is_arithmetic_v<data_type>) {
-        array = 0;
-      } else {
-        static_assert(is_tiny_vector_v<data_type> or is_tiny_matrix_v<data_type>, "invalid data type");
-        array = zero;
+      for (size_t i = 0; i < sub_item_table.numberOfRows(); ++i) {
+        for (size_t j = 0; j < sub_item_table.numberOfColumns(); ++j) {
+          sub_item_table_sum += sub_item_table(i, j);
+        }
       }
-    }
 
-    PUGS_INLINE
-    SubItemArrayPerItemSum(const SubItemArrayPerItemType& item_array)
-      : m_sub_item_array_per_item(item_array), m_is_owned([&](const IConnectivity& connectivity) {
-          Assert((connectivity.dimension() > 0) and (connectivity.dimension() <= 3),
-                 "unexpected connectivity dimension");
-
-          switch (connectivity.dimension()) {
-          case 1: {
-            const auto& connectivity_1d = static_cast<const Connectivity1D&>(connectivity);
-            return connectivity_1d.isOwned<item_type>();
-            break;
-          }
-          case 2: {
-            const auto& connectivity_2d = static_cast<const Connectivity2D&>(connectivity);
-            return connectivity_2d.isOwned<item_type>();
-            break;
-          }
-          case 3: {
-            const auto& connectivity_3d = static_cast<const Connectivity3D&>(connectivity);
-            return connectivity_3d.isOwned<item_type>();
-            break;
-          }
-            // LCOV_EXCL_START
-          default: {
-            throw UnexpectedError("unexpected dimension");
-          }
-            // LCOV_EXCL_STOP
-          }
-        }(*item_array.connectivity_ptr()))
-    {
-      ;
-    }
-
-    PUGS_INLINE
-    ~SubItemArrayPerItemSum() = default;
-  };
+      item_sum[item_id] = sub_item_table_sum;
+    });
 
-  const DataType local_sum = SubItemArrayPerItemSum{item_array};
-  return parallel::allReduceSum(local_sum);
+  return sum(item_sum);
 }
 
 template <typename DataType, typename ItemOfItem, typename ConnectivityPtr>
diff --git a/src/mesh/SubItemValuePerItemUtils.hpp b/src/mesh/SubItemValuePerItemUtils.hpp
index 304acae23a31e610695a2b309e72e7d8035d01a5..575c39c5238877bb064883c54b7d08792547000b 100644
--- a/src/mesh/SubItemValuePerItemUtils.hpp
+++ b/src/mesh/SubItemValuePerItemUtils.hpp
@@ -4,13 +4,12 @@
 #include <utils/Messenger.hpp>
 
 #include <mesh/Connectivity.hpp>
+#include <mesh/ItemValueUtils.hpp>
 #include <mesh/SubItemValuePerItem.hpp>
 #include <mesh/Synchronizer.hpp>
 #include <mesh/SynchronizerManager.hpp>
 #include <utils/PugsTraits.hpp>
 
-#include <iostream>
-
 template <typename DataType, typename ItemOfItem, typename ConnectivityPtr>
 std::remove_const_t<DataType>
 min(const SubItemValuePerItem<DataType, ItemOfItem, ConnectivityPtr>& sub_item_value_per_item)
@@ -213,101 +212,28 @@ max(const SubItemValuePerItem<DataType, ItemOfItem, ConnectivityPtr>& sub_item_v
 
 template <typename DataType, typename ItemOfItem, typename ConnectivityPtr>
 std::remove_const_t<DataType>
-sum(const SubItemValuePerItem<DataType, ItemOfItem, ConnectivityPtr>& item_value)
+sum(const SubItemValuePerItem<DataType, ItemOfItem, ConnectivityPtr>& sub_item_value_per_item)
 {
   using SubItemValuePerItemType = SubItemValuePerItem<DataType, ItemOfItem, ConnectivityPtr>;
   constexpr ItemType item_type  = ItemOfItem::item_type;
-  using ItemIsOwnedType         = ItemValue<const bool, item_type>;
   using data_type               = std::remove_const_t<typename SubItemValuePerItemType::data_type>;
   using index_type              = typename SubItemValuePerItemType::index_type;
 
   static_assert(not std::is_same_v<data_type, bool>, "sum cannot be called on boolean data");
 
-  class SubItemValuePerItemSum
-  {
-   private:
-    const SubItemValuePerItemType& m_sub_item_value_per_item;
-    const ItemIsOwnedType m_is_owned;
-
-   public:
-    PUGS_INLINE
-    operator data_type()
-    {
-      data_type reduced_value;
-      parallel_reduce(m_sub_item_value_per_item.numberOfItems(), *this, reduced_value);
-      return reduced_value;
-    }
-
-    PUGS_INLINE
-    void
-    operator()(const index_type& item_id, data_type& data) const
-    {
-      if (m_is_owned[item_id]) {
-        auto sub_item_array = m_sub_item_value_per_item.itemArray(item_id);
-        for (size_t i = 0; i < sub_item_array.size(); ++i) {
-          data += sub_item_array[i];
-        }
-      }
-    }
-
-    PUGS_INLINE
-    void
-    join(data_type& dst, const data_type& src) const
-    {
-      dst += src;
-    }
-
-    PUGS_INLINE
-    void
-    init(data_type& value) const
-    {
-      if constexpr (std::is_arithmetic_v<data_type>) {
-        value = 0;
-      } else {
-        static_assert(is_tiny_vector_v<data_type> or is_tiny_matrix_v<data_type>, "invalid data type");
-        value = zero;
+  ItemValue<data_type, item_type> item_sum{*sub_item_value_per_item.connectivity_ptr()};
+  parallel_for(
+    sub_item_value_per_item.numberOfItems(), PUGS_LAMBDA(index_type item_id) {
+      auto sub_item_array          = sub_item_value_per_item.itemArray(item_id);
+      data_type sub_item_array_sum = sub_item_array[0];
+      for (size_t i = 1; i < sub_item_array.size(); ++i) {
+        sub_item_array_sum += sub_item_array[i];
       }
-    }
-
-    PUGS_INLINE
-    SubItemValuePerItemSum(const SubItemValuePerItemType& item_value)
-      : m_sub_item_value_per_item(item_value), m_is_owned([&](const IConnectivity& connectivity) {
-          Assert((connectivity.dimension() > 0) and (connectivity.dimension() <= 3),
-                 "unexpected connectivity dimension");
-
-          switch (connectivity.dimension()) {
-          case 1: {
-            const auto& connectivity_1d = static_cast<const Connectivity1D&>(connectivity);
-            return connectivity_1d.isOwned<item_type>();
-            break;
-          }
-          case 2: {
-            const auto& connectivity_2d = static_cast<const Connectivity2D&>(connectivity);
-            return connectivity_2d.isOwned<item_type>();
-            break;
-          }
-          case 3: {
-            const auto& connectivity_3d = static_cast<const Connectivity3D&>(connectivity);
-            return connectivity_3d.isOwned<item_type>();
-            break;
-          }
-            // LCOV_EXCL_START
-          default: {
-            throw UnexpectedError("unexpected dimension");
-          }
-            // LCOV_EXCL_STOP
-          }
-        }(*item_value.connectivity_ptr()))
-    {
-      ;
-    }
 
-    PUGS_INLINE
-    ~SubItemValuePerItemSum() = default;
-  };
+      item_sum[item_id] = sub_item_array_sum;
+    });
 
-  const DataType local_sum = SubItemValuePerItemSum{item_value};
-  return parallel::allReduceSum(local_sum);
+  return sum(item_sum);
 }
 
 template <typename DataType, typename ItemOfItem, typename ConnectivityPtr>
diff --git a/src/scheme/DiscreteFunctionP0.hpp b/src/scheme/DiscreteFunctionP0.hpp
index 6587a19f2c77d1f063f38765b4e768d92798f7a4..5a49b9db3d7f6ecd6b589f8a58703ec02d311f6b 100644
--- a/src/scheme/DiscreteFunctionP0.hpp
+++ b/src/scheme/DiscreteFunctionP0.hpp
@@ -708,79 +708,19 @@ class DiscreteFunctionP0
     return sum(f.m_cell_values);
   }
 
- private:
-  class Integrator
-  {
-   private:
-    const DiscreteFunctionP0 m_function;
-    CellValue<const double> m_cell_volume;
-    CellValue<const bool> m_cell_is_owned;
-
-   public:
-    PUGS_INLINE
-    operator std::decay_t<data_type>()
-    {
-      std::decay_t<data_type> reduced_value;
-      if constexpr (std::is_arithmetic_v<std::decay_t<data_type>>) {
-        reduced_value = 0;
-      } else {
-        static_assert(is_tiny_vector_v<std::decay_t<data_type>> or is_tiny_matrix_v<std::decay_t<data_type>>,
-                      "invalid data type");
-        reduced_value = zero;
-      }
-      parallel_reduce(m_cell_volume.numberOfItems(), *this, reduced_value);
-      return reduced_value;
-    }
-
-    PUGS_INLINE
-    void
-    operator()(const CellId& cell_id, std::decay_t<data_type>& data) const
-    {
-      if (m_cell_is_owned[cell_id]) {
-        data += m_cell_volume[cell_id] * m_function[cell_id];
-      }
-    }
-
-    PUGS_INLINE
-    void
-    join(std::decay_t<data_type>& dst, const std::decay_t<data_type>& src) const
-    {
-      dst += src;
-    }
-
-    PUGS_INLINE
-    void
-    init(std::decay_t<data_type>& value) const
-    {
-      if constexpr (std::is_arithmetic_v<data_type>) {
-        value = 0;
-      } else {
-        static_assert(is_tiny_vector_v<std::decay_t<data_type>> or is_tiny_matrix_v<std::decay_t<data_type>>,
-                      "invalid data type");
-        value = zero;
-      }
-    }
-
-    PUGS_INLINE
-    Integrator(const DiscreteFunctionP0& function) : m_function(function)
-    {
-      const Mesh<Connectivity<Dimension>>& mesh = *function.m_mesh;
-      m_cell_volume                             = MeshDataManager::instance().getMeshData(mesh).Vj();
-      m_cell_is_owned                           = mesh.connectivity().cellIsOwned();
-    }
-
-    PUGS_INLINE
-    ~Integrator() = default;
-  };
-
- public:
   PUGS_INLINE friend DataType
   integrate(const DiscreteFunctionP0& f)
   {
     Assert(f.m_cell_values.isBuilt());
-    const DataType integral = Integrator{f};
 
-    return parallel::allReduceSum(integral);
+    const Mesh<Connectivity<Dimension>>& mesh = *f.m_mesh;
+    CellValue<const double> cell_volume       = MeshDataManager::instance().getMeshData(mesh).Vj();
+
+    CellValue<std::remove_const_t<DataType>> f_v{mesh.connectivity()};
+    parallel_for(
+      mesh.numberOfCells(), PUGS_LAMBDA(CellId cell_id) { f_v[cell_id] = cell_volume[cell_id] * f[cell_id]; });
+
+    return sum(f_v);
   }
 
   DiscreteFunctionP0(const std::shared_ptr<const MeshType>& mesh) : m_mesh{mesh}, m_cell_values{mesh->connectivity()} {}
diff --git a/src/utils/Array.hpp b/src/utils/Array.hpp
index 4943d4c08d983f67d1550262e34aa80030d49bf1..fa3299bea5fefa99be94f4038ebae6da2c9779b8 100644
--- a/src/utils/Array.hpp
+++ b/src/utils/Array.hpp
@@ -6,6 +6,8 @@
 #include <utils/PugsAssert.hpp>
 #include <utils/PugsMacros.hpp>
 #include <utils/PugsUtils.hpp>
+#include <utils/ReproductibleSumManager.hpp>
+#include <utils/ReproductibleSumUtils.hpp>
 #include <utils/Types.hpp>
 
 #include <algorithm>
@@ -24,8 +26,7 @@ class [[nodiscard]] Array
     const size_t m_size;
 
    public:
-    friend std::ostream&
-    operator<<(std::ostream& os, const UnsafeArrayView& x)
+    friend std::ostream& operator<<(std::ostream& os, const UnsafeArrayView& x)
     {
       if (x.size() > 0) {
         os << 0 << ':' << NaNHelper(x[0]);
@@ -36,21 +37,18 @@ class [[nodiscard]] Array
       return os;
     }
 
-    [[nodiscard]] PUGS_INLINE size_t
-    size() const
+    [[nodiscard]] PUGS_INLINE size_t size() const
     {
       return m_size;
     }
 
-    [[nodiscard]] PUGS_INLINE DataType&
-    operator[](size_t i) const
+    [[nodiscard]] PUGS_INLINE DataType& operator[](size_t i) const
     {
       Assert(i < m_size, "invalid index");
       return m_values[i];
     }
 
-    PUGS_INLINE void
-    fill(const DataType& data) const
+    PUGS_INLINE void fill(const DataType& data) const
     {
       for (size_t i = 0; i < m_size; ++i) {
         m_values[i] = data;
@@ -58,7 +56,7 @@ class [[nodiscard]] Array
     }
 
     UnsafeArrayView& operator=(const UnsafeArrayView&) = delete;
-    UnsafeArrayView& operator=(UnsafeArrayView&&)      = delete;
+    UnsafeArrayView& operator=(UnsafeArrayView&&) = delete;
 
     UnsafeArrayView(const Array<DataType>& array, index_type begin, index_type size)
       : m_values{&array[begin]}, m_size{size}
@@ -82,14 +80,12 @@ class [[nodiscard]] Array
   friend Array<std::add_const_t<DataType>>;
 
  public:
-  [[nodiscard]] PUGS_INLINE size_t
-  size() const noexcept
+  [[nodiscard]] PUGS_INLINE size_t size() const noexcept
   {
     return m_values.extent(0);
   }
 
-  [[nodiscard]] friend PUGS_INLINE Array<std::remove_const_t<DataType>>
-  copy(const Array<DataType>& source)
+  [[nodiscard]] friend PUGS_INLINE Array<std::remove_const_t<DataType>> copy(const Array<DataType>& source)
   {
     Array<std::remove_const_t<DataType>> image(source.size());
     Kokkos::deep_copy(image.m_values, source.m_values);
@@ -97,8 +93,8 @@ class [[nodiscard]] Array
     return image;
   }
 
-  friend PUGS_INLINE void
-  copy_to(const Array<DataType>& source, const Array<std::remove_const_t<DataType>>& destination)
+  friend PUGS_INLINE void copy_to(const Array<DataType>& source,
+                                  const Array<std::remove_const_t<DataType>>& destination)
   {
     Assert(source.size() == destination.size(), "incompatible Array sizes");
     Kokkos::deep_copy(destination.m_values, source.m_values);
@@ -112,16 +108,14 @@ class [[nodiscard]] Array
                                                                       typename Array<DataType2>::index_type begin,
                                                                       typename Array<DataType2>::index_type size);
 
-  [[nodiscard]] PUGS_INLINE DataType&
-  operator[](index_type i) const noexcept(NO_ASSERT)
+  [[nodiscard]] PUGS_INLINE DataType& operator[](index_type i) const noexcept(NO_ASSERT)
   {
     Assert(i < m_values.extent(0), "invalid index");
     return m_values[i];
   }
 
   PUGS_INLINE
-  void
-  fill(const DataType& data) const
+  void fill(const DataType& data) const
   {
     static_assert(not std::is_const_v<DataType>, "Cannot modify Array of const");
 
@@ -129,8 +123,7 @@ class [[nodiscard]] Array
   }
 
   template <typename DataType2>
-  PUGS_INLINE Array&
-  operator=(const Array<DataType2>& array) noexcept
+  PUGS_INLINE Array& operator=(const Array<DataType2>& array) noexcept
   {
     // ensures that DataType is the same as source DataType2
     static_assert(std::is_same<std::remove_const_t<DataType>, std::remove_const_t<DataType2>>(),
@@ -173,8 +166,7 @@ class [[nodiscard]] Array
 #endif   // NDEBUG
   }
 
-  friend std::ostream&
-  operator<<(std::ostream& os, const Array& x)
+  friend std::ostream& operator<<(std::ostream& os, const Array& x)
   {
     if (x.size() > 0) {
       os << 0 << ':' << NaNHelper(x[0]);
@@ -192,14 +184,13 @@ class [[nodiscard]] Array
   Array(const Array&) = default;
 
   template <typename DataType2>
-  PUGS_INLINE
-  Array(const Array<DataType2>& array) noexcept
+  PUGS_INLINE Array(const Array<DataType2>& array) noexcept
   {
     this->operator=(array);
   }
 
   PUGS_INLINE
-  Array(Array&&) = default;
+  Array(Array &&) = default;
 
   PUGS_INLINE
   ~Array() = default;
@@ -373,8 +364,9 @@ template <typename DataType>
 [[nodiscard]] std::remove_const_t<DataType>
 sum(const Array<DataType>& array)
 {
-  using ArrayType  = Array<DataType>;
-  using data_type  = std::remove_const_t<DataType>;
+  using ArrayType = Array<DataType>;
+  using data_type = std::remove_const_t<DataType>;
+
   using index_type = typename ArrayType::index_type;
 
   static_assert(not std::is_same_v<data_type, bool>, "sum cannot be called on boolean data");
@@ -429,7 +421,26 @@ sum(const Array<DataType>& array)
     ~ArraySum() = default;
   };
 
-  return ArraySum(array);
+  if (ReproductibleSumManager::reproductibleSums()) {
+    if constexpr (std::is_floating_point_v<data_type>) {
+      ReproductibleScalarSum r_sum(array);
+      return r_sum.getSum();
+    } else if constexpr (not std::is_arithmetic_v<data_type>) {
+      if constexpr (is_tiny_vector_v<data_type> and std::is_floating_point_v<typename data_type::data_type>) {
+        ReproductibleTinyVectorSum r_sum(array);
+        return r_sum.getSum();
+      } else if constexpr (is_tiny_matrix_v<data_type> and std::is_floating_point_v<typename data_type::data_type>) {
+        ReproductibleTinyMatrixSum r_sum(array);
+        return r_sum.getSum();
+      } else {
+        return ArraySum(array);
+      }
+    } else {
+      return ArraySum(array);
+    }
+  } else {
+    return ArraySum(array);
+  }
 }
 
 #endif   // ARRAY_HPP
diff --git a/src/utils/CMakeLists.txt b/src/utils/CMakeLists.txt
index 2289a91d5f74c566a18af0f9fbcc8bf85ed844bb..3e61d14bb59cf96b873e13c55d92a6e6d9fe3f07 100644
--- a/src/utils/CMakeLists.txt
+++ b/src/utils/CMakeLists.txt
@@ -13,6 +13,7 @@ add_library(
   PETScWrapper.cpp
   PugsUtils.cpp
   RandomEngine.cpp
+  ReproductibleSumManager.cpp
   RevisionInfo.cpp
   SignalManager.cpp
   SLEPcWrapper.cpp
diff --git a/src/utils/Messenger.hpp b/src/utils/Messenger.hpp
index 416dbdf1d65e8b028b9ed39fff2a2acb4ebcb2d5..6685037b1f47249eef70cfb4255378b08df25342 100644
--- a/src/utils/Messenger.hpp
+++ b/src/utils/Messenger.hpp
@@ -23,7 +23,7 @@ namespace parallel
 {
 class Messenger
 {
- private:
+ public:
   struct helper
   {
 #ifdef PUGS_HAS_MPI
@@ -517,7 +517,7 @@ class Messenger
       MPI_Allreduce(&data, &data_sum, 1, mpi_datatype, MPI_SUM, MPI_COMM_WORLD);
 
       return data_sum;
-    } else if (is_trivially_castable<DataType>) {
+    } else if constexpr (is_trivially_castable<DataType>) {
       using InnerDataType = typename DataType::data_type;
 
       MPI_Datatype mpi_datatype = Messenger::helper::mpiType<InnerDataType>();
@@ -526,6 +526,8 @@ class Messenger
       MPI_Allreduce(&data, &data_sum, size, mpi_datatype, MPI_SUM, MPI_COMM_WORLD);
 
       return data_sum;
+    } else {
+      throw UnexpectedError("invalid data type for reduce sum");
     }
 #else    // PUGS_HAS_MPI
     return data;
diff --git a/src/utils/PugsUtils.cpp b/src/utils/PugsUtils.cpp
index 60380902556f88f774b698c4f7f196ea5192930e..538d700745bb7f24749fa7e878c28a64db0e0bfd 100644
--- a/src/utils/PugsUtils.cpp
+++ b/src/utils/PugsUtils.cpp
@@ -6,6 +6,7 @@
 #include <utils/FPEManager.hpp>
 #include <utils/Messenger.hpp>
 #include <utils/PETScWrapper.hpp>
+#include <utils/ReproductibleSumManager.hpp>
 #include <utils/RevisionInfo.hpp>
 #include <utils/SLEPcWrapper.hpp>
 #include <utils/SignalManager.hpp>
@@ -117,6 +118,10 @@ initialize(int& argc, char* argv[])
     bool pause_on_error = false;
     app.add_flag("-p,--pause-on-error", pause_on_error, "Pause for debugging on unexpected error [default: false]");
 
+    bool reproductible_sums = true;
+    app.add_flag("--reproductible-sums,!--no-reproductible-sums", reproductible_sums,
+                 "Special treatment of array sums to ensure reproducibility [default: true]");
+
     std::atexit([]() { std::cout << rang::style::reset; });
     try {
       app.parse(argc, argv);
@@ -130,6 +135,7 @@ initialize(int& argc, char* argv[])
     ConsoleManager::setShowPreamble(show_preamble);
     ConsoleManager::init(enable_color);
     SignalManager::setPauseForDebug(pause_on_error);
+    ReproductibleSumManager::setReproductibleSums(reproductible_sums);
   }
 
   PETScWrapper::initialize(argc, argv);
diff --git a/src/utils/ReproductibleSumMPI.hpp b/src/utils/ReproductibleSumMPI.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..91caafded09f11b31d29427d434c12704b5869e3
--- /dev/null
+++ b/src/utils/ReproductibleSumMPI.hpp
@@ -0,0 +1,141 @@
+#ifndef REPRODUCTIBLE_SUM_MPI_HPP
+#define REPRODUCTIBLE_SUM_MPI_HPP
+
+#include <utils/Messenger.hpp>
+#include <utils/ReproductibleSumUtils.hpp>
+#include <utils/pugs_config.hpp>
+
+#ifdef PUGS_HAS_MPI
+
+// Wraps the bin-merging operation of a reproducible sum (scalar, TinyVector or
+// TinyMatrix flavor) as an MPI user-defined reduction operator.
+template <typename RSumType>
+struct ReproductibeMPIReducer
+{
+  static void
+  all_reduce_sum_embedder(void* local_value, void* sum_value, int* length, MPI_Datatype*)
+  {
+    Assert(*length == 1, "reduction must be performed on single bin");
+    using Bin        = typename RSumType::Bin;
+    Bin* p_local_bin = reinterpret_cast<Bin*>(local_value);
+    Bin* p_sum_bin   = reinterpret_cast<Bin*>(sum_value);
+
+    RSumType::addBinTo(*p_local_bin, *p_sum_bin);
+  }
+};
+
+template <typename ArrayT, typename BoolArrayT>
+auto
+allReduceReproductibleSum(const ReproductibleScalarSum<ArrayT, BoolArrayT>& s)
+{
+  using DataType = std::decay_t<typename ArrayT::data_type>;
+  using RSumType = std::decay_t<decltype(s)>;
+  using BinType  = typename RSumType::Bin;
+
+  BinType local_bin = s.getSummationBin();
+
+  MPI_Datatype mpi_bin_type;
+  MPI_Type_contiguous(sizeof(BinType) / sizeof(DataType), parallel::Messenger::helper::mpiType<DataType>(),
+                      &mpi_bin_type);
+  MPI_Type_commit(&mpi_bin_type);
+
+  MPI_Op mpi_bin_sum_op;
+  MPI_Op_create(ReproductibeMPIReducer<RSumType>::all_reduce_sum_embedder, 1, &mpi_bin_sum_op);
+
+  BinType sum_bin = zero;
+  MPI_Allreduce(&local_bin, &sum_bin, 1, mpi_bin_type, mpi_bin_sum_op, MPI_COMM_WORLD);
+
+  // release the op and datatype created above: not freeing them would leak
+  // one MPI handle pair per call
+  MPI_Op_free(&mpi_bin_sum_op);
+  MPI_Type_free(&mpi_bin_type);
+
+  return sum_bin;
+}
+
+template <typename ArrayT, typename BoolArrayT>
+auto
+allReduceReproductibleSum(const ReproductibleTinyVectorSum<ArrayT, BoolArrayT>& s)
+{
+  using TinyVectorType = std::decay_t<typename ArrayT::data_type>;
+  using DataType       = typename TinyVectorType::data_type;
+  using RSumType       = std::decay_t<decltype(s)>;
+  using BinType        = typename RSumType::Bin;
+
+  BinType local_bin = s.getSummationBin();
+
+  MPI_Datatype mpi_bin_type;
+  MPI_Type_contiguous(sizeof(BinType) / sizeof(DataType), parallel::Messenger::helper::mpiType<DataType>(),
+                      &mpi_bin_type);
+  MPI_Type_commit(&mpi_bin_type);
+
+  MPI_Op mpi_bin_sum_op;
+  MPI_Op_create(ReproductibeMPIReducer<RSumType>::all_reduce_sum_embedder, 1, &mpi_bin_sum_op);
+
+  BinType sum_bin = zero;
+  MPI_Allreduce(&local_bin, &sum_bin, 1, mpi_bin_type, mpi_bin_sum_op, MPI_COMM_WORLD);
+
+  // release the op and datatype created above: not freeing them would leak
+  // one MPI handle pair per call
+  MPI_Op_free(&mpi_bin_sum_op);
+  MPI_Type_free(&mpi_bin_type);
+
+  return sum_bin;
+}
+
+template <typename ArrayT, typename BoolArrayT>
+auto
+allReduceReproductibleSum(const ReproductibleTinyMatrixSum<ArrayT, BoolArrayT>& s)
+{
+  using TinyMatrixType = std::decay_t<typename ArrayT::data_type>;
+  using DataType       = typename TinyMatrixType::data_type;
+  using RSumType       = std::decay_t<decltype(s)>;
+  using BinType        = typename RSumType::Bin;
+
+  BinType local_bin = s.getSummationBin();
+
+  MPI_Datatype mpi_bin_type;
+  MPI_Type_contiguous(sizeof(BinType) / sizeof(DataType), parallel::Messenger::helper::mpiType<DataType>(),
+                      &mpi_bin_type);
+  MPI_Type_commit(&mpi_bin_type);
+
+  MPI_Op mpi_bin_sum_op;
+  MPI_Op_create(ReproductibeMPIReducer<RSumType>::all_reduce_sum_embedder, 1, &mpi_bin_sum_op);
+
+  BinType sum_bin = zero;
+  MPI_Allreduce(&local_bin, &sum_bin, 1, mpi_bin_type, mpi_bin_sum_op, MPI_COMM_WORLD);
+
+  // release the op and datatype created above: not freeing them would leak
+  // one MPI handle pair per call
+  MPI_Op_free(&mpi_bin_sum_op);
+  MPI_Type_free(&mpi_bin_type);
+
+  return sum_bin;
+}
+
+#else   // PUGS_HAS_MPI
+
+template <typename ArrayT, typename BoolArrayT>
+auto
+allReduceReproductibleSum(const ReproductibleScalarSum<ArrayT, BoolArrayT>& s)
+{
+  return s.getSummationBin();
+}
+
+template <typename ArrayT, typename BoolArrayT>
+auto
+allReduceReproductibleSum(const ReproductibleTinyVectorSum<ArrayT, BoolArrayT>& s)
+{
+  return s.getSummationBin();
+}
+
+template <typename ArrayT, typename BoolArrayT>
+auto
+allReduceReproductibleSum(const ReproductibleTinyMatrixSum<ArrayT, BoolArrayT>& s)
+{
+  return s.getSummationBin();
+}
+
+#endif   // PUGS_HAS_MPI
+
+#endif   // REPRODUCTIBLE_SUM_MPI_HPP
diff --git a/src/utils/ReproductibleSumManager.cpp b/src/utils/ReproductibleSumManager.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7f2851c5b740df50578d4f10800ad6255985d5d4
--- /dev/null
+++ b/src/utils/ReproductibleSumManager.cpp
@@ -0,0 +1,15 @@
+#include <utils/ReproductibleSumManager.hpp>
+
+bool ReproductibleSumManager::s_reproductible_sums = true;
+
+void
+ReproductibleSumManager::setReproductibleSums(bool do_reproductible_sums)
+{
+  s_reproductible_sums = do_reproductible_sums;
+}
+
+bool
+ReproductibleSumManager::reproductibleSums()
+{
+  return s_reproductible_sums;
+}
diff --git a/src/utils/ReproductibleSumManager.hpp b/src/utils/ReproductibleSumManager.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..cd0b3a2df6006024ab4fa86e4ae88857c3a25233
--- /dev/null
+++ b/src/utils/ReproductibleSumManager.hpp
@@ -0,0 +1,14 @@
+#ifndef REPRODUCTIBLE_SUM_MANAGER_HPP
+#define REPRODUCTIBLE_SUM_MANAGER_HPP
+
+class ReproductibleSumManager
+{
+ public:
+  static bool reproductibleSums();
+  static void setReproductibleSums(bool);
+
+ private:
+  static bool s_reproductible_sums;
+};
+
+#endif   // REPRODUCTIBLE_SUM_MANAGER_HPP
diff --git a/src/utils/ReproductibleSumUtils.hpp b/src/utils/ReproductibleSumUtils.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4b41980ccfb94e7705a83e201379e4dd16660e88
--- /dev/null
+++ b/src/utils/ReproductibleSumUtils.hpp
@@ -0,0 +1,877 @@
+#ifndef REPRODUCTIBLE_SUM_UTILS_HPP
+#define REPRODUCTIBLE_SUM_UTILS_HPP
+
+#include <utils/PugsUtils.hpp>
+#include <utils/Types.hpp>
+
+template <typename DataType>
+class Array;
+
+namespace reproductible_sum_utils
+{
+template <size_t NumberOfBits>
+struct IntegerFromBitSize
+{
+};
+
+template <>
+struct IntegerFromBitSize<8>
+{
+  using integer_t = std::int8_t;
+};
+
+template <>
+struct IntegerFromBitSize<16>
+{
+  using integer_t = std::int16_t;
+};
+
+template <>
+struct IntegerFromBitSize<32>
+{
+  using integer_t = std::int32_t;
+};
+
+template <>
+struct IntegerFromBitSize<64>
+{
+  using integer_t = std::int64_t;
+};
+
+template <typename DataType>
+struct IntegerType
+{
+  using integer_t = typename IntegerFromBitSize<sizeof(DataType) * 8>::integer_t;
+};
+
+template <typename DataType>
+DataType
+ulp(const DataType& x) noexcept(NO_ASSERT)
+{
+  static_assert(std::is_floating_point_v<DataType>, "expecting floating point value");
+
+  if (x == 0) {
+    return std::numeric_limits<DataType>::denorm_min();
+  }
+
+  return std::pow(DataType{2}, std::ilogb(std::abs(x)) - std::numeric_limits<DataType>::digits);
+}
+
+template <typename DataType>
+DataType
+ufp(const DataType& x) noexcept(NO_ASSERT)
+{
+  static_assert(std::is_floating_point_v<DataType>, "expecting floating point value");
+
+  return std::pow(DataType{2}, std::ilogb(std::abs(x)));
+}
+
+// Useful bits per bin
+template <typename DataType>
+constexpr inline size_t bin_size;
+template <>
+constexpr inline size_t bin_size<double> = 40;
+template <>
+constexpr inline size_t bin_size<float> = 12;
+
+// IEEE 754 epsilon
+template <typename DataType>
+constexpr inline double bin_epsilon;
+template <>
+constexpr inline double bin_epsilon<double> = std::pow(2., -53);
+template <>
+constexpr inline double bin_epsilon<float> = std::pow(2., -24);
+
+// number of bins: improves precision
+template <typename DataType>
+constexpr inline size_t bin_number;
+
+template <>
+constexpr inline size_t bin_number<double> = 3;
+template <>
+constexpr inline size_t bin_number<float> = 4;
+
+// max local sum size to avoid overflow
+template <typename DataType>
+constexpr inline size_t bin_max_size;
+
+template <>
+constexpr inline size_t bin_max_size<double> = 2048;
+template <>
+constexpr inline size_t bin_max_size<float> = 1024;
+
+struct NoMask
+{
+  PUGS_INLINE bool operator[](size_t) const
+  {
+    return true;
+  }
+};
+
+}   // namespace reproductible_sum_utils
+
+template <typename ArrayT, typename MaskType = reproductible_sum_utils::NoMask>
+class ReproductibleScalarSum
+{
+ public:
+  using DataType = std::decay_t<typename ArrayT::data_type>;
+  static_assert(std::is_floating_point_v<DataType>);
+
+  static constexpr size_t K     = reproductible_sum_utils::bin_number<DataType>;
+  static constexpr DataType eps = reproductible_sum_utils::bin_epsilon<DataType>;
+  static constexpr size_t W     = reproductible_sum_utils::bin_size<DataType>;
+
+  struct Bin
+  {
+    std::array<DataType, K> S;   // sum
+    std::array<DataType, K> C;   // carry
+
+    Bin& operator=(const Bin&) = default;
+    Bin& operator=(Bin&&) = default;
+
+    Bin(Bin&&)      = default;
+    Bin(const Bin&) = default;
+
+    Bin(ZeroType)
+    {
+      for (size_t k = 0; k < K; ++k) {
+        S[k] = 0.75 * eps * std::pow(DataType{2}, (K - k - 1) * W);
+        C[k] = 0;
+      }
+    }
+
+    Bin() = default;
+
+    ~Bin() = default;
+  };
+
+ private:
+  Bin m_summation_bin;
+
+  PUGS_INLINE
+  static void
+  _shift(const size_t g, Bin& bin) noexcept(NO_ASSERT)
+  {
+    using namespace reproductible_sum_utils;
+
+    for (size_t k = K - 1; k >= g; --k) {
+      bin.S[k] = bin.S[k - g];
+      bin.C[k] = bin.C[k - g];
+    }
+    for (size_t k = 0; k < std::min(K, g); ++k) {
+      bin.S[k] = 1.5 * std::pow(DataType{2}, g * W) * ufp(bin.S[k]);
+      bin.C[k] = 0;
+    }
+  }
+
+  PUGS_INLINE static void
+  _update(const DataType& m, Bin& bin) noexcept(NO_ASSERT)
+  {
+    Assert(m > 0);
+
+    using namespace reproductible_sum_utils;
+    if (m >= std::pow(DataType{2}, W - 1.) * ulp(bin.S[0])) {
+      const size_t shift = 1 + std::floor(std::log2(m / (std::pow(DataType{2}, W - 1.) * ulp(bin.S[0]))) / W);
+
+      _shift(shift, bin);
+    }
+  }
+
+  PUGS_INLINE
+  void static _split2(DataType& S, DataType& x) noexcept(NO_ASSERT)
+  {
+    using namespace reproductible_sum_utils;
+    union
+    {
+      DataType as_DataType;
+      typename IntegerType<DataType>::integer_t as_integer;
+    } x_bar;
+
+    x_bar.as_DataType = x;
+    x_bar.as_integer |= 0x1;
+
+    const DataType S0 = S;
+    S += x_bar.as_DataType;
+    x -= S - S0;
+  }
+
+  PUGS_INLINE static void
+  _renormalize(Bin& bin) noexcept(NO_ASSERT)
+  {
+    using namespace reproductible_sum_utils;
+
+    for (size_t k = 0; k < K; ++k) {
+      if (bin.S[k] >= 1.75 * ufp(bin.S[k])) {
+        bin.S[k] -= 0.25 * ufp(bin.S[k]);
+        bin.C[k] += 1;
+      } else if (bin.S[k] < 1.25 * ufp(bin.S[k])) {
+        bin.S[k] += 0.5 * ufp(bin.S[k]);
+        bin.C[k] -= 2;
+      } else if (bin.S[k] < 1.5 * ufp(bin.S[k])) {
+        bin.S[k] += 0.25 * ufp(bin.S[k]);
+        bin.C[k] -= 1;
+      }
+    }
+  }
+
+ public:
+  static void
+  addBinTo(Bin& bin, Bin& bin_sum)
+  {
+    using namespace reproductible_sum_utils;
+
+    DataType ulp_bin = ulp(bin.S[0]);
+    DataType ulp_sum = ulp(bin_sum.S[0]);
+
+    if (ulp_bin < ulp_sum) {
+      const size_t shift = std::floor(std::log2(ulp_sum / ulp_bin) / W);
+      if (shift > 0) {
+        _shift(shift, bin);
+      }
+    } else if (ulp_bin > ulp_sum) {
+      const size_t shift = std::floor(std::log2(ulp_bin / ulp_sum) / W);
+      if (shift > 0) {
+        _shift(shift, bin_sum);
+      }
+    }
+
+    for (size_t k = 0; k < K; ++k) {
+      bin_sum.S[k] += bin.S[k] - 1.5 * ufp(bin.S[k]);
+      bin_sum.C[k] += bin.C[k];
+    }
+
+    _renormalize(bin_sum);
+  }
+
+  PUGS_INLINE
+  static DataType
+  getValue(const Bin& bin) noexcept(NO_ASSERT)
+  {
+    using namespace reproductible_sum_utils;
+
+    DataType value = 0;
+    for (size_t k = 0; k < K; ++k) {
+      value += 0.25 * (bin.C[k] - 6) * ufp(bin.S[k]) + bin.S[k];
+    }
+    return value;
+  }
+
+  PUGS_INLINE Bin
+  getSummationBin() const noexcept(NO_ASSERT)
+  {
+    return m_summation_bin;
+  }
+
+  PUGS_INLINE DataType
+  getSum() const noexcept(NO_ASSERT)
+  {
+    return getValue(m_summation_bin);
+  }
+
+  ReproductibleScalarSum(const ArrayT& array, const MaskType mask = reproductible_sum_utils::NoMask{})
+  {
+    if constexpr (not std::is_same_v<MaskType, reproductible_sum_utils::NoMask>) {
+      static_assert(std::is_same_v<std::decay_t<typename MaskType::data_type>, bool>,
+                    "when provided, mask must be an array of bool");
+    }
+
+    using TeamPolicyT = Kokkos::TeamPolicy<Kokkos::IndexType<int>>;
+    using TeamMemberT = TeamPolicyT::member_type;
+
+    int nx = reproductible_sum_utils::bin_max_size<DataType>;
+    // ceil division: every block must contain at most nx values, so that the
+    // overflow bound (bin_max_size) of the summation algorithm is respected
+    int ny = std::max((array.size() + nx - 1) / nx, 1ul);
+
+    const TeamPolicyT policy(ny, Kokkos::AUTO());
+
+    Array<Bin> bin_by_thread(policy.team_size() * policy.league_size());
+    bin_by_thread.fill(zero);
+
+    Array<DataType> local_max(policy.team_size() * policy.league_size());
+    local_max.fill(0);
+
+    parallel_for(
+      policy, PUGS_LAMBDA(const TeamMemberT& member) {
+        const int i_team   = member.league_rank();
+        const int i_thread = member.team_rank();
+
+        const int thread_id = i_team * member.team_size() + i_thread;
+
+        const int league_start = nx * i_team;
+        const int block_size   = [&] {
+          int size = nx;
+          if (i_team == member.league_size() - 1) {
+            size = array.size() - league_start;
+          }
+          return size;
+        }();
+
+        parallel_for(
+          Kokkos::TeamThreadRange(member, block_size), PUGS_LAMBDA(int i) {
+            if (mask[league_start + i]) {
+              DataType& m        = local_max[thread_id];
+              DataType abs_value = std::abs(array[league_start + i]);
+              if (abs_value > m) {
+                m = abs_value;
+              }
+            }
+          });
+
+        _update(local_max[thread_id], bin_by_thread[thread_id]);
+
+        parallel_for(
+          Kokkos::TeamThreadRange(member, block_size), PUGS_LAMBDA(int i) {
+            if (mask[league_start + i]) {
+              DataType x = array[league_start + i];
+              for (size_t k = 0; k < K; ++k) {
+                _split2(bin_by_thread[thread_id].S[k], x);
+              }
+            }
+          });
+
+        _renormalize(bin_by_thread[thread_id]);
+      });
+
+    m_summation_bin = bin_by_thread[0];
+    for (size_t i = 1; i < bin_by_thread.size(); ++i) {
+      addBinTo(bin_by_thread[i], m_summation_bin);
+    }
+  }
+
+  ~ReproductibleScalarSum() = default;
+};
+
+template <typename ArrayT, typename MaskType = reproductible_sum_utils::NoMask>
+class ReproductibleTinyVectorSum
+{
+ public:
+  using TinyVectorType = std::decay_t<typename ArrayT::data_type>;
+  static_assert(is_tiny_vector_v<TinyVectorType>);
+
+  using DataType = std::decay_t<typename TinyVectorType::data_type>;
+  static_assert(std::is_floating_point_v<DataType>);
+
+  static constexpr size_t K     = reproductible_sum_utils::bin_number<DataType>;
+  static constexpr DataType eps = reproductible_sum_utils::bin_epsilon<DataType>;
+  static constexpr size_t W     = reproductible_sum_utils::bin_size<DataType>;
+
+  struct Bin
+  {
+    std::array<TinyVectorType, K> S;   // sum
+    std::array<TinyVectorType, K> C;   // carry
+
+    Bin& operator=(const Bin&) = default;
+    Bin& operator=(Bin&&) = default;
+
+    Bin(Bin&&)      = default;
+    Bin(const Bin&) = default;
+
+    Bin(ZeroType)
+    {
+      for (size_t k = 0; k < K; ++k) {
+        const DataType init_value = 0.75 * eps * std::pow(DataType{2}, (K - k - 1) * W);
+        for (size_t i_component = 0; i_component < TinyVectorType::Dimension; ++i_component) {
+          S[k][i_component] = init_value;
+        }
+        C[k] = zero;
+      }
+    }
+
+    Bin() = default;
+
+    ~Bin() = default;
+  };
+
+ private:
+  Bin m_summation_bin;
+
+  PUGS_INLINE
+  static void
+  _shift(const size_t g, Bin& bin, const size_t& i_component) noexcept(NO_ASSERT)
+  {
+    using namespace reproductible_sum_utils;
+
+    for (size_t k = K - 1; k >= g; --k) {
+      bin.S[k][i_component] = bin.S[k - g][i_component];
+      bin.C[k][i_component] = bin.C[k - g][i_component];
+    }
+    for (size_t k = 0; k < std::min(K, g); ++k) {
+      bin.S[k][i_component] = 1.5 * std::pow(DataType{2}, g * W) * ufp(bin.S[k][i_component]);
+      bin.C[k][i_component] = 0;
+    }
+  }
+
+  PUGS_INLINE static void
+  _update(const TinyVectorType& m, Bin& bin) noexcept(NO_ASSERT)
+  {
+    using namespace reproductible_sum_utils;
+    for (size_t i_component = 0; i_component < TinyVectorType::Dimension; ++i_component) {
+      if (m[i_component] >= std::pow(DataType{2}, W - 1.) * ulp(bin.S[0][i_component])) {
+        const size_t shift =
+          1 + std::floor(std::log2(m[i_component] / (std::pow(DataType{2}, W - 1.) * ulp(bin.S[0][i_component]))) / W);
+
+        _shift(shift, bin, i_component);
+      }
+    }
+  }
+
+  PUGS_INLINE
+  void static _split2(TinyVectorType& S, TinyVectorType& x) noexcept(NO_ASSERT)
+  {
+    using namespace reproductible_sum_utils;
+    union
+    {
+      DataType as_DataType;
+      typename IntegerType<DataType>::integer_t as_integer;
+    } x_bar;
+
+    for (size_t i_component = 0; i_component < TinyVectorType::Dimension; ++i_component) {
+      x_bar.as_DataType = x[i_component];
+      x_bar.as_integer |= 0x1;
+
+      DataType& S_i = S[i_component];
+      DataType& x_i = x[i_component];
+
+      const DataType S0 = S_i;
+      S_i += x_bar.as_DataType;
+      x_i -= S_i - S0;
+    }
+  }
+
+  PUGS_INLINE static void
+  _renormalize(Bin& bin) noexcept(NO_ASSERT)
+  {
+    using namespace reproductible_sum_utils;
+
+    for (size_t k = 0; k < K; ++k) {
+      TinyVectorType& S_k = bin.S[k];
+      TinyVectorType& C_k = bin.C[k];
+      for (size_t i_component = 0; i_component < TinyVectorType::Dimension; ++i_component) {
+        DataType& Sk_i = S_k[i_component];
+        DataType& Ck_i = C_k[i_component];
+
+        if (Sk_i >= 1.75 * ufp(Sk_i)) {
+          Sk_i -= 0.25 * ufp(Sk_i);
+          Ck_i += 1;
+        } else if (Sk_i < 1.25 * ufp(Sk_i)) {
+          Sk_i += 0.5 * ufp(Sk_i);
+          Ck_i -= 2;
+        } else if (Sk_i < 1.5 * ufp(Sk_i)) {
+          Sk_i += 0.25 * ufp(Sk_i);
+          Ck_i -= 1;
+        }
+      }
+    }
+  }
+
+ public:
+  static void
+  addBinTo(Bin& bin, Bin& bin_sum)
+  {
+    using namespace reproductible_sum_utils;
+
+    for (size_t i_component = 0; i_component < TinyVectorType::Dimension; ++i_component) {
+      DataType ulp_bin = ulp(bin.S[0][i_component]);
+      DataType ulp_sum = ulp(bin_sum.S[0][i_component]);
+
+      if (ulp_bin < ulp_sum) {
+        const size_t shift = std::floor(std::log2(ulp_sum / ulp_bin) / W);
+        if (shift > 0) {
+          _shift(shift, bin, i_component);
+        }
+      } else if (ulp_bin > ulp_sum) {
+        const size_t shift = std::floor(std::log2(ulp_bin / ulp_sum) / W);
+        if (shift > 0) {
+          _shift(shift, bin_sum, i_component);
+        }
+      }
+
+      for (size_t k = 0; k < K; ++k) {
+        bin_sum.S[k][i_component] += bin.S[k][i_component] - 1.5 * ufp(bin.S[k][i_component]);
+        bin_sum.C[k][i_component] += bin.C[k][i_component];
+      }
+    }
+
+    _renormalize(bin_sum);
+  }
+
+  PUGS_INLINE
+  static TinyVectorType
+  getValue(const Bin& bin) noexcept(NO_ASSERT)
+  {
+    using namespace reproductible_sum_utils;
+
+    TinyVectorType value = zero;
+    for (size_t i_component = 0; i_component < TinyVectorType::Dimension; ++i_component) {
+      for (size_t k = 0; k < K; ++k) {
+        value[i_component] += 0.25 * (bin.C[k][i_component] - 6) * ufp(bin.S[k][i_component]) + bin.S[k][i_component];
+      }
+    }
+    return value;
+  }
+
+  PUGS_INLINE Bin
+  getSummationBin() const noexcept(NO_ASSERT)
+  {
+    return m_summation_bin;
+  }
+
+  PUGS_INLINE TinyVectorType
+  getSum() const noexcept(NO_ASSERT)
+  {
+    return getValue(m_summation_bin);
+  }
+
+  ReproductibleTinyVectorSum(const ArrayT& array, const MaskType mask = reproductible_sum_utils::NoMask{})
+  {
+    if constexpr (not std::is_same_v<MaskType, reproductible_sum_utils::NoMask>) {
+      static_assert(std::is_same_v<std::decay_t<typename MaskType::data_type>, bool>,
+                    "when provided, mask must be an array of bool");
+    }
+
+    using TeamPolicyT = Kokkos::TeamPolicy<Kokkos::IndexType<int>>;
+    using TeamMemberT = TeamPolicyT::member_type;
+
+    int nx = reproductible_sum_utils::bin_max_size<DataType>;
+    // ceil division: every block must contain at most nx values, so that the
+    // overflow bound (bin_max_size) of the summation algorithm is respected
+    int ny = std::max((array.size() + nx - 1) / nx, 1ul);
+
+    const TeamPolicyT policy(ny, Kokkos::AUTO());
+
+    Array<Bin> bin_by_thread(policy.team_size() * policy.league_size());
+    bin_by_thread.fill(zero);
+
+    Array<TinyVectorType> local_max(policy.team_size() * policy.league_size());
+    local_max.fill(zero);
+
+    parallel_for(
+      policy, PUGS_LAMBDA(const TeamMemberT& member) {
+        const int i_team   = member.league_rank();
+        const int i_thread = member.team_rank();
+
+        const int thread_id = i_team * member.team_size() + i_thread;
+
+        const int league_start = nx * i_team;
+        const int block_size   = [&] {
+          int size = nx;
+          if (i_team == member.league_size() - 1) {
+            size = array.size() - league_start;
+          }
+          return size;
+        }();
+
+        parallel_for(
+          Kokkos::TeamThreadRange(member, block_size), PUGS_LAMBDA(int i) {
+            if (mask[league_start + i]) {
+              for (size_t i_component = 0; i_component < TinyVectorType::Dimension; ++i_component) {
+                DataType& m        = local_max[thread_id][i_component];
+                DataType abs_value = std::abs(array[league_start + i][i_component]);
+                if (abs_value > m) {
+                  m = abs_value;
+                }
+              }
+            }
+          });
+
+        _update(local_max[thread_id], bin_by_thread[thread_id]);
+
+        parallel_for(
+          Kokkos::TeamThreadRange(member, block_size), PUGS_LAMBDA(int i) {
+            if (mask[league_start + i]) {
+              TinyVectorType x = array[league_start + i];
+              for (size_t k = 0; k < K; ++k) {
+                _split2(bin_by_thread[thread_id].S[k], x);
+              }
+            }
+          });
+
+        _renormalize(bin_by_thread[thread_id]);
+      });
+
+    m_summation_bin = bin_by_thread[0];
+    for (size_t i = 1; i < bin_by_thread.size(); ++i) {
+      addBinTo(bin_by_thread[i], m_summation_bin);
+    }
+  }
+
+  ~ReproductibleTinyVectorSum() = default;
+};
+
+template <typename ArrayT, typename MaskType = reproductible_sum_utils::NoMask>
+class ReproductibleTinyMatrixSum
+{
+ public:
+  using TinyMatrixType = std::decay_t<typename ArrayT::data_type>;
+  static_assert(is_tiny_matrix_v<TinyMatrixType>);
+
+  using DataType = std::decay_t<typename TinyMatrixType::data_type>;
+  static_assert(std::is_floating_point_v<DataType>);
+
+  static constexpr size_t K     = reproductible_sum_utils::bin_number<DataType>;
+  static constexpr DataType eps = reproductible_sum_utils::bin_epsilon<DataType>;
+  static constexpr size_t W     = reproductible_sum_utils::bin_size<DataType>;
+
+  struct Bin
+  {
+    std::array<TinyMatrixType, K> S;   // sum
+    std::array<TinyMatrixType, K> C;   // carry
+
+    Bin& operator=(const Bin&) = default;
+    Bin& operator=(Bin&&) = default;
+
+    Bin(Bin&&)      = default;
+    Bin(const Bin&) = default;
+
+    Bin(ZeroType)
+    {
+      for (size_t k = 0; k < K; ++k) {
+        const DataType init_value = 0.75 * eps * std::pow(DataType{2}, (K - k - 1) * W);
+        for (size_t i_component = 0; i_component < TinyMatrixType::NumberOfRows; ++i_component) {
+          for (size_t j_component = 0; j_component < TinyMatrixType::NumberOfColumns; ++j_component) {
+            S[k](i_component, j_component) = init_value;
+          }
+        }
+        C[k] = zero;
+      }
+    }
+
+    Bin()  = default;
+    ~Bin() = default;
+  };
+
+ private:
+  Bin m_summation_bin;
+
+  PUGS_INLINE
+  static void
+  _shift(const size_t g, Bin& bin, const size_t i_component, const size_t j_component) noexcept(NO_ASSERT)
+  {
+    using namespace reproductible_sum_utils;
+
+    for (size_t k = K - 1; k >= g; --k) {
+      bin.S[k](i_component, j_component) = bin.S[k - g](i_component, j_component);
+      bin.C[k](i_component, j_component) = bin.C[k - g](i_component, j_component);
+    }
+    for (size_t k = 0; k < std::min(K, g); ++k) {
+      bin.S[k](i_component, j_component) = 1.5 * std::pow(DataType{2}, g * W) * ufp(bin.S[k](i_component, j_component));
+      bin.C[k](i_component, j_component) = 0;
+    }
+  }
+
+  PUGS_INLINE static void
+  _update(const TinyMatrixType& m, Bin& bin) noexcept(NO_ASSERT)
+  {
+    using namespace reproductible_sum_utils;
+    for (size_t i_component = 0; i_component < TinyMatrixType::NumberOfRows; ++i_component) {
+      for (size_t j_component = 0; j_component < TinyMatrixType::NumberOfColumns; ++j_component) {
+        if (m(i_component, j_component) >= std::pow(DataType{2}, W - 1.) * ulp(bin.S[0](i_component, j_component))) {
+          const size_t shift =
+            1 + std::floor(std::log2(m(i_component, j_component) /
+                                     (std::pow(DataType{2}, W - 1.) * ulp(bin.S[0](i_component, j_component)))) /
+                           W);
+
+          _shift(shift, bin, i_component, j_component);
+        }
+      }
+    }
+  }
+
+  PUGS_INLINE
+  void static _split2(TinyMatrixType& S, TinyMatrixType& x) noexcept(NO_ASSERT)
+  {
+    using namespace reproductible_sum_utils;
+    union
+    {
+      DataType as_DataType;
+      typename IntegerType<DataType>::integer_t as_integer;
+    } x_bar;
+
+    for (size_t i_component = 0; i_component < TinyMatrixType::NumberOfRows; ++i_component) {
+      for (size_t j_component = 0; j_component < TinyMatrixType::NumberOfColumns; ++j_component) {
+        DataType& S_ij = S(i_component, j_component);
+        DataType& x_ij = x(i_component, j_component);
+
+        x_bar.as_DataType = x_ij;
+        x_bar.as_integer |= 0x1;
+
+        const DataType S0 = S_ij;
+        S_ij += x_bar.as_DataType;
+        x_ij -= S_ij - S0;
+      }
+    }
+  }
+
+  PUGS_INLINE static void
+  _renormalize(Bin& bin) noexcept(NO_ASSERT)
+  {
+    using namespace reproductible_sum_utils;
+
+    for (size_t k = 0; k < K; ++k) {
+      TinyMatrixType& S_k = bin.S[k];
+      TinyMatrixType& C_k = bin.C[k];
+      for (size_t i_component = 0; i_component < TinyMatrixType::NumberOfRows; ++i_component) {
+        for (size_t j_component = 0; j_component < TinyMatrixType::NumberOfColumns; ++j_component) {
+          DataType& Sk_ij = S_k(i_component, j_component);
+          DataType& Ck_ij = C_k(i_component, j_component);
+
+          if (Sk_ij >= 1.75 * ufp(Sk_ij)) {
+            Sk_ij -= 0.25 * ufp(Sk_ij);
+            Ck_ij += 1;
+          } else if (Sk_ij < 1.25 * ufp(Sk_ij)) {
+            Sk_ij += 0.5 * ufp(Sk_ij);
+            Ck_ij -= 2;
+          } else if (Sk_ij < 1.5 * ufp(Sk_ij)) {
+            Sk_ij += 0.25 * ufp(Sk_ij);
+            Ck_ij -= 1;
+          }
+        }
+      }
+    }
+  }
+
+ public:
+  static void
+  addBinTo(Bin& bin, Bin& bin_sum)
+  {
+    using namespace reproductible_sum_utils;
+
+    for (size_t i_component = 0; i_component < TinyMatrixType::NumberOfRows; ++i_component) {
+      for (size_t j_component = 0; j_component < TinyMatrixType::NumberOfColumns; ++j_component) {
+        DataType ulp_bin = ulp(bin.S[0](i_component, j_component));
+        DataType ulp_sum = ulp(bin_sum.S[0](i_component, j_component));
+
+        if (ulp_bin < ulp_sum) {
+          const size_t shift = std::floor(std::log2(ulp_sum / ulp_bin) / W);
+          if (shift > 0) {
+            _shift(shift, bin, i_component, j_component);
+          }
+        } else if (ulp_bin > ulp_sum) {
+          const size_t shift = std::floor(std::log2(ulp_bin / ulp_sum) / W);
+          if (shift > 0) {
+            _shift(shift, bin_sum, i_component, j_component);
+          }
+        }
+
+        for (size_t k = 0; k < K; ++k) {
+          bin_sum.S[k](i_component, j_component) +=
+            bin.S[k](i_component, j_component) - 1.5 * ufp(bin.S[k](i_component, j_component));
+          bin_sum.C[k](i_component, j_component) += bin.C[k](i_component, j_component);
+        }
+      }
+    }
+
+    _renormalize(bin_sum);
+  }
+
+  PUGS_INLINE
+  static TinyMatrixType
+  getValue(const Bin& bin) noexcept(NO_ASSERT)
+  {
+    using namespace reproductible_sum_utils;
+
+    TinyMatrixType value = zero;
+    for (size_t i_component = 0; i_component < TinyMatrixType::NumberOfRows; ++i_component) {
+      for (size_t j_component = 0; j_component < TinyMatrixType::NumberOfColumns; ++j_component) {
+        for (size_t k = 0; k < K; ++k) {
+          value(i_component, j_component) +=
+            0.25 * (bin.C[k](i_component, j_component) - 6) * ufp(bin.S[k](i_component, j_component)) +
+            bin.S[k](i_component, j_component);
+        }
+      }
+    }
+    return value;
+  }
+
+  PUGS_INLINE Bin
+  getSummationBin() const noexcept(NO_ASSERT)
+  {
+    return m_summation_bin;
+  }
+
+  PUGS_INLINE TinyMatrixType
+  getSum() const noexcept(NO_ASSERT)
+  {
+    return getValue(m_summation_bin);
+  }
+
+  ReproductibleTinyMatrixSum(const ArrayT& array, const MaskType mask = reproductible_sum_utils::NoMask{})
+  {
+    if constexpr (not std::is_same_v<MaskType, reproductible_sum_utils::NoMask>) {
+      static_assert(std::is_same_v<std::decay_t<typename MaskType::data_type>, bool>,
+                    "when provided, mask must be an array of bool");
+    }
+
+    using TeamPolicyT = Kokkos::TeamPolicy<Kokkos::IndexType<int>>;
+    using TeamMemberT = TeamPolicyT::member_type;
+
+    int nx = reproductible_sum_utils::bin_max_size<DataType>;
+    int ny = std::max(array.size() / nx, 1ul);
+
+    const TeamPolicyT policy(ny, Kokkos::AUTO());
+
+
+
+    Array<Bin> bin_by_thread(policy.team_size() * policy.league_size());
+    bin_by_thread.fill(zero);
+
+    Array<TinyMatrixType> local_max(policy.team_size() * policy.league_size());
+    local_max.fill(zero);
+
+    parallel_for(
+      policy, PUGS_LAMBDA(const TeamMemberT& member) {
+        const int i_team   = member.league_rank();
+        const int i_thread = member.team_rank();
+
+        const int thread_id = i_team * member.team_size() + i_thread;
+
+        const int league_start = nx * i_team;
+        const int block_size   = [&] {
+          int size = nx;
+          if (i_team == member.league_size() - 1) {
+            size = array.size() - league_start;
+          }
+          return size;
+        }();
+
+        parallel_for(
+          Kokkos::TeamThreadRange(member, block_size), PUGS_LAMBDA(int i) {
+            if (mask[league_start + i]) {
+              for (size_t i_component = 0; i_component < TinyMatrixType::NumberOfRows; ++i_component) {
+                for (size_t j_component = 0; j_component < TinyMatrixType::NumberOfColumns; ++j_component) {
+                  DataType& m        = local_max[thread_id](i_component, j_component);
+                  DataType abs_value = std::abs(array[league_start + i](i_component, j_component));
+                  if (abs_value > m) {
+                    m = abs_value;
+                  }
+                }
+              }
+            }
+          });
+
+        _update(local_max[thread_id], bin_by_thread[thread_id]);
+
+        parallel_for(
+          Kokkos::TeamThreadRange(member, block_size), PUGS_LAMBDA(int i) {
+            if (mask[league_start + i]) {
+              TinyMatrixType x = array[league_start + i];
+              for (size_t k = 0; k < K; ++k) {
+                _split2(bin_by_thread[thread_id].S[k], x);
+              }
+            }
+          });
+
+        _renormalize(bin_by_thread[thread_id]);
+      });
+
+    m_summation_bin = bin_by_thread[0];
+    for (size_t i = 1; i < bin_by_thread.size(); ++i) {
+      addBinTo(bin_by_thread[i], m_summation_bin);
+    }
+  }
+
+  ~ReproductibleTinyMatrixSum() = default;
+};
+
+#endif   // REPRODUCTIBLE_SUM_UTILS_HPP
diff --git a/tests/test_Array.cpp b/tests/test_Array.cpp
index e999cb06d8a4ab0302c1a40896950cd93c4640d4..fb3a50ef9ec02bd3a98d5d7b5af95be8bbb9b439 100644
--- a/tests/test_Array.cpp
+++ b/tests/test_Array.cpp
@@ -14,152 +14,10 @@
 #include <valarray>
 #include <vector>
 
+#include <utils/ReproductibleSumUtils.hpp>
 #include <utils/SmallArray.hpp>
 #include <utils/Timer.hpp>
 
-template <typename DataType>
-class ArrayReproductibleSum
-{
- private:
-  static constexpr double eps = std::numeric_limits<DataType>::epsilon();
-
-  static constexpr size_t W = 40;
-  static constexpr size_t K = 3;
-
-  static consteval size_t
-  _getNB()
-  {
-    return std::floor((1 / eps) * std::pow(2., -2. - W));
-  }
-
-  static constexpr size_t NB = _getNB();
-
-  Array<const DataType> m_array;
-
-  PUGS_INLINE DataType
-  ulp(const DataType& x)
-  {
-    static_assert(std::is_floating_point_v<DataType>, "expecting floating point value");
-
-    if (x == 0) {
-      return std::numeric_limits<DataType>::denorm_min();
-    }
-
-    return std::pow(DataType{2}, std::ilogb(std::abs(x)) - std::numeric_limits<DataType>::digits);
-  };
-
-  PUGS_INLINE DataType
-  ufp(const DataType& x)
-  {
-    static_assert(std::is_floating_point_v<DataType>, "expecting floating point value");
-
-    return std::pow(DataType{2}, std::ilogb(std::abs(x)));
-  };
-
- public:
-  operator DataType()
-  {
-    auto local_max = [](const auto& v, const size_t i, const size_t lB) noexcept(NO_ASSERT) {
-      DataType m = std::abs(v[i]);
-      for (size_t j = 1; j < lB; ++j) {
-        const DataType& abs_vj = std::abs(v[i + j]);
-        if (m < abs_vj) {
-          m = abs_vj;
-        }
-      }
-      return m;
-    };
-
-    auto update = [&](const DataType& m, auto& S, auto& C) noexcept(NO_ASSERT) {
-      if (m >= std::pow(DataType{2}, W - 1.) * ulp(S[0])) {
-        const size_t g = 1 + std::floor(std::log2(m / (std::pow(DataType{2}, W - 1.) * ulp(S[0]))) / W);
-
-        for (size_t k = K - 1; k >= g; --k) {
-          S[k] = S[k - g];
-          C[k] = C[k - g];
-        }
-
-        for (size_t k = 0; k < std::min(K, g); ++k) {
-          S[k] = 1.5 * std::pow(DataType{2}, g * W) * ufp(S[k]);
-          C[k] = 0;
-        }
-      }
-    };
-
-    auto split2 = [](DataType& S, DataType& x) noexcept(NO_ASSERT) {
-      union
-      {
-        static_assert(sizeof(DataType) == sizeof(unsigned long));
-        DataType as_DataType;
-        unsigned long as_long;
-      } x_bar;
-
-      x_bar.as_DataType = x;
-      x_bar.as_long |= 0x1;
-
-      const DataType S0 = S;
-      S += x_bar.as_DataType;
-      x -= S - S0;
-    };
-
-    auto extract_vector3 = [&](DataType& S, auto& v, size_t i, const size_t lB) noexcept(NO_ASSERT) {
-      for (size_t j = 0; j < lB; ++j) {
-        split2(S, v[i + j]);
-      }
-    };
-
-    auto renormalize = [&](auto& S, auto& C) noexcept(NO_ASSERT) {
-      for (size_t k = 0; k < K; ++k) {
-        if (S[k] >= 1.75 * ufp(S[k])) {
-          S[k] -= 0.25 * ufp(S[k]);
-          C[k] += 1;
-        } else if (S[k] < 1.25 * ufp(S[k])) {
-          S[k] += 0.5 * ufp(S[k]);
-          C[k] -= 2;
-        } else if (S[k] < 1.5 * ufp(S[k])) {
-          S[k] += 0.25 * ufp(S[k]);
-          C[k] -= 1;
-        }
-      }
-    };
-
-    TinyVector<K, DataType> S;
-    for (size_t k = 0; k < K; ++k) {
-      S[k] = 0.75 * eps * std::pow(2., (K - k - 1) * W);
-    }
-
-    TinyVector<K, DataType> C = zero;
-
-    // Array<DataType> local_array(NB);
-    Array<DataType> local_array = copy(m_array);
-    for (size_t i = 0; i < m_array.size(); i += NB) {
-      const size_t lB = std::min(NB, m_array.size() - i);
-
-      //      std::copy_n(&(m_array[i]), lB, &(local_array[i]));
-
-      const DataType m = local_max(local_array, i, lB);
-
-      update(m, S, C);
-
-      for (size_t k = 0; k < K; ++k) {
-        extract_vector3(S[k], local_array, i, lB);
-      }
-
-      renormalize(S, C);
-    }
-
-    DataType sum = 0;
-    for (size_t k = 0; k < S.dimension(); ++k) {
-      sum += 0.25 * (C[k] - 6) * ufp(S[k]) + S[k];
-    }
-
-    return sum;
-  };
-
-  ArrayReproductibleSum(Array<DataType> array) : m_array{array} {}
-  ~ArrayReproductibleSum() = default;
-};
-
 // Instantiate to ensure full coverage is performed
 template class Array<int>;
 
@@ -465,63 +323,79 @@ TEST_CASE("Array", "[utils]")
     }
   }
 
-  SECTION("reproducible floating point sum 2023")
+  SECTION("reproducible double sum")
   {
-    std::clog.precision(16);
+    Array<double> array(10'000);
+
+    for (size_t i = 0; i < array.size(); ++i) {
+      array[i] = ((i < (array.size() / 10)) * 1E25 + 1E10) * ((i + 1) % 1'000) * std::sin(3 * i + 1);
+    }
 
-    std::clog << "***** REP 2023 *****\n";
+    const auto sum_before_shuffle = sum(array);
+
+    std::random_shuffle(&(array[0]), &(array[0]) + array.size());
+
+    const auto sum_after_shuffle = sum(array);
+
+    REQUIRE(sum_before_shuffle == sum_after_shuffle);
+  }
+
+  SECTION("reproducible TinyVector<3,float> sum")
+  {
+    Array<TinyVector<3, float>> array(10'000);
 
-    Array<double> array(10'000'000);
     for (size_t i = 0; i < array.size(); ++i) {
-      array[i] = 1E25 * ((i + 1) % 1'000'000) * std::sin(3 * i + 1);
+      array[i] =
+        TinyVector<3, float>(((i < (array.size() / 10)) * 1E25 + 1E10) * ((i + 1) % 1'000) * std::sin(3 * i + 1),
+                             17 * i - 3,   //
+                             1);
     }
 
-    Timer t_direct_sum1;
-    double direct_sum1 = sum(array);
-    t_direct_sum1.pause();
+    const auto sum_before_shuffle = sum(array);
 
-    Timer t_rsum1;
-    const double reprod_sum1 = ArrayReproductibleSum(array);
-    t_rsum1.pause();
+    std::random_shuffle(&(array[0]), &(array[0]) + array.size());
+
+    ReproductibleTinyVectorSum s0(array);
+
+    auto bin0 = s0.getSummationBin();
+    std::clog << "S0 = " << bin0.S[0] << ";" << bin0.S[1] << ";" << bin0.S[2] << " C0 = " << bin0.C[0] << ";"
+              << bin0.C[1] << ";" << bin0.C[2] << "\n";
+    const auto sum_after_shuffle = sum(array);
+
+    ReproductibleTinyVectorSum s1(array);
+
+    auto bin1 = s1.getSummationBin();
+    std::clog << "S1 = " << bin1.S[0] << ";" << bin1.S[1] << ";" << bin1.S[2] << " C1 = " << bin1.C[0] << ";"
+              << bin1.C[1] << ";" << bin1.C[2] << "\n";
+
+    std::clog << "DIFF_BIN: DS = " << bin1.S[0] - bin0.S[0] << ";" << bin1.S[1] - bin0.S[1] << ";"
+              << bin1.S[2] - bin0.S[2] << '\n';
+
+    std::clog << "DIFF=" << sum_before_shuffle - sum_after_shuffle << '\n';
+
+    REQUIRE(sum_before_shuffle == sum_after_shuffle);
+  }
+
+  SECTION("reproducible TinyMatrix<2, 3> sum")
+  {
+    Array<TinyMatrix<2, 3>> array(10'000);
+
+    for (size_t i = 0; i < array.size(); ++i) {
+      array[i] = TinyMatrix<2, 3>{((i < (array.size() / 1000)) * 1E25 + 1E10) * ((i + 1) % 1'000) * std::sin(3 * i + 1),
+                                  17 * i - 3,            //
+                                  1,                     //
+                                  std::cos(7 * i) * i,   //
+                                  (2 - 3 * i) * 1E-30,   //
+                                  (i * 100 + 1000)};
+    }
+
+    const auto sum_before_shuffle = sum(array);
 
-    std::clog << "# shuffling ...\n" << std::flush;
-    Timer t_shuffle;
     std::random_shuffle(&(array[0]), &(array[0]) + array.size());
-    t_shuffle.pause();
-    std::clog << "  shuffling done\n" << std::flush;
-
-    Timer t_direct_sum2;
-    double direct_sum2 = sum(array);
-    t_direct_sum2.pause();
-
-    Timer t_rsum2;
-    const double reprod_sum2 = ArrayReproductibleSum(array);
-    t_rsum2.pause();
-
-    std::clog << "---------\n";
-    std::clog << "direct sum1 =" << direct_sum1 << '\n';
-    std::clog << "direct sum2 =" << direct_sum2 << '\n';
-    std::clog << "reprod sum1 = " << reprod_sum1 << '\n';
-    std::clog << "reprod sum2 = " << reprod_sum2 << '\n';
-    std::clog << "---------\n";
-    std::clog << "delta direct =" << direct_sum1 - direct_sum2 << '\n';
-    std::clog << "delta reprod =" << reprod_sum1 - reprod_sum2 << '\n';
-    std::clog << "---------\n";
-    std::clog << "[t shuffle " << t_shuffle << "]\n";
-    std::clog << "---------\n";
-    std::clog << "[t dsum : " << t_direct_sum1 << " efficiency = " << t_direct_sum1.seconds() / t_direct_sum1.seconds()
-              << "]\n";
-    std::clog << "[t rsum : " << t_rsum1 << " efficiency = " << t_rsum1.seconds() / t_direct_sum1.seconds() << "]\n";
-    std::clog << "---------\n";
-    std::clog << "[t dsum2: " << t_direct_sum2 << " efficiency = " << t_direct_sum2.seconds() / t_direct_sum2.seconds()
-              << "]\n";
-    std::clog << "[t rsum2: " << t_rsum2 << " efficiency = " << t_rsum2.seconds() / t_direct_sum2.seconds() << "]\n";
-
-    double sum_new = ArrayReproductibleSum(array);
-
-    std::clog << "sum_new = " << sum_new << '\n';
-
-    REQUIRE(reprod_sum1 == reprod_sum2);
+
+    const auto sum_after_shuffle = sum(array);
+
+    REQUIRE(sum_before_shuffle == sum_after_shuffle);
   }
 
   SECTION("checking for subArrayView")
diff --git a/tests/test_DiscreteFunctionP0.cpp b/tests/test_DiscreteFunctionP0.cpp
index 39b91c4de355c1a96feaa1f1b5e2a59bf9c494d8..d0f71e984d516ec243de289f399d882663c9c499 100644
--- a/tests/test_DiscreteFunctionP0.cpp
+++ b/tests/test_DiscreteFunctionP0.cpp
@@ -2787,7 +2787,7 @@ TEST_CASE("DiscreteFunctionP0", "[scheme]")
                 cell_value[cell_id] = cell_volume[cell_id] * positive_function[cell_id];
               });
 
-            REQUIRE(integrate(positive_function) == Catch::Approx(sum(cell_value)));
+            REQUIRE(integrate(positive_function) == sum(cell_value));
           }
 
           SECTION("integrate vector")
@@ -2806,8 +2806,8 @@ TEST_CASE("DiscreteFunctionP0", "[scheme]")
               mesh->numberOfCells(),
               PUGS_LAMBDA(const CellId cell_id) { cell_value[cell_id] = cell_volume[cell_id] * uh[cell_id]; });
 
-            REQUIRE(integrate(uh)[0] == Catch::Approx(sum(cell_value)[0]));
-            REQUIRE(integrate(uh)[1] == Catch::Approx(sum(cell_value)[1]));
+            REQUIRE(integrate(uh)[0] == sum(cell_value)[0]);
+            REQUIRE(integrate(uh)[1] == sum(cell_value)[1]);
           }
 
           SECTION("integrate matrix")
@@ -2826,10 +2826,10 @@ TEST_CASE("DiscreteFunctionP0", "[scheme]")
               mesh->numberOfCells(),
               PUGS_LAMBDA(const CellId cell_id) { cell_value[cell_id] = cell_volume[cell_id] * uh[cell_id]; });
 
-            REQUIRE(integrate(uh)(0, 0) == Catch::Approx(sum(cell_value)(0, 0)));
-            REQUIRE(integrate(uh)(0, 1) == Catch::Approx(sum(cell_value)(0, 1)));
-            REQUIRE(integrate(uh)(1, 0) == Catch::Approx(sum(cell_value)(1, 0)));
-            REQUIRE(integrate(uh)(1, 1) == Catch::Approx(sum(cell_value)(1, 1)));
+            REQUIRE(integrate(uh)(0, 0) == sum(cell_value)(0, 0));
+            REQUIRE(integrate(uh)(0, 1) == sum(cell_value)(0, 1));
+            REQUIRE(integrate(uh)(1, 0) == sum(cell_value)(1, 0));
+            REQUIRE(integrate(uh)(1, 1) == sum(cell_value)(1, 1));
           }
         }
       }
@@ -3131,7 +3131,7 @@ TEST_CASE("DiscreteFunctionP0", "[scheme]")
                 cell_value[cell_id] = cell_volume[cell_id] * positive_function[cell_id];
               });
 
-            REQUIRE(integrate(positive_function) == Catch::Approx(sum(cell_value)));
+            REQUIRE(integrate(positive_function) == sum(cell_value));
           }
 
           SECTION("integrate vector")
@@ -3150,8 +3150,8 @@ TEST_CASE("DiscreteFunctionP0", "[scheme]")
               mesh->numberOfCells(),
               PUGS_LAMBDA(const CellId cell_id) { cell_value[cell_id] = cell_volume[cell_id] * uh[cell_id]; });
 
-            REQUIRE(integrate(uh)[0] == Catch::Approx(sum(cell_value)[0]));
-            REQUIRE(integrate(uh)[1] == Catch::Approx(sum(cell_value)[1]));
+            REQUIRE(integrate(uh)[0] == sum(cell_value)[0]);
+            REQUIRE(integrate(uh)[1] == sum(cell_value)[1]);
           }
 
           SECTION("integrate matrix")
@@ -3170,10 +3170,10 @@ TEST_CASE("DiscreteFunctionP0", "[scheme]")
               mesh->numberOfCells(),
               PUGS_LAMBDA(const CellId cell_id) { cell_value[cell_id] = cell_volume[cell_id] * uh[cell_id]; });
 
-            REQUIRE(integrate(uh)(0, 0) == Catch::Approx(sum(cell_value)(0, 0)));
-            REQUIRE(integrate(uh)(0, 1) == Catch::Approx(sum(cell_value)(0, 1)));
-            REQUIRE(integrate(uh)(1, 0) == Catch::Approx(sum(cell_value)(1, 0)));
-            REQUIRE(integrate(uh)(1, 1) == Catch::Approx(sum(cell_value)(1, 1)));
+            REQUIRE(integrate(uh)(0, 0) == sum(cell_value)(0, 0));
+            REQUIRE(integrate(uh)(0, 1) == sum(cell_value)(0, 1));
+            REQUIRE(integrate(uh)(1, 0) == sum(cell_value)(1, 0));
+            REQUIRE(integrate(uh)(1, 1) == sum(cell_value)(1, 1));
           }
         }
       }
@@ -3563,7 +3563,7 @@ TEST_CASE("DiscreteFunctionP0", "[scheme]")
                 cell_value[cell_id] = cell_volume[cell_id] * positive_function[cell_id];
               });
 
-            REQUIRE(integrate(positive_function) == Catch::Approx(sum(cell_value)));
+            REQUIRE(integrate(positive_function) == sum(cell_value));
           }
 
           SECTION("integrate vector")
@@ -3582,8 +3582,8 @@ TEST_CASE("DiscreteFunctionP0", "[scheme]")
               mesh->numberOfCells(),
               PUGS_LAMBDA(const CellId cell_id) { cell_value[cell_id] = cell_volume[cell_id] * uh[cell_id]; });
 
-            REQUIRE(integrate(uh)[0] == Catch::Approx(sum(cell_value)[0]));
-            REQUIRE(integrate(uh)[1] == Catch::Approx(sum(cell_value)[1]));
+            REQUIRE(integrate(uh)[0] == sum(cell_value)[0]);
+            REQUIRE(integrate(uh)[1] == sum(cell_value)[1]);
           }
 
           SECTION("integrate matrix")
@@ -3602,10 +3602,10 @@ TEST_CASE("DiscreteFunctionP0", "[scheme]")
               mesh->numberOfCells(),
               PUGS_LAMBDA(const CellId cell_id) { cell_value[cell_id] = cell_volume[cell_id] * uh[cell_id]; });
 
-            REQUIRE(integrate(uh)(0, 0) == Catch::Approx(sum(cell_value)(0, 0)));
-            REQUIRE(integrate(uh)(0, 1) == Catch::Approx(sum(cell_value)(0, 1)));
-            REQUIRE(integrate(uh)(1, 0) == Catch::Approx(sum(cell_value)(1, 0)));
-            REQUIRE(integrate(uh)(1, 1) == Catch::Approx(sum(cell_value)(1, 1)));
+            REQUIRE(integrate(uh)(0, 0) == sum(cell_value)(0, 0));
+            REQUIRE(integrate(uh)(0, 1) == sum(cell_value)(0, 1));
+            REQUIRE(integrate(uh)(1, 0) == sum(cell_value)(1, 0));
+            REQUIRE(integrate(uh)(1, 1) == sum(cell_value)(1, 1));
           }
         }
       }
diff --git a/tests/test_ItemArrayUtils.cpp b/tests/test_ItemArrayUtils.cpp
index 8c832d16c1dac5ed7a84828303f68bceaf25c3d8..346345eae1134da16e7e75b7c1d6f553ac14ef6d 100644
--- a/tests/test_ItemArrayUtils.cpp
+++ b/tests/test_ItemArrayUtils.cpp
@@ -8,6 +8,8 @@
 #include <mesh/ItemArray.hpp>
 #include <mesh/ItemArrayUtils.hpp>
 #include <mesh/Mesh.hpp>
+#include <mesh/MeshData.hpp>
+#include <mesh/MeshDataManager.hpp>
 #include <utils/Messenger.hpp>
 
 // clazy:excludeall=non-pod-global-static
@@ -1055,26 +1057,31 @@ TEST_CASE("ItemArrayUtils", "[mesh]")
       std::array mesh_list = MeshDataBaseForTests::get().all2DMeshes();
 
       for (const auto& named_mesh : mesh_list) {
-        SECTION(named_mesh.name() + " for size_t data")
+        SECTION(named_mesh.name() + " for double data")
         {
           auto mesh_2d = named_mesh.mesh();
 
           const Connectivity<2>& connectivity = mesh_2d->connectivity();
 
-          FaceArray<size_t> face_array{connectivity, 3};
-          face_array.fill(2);
+          auto ll = MeshDataManager::instance().getMeshData(*mesh_2d).ll();
+
+          FaceArray<double> face_array{connectivity, 3};
+          parallel_for(
+            connectivity.numberOfFaces(), PUGS_LAMBDA(FaceId face_id) {
+              face_array[face_id][0] = ll[face_id];
+              face_array[face_id][1] = (3 + ll[face_id]) * ll[face_id];
+              face_array[face_id][2] = 2 * ll[face_id] - 3;
+            });
 
           auto face_is_owned = connectivity.faceIsOwned();
 
-          const size_t global_number_of_faces = [&] {
-            size_t number_of_faces = 0;
-            for (FaceId face_id = 0; face_id < face_is_owned.numberOfItems(); ++face_id) {
-              number_of_faces += face_is_owned[face_id];
-            }
-            return parallel::allReduceSum(number_of_faces);
-          }();
+          FaceValue<double> face_sum{connectivity};
+          parallel_for(
+            connectivity.numberOfFaces(), PUGS_LAMBDA(FaceId face_id) {
+              face_sum[face_id] = face_array[face_id][0] + face_array[face_id][1] + face_array[face_id][2];
+            });
 
-          REQUIRE(sum(face_array) == 2 * face_array.sizeOfArrays() * global_number_of_faces);
+          REQUIRE(sum(face_array) == sum(face_sum));
         }
 
         SECTION(named_mesh.name() + " for N^2 data")
diff --git a/tests/test_ItemValueUtils.cpp b/tests/test_ItemValueUtils.cpp
index 755392515ae3078421375f71af8727aa8bff144e..b1901713a2065cb054dbb18debaa60722116cb7d 100644
--- a/tests/test_ItemValueUtils.cpp
+++ b/tests/test_ItemValueUtils.cpp
@@ -8,6 +8,8 @@
 #include <mesh/ItemValue.hpp>
 #include <mesh/ItemValueUtils.hpp>
 #include <mesh/Mesh.hpp>
+#include <mesh/MeshData.hpp>
+#include <mesh/MeshDataManager.hpp>
 #include <utils/Messenger.hpp>
 
 // clazy:excludeall=non-pod-global-static
@@ -240,26 +242,60 @@ TEST_CASE("ItemValueUtils", "[mesh]")
       std::array mesh_list = MeshDataBaseForTests::get().all1DMeshes();
 
       for (const auto& named_mesh : mesh_list) {
-        SECTION(named_mesh.name())
+        SECTION(named_mesh.name() + " for size_t data")
         {
           auto mesh_1d = named_mesh.mesh();
 
           const Connectivity<1>& connectivity = mesh_1d->connectivity();
 
+          auto Vj = MeshDataManager::instance().getMeshData(*mesh_1d).Vj();
+
           CellValue<size_t> cell_value{connectivity};
-          cell_value.fill(5);
+
+          parallel_for(
+            connectivity.numberOfCells(),
+            PUGS_LAMBDA(CellId cell_id) { cell_value[cell_id] = std::ceil(100 * Vj[cell_id]); });
 
           auto cell_is_owned = connectivity.cellIsOwned();
 
-          const size_t global_number_of_cells = [&] {
-            size_t number_of_cells = 0;
+          const size_t global_sum = [&] {
+            size_t local_sum = 0;
             for (CellId cell_id = 0; cell_id < cell_is_owned.numberOfItems(); ++cell_id) {
-              number_of_cells += cell_is_owned[cell_id];
+              local_sum += cell_is_owned[cell_id] * cell_value[cell_id];
             }
-            return parallel::allReduceSum(number_of_cells);
+            return parallel::allReduceSum(local_sum);
           }();
 
-          REQUIRE(sum(cell_value) == 5 * global_number_of_cells);
+          REQUIRE(sum(cell_value) == global_sum);
+        }
+
+        SECTION(named_mesh.name() + " for double data")
+        {
+          auto mesh_1d = named_mesh.mesh();
+
+          const Connectivity<1>& connectivity = mesh_1d->connectivity();
+
+          auto Vj = MeshDataManager::instance().getMeshData(*mesh_1d).Vj();
+
+          auto cell_is_owned = connectivity.cellIsOwned();
+
+          size_t nb_owned_cells = 0;
+          for (CellId cell_id = 0; cell_id < connectivity.numberOfCells(); ++cell_id) {
+            nb_owned_cells += cell_is_owned[cell_id];
+          }
+
+          Array<double> local_Vj(nb_owned_cells);
+          {
+            size_t index = 0;
+            for (CellId cell_id = 0; cell_id < connectivity.numberOfCells(); ++cell_id) {
+              if (cell_is_owned[cell_id]) {
+                local_Vj[index++] = Vj[cell_id];
+              }
+            }
+          }
+          Array<double> global_Vj = parallel::allGatherVariable(local_Vj);
+
+          REQUIRE(sum(Vj) == sum(global_Vj));
         }
       }
     }
@@ -275,20 +311,54 @@ TEST_CASE("ItemValueUtils", "[mesh]")
 
           const Connectivity<2>& connectivity = mesh_2d->connectivity();
 
+          auto ll = MeshDataManager::instance().getMeshData(*mesh_2d).ll();
+
           FaceValue<size_t> face_value{connectivity};
-          face_value.fill(2);
+
+          parallel_for(
+            connectivity.numberOfFaces(),
+            PUGS_LAMBDA(FaceId face_id) { face_value[face_id] = std::ceil(100 * ll[face_id]); });
 
           auto face_is_owned = connectivity.faceIsOwned();
 
-          const size_t global_number_of_faces = [&] {
-            size_t number_of_faces = 0;
+          const size_t global_sum = [&] {
+            size_t local_sum = 0;
             for (FaceId face_id = 0; face_id < face_is_owned.numberOfItems(); ++face_id) {
-              number_of_faces += face_is_owned[face_id];
+              local_sum += face_is_owned[face_id] * face_value[face_id];
             }
-            return parallel::allReduceSum(number_of_faces);
+            return parallel::allReduceSum(local_sum);
           }();
 
-          REQUIRE(sum(face_value) == 2 * global_number_of_faces);
+          REQUIRE(sum(face_value) == global_sum);
+        }
+
+        SECTION(named_mesh.name() + " for double data")
+        {
+          auto mesh_2d = named_mesh.mesh();
+
+          const Connectivity<2>& connectivity = mesh_2d->connectivity();
+
+          auto Vj = MeshDataManager::instance().getMeshData(*mesh_2d).Vj();
+
+          auto cell_is_owned = connectivity.cellIsOwned();
+
+          size_t nb_owned_cells = 0;
+          for (CellId cell_id = 0; cell_id < connectivity.numberOfCells(); ++cell_id) {
+            nb_owned_cells += cell_is_owned[cell_id];
+          }
+
+          Array<double> local_Vj(nb_owned_cells);
+          {
+            size_t index = 0;
+            for (CellId cell_id = 0; cell_id < connectivity.numberOfCells(); ++cell_id) {
+              if (cell_is_owned[cell_id]) {
+                local_Vj[index++] = Vj[cell_id];
+              }
+            }
+          }
+          Array<double> global_Vj = parallel::allGatherVariable(local_Vj);
+
+          REQUIRE(sum(Vj) == sum(global_Vj));
         }
 
         SECTION(named_mesh.name() + "for N^3 data")
@@ -297,22 +367,66 @@ TEST_CASE("ItemValueUtils", "[mesh]")
 
           const Connectivity<2>& connectivity = mesh_2d->connectivity();
 
+          auto ll = MeshDataManager::instance().getMeshData(*mesh_2d).ll();
+
           using N3 = TinyVector<3, size_t>;
           FaceValue<N3> face_value{connectivity};
-          const N3 data(2, 1, 4);
-          face_value.fill(data);
+
+          parallel_for(
+            connectivity.numberOfFaces(), PUGS_LAMBDA(FaceId face_id) {
+              face_value[face_id] = N3(std::ceil(100 * ll[face_id]),        //
+                                       std::ceil(200 * ll[face_id] - 40),   //
+                                       std::ceil(17.3 * ll[face_id] + 12.9));
+            });
 
           auto face_is_owned = connectivity.faceIsOwned();
 
-          const size_t global_number_of_faces = [&] {
-            size_t number_of_faces = 0;
+          const N3 global_sum = [=] {
+            N3 local_sum = zero;
             for (FaceId face_id = 0; face_id < face_is_owned.numberOfItems(); ++face_id) {
-              number_of_faces += face_is_owned[face_id];
+              local_sum += face_is_owned[face_id] * face_value[face_id];
             }
-            return parallel::allReduceSum(number_of_faces);
+            return parallel::allReduceSum(local_sum);
           }();
 
-          REQUIRE(sum(face_value) == global_number_of_faces * data);
+          REQUIRE(sum(face_value) == global_sum);
+        }
+
+        SECTION(named_mesh.name() + " for R2 data")
+        {
+          auto mesh_2d = named_mesh.mesh();
+
+          const Connectivity<2>& connectivity = mesh_2d->connectivity();
+
+          auto ll = MeshDataManager::instance().getMeshData(*mesh_2d).ll();
+
+          using R2 = TinyVector<2, double>;
+          FaceValue<R2> face_value{connectivity};
+
+          auto face_is_owned = connectivity.faceIsOwned();
+
+          size_t nb_owned_faces = 0;
+          for (FaceId face_id = 0; face_id < connectivity.numberOfFaces(); ++face_id) {
+            nb_owned_faces += face_is_owned[face_id];
+          }
+
+          parallel_for(
+            connectivity.numberOfFaces(), PUGS_LAMBDA(FaceId face_id) {
+              face_value[face_id] = R2{ll[face_id], 1. / ll[face_id]};
+            });
+
+          Array<R2> local_array(nb_owned_faces);
+          {
+            size_t index = 0;
+            for (FaceId face_id = 0; face_id < connectivity.numberOfFaces(); ++face_id) {
+              if (face_is_owned[face_id]) {
+                local_array[index++] = face_value[face_id];
+              }
+            }
+          }
+          Array global_array = parallel::allGatherVariable(local_array);
+
+          REQUIRE(sum(face_value) == sum(global_array));
         }
       }
     }
@@ -369,6 +483,48 @@ TEST_CASE("ItemValueUtils", "[mesh]")
 
           REQUIRE(sum(node_value) == global_number_of_nodes * data);
         }
+
+        SECTION(named_mesh.name() + " for R^2x3 data")
+        {
+          auto mesh_3d = named_mesh.mesh();
+
+          const Connectivity<3>& connectivity = mesh_3d->connectivity();
+
+          auto xr = mesh_3d->xr();
+
+          using R2x3 = TinyMatrix<2, 3, double>;
+          NodeValue<R2x3> node_value{connectivity};
+
+          auto node_is_owned = connectivity.nodeIsOwned();
+
+          size_t nb_owned_nodes = 0;
+          for (NodeId node_id = 0; node_id < connectivity.numberOfNodes(); ++node_id) {
+            nb_owned_nodes += node_is_owned[node_id];
+          }
+
+          parallel_for(
+            connectivity.numberOfNodes(), PUGS_LAMBDA(NodeId node_id) {
+              node_value[node_id] = R2x3{xr[node_id][0],
+                                         l2Norm(xr[node_id]),
+                                         3.178 * xr[node_id][0] - 3 * xr[node_id][2],
+                                         xr[node_id][1],
+                                         xr[node_id][2] + xr[node_id][0],
+                                         xr[node_id][0] * xr[node_id][1] * xr[node_id][2]};
+            });
+
+          Array<R2x3> local_array(nb_owned_nodes);
+          {
+            size_t index = 0;
+            for (NodeId node_id = 0; node_id < connectivity.numberOfNodes(); ++node_id) {
+              if (node_is_owned[node_id]) {
+                local_array[index++] = node_value[node_id];
+              }
+            }
+          }
+          Array global_array = parallel::allGatherVariable(local_array);
+
+          REQUIRE(sum(node_value) == sum(global_array));
+        }
       }
     }
   }
diff --git a/tests/test_SubItemArrayPerItemUtils.cpp b/tests/test_SubItemArrayPerItemUtils.cpp
index e534ec34600ef526336249a4b7d1cfc794721827..d36829250ce4a5e9b88c7015504c1ffb68362f96 100644
--- a/tests/test_SubItemArrayPerItemUtils.cpp
+++ b/tests/test_SubItemArrayPerItemUtils.cpp
@@ -6,6 +6,8 @@
 #include <algebra/TinyVector.hpp>
 #include <mesh/Connectivity.hpp>
 #include <mesh/Mesh.hpp>
+#include <mesh/MeshData.hpp>
+#include <mesh/MeshDataManager.hpp>
 #include <mesh/SubItemArrayPerItem.hpp>
 #include <mesh/SubItemArrayPerItemUtils.hpp>
 #include <utils/Messenger.hpp>
@@ -368,6 +370,43 @@ TEST_CASE("SubItemArrayPerItemUtils", "[mesh]")
           REQUIRE(sum(node_array_per_face) ==
                   global_number_of_nodes_per_faces * node_array_per_face.sizeOfArrays() * data);
         }
+
+        SECTION(named_mesh.name() + " for float data")
+        {
+          auto mesh_2d = named_mesh.mesh();
+
+          const Connectivity<2>& connectivity = mesh_2d->connectivity();
+
+          EdgeArrayPerCell<float> edge_array_per_cell{connectivity, 3};
+          edge_array_per_cell.fill(1E10);
+
+          auto cell_is_owned = connectivity.cellIsOwned();
+          parallel_for(
+            connectivity.numberOfCells(), PUGS_LAMBDA(CellId cell_id) {
+              if (cell_is_owned[cell_id]) {
+                auto cell_table = edge_array_per_cell.itemTable(cell_id);
+                for (size_t i = 0; i < cell_table.numberOfRows(); ++i) {
+                  for (size_t j = 0; j < cell_table.numberOfColumns(); ++j) {
+                    cell_table(i, j) = 10 + parallel::rank() - i - j + 1.5;
+                  }
+                }
+              }
+            });
+
+          CellValue<float> cell_sum(connectivity);
+          cell_sum.fill(0);
+          parallel_for(
+            connectivity.numberOfCells(), PUGS_LAMBDA(CellId cell_id) {
+              auto cell_table = edge_array_per_cell.itemTable(cell_id);
+              for (size_t i = 0; i < cell_table.numberOfRows(); ++i) {
+                for (size_t j = 0; j < cell_table.numberOfColumns(); ++j) {
+                  cell_sum[cell_id] += cell_table(i, j);
+                }
+              }
+            });
+
+          REQUIRE(sum(edge_array_per_cell) == sum(cell_sum));
+        }
       }
     }
 
@@ -405,10 +444,10 @@ TEST_CASE("SubItemArrayPerItemUtils", "[mesh]")
 
           const Connectivity<3>& connectivity = mesh_3d->connectivity();
 
-          using N2x3 = TinyMatrix<2, 3, size_t>;
-          EdgeArrayPerFace<N2x3> edge_array_per_face{connectivity, 3};
+          using N2x3 = TinyMatrix<2, 3, size_t>;
+          EdgeArrayPerFace<N2x3> edge_array_per_face{connectivity, 3};
 
-          const N2x3 data(1, 3, 4, 6, 2, 5);
+          const N2x3 data(1, 3, 4, 6, 2, 5);
           edge_array_per_face.fill(data);
 
           auto face_is_owned = connectivity.faceIsOwned();
@@ -424,6 +463,46 @@ TEST_CASE("SubItemArrayPerItemUtils", "[mesh]")
           REQUIRE(sum(edge_array_per_face) ==
                   global_number_of_edges_per_faces * edge_array_per_face.sizeOfArrays() * data);
         }
+
+        SECTION(named_mesh.name() + " for R^2x3 data")
+        {
+          auto mesh_3d = named_mesh.mesh();
+
+          const Connectivity<3>& connectivity = mesh_3d->connectivity();
+
+          using R2x3 = TinyMatrix<2, 3, double>;
+          EdgeArrayPerFace<R2x3> edge_array_per_face{connectivity, 3};
+
+          auto Nl = MeshDataManager::instance().getMeshData(*mesh_3d).Nl();
+
+          parallel_for(
+            mesh_3d->numberOfFaces(), PUGS_LAMBDA(FaceId face_id) {
+              for (size_t i_edge = 0; i_edge < edge_array_per_face.numberOfSubArrays(face_id); ++i_edge) {
+                for (size_t i = 0; i < edge_array_per_face[face_id][i_edge].size(); ++i) {
+                  R2x3 value((2.3 * i_edge + 1.7) * Nl[face_id][0],                    //
+                             (3. - i_edge) * Nl[face_id][1],                           //
+                             (2 * i_edge + 1.3) * Nl[face_id][2],                      //
+                             (i_edge + 1.7) * Nl[face_id][0] + 3.2 * Nl[face_id][1],   //
+                             Nl[face_id][2] + 3 * i_edge,                              //
+                             7.13 * i_edge * i_edge * l2Norm(Nl[face_id]));
+                  edge_array_per_face[face_id][i_edge][i] = (i + 1.3) * value;
+                }
+              }
+            });
+
+          FaceValue<R2x3> face_value{connectivity};
+          face_value.fill(zero);
+          parallel_for(
+            mesh_3d->numberOfFaces(), PUGS_LAMBDA(FaceId face_id) {
+              for (size_t i_edge = 0; i_edge < edge_array_per_face.numberOfSubArrays(face_id); ++i_edge) {
+                for (size_t i = 0; i < edge_array_per_face[face_id][i_edge].size(); ++i) {
+                  face_value[face_id] += edge_array_per_face[face_id][i_edge][i];
+                }
+              }
+            });
+
+          REQUIRE(sum(edge_array_per_face) == sum(face_value));
+        }
       }
     }
   }
diff --git a/tests/test_SubItemValuePerItemUtils.cpp b/tests/test_SubItemValuePerItemUtils.cpp
index aca9a14498e447b1cd6ffd21411f45e847b34ac5..353d7a9500a2cba3384ae22b8863f020d1c07361 100644
--- a/tests/test_SubItemValuePerItemUtils.cpp
+++ b/tests/test_SubItemValuePerItemUtils.cpp
@@ -6,6 +6,8 @@
 #include <algebra/TinyVector.hpp>
 #include <mesh/Connectivity.hpp>
 #include <mesh/Mesh.hpp>
+#include <mesh/MeshData.hpp>
+#include <mesh/MeshDataManager.hpp>
 #include <mesh/SubItemValuePerItem.hpp>
 #include <mesh/SubItemValuePerItemUtils.hpp>
 #include <utils/Messenger.hpp>
@@ -325,29 +327,34 @@ TEST_CASE("SubItemValuePerItemUtils", "[mesh]")
           REQUIRE(sum(node_value_per_face) == 2 * global_number_of_nodes_per_faces);
         }
 
-        SECTION(named_mesh.name() + " for N^2 data")
+        SECTION(named_mesh.name() + " for R^2 data")
         {
           auto mesh_2d = named_mesh.mesh();
 
           const Connectivity<2>& connectivity = mesh_2d->connectivity();
 
-          using N2 = TinyVector<2, size_t>;
-          NodeValuePerFace<N2> node_value_per_face{connectivity};
+          using R2 = TinyVector<2, double>;
+          NodeValuePerFace<R2> node_value_per_face{connectivity};
 
-          const N2 data(3, 2);
-          node_value_per_face.fill(data);
+          auto Nl = MeshDataManager::instance().getMeshData(*mesh_2d).Nl();
 
-          auto face_is_owned = connectivity.faceIsOwned();
+          parallel_for(
+            mesh_2d->numberOfFaces(), PUGS_LAMBDA(FaceId face_id) {
+              for (size_t i_node = 0; i_node < node_value_per_face.numberOfSubValues(face_id); ++i_node) {
+                node_value_per_face[face_id][i_node] = (i_node + 1) * Nl[face_id];
+              }
+            });
 
-          const size_t global_number_of_nodes_per_faces = [&] {
-            size_t number_of_nodes_per_faces = 0;
-            for (FaceId face_id = 0; face_id < face_is_owned.numberOfItems(); ++face_id) {
-              number_of_nodes_per_faces += face_is_owned[face_id] * node_value_per_face.numberOfSubValues(face_id);
-            }
-            return parallel::allReduceSum(number_of_nodes_per_faces);
-          }();
+          FaceValue<R2> face_value{connectivity};
+          parallel_for(
+            mesh_2d->numberOfFaces(), PUGS_LAMBDA(FaceId face_id) {
+              face_value[face_id] = node_value_per_face[face_id][0];
+              for (size_t i_node = 1; i_node < node_value_per_face.numberOfSubValues(face_id); ++i_node) {
+                face_value[face_id] += node_value_per_face[face_id][i_node];
+              }
+            });
 
-          REQUIRE(sum(node_value_per_face) == global_number_of_nodes_per_faces * data);
+          REQUIRE(sum(node_value_per_face) == sum(face_value));
         }
       }
     }
@@ -403,6 +410,42 @@ TEST_CASE("SubItemValuePerItemUtils", "[mesh]")
 
           REQUIRE(sum(edge_value_per_face) == global_number_of_edges_per_faces * data);
         }
+
+        SECTION(named_mesh.name() + " for R^3x2 float")
+        {
+          auto mesh_3d = named_mesh.mesh();
+
+          const Connectivity<3>& connectivity = mesh_3d->connectivity();
+
+          auto xr = mesh_3d->xr();
+
+          using R3x2 = TinyMatrix<3, 2, float>;
+          EdgeValuePerNode<R3x2> edge_value_per_node{connectivity};
+
+          parallel_for(
+            mesh_3d->numberOfNodes(), PUGS_LAMBDA(NodeId node_id) {
+              for (size_t i_edge = 0; i_edge < edge_value_per_node.numberOfSubValues(node_id); ++i_edge) {
+                R3x2 value((2.3 * i_edge + 1.7) * xr[node_id][0],                    //
+                           (3. - i_edge) * xr[node_id][1],                           //
+                           (2 * i_edge + 1.3) * xr[node_id][2],                      //
+                           (i_edge + 1.7) * xr[node_id][0] + 3.2 * xr[node_id][1],   //
+                           xr[node_id][2] + 3 * i_edge,                              //
+                           7.13 * i_edge * i_edge * l2Norm(xr[node_id]));
+                edge_value_per_node[node_id][i_edge] = value;
+              }
+            });
+
+          NodeValue<R3x2> node_value{connectivity};
+          parallel_for(
+            mesh_3d->numberOfNodes(), PUGS_LAMBDA(NodeId node_id) {
+              node_value[node_id] = edge_value_per_node[node_id][0];
+              for (size_t i_edge = 1; i_edge < edge_value_per_node.numberOfSubValues(node_id); ++i_edge) {
+                node_value[node_id] += edge_value_per_node[node_id][i_edge];
+              }
+            });
+
+          REQUIRE(sum(edge_value_per_node) == sum(node_value));
+        }
       }
     }
   }