Skip to content
Snippets Groups Projects
Select Git revision
  • 77e6ceeb543b13a9afed245086ecd300db7aa878
  • develop default protected
  • feature/gmsh-reader
  • origin/stage/bouguettaia
  • feature/kinetic-schemes
  • feature/reconstruction
  • feature/local-dt-fsi
  • feature/composite-scheme-sources
  • feature/composite-scheme-other-fluxes
  • feature/serraille
  • feature/variational-hydro
  • feature/composite-scheme
  • hyperplastic
  • feature/polynomials
  • feature/gks
  • feature/implicit-solver-o2
  • feature/coupling_module
  • feature/implicit-solver
  • feature/merge-local-dt-fsi
  • master protected
  • feature/escobar-smoother
  • v0.5.0 protected
  • v0.4.1 protected
  • v0.4.0 protected
  • v0.3.0 protected
  • v0.2.0 protected
  • v0.1.0 protected
  • Kidder
  • v0.0.4 protected
  • v0.0.3 protected
  • v0.0.2 protected
  • v0 protected
  • v0.0.1 protected
33 results

BoundaryConditionDescriptor.hpp

Blame
  • ReproducibleSumUtils.hpp 25.37 KiB
    #ifndef REPRODUCIBLE_SUM_UTILS_HPP
    #define REPRODUCIBLE_SUM_UTILS_HPP
    
    #include <utils/PugsUtils.hpp>
    #include <utils/Types.hpp>
    
    template <typename DataType>
    class Array;
    
    namespace reproducible_sum_utils
    {
    template <size_t NumberOfBits>
    struct IntegerFromBitSize
    {
    };
    
    template <>
    struct IntegerFromBitSize<8>
    {
      using integer_t = std::int8_t;
    };
    
    template <>
    struct IntegerFromBitSize<16>
    {
      using integer_t = std::int16_t;
    };
    
    template <>
    struct IntegerFromBitSize<32>
    {
      using integer_t = std::int32_t;
    };
    
    template <>
    struct IntegerFromBitSize<64>
    {
      using integer_t = std::int64_t;
    };
    
    template <typename DataType>
    struct IntegerType
    {
      using integer_t = typename IntegerFromBitSize<sizeof(DataType) * 8>::integer_t;
    };
    
    template <typename DataType>
    DataType
    ulp(const DataType& x) noexcept(NO_ASSERT)
    {
      static_assert(std::is_floating_point_v<DataType>, "expecting floating point value");
    
      if (x == 0) {
        return std::numeric_limits<DataType>::denorm_min();
      }
    
      return std::pow(DataType{2}, std::ilogb(std::abs(x)) - std::numeric_limits<DataType>::digits);
    }
    
    template <typename DataType>
    DataType
    ufp(const DataType& x) noexcept(NO_ASSERT)
    {
      static_assert(std::is_floating_point_v<DataType>, "expecting floating point value");
    
      return std::pow(DataType{2}, std::ilogb(std::abs(x)));
    }
    
    // Useful bits per bin
    template <typename DataType>
    constexpr inline size_t bin_size = 0;
    template <>
    constexpr inline size_t bin_size<double> = 40;
    template <>
    constexpr inline size_t bin_size<float> = 12;
    
    // IEEE 754 epsilon
    template <typename DataType>
    constexpr inline double bin_epsilon = 0;
    template <>
    constexpr inline double bin_epsilon<double> = std::numeric_limits<double>::epsilon();
    template <>
    constexpr inline double bin_epsilon<float> = std::numeric_limits<float>::epsilon();
    
    // number of bins: improves precision
    template <typename DataType>
    constexpr inline size_t bin_number = 0;
    
    template <>
    constexpr inline size_t bin_number<double> = 3;
    template <>
    constexpr inline size_t bin_number<float> = 4;
    
    // max local sum size to avoid overflow
    template <typename DataType>
    constexpr inline size_t bin_max_size = 0;
    
    template <>
    constexpr inline size_t bin_max_size<double> = 2048;
    template <>
    constexpr inline size_t bin_max_size<float> = 1024;
    
    struct NoMask
    {
      PUGS_INLINE bool
      operator[](size_t) const
      {
        return true;
      }
    };
    
    }   // namespace reproducible_sum_utils
    
    template <typename ArrayT, typename MaskType = reproducible_sum_utils::NoMask>
    class ReproducibleScalarSum
    {
     public:
      using DataType = std::decay_t<typename ArrayT::data_type>;
      static_assert(std::is_floating_point_v<DataType>);
    
      static constexpr size_t K     = reproducible_sum_utils::bin_number<DataType>;
      static constexpr DataType eps = reproducible_sum_utils::bin_epsilon<DataType>;
      static constexpr size_t W     = reproducible_sum_utils::bin_size<DataType>;
    
      struct Bin
      {
        std::array<DataType, K> S;   // sum
        std::array<DataType, K> C;   // carry
    
        Bin& operator=(const Bin&) = default;
        Bin& operator=(Bin&&)      = default;
    
        Bin(Bin&&)      = default;
        Bin(const Bin&) = default;
    
        Bin(ZeroType)
        {
          for (size_t k = 0; k < K; ++k) {
            S[k] = 0.75 * eps * std::pow(DataType{2}, (K - k - 1) * W);
            C[k] = 0;
          }
        }
    
        Bin() = default;
    
        ~Bin() = default;
      };
    
     private:
      Bin m_summation_bin;
    
      PUGS_INLINE
      static void
      _shift(const size_t g, Bin& bin) noexcept(NO_ASSERT)
      {
        using namespace reproducible_sum_utils;
    
        for (size_t k = K - 1; k >= g; --k) {
          bin.S[k] = bin.S[k - g];
          bin.C[k] = bin.C[k - g];
        }
        for (size_t k = 0; k < std::min(K, g); ++k) {
          bin.S[k] = 1.5 * std::pow(DataType{2}, g * W) * ufp(bin.S[k]);
          bin.C[k] = 0;
        }
      }
    
      PUGS_INLINE static void
      _update(const DataType& m, Bin& bin) noexcept(NO_ASSERT)
      {
        Assert(m >= 0);
    
        using namespace reproducible_sum_utils;
        if (m >= std::pow(DataType{2}, W - 1.) * ulp(bin.S[0])) {
          const size_t shift = 1 + std::floor(std::log2(m / (std::pow(DataType{2}, W - 1.) * ulp(bin.S[0]))) / W);
    
          _shift(shift, bin);
        }
      }
    
      PUGS_INLINE
      void static _split2(DataType& S, DataType& x) noexcept(NO_ASSERT)
      {
        using namespace reproducible_sum_utils;
        union
        {
          DataType as_DataType;
          typename IntegerType<DataType>::integer_t as_integer;
        } x_bar;
    
        x_bar.as_DataType = x;
        x_bar.as_integer |= 0x1;
    
        const DataType S0 = S;
        S += x_bar.as_DataType;
        x -= S - S0;
      }
    
      PUGS_INLINE static void
      _renormalize(Bin& bin) noexcept(NO_ASSERT)
      {
        using namespace reproducible_sum_utils;
    
        for (size_t k = 0; k < K; ++k) {
          if (bin.S[k] >= 1.75 * ufp(bin.S[k])) {
            bin.S[k] -= 0.25 * ufp(bin.S[k]);
            bin.C[k] += 1;
          } else if (bin.S[k] < 1.25 * ufp(bin.S[k])) {
            bin.S[k] += 0.5 * ufp(bin.S[k]);
            bin.C[k] -= 2;
          } else if (bin.S[k] < 1.5 * ufp(bin.S[k])) {
            bin.S[k] += 0.25 * ufp(bin.S[k]);
            bin.C[k] -= 1;
          }
        }
      }
    
     public:
      static void
      addBinTo(Bin& bin, Bin& bin_sum)
      {
        using namespace reproducible_sum_utils;
    
        DataType ulp_bin = ulp(bin.S[0]);
        DataType ulp_sum = ulp(bin_sum.S[0]);
    
        if (ulp_bin < ulp_sum) {
          const size_t shift = std::floor(std::log2(ulp_sum / ulp_bin) / W);
          if (shift > 0) {
            _shift(shift, bin);
          }
        } else if (ulp_bin > ulp_sum) {
          const size_t shift = std::floor(std::log2(ulp_bin / ulp_sum) / W);
          if (shift > 0) {
            _shift(shift, bin_sum);
          }
        }
    
        for (size_t k = 0; k < K; ++k) {
          bin_sum.S[k] += bin.S[k] - 1.5 * ufp(bin.S[k]);
          bin_sum.C[k] += bin.C[k];
        }
    
        _renormalize(bin_sum);
      }
    
      PUGS_INLINE
      static DataType
      getValue(const Bin& bin) noexcept(NO_ASSERT)
      {
        using namespace reproducible_sum_utils;
    
        DataType value = 0;
        for (size_t k = 0; k < K; ++k) {
          value += 0.25 * (bin.C[k] - 6) * ufp(bin.S[k]) + bin.S[k];
        }
        return value;
      }
    
      PUGS_INLINE Bin
      getSummationBin() const noexcept(NO_ASSERT)
      {
        return m_summation_bin;
      }
    
      PUGS_INLINE DataType
      getSum() const noexcept(NO_ASSERT)
      {
        return getValue(m_summation_bin);
      }
    
      ReproducibleScalarSum(const ArrayT& array, const MaskType mask = reproducible_sum_utils::NoMask{})
      {
        if constexpr (not std::is_same_v<MaskType, reproducible_sum_utils::NoMask>) {
          static_assert(std::is_same_v<std::decay_t<typename MaskType::data_type>, bool>,
                        "when provided, mask must be an array of bool");
        }
    
        using TeamPolicyT = Kokkos::TeamPolicy<Kokkos::IndexType<int>>;
        using TeamMemberT = TeamPolicyT::member_type;
    
        int nx = reproducible_sum_utils::bin_max_size<DataType>;
        int ny = std::max(array.size() / nx, 1ul);
    
        const TeamPolicyT policy(ny, Kokkos::AUTO());
    
        Array<DataType> thread_sum(policy.team_size() * policy.league_size());
    
        Array<Bin> bin_by_thread(policy.team_size() * policy.league_size());
        bin_by_thread.fill(zero);
    
        Array<DataType> local_max(policy.team_size() * policy.league_size());
        local_max.fill(0);
    
        parallel_for(
          policy, PUGS_LAMBDA(const TeamMemberT& member) {
            const int i_team   = member.league_rank();
            const int i_thread = member.team_rank();
    
            const int thread_id = i_team * member.team_size() + i_thread;
    
            const int league_start = nx * i_team;
            const int block_size   = [&] {
              int size = nx;
              if (i_team == member.league_size() - 1) {
                size = array.size() - league_start;
              }
              return size;
            }();
    
            parallel_for(
              Kokkos::TeamThreadRange(member, block_size), PUGS_LAMBDA(int i) {
                if (mask[league_start + i]) {
                  DataType& m        = local_max[thread_id];
                  DataType abs_value = std::abs(array[league_start + i]);
                  if (abs_value > m) {
                    m = abs_value;
                  }
                }
              });
    
            _update(local_max[thread_id], bin_by_thread[thread_id]);
    
            parallel_for(
              Kokkos::TeamThreadRange(member, block_size), PUGS_LAMBDA(int i) {
                if (mask[league_start + i]) {
                  DataType x = array[nx * i_team + i];
                  for (size_t k = 0; k < K; ++k) {
                    _split2(bin_by_thread[thread_id].S[k], x);
                  };
                }
              });
    
            _renormalize(bin_by_thread[thread_id]);
          });
    
        m_summation_bin = bin_by_thread[0];
        for (size_t i = 1; i < bin_by_thread.size(); ++i) {
          addBinTo(bin_by_thread[i], m_summation_bin);
        }
      }
    
      ~ReproducibleScalarSum() = default;
    };
    
    template <typename ArrayT, typename MaskType = reproducible_sum_utils::NoMask>
    class ReproducibleTinyVectorSum
    {
     public:
      using TinyVectorType = std::decay_t<typename ArrayT::data_type>;
      static_assert(is_tiny_vector_v<TinyVectorType>);
    
      using DataType = std::decay_t<typename TinyVectorType::data_type>;
      static_assert(std::is_floating_point_v<DataType>);
    
      static constexpr size_t K     = reproducible_sum_utils::bin_number<DataType>;
      static constexpr DataType eps = reproducible_sum_utils::bin_epsilon<DataType>;
      static constexpr size_t W     = reproducible_sum_utils::bin_size<DataType>;
    
      struct Bin
      {
        std::array<TinyVectorType, K> S;   // sum
        std::array<TinyVectorType, K> C;   // carry
    
        Bin& operator=(const Bin&) = default;
        Bin& operator=(Bin&&)      = default;
    
        Bin(Bin&&)      = default;
        Bin(const Bin&) = default;
    
        Bin(ZeroType)
        {
          for (size_t k = 0; k < K; ++k) {
            const DataType init_value = 0.75 * eps * std::pow(DataType{2}, (K - k - 1) * W);
            for (size_t i_component = 0; i_component < TinyVectorType::Dimension; ++i_component) {
              S[k][i_component] = init_value;
            }
            C[k] = zero;
          }
        }
    
        Bin() = default;
    
        ~Bin() = default;
      };
    
     private:
      Bin m_summation_bin;
    
      PUGS_INLINE
      static void
      _shift(const size_t g, Bin& bin, const size_t& i_component) noexcept(NO_ASSERT)
      {
        using namespace reproducible_sum_utils;
    
        for (size_t k = K - 1; k >= g; --k) {
          bin.S[k][i_component] = bin.S[k - g][i_component];
          bin.C[k][i_component] = bin.C[k - g][i_component];
        }
        for (size_t k = 0; k < std::min(K, g); ++k) {
          bin.S[k][i_component] = 1.5 * std::pow(DataType{2}, g * W) * ufp(bin.S[k][i_component]);
          bin.C[k][i_component] = 0;
        }
      }
    
      PUGS_INLINE static void
      _update(const TinyVectorType& m, Bin& bin) noexcept(NO_ASSERT)
      {
        using namespace reproducible_sum_utils;
        for (size_t i_component = 0; i_component < TinyVectorType::Dimension; ++i_component) {
          if (m[i_component] >= std::pow(DataType{2}, W - 1.) * ulp(bin.S[0][i_component])) {
            const size_t shift =
              1 + std::floor(std::log2(m[i_component] / (std::pow(DataType{2}, W - 1.) * ulp(bin.S[0][i_component]))) / W);
    
            _shift(shift, bin, i_component);
          }
        }
      }
    
      PUGS_INLINE
      void static _split2(TinyVectorType& S, TinyVectorType& x) noexcept(NO_ASSERT)
      {
        using namespace reproducible_sum_utils;
        union
        {
          DataType as_DataType;
          typename IntegerType<DataType>::integer_t as_integer;
        } x_bar;
    
        for (size_t i_component = 0; i_component < TinyVectorType::Dimension; ++i_component) {
          x_bar.as_DataType = x[i_component];
          x_bar.as_integer |= 0x1;
    
          DataType& S_i = S[i_component];
          DataType& x_i = x[i_component];
    
          const DataType S0 = S_i;
          S_i += x_bar.as_DataType;
          x_i -= S_i - S0;
        }
      }
    
      PUGS_INLINE static void
      _renormalize(Bin& bin) noexcept(NO_ASSERT)
      {
        using namespace reproducible_sum_utils;
    
        for (size_t k = 0; k < K; ++k) {
          TinyVectorType& S_k = bin.S[k];
          TinyVectorType& C_k = bin.C[k];
          for (size_t i_component = 0; i_component < TinyVectorType::Dimension; ++i_component) {
            DataType& Sk_i = S_k[i_component];
            DataType& Ck_i = C_k[i_component];
    
            if (Sk_i >= 1.75 * ufp(Sk_i)) {
              Sk_i -= 0.25 * ufp(Sk_i);
              Ck_i += 1;
            } else if (Sk_i < 1.25 * ufp(Sk_i)) {
              Sk_i += 0.5 * ufp(Sk_i);
              Ck_i -= 2;
            } else if (Sk_i < 1.5 * ufp(Sk_i)) {
              Sk_i += 0.25 * ufp(Sk_i);
              Ck_i -= 1;
            }
          }
        }
      }
    
     public:
      static void
      addBinTo(Bin& bin, Bin& bin_sum)
      {
        using namespace reproducible_sum_utils;
    
        for (size_t i_component = 0; i_component < TinyVectorType::Dimension; ++i_component) {
          DataType ulp_bin = ulp(bin.S[0][i_component]);
          DataType ulp_sum = ulp(bin_sum.S[0][i_component]);
    
          if (ulp_bin < ulp_sum) {
            const size_t shift = std::floor(std::log2(ulp_sum / ulp_bin) / W);
            if (shift > 0) {
              _shift(shift, bin, i_component);
            }
          } else if (ulp_bin > ulp_sum) {
            const size_t shift = std::floor(std::log2(ulp_bin / ulp_sum) / W);
            if (shift > 0) {
              _shift(shift, bin_sum, i_component);
            }
          }
    
          for (size_t k = 0; k < K; ++k) {
            bin_sum.S[k][i_component] += bin.S[k][i_component] - 1.5 * ufp(bin.S[k][i_component]);
            bin_sum.C[k][i_component] += bin.C[k][i_component];
          }
        }
    
        _renormalize(bin_sum);
      }
    
      PUGS_INLINE
      static TinyVectorType
      getValue(const Bin& bin) noexcept(NO_ASSERT)
      {
        using namespace reproducible_sum_utils;
    
        TinyVectorType value = zero;
        for (size_t i_component = 0; i_component < TinyVectorType::Dimension; ++i_component) {
          for (size_t k = 0; k < K; ++k) {
            value[i_component] += 0.25 * (bin.C[k][i_component] - 6) * ufp(bin.S[k][i_component]) + bin.S[k][i_component];
          }
        }
        return value;
      }
    
      PUGS_INLINE Bin
      getSummationBin() const noexcept(NO_ASSERT)
      {
        return m_summation_bin;
      }
    
      PUGS_INLINE TinyVectorType
      getSum() const noexcept(NO_ASSERT)
      {
        return getValue(m_summation_bin);
      }
    
      ReproducibleTinyVectorSum(const ArrayT& array, const MaskType mask = reproducible_sum_utils::NoMask{})
      {
        if constexpr (not std::is_same_v<MaskType, reproducible_sum_utils::NoMask>) {
          static_assert(std::is_same_v<std::decay_t<typename MaskType::data_type>, bool>,
                        "when provided, mask must be an array of bool");
        }
    
        using TeamPolicyT = Kokkos::TeamPolicy<Kokkos::IndexType<int>>;
        using TeamMemberT = TeamPolicyT::member_type;
    
        int nx = reproducible_sum_utils::bin_max_size<DataType>;
        int ny = std::max(array.size() / nx, 1ul);
    
        const TeamPolicyT policy(ny, Kokkos::AUTO());
    
        Array<TinyVectorType> thread_sum(policy.team_size() * policy.league_size());
    
        Array<Bin> bin_by_thread(policy.team_size() * policy.league_size());
        bin_by_thread.fill(zero);
    
        Array<TinyVectorType> local_max(policy.team_size() * policy.league_size());
        local_max.fill(zero);
    
        parallel_for(
          policy, PUGS_LAMBDA(const TeamMemberT& member) {
            const int i_team   = member.league_rank();
            const int i_thread = member.team_rank();
    
            const int thread_id = i_team * member.team_size() + i_thread;
    
            const int league_start = nx * i_team;
            const int block_size   = [&] {
              int size = nx;
              if (i_team == member.league_size() - 1) {
                size = array.size() - league_start;
              }
              return size;
            }();
    
            parallel_for(
              Kokkos::TeamThreadRange(member, block_size), PUGS_LAMBDA(int i) {
                if (mask[league_start + i]) {
                  for (size_t i_component = 0; i_component < TinyVectorType::Dimension; ++i_component) {
                    DataType& m        = local_max[thread_id][i_component];
                    DataType abs_value = std::abs(array[league_start + i][i_component]);
                    if (abs_value > m) {
                      m = abs_value;
                    }
                  }
                }
              });
    
            _update(local_max[thread_id], bin_by_thread[thread_id]);
    
            parallel_for(
              Kokkos::TeamThreadRange(member, block_size), PUGS_LAMBDA(int i) {
                if (mask[league_start + i]) {
                  TinyVectorType x = array[nx * i_team + i];
                  for (size_t k = 0; k < K; ++k) {
                    _split2(bin_by_thread[thread_id].S[k], x);
                  }
                }
              });
    
            _renormalize(bin_by_thread[thread_id]);
          });
    
        m_summation_bin = bin_by_thread[0];
        for (size_t i = 1; i < bin_by_thread.size(); ++i) {
          addBinTo(bin_by_thread[i], m_summation_bin);
        }
      }
    
      ~ReproducibleTinyVectorSum() = default;
    };
    
    template <typename ArrayT, typename MaskType = reproducible_sum_utils::NoMask>
    class ReproducibleTinyMatrixSum
    {
     public:
      using TinyMatrixType = std::decay_t<typename ArrayT::data_type>;
      static_assert(is_tiny_matrix_v<TinyMatrixType>);
    
      using DataType = std::decay_t<typename TinyMatrixType::data_type>;
      static_assert(std::is_floating_point_v<DataType>);
    
      static constexpr size_t K     = reproducible_sum_utils::bin_number<DataType>;
      static constexpr DataType eps = reproducible_sum_utils::bin_epsilon<DataType>;
      static constexpr size_t W     = reproducible_sum_utils::bin_size<DataType>;
    
      struct Bin
      {
        std::array<TinyMatrixType, K> S;   // sum
        std::array<TinyMatrixType, K> C;   // carry
    
        Bin& operator=(const Bin&) = default;
        Bin& operator=(Bin&&)      = default;
    
        Bin(Bin&&)      = default;
        Bin(const Bin&) = default;
    
        Bin(ZeroType)
        {
          for (size_t k = 0; k < K; ++k) {
            const DataType init_value = 0.75 * eps * std::pow(DataType{2}, (K - k - 1) * W);
            for (size_t i_component = 0; i_component < TinyMatrixType::NumberOfRows; ++i_component) {
              for (size_t j_component = 0; j_component < TinyMatrixType::NumberOfColumns; ++j_component) {
                S[k](i_component, j_component) = init_value;
              }
            }
            C[k] = zero;
          }
        }
    
        Bin()  = default;
        ~Bin() = default;
      };
    
     private:
      Bin m_summation_bin;
    
      PUGS_INLINE
      static void
      _shift(const size_t g, Bin& bin, const size_t i_component, const size_t j_component) noexcept(NO_ASSERT)
      {
        using namespace reproducible_sum_utils;
    
        for (size_t k = K - 1; k >= g; --k) {
          bin.S[k](i_component, j_component) = bin.S[k - g](i_component, j_component);
          bin.C[k](i_component, j_component) = bin.C[k - g](i_component, j_component);
        }
        for (size_t k = 0; k < std::min(K, g); ++k) {
          bin.S[k](i_component, j_component) = 1.5 * std::pow(DataType{2}, g * W) * ufp(bin.S[k](i_component, j_component));
          bin.C[k](i_component, j_component) = 0;
        }
      }
    
      PUGS_INLINE static void
      _update(const TinyMatrixType& m, Bin& bin) noexcept(NO_ASSERT)
      {
        using namespace reproducible_sum_utils;
        for (size_t i_component = 0; i_component < TinyMatrixType::NumberOfRows; ++i_component) {
          for (size_t j_component = 0; j_component < TinyMatrixType::NumberOfColumns; ++j_component) {
            if (m(i_component, j_component) >= std::pow(DataType{2}, W - 1.) * ulp(bin.S[0](i_component, j_component))) {
              const size_t shift =
                1 + std::floor(std::log2(m(i_component, j_component) /
                                         (std::pow(DataType{2}, W - 1.) * ulp(bin.S[0](i_component, j_component)))) /
                               W);
    
              _shift(shift, bin, i_component, j_component);
            }
          }
        }
      }
    
      PUGS_INLINE
      void static _split2(TinyMatrixType& S, TinyMatrixType& x) noexcept(NO_ASSERT)
      {
        using namespace reproducible_sum_utils;
        union
        {
          DataType as_DataType;
          typename IntegerType<DataType>::integer_t as_integer;
        } x_bar;
    
        for (size_t i_component = 0; i_component < TinyMatrixType::NumberOfRows; ++i_component) {
          for (size_t j_component = 0; j_component < TinyMatrixType::NumberOfColumns; ++j_component) {
            DataType& S_ij = S(i_component, j_component);
            DataType& x_ij = x(i_component, j_component);
    
            x_bar.as_DataType = x_ij;
            x_bar.as_integer |= 0x1;
    
            const DataType S0 = S_ij;
            S_ij += x_bar.as_DataType;
            x_ij -= S_ij - S0;
          }
        }
      }
    
      PUGS_INLINE static void
      _renormalize(Bin& bin) noexcept(NO_ASSERT)
      {
        using namespace reproducible_sum_utils;
    
        for (size_t k = 0; k < K; ++k) {
          TinyMatrixType& S_k = bin.S[k];
          TinyMatrixType& C_k = bin.C[k];
          for (size_t i_component = 0; i_component < TinyMatrixType::NumberOfRows; ++i_component) {
            for (size_t j_component = 0; j_component < TinyMatrixType::NumberOfColumns; ++j_component) {
              DataType& Sk_ij = S_k(i_component, j_component);
              DataType& Ck_ij = C_k(i_component, j_component);
    
              if (Sk_ij >= 1.75 * ufp(Sk_ij)) {
                Sk_ij -= 0.25 * ufp(Sk_ij);
                Ck_ij += 1;
              } else if (Sk_ij < 1.25 * ufp(Sk_ij)) {
                Sk_ij += 0.5 * ufp(Sk_ij);
                Ck_ij -= 2;
              } else if (Sk_ij < 1.5 * ufp(Sk_ij)) {
                Sk_ij += 0.25 * ufp(Sk_ij);
                Ck_ij -= 1;
              }
            }
          }
        }
      }
    
     public:
      static void
      addBinTo(Bin& bin, Bin& bin_sum)
      {
        using namespace reproducible_sum_utils;
    
        for (size_t i_component = 0; i_component < TinyMatrixType::NumberOfRows; ++i_component) {
          for (size_t j_component = 0; j_component < TinyMatrixType::NumberOfColumns; ++j_component) {
            DataType ulp_bin = ulp(bin.S[0](i_component, j_component));
            DataType ulp_sum = ulp(bin_sum.S[0](i_component, j_component));
    
            if (ulp_bin < ulp_sum) {
              const size_t shift = std::floor(std::log2(ulp_sum / ulp_bin) / W);
              if (shift > 0) {
                _shift(shift, bin, i_component, j_component);
              }
            } else if (ulp_bin > ulp_sum) {
              const size_t shift = std::floor(std::log2(ulp_bin / ulp_sum) / W);
              if (shift > 0) {
                _shift(shift, bin_sum, i_component, j_component);
              }
            }
    
            for (size_t k = 0; k < K; ++k) {
              bin_sum.S[k](i_component, j_component) +=
                bin.S[k](i_component, j_component) - 1.5 * ufp(bin.S[k](i_component, j_component));
              bin_sum.C[k](i_component, j_component) += bin.C[k](i_component, j_component);
            }
          }
        }
    
        _renormalize(bin_sum);
      }
    
      PUGS_INLINE
      static TinyMatrixType
      getValue(const Bin& bin) noexcept(NO_ASSERT)
      {
        using namespace reproducible_sum_utils;
    
        TinyMatrixType value = zero;
        for (size_t i_component = 0; i_component < TinyMatrixType::NumberOfRows; ++i_component) {
          for (size_t j_component = 0; j_component < TinyMatrixType::NumberOfColumns; ++j_component) {
            for (size_t k = 0; k < K; ++k) {
              value(i_component, j_component) +=
                0.25 * (bin.C[k](i_component, j_component) - 6) * ufp(bin.S[k](i_component, j_component)) +
                bin.S[k](i_component, j_component);
            }
          }
        }
        return value;
      }
    
      PUGS_INLINE Bin
      getSummationBin() const noexcept(NO_ASSERT)
      {
        return m_summation_bin;
      }
    
      PUGS_INLINE TinyMatrixType
      getSum() const noexcept(NO_ASSERT)
      {
        return getValue(m_summation_bin);
      }
    
      ReproducibleTinyMatrixSum(const ArrayT& array, const MaskType mask = reproducible_sum_utils::NoMask{})
      {
        if constexpr (not std::is_same_v<MaskType, reproducible_sum_utils::NoMask>) {
          static_assert(std::is_same_v<std::decay_t<typename MaskType::data_type>, bool>,
                        "when provided, mask must be an array of bool");
        }
    
        using TeamPolicyT = Kokkos::TeamPolicy<Kokkos::IndexType<int>>;
        using TeamMemberT = TeamPolicyT::member_type;
    
        int nx = reproducible_sum_utils::bin_max_size<DataType>;
        int ny = std::max(array.size() / nx, 1ul);
    
        const TeamPolicyT policy(ny, Kokkos::AUTO());
    
        Array<TinyMatrixType> thread_sum(policy.team_size() * policy.league_size());
    
        Array<Bin> bin_by_thread(policy.team_size() * policy.league_size());
        bin_by_thread.fill(zero);
    
        Array<TinyMatrixType> local_max(policy.team_size() * policy.league_size());
        local_max.fill(zero);
    
        parallel_for(
          policy, PUGS_LAMBDA(const TeamMemberT& member) {
            const int i_team   = member.league_rank();
            const int i_thread = member.team_rank();
    
            const int thread_id = i_team * member.team_size() + i_thread;
    
            const int league_start = nx * i_team;
            const int block_size   = [&] {
              int size = nx;
              if (i_team == member.league_size() - 1) {
                size = array.size() - league_start;
              }
              return size;
            }();
    
            parallel_for(
              Kokkos::TeamThreadRange(member, block_size), PUGS_LAMBDA(int i) {
                if (mask[league_start + i]) {
                  for (size_t i_component = 0; i_component < TinyMatrixType::NumberOfRows; ++i_component) {
                    for (size_t j_component = 0; j_component < TinyMatrixType::NumberOfColumns; ++j_component) {
                      DataType& m        = local_max[thread_id](i_component, j_component);
                      DataType abs_value = std::abs(array[league_start + i](i_component, j_component));
                      if (abs_value > m) {
                        m = abs_value;
                      }
                    }
                  }
                }
              });
    
            _update(local_max[thread_id], bin_by_thread[thread_id]);
    
            parallel_for(
              Kokkos::TeamThreadRange(member, block_size), PUGS_LAMBDA(int i) {
                if (mask[league_start + i]) {
                  TinyMatrixType x = array[nx * i_team + i];
                  for (size_t k = 0; k < K; ++k) {
                    _split2(bin_by_thread[thread_id].S[k], x);
                  }
                }
              });
    
            _renormalize(bin_by_thread[thread_id]);
          });
    
        m_summation_bin = bin_by_thread[0];
        for (size_t i = 1; i < bin_by_thread.size(); ++i) {
          addBinTo(bin_by_thread[i], m_summation_bin);
        }
      }
    
      ~ReproducibleTinyMatrixSum() = default;
    };
    
    #endif   // REPRODUCIBLE_SUM_UTILS_HPP