diff --git a/src/algebra/CMakeLists.txt b/src/algebra/CMakeLists.txt
index d1f01ec310858be0e775514f934ce0a7c0e9da28..5625ef41d001a72f06bf3ebfc7e0237590743e3c 100644
--- a/src/algebra/CMakeLists.txt
+++ b/src/algebra/CMakeLists.txt
@@ -2,6 +2,7 @@
 
 add_library(
   PugsAlgebra
+  LeastSquareSolver.cpp
   LinearSolver.cpp
   LinearSolverOptions.cpp
   PETScWrapper.cpp)
diff --git a/src/algebra/LeastSquareSolver.cpp b/src/algebra/LeastSquareSolver.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..aba9ff3dae12772b1cc4e653eb8e304d3afb3784
--- /dev/null
+++ b/src/algebra/LeastSquareSolver.cpp
@@ -0,0 +1,26 @@
+#include <algebra/LeastSquareSolver.hpp>
+
+#include <algebra/CG.hpp>
+
+template <typename T>
+void
+LeastSquareSolver::solveLocalSystem(const LocalRectangularMatrix<T>& A, Vector<T>& x, const Vector<T>& b)
+{
+  if (A.dim0() >= A.dim1()) {
+    const LocalRectangularMatrix tA = transpose(A);
+    const LocalRectangularMatrix B  = tA * A;
+    const Vector y                  = tA * b;
+    CG{B, x, y, 1e-12, 10, false};
+  } else {
+    const LocalRectangularMatrix tA = transpose(A);
+    const LocalRectangularMatrix B  = A * tA;
+    Vector<double> y{A.dim0()};
+    CG{B, y, b, 1e-12, 10, false};
+
+    x = tA * y;
+  }
+}
+
+template void LeastSquareSolver::solveLocalSystem(const LocalRectangularMatrix<double>&,
+                                                  Vector<double>&,
+                                                  const Vector<double>&);
diff --git a/src/algebra/LeastSquareSolver.hpp b/src/algebra/LeastSquareSolver.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b097e940c5e0de5d894392d2d25f0ca03d5149cb
--- /dev/null
+++ b/src/algebra/LeastSquareSolver.hpp
@@ -0,0 +1,17 @@
+#ifndef LEAST_SQUARE_SOLVER_HPP
+#define LEAST_SQUARE_SOLVER_HPP
+
+#include <algebra/LocalRectangularMatrix.hpp>
+#include <algebra/Vector.hpp>
+
+class LeastSquareSolver
+{
+ public:
+  template <typename T>
+  void solveLocalSystem(const LocalRectangularMatrix<T>& A, Vector<T>& x, const Vector<T>& b);
+
+  LeastSquareSolver()  = default;
+  ~LeastSquareSolver() = default;
+};
+
+#endif   // LEAST_SQUARE_SOLVER_HPP
diff --git a/src/language/algorithms/ElasticityDiamondAlgorithm.cpp b/src/language/algorithms/ElasticityDiamondAlgorithm.cpp
index 755d979314b34b87af98cbf521b5eae17d616812..949397d46fb7502a16b0c8f88e7abb1ae5e55585 100644
--- a/src/language/algorithms/ElasticityDiamondAlgorithm.cpp
+++ b/src/language/algorithms/ElasticityDiamondAlgorithm.cpp
@@ -1,7 +1,7 @@
 #include <language/algorithms/ElasticityDiamondAlgorithm.hpp>
 
-#include <algebra/CG.hpp>
 #include <algebra/CRSMatrix.hpp>
+#include <algebra/LeastSquareSolver.hpp>
 #include <algebra/LinearSolver.hpp>
 #include <algebra/LocalRectangularMatrix.hpp>
 #include <algebra/SparseMatrixDescriptor.hpp>
@@ -307,12 +307,12 @@ ElasticityDiamondScheme<Dimension>::ElasticityDiamondScheme(
             A(i, j)        = xj[J][i - 1];
           }
         }
+
         Vector<double> x{node_to_cell.size()};
         x = 0;
 
-        LocalRectangularMatrix B = transpose(A) * A;
-        Vector y                 = transpose(A) * b;
-        CG{B, x, y, 1e-12, 10, false};
+        LeastSquareSolver ls_solver;
+        ls_solver.solveLocalSystem(A, x, b);
 
         for (size_t j = 0; j < node_to_cell.size(); j++) {
           w_rj(i_node, j) = x[j];
@@ -357,9 +357,8 @@ ElasticityDiamondScheme<Dimension>::ElasticityDiamondScheme(
         Vector<double> x{node_to_cell.size() + nb_face_used};
         x = 0;
 
-        LocalRectangularMatrix B = transpose(A) * A;
-        Vector y                 = transpose(A) * b;
-        CG{B, x, y, 1e-12, 10, false};
+        LeastSquareSolver ls_solver;
+        ls_solver.solveLocalSystem(A, x, b);
 
         for (size_t j = 0; j < node_to_cell.size(); j++) {
           w_rj(i_node, j) = x[j];
@@ -726,7 +725,31 @@ ElasticityDiamondScheme<Dimension>::ElasticityDiamondScheme(
 
       VTKWriter vtk_writer("Speed_" + std::to_string(Dimension), 0.01);
 
-      vtk_writer.write(mesh, {NamedItemValue{"U", Speed}, NamedItemValue{"Uexacte", Uj}}, 0,
+      NodeValue<TinyVector<3>> ur3d{mesh->connectivity()};
+      ur3d.fill(zero);
+
+      parallel_for(
+        mesh->numberOfNodes(), PUGS_LAMBDA(NodeId node_id) {
+          TinyVector<Dimension> x = zero;
+          const auto node_cells   = node_to_cell_matrix[node_id];
+          for (size_t i_cell = 0; i_cell < node_cells.size(); ++i_cell) {
+            CellId cell_id = node_cells[i_cell];
+            x += w_rj(node_id, i_cell) * Speed[cell_id];
+          }
+          const auto node_faces = node_to_face_matrix[node_id];
+          for (size_t i_face = 0; i_face < node_faces.size(); ++i_face) {
+            FaceId face_id = node_faces[i_face];
+            if (primal_face_is_on_boundary[face_id]) {
+              x += w_rl(node_id, i_face) * Speed_face[face_id];
+            }
+          }
+          for (size_t i = 0; i < Dimension; ++i) {
+            ur3d[node_id][i] = x[i];
+          }
+        });
+
+      vtk_writer.write(mesh, {NamedItemValue{"U", Speed}, NamedItemValue{"Ur3d", ur3d}, NamedItemValue{"Uexacte", Uj}},
+                       0,
                        true);   // forces last output
     }
   } else {
diff --git a/src/language/algorithms/Heat5PointsAlgorithm.cpp b/src/language/algorithms/Heat5PointsAlgorithm.cpp
index 512a64dd2bc04e85c0bfd0bd6e97b1c28d5e8e86..7e968083c4c9ab52daf924699769df6baa696d82 100644
--- a/src/language/algorithms/Heat5PointsAlgorithm.cpp
+++ b/src/language/algorithms/Heat5PointsAlgorithm.cpp
@@ -1,7 +1,7 @@
 #include <language/algorithms/Heat5PointsAlgorithm.hpp>
 
-#include <algebra/CG.hpp>
 #include <algebra/CRSMatrix.hpp>
+#include <algebra/LeastSquareSolver.hpp>
 #include <algebra/LinearSolver.hpp>
 #include <algebra/LinearSolverOptions.hpp>
 #include <algebra/LocalRectangularMatrix.hpp>
@@ -67,10 +67,8 @@ Heat5PointsAlgorithm<Dimension>::Heat5PointsAlgorithm(
       Vector<double> x{node_to_cell.size()};
       x = 0;
 
-      LocalRectangularMatrix B = transpose(A) * A;
-      Vector y                 = transpose(A) * b;
-
-      CG{B, x, y, 1e-12, 10, true};
+      LeastSquareSolver ls_solver;
+      ls_solver.solveLocalSystem(A, x, b);
 
       Tr[i_node] = 0;
       for (size_t j = 0; j < node_to_cell.size(); j++) {
diff --git a/src/language/algorithms/HeatDiamondAlgorithm.cpp b/src/language/algorithms/HeatDiamondAlgorithm.cpp
index 68994e2525c9069ba08d178b7b57a7fb8208319e..dbefc34bffbe660d44937030155053f2411327e3 100644
--- a/src/language/algorithms/HeatDiamondAlgorithm.cpp
+++ b/src/language/algorithms/HeatDiamondAlgorithm.cpp
@@ -1,7 +1,7 @@
 #include <language/algorithms/HeatDiamondAlgorithm.hpp>
 
-#include <algebra/CG.hpp>
 #include <algebra/CRSMatrix.hpp>
+#include <algebra/LeastSquareSolver.hpp>
 #include <algebra/LinearSolver.hpp>
 #include <algebra/LocalRectangularMatrix.hpp>
 #include <algebra/SparseMatrixDescriptor.hpp>
@@ -270,9 +270,8 @@ HeatDiamondScheme<Dimension>::HeatDiamondScheme(
         Vector<double> x{node_to_cell.size()};
         x = 0;
 
-        LocalRectangularMatrix B = transpose(A) * A;
-        Vector y                 = transpose(A) * b;
-        CG{B, x, y, 1e-12, 10, false};
+        LeastSquareSolver ls_solver;
+        ls_solver.solveLocalSystem(A, x, b);
 
         Tr[i_node] = 0;
         for (size_t j = 0; j < node_to_cell.size(); j++) {
@@ -311,9 +310,8 @@ HeatDiamondScheme<Dimension>::HeatDiamondScheme(
         Vector<double> x{node_to_cell.size() + nb_face_used};
         x = 0;
 
-        LocalRectangularMatrix B = transpose(A) * A;
-        Vector y                 = transpose(A) * b;
-        CG{B, x, y, 1e-12, 10, false};
+        LeastSquareSolver ls_solver;
+        ls_solver.solveLocalSystem(A, x, b);
 
         Tr[i_node] = 0;
         for (size_t j = 0; j < node_to_cell.size(); j++) {
diff --git a/tests/test_CG.cpp b/tests/test_CG.cpp
index 9cf2b2a9d6a2e8f8744b395638976c8c2a4247c5..c334f4413f0766131b5387c5fd2f0112d1241102 100644
--- a/tests/test_CG.cpp
+++ b/tests/test_CG.cpp
@@ -110,117 +110,4 @@ TEST_CASE("CG", "[algebra]")
     Vector error = x - x_exact;
     REQUIRE(std::sqrt((error, error)) > 1E-10 * std::sqrt((x, x)));
   }
-
-  SECTION("Least Squares")
-  {
-    Vector<double> b{3};
-    b[0] = 1;
-    b[1] = 0;
-    b[2] = 0;
-
-    LocalRectangularMatrix<double> A{3, 4};
-    A(0, 0) = 1;
-    A(0, 1) = 1;
-    A(0, 2) = 1;
-    A(0, 3) = 1;
-    A(1, 0) = 1;
-    A(1, 1) = -1;
-    A(1, 2) = 0;
-    A(1, 3) = 0;
-    A(2, 0) = 0;
-    A(2, 1) = 0;
-    A(2, 2) = 1;
-    A(2, 3) = -1;
-
-    LocalRectangularMatrix B = transpose(A) * A;
-    Vector y                 = transpose(A) * b;
-
-    Vector<double> x_exact{4};
-    x_exact[0] = 0.25;
-    x_exact[1] = 0.25;
-    x_exact[2] = 0.25;
-    x_exact[3] = 0.25;
-
-    Vector<double> x{4};
-    x = 0;
-
-    CG{B, x, y, 1e-12, 10, false};
-    Vector error = x - x_exact;
-    REQUIRE(error[0] == 0);
-    REQUIRE(error[1] == 0);
-    REQUIRE(error[2] == 0);
-    REQUIRE(error[3] == 0);
-    REQUIRE(std::sqrt((error, error)) < 1E-10 * std::sqrt((x, x)));
-  }
-
-  SECTION("Least Squares 2")
-  {
-    LocalRectangularMatrix<double> A{3, 4};
-    A(0, 0) = 1;
-    A(0, 1) = 1;
-    A(0, 2) = 1;
-    A(0, 3) = 1;
-    A(1, 0) = 1;
-    A(1, 1) = 4;
-    A(1, 2) = -1;
-    A(1, 3) = -2;
-    A(2, 0) = 2;
-    A(2, 1) = -0.5;
-    A(2, 2) = -3;
-    A(2, 3) = 0.5;
-
-    LocalRectangularMatrix B = transpose(A) * A;
-
-    Vector<double> x_exact{4};
-    x_exact[0] = 0.4;
-    x_exact[1] = 0.1;
-    x_exact[2] = 0.2;
-    x_exact[3] = 0.3;
-
-    Vector y = B * x_exact;
-
-    Vector<double> x{4};
-    x = 0;
-
-    CG{B, x, y, 1e-12, 10, false};
-    Vector error = A * x - A * x_exact;
-    REQUIRE(std::sqrt((error, error)) < 1E-10 * std::sqrt((x, x)));
-  }
-
-  SECTION("Least Squares 3")
-  {
-    LocalRectangularMatrix<double> A{3, 3};
-    A(0, 0) = 1;
-    A(0, 1) = 1;
-    A(0, 2) = 1;
-    A(1, 0) = 1;
-    A(1, 1) = 4;
-    A(1, 2) = -2;
-    A(2, 0) = 2;
-    A(2, 1) = -3;
-    A(2, 2) = -1;
-
-    Vector<double> b{3};
-    b[0] = 1;
-    b[1] = 1;
-    b[2] = -2. / 3;
-
-    LocalRectangularMatrix B = transpose(A) * A;
-
-    Vector<double> x_exact{3};
-    x_exact[0] = 1. / 3;
-    x_exact[1] = 1. / 3;
-    x_exact[2] = 1. / 3;
-
-    Vector y = transpose(A) * b;
-
-    Vector<double> x{3};
-    x = 0;
-
-    CG{B, x, y, 1e-12, 10, false};
-    Vector error1 = A * x - b;
-    REQUIRE(std::sqrt((error1, error1)) < 1E-10 * std::sqrt((x, x)));
-    Vector error2 = x - x_exact;
-    REQUIRE(std::sqrt((error2, error2)) < 1E-10 * std::sqrt((x, x)));
-  }
 }
diff --git a/tests/test_LeastSquareSolver.cpp b/tests/test_LeastSquareSolver.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6022bbe151ff2c261dc21b015692b9045edcc9ca
--- /dev/null
+++ b/tests/test_LeastSquareSolver.cpp
@@ -0,0 +1,77 @@
+#include <catch2/catch.hpp>
+
+#include <algebra/LeastSquareSolver.hpp>
+#include <algebra/LocalRectangularMatrix.hpp>
+#include <algebra/Vector.hpp>
+
+// clazy:excludeall=non-pod-global-static
+
+TEST_CASE("LeastSquareSolver", "[algebra]")
+{
+  SECTION("Least Squares [under-determined]")
+  {
+    Vector<double> b{3};
+    b[0] = 1;
+    b[1] = 0;
+    b[2] = 0;
+
+    LocalRectangularMatrix<double> A{3, 4};
+    A(0, 0) = 1;
+    A(0, 1) = 1;
+    A(0, 2) = 1;
+    A(0, 3) = 1;
+    A(1, 0) = 1;
+    A(1, 1) = -1;
+    A(1, 2) = 0;
+    A(1, 3) = 0;
+    A(2, 0) = 0;
+    A(2, 1) = 0;
+    A(2, 2) = 1;
+    A(2, 3) = -1;
+
+    Vector<double> x_exact{4};
+
+    x_exact[0] = 0.25;
+    x_exact[1] = 0.25;
+    x_exact[2] = 0.25;
+    x_exact[3] = 0.25;
+
+    Vector<double> x{4};
+    x = 0;
+
+    LeastSquareSolver ls_solver;
+    ls_solver.solveLocalSystem(A, x, b);
+
+    Vector error = x - x_exact;
+    REQUIRE(std::sqrt((error, error)) < 1E-10 * std::sqrt((x, x)));
+  }
+
+  SECTION("Least Squares [over-determined]")
+  {
+    LocalRectangularMatrix<double> A{3, 2};
+    A(0, 0) = 0;
+    A(0, 1) = 1;
+    A(1, 0) = 1;
+    A(1, 1) = 1;
+    A(2, 0) = 2;
+    A(2, 1) = 1;
+
+    Vector<double> x_exact{2};
+    x_exact[0] = -3;
+    x_exact[1] = 5;
+
+    Vector b{3};
+    b[0] = 6;
+    b[1] = 0;
+    b[2] = 0;
+
+    Vector<double> x{2};
+    x = 0;
+
+    LeastSquareSolver ls_solver;
+    ls_solver.solveLocalSystem(A, x, b);
+
+    Vector error = x - x_exact;
+    REQUIRE(std::sqrt((error, error)) < 1E-10 * std::sqrt((x_exact, x_exact)));
+  }
+}