From 0ff0d30d8a756e7ca17502191d5a6cbe269cfd80 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Tue, 2 Jun 2026 14:13:06 -0700
Subject: [PATCH 01/91] #525 adding LagrangeBasis/Serendipity function support
 and unit tests for refactored basis functions

---
 Code/Source/solver/CMakeLists.txt             |   25 +-
 Code/Source/solver/FE/Basis/BasisCache.cpp    |  309 +
 Code/Source/solver/FE/Basis/BasisCache.h      |  456 +
 Code/Source/solver/FE/Basis/BasisExceptions.h |  134 +
 Code/Source/solver/FE/Basis/BasisFactory.cpp  |  160 +
 Code/Source/solver/FE/Basis/BasisFactory.h    |   57 +
 Code/Source/solver/FE/Basis/BasisFunction.cpp |  366 +
 Code/Source/solver/FE/Basis/BasisFunction.h   |  426 +
 Code/Source/solver/FE/Basis/BasisTolerance.h  |   52 +
 Code/Source/solver/FE/Basis/BasisTraits.h     |  218 +
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp | 8323 +++++++++++++++++
 Code/Source/solver/FE/Basis/LagrangeBasis.h   |  175 +
 .../solver/FE/Basis/LagrangeBasisFast.h       | 1378 +++
 .../solver/FE/Basis/LagrangeBasisPyramid.cpp  | 2069 ++++
 .../solver/FE/Basis/LagrangeBasisPyramid.h    |   67 +
 .../solver/FE/Basis/LagrangeBasisSimplex.cpp  | 2457 +++++
 .../solver/FE/Basis/LagrangeBasisSimplex.h    |   78 +
 .../solver/FE/Basis/LagrangeBasisUtility.h    |   25 +
 .../FE/Basis/NodeOrderingConventions.cpp      |  818 ++
 .../solver/FE/Basis/NodeOrderingConventions.h |  538 ++
 .../solver/FE/Basis/PyramidModalBasis.h       |  265 +
 .../solver/FE/Basis/SerendipityBasis.cpp      |  882 ++
 .../Source/solver/FE/Basis/SerendipityBasis.h |   70 +
 Code/Source/solver/FE/Basis/VectorBasis.h     |  255 +
 .../FE/Basis/VectorBasisEvaluationHelpers.cpp |  593 ++
 .../FE/Basis/VectorBasisEvaluationHelpers.h   |  751 ++
 .../FE/Basis/VectorBasisModalPolynomial.h     |   77 +
 Code/Source/solver/FE/Common/Alignment.h      |   23 +
 Code/Source/solver/FE/Common/Types.h          |  532 ++
 .../solver/FE/Math/DenseLinearAlgebra.cpp     |  480 +
 .../solver/FE/Math/DenseLinearAlgebra.h       |  119 +
 .../solver/FE/Math/DenseTransformKernels.h    |   78 +
 Code/Source/solver/FE/Math/ExpressionOps.h    |   99 +
 Code/Source/solver/FE/Math/IntegerMath.h      |   98 +
 Code/Source/solver/FE/Math/MathConstants.h    |  388 +
 Code/Source/solver/FE/Math/Matrix.h           | 1487 +++
 Code/Source/solver/FE/Math/MatrixExpr.h       |  626 ++
 Code/Source/solver/FE/Math/Vector.h           |  831 ++
 Code/Source/solver/FE/Math/VectorExpr.h       |  418 +
 .../solver/FE/Quadrature/QuadratureRule.h     |  237 +
 Code/Source/solver/fs.cpp                     |   71 +-
 Code/Source/solver/nn.cpp                     |  666 +-
 .../FE/Basis/test_BasisCacheFactory.cpp       |  256 +
 .../FE/Basis/test_BasisErrorPaths.cpp         |  203 +
 .../unitTests/FE/Basis/test_BasisHessians.cpp |  314 +
 .../FE/Basis/test_ConstexprBasis.cpp          |  226 +
 .../FE/Basis/test_HigherOrderWedgePyramid.cpp |  173 +
 .../unitTests/FE/Basis/test_LagrangeBasis.cpp | 3028 ++++++
 .../FE/Basis/test_SerendipityTensorModal.cpp  |  116 +
 .../FE/Math/test_DenseLinearAlgebra.cpp       |  265 +
 .../unitTests/FE/Math/test_ExpressionOps.cpp  |  509 +
 .../unitTests/FE/Math/test_MathConstants.cpp  |  341 +
 tests/unitTests/FE/Math/test_Matrix.cpp       |  594 ++
 tests/unitTests/FE/Math/test_MatrixExpr.cpp   |  528 ++
 tests/unitTests/FE/Math/test_Vector.cpp       |  589 ++
 tests/unitTests/FE/Math/test_VectorExpr.cpp   |  409 +
 56 files changed, 34681 insertions(+), 47 deletions(-)
 create mode 100644 Code/Source/solver/FE/Basis/BasisCache.cpp
 create mode 100644 Code/Source/solver/FE/Basis/BasisCache.h
 create mode 100644 Code/Source/solver/FE/Basis/BasisExceptions.h
 create mode 100644 Code/Source/solver/FE/Basis/BasisFactory.cpp
 create mode 100644 Code/Source/solver/FE/Basis/BasisFactory.h
 create mode 100644 Code/Source/solver/FE/Basis/BasisFunction.cpp
 create mode 100644 Code/Source/solver/FE/Basis/BasisFunction.h
 create mode 100644 Code/Source/solver/FE/Basis/BasisTolerance.h
 create mode 100644 Code/Source/solver/FE/Basis/BasisTraits.h
 create mode 100644 Code/Source/solver/FE/Basis/LagrangeBasis.cpp
 create mode 100644 Code/Source/solver/FE/Basis/LagrangeBasis.h
 create mode 100644 Code/Source/solver/FE/Basis/LagrangeBasisFast.h
 create mode 100644 Code/Source/solver/FE/Basis/LagrangeBasisPyramid.cpp
 create mode 100644 Code/Source/solver/FE/Basis/LagrangeBasisPyramid.h
 create mode 100644 Code/Source/solver/FE/Basis/LagrangeBasisSimplex.cpp
 create mode 100644 Code/Source/solver/FE/Basis/LagrangeBasisSimplex.h
 create mode 100644 Code/Source/solver/FE/Basis/LagrangeBasisUtility.h
 create mode 100644 Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
 create mode 100644 Code/Source/solver/FE/Basis/NodeOrderingConventions.h
 create mode 100644 Code/Source/solver/FE/Basis/PyramidModalBasis.h
 create mode 100644 Code/Source/solver/FE/Basis/SerendipityBasis.cpp
 create mode 100644 Code/Source/solver/FE/Basis/SerendipityBasis.h
 create mode 100644 Code/Source/solver/FE/Basis/VectorBasis.h
 create mode 100644 Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.cpp
 create mode 100644 Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.h
 create mode 100644 Code/Source/solver/FE/Basis/VectorBasisModalPolynomial.h
 create mode 100644 Code/Source/solver/FE/Common/Alignment.h
 create mode 100644 Code/Source/solver/FE/Common/Types.h
 create mode 100644 Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
 create mode 100644 Code/Source/solver/FE/Math/DenseLinearAlgebra.h
 create mode 100644 Code/Source/solver/FE/Math/DenseTransformKernels.h
 create mode 100644 Code/Source/solver/FE/Math/ExpressionOps.h
 create mode 100644 Code/Source/solver/FE/Math/IntegerMath.h
 create mode 100644 Code/Source/solver/FE/Math/MathConstants.h
 create mode 100644 Code/Source/solver/FE/Math/Matrix.h
 create mode 100644 Code/Source/solver/FE/Math/MatrixExpr.h
 create mode 100644 Code/Source/solver/FE/Math/Vector.h
 create mode 100644 Code/Source/solver/FE/Math/VectorExpr.h
 create mode 100644 Code/Source/solver/FE/Quadrature/QuadratureRule.h
 create mode 100644 tests/unitTests/FE/Basis/test_BasisCacheFactory.cpp
 create mode 100644 tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
 create mode 100644 tests/unitTests/FE/Basis/test_BasisHessians.cpp
 create mode 100644 tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
 create mode 100644 tests/unitTests/FE/Basis/test_HigherOrderWedgePyramid.cpp
 create mode 100644 tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
 create mode 100644 tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp
 create mode 100644 tests/unitTests/FE/Math/test_DenseLinearAlgebra.cpp
 create mode 100644 tests/unitTests/FE/Math/test_ExpressionOps.cpp
 create mode 100644 tests/unitTests/FE/Math/test_MathConstants.cpp
 create mode 100644 tests/unitTests/FE/Math/test_Matrix.cpp
 create mode 100644 tests/unitTests/FE/Math/test_MatrixExpr.cpp
 create mode 100644 tests/unitTests/FE/Math/test_Vector.cpp
 create mode 100644 tests/unitTests/FE/Math/test_VectorExpr.cpp

diff --git a/Code/Source/solver/CMakeLists.txt b/Code/Source/solver/CMakeLists.txt
index c546c2822..e42391862 100644
--- a/Code/Source/solver/CMakeLists.txt
+++ b/Code/Source/solver/CMakeLists.txt
@@ -23,15 +23,18 @@
 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 
-set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED True)
 
 include_directories(${SV_SOURCE_DIR}/ThirdParty/eigen/include)
+include_directories(${SV_SOURCE_DIR}/ThirdParty/eigen/include/eigen3)
 include_directories(${SV_SOURCE_DIR}/ThirdParty/parmetis_internal/simvascular_parmetis_internal/ParMETISLib)
 include_directories(${SV_SOURCE_DIR}/ThirdParty/tetgen/simvascular_tetgen)
 include_directories(${SV_SOURCE_DIR}/ThirdParty/tinyxml/simvascular_tinyxml)
 include_directories(${MPI_C_INCLUDE_PATH})
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/Core)
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/FE)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/FE/Common)
 
 # Find Trilinos package if requested
@@ -86,7 +89,7 @@ endif()
 # add trilinos flags and defines
 if(USE_TRILINOS)
   ADD_DEFINITIONS(-DWITH_TRILINOS)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++20")
 endif()
 
 # Build with the PETSc linear algebra package.
@@ -245,9 +248,27 @@ file(GLOB SOLVER_FE_COMMON_SRCS CONFIGURE_DEPENDS
   FE/Common/*.h
 )
 
+file(GLOB SOLVER_FE_BASIS_SRCS CONFIGURE_DEPENDS
+  FE/Basis/*.cpp
+  FE/Basis/*.h
+)
+
+file(GLOB SOLVER_FE_MATH_SRCS CONFIGURE_DEPENDS
+  FE/Math/*.cpp
+  FE/Math/*.h
+)
+
+file(GLOB SOLVER_FE_QUADRATURE_SRCS CONFIGURE_DEPENDS
+  FE/Quadrature/*.cpp
+  FE/Quadrature/*.h
+)
+
 list(APPEND CSRCS
   ${SOLVER_CORE_SRCS}
   ${SOLVER_FE_COMMON_SRCS}
+  ${SOLVER_FE_BASIS_SRCS}
+  ${SOLVER_FE_MATH_SRCS}
+  ${SOLVER_FE_QUADRATURE_SRCS}
 )
 
   # Set PETSc interace code.
diff --git a/Code/Source/solver/FE/Basis/BasisCache.cpp b/Code/Source/solver/FE/Basis/BasisCache.cpp
new file mode 100644
index 000000000..6d8a4ede3
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/BasisCache.cpp
@@ -0,0 +1,309 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#include "BasisCache.h"
+#include <utility>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+namespace {
+
+QuadratureCacheKey make_quadrature_cache_key(const quadrature::QuadratureRule& quad) noexcept {
+    const auto fingerprint = quad.point_fingerprint();
+    return QuadratureCacheKey{fingerprint.dimension,
+                              fingerprint.num_points,
+                              fingerprint.points_hash_a,
+                              fingerprint.points_hash_b};
+}
+
+void mix_hash_word(std::uint64_t word,
+                   std::uint64_t& hash_a,
+                   std::uint64_t& hash_b) noexcept {
+    hash_a ^= word + 0x9e3779b97f4a7c15ULL + (hash_a << 6u) + (hash_a >> 2u);
+    hash_b ^= (word + 0xbf58476d1ce4e5b9ULL) + (hash_b << 7u) + (hash_b >> 3u);
+}
+
+std::pair<std::uint64_t, std::uint64_t>
+identity_fingerprint(const std::string& identity) noexcept {
+    std::uint64_t hash_a = 0xa4093822299f31d0ULL;
+    std::uint64_t hash_b = 0x082efa98ec4e6c89ULL;
+    mix_hash_word(static_cast<std::uint64_t>(identity.size()), hash_a, hash_b);
+    for (const char c : identity) {
+        mix_hash_word(static_cast<std::uint64_t>(static_cast<unsigned char>(c)), hash_a, hash_b);
+    }
+    return {hash_a, hash_b};
+}
+
+BasisCacheKey make_basis_cache_key(const BasisFunction& basis,
+                                   const quadrature::QuadratureRule& quad,
+                                   bool gradients,
+                                   bool hessians) {
+    StructuralBasisKey structural_key{
+        basis.basis_type(),
+        basis.element_type(),
+        basis.dimension(),
+        basis.order(),
+        basis.size(),
+        basis.is_vector_valued(),
+        make_quadrature_cache_key(quad),
+        gradients,
+        hessians
+    };
+
+    BasisCacheKey key;
+    const bool uses_basis_identity = !basis.cache_identity_is_structural();
+    if (!uses_basis_identity) {
+        key.value = structural_key;
+        return key;
+    }
+
+    std::vector<std::uint64_t> basis_identity_words;
+    const bool uses_structured_identity = basis.cache_identity_words(basis_identity_words);
+    if (!uses_structured_identity) {
+        basis_identity_words.clear();
+    }
+    const std::string basis_identity =
+        uses_structured_identity ? std::string{} : basis.cache_identity();
+    BasisIdentityFingerprint cached_identity_hash{};
+    const bool has_cached_identity_hash =
+        uses_structured_identity &&
+        basis.cache_identity_fingerprint(cached_identity_hash.hash_a,
+                                         cached_identity_hash.hash_b);
+    const auto identity_hash = uses_structured_identity
+        ? has_cached_identity_hash
+              ? std::pair<std::uint64_t, std::uint64_t>{
+                    cached_identity_hash.hash_a,
+                    cached_identity_hash.hash_b}
+              : [&basis_identity_words] {
+                    const auto fingerprint =
+                        compute_basis_identity_fingerprint(basis_identity_words);
+                    return std::pair<std::uint64_t, std::uint64_t>{
+                        fingerprint.hash_a,
+                        fingerprint.hash_b};
+                }()
+        : identity_fingerprint(basis_identity);
+    key.value = ParameterizedBasisKey{
+        structural_key,
+        uses_structured_identity,
+        identity_hash.first,
+        identity_hash.second,
+        std::move(basis_identity_words),
+        basis_identity
+    };
+    return key;
+}
+
+} // namespace
+
+BasisCache& BasisCache::instance() {
+    static BasisCache cache;
+    return cache;
+}
+
+const BasisCacheEntry& BasisCache::get_or_compute(
+    const BasisFunction& basis,
+    const quadrature::QuadratureRule& quad,
+    bool gradients,
+    bool hessians) {
+    return *get_or_compute_shared(basis, quad, gradients, hessians);
+}
+
+std::shared_ptr<const BasisCacheEntry> BasisCache::get_or_compute_shared(
+    const BasisFunction& basis,
+    const quadrature::QuadratureRule& quad,
+    bool gradients,
+    bool hessians) {
+    const BasisCacheKey key = make_basis_cache_key(basis, quad, gradients, hessians);
+
+    // Warm path: shared (reader) lock allows concurrent cache hits.
+    {
+        std::shared_lock<std::shared_mutex> read_lock(mutex_);
+        auto it = slots_.find(key);
+        if (it != slots_.end() && it->second.entry) {
+            return it->second.entry;
+        }
+    }
+
+    std::shared_ptr<InFlightComputation> in_flight;
+    bool owner = false;
+    {
+        std::unique_lock<std::shared_mutex> write_lock(mutex_);
+        auto& slot = slots_[key];
+        if (slot.entry) {
+            return slot.entry;
+        }
+
+        if (!slot.pending) {
+            in_flight = std::make_shared<InFlightComputation>();
+            slot.pending = in_flight;
+            owner = true;
+        } else {
+            in_flight = slot.pending;
+        }
+    }
+
+    if (!owner) {
+        std::unique_lock<std::mutex> wait_lock(in_flight->mutex);
+        in_flight->ready_cv.wait(wait_lock, [&in_flight] { return in_flight->ready; });
+        if (in_flight->exception) {
+            std::rethrow_exception(in_flight->exception);
+        }
+        return in_flight->entry;
+    }
+
+    try {
+        auto entry = std::make_shared<BasisCacheEntry>(compute(basis, quad, gradients, hessians));
+        {
+            std::unique_lock<std::shared_mutex> write_lock(mutex_);
+            auto slot_it = slots_.find(key);
+            if (slot_it == slots_.end()) {
+                slot_it = slots_.emplace(key, CacheSlot{}).first;
+            }
+            auto& slot = slot_it->second;
+            if (slot.entry) {
+                entry = slot.entry;
+            } else {
+                slot.entry = entry;
+            }
+            if (slot.pending == in_flight) {
+                slot.pending.reset();
+            }
+        }
+        {
+            std::lock_guard<std::mutex> ready_lock(in_flight->mutex);
+            in_flight->entry = entry;
+            in_flight->ready = true;
+        }
+        in_flight->ready_cv.notify_all();
+        return entry;
+    } catch (...) {
+        {
+            std::lock_guard<std::mutex> ready_lock(in_flight->mutex);
+            in_flight->exception = std::current_exception();
+            in_flight->ready = true;
+        }
+        {
+            std::unique_lock<std::shared_mutex> write_lock(mutex_);
+            auto slot_it = slots_.find(key);
+            if (slot_it != slots_.end() && slot_it->second.pending == in_flight) {
+                slot_it->second.pending.reset();
+                if (!slot_it->second.entry) {
+                    slots_.erase(slot_it);
+                }
+            }
+        }
+        in_flight->ready_cv.notify_all();
+        throw;
+    }
+}
+
+const BasisCacheEntry& BasisCache::prewarm(
+    const BasisFunction& basis,
+    const quadrature::QuadratureRule& quad,
+    bool gradients,
+    bool hessians) {
+    return get_or_compute(basis, quad, gradients, hessians);
+}
+
+BasisCacheHandle BasisCache::prewarm_handle(
+    const BasisFunction& basis,
+    const quadrature::QuadratureRule& quad,
+    bool gradients,
+    bool hessians) {
+    return BasisCacheHandle(get_or_compute_shared(basis, quad, gradients, hessians));
+}
+
+BasisCacheEntry BasisCache::compute_uncached(
+    const BasisFunction& basis,
+    const quadrature::QuadratureRule& quad,
+    bool gradients,
+    bool hessians) const {
+    return compute(basis, quad, gradients, hessians);
+}
+
+void BasisCache::clear() {
+    std::unique_lock<std::shared_mutex> lock(mutex_);
+    for (auto it = slots_.begin(); it != slots_.end();) {
+        if (it->second.pending) {
+            it->second.entry.reset();
+            ++it;
+        } else {
+            it = slots_.erase(it);
+        }
+    }
+}
+
+std::size_t BasisCache::size() const {
+    std::shared_lock<std::shared_mutex> lock(mutex_);
+    std::size_t completed = 0;
+    for (const auto& [key, slot] : slots_) {
+        (void)key;
+        if (slot.entry) {
+            ++completed;
+        }
+    }
+    return completed;
+}
+
+BasisCacheEntry BasisCache::compute(const BasisFunction& basis,
+                                    const quadrature::QuadratureRule& quad,
+                                    bool gradients,
+                                    bool hessians) const {
+    BasisCacheEntry entry;
+    const auto& points = quad.points();
+    entry.num_qpts = points.size();
+    entry.num_dofs = basis.size();
+
+    const bool vector_basis = basis.is_vector_valued();
+    if (!vector_basis) {
+        entry.scalar_values.assign(entry.num_dofs * entry.num_qpts, Real(0));
+        if (gradients) {
+            entry.gradients.assign(entry.num_dofs * 3u * entry.num_qpts, Real(0));
+        }
+        if (hessians) {
+            entry.hessians.assign(entry.num_dofs * 9u * entry.num_qpts, Real(0));
+        }
+    } else {
+        entry.vector_values_xyz.assign(entry.num_dofs * 3u * entry.num_qpts, Real(0));
+        if (gradients && basis.supports_vector_jacobians()) {
+            entry.vector_jacobians.assign(entry.num_dofs * 9u * entry.num_qpts, Real(0));
+        }
+        if (gradients && basis.supports_curl()) {
+            entry.vector_curls_xyz.assign(entry.num_dofs * 3u * entry.num_qpts, Real(0));
+        }
+        if (gradients && basis.supports_divergence()) {
+            entry.vector_divergence.assign(entry.num_dofs * entry.num_qpts, Real(0));
+        }
+    }
+
+    if (vector_basis) {
+        if (entry.num_dofs > 0 && entry.num_qpts > 0) {
+            basis.evaluate_vector_at_quadrature_points(
+                points,
+                entry.vector_values_xyz.data(),
+                entry.vector_jacobians.empty() ? nullptr : entry.vector_jacobians.data(),
+                entry.vector_curls_xyz.empty() ? nullptr : entry.vector_curls_xyz.data(),
+                entry.vector_divergence.empty() ? nullptr : entry.vector_divergence.data());
+        }
+        return entry;
+    }
+
+    if (entry.num_dofs > 0 && entry.num_qpts > 0) {
+        basis.fill_scalar_cache_entry(points,
+                                      entry.num_qpts,
+                                      entry.scalar_values.data(),
+                                      gradients ? entry.gradients.data() : nullptr,
+                                      hessians ? entry.hessians.data() : nullptr);
+    }
+
+    return entry;
+}
+} // namespace basis
+} // namespace FE
+} // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/BasisCache.h b/Code/Source/solver/FE/Basis/BasisCache.h
new file mode 100644
index 000000000..a84c0e87a
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/BasisCache.h
@@ -0,0 +1,456 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_BASIS_BASISCACHE_H
+#define SVMP_FE_BASIS_BASISCACHE_H
+
+/**
+ * @file BasisCache.h
+ * @brief Cache for basis evaluations at quadrature points
+ */
+
+#include "BasisFunction.h"
+#include "Quadrature/QuadratureRule.h"
+#include <cstddef>
+#include <condition_variable>
+#include <exception>
+#include <functional>
+#include <memory>
+#include <mutex>
+#include <cstdint>
+#include <shared_mutex>
+#include <span>
+#include <string>
+#include <type_traits>
+#include <unordered_map>
+#include <utility>
+#include <variant>
+#include <vector>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+struct QuadratureCacheKey {
+    int dimension{0};
+    std::size_t num_points{0};
+    // Quadrature coordinates are intentionally fingerprinted from their exact
+    // Real bit patterns. Values such as -0.0 and +0.0 therefore produce
+    // distinct cache keys unless a future API explicitly normalizes them. The
+    // key intentionally ignores weights and rule class because basis values only
+    // depend on reference coordinates; bit-identical point sets share entries.
+    std::uint64_t points_hash_a{0};
+    std::uint64_t points_hash_b{0};
+
+    bool operator==(const QuadratureCacheKey& other) const noexcept {
+        return dimension == other.dimension &&
+               num_points == other.num_points &&
+               points_hash_a == other.points_hash_a &&
+               points_hash_b == other.points_hash_b;
+    }
+};
+
+struct StructuralBasisKey {
+    BasisType basis_type{BasisType::Custom};
+    ElementType element_type{ElementType::Unknown};
+    int dimension{0};
+    int order{0};
+    std::size_t num_dofs{0};
+    bool vector_valued{false};
+    QuadratureCacheKey quadrature;
+    bool with_gradients{false};
+    bool with_hessians{false};
+
+    bool operator==(const StructuralBasisKey& other) const noexcept {
+        return basis_type == other.basis_type &&
+               element_type == other.element_type &&
+               dimension == other.dimension &&
+               order == other.order &&
+               num_dofs == other.num_dofs &&
+               vector_valued == other.vector_valued &&
+               quadrature == other.quadrature &&
+               with_gradients == other.with_gradients &&
+               with_hessians == other.with_hessians;
+    }
+};
+
+struct ParameterizedBasisKey {
+    StructuralBasisKey structural;
+    bool uses_structured_identity{false};
+    std::uint64_t identity_hash_a{0};
+    std::uint64_t identity_hash_b{0};
+    std::vector<std::uint64_t> basis_identity_words;
+    std::string basis_identity;
+
+    bool operator==(const ParameterizedBasisKey& other) const noexcept {
+        return structural == other.structural &&
+               uses_structured_identity == other.uses_structured_identity &&
+               identity_hash_a == other.identity_hash_a &&
+               identity_hash_b == other.identity_hash_b &&
+               basis_identity_words == other.basis_identity_words &&
+               basis_identity == other.basis_identity;
+    }
+};
+
+struct BasisCacheKey {
+    std::variant<StructuralBasisKey, ParameterizedBasisKey> value;
+
+    bool operator==(const BasisCacheKey& other) const noexcept {
+        return value == other.value;
+    }
+};
+
+struct BasisCacheKeyHash {
+    std::size_t operator()(const BasisCacheKey& key) const noexcept {
+        std::size_t seed = 0;
+        auto combine = [&seed](std::size_t value) noexcept {
+            seed ^= value + 0x9e3779b97f4a7c15ULL + (seed << 6u) + (seed >> 2u);
+        };
+
+        auto hash_structural = [&](const StructuralBasisKey& structural) noexcept {
+            combine(std::hash<int>()(structural.quadrature.dimension));
+            combine(std::hash<std::size_t>()(structural.quadrature.num_points));
+            combine(std::hash<std::uint64_t>()(structural.quadrature.points_hash_a));
+            combine(std::hash<std::uint64_t>()(structural.quadrature.points_hash_b));
+            combine(std::hash<int>()(static_cast<int>(structural.basis_type)));
+            combine(std::hash<int>()(static_cast<int>(structural.element_type)));
+            combine(std::hash<int>()(structural.dimension));
+            combine(std::hash<int>()(structural.order));
+            combine(std::hash<std::size_t>()(structural.num_dofs));
+            unsigned flags = 0u;
+            flags |= structural.vector_valued ? 1u : 0u;
+            flags |= structural.with_gradients ? 2u : 0u;
+            flags |= structural.with_hessians ? 4u : 0u;
+            combine(std::hash<unsigned>()(flags));
+        };
+
+        std::visit([&](const auto& active_key) {
+            using ActiveKey = std::decay_t<decltype(active_key)>;
+            if constexpr (std::is_same_v<ActiveKey, StructuralBasisKey>) {
+                combine(0x5354525543544b45ULL);
+                hash_structural(active_key);
+            } else {
+                combine(0x504152414d4b4559ULL);
+                hash_structural(active_key.structural);
+                combine(active_key.uses_structured_identity ? 1u : 0u);
+                combine(std::hash<std::uint64_t>()(active_key.identity_hash_a));
+                combine(std::hash<std::uint64_t>()(active_key.identity_hash_b));
+            }
+        }, key.value);
+        return seed;
+    }
+};
+
+struct BasisCacheEntry {
+    std::size_t num_qpts{0};
+    std::size_t num_dofs{0};
+    // Scalar basis values in dof-major SoA layout: [dof * num_qpts + qp].
+    std::vector<Real> scalar_values;
+    // Scalar reference gradients in dof/component/qpt SoA layout:
+    // [(dof * 3 + component) * num_qpts + qp].
+    std::vector<Real> gradients;
+    // Scalar reference Hessians in dof/component/qpt SoA layout:
+    // [(dof * 9 + row * 3 + col) * num_qpts + qp].
+    std::vector<Real> hessians;
+
+    // Vector basis values in dof/component/qpt SoA layout:
+    // [(dof * 3 + component) * num_qpts + qp].
+    std::vector<Real> vector_values_xyz;
+    // Vector basis reference Jacobians in dof/component/derivative/qpt layout:
+    // [(dof * 9 + component * 3 + derivative) * num_qpts + qp].
+    std::vector<Real> vector_jacobians;
+    // Vector basis curls in dof/component/qpt SoA layout.
+    std::vector<Real> vector_curls_xyz;
+    // Vector basis divergences in dof/qpt SoA layout.
+    std::vector<Real> vector_divergence;
+
+    // The object-returning accessors below are convenience helpers for tests,
+    // diagnostics, and occasional scalar use. Hot loops should prefer the SoA
+    // span accessors so they do not reconstruct Gradient, Hessian, or matrix
+    // objects per DOF and quadrature point.
+
+    [[nodiscard]] Real scalarValue(std::size_t dof, std::size_t qp) const noexcept {
+        return scalar_values[dof * num_qpts + qp];
+    }
+
+    [[nodiscard]] std::span<const Real> scalarValuesForDof(std::size_t dof) const noexcept {
+        if (num_qpts == 0) return {};
+        return std::span<const Real>(scalar_values.data() + dof * num_qpts, num_qpts);
+    }
+
+    [[nodiscard]] Real gradientValue(std::size_t dof,
+                                     std::size_t component,
+                                     std::size_t qp) const noexcept {
+        return gradients[(dof * 3u + component) * num_qpts + qp];
+    }
+
+    [[nodiscard]] Gradient gradientVector(std::size_t dof, std::size_t qp) const noexcept {
+        Gradient out{};
+        for (std::size_t component = 0; component < 3u; ++component) {
+            out[component] = gradientValue(dof, component, qp);
+        }
+        return out;
+    }
+
+    [[nodiscard]] std::span<const Real> gradientsForDofComponent(std::size_t dof,
+                                                                  std::size_t component) const noexcept {
+        if (num_qpts == 0) return {};
+        return std::span<const Real>(gradients.data() + (dof * 3u + component) * num_qpts, num_qpts);
+    }
+
+    [[nodiscard]] std::span<const Real> gradientsForDof(std::size_t dof) const noexcept {
+        if (num_qpts == 0) return {};
+        return std::span<const Real>(gradients.data() + dof * 3u * num_qpts, 3u * num_qpts);
+    }
+
+    [[nodiscard]] Real hessianValue(std::size_t dof,
+                                    std::size_t row,
+                                    std::size_t col,
+                                    std::size_t qp) const noexcept {
+        return hessians[(dof * 9u + row * 3u + col) * num_qpts + qp];
+    }
+
+    [[nodiscard]] Hessian hessianMatrix(std::size_t dof, std::size_t qp) const noexcept {
+        Hessian out{};
+        for (std::size_t row = 0; row < 3u; ++row) {
+            for (std::size_t col = 0; col < 3u; ++col) {
+                out(row, col) = hessianValue(dof, row, col, qp);
+            }
+        }
+        return out;
+    }
+
+    [[nodiscard]] std::span<const Real> hessiansForDofComponent(std::size_t dof,
+                                                                 std::size_t row,
+                                                                 std::size_t col) const noexcept {
+        if (num_qpts == 0) return {};
+        return std::span<const Real>(hessians.data() + (dof * 9u + row * 3u + col) * num_qpts, num_qpts);
+    }
+
+    [[nodiscard]] std::span<const Real> hessiansForDof(std::size_t dof) const noexcept {
+        if (num_qpts == 0) return {};
+        return std::span<const Real>(hessians.data() + dof * 9u * num_qpts, 9u * num_qpts);
+    }
+
+    [[nodiscard]] Real vectorValue(std::size_t dof,
+                                   std::size_t component,
+                                   std::size_t qp) const noexcept {
+        return vector_values_xyz[(dof * 3u + component) * num_qpts + qp];
+    }
+
+    [[nodiscard]] math::Vector<Real, 3> vectorValue(std::size_t dof,
+                                                     std::size_t qp) const noexcept {
+        math::Vector<Real, 3> out{};
+        for (std::size_t component = 0; component < 3u; ++component) {
+            out[component] = vectorValue(dof, component, qp);
+        }
+        return out;
+    }
+
+    [[nodiscard]] std::span<const Real> vectorValuesForDofComponent(std::size_t dof,
+                                                                     std::size_t component) const noexcept {
+        if (num_qpts == 0) return {};
+        return std::span<const Real>(vector_values_xyz.data() + (dof * 3u + component) * num_qpts, num_qpts);
+    }
+
+    [[nodiscard]] std::span<const Real> vectorValuesForDof(std::size_t dof) const noexcept {
+        if (num_qpts == 0 || vector_values_xyz.empty()) return {};
+        return std::span<const Real>(vector_values_xyz.data() + dof * 3u * num_qpts, 3u * num_qpts);
+    }
+
+    [[nodiscard]] Real vectorJacobianValue(std::size_t dof,
+                                           std::size_t component,
+                                           std::size_t derivative,
+                                           std::size_t qp) const noexcept {
+        return vector_jacobians[(dof * 9u + component * 3u + derivative) * num_qpts + qp];
+    }
+
+    [[nodiscard]] VectorJacobian vectorJacobianMatrix(std::size_t dof,
+                                                       std::size_t qp) const noexcept {
+        VectorJacobian out{};
+        for (std::size_t component = 0; component < 3u; ++component) {
+            for (std::size_t derivative = 0; derivative < 3u; ++derivative) {
+                out(component, derivative) =
+                    vectorJacobianValue(dof, component, derivative, qp);
+            }
+        }
+        return out;
+    }
+
+    [[nodiscard]] std::span<const Real> vectorJacobiansForDofComponentDerivative(
+        std::size_t dof,
+        std::size_t component,
+        std::size_t derivative) const noexcept {
+        if (num_qpts == 0 || vector_jacobians.empty()) return {};
+        return std::span<const Real>(
+            vector_jacobians.data() + (dof * 9u + component * 3u + derivative) * num_qpts,
+            num_qpts);
+    }
+
+    [[nodiscard]] std::span<const Real> vectorJacobiansForDof(std::size_t dof) const noexcept {
+        if (num_qpts == 0 || vector_jacobians.empty()) return {};
+        return std::span<const Real>(vector_jacobians.data() + dof * 9u * num_qpts, 9u * num_qpts);
+    }
+
+    [[nodiscard]] Real vectorCurlValue(std::size_t dof,
+                                       std::size_t component,
+                                       std::size_t qp) const noexcept {
+        return vector_curls_xyz[(dof * 3u + component) * num_qpts + qp];
+    }
+
+    [[nodiscard]] math::Vector<Real, 3> vectorCurl(std::size_t dof,
+                                                    std::size_t qp) const noexcept {
+        math::Vector<Real, 3> out{};
+        for (std::size_t component = 0; component < 3u; ++component) {
+            out[component] = vectorCurlValue(dof, component, qp);
+        }
+        return out;
+    }
+
+    [[nodiscard]] std::span<const Real> vectorCurlsForDofComponent(std::size_t dof,
+                                                                    std::size_t component) const noexcept {
+        if (num_qpts == 0 || vector_curls_xyz.empty()) return {};
+        return std::span<const Real>(vector_curls_xyz.data() + (dof * 3u + component) * num_qpts, num_qpts);
+    }
+
+    [[nodiscard]] std::span<const Real> vectorCurlsForDof(std::size_t dof) const noexcept {
+        if (num_qpts == 0 || vector_curls_xyz.empty()) return {};
+        return std::span<const Real>(vector_curls_xyz.data() + dof * 3u * num_qpts, 3u * num_qpts);
+    }
+
+    [[nodiscard]] Real vectorDivergenceValue(std::size_t dof,
+                                             std::size_t qp) const noexcept {
+        return vector_divergence[dof * num_qpts + qp];
+    }
+
+    [[nodiscard]] std::span<const Real> vectorDivergenceForDof(std::size_t dof) const noexcept {
+        if (num_qpts == 0 || vector_divergence.empty()) return {};
+        return std::span<const Real>(vector_divergence.data() + dof * num_qpts, num_qpts);
+    }
+};
+
+class BasisCacheHandle {
+public:
+    BasisCacheHandle() = default;
+
+    [[nodiscard]] const BasisCacheEntry& entry() const {
+        BASIS_CHECK_CONFIG(entry_ != nullptr,
+                           "BasisCacheHandle: attempted to access an empty handle");
+        return *entry_;
+    }
+
+    [[nodiscard]] bool valid() const noexcept { return entry_ != nullptr; }
+    explicit operator bool() const noexcept { return valid(); }
+
+private:
+    friend class BasisCache;
+
+    explicit BasisCacheHandle(std::shared_ptr<const BasisCacheEntry> entry)
+        : entry_(std::move(entry)) {}
+
+    std::shared_ptr<const BasisCacheEntry> entry_;
+};
+
+class BasisCache {
+public:
+    static BasisCache& instance();
+
+    const BasisCacheEntry& get_or_compute(
+        const BasisFunction& basis,
+        const quadrature::QuadratureRule& quad,
+        bool gradients = true,
+        bool hessians = false);
+
+    /**
+     * @brief Compute an entry without consulting, publishing to, or waiting on
+     * the shared cache.
+     */
+    BasisCacheEntry compute_uncached(
+        const BasisFunction& basis,
+        const quadrature::QuadratureRule& quad,
+        bool gradients = true,
+        bool hessians = false) const;
+
+    /**
+     * @brief Eagerly populate the cache for the given (basis, quadrature) key
+     *
+     * Pays the compute cost up front so that subsequent get_or_compute calls
+     * for the same key hit the warm-cache path immediately. Equivalent to
+     * calling get_or_compute and discarding the return value.
+     *
+     * Returns the inserted (or pre-existing) entry for convenience.
+     */
+    const BasisCacheEntry& prewarm(
+        const BasisFunction& basis,
+        const quadrature::QuadratureRule& quad,
+        bool gradients = true,
+        bool hessians = false);
+
+    /**
+     * @brief Eagerly populate the cache and return a hot-loop handle.
+     *
+     * The returned handle owns a shared reference to the completed entry. Access
+     * through BasisCacheHandle::entry() performs no key construction, hashing,
+     * map lookup, or cache mutex acquisition. Calling clear() removes the entry
+     * from the global lookup map but does not invalidate existing handles.
+     */
+    BasisCacheHandle prewarm_handle(
+        const BasisFunction& basis,
+        const quadrature::QuadratureRule& quad,
+        bool gradients = true,
+        bool hessians = false);
+
+    /**
+     * @brief Remove completed cache entries.
+     *
+     * This is a soft clear: computations that were already in flight before
+     * clear() was called are allowed to publish their completed entry afterward.
+     * This preserves the returned-reference lifetime contract for concurrent
+     * get_or_compute() callers while still dropping all entries that had already
+     * completed at the time of the call.
+     */
+    void clear();
+    std::size_t size() const;
+
+private:
+    struct InFlightComputation {
+        std::mutex mutex;
+        std::condition_variable ready_cv;
+        bool ready{false};
+        std::shared_ptr<BasisCacheEntry> entry;
+        std::exception_ptr exception;
+    };
+
+    struct CacheSlot {
+        std::shared_ptr<BasisCacheEntry> entry;
+        std::shared_ptr<InFlightComputation> pending;
+    };
+
+    BasisCache() = default;
+    BasisCache(const BasisCache&) = delete;
+    BasisCache& operator=(const BasisCache&) = delete;
+
+    BasisCacheEntry compute(const BasisFunction& basis,
+                            const quadrature::QuadratureRule& quad,
+                            bool gradients,
+                            bool hessians) const;
+
+    std::shared_ptr<const BasisCacheEntry> get_or_compute_shared(
+        const BasisFunction& basis,
+        const quadrature::QuadratureRule& quad,
+        bool gradients,
+        bool hessians);
+
+    mutable std::shared_mutex mutex_;
+    std::unordered_map<BasisCacheKey, CacheSlot, BasisCacheKeyHash> slots_;
+};
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_BASISCACHE_H
diff --git a/Code/Source/solver/FE/Basis/BasisExceptions.h b/Code/Source/solver/FE/Basis/BasisExceptions.h
new file mode 100644
index 000000000..8ee92a3dd
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/BasisExceptions.h
@@ -0,0 +1,134 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_BASIS_BASISEXCEPTIONS_H
+#define SVMP_FE_BASIS_BASISEXCEPTIONS_H
+
+#include "FEException.h"
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+/**
+ * @brief Base exception type for errors originating in the Basis module
+ */
+class BasisException : public FEException {
+public:
+    BasisException(const std::string& message,
+                   const char* file = "",
+                   int line = 0,
+                   const char* function = "",
+                   StatusCode status = StatusCode::Unknown)
+        : FEException(message, status, file, line, function) {}
+};
+
+/**
+ * @brief Invalid Basis request or configuration
+ */
+class BasisConfigurationException : public BasisException {
+public:
+    BasisConfigurationException(const std::string& message,
+                                const char* file = "",
+                                int line = 0,
+                                const char* function = "")
+        : BasisException(message, file, line, function, StatusCode::InvalidArgument) {}
+};
+
+/**
+ * @brief Requested element topology is incompatible with the basis family
+ */
+class BasisElementCompatibilityException : public BasisException {
+public:
+    BasisElementCompatibilityException(const std::string& message,
+                                       const char* file = "",
+                                       int line = 0,
+                                       const char* function = "")
+        : BasisException(message, file, line, function, StatusCode::InvalidArgument) {}
+};
+
+/**
+ * @brief Basis evaluation request cannot be satisfied
+ */
+class BasisEvaluationException : public BasisException {
+public:
+    BasisEvaluationException(const std::string& message,
+                             const char* file = "",
+                             int line = 0,
+                             const char* function = "")
+        : BasisException(message, file, line, function, StatusCode::InvalidArgument) {}
+};
+
+/**
+ * @brief Public-to-canonical node ordering or coordinate lookup failure
+ */
+class BasisNodeOrderingException : public BasisException {
+public:
+    BasisNodeOrderingException(const std::string& message,
+                               const char* file = "",
+                               int line = 0,
+                               const char* function = "")
+        : BasisException(message, file, line, function, StatusCode::InvalidArgument) {}
+};
+
+/**
+ * @brief Internal basis construction or transform setup failure
+ */
+class BasisConstructionException : public BasisException {
+public:
+    BasisConstructionException(const std::string& message,
+                               const char* file = "",
+                               int line = 0,
+                               const char* function = "")
+        : BasisException(message, file, line, function, StatusCode::InternalError) {}
+};
+
+#define BASIS_CHECK_CONFIG(condition, message)                                                 \
+    do {                                                                                       \
+        if (!(condition)) {                                                                    \
+            throw ::svmp::FE::basis::BasisConfigurationException((message),                    \
+                                                                  __FILE__, __LINE__, __func__); \
+        }                                                                                      \
+    } while (false)
+
+#define BASIS_CHECK_COMPAT(condition, message)                                                 \
+    do {                                                                                       \
+        if (!(condition)) {                                                                    \
+            throw ::svmp::FE::basis::BasisElementCompatibilityException((message),             \
+                                                                         __FILE__, __LINE__, __func__); \
+        }                                                                                      \
+    } while (false)
+
+#define BASIS_CHECK_EVAL(condition, message)                                                   \
+    do {                                                                                       \
+        if (!(condition)) {                                                                    \
+            throw ::svmp::FE::basis::BasisEvaluationException((message),                       \
+                                                               __FILE__, __LINE__, __func__);  \
+        }                                                                                      \
+    } while (false)
+
+#define BASIS_CHECK_NODE_ORDER(condition, message)                                             \
+    do {                                                                                       \
+        if (!(condition)) {                                                                    \
+            throw ::svmp::FE::basis::BasisNodeOrderingException((message),                     \
+                                                                 __FILE__, __LINE__, __func__); \
+        }                                                                                      \
+    } while (false)
+
+#define BASIS_CHECK_CONSTRUCTION(condition, message)                                           \
+    do {                                                                                       \
+        if (!(condition)) {                                                                    \
+            throw ::svmp::FE::basis::BasisConstructionException((message),                     \
+                                                                 __FILE__, __LINE__, __func__); \
+        }                                                                                      \
+    } while (false)
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_BASISEXCEPTIONS_H
diff --git a/Code/Source/solver/FE/Basis/BasisFactory.cpp b/Code/Source/solver/FE/Basis/BasisFactory.cpp
new file mode 100644
index 000000000..dddbd4c5c
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/BasisFactory.cpp
@@ -0,0 +1,160 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#include "BasisFactory.h"
+
+#include "LagrangeBasis.h"
+#include "SerendipityBasis.h"
+
+#include <mutex>
+#include <unordered_map>
+#include <utility>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+namespace {
+
+using CustomRegistryMap =
+    std::unordered_map<std::string, basis_factory::CustomFactory>;
+
+CustomRegistryMap& custom_registry() {
+    static CustomRegistryMap registry;
+    return registry;
+}
+
+std::mutex& custom_registry_mutex() {
+    static std::mutex mutex;
+    return mutex;
+}
+
+int require_basis_order(const BasisRequest& req,
+                        const char* missing_message,
+                        const char* negative_message) {
+    if (!req.order.has_value()) {
+        throw BasisConfigurationException(missing_message,
+                                          __FILE__, __LINE__, __func__);
+    }
+    if (*req.order < 0) {
+        throw BasisConfigurationException(negative_message,
+                                          __FILE__, __LINE__, __func__);
+    }
+    return *req.order;
+}
+
+void require_scalar_c0_request(const BasisRequest& req) {
+    if (req.field_type != FieldType::Scalar) {
+        throw BasisConfigurationException(
+            "BasisFactory: Lagrange/Serendipity bases currently support scalar fields only",
+            __FILE__, __LINE__, __func__);
+    }
+    if (req.continuity != Continuity::C0) {
+        throw BasisConfigurationException(
+            "BasisFactory: migrated Lagrange/Serendipity scope supports C0 continuity only",
+            __FILE__, __LINE__, __func__);
+    }
+}
+
+std::shared_ptr<BasisFunction> create_lagrange(const BasisRequest& req) {
+    require_scalar_c0_request(req);
+    const int order = require_basis_order(
+        req,
+        "BasisFactory: Lagrange creation requires an explicit order",
+        "BasisFactory: Lagrange requires non-negative order");
+    return std::make_shared<LagrangeBasis>(req.element_type, order);
+}
+
+std::shared_ptr<BasisFunction> create_serendipity(const BasisRequest& req) {
+    require_scalar_c0_request(req);
+    const int order = require_basis_order(
+        req,
+        "BasisFactory: Serendipity creation requires an explicit order",
+        "BasisFactory: Serendipity requires non-negative order");
+    return std::make_shared<SerendipityBasis>(req.element_type, order);
+}
+
+std::shared_ptr<BasisFunction> create_custom(const BasisRequest& req) {
+    if (req.custom_id.empty()) {
+        throw BasisConfigurationException(
+            "BasisFactory: custom basis requests require custom_id",
+            __FILE__, __LINE__, __func__);
+    }
+
+    basis_factory::CustomFactory factory;
+    {
+        std::lock_guard<std::mutex> lock(custom_registry_mutex());
+        const auto it = custom_registry().find(req.custom_id);
+        if (it == custom_registry().end()) {
+            throw BasisConfigurationException(
+                "BasisFactory: no custom basis factory registered for id '" +
+                    req.custom_id + "'",
+                __FILE__, __LINE__, __func__);
+        }
+        factory = it->second;
+    }
+
+    auto basis = factory(req);
+    if (!basis) {
+        throw BasisConstructionException(
+            "BasisFactory: custom factory returned null basis for id '" +
+                req.custom_id + "'",
+            __FILE__, __LINE__, __func__);
+    }
+    return basis;
+}
+
+} // namespace
+
+namespace basis_factory {
+
+std::shared_ptr<BasisFunction> create(const BasisRequest& req) {
+    switch (req.basis_type) {
+        case BasisType::Lagrange:
+            return create_lagrange(req);
+        case BasisType::Serendipity:
+            return create_serendipity(req);
+        case BasisType::Custom:
+            return create_custom(req);
+        default:
+            throw BasisConfigurationException(
+                "BasisFactory: requested basis family is outside the migrated Lagrange/Serendipity scope",
+                __FILE__, __LINE__, __func__);
+    }
+}
+
+void register_custom(std::string custom_id, CustomFactory factory) {
+    if (custom_id.empty()) {
+        throw BasisConfigurationException(
+            "BasisFactory: custom factory id must not be empty",
+            __FILE__, __LINE__, __func__);
+    }
+    if (!factory) {
+        throw BasisConfigurationException(
+            "BasisFactory: custom factory must be callable",
+            __FILE__, __LINE__, __func__);
+    }
+
+    std::lock_guard<std::mutex> lock(custom_registry_mutex());
+    custom_registry()[std::move(custom_id)] = std::move(factory);
+}
+
+void unregister_custom(const std::string& custom_id) {
+    std::lock_guard<std::mutex> lock(custom_registry_mutex());
+    custom_registry().erase(custom_id);
+}
+
+void clear_custom_registry_for_tests() {
+    std::lock_guard<std::mutex> lock(custom_registry_mutex());
+    custom_registry().clear();
+}
+
+} // namespace basis_factory
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/BasisFactory.h b/Code/Source/solver/FE/Basis/BasisFactory.h
new file mode 100644
index 000000000..cedf1ba6d
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/BasisFactory.h
@@ -0,0 +1,57 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_BASIS_BASISFACTORY_H
+#define SVMP_FE_BASIS_BASISFACTORY_H
+
+/**
+ * @file BasisFactory.h
+ * @brief Runtime creation of basis families
+ */
+
+#include "BasisFunction.h"
+#include <functional>
+#include <memory>
+#include <optional>
+#include <string>
+#include <vector>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+struct BasisRequest {
+    ElementType element_type;
+    BasisType basis_type;
+    std::optional<int> order{};
+    Continuity continuity{Continuity::C0};
+    FieldType field_type{FieldType::Scalar};
+    std::vector<Real> knot_vector{};
+    std::vector<Real> weights{};
+    std::vector<int> axis_orders{};
+    std::vector<std::vector<Real>> axis_knot_vectors{};
+    std::vector<std::vector<Real>> axis_weights{};
+    std::vector<int> tensor_extents{};
+    std::string custom_id{};
+};
+
+namespace basis_factory {
+
+using CustomFactory = std::function<std::shared_ptr<BasisFunction>(const BasisRequest&)>;
+
+[[nodiscard]] std::shared_ptr<BasisFunction> create(const BasisRequest& req);
+void register_custom(std::string custom_id, CustomFactory factory);
+void unregister_custom(const std::string& custom_id);
+void clear_custom_registry_for_tests();
+
+} // namespace basis_factory
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_BASISFACTORY_H
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.cpp b/Code/Source/solver/FE/Basis/BasisFunction.cpp
new file mode 100644
index 000000000..49c8d8763
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/BasisFunction.cpp
@@ -0,0 +1,366 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#include "BasisFunction.h"
+#include "VectorBasisEvaluationHelpers.h"
+#include <algorithm>
+#include <iomanip>
+#include <limits>
+#include <sstream>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+namespace {
+
+struct BasisFunctionScratch {
+    std::vector<Real> scalar_values;
+    std::vector<Gradient> scalar_gradients;
+    std::vector<Hessian> scalar_hessians;
+    std::vector<math::Vector<Real, 3>> vector_values;
+    std::vector<VectorJacobian> vector_jacobians;
+    std::vector<math::Vector<Real, 3>> vector_curls;
+    std::vector<Real> vector_divergences;
+
+    void prewarm(std::size_t max_size) {
+        scalar_values.reserve(max_size);
+        scalar_gradients.reserve(max_size);
+        scalar_hessians.reserve(max_size);
+        vector_values.reserve(max_size);
+        vector_jacobians.reserve(max_size);
+        vector_curls.reserve(max_size);
+        vector_divergences.reserve(max_size);
+    }
+};
+
+BasisFunctionScratch& basis_function_scratch() {
+    // Scratch is intentionally thread-local: production assembly uses a
+    // persistent worker-thread team, so buffers stay warm on each worker.
+    static thread_local BasisFunctionScratch scratch;
+    return scratch;
+}
+
+void mix_identity_hash_word(std::uint64_t word,
+                            std::uint64_t& hash_a,
+                            std::uint64_t& hash_b) noexcept {
+    hash_a ^= word + 0x9e3779b97f4a7c15ULL + (hash_a << 6u) + (hash_a >> 2u);
+    hash_b ^= (word + 0xbf58476d1ce4e5b9ULL) + (hash_b << 7u) + (hash_b >> 3u);
+}
+
+} // namespace
+
+BasisIdentityFingerprint
+compute_basis_identity_fingerprint(std::span<const std::uint64_t> words) noexcept {
+    BasisIdentityFingerprint fingerprint{0x243f6a8885a308d3ULL,
+                                         0x13198a2e03707344ULL};
+    mix_identity_hash_word(static_cast<std::uint64_t>(words.size()),
+                           fingerprint.hash_a,
+                           fingerprint.hash_b);
+    for (const auto word : words) {
+        mix_identity_hash_word(word, fingerprint.hash_a, fingerprint.hash_b);
+    }
+    return fingerprint;
+}
+
+std::string BasisFunction::cache_identity() const {
+    std::ostringstream oss;
+    oss << "basis=" << static_cast<int>(basis_type())
+        << "|elem=" << static_cast<int>(element_type())
+        << "|dim=" << dimension()
+        << "|order=" << order()
+        << "|size=" << size()
+        << "|vector=" << is_vector_valued();
+    return oss.str();
+}
+
+bool BasisFunction::cache_identity_words(std::vector<std::uint64_t>& words) const {
+    (void)words;
+    return false;
+}
+
+bool BasisFunction::cache_identity_fingerprint(std::uint64_t& hash_a,
+                                               std::uint64_t& hash_b) const {
+    (void)hash_a;
+    (void)hash_b;
+    return false;
+}
+
+void prewarm_basis_function_scratch(std::size_t max_size,
+                                    std::size_t max_qpts) {
+    (void)max_qpts;
+    basis_function_scratch().prewarm(max_size);
+}
+
+void BasisFunction::evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                       std::vector<Gradient>& gradients) const {
+    (void)xi;
+    (void)gradients;
+    throw BasisEvaluationException("Analytic gradient evaluation is not implemented for this basis",
+                                   __FILE__, __LINE__, __func__);
+}
+
+void BasisFunction::evaluate_hessians(const math::Vector<Real, 3>& xi,
+                                      std::vector<Hessian>& hessians) const {
+    (void)xi;
+    (void)hessians;
+    throw BasisEvaluationException("Analytic Hessian evaluation is not implemented for this basis",
+                                   __FILE__, __LINE__, __func__);
+}
+
+void BasisFunction::evaluate_all(const math::Vector<Real, 3>& xi,
+                                 std::vector<Real>& values,
+                                 std::vector<Gradient>& gradients,
+                                 std::vector<Hessian>& hessians) const {
+    evaluate_values(xi, values);
+    evaluate_gradients(xi, gradients);
+    evaluate_hessians(xi, hessians);
+}
+
+void BasisFunction::evaluate_values_to(const math::Vector<Real, 3>& xi,
+                                       Real* SVMP_RESTRICT values_out) const {
+    auto& tmp = basis_function_scratch().scalar_values;
+    tmp.resize(size());
+    evaluate_values(xi, tmp);
+    std::copy_n(tmp.data(), tmp.size(), values_out);
+}
+
+void BasisFunction::evaluate_gradients_to(const math::Vector<Real, 3>& xi,
+                                          Real* SVMP_RESTRICT gradients_out) const {
+    auto& tmp = basis_function_scratch().scalar_gradients;
+    tmp.resize(size());
+    evaluate_gradients(xi, tmp);
+    for (std::size_t i = 0; i < tmp.size(); ++i) {
+        gradients_out[i * 3u + 0u] = tmp[i][0];
+        gradients_out[i * 3u + 1u] = tmp[i][1];
+        gradients_out[i * 3u + 2u] = tmp[i][2];
+    }
+}
+
+void BasisFunction::evaluate_hessians_to(const math::Vector<Real, 3>& xi,
+                                         Real* SVMP_RESTRICT hessians_out) const {
+    auto& tmp = basis_function_scratch().scalar_hessians;
+    tmp.resize(size());
+    evaluate_hessians(xi, tmp);
+    for (std::size_t i = 0; i < tmp.size(); ++i) {
+        store_hessian(tmp[i], hessians_out + i * 9u);
+    }
+}
+
+void BasisFunction::evaluate_at_quadrature_points(
+    const std::vector<math::Vector<Real, 3>>& points,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) const {
+    evaluate_at_quadrature_points_strided(
+        points, points.size(), values_out, gradients_out, hessians_out);
+}
+
+void BasisFunction::evaluate_at_quadrature_points_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) const {
+    const std::size_t num_qpts = points.size();
+    const std::size_t num_dofs = size();
+    if (output_stride < num_qpts) {
+        throw BasisConfigurationException(
+            "BasisFunction strided evaluation requires output_stride >= points.size()",
+            __FILE__, __LINE__, __func__);
+    }
+
+    auto& scratch = basis_function_scratch();
+    auto& v_tmp = scratch.scalar_values;
+    auto& g_tmp = scratch.scalar_gradients;
+    auto& h_tmp = scratch.scalar_hessians;
+    if (values_out) v_tmp.resize(num_dofs);
+    if (gradients_out) g_tmp.resize(num_dofs);
+    if (hessians_out) h_tmp.resize(num_dofs);
+
+    for (std::size_t q = 0; q < num_qpts; ++q) {
+        if (values_out && gradients_out && hessians_out) {
+            evaluate_all(points[q], v_tmp, g_tmp, h_tmp);
+        } else {
+            if (values_out) evaluate_values(points[q], v_tmp);
+            if (gradients_out) evaluate_gradients(points[q], g_tmp);
+            if (hessians_out) evaluate_hessians(points[q], h_tmp);
+        }
+
+        if (values_out) {
+            for (std::size_t dof = 0; dof < num_dofs; ++dof) {
+                values_out[dof * output_stride + q] = v_tmp[dof];
+            }
+        }
+        if (gradients_out) {
+            for (std::size_t dof = 0; dof < num_dofs; ++dof) {
+                for (std::size_t component = 0; component < 3u; ++component) {
+                    gradients_out[(dof * 3u + component) * output_stride + q] =
+                        g_tmp[dof][component];
+                }
+            }
+        }
+        if (hessians_out) {
+            for (std::size_t dof = 0; dof < num_dofs; ++dof) {
+                store_hessian_strided(
+                    h_tmp[dof], hessians_out + dof * 9u * output_stride, output_stride, q);
+            }
+        }
+    }
+}
+
+void BasisFunction::fill_scalar_cache_entry(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) const {
+    evaluate_at_quadrature_points_strided(
+        points, output_stride, values_out, gradients_out, hessians_out);
+}
+
+void BasisFunction::evaluate_vector_at_quadrature_points(
+    const std::vector<math::Vector<Real, 3>>& points,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT jacobians_out,
+    Real* SVMP_RESTRICT curls_out,
+    Real* SVMP_RESTRICT divergence_out) const {
+    evaluate_vector_at_quadrature_points_strided(
+        points, points.size(), values_out, jacobians_out, curls_out, divergence_out);
+}
+
+void BasisFunction::evaluate_vector_at_quadrature_points_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT jacobians_out,
+    Real* SVMP_RESTRICT curls_out,
+    Real* SVMP_RESTRICT divergence_out) const {
+    const std::size_t num_qpts = points.size();
+    const std::size_t num_dofs = size();
+    detail::vector_common::validate_vector_strided_outputs(
+        num_qpts, output_stride, "BasisFunction");
+
+    auto& scratch = basis_function_scratch();
+    auto& v_tmp = scratch.vector_values;
+    auto& j_tmp = scratch.vector_jacobians;
+    auto& c_tmp = scratch.vector_curls;
+    auto& d_tmp = scratch.vector_divergences;
+    if (values_out) v_tmp.resize(num_dofs);
+    if (jacobians_out) j_tmp.resize(num_dofs);
+    if (curls_out) c_tmp.resize(num_dofs);
+    if (divergence_out) d_tmp.resize(num_dofs);
+
+    for (std::size_t q = 0; q < num_qpts; ++q) {
+        if (values_out) {
+            evaluate_vector_values(points[q], v_tmp);
+            detail::vector_common::write_vector_values_strided(
+                v_tmp, num_dofs, output_stride, q, values_out);
+        }
+
+        if (jacobians_out) {
+            evaluate_vector_jacobians(points[q], j_tmp);
+            detail::vector_common::write_vector_jacobians_strided(
+                j_tmp, num_dofs, output_stride, q, jacobians_out);
+        }
+
+        if (curls_out) {
+            evaluate_curl(points[q], c_tmp);
+            detail::vector_common::write_vector_curl_strided(
+                c_tmp, num_dofs, output_stride, q, curls_out);
+        }
+
+        if (divergence_out) {
+            evaluate_divergence(points[q], d_tmp);
+            detail::vector_common::write_vector_divergence_strided(
+                d_tmp, num_dofs, output_stride, q, divergence_out);
+        }
+    }
+}
+
+void BasisFunction::evaluate_vector_values(
+    const math::Vector<Real, 3>&,
+    std::vector<math::Vector<Real, 3>>&) const {
+    throw BasisEvaluationException("Vector-valued evaluation requested on scalar basis",
+                                   __FILE__, __LINE__, __func__);
+}
+
+void BasisFunction::evaluate_vector_jacobians(
+    const math::Vector<Real, 3>&,
+    std::vector<VectorJacobian>&) const {
+    throw BasisEvaluationException("Vector-basis Jacobian evaluation requested on scalar basis",
+                                   __FILE__, __LINE__, __func__);
+}
+
+void BasisFunction::evaluate_divergence(
+    const math::Vector<Real, 3>&,
+    std::vector<Real>&) const {
+    throw BasisEvaluationException("Divergence requested on scalar basis",
+                                   __FILE__, __LINE__, __func__);
+}
+
+void BasisFunction::evaluate_curl(
+    const math::Vector<Real, 3>&,
+    std::vector<math::Vector<Real, 3>>&) const {
+    throw BasisEvaluationException("Curl requested on scalar basis",
+                                   __FILE__, __LINE__, __func__);
+}
+
+void BasisFunction::numerical_gradient(const math::Vector<Real, 3>& xi,
+                                       std::vector<Gradient>& gradients,
+                                       Real eps) const {
+    std::vector<Real> base;
+    evaluate_values(xi, base);
+    gradients.assign(base.size(), Gradient{});
+
+    for (int d = 0; d < dimension(); ++d) {
+        math::Vector<Real, 3> forward = xi;
+        math::Vector<Real, 3> backward = xi;
+        const std::size_t idx = static_cast<std::size_t>(d);
+        forward[idx] += eps;
+        backward[idx] -= eps;
+
+        std::vector<Real> fwd, bwd;
+        evaluate_values(forward, fwd);
+        evaluate_values(backward, bwd);
+
+        for (std::size_t i = 0; i < base.size(); ++i) {
+            gradients[i][idx] = (fwd[i] - bwd[i]) / (Real(2) * eps);
+        }
+    }
+}
+
+void BasisFunction::numerical_hessian(const math::Vector<Real, 3>& xi,
+                                      std::vector<Hessian>& hessians,
+                                      Real eps) const {
+    std::vector<Gradient> base_grad;
+    evaluate_gradients(xi, base_grad);
+    hessians.assign(base_grad.size(), Hessian{});
+
+    for (int d = 0; d < dimension(); ++d) {
+        math::Vector<Real, 3> forward = xi;
+        math::Vector<Real, 3> backward = xi;
+        const std::size_t col = static_cast<std::size_t>(d);
+        forward[col] += eps;
+        backward[col] -= eps;
+
+        std::vector<Gradient> g_forward, g_backward;
+        evaluate_gradients(forward, g_forward);
+        evaluate_gradients(backward, g_backward);
+
+        for (std::size_t i = 0; i < base_grad.size(); ++i) {
+            for (int k = 0; k < dimension(); ++k) {
+                const std::size_t row = static_cast<std::size_t>(k);
+                hessians[i](row, col) = (g_forward[i][row] - g_backward[i][row]) / (Real(2) * eps);
+            }
+        }
+    }
+}
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
new file mode 100644
index 000000000..ee38a5b19
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -0,0 +1,426 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_BASIS_BASISFUNCTION_H
+#define SVMP_FE_BASIS_BASISFUNCTION_H
+
+/**
+ * @file BasisFunction.h
+ * @brief Abstract interface for basis function evaluation on reference elements
+ *
+ * The Basis module operates purely on reference elements and is independent of
+ * mesh-specific data structures. Implementations may leverage Math and
+ * Quadrature utilities but must not read mesh connectivity or geometry.
+ */
+
+#include "Types.h"
+#include "BasisExceptions.h"
+#include "Math/Vector.h"
+#include "Math/Matrix.h"
+#include <algorithm>
+#include <cstdint>
+#include <functional>
+#include <span>
+#include <string>
+#include <vector>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+using Gradient = math::Vector<Real, 3>;
+using Hessian  = math::Matrix<Real, 3, 3>;
+using VectorJacobian = math::Matrix<Real, 3, 3>;
+
+struct BasisIdentityFingerprint {
+    std::uint64_t hash_a{0};
+    std::uint64_t hash_b{0};
+};
+
+[[nodiscard]] BasisIdentityFingerprint
+compute_basis_identity_fingerprint(std::span<const std::uint64_t> words) noexcept;
+
+void prewarm_basis_function_scratch(std::size_t max_size,
+                                    std::size_t max_qpts = 0);
+
+[[nodiscard]] inline Hessian make_symmetric_hessian(Real xx,
+                                                    Real yy,
+                                                    Real zz,
+                                                    Real xy,
+                                                    Real xz,
+                                                    Real yz) {
+    Hessian hessian{};
+    hessian(0, 0) = xx;
+    hessian(1, 1) = yy;
+    hessian(2, 2) = zz;
+    hessian(0, 1) = xy;
+    hessian(1, 0) = xy;
+    hessian(0, 2) = xz;
+    hessian(2, 0) = xz;
+    hessian(1, 2) = yz;
+    hessian(2, 1) = yz;
+    return hessian;
+}
+
+// Raw Hessian buffers use row-major 3x3 blocks:
+// dst[row * 3 + col] = H(row, col).
+inline void store_hessian(const Hessian& hessian, Real* dst) noexcept {
+    dst[0u] = hessian(0u, 0u);
+    dst[1u] = hessian(0u, 1u);
+    dst[2u] = hessian(0u, 2u);
+    dst[3u] = hessian(1u, 0u);
+    dst[4u] = hessian(1u, 1u);
+    dst[5u] = hessian(1u, 2u);
+    dst[6u] = hessian(2u, 0u);
+    dst[7u] = hessian(2u, 1u);
+    dst[8u] = hessian(2u, 2u);
+}
+
+inline void store_hessian_strided(const Hessian& hessian,
+                                  Real* dst,
+                                  std::size_t stride,
+                                  std::size_t offset) noexcept {
+    dst[0u * stride + offset] = hessian(0u, 0u);
+    dst[1u * stride + offset] = hessian(0u, 1u);
+    dst[2u * stride + offset] = hessian(0u, 2u);
+    dst[3u * stride + offset] = hessian(1u, 0u);
+    dst[4u * stride + offset] = hessian(1u, 1u);
+    dst[5u * stride + offset] = hessian(1u, 2u);
+    dst[6u * stride + offset] = hessian(2u, 0u);
+    dst[7u * stride + offset] = hessian(2u, 1u);
+    dst[8u * stride + offset] = hessian(2u, 2u);
+}
+
+inline void scatter_hessian_components_strided(const Real* src,
+                                               Real* dst,
+                                               std::size_t stride,
+                                               std::size_t offset) noexcept {
+    dst[0u * stride + offset] = src[0u];
+    dst[1u * stride + offset] = src[1u];
+    dst[2u * stride + offset] = src[2u];
+    dst[3u * stride + offset] = src[3u];
+    dst[4u * stride + offset] = src[4u];
+    dst[5u * stride + offset] = src[5u];
+    dst[6u * stride + offset] = src[6u];
+    dst[7u * stride + offset] = src[7u];
+    dst[8u * stride + offset] = src[8u];
+}
+
+[[nodiscard]] inline Hessian load_hessian(const Real* src) noexcept {
+    Hessian hessian{};
+    hessian(0u, 0u) = src[0u];
+    hessian(0u, 1u) = src[1u];
+    hessian(0u, 2u) = src[2u];
+    hessian(1u, 0u) = src[3u];
+    hessian(1u, 1u) = src[4u];
+    hessian(1u, 2u) = src[5u];
+    hessian(2u, 0u) = src[6u];
+    hessian(2u, 1u) = src[7u];
+    hessian(2u, 2u) = src[8u];
+    return hessian;
+}
+
+inline void add_scaled_hessian(Hessian& target,
+                               const Hessian& source,
+                               Real scale) noexcept {
+    target(0u, 0u) += scale * source(0u, 0u);
+    target(0u, 1u) += scale * source(0u, 1u);
+    target(0u, 2u) += scale * source(0u, 2u);
+    target(1u, 0u) += scale * source(1u, 0u);
+    target(1u, 1u) += scale * source(1u, 1u);
+    target(1u, 2u) += scale * source(1u, 2u);
+    target(2u, 0u) += scale * source(2u, 0u);
+    target(2u, 1u) += scale * source(2u, 1u);
+    target(2u, 2u) += scale * source(2u, 2u);
+}
+
+/**
+ * @brief Base interface for scalar and vector-valued basis families
+ *
+ * All basis implementations operate in reference space. Physical mappings are
+ * handled by the Geometry module. Derivatives are returned with unused
+ * components set to zero for lower dimensional elements.
+ */
+class BasisFunction {
+public:
+    virtual ~BasisFunction() = default;
+
+    /// Basis family identifier
+    virtual BasisType basis_type() const noexcept = 0;
+
+    /// Underlying element type on the reference domain
+    virtual ElementType element_type() const noexcept = 0;
+
+    /// Reference dimensionality (1, 2, or 3)
+    virtual int dimension() const noexcept = 0;
+
+    /// Polynomial order (modal/nodal definition dependent)
+    virtual int order() const noexcept = 0;
+
+    /// Number of basis functions (scalar or vector-valued)
+    virtual std::size_t size() const noexcept = 0;
+
+    /**
+     * @brief Whether BasisCache can key this basis from common structural fields.
+     *
+     * Return true only when basis_type/element_type/dimension/order/size and
+     * vector-valued status fully determine evaluation behavior. Parameterized
+     * bases such as splines and custom user bases should keep the default false
+     * so BasisCache includes cache_identity() in the key.
+     */
+    virtual bool cache_identity_is_structural() const noexcept { return false; }
+
+    /// Whether the basis is vector-valued (H(div)/H(curl))
+    virtual bool is_vector_valued() const noexcept { return false; }
+
+    /// Whether vector-valued basis Jacobians are available.
+    virtual bool supports_vector_jacobians() const noexcept { return false; }
+
+    /// Whether vector-valued basis curls are available.
+    virtual bool supports_curl() const noexcept { return false; }
+
+    /// Whether vector-valued basis divergences are available.
+    virtual bool supports_divergence() const noexcept { return false; }
+
+    /**
+     * @brief Stable semantic identity used by BasisCache
+     *
+     * Derived classes should override this when evaluation depends on
+     * additional state beyond basis family / element / order metadata.
+     */
+    virtual std::string cache_identity() const;
+
+    /**
+     * @brief Optional exact structured identity payload for BasisCache keys.
+     *
+     * Parameterized bases may append stable integer/bit-pattern words and
+     * return true to let BasisCache avoid using cache_identity() as the exact
+     * key payload. The human-readable cache_identity() remains available for
+     * diagnostics and for custom bases that do not implement this path.
+     */
+    virtual bool cache_identity_words(std::vector<std::uint64_t>& words) const;
+
+    /**
+     * @brief Optional cached fingerprint for structured identity words.
+     *
+     * Implementations that precompute cache_identity_words() may also cache the
+     * corresponding fingerprint. BasisCache still retains exact identity words
+     * for equality after hash matches.
+     */
+    virtual bool cache_identity_fingerprint(std::uint64_t& hash_a,
+                                            std::uint64_t& hash_b) const;
+
+    /**
+     * @brief Evaluate scalar basis values at a reference point
+     * @param xi Reference coordinates (unused entries are ignored)
+     * @param[out] values Output array resized to size()
+     */
+    virtual void evaluate_values(const math::Vector<Real, 3>& xi,
+                                 std::vector<Real>& values) const = 0;
+
+    /**
+     * @brief Evaluate gradients of scalar basis functions
+     *
+     * Production bases must override this with analytic derivatives.
+     * Use numerical_gradient explicitly in tests or diagnostics when a finite
+     * difference approximation is intended.
+     */
+    virtual void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                    std::vector<Gradient>& gradients) const;
+
+    /**
+     * @brief Evaluate Hessians of scalar basis functions
+     *
+     * Production bases must override this with analytic second derivatives.
+     * Use numerical_hessian explicitly in tests or diagnostics when a finite
+     * difference approximation is intended.
+     */
+    virtual void evaluate_hessians(const math::Vector<Real, 3>& xi,
+                                   std::vector<Hessian>& hessians) const;
+
+    /**
+     * @brief Fused evaluation of values, gradients, and Hessians at one point
+     *
+     * Default implementation calls evaluate_values, evaluate_gradients, and
+     * evaluate_hessians in sequence. Bases that share intermediate
+     * computations (e.g., LagrangeBasis sharing per-axis 1D evaluations)
+     * should override this to avoid redundant work.
+     */
+    virtual void evaluate_all(const math::Vector<Real, 3>& xi,
+                              std::vector<Real>& values,
+                              std::vector<Gradient>& gradients,
+                              std::vector<Hessian>& hessians) const;
+
+    /**
+     * @brief Fill SoA buffers with basis evaluations at all quadrature points
+     *
+     * Outputs are written directly to caller-provided strided buffers in
+     * DOF-major SoA layout — no scratch+transpose required by the caller.
+     * Pass `nullptr` for any output that is not needed.
+     *
+     *   values_out:    size num_dofs * num_qpts; element [d * num_qpts + q]
+     *   gradients_out: size num_dofs * 3 * num_qpts; element [(d*3 + c) * num_qpts + q]
+     *   hessians_out:  size num_dofs * 9 * num_qpts; element [(d*9 + r*3 + c) * num_qpts + q]
+     *
+     * Non-null output ranges must not overlap each other. Implementations may
+     * fill requested quantities in any order that is efficient for the basis.
+     *
+     * Default implementation calls evaluate_all (or evaluate_values/gradients/
+     * hessians as appropriate) per QP, materializing into temp buffers then
+     * scatter-writing to the output. Performance-sensitive bases must override
+     * this path so batched assembly does not fall back to Q virtual point
+     * evaluations. Unit coverage keeps an explicit list of hot bases that are
+     * expected to provide a direct strided implementation.
+     */
+    virtual void evaluate_at_quadrature_points(
+        const std::vector<math::Vector<Real, 3>>& points,
+        Real* SVMP_RESTRICT values_out,
+        Real* SVMP_RESTRICT gradients_out,
+        Real* SVMP_RESTRICT hessians_out) const;
+
+    /**
+     * @brief Fill strided SoA buffers with basis evaluations at quadrature points
+     *
+     * Same component layout as evaluate_at_quadrature_points, but each
+     * dof/component row advances by `output_stride` rather than `points.size()`.
+     * This lets padded SIMD cache storage be filled directly. Non-null output
+     * ranges have the same non-overlap requirement.
+     */
+    virtual void evaluate_at_quadrature_points_strided(
+        const std::vector<math::Vector<Real, 3>>& points,
+        std::size_t output_stride,
+        Real* SVMP_RESTRICT values_out,
+        Real* SVMP_RESTRICT gradients_out,
+        Real* SVMP_RESTRICT hessians_out) const;
+
+    /**
+     * @brief Fill zero-initialized scalar cache storage.
+     *
+     * BasisCache allocates and zero-initializes its scalar SoA buffers before
+     * calling this hook. The default implementation overwrites all requested
+     * entries through the public strided evaluator. Sparse-support bases may
+     * override this and write only active entries, relying on the caller's
+     * zero-initialization for inactive DOFs and unused derivative components.
+     */
+    virtual void fill_scalar_cache_entry(
+        const std::vector<math::Vector<Real, 3>>& points,
+        std::size_t output_stride,
+        Real* SVMP_RESTRICT values_out,
+        Real* SVMP_RESTRICT gradients_out,
+        Real* SVMP_RESTRICT hessians_out) const;
+
+    /**
+     * @brief Fill SoA buffers with vector-basis evaluations at all quadrature points
+     *
+     * Outputs are written in DOF-major SoA layout. Pass `nullptr` for any
+     * quantity that is not needed.
+     *
+     *   values_out:     size num_dofs * 3 * num_qpts; element [(d*3 + c) * num_qpts + q]
+     *   jacobians_out:  size num_dofs * 9 * num_qpts; element [(d*9 + c*3 + r) * num_qpts + q]
+     *   curls_out:      size num_dofs * 3 * num_qpts; element [(d*3 + c) * num_qpts + q]
+     *   divergence_out: size num_dofs * num_qpts; element [d * num_qpts + q]
+     *
+     * Non-null output ranges must not overlap each other. Implementations may
+     * fill requested quantities in any order that is efficient for the basis.
+     */
+    virtual void evaluate_vector_at_quadrature_points(
+        const std::vector<math::Vector<Real, 3>>& points,
+        Real* SVMP_RESTRICT values_out,
+        Real* SVMP_RESTRICT jacobians_out,
+        Real* SVMP_RESTRICT curls_out,
+        Real* SVMP_RESTRICT divergence_out) const;
+
+    /**
+     * @brief Fill strided SoA buffers with vector-basis evaluations
+     *
+     * Same component layout as evaluate_vector_at_quadrature_points, but each
+     * dof/component row advances by `output_stride` rather than `points.size()`.
+     * Non-null output ranges have the same non-overlap requirement.
+     *
+     * The base fallback loops over quadrature points through virtual point
+     * evaluation. H(div)/H(curl) bases used in assembly should override this
+     * method directly, and tests track the current hot vector families.
+     */
+    virtual void evaluate_vector_at_quadrature_points_strided(
+        const std::vector<math::Vector<Real, 3>>& points,
+        std::size_t output_stride,
+        Real* SVMP_RESTRICT values_out,
+        Real* SVMP_RESTRICT jacobians_out,
+        Real* SVMP_RESTRICT curls_out,
+        Real* SVMP_RESTRICT divergence_out) const;
+
+    /**
+     * @brief Evaluate scalar basis values into a caller-provided raw buffer
+     *
+     * Caller is responsible for providing a buffer of at least size() Real
+     * entries. This avoids the per-call std::vector::resize() cost of the
+     * vector-output overload. Default implementation forwards through a temp
+     * vector; bases should override for direct write.
+     */
+    virtual void evaluate_values_to(const math::Vector<Real, 3>& xi,
+                                    Real* SVMP_RESTRICT values_out) const;
+
+    /**
+     * @brief Evaluate gradients into a flat caller-provided buffer
+     *
+     * Layout: gradients_out[i * 3 + c] = component c of gradient of basis i.
+     * Caller provides a buffer of size() * 3 Real entries.
+     */
+    virtual void evaluate_gradients_to(const math::Vector<Real, 3>& xi,
+                                       Real* SVMP_RESTRICT gradients_out) const;
+
+    /**
+     * @brief Evaluate Hessians into a flat caller-provided buffer
+     *
+     * Layout: hessians_out[i * 9 + r * 3 + c] = H_i(r, c).
+     */
+    virtual void evaluate_hessians_to(const math::Vector<Real, 3>& xi,
+                                      Real* SVMP_RESTRICT hessians_out) const;
+
+    /**
+     * @brief Evaluate vector-valued basis functions (H(div)/H(curl))
+     *
+     * Default implementation throws; vector bases must override.
+     */
+    virtual void evaluate_vector_values(const math::Vector<Real, 3>& xi,
+                                        std::vector<math::Vector<Real, 3>>& values) const;
+
+    /**
+     * @brief Evaluate reference-space Jacobians of vector-valued basis functions
+     *
+     * The returned matrix for basis function `i` has entries
+     * `jacobians[i](component, derivative_direction) = d phi_i_component / d xi_direction`.
+     * Unused rows/columns are zero-filled for lower-dimensional elements.
+     */
+    virtual void evaluate_vector_jacobians(const math::Vector<Real, 3>& xi,
+                                           std::vector<VectorJacobian>& jacobians) const;
+
+    /// Evaluate divergence of vector-valued basis functions (if applicable)
+    virtual void evaluate_divergence(const math::Vector<Real, 3>& xi,
+                                     std::vector<Real>& divergence) const;
+
+    /// Evaluate curl of vector-valued basis functions (if applicable)
+    virtual void evaluate_curl(const math::Vector<Real, 3>& xi,
+                               std::vector<math::Vector<Real, 3>>& curl) const;
+
+protected:
+    /// Finite-difference helper for gradients of scalar bases
+    void numerical_gradient(const math::Vector<Real, 3>& xi,
+                            std::vector<Gradient>& gradients,
+                            Real eps = Real(1e-6)) const;
+
+    /// Finite-difference helper for Hessians of scalar bases
+    void numerical_hessian(const math::Vector<Real, 3>& xi,
+                           std::vector<Hessian>& hessians,
+                           Real eps = Real(1e-5)) const;
+};
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_BASISFUNCTION_H
diff --git a/Code/Source/solver/FE/Basis/BasisTolerance.h b/Code/Source/solver/FE/Basis/BasisTolerance.h
new file mode 100644
index 000000000..423551f09
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/BasisTolerance.h
@@ -0,0 +1,52 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_BASIS_BASISTOLERANCE_H
+#define SVMP_FE_BASIS_BASISTOLERANCE_H
+
+#include "Types.h"
+
+#include <limits>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+namespace detail {
+
+[[nodiscard]] constexpr Real basis_abs(Real value) noexcept {
+    return value < Real(0) ? -value : value;
+}
+
+[[nodiscard]] constexpr Real basis_max(Real lhs, Real rhs) noexcept {
+    return lhs < rhs ? rhs : lhs;
+}
+
+[[nodiscard]] constexpr Real basis_scaled_tolerance(Real scale = Real(1),
+                                                    Real multiplier = Real(64)) noexcept {
+    return multiplier * std::numeric_limits<Real>::epsilon() *
+           basis_max(Real(1), basis_abs(scale));
+}
+
+[[nodiscard]] constexpr bool basis_near_zero(Real value,
+                                             Real scale = Real(1),
+                                             Real multiplier = Real(64)) noexcept {
+    return basis_abs(value) <= basis_scaled_tolerance(scale, multiplier);
+}
+
+[[nodiscard]] constexpr bool basis_nearly_equal(Real a,
+                                                Real b,
+                                                Real multiplier = Real(64)) noexcept {
+    const Real scale = basis_max(Real(1), basis_max(basis_abs(a), basis_abs(b)));
+    return basis_abs(a - b) <= basis_scaled_tolerance(scale, multiplier);
+}
+
+} // namespace detail
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_BASISTOLERANCE_H
diff --git a/Code/Source/solver/FE/Basis/BasisTraits.h b/Code/Source/solver/FE/Basis/BasisTraits.h
new file mode 100644
index 000000000..835dfe705
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/BasisTraits.h
@@ -0,0 +1,218 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_BASIS_BASISTRAITS_H
+#define SVMP_FE_BASIS_BASISTRAITS_H
+
+#include "Types.h"
+
+#include <cstddef>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+enum class BasisTopology {
+    Unknown,
+    Point,
+    Line,
+    Triangle,
+    Quadrilateral,
+    Tetrahedron,
+    Hexahedron,
+    Wedge,
+    Pyramid,
+};
+
+[[nodiscard]] constexpr bool is_point(ElementType type) noexcept {
+    return type == ElementType::Point1;
+}
+
+[[nodiscard]] constexpr bool is_line(ElementType type) noexcept {
+    return type == ElementType::Line2 || type == ElementType::Line3;
+}
+
+[[nodiscard]] constexpr bool is_triangle(ElementType type) noexcept {
+    return type == ElementType::Triangle3 || type == ElementType::Triangle6;
+}
+
+[[nodiscard]] constexpr bool is_quadrilateral(ElementType type) noexcept {
+    return type == ElementType::Quad4 || type == ElementType::Quad8 ||
+           type == ElementType::Quad9;
+}
+
+[[nodiscard]] constexpr bool is_tetrahedron(ElementType type) noexcept {
+    return type == ElementType::Tetra4 || type == ElementType::Tetra10;
+}
+
+[[nodiscard]] constexpr bool is_hexahedron(ElementType type) noexcept {
+    return type == ElementType::Hex8 || type == ElementType::Hex20 ||
+           type == ElementType::Hex27;
+}
+
+[[nodiscard]] constexpr bool is_wedge(ElementType type) noexcept {
+    return type == ElementType::Wedge6 || type == ElementType::Wedge15 ||
+           type == ElementType::Wedge18;
+}
+
+[[nodiscard]] constexpr bool is_pyramid(ElementType type) noexcept {
+    return type == ElementType::Pyramid5 || type == ElementType::Pyramid13 ||
+           type == ElementType::Pyramid14;
+}
+
+[[nodiscard]] constexpr bool is_simplex(ElementType type) noexcept {
+    return is_triangle(type) || is_tetrahedron(type);
+}
+
+[[nodiscard]] constexpr bool is_tensor_product(ElementType type) noexcept {
+    return is_line(type) || is_quadrilateral(type) || is_hexahedron(type);
+}
+
+[[nodiscard]] constexpr int reference_dimension(ElementType type) noexcept {
+    return element_dimension(type);
+}
+
+[[nodiscard]] constexpr BasisTopology topology(ElementType type) noexcept {
+    if (is_point(type)) {
+        return BasisTopology::Point;
+    }
+    if (is_line(type)) {
+        return BasisTopology::Line;
+    }
+    if (is_triangle(type)) {
+        return BasisTopology::Triangle;
+    }
+    if (is_quadrilateral(type)) {
+        return BasisTopology::Quadrilateral;
+    }
+    if (is_tetrahedron(type)) {
+        return BasisTopology::Tetrahedron;
+    }
+    if (is_hexahedron(type)) {
+        return BasisTopology::Hexahedron;
+    }
+    if (is_wedge(type)) {
+        return BasisTopology::Wedge;
+    }
+    if (is_pyramid(type)) {
+        return BasisTopology::Pyramid;
+    }
+    return BasisTopology::Unknown;
+}
+
+[[nodiscard]] constexpr ElementType canonical_lagrange_type(ElementType type) noexcept {
+    switch (type) {
+        case ElementType::Line2:
+        case ElementType::Line3:
+            return ElementType::Line2;
+        case ElementType::Triangle3:
+        case ElementType::Triangle6:
+            return ElementType::Triangle3;
+        case ElementType::Quad4:
+        case ElementType::Quad9:
+            return ElementType::Quad4;
+        case ElementType::Tetra4:
+        case ElementType::Tetra10:
+            return ElementType::Tetra4;
+        case ElementType::Hex8:
+        case ElementType::Hex27:
+            return ElementType::Hex8;
+        case ElementType::Wedge6:
+        case ElementType::Wedge18:
+            return ElementType::Wedge6;
+        case ElementType::Pyramid5:
+        case ElementType::Pyramid14:
+            return ElementType::Pyramid5;
+        default:
+            return type;
+    }
+}
+
+[[nodiscard]] constexpr int complete_lagrange_alias_order(ElementType type) noexcept {
+    switch (type) {
+        case ElementType::Line2:
+        case ElementType::Triangle3:
+        case ElementType::Quad4:
+        case ElementType::Tetra4:
+        case ElementType::Hex8:
+        case ElementType::Wedge6:
+        case ElementType::Pyramid5:
+            return 1;
+        case ElementType::Line3:
+        case ElementType::Triangle6:
+        case ElementType::Quad9:
+        case ElementType::Tetra10:
+        case ElementType::Hex27:
+        case ElementType::Wedge18:
+        case ElementType::Pyramid14:
+            return 2;
+        default:
+            return -1;
+    }
+}
+
+[[nodiscard]] constexpr std::size_t line_lagrange_size(int order) noexcept {
+    return order >= 0 ? static_cast<std::size_t>(order + 1) : 0u;
+}
+
+[[nodiscard]] constexpr std::size_t triangle_lagrange_size(int order) noexcept {
+    return order >= 0 ? static_cast<std::size_t>((order + 1) * (order + 2) / 2) : 0u;
+}
+
+[[nodiscard]] constexpr std::size_t quad_lagrange_size(int order) noexcept {
+    return order >= 0 ? static_cast<std::size_t>((order + 1) * (order + 1)) : 0u;
+}
+
+[[nodiscard]] constexpr std::size_t tetra_lagrange_size(int order) noexcept {
+    return order >= 0 ? static_cast<std::size_t>((order + 1) * (order + 2) * (order + 3) / 6) : 0u;
+}
+
+[[nodiscard]] constexpr std::size_t hex_lagrange_size(int order) noexcept {
+    return order >= 0 ? static_cast<std::size_t>((order + 1) * (order + 1) * (order + 1)) : 0u;
+}
+
+[[nodiscard]] constexpr std::size_t wedge_lagrange_size(int order) noexcept {
+    return triangle_lagrange_size(order) * line_lagrange_size(order);
+}
+
+[[nodiscard]] constexpr std::size_t pyramid_lagrange_size(int order) noexcept {
+    if (order < 0) {
+        return 0u;
+    }
+    const std::size_t p = static_cast<std::size_t>(order);
+    return (p + 1u) * (p + 2u) * (2u * p + 3u) / 6u;
+}
+
+[[nodiscard]] constexpr std::size_t complete_lagrange_alias_size(ElementType type) noexcept {
+    const int order = complete_lagrange_alias_order(type);
+    switch (canonical_lagrange_type(type)) {
+        case ElementType::Point1:
+            return 1u;
+        case ElementType::Line2:
+            return line_lagrange_size(order);
+        case ElementType::Triangle3:
+            return triangle_lagrange_size(order);
+        case ElementType::Quad4:
+            return quad_lagrange_size(order);
+        case ElementType::Tetra4:
+            return tetra_lagrange_size(order);
+        case ElementType::Hex8:
+            return hex_lagrange_size(order);
+        case ElementType::Wedge6:
+            return wedge_lagrange_size(order);
+        case ElementType::Pyramid5:
+            return pyramid_lagrange_size(order);
+        default:
+            return 0u;
+    }
+}
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_BASISTRAITS_H
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
new file mode 100644
index 000000000..63b947516
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -0,0 +1,8323 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#include "LagrangeBasis.h"
+#include "BasisTraits.h"
+#include "BasisTolerance.h"
+#include "LagrangeBasisFast.h"
+#include "NodeOrderingConventions.h"
+#include "LagrangeBasisPyramid.h"
+#include "LagrangeBasisSimplex.h"
+#include "LagrangeBasisUtility.h"
+#include <algorithm>
+#include <cmath>
+#include <unordered_map>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+namespace {
+
+using LagrangeTopology = BasisTopology;
+
+#if defined(_MSC_VER)
+#define SVMP_LAGRANGE_NOINLINE __declspec(noinline)
+#define SVMP_LAGRANGE_ALIGN64
+#elif defined(__GNUC__) || defined(__clang__)
+#define SVMP_LAGRANGE_NOINLINE __attribute__((noinline))
+#define SVMP_LAGRANGE_ALIGN64 __attribute__((aligned(64)))
+#else
+#define SVMP_LAGRANGE_NOINLINE
+#define SVMP_LAGRANGE_ALIGN64
+#endif
+
+#ifndef FE_ALWAYS_INLINE
+#if defined(_MSC_VER)
+#define FE_ALWAYS_INLINE __forceinline
+#elif defined(__GNUC__) || defined(__clang__)
+#define FE_ALWAYS_INLINE __attribute__((always_inline)) inline
+#else
+#define FE_ALWAYS_INLINE inline
+#endif
+#endif
+
+SVMP_LAGRANGE_NOINLINE void evaluate_triangle_order1_gradients_strided(
+    std::size_t num_qpts,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out);
+
+struct LagrangeTopologyTraits {
+    LagrangeTopology topology;
+    int dimension;
+};
+
+struct SimplexExponentHash {
+    std::size_t operator()(const std::array<int, 4>& exponents) const noexcept {
+        std::size_t seed = 0x9e3779b97f4a7c15ull;
+        for (const int exponent : exponents) {
+            const auto value = static_cast<std::size_t>(exponent);
+            seed ^= value + 0x9e3779b97f4a7c15ull + (seed << 6u) + (seed >> 2u);
+        }
+        return seed;
+    }
+};
+
+template<typename T, std::size_t N>
+void assign_array(std::vector<T>& out, const std::array<T, N>& values) {
+    out.assign(values.begin(), values.end());
+}
+
+bool coordinate_matches_expected(Real coord, Real expected) noexcept {
+    return detail::basis_nearly_equal(coord, expected);
+}
+
+template<typename FastBasis>
+void evaluate_fast_outputs(const math::Vector<Real, 3>& xi,
+                           std::vector<Real>* values,
+                           std::vector<Gradient>* gradients,
+                           std::vector<Hessian>* hessians) {
+    if (values != nullptr) {
+        std::array<Real, FastBasis::n_dofs> fast_values{};
+        FastBasis::evaluate(xi, fast_values);
+        assign_array(*values, fast_values);
+    }
+    if (gradients != nullptr) {
+        std::array<Gradient, FastBasis::n_dofs> fast_gradients{};
+        FastBasis::evaluate_gradients(xi, fast_gradients);
+        assign_array(*gradients, fast_gradients);
+    }
+    if (hessians != nullptr) {
+        std::array<Hessian, FastBasis::n_dofs> fast_hessians{};
+        FastBasis::evaluate_hessians(xi, fast_hessians);
+        assign_array(*hessians, fast_hessians);
+    }
+}
+
+template<typename FastBasis>
+void evaluate_fast_outputs_to(const math::Vector<Real, 3>& xi,
+                              Real* SVMP_RESTRICT values_out,
+                              Real* SVMP_RESTRICT gradients_out,
+                              Real* SVMP_RESTRICT hessians_out) {
+    if (values_out != nullptr) {
+        std::array<Real, FastBasis::n_dofs> fast_values{};
+        FastBasis::evaluate(xi, fast_values);
+        for (std::size_t i = 0; i < fast_values.size(); ++i) {
+            values_out[i] = fast_values[i];
+        }
+    }
+    if (gradients_out != nullptr) {
+        std::array<Gradient, FastBasis::n_dofs> fast_gradients{};
+        FastBasis::evaluate_gradients(xi, fast_gradients);
+        for (std::size_t i = 0; i < fast_gradients.size(); ++i) {
+            gradients_out[i * 3u + 0u] = fast_gradients[i][0];
+            gradients_out[i * 3u + 1u] = fast_gradients[i][1];
+            gradients_out[i * 3u + 2u] = fast_gradients[i][2];
+        }
+    }
+    if (hessians_out != nullptr) {
+        std::array<Hessian, FastBasis::n_dofs> fast_hessians{};
+        FastBasis::evaluate_hessians(xi, fast_hessians);
+        for (std::size_t i = 0; i < fast_hessians.size(); ++i) {
+            store_hessian(fast_hessians[i], hessians_out + i * 9u);
+        }
+    }
+}
+
+template<typename FastBasis>
+void evaluate_fast_outputs_strided(const std::vector<math::Vector<Real, 3>>& points,
+                                   std::size_t output_stride,
+                                   Real* SVMP_RESTRICT values_out,
+                                   Real* SVMP_RESTRICT gradients_out,
+                                   Real* SVMP_RESTRICT hessians_out) {
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        if (values_out != nullptr) {
+            std::array<Real, FastBasis::n_dofs> fast_values{};
+            FastBasis::evaluate(xi, fast_values);
+            for (std::size_t i = 0; i < fast_values.size(); ++i) {
+                values_out[i * output_stride + q] = fast_values[i];
+            }
+        }
+        if (gradients_out != nullptr) {
+            std::array<Gradient, FastBasis::n_dofs> fast_gradients{};
+            FastBasis::evaluate_gradients(xi, fast_gradients);
+            for (std::size_t i = 0; i < fast_gradients.size(); ++i) {
+                Real* g = gradients_out + i * 3u * output_stride;
+                g[0u * output_stride + q] = fast_gradients[i][0];
+                g[1u * output_stride + q] = fast_gradients[i][1];
+                g[2u * output_stride + q] = fast_gradients[i][2];
+            }
+        }
+        if (hessians_out != nullptr) {
+            std::array<Hessian, FastBasis::n_dofs> fast_hessians{};
+            FastBasis::evaluate_hessians(xi, fast_hessians);
+            for (std::size_t i = 0; i < fast_hessians.size(); ++i) {
+                const Hessian& hessian = fast_hessians[i];
+                Real* H = hessians_out + i * 9u * output_stride;
+                H[0u * output_stride + q] = hessian(0, 0);
+                H[1u * output_stride + q] = hessian(0, 1);
+                H[2u * output_stride + q] = hessian(0, 2);
+                H[3u * output_stride + q] = hessian(1, 0);
+                H[4u * output_stride + q] = hessian(1, 1);
+                H[5u * output_stride + q] = hessian(1, 2);
+                H[6u * output_stride + q] = hessian(2, 0);
+                H[7u * output_stride + q] = hessian(2, 1);
+                H[8u * output_stride + q] = hessian(2, 2);
+            }
+        }
+    }
+}
+
+template<int Order>
+bool evaluate_fixed_lagrange_fast_order(LagrangeTopology topology,
+                                        const math::Vector<Real, 3>& xi,
+                                        std::vector<Real>* values,
+                                        std::vector<Gradient>* gradients,
+                                        std::vector<Hessian>* hessians) {
+    switch (topology) {
+        case LagrangeTopology::Line:
+            evaluate_fast_outputs<LagrangeLineFast<Order>>(xi, values, gradients, hessians);
+            return true;
+        case LagrangeTopology::Quadrilateral:
+            evaluate_fast_outputs<LagrangeQuadFast<Order>>(xi, values, gradients, hessians);
+            return true;
+        case LagrangeTopology::Hexahedron:
+            evaluate_fast_outputs<LagrangeHexFast<Order>>(xi, values, gradients, hessians);
+            return true;
+        case LagrangeTopology::Triangle:
+            evaluate_fast_outputs<LagrangeTriFast<Order>>(xi, values, gradients, hessians);
+            return true;
+        case LagrangeTopology::Tetrahedron:
+            evaluate_fast_outputs<LagrangeTetFast<Order>>(xi, values, gradients, hessians);
+            return true;
+        default:
+            return false;
+    }
+}
+
+template<int Order>
+bool evaluate_fixed_lagrange_fast_to_order(LagrangeTopology topology,
+                                           const math::Vector<Real, 3>& xi,
+                                           Real* SVMP_RESTRICT values_out,
+                                           Real* SVMP_RESTRICT gradients_out,
+                                           Real* SVMP_RESTRICT hessians_out) {
+    switch (topology) {
+        case LagrangeTopology::Line:
+            evaluate_fast_outputs_to<LagrangeLineFast<Order>>(xi, values_out, gradients_out, hessians_out);
+            return true;
+        case LagrangeTopology::Quadrilateral:
+            evaluate_fast_outputs_to<LagrangeQuadFast<Order>>(xi, values_out, gradients_out, hessians_out);
+            return true;
+        case LagrangeTopology::Hexahedron:
+            evaluate_fast_outputs_to<LagrangeHexFast<Order>>(xi, values_out, gradients_out, hessians_out);
+            return true;
+        case LagrangeTopology::Triangle:
+            evaluate_fast_outputs_to<LagrangeTriFast<Order>>(xi, values_out, gradients_out, hessians_out);
+            return true;
+        case LagrangeTopology::Tetrahedron:
+            evaluate_fast_outputs_to<LagrangeTetFast<Order>>(xi, values_out, gradients_out, hessians_out);
+            return true;
+        default:
+            return false;
+    }
+}
+
+template<int Order>
+bool evaluate_fixed_lagrange_fast_strided_order(
+    LagrangeTopology topology,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    switch (topology) {
+        case LagrangeTopology::Line:
+            evaluate_fast_outputs_strided<LagrangeLineFast<Order>>(
+                points, output_stride, values_out, gradients_out, hessians_out);
+            return true;
+        case LagrangeTopology::Quadrilateral:
+            evaluate_fast_outputs_strided<LagrangeQuadFast<Order>>(
+                points, output_stride, values_out, gradients_out, hessians_out);
+            return true;
+        case LagrangeTopology::Hexahedron:
+            evaluate_fast_outputs_strided<LagrangeHexFast<Order>>(
+                points, output_stride, values_out, gradients_out, hessians_out);
+            return true;
+        case LagrangeTopology::Triangle:
+            evaluate_fast_outputs_strided<LagrangeTriFast<Order>>(
+                points, output_stride, values_out, gradients_out, hessians_out);
+            return true;
+        case LagrangeTopology::Tetrahedron:
+            evaluate_fast_outputs_strided<LagrangeTetFast<Order>>(
+                points, output_stride, values_out, gradients_out, hessians_out);
+            return true;
+        default:
+            return false;
+    }
+}
+
+void evaluate_triangle_order3_values_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    Real* row0 = values_out + 0u * output_stride;
+    Real* row1 = values_out + 1u * output_stride;
+    Real* row2 = values_out + 2u * output_stride;
+    Real* row3 = values_out + 3u * output_stride;
+    Real* row4 = values_out + 4u * output_stride;
+    Real* row5 = values_out + 5u * output_stride;
+    Real* row6 = values_out + 6u * output_stride;
+    Real* row7 = values_out + 7u * output_stride;
+    Real* row8 = values_out + 8u * output_stride;
+    Real* row9 = values_out + 9u * output_stride;
+
+    if (points.size() == 4u && output_stride == 4u) {
+        Real p10[4];
+        Real p11[4];
+        Real p12[4];
+        Real p20[4];
+        Real p21[4];
+        Real p22[4];
+        Real p30[4];
+        Real p31[4];
+        Real p32[4];
+
+        for (std::size_t q = 0; q < 4u; ++q) {
+            const auto& xi = points[q];
+            const Real l1 = xi[0];
+            const Real l2 = xi[1];
+            const Real l0 = Real(1) - l1 - l2;
+
+            p10[q] = Real(3) * l0;
+            p11[q] = Real(3) * l1;
+            p12[q] = Real(3) * l2;
+            p20[q] = Real(0.5) * p10[q] * (p10[q] - Real(1));
+            p21[q] = Real(0.5) * p11[q] * (p11[q] - Real(1));
+            p22[q] = Real(0.5) * p12[q] * (p12[q] - Real(1));
+            p30[q] = (p10[q] * (p10[q] - Real(1)) * (p10[q] - Real(2))) / Real(6);
+            p31[q] = (p11[q] * (p11[q] - Real(1)) * (p11[q] - Real(2))) / Real(6);
+            p32[q] = (p12[q] * (p12[q] - Real(1)) * (p12[q] - Real(2))) / Real(6);
+        }
+
+        row0[0] = p30[0]; row0[1] = p30[1]; row0[2] = p30[2]; row0[3] = p30[3];
+        row1[0] = p31[0]; row1[1] = p31[1]; row1[2] = p31[2]; row1[3] = p31[3];
+        row2[0] = p32[0]; row2[1] = p32[1]; row2[2] = p32[2]; row2[3] = p32[3];
+        row3[0] = p20[0] * p11[0];
+        row3[1] = p20[1] * p11[1];
+        row3[2] = p20[2] * p11[2];
+        row3[3] = p20[3] * p11[3];
+        row4[0] = p10[0] * p21[0];
+        row4[1] = p10[1] * p21[1];
+        row4[2] = p10[2] * p21[2];
+        row4[3] = p10[3] * p21[3];
+        row5[0] = p21[0] * p12[0];
+        row5[1] = p21[1] * p12[1];
+        row5[2] = p21[2] * p12[2];
+        row5[3] = p21[3] * p12[3];
+        row6[0] = p11[0] * p22[0];
+        row6[1] = p11[1] * p22[1];
+        row6[2] = p11[2] * p22[2];
+        row6[3] = p11[3] * p22[3];
+        row7[0] = p10[0] * p22[0];
+        row7[1] = p10[1] * p22[1];
+        row7[2] = p10[2] * p22[2];
+        row7[3] = p10[3] * p22[3];
+        row8[0] = p20[0] * p12[0];
+        row8[1] = p20[1] * p12[1];
+        row8[2] = p20[2] * p12[2];
+        row8[3] = p20[3] * p12[3];
+        row9[0] = p10[0] * p11[0] * p12[0];
+        row9[1] = p10[1] * p11[1] * p12[1];
+        row9[2] = p10[2] * p11[2] * p12[2];
+        row9[3] = p10[3] * p11[3] * p12[3];
+        return;
+    }
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l0 = Real(1) - l1 - l2;
+
+        const Real p10 = Real(3) * l0;
+        const Real p11 = Real(3) * l1;
+        const Real p12 = Real(3) * l2;
+        const Real p20 = Real(0.5) * p10 * (p10 - Real(1));
+        const Real p21 = Real(0.5) * p11 * (p11 - Real(1));
+        const Real p22 = Real(0.5) * p12 * (p12 - Real(1));
+        const Real p30 = (p10 * (p10 - Real(1)) * (p10 - Real(2))) / Real(6);
+        const Real p31 = (p11 * (p11 - Real(1)) * (p11 - Real(2))) / Real(6);
+        const Real p32 = (p12 * (p12 - Real(1)) * (p12 - Real(2))) / Real(6);
+
+        row0[q] = p30;
+        row1[q] = p31;
+        row2[q] = p32;
+        row3[q] = p20 * p11;
+        row4[q] = p10 * p21;
+        row5[q] = p21 * p12;
+        row6[q] = p11 * p22;
+        row7[q] = p10 * p22;
+        row8[q] = p20 * p12;
+        row9[q] = p10 * p11 * p12;
+    }
+}
+
+void evaluate_triangle_order2_values_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    Real* row0 = values_out + 0u * output_stride;
+    Real* row1 = values_out + 1u * output_stride;
+    Real* row2 = values_out + 2u * output_stride;
+    Real* row3 = values_out + 3u * output_stride;
+    Real* row4 = values_out + 4u * output_stride;
+    Real* row5 = values_out + 5u * output_stride;
+
+    if (points.size() == 4u && output_stride == 4u) {
+        Real l0[4];
+        Real l1[4];
+        Real l2[4];
+        for (std::size_t q = 0; q < 4u; ++q) {
+            const auto& xi = points[q];
+            l1[q] = xi[0];
+            l2[q] = xi[1];
+            l0[q] = Real(1) - l1[q] - l2[q];
+        }
+
+        row0[0] = l0[0] * (Real(2) * l0[0] - Real(1));
+        row0[1] = l0[1] * (Real(2) * l0[1] - Real(1));
+        row0[2] = l0[2] * (Real(2) * l0[2] - Real(1));
+        row0[3] = l0[3] * (Real(2) * l0[3] - Real(1));
+        row1[0] = l1[0] * (Real(2) * l1[0] - Real(1));
+        row1[1] = l1[1] * (Real(2) * l1[1] - Real(1));
+        row1[2] = l1[2] * (Real(2) * l1[2] - Real(1));
+        row1[3] = l1[3] * (Real(2) * l1[3] - Real(1));
+        row2[0] = l2[0] * (Real(2) * l2[0] - Real(1));
+        row2[1] = l2[1] * (Real(2) * l2[1] - Real(1));
+        row2[2] = l2[2] * (Real(2) * l2[2] - Real(1));
+        row2[3] = l2[3] * (Real(2) * l2[3] - Real(1));
+        row3[0] = Real(4) * l0[0] * l1[0];
+        row3[1] = Real(4) * l0[1] * l1[1];
+        row3[2] = Real(4) * l0[2] * l1[2];
+        row3[3] = Real(4) * l0[3] * l1[3];
+        row4[0] = Real(4) * l1[0] * l2[0];
+        row4[1] = Real(4) * l1[1] * l2[1];
+        row4[2] = Real(4) * l1[2] * l2[2];
+        row4[3] = Real(4) * l1[3] * l2[3];
+        row5[0] = Real(4) * l0[0] * l2[0];
+        row5[1] = Real(4) * l0[1] * l2[1];
+        row5[2] = Real(4) * l0[2] * l2[2];
+        row5[3] = Real(4) * l0[3] * l2[3];
+        return;
+    }
+
+    auto write_q = [&](std::size_t q) {
+        const auto& xi = points[q];
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l0 = Real(1) - l1 - l2;
+        row0[q] = l0 * (Real(2) * l0 - Real(1));
+        row1[q] = l1 * (Real(2) * l1 - Real(1));
+        row2[q] = l2 * (Real(2) * l2 - Real(1));
+        row3[q] = Real(4) * l0 * l1;
+        row4[q] = Real(4) * l1 * l2;
+        row5[q] = Real(4) * l0 * l2;
+    };
+
+    if (points.size() == 4u) {
+        write_q(0u);
+        write_q(1u);
+        write_q(2u);
+        write_q(3u);
+        return;
+    }
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        write_q(q);
+    }
+}
+
+void evaluate_triangle_order1_values_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    Real* row0 = values_out + 0u * output_stride;
+    Real* row1 = values_out + 1u * output_stride;
+    Real* row2 = values_out + 2u * output_stride;
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        row0[q] = Real(1) - xi[0] - xi[1];
+        row1[q] = xi[0];
+        row2[q] = xi[1];
+    }
+}
+
+void evaluate_triangle_order2_gradients_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    Real* row0 = gradients_out + 0u * 3u * output_stride;
+    Real* row1 = gradients_out + 1u * 3u * output_stride;
+    Real* row2 = gradients_out + 2u * 3u * output_stride;
+    Real* row3 = gradients_out + 3u * 3u * output_stride;
+    Real* row4 = gradients_out + 4u * 3u * output_stride;
+    Real* row5 = gradients_out + 5u * 3u * output_stride;
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l0 = Real(1) - l1 - l2;
+        const Real g0 = Real(1) - Real(4) * l0;
+        row0[0u * output_stride + q] = g0;
+        row0[1u * output_stride + q] = g0;
+        row0[2u * output_stride + q] = Real(0);
+        row1[0u * output_stride + q] = Real(4) * l1 - Real(1);
+        row1[1u * output_stride + q] = Real(0);
+        row1[2u * output_stride + q] = Real(0);
+        row2[0u * output_stride + q] = Real(0);
+        row2[1u * output_stride + q] = Real(4) * l2 - Real(1);
+        row2[2u * output_stride + q] = Real(0);
+        row3[0u * output_stride + q] = Real(4) * (l0 - l1);
+        row3[1u * output_stride + q] = Real(-4) * l1;
+        row3[2u * output_stride + q] = Real(0);
+        row4[0u * output_stride + q] = Real(4) * l2;
+        row4[1u * output_stride + q] = Real(4) * l1;
+        row4[2u * output_stride + q] = Real(0);
+        row5[0u * output_stride + q] = Real(-4) * l2;
+        row5[1u * output_stride + q] = Real(4) * (l0 - l2);
+        row5[2u * output_stride + q] = Real(0);
+    }
+}
+
+inline void write_constant_hessian_q4(Real* SVMP_RESTRICT row,
+                                      std::size_t output_stride,
+                                      Real h00,
+                                      Real h01,
+                                      Real h02,
+                                      Real h10,
+                                      Real h11,
+                                      Real h12,
+                                      Real h20,
+                                      Real h21,
+                                      Real h22) {
+    Real* c0 = row + 0u * output_stride;
+    Real* c1 = row + 1u * output_stride;
+    Real* c2 = row + 2u * output_stride;
+    Real* c3 = row + 3u * output_stride;
+    Real* c4 = row + 4u * output_stride;
+    Real* c5 = row + 5u * output_stride;
+    Real* c6 = row + 6u * output_stride;
+    Real* c7 = row + 7u * output_stride;
+    Real* c8 = row + 8u * output_stride;
+
+    c0[0] = h00; c0[1] = h00; c0[2] = h00; c0[3] = h00;
+    c1[0] = h01; c1[1] = h01; c1[2] = h01; c1[3] = h01;
+    c2[0] = h02; c2[1] = h02; c2[2] = h02; c2[3] = h02;
+    c3[0] = h10; c3[1] = h10; c3[2] = h10; c3[3] = h10;
+    c4[0] = h11; c4[1] = h11; c4[2] = h11; c4[3] = h11;
+    c5[0] = h12; c5[1] = h12; c5[2] = h12; c5[3] = h12;
+    c6[0] = h20; c6[1] = h20; c6[2] = h20; c6[3] = h20;
+    c7[0] = h21; c7[1] = h21; c7[2] = h21; c7[3] = h21;
+    c8[0] = h22; c8[1] = h22; c8[2] = h22; c8[3] = h22;
+}
+
+SVMP_LAGRANGE_NOINLINE void evaluate_triangle_order2_hessians_q4(
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT hessians_out) {
+    write_constant_hessian_q4(hessians_out + 0u * 9u * output_stride,
+                              output_stride,
+                              Real(4), Real(4), Real(0),
+                              Real(4), Real(4), Real(0),
+                              Real(0), Real(0), Real(0));
+    write_constant_hessian_q4(hessians_out + 1u * 9u * output_stride,
+                              output_stride,
+                              Real(4), Real(0), Real(0),
+                              Real(0), Real(0), Real(0),
+                              Real(0), Real(0), Real(0));
+    write_constant_hessian_q4(hessians_out + 2u * 9u * output_stride,
+                              output_stride,
+                              Real(0), Real(0), Real(0),
+                              Real(0), Real(4), Real(0),
+                              Real(0), Real(0), Real(0));
+    write_constant_hessian_q4(hessians_out + 3u * 9u * output_stride,
+                              output_stride,
+                              Real(-8), Real(-4), Real(0),
+                              Real(-4), Real(0), Real(0),
+                              Real(0), Real(0), Real(0));
+    write_constant_hessian_q4(hessians_out + 4u * 9u * output_stride,
+                              output_stride,
+                              Real(0), Real(4), Real(0),
+                              Real(4), Real(0), Real(0),
+                              Real(0), Real(0), Real(0));
+    write_constant_hessian_q4(hessians_out + 5u * 9u * output_stride,
+                              output_stride,
+                              Real(0), Real(-4), Real(0),
+                              Real(-4), Real(-8), Real(0),
+                              Real(0), Real(0), Real(0));
+}
+
+SVMP_LAGRANGE_NOINLINE void evaluate_tet_order2_hessians_q4(
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT hessians_out) {
+    write_constant_hessian_q4(hessians_out + 0u * 9u * output_stride,
+                              output_stride,
+                              Real(4), Real(4), Real(4),
+                              Real(4), Real(4), Real(4),
+                              Real(4), Real(4), Real(4));
+    write_constant_hessian_q4(hessians_out + 1u * 9u * output_stride,
+                              output_stride,
+                              Real(4), Real(0), Real(0),
+                              Real(0), Real(0), Real(0),
+                              Real(0), Real(0), Real(0));
+    write_constant_hessian_q4(hessians_out + 2u * 9u * output_stride,
+                              output_stride,
+                              Real(0), Real(0), Real(0),
+                              Real(0), Real(4), Real(0),
+                              Real(0), Real(0), Real(0));
+    write_constant_hessian_q4(hessians_out + 3u * 9u * output_stride,
+                              output_stride,
+                              Real(0), Real(0), Real(0),
+                              Real(0), Real(0), Real(0),
+                              Real(0), Real(0), Real(4));
+    write_constant_hessian_q4(hessians_out + 4u * 9u * output_stride,
+                              output_stride,
+                              Real(-8), Real(-4), Real(-4),
+                              Real(-4), Real(0), Real(0),
+                              Real(-4), Real(0), Real(0));
+    write_constant_hessian_q4(hessians_out + 5u * 9u * output_stride,
+                              output_stride,
+                              Real(0), Real(4), Real(0),
+                              Real(4), Real(0), Real(0),
+                              Real(0), Real(0), Real(0));
+    write_constant_hessian_q4(hessians_out + 6u * 9u * output_stride,
+                              output_stride,
+                              Real(0), Real(-4), Real(0),
+                              Real(-4), Real(-8), Real(-4),
+                              Real(0), Real(-4), Real(0));
+    write_constant_hessian_q4(hessians_out + 7u * 9u * output_stride,
+                              output_stride,
+                              Real(0), Real(0), Real(-4),
+                              Real(0), Real(0), Real(-4),
+                              Real(-4), Real(-4), Real(-8));
+    write_constant_hessian_q4(hessians_out + 8u * 9u * output_stride,
+                              output_stride,
+                              Real(0), Real(0), Real(4),
+                              Real(0), Real(0), Real(0),
+                              Real(4), Real(0), Real(0));
+    write_constant_hessian_q4(hessians_out + 9u * 9u * output_stride,
+                              output_stride,
+                              Real(0), Real(0), Real(0),
+                              Real(0), Real(0), Real(4),
+                              Real(0), Real(4), Real(0));
+}
+
+void evaluate_tet_order1_values_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    Real* row0 = values_out + 0u * output_stride;
+    Real* row1 = values_out + 1u * output_stride;
+    Real* row2 = values_out + 2u * output_stride;
+    Real* row3 = values_out + 3u * output_stride;
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        row0[q] = Real(1) - xi[0] - xi[1] - xi[2];
+        row1[q] = xi[0];
+        row2[q] = xi[1];
+        row3[q] = xi[2];
+    }
+}
+
+void evaluate_tet_order1_gradients_strided(
+    std::size_t num_qpts,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    Real* row0 = gradients_out + 0u * 3u * output_stride;
+    Real* row1 = gradients_out + 1u * 3u * output_stride;
+    Real* row2 = gradients_out + 2u * 3u * output_stride;
+    Real* row3 = gradients_out + 3u * 3u * output_stride;
+
+    for (std::size_t q = 0; q < num_qpts; ++q) {
+        row0[0u * output_stride + q] = Real(-1);
+        row0[1u * output_stride + q] = Real(-1);
+        row0[2u * output_stride + q] = Real(-1);
+        row1[0u * output_stride + q] = Real(1);
+        row1[1u * output_stride + q] = Real(0);
+        row1[2u * output_stride + q] = Real(0);
+        row2[0u * output_stride + q] = Real(0);
+        row2[1u * output_stride + q] = Real(1);
+        row2[2u * output_stride + q] = Real(0);
+        row3[0u * output_stride + q] = Real(0);
+        row3[1u * output_stride + q] = Real(0);
+        row3[2u * output_stride + q] = Real(1);
+    }
+}
+
+void evaluate_zero_hessians_strided(
+    std::size_t num_nodes,
+    std::size_t num_qpts,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT hessians_out) {
+    if (num_qpts == 4u) {
+        for (std::size_t node = 0; node < num_nodes; ++node) {
+            write_constant_hessian_q4(hessians_out + node * 9u * output_stride,
+                                      output_stride,
+                                      Real(0), Real(0), Real(0),
+                                      Real(0), Real(0), Real(0),
+                                      Real(0), Real(0), Real(0));
+        }
+        return;
+    }
+
+    for (std::size_t node = 0; node < num_nodes; ++node) {
+        Real* row = hessians_out + node * 9u * output_stride;
+        for (std::size_t q = 0; q < num_qpts; ++q) {
+            row[0u * output_stride + q] = Real(0);
+            row[1u * output_stride + q] = Real(0);
+            row[2u * output_stride + q] = Real(0);
+            row[3u * output_stride + q] = Real(0);
+            row[4u * output_stride + q] = Real(0);
+            row[5u * output_stride + q] = Real(0);
+            row[6u * output_stride + q] = Real(0);
+            row[7u * output_stride + q] = Real(0);
+            row[8u * output_stride + q] = Real(0);
+        }
+    }
+}
+
+void evaluate_tet_order2_values_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    Real* row0 = values_out + 0u * output_stride;
+    Real* row1 = values_out + 1u * output_stride;
+    Real* row2 = values_out + 2u * output_stride;
+    Real* row3 = values_out + 3u * output_stride;
+    Real* row4 = values_out + 4u * output_stride;
+    Real* row5 = values_out + 5u * output_stride;
+    Real* row6 = values_out + 6u * output_stride;
+    Real* row7 = values_out + 7u * output_stride;
+    Real* row8 = values_out + 8u * output_stride;
+    Real* row9 = values_out + 9u * output_stride;
+
+    if (points.size() == 4u && output_stride == 4u) {
+        Real l0[4];
+        Real l1[4];
+        Real l2[4];
+        Real l3[4];
+        for (std::size_t q = 0; q < 4u; ++q) {
+            const auto& xi = points[q];
+            l1[q] = xi[0];
+            l2[q] = xi[1];
+            l3[q] = xi[2];
+            l0[q] = Real(1) - l1[q] - l2[q] - l3[q];
+        }
+
+        row0[0] = l0[0] * (Real(2) * l0[0] - Real(1));
+        row0[1] = l0[1] * (Real(2) * l0[1] - Real(1));
+        row0[2] = l0[2] * (Real(2) * l0[2] - Real(1));
+        row0[3] = l0[3] * (Real(2) * l0[3] - Real(1));
+        row1[0] = l1[0] * (Real(2) * l1[0] - Real(1));
+        row1[1] = l1[1] * (Real(2) * l1[1] - Real(1));
+        row1[2] = l1[2] * (Real(2) * l1[2] - Real(1));
+        row1[3] = l1[3] * (Real(2) * l1[3] - Real(1));
+        row2[0] = l2[0] * (Real(2) * l2[0] - Real(1));
+        row2[1] = l2[1] * (Real(2) * l2[1] - Real(1));
+        row2[2] = l2[2] * (Real(2) * l2[2] - Real(1));
+        row2[3] = l2[3] * (Real(2) * l2[3] - Real(1));
+        row3[0] = l3[0] * (Real(2) * l3[0] - Real(1));
+        row3[1] = l3[1] * (Real(2) * l3[1] - Real(1));
+        row3[2] = l3[2] * (Real(2) * l3[2] - Real(1));
+        row3[3] = l3[3] * (Real(2) * l3[3] - Real(1));
+        row4[0] = Real(4) * l0[0] * l1[0];
+        row4[1] = Real(4) * l0[1] * l1[1];
+        row4[2] = Real(4) * l0[2] * l1[2];
+        row4[3] = Real(4) * l0[3] * l1[3];
+        row5[0] = Real(4) * l1[0] * l2[0];
+        row5[1] = Real(4) * l1[1] * l2[1];
+        row5[2] = Real(4) * l1[2] * l2[2];
+        row5[3] = Real(4) * l1[3] * l2[3];
+        row6[0] = Real(4) * l0[0] * l2[0];
+        row6[1] = Real(4) * l0[1] * l2[1];
+        row6[2] = Real(4) * l0[2] * l2[2];
+        row6[3] = Real(4) * l0[3] * l2[3];
+        row7[0] = Real(4) * l0[0] * l3[0];
+        row7[1] = Real(4) * l0[1] * l3[1];
+        row7[2] = Real(4) * l0[2] * l3[2];
+        row7[3] = Real(4) * l0[3] * l3[3];
+        row8[0] = Real(4) * l1[0] * l3[0];
+        row8[1] = Real(4) * l1[1] * l3[1];
+        row8[2] = Real(4) * l1[2] * l3[2];
+        row8[3] = Real(4) * l1[3] * l3[3];
+        row9[0] = Real(4) * l2[0] * l3[0];
+        row9[1] = Real(4) * l2[1] * l3[1];
+        row9[2] = Real(4) * l2[2] * l3[2];
+        row9[3] = Real(4) * l2[3] * l3[3];
+        return;
+    }
+
+    auto write_q = [&](std::size_t q) {
+        const auto& xi = points[q];
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l3 = xi[2];
+        const Real l0 = Real(1) - l1 - l2 - l3;
+        row0[q] = l0 * (Real(2) * l0 - Real(1));
+        row1[q] = l1 * (Real(2) * l1 - Real(1));
+        row2[q] = l2 * (Real(2) * l2 - Real(1));
+        row3[q] = l3 * (Real(2) * l3 - Real(1));
+        row4[q] = Real(4) * l0 * l1;
+        row5[q] = Real(4) * l1 * l2;
+        row6[q] = Real(4) * l0 * l2;
+        row7[q] = Real(4) * l0 * l3;
+        row8[q] = Real(4) * l1 * l3;
+        row9[q] = Real(4) * l2 * l3;
+    };
+
+    if (points.size() == 4u) {
+        write_q(0u);
+        write_q(1u);
+        write_q(2u);
+        write_q(3u);
+        return;
+    }
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        write_q(q);
+    }
+}
+
+inline void write_tet_order2_gradient_q(Real* SVMP_RESTRICT row,
+                                        std::size_t output_stride,
+                                        std::size_t q,
+                                        Real gx,
+                                        Real gy,
+                                        Real gz) {
+    row[0u * output_stride + q] = gx;
+    row[1u * output_stride + q] = gy;
+    row[2u * output_stride + q] = gz;
+}
+
+SVMP_LAGRANGE_NOINLINE void evaluate_tet_order2_gradients_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    Real* row0 = gradients_out + 0u * 3u * output_stride;
+    Real* row1 = gradients_out + 1u * 3u * output_stride;
+    Real* row2 = gradients_out + 2u * 3u * output_stride;
+    Real* row3 = gradients_out + 3u * 3u * output_stride;
+    Real* row4 = gradients_out + 4u * 3u * output_stride;
+    Real* row5 = gradients_out + 5u * 3u * output_stride;
+    Real* row6 = gradients_out + 6u * 3u * output_stride;
+    Real* row7 = gradients_out + 7u * 3u * output_stride;
+    Real* row8 = gradients_out + 8u * 3u * output_stride;
+    Real* row9 = gradients_out + 9u * 3u * output_stride;
+
+    auto write_q = [&](std::size_t q) {
+        const auto& xi = points[q];
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l3 = xi[2];
+        const Real l0 = Real(1) - l1 - l2 - l3;
+        const Real four = Real(4);
+        const Real g0 = Real(1) - four * l0;
+
+        write_tet_order2_gradient_q(row0, output_stride, q, g0, g0, g0);
+        write_tet_order2_gradient_q(row1, output_stride, q, four * l1 - Real(1), Real(0), Real(0));
+        write_tet_order2_gradient_q(row2, output_stride, q, Real(0), four * l2 - Real(1), Real(0));
+        write_tet_order2_gradient_q(row3, output_stride, q, Real(0), Real(0), four * l3 - Real(1));
+        write_tet_order2_gradient_q(row4, output_stride, q, four * (l0 - l1), -four * l1, -four * l1);
+        write_tet_order2_gradient_q(row5, output_stride, q, four * l2, four * l1, Real(0));
+        write_tet_order2_gradient_q(row6, output_stride, q, -four * l2, four * (l0 - l2), -four * l2);
+        write_tet_order2_gradient_q(row7, output_stride, q, -four * l3, -four * l3, four * (l0 - l3));
+        write_tet_order2_gradient_q(row8, output_stride, q, four * l3, Real(0), four * l1);
+        write_tet_order2_gradient_q(row9, output_stride, q, Real(0), four * l3, four * l2);
+    };
+
+    if (points.size() == 4u) {
+        write_q(0u);
+        write_q(1u);
+        write_q(2u);
+        write_q(3u);
+        return;
+    }
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        write_q(q);
+    }
+}
+
+inline void fill_simplex_order3_factor_values(Real lambda, Real* SVMP_RESTRICT phi) {
+    const Real t = Real(3) * lambda;
+    phi[0] = Real(1);
+    phi[1] = t;
+    phi[2] = phi[1] * (t - Real(1)) * Real(0.5);
+    phi[3] = phi[2] * (t - Real(2)) / Real(3);
+}
+
+void evaluate_tet_order3_values_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    Real* row0 = values_out + 0u * output_stride;
+    Real* row1 = values_out + 1u * output_stride;
+    Real* row2 = values_out + 2u * output_stride;
+    Real* row3 = values_out + 3u * output_stride;
+    Real* row4 = values_out + 4u * output_stride;
+    Real* row5 = values_out + 5u * output_stride;
+    Real* row6 = values_out + 6u * output_stride;
+    Real* row7 = values_out + 7u * output_stride;
+    Real* row8 = values_out + 8u * output_stride;
+    Real* row9 = values_out + 9u * output_stride;
+    Real* row10 = values_out + 10u * output_stride;
+    Real* row11 = values_out + 11u * output_stride;
+    Real* row12 = values_out + 12u * output_stride;
+    Real* row13 = values_out + 13u * output_stride;
+    Real* row14 = values_out + 14u * output_stride;
+    Real* row15 = values_out + 15u * output_stride;
+    Real* row16 = values_out + 16u * output_stride;
+    Real* row17 = values_out + 17u * output_stride;
+    Real* row18 = values_out + 18u * output_stride;
+    Real* row19 = values_out + 19u * output_stride;
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l3 = xi[2];
+        const Real l0 = Real(1) - l1 - l2 - l3;
+        Real p0[4];
+        Real p1[4];
+        Real p2[4];
+        Real p3[4];
+        fill_simplex_order3_factor_values(l0, p0);
+        fill_simplex_order3_factor_values(l1, p1);
+        fill_simplex_order3_factor_values(l2, p2);
+        fill_simplex_order3_factor_values(l3, p3);
+
+        row0[q] = p0[3];
+        row1[q] = p1[3];
+        row2[q] = p2[3];
+        row3[q] = p3[3];
+        row4[q] = p0[2] * p1[1];
+        row5[q] = p0[1] * p1[2];
+        row6[q] = p1[2] * p2[1];
+        row7[q] = p1[1] * p2[2];
+        row8[q] = p0[1] * p2[2];
+        row9[q] = p0[2] * p2[1];
+        row10[q] = p0[2] * p3[1];
+        row11[q] = p0[1] * p3[2];
+        row12[q] = p1[2] * p3[1];
+        row13[q] = p1[1] * p3[2];
+        row14[q] = p2[2] * p3[1];
+        row15[q] = p2[1] * p3[2];
+        row16[q] = p0[1] * p1[1] * p2[1];
+        row17[q] = p0[1] * p1[1] * p3[1];
+        row18[q] = p1[1] * p2[1] * p3[1];
+        row19[q] = p0[1] * p2[1] * p3[1];
+    }
+}
+
+void evaluate_triangle_order3_gradients_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    Real* rows[10] = {
+        gradients_out + 0u * 3u * output_stride,
+        gradients_out + 1u * 3u * output_stride,
+        gradients_out + 2u * 3u * output_stride,
+        gradients_out + 3u * 3u * output_stride,
+        gradients_out + 4u * 3u * output_stride,
+        gradients_out + 5u * 3u * output_stride,
+        gradients_out + 6u * 3u * output_stride,
+        gradients_out + 7u * 3u * output_stride,
+        gradients_out + 8u * 3u * output_stride,
+        gradients_out + 9u * 3u * output_stride,
+    };
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l0 = Real(1) - l1 - l2;
+
+        const Real p10 = Real(3) * l0;
+        const Real p11 = Real(3) * l1;
+        const Real p12 = Real(3) * l2;
+        const Real p20 = Real(0.5) * p10 * (p10 - Real(1));
+        const Real p21 = Real(0.5) * p11 * (p11 - Real(1));
+        const Real p22 = Real(0.5) * p12 * (p12 - Real(1));
+        const Real d10 = Real(3);
+        const Real d11 = Real(3);
+        const Real d12 = Real(3);
+        const Real d20 = Real(3) * p10 - Real(1.5);
+        const Real d21 = Real(3) * p11 - Real(1.5);
+        const Real d22 = Real(3) * p12 - Real(1.5);
+        const Real d30 = Real(1.5) * p10 * p10 - Real(3) * p10 + Real(1);
+        const Real d31 = Real(1.5) * p11 * p11 - Real(3) * p11 + Real(1);
+        const Real d32 = Real(1.5) * p12 * p12 - Real(3) * p12 + Real(1);
+
+        const Real dl0[10] = {
+            d30,
+            Real(0),
+            Real(0),
+            d20 * p11,
+            d10 * p21,
+            Real(0),
+            Real(0),
+            d10 * p22,
+            d20 * p12,
+            d10 * p11 * p12,
+        };
+        const Real dl1[10] = {
+            Real(0),
+            d31,
+            Real(0),
+            p20 * d11,
+            p10 * d21,
+            d21 * p12,
+            d11 * p22,
+            Real(0),
+            Real(0),
+            p10 * d11 * p12,
+        };
+        const Real dl2[10] = {
+            Real(0),
+            Real(0),
+            d32,
+            Real(0),
+            Real(0),
+            p21 * d12,
+            p11 * d22,
+            p10 * d22,
+            p20 * d12,
+            p10 * p11 * d12,
+        };
+
+        for (std::size_t node = 0; node < 10u; ++node) {
+            Real* g = rows[node];
+            g[0u * output_stride + q] = dl1[node] - dl0[node];
+            g[1u * output_stride + q] = dl2[node] - dl0[node];
+            g[2u * output_stride + q] = Real(0);
+        }
+    }
+}
+
+void evaluate_hex_order1_values_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    Real* row0 = values_out + 0u * output_stride;
+    Real* row1 = values_out + 1u * output_stride;
+    Real* row2 = values_out + 2u * output_stride;
+    Real* row3 = values_out + 3u * output_stride;
+    Real* row4 = values_out + 4u * output_stride;
+    Real* row5 = values_out + 5u * output_stride;
+    Real* row6 = values_out + 6u * output_stride;
+    Real* row7 = values_out + 7u * output_stride;
+
+    const auto write_q = [&](std::size_t q) {
+        const auto& xi = points[q];
+        const Real lx = (Real(1) - xi[0]) * Real(0.5);
+        const Real ly = (Real(1) - xi[1]) * Real(0.5);
+        const Real lz = (Real(1) - xi[2]) * Real(0.5);
+        const Real ux = (Real(1) + xi[0]) * Real(0.5);
+        const Real uy = (Real(1) + xi[1]) * Real(0.5);
+        const Real uz = (Real(1) + xi[2]) * Real(0.5);
+        const Real lxly = lx * ly;
+        const Real uxly = ux * ly;
+        const Real uxuy = ux * uy;
+        const Real lxuy = lx * uy;
+        row0[q] = lxly * lz;
+        row1[q] = uxly * lz;
+        row2[q] = uxuy * lz;
+        row3[q] = lxuy * lz;
+        row4[q] = lxly * uz;
+        row5[q] = uxly * uz;
+        row6[q] = uxuy * uz;
+        row7[q] = lxuy * uz;
+    };
+    if (points.size() == 4u) {
+        write_q(0u);
+        write_q(1u);
+        write_q(2u);
+        write_q(3u);
+        return;
+    }
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        write_q(q);
+    }
+}
+
+template <bool NeedValues, bool NeedGradients, bool NeedHessians>
+void evaluate_hex_order1_outputs_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    constexpr Real half = Real(0.5);
+    constexpr std::array<Real, 8> dx{{-half, half, half, -half, -half, half, half, -half}};
+    constexpr std::array<Real, 8> dy{{-half, -half, half, half, -half, -half, half, half}};
+    constexpr std::array<Real, 8> dz{{-half, -half, -half, -half, half, half, half, half}};
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        const Real lx = (Real(1) - xi[0]) * half;
+        const Real ly = (Real(1) - xi[1]) * half;
+        const Real lz = (Real(1) - xi[2]) * half;
+        const Real ux = (Real(1) + xi[0]) * half;
+        const Real uy = (Real(1) + xi[1]) * half;
+        const Real uz = (Real(1) + xi[2]) * half;
+        const Real xval[8] = {lx, ux, ux, lx, lx, ux, ux, lx};
+        const Real yval[8] = {ly, ly, uy, uy, ly, ly, uy, uy};
+        const Real zval[8] = {lz, lz, lz, lz, uz, uz, uz, uz};
+
+        for (std::size_t node = 0; node < 8u; ++node) {
+            if constexpr (NeedValues) {
+                values_out[node * output_stride + q] =
+                    xval[node] * yval[node] * zval[node];
+            }
+            if constexpr (NeedGradients) {
+                Real* SVMP_RESTRICT g = gradients_out + node * 3u * output_stride;
+                g[0u * output_stride + q] = dx[node] * yval[node] * zval[node];
+                g[1u * output_stride + q] = xval[node] * dy[node] * zval[node];
+                g[2u * output_stride + q] = xval[node] * yval[node] * dz[node];
+            }
+            if constexpr (NeedHessians) {
+                Real* SVMP_RESTRICT H = hessians_out + node * 9u * output_stride;
+                const Real hxy = dx[node] * dy[node] * zval[node];
+                const Real hxz = dx[node] * yval[node] * dz[node];
+                const Real hyz = xval[node] * dy[node] * dz[node];
+                H[0u * output_stride + q] = Real(0);
+                H[1u * output_stride + q] = hxy;
+                H[2u * output_stride + q] = hxz;
+                H[3u * output_stride + q] = hxy;
+                H[4u * output_stride + q] = Real(0);
+                H[5u * output_stride + q] = hyz;
+                H[6u * output_stride + q] = hxz;
+                H[7u * output_stride + q] = hyz;
+                H[8u * output_stride + q] = Real(0);
+            }
+        }
+    }
+}
+
+void evaluate_quad_order1_values_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    Real* row0 = values_out + 0u * output_stride;
+    Real* row1 = values_out + 1u * output_stride;
+    Real* row2 = values_out + 2u * output_stride;
+    Real* row3 = values_out + 3u * output_stride;
+
+    if (points.size() == 4u && output_stride == 4u) {
+        Real lx[4];
+        Real ux[4];
+        Real ly[4];
+        Real uy[4];
+        for (std::size_t q = 0; q < 4u; ++q) {
+            const auto& xi = points[q];
+            lx[q] = (Real(1) - xi[0]) * Real(0.5);
+            ux[q] = (Real(1) + xi[0]) * Real(0.5);
+            ly[q] = (Real(1) - xi[1]) * Real(0.5);
+            uy[q] = (Real(1) + xi[1]) * Real(0.5);
+        }
+        row0[0] = lx[0] * ly[0];
+        row0[1] = lx[1] * ly[1];
+        row0[2] = lx[2] * ly[2];
+        row0[3] = lx[3] * ly[3];
+        row1[0] = ux[0] * ly[0];
+        row1[1] = ux[1] * ly[1];
+        row1[2] = ux[2] * ly[2];
+        row1[3] = ux[3] * ly[3];
+        row2[0] = ux[0] * uy[0];
+        row2[1] = ux[1] * uy[1];
+        row2[2] = ux[2] * uy[2];
+        row2[3] = ux[3] * uy[3];
+        row3[0] = lx[0] * uy[0];
+        row3[1] = lx[1] * uy[1];
+        row3[2] = lx[2] * uy[2];
+        row3[3] = lx[3] * uy[3];
+        return;
+    }
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        const Real lx = (Real(1) - xi[0]) * Real(0.5);
+        const Real ly = (Real(1) - xi[1]) * Real(0.5);
+        const Real ux = (Real(1) + xi[0]) * Real(0.5);
+        const Real uy = (Real(1) + xi[1]) * Real(0.5);
+        row0[q] = lx * ly;
+        row1[q] = ux * ly;
+        row2[q] = ux * uy;
+        row3[q] = lx * uy;
+    }
+}
+
+void evaluate_quad_order1_gradients_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    Real* row0 = gradients_out + 0u * 3u * output_stride;
+    Real* row1 = gradients_out + 1u * 3u * output_stride;
+    Real* row2 = gradients_out + 2u * 3u * output_stride;
+    Real* row3 = gradients_out + 3u * 3u * output_stride;
+
+    if (points.size() == 4u) {
+        Real lx[4];
+        Real ly[4];
+        Real ux[4];
+        Real uy[4];
+        for (std::size_t q = 0; q < 4u; ++q) {
+            const auto& xi = points[q];
+            lx[q] = (Real(1) - xi[0]) * Real(0.5);
+            ly[q] = (Real(1) - xi[1]) * Real(0.5);
+            ux[q] = (Real(1) + xi[0]) * Real(0.5);
+            uy[q] = (Real(1) + xi[1]) * Real(0.5);
+        }
+
+        auto write_component = [](Real* SVMP_RESTRICT row,
+                                  Real a0,
+                                  Real a1,
+                                  Real a2,
+                                  Real a3) {
+            row[0] = a0;
+            row[1] = a1;
+            row[2] = a2;
+            row[3] = a3;
+        };
+
+        write_component(row0, Real(-0.5) * ly[0], Real(-0.5) * ly[1],
+                        Real(-0.5) * ly[2], Real(-0.5) * ly[3]);
+        write_component(row0 + output_stride, Real(-0.5) * lx[0], Real(-0.5) * lx[1],
+                        Real(-0.5) * lx[2], Real(-0.5) * lx[3]);
+        write_component(row0 + 2u * output_stride, Real(0), Real(0), Real(0), Real(0));
+
+        write_component(row1, Real(0.5) * ly[0], Real(0.5) * ly[1],
+                        Real(0.5) * ly[2], Real(0.5) * ly[3]);
+        write_component(row1 + output_stride, Real(-0.5) * ux[0], Real(-0.5) * ux[1],
+                        Real(-0.5) * ux[2], Real(-0.5) * ux[3]);
+        write_component(row1 + 2u * output_stride, Real(0), Real(0), Real(0), Real(0));
+
+        write_component(row2, Real(0.5) * uy[0], Real(0.5) * uy[1],
+                        Real(0.5) * uy[2], Real(0.5) * uy[3]);
+        write_component(row2 + output_stride, Real(0.5) * ux[0], Real(0.5) * ux[1],
+                        Real(0.5) * ux[2], Real(0.5) * ux[3]);
+        write_component(row2 + 2u * output_stride, Real(0), Real(0), Real(0), Real(0));
+
+        write_component(row3, Real(-0.5) * uy[0], Real(-0.5) * uy[1],
+                        Real(-0.5) * uy[2], Real(-0.5) * uy[3]);
+        write_component(row3 + output_stride, Real(0.5) * lx[0], Real(0.5) * lx[1],
+                        Real(0.5) * lx[2], Real(0.5) * lx[3]);
+        write_component(row3 + 2u * output_stride, Real(0), Real(0), Real(0), Real(0));
+        return;
+    }
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        const Real lx = (Real(1) - xi[0]) * Real(0.5);
+        const Real ly = (Real(1) - xi[1]) * Real(0.5);
+        const Real ux = (Real(1) + xi[0]) * Real(0.5);
+        const Real uy = (Real(1) + xi[1]) * Real(0.5);
+        row0[0u * output_stride + q] = Real(-0.5) * ly;
+        row0[1u * output_stride + q] = Real(-0.5) * lx;
+        row0[2u * output_stride + q] = Real(0);
+        row1[0u * output_stride + q] = Real( 0.5) * ly;
+        row1[1u * output_stride + q] = Real(-0.5) * ux;
+        row1[2u * output_stride + q] = Real(0);
+        row2[0u * output_stride + q] = Real( 0.5) * uy;
+        row2[1u * output_stride + q] = Real( 0.5) * ux;
+        row2[2u * output_stride + q] = Real(0);
+        row3[0u * output_stride + q] = Real(-0.5) * uy;
+        row3[1u * output_stride + q] = Real( 0.5) * lx;
+        row3[2u * output_stride + q] = Real(0);
+    }
+}
+
+inline void write_quad_order1_hessian_q(
+    Real* SVMP_RESTRICT row,
+    std::size_t output_stride,
+    std::size_t q,
+    Real xy) {
+    row[0u * output_stride + q] = Real(0);
+    row[1u * output_stride + q] = xy;
+    row[2u * output_stride + q] = Real(0);
+    row[3u * output_stride + q] = xy;
+    row[4u * output_stride + q] = Real(0);
+    row[5u * output_stride + q] = Real(0);
+    row[6u * output_stride + q] = Real(0);
+    row[7u * output_stride + q] = Real(0);
+    row[8u * output_stride + q] = Real(0);
+}
+
+void evaluate_quad_order1_hessians_strided(
+    std::size_t num_qpts,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT hessians_out) {
+    Real* row0 = hessians_out + 0u * 9u * output_stride;
+    Real* row1 = hessians_out + 1u * 9u * output_stride;
+    Real* row2 = hessians_out + 2u * 9u * output_stride;
+    Real* row3 = hessians_out + 3u * 9u * output_stride;
+
+    constexpr Real positive = Real(0.25);
+    constexpr Real negative = Real(-0.25);
+    for (std::size_t q = 0; q < num_qpts; ++q) {
+        write_quad_order1_hessian_q(row0, output_stride, q, positive);
+        write_quad_order1_hessian_q(row1, output_stride, q, negative);
+        write_quad_order1_hessian_q(row2, output_stride, q, positive);
+        write_quad_order1_hessian_q(row3, output_stride, q, negative);
+    }
+}
+
+template <std::size_t Q>
+inline void write_quad_order1_all_q4(
+    std::size_t output_stride,
+    std::size_t i,
+    std::size_t j,
+    const Real lx[4][2],
+    const Real ly[4][2],
+    Real* SVMP_RESTRICT value_row,
+    Real* SVMP_RESTRICT grad_row,
+    Real* SVMP_RESTRICT hess_row) {
+    const Real xv = lx[Q][i];
+    const Real yv = ly[Q][j];
+    const Real xd = (i == 0u) ? Real(-0.5) : Real(0.5);
+    const Real yd = (j == 0u) ? Real(-0.5) : Real(0.5);
+    const Real hxy = xd * yd;
+
+    value_row[Q] = xv * yv;
+    grad_row[0u * output_stride + Q] = xd * yv;
+    grad_row[1u * output_stride + Q] = xv * yd;
+    grad_row[2u * output_stride + Q] = Real(0);
+    hess_row[0u * output_stride + Q] = Real(0);
+    hess_row[4u * output_stride + Q] = Real(0);
+    hess_row[8u * output_stride + Q] = Real(0);
+    hess_row[1u * output_stride + Q] = hxy;
+    hess_row[3u * output_stride + Q] = hxy;
+    hess_row[2u * output_stride + Q] = Real(0);
+    hess_row[6u * output_stride + Q] = Real(0);
+    hess_row[5u * output_stride + Q] = Real(0);
+    hess_row[7u * output_stride + Q] = Real(0);
+}
+
+void evaluate_quad_order1_all_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    constexpr auto node_axes = detail::make_quad_tensor_node_axes<1>();
+
+    Real lx[4][2];
+    Real ly[4][2];
+    for (std::size_t q = 0; q < 4u; ++q) {
+        const auto& xi = points[q];
+        lx[q][0] = (Real(1) - xi[0]) * Real(0.5);
+        lx[q][1] = (Real(1) + xi[0]) * Real(0.5);
+        ly[q][0] = (Real(1) - xi[1]) * Real(0.5);
+        ly[q][1] = (Real(1) + xi[1]) * Real(0.5);
+    }
+
+    for (std::size_t node = 0; node < node_axes.size(); ++node) {
+        const auto& axes = node_axes[node];
+        const std::size_t i = axes[0];
+        const std::size_t j = axes[1];
+        Real* value_row = values_out + node * output_stride;
+        Real* grad_row = gradients_out + node * 3u * output_stride;
+        Real* hess_row = hessians_out + node * 9u * output_stride;
+        write_quad_order1_all_q4<0u>(
+            output_stride, i, j, lx, ly, value_row, grad_row, hess_row);
+        write_quad_order1_all_q4<1u>(
+            output_stride, i, j, lx, ly, value_row, grad_row, hess_row);
+        write_quad_order1_all_q4<2u>(
+            output_stride, i, j, lx, ly, value_row, grad_row, hess_row);
+        write_quad_order1_all_q4<3u>(
+            output_stride, i, j, lx, ly, value_row, grad_row, hess_row);
+    }
+}
+
+void evaluate_quad_order2_values_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    Real* row0 = values_out + 0u * output_stride;
+    Real* row1 = values_out + 1u * output_stride;
+    Real* row2 = values_out + 2u * output_stride;
+    Real* row3 = values_out + 3u * output_stride;
+    Real* row4 = values_out + 4u * output_stride;
+    Real* row5 = values_out + 5u * output_stride;
+    Real* row6 = values_out + 6u * output_stride;
+    Real* row7 = values_out + 7u * output_stride;
+    Real* row8 = values_out + 8u * output_stride;
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        const Real x = xi[0];
+        const Real y = xi[1];
+        const Real x0 = x * (x - Real(1)) * Real(0.5);
+        const Real x1 = x * (x + Real(1)) * Real(0.5);
+        const Real x2 = Real(1) - x * x;
+        const Real y0 = y * (y - Real(1)) * Real(0.5);
+        const Real y1 = y * (y + Real(1)) * Real(0.5);
+        const Real y2 = Real(1) - y * y;
+
+        row0[q] = x0 * y0;
+        row1[q] = x1 * y0;
+        row2[q] = x1 * y1;
+        row3[q] = x0 * y1;
+        row4[q] = x2 * y0;
+        row5[q] = x1 * y2;
+        row6[q] = x2 * y1;
+        row7[q] = x0 * y2;
+        row8[q] = x2 * y2;
+    }
+}
+
+inline void write_quad_order2_gradient_q(
+    Real* SVMP_RESTRICT row,
+    std::size_t output_stride,
+    std::size_t q,
+    Real dx,
+    Real dy) {
+    row[0u * output_stride + q] = dx;
+    row[1u * output_stride + q] = dy;
+    row[2u * output_stride + q] = Real(0);
+}
+
+void evaluate_quad_order2_gradients_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    if (points.size() == 4u) {
+        Real xv[4][3];
+        Real yv[4][3];
+        Real xd[4][3];
+        Real yd[4][3];
+        for (std::size_t q = 0; q < 4u; ++q) {
+            const auto& xi = points[q];
+            const Real x = xi[0];
+            const Real y = xi[1];
+            xv[q][0] = x * (x - Real(1)) * Real(0.5);
+            xv[q][1] = x * (x + Real(1)) * Real(0.5);
+            xv[q][2] = Real(1) - x * x;
+            yv[q][0] = y * (y - Real(1)) * Real(0.5);
+            yv[q][1] = y * (y + Real(1)) * Real(0.5);
+            yv[q][2] = Real(1) - y * y;
+            xd[q][0] = x - Real(0.5);
+            xd[q][1] = x + Real(0.5);
+            xd[q][2] = Real(-2) * x;
+            yd[q][0] = y - Real(0.5);
+            yd[q][1] = y + Real(0.5);
+            yd[q][2] = Real(-2) * y;
+        }
+
+        auto write_node = [&](std::size_t node, std::size_t i, std::size_t j) {
+            Real* SVMP_RESTRICT row = gradients_out + node * 3u * output_stride;
+            row[0u] = xd[0][i] * yv[0][j];
+            row[1u] = xd[1][i] * yv[1][j];
+            row[2u] = xd[2][i] * yv[2][j];
+            row[3u] = xd[3][i] * yv[3][j];
+            row[output_stride + 0u] = xv[0][i] * yd[0][j];
+            row[output_stride + 1u] = xv[1][i] * yd[1][j];
+            row[output_stride + 2u] = xv[2][i] * yd[2][j];
+            row[output_stride + 3u] = xv[3][i] * yd[3][j];
+            row[2u * output_stride + 0u] = Real(0);
+            row[2u * output_stride + 1u] = Real(0);
+            row[2u * output_stride + 2u] = Real(0);
+            row[2u * output_stride + 3u] = Real(0);
+        };
+
+        write_node(0u, 0u, 0u);
+        write_node(1u, 1u, 0u);
+        write_node(2u, 1u, 1u);
+        write_node(3u, 0u, 1u);
+        write_node(4u, 2u, 0u);
+        write_node(5u, 1u, 2u);
+        write_node(6u, 2u, 1u);
+        write_node(7u, 0u, 2u);
+        write_node(8u, 2u, 2u);
+        return;
+    }
+
+    Real* row0 = gradients_out + 0u * 3u * output_stride;
+    Real* row1 = gradients_out + 1u * 3u * output_stride;
+    Real* row2 = gradients_out + 2u * 3u * output_stride;
+    Real* row3 = gradients_out + 3u * 3u * output_stride;
+    Real* row4 = gradients_out + 4u * 3u * output_stride;
+    Real* row5 = gradients_out + 5u * 3u * output_stride;
+    Real* row6 = gradients_out + 6u * 3u * output_stride;
+    Real* row7 = gradients_out + 7u * 3u * output_stride;
+    Real* row8 = gradients_out + 8u * 3u * output_stride;
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        const Real x = xi[0];
+        const Real y = xi[1];
+        const Real x0 = x * (x - Real(1)) * Real(0.5);
+        const Real x1 = x * (x + Real(1)) * Real(0.5);
+        const Real x2 = Real(1) - x * x;
+        const Real y0 = y * (y - Real(1)) * Real(0.5);
+        const Real y1 = y * (y + Real(1)) * Real(0.5);
+        const Real y2 = Real(1) - y * y;
+        const Real dx0 = x - Real(0.5);
+        const Real dx1 = x + Real(0.5);
+        const Real dx2 = Real(-2) * x;
+        const Real dy0 = y - Real(0.5);
+        const Real dy1 = y + Real(0.5);
+        const Real dy2 = Real(-2) * y;
+
+        write_quad_order2_gradient_q(row0, output_stride, q, dx0 * y0, x0 * dy0);
+        write_quad_order2_gradient_q(row1, output_stride, q, dx1 * y0, x1 * dy0);
+        write_quad_order2_gradient_q(row2, output_stride, q, dx1 * y1, x1 * dy1);
+        write_quad_order2_gradient_q(row3, output_stride, q, dx0 * y1, x0 * dy1);
+        write_quad_order2_gradient_q(row4, output_stride, q, dx2 * y0, x2 * dy0);
+        write_quad_order2_gradient_q(row5, output_stride, q, dx1 * y2, x1 * dy2);
+        write_quad_order2_gradient_q(row6, output_stride, q, dx2 * y1, x2 * dy1);
+        write_quad_order2_gradient_q(row7, output_stride, q, dx0 * y2, x0 * dy2);
+        write_quad_order2_gradient_q(row8, output_stride, q, dx2 * y2, x2 * dy2);
+    }
+}
+
+inline void write_quad_order2_hessian_q(
+    Real* SVMP_RESTRICT row,
+    std::size_t output_stride,
+    std::size_t q,
+    Real hxx,
+    Real hxy,
+    Real hyy) {
+    row[0u * output_stride + q] = hxx;
+    row[1u * output_stride + q] = hxy;
+    row[2u * output_stride + q] = Real(0);
+    row[3u * output_stride + q] = hxy;
+    row[4u * output_stride + q] = hyy;
+    row[5u * output_stride + q] = Real(0);
+    row[6u * output_stride + q] = Real(0);
+    row[7u * output_stride + q] = Real(0);
+    row[8u * output_stride + q] = Real(0);
+}
+
+void evaluate_quad_order2_hessians_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT hessians_out) {
+    Real* row0 = hessians_out + 0u * 9u * output_stride;
+    Real* row1 = hessians_out + 1u * 9u * output_stride;
+    Real* row2 = hessians_out + 2u * 9u * output_stride;
+    Real* row3 = hessians_out + 3u * 9u * output_stride;
+    Real* row4 = hessians_out + 4u * 9u * output_stride;
+    Real* row5 = hessians_out + 5u * 9u * output_stride;
+    Real* row6 = hessians_out + 6u * 9u * output_stride;
+    Real* row7 = hessians_out + 7u * 9u * output_stride;
+    Real* row8 = hessians_out + 8u * 9u * output_stride;
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        const Real x = xi[0];
+        const Real y = xi[1];
+        const Real x0 = x * (x - Real(1)) * Real(0.5);
+        const Real x1 = x * (x + Real(1)) * Real(0.5);
+        const Real x2 = Real(1) - x * x;
+        const Real y0 = y * (y - Real(1)) * Real(0.5);
+        const Real y1 = y * (y + Real(1)) * Real(0.5);
+        const Real y2 = Real(1) - y * y;
+        const Real dx0 = x - Real(0.5);
+        const Real dx1 = x + Real(0.5);
+        const Real dx2 = Real(-2) * x;
+        const Real dy0 = y - Real(0.5);
+        const Real dy1 = y + Real(0.5);
+        const Real dy2 = Real(-2) * y;
+
+        write_quad_order2_hessian_q(row0, output_stride, q, y0, dx0 * dy0, x0);
+        write_quad_order2_hessian_q(row1, output_stride, q, y0, dx1 * dy0, x1);
+        write_quad_order2_hessian_q(row2, output_stride, q, y1, dx1 * dy1, x1);
+        write_quad_order2_hessian_q(row3, output_stride, q, y1, dx0 * dy1, x0);
+        write_quad_order2_hessian_q(row4, output_stride, q, Real(-2) * y0, dx2 * dy0, x2);
+        write_quad_order2_hessian_q(row5, output_stride, q, y2, dx1 * dy2, Real(-2) * x1);
+        write_quad_order2_hessian_q(row6, output_stride, q, Real(-2) * y1, dx2 * dy1, x2);
+        write_quad_order2_hessian_q(row7, output_stride, q, y2, dx0 * dy2, Real(-2) * x0);
+        write_quad_order2_hessian_q(row8, output_stride, q, Real(-2) * y2, dx2 * dy2, Real(-2) * x2);
+    }
+}
+
+inline void fill_order3_axis_values(Real x, Real* SVMP_RESTRICT values) {
+    const Real x2 = x * x;
+    values[0] = Real(-9.0 / 16.0) * (x - Real(1)) * (x2 - Real(1.0 / 9.0));
+    values[1] = Real( 9.0 / 16.0) * (x + Real(1)) * (x2 - Real(1.0 / 9.0));
+    values[2] = Real(27.0 / 16.0) * (x2 - Real(1)) * (x - Real(1.0 / 3.0));
+    values[3] = Real(-27.0 / 16.0) * (x2 - Real(1)) * (x + Real(1.0 / 3.0));
+}
+
+inline void fill_order3_axis_value_scalars(Real x,
+                                           Real& v0,
+                                           Real& v1,
+                                           Real& v2,
+                                           Real& v3) {
+    const Real x2 = x * x;
+    v0 = Real(-9.0 / 16.0) * (x - Real(1)) * (x2 - Real(1.0 / 9.0));
+    v1 = Real( 9.0 / 16.0) * (x + Real(1)) * (x2 - Real(1.0 / 9.0));
+    v2 = Real(27.0 / 16.0) * (x2 - Real(1)) * (x - Real(1.0 / 3.0));
+    v3 = Real(-27.0 / 16.0) * (x2 - Real(1)) * (x + Real(1.0 / 3.0));
+}
+
+void evaluate_line_order1_values_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    Real* row0 = values_out + 0u * output_stride;
+    Real* row1 = values_out + 1u * output_stride;
+    for (std::size_t q = 0; q < 4u; ++q) {
+        const Real x = points[q][0];
+        row0[q] = (Real(1) - x) * Real(0.5);
+        row1[q] = (Real(1) + x) * Real(0.5);
+    }
+}
+
+void evaluate_line_order2_values_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    Real* row0 = values_out + 0u * output_stride;
+    Real* row1 = values_out + 1u * output_stride;
+    Real* row2 = values_out + 2u * output_stride;
+    for (std::size_t q = 0; q < 4u; ++q) {
+        const Real x = points[q][0];
+        row0[q] = x * (x - Real(1)) * Real(0.5);
+        row1[q] = x * (x + Real(1)) * Real(0.5);
+        row2[q] = Real(1) - x * x;
+    }
+}
+
+void evaluate_line_order3_values_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    Real* row0 = values_out + 0u * output_stride;
+    Real* row1 = values_out + 1u * output_stride;
+    Real* row2 = values_out + 2u * output_stride;
+    Real* row3 = values_out + 3u * output_stride;
+    for (std::size_t q = 0; q < 4u; ++q) {
+        Real values[4];
+        fill_order3_axis_values(points[q][0], values);
+        row0[q] = values[0];
+        row1[q] = values[1];
+        row2[q] = values[2];
+        row3[q] = values[3];
+    }
+}
+
+inline void fill_order3_axis_values_first(Real x,
+                                          Real* SVMP_RESTRICT values,
+                                          Real* SVMP_RESTRICT first);
+
+inline void fill_order3_axis_values_first_second(Real x,
+                                                 Real* SVMP_RESTRICT values,
+                                                 Real* SVMP_RESTRICT first,
+                                                 Real* SVMP_RESTRICT second);
+
+inline void write_line_gradient_q4_row(Real* SVMP_RESTRICT row,
+                                       std::size_t output_stride,
+                                       Real g0,
+                                       Real g1,
+                                       Real g2,
+                                       Real g3) {
+    row[0] = g0;
+    row[1] = g1;
+    row[2] = g2;
+    row[3] = g3;
+    row[output_stride + 0u] = Real(0);
+    row[output_stride + 1u] = Real(0);
+    row[output_stride + 2u] = Real(0);
+    row[output_stride + 3u] = Real(0);
+    row[2u * output_stride + 0u] = Real(0);
+    row[2u * output_stride + 1u] = Real(0);
+    row[2u * output_stride + 2u] = Real(0);
+    row[2u * output_stride + 3u] = Real(0);
+}
+
+inline void write_line_hessian_q4_row(Real* SVMP_RESTRICT row,
+                                      std::size_t output_stride,
+                                      Real h0,
+                                      Real h1,
+                                      Real h2,
+                                      Real h3) {
+    row[0] = h0;
+    row[1] = h1;
+    row[2] = h2;
+    row[3] = h3;
+    for (std::size_t component = 1u; component < 9u; ++component) {
+        Real* slot = row + component * output_stride;
+        slot[0] = Real(0);
+        slot[1] = Real(0);
+        slot[2] = Real(0);
+        slot[3] = Real(0);
+    }
+}
+
+SVMP_LAGRANGE_NOINLINE void evaluate_line_order1_gradients_q4(
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    write_line_gradient_q4_row(gradients_out + 0u * 3u * output_stride,
+                               output_stride,
+                               Real(-0.5), Real(-0.5), Real(-0.5), Real(-0.5));
+    write_line_gradient_q4_row(gradients_out + 1u * 3u * output_stride,
+                               output_stride,
+                               Real(0.5), Real(0.5), Real(0.5), Real(0.5));
+}
+
+SVMP_LAGRANGE_NOINLINE void evaluate_line_order1_hessians_q4(
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT hessians_out) {
+    write_line_hessian_q4_row(hessians_out + 0u * 9u * output_stride,
+                              output_stride, Real(0), Real(0), Real(0), Real(0));
+    write_line_hessian_q4_row(hessians_out + 1u * 9u * output_stride,
+                              output_stride, Real(0), Real(0), Real(0), Real(0));
+}
+
+SVMP_LAGRANGE_NOINLINE void evaluate_line_order1_all_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    evaluate_line_order1_values_q4(points, output_stride, values_out);
+    evaluate_line_order1_gradients_q4(output_stride, gradients_out);
+    evaluate_line_order1_hessians_q4(output_stride, hessians_out);
+}
+
+SVMP_LAGRANGE_NOINLINE void evaluate_line_order2_gradients_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    const Real x0 = points[0][0];
+    const Real x1 = points[1][0];
+    const Real x2 = points[2][0];
+    const Real x3 = points[3][0];
+    write_line_gradient_q4_row(gradients_out + 0u * 3u * output_stride,
+                               output_stride,
+                               x0 - Real(0.5), x1 - Real(0.5),
+                               x2 - Real(0.5), x3 - Real(0.5));
+    write_line_gradient_q4_row(gradients_out + 1u * 3u * output_stride,
+                               output_stride,
+                               x0 + Real(0.5), x1 + Real(0.5),
+                               x2 + Real(0.5), x3 + Real(0.5));
+    write_line_gradient_q4_row(gradients_out + 2u * 3u * output_stride,
+                               output_stride,
+                               Real(-2) * x0, Real(-2) * x1,
+                               Real(-2) * x2, Real(-2) * x3);
+}
+
+SVMP_LAGRANGE_NOINLINE void evaluate_line_order2_hessians_q4(
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT hessians_out) {
+    write_line_hessian_q4_row(hessians_out + 0u * 9u * output_stride,
+                              output_stride, Real(1), Real(1), Real(1), Real(1));
+    write_line_hessian_q4_row(hessians_out + 1u * 9u * output_stride,
+                              output_stride, Real(1), Real(1), Real(1), Real(1));
+    write_line_hessian_q4_row(hessians_out + 2u * 9u * output_stride,
+                              output_stride, Real(-2), Real(-2), Real(-2), Real(-2));
+}
+
+SVMP_LAGRANGE_NOINLINE void evaluate_line_order2_all_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    evaluate_line_order2_values_q4(points, output_stride, values_out);
+    evaluate_line_order2_gradients_q4(points, output_stride, gradients_out);
+    evaluate_line_order2_hessians_q4(output_stride, hessians_out);
+}
+
+SVMP_LAGRANGE_NOINLINE void evaluate_line_order3_gradients_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    Real first[4][4];
+    for (std::size_t q = 0; q < 4u; ++q) {
+        Real values[4];
+        fill_order3_axis_values_first(points[q][0], values, first[q]);
+    }
+    for (std::size_t node = 0; node < 4u; ++node) {
+        write_line_gradient_q4_row(gradients_out + node * 3u * output_stride,
+                                   output_stride,
+                                   first[0][node], first[1][node],
+                                   first[2][node], first[3][node]);
+    }
+}
+
+SVMP_LAGRANGE_NOINLINE void evaluate_line_order3_hessians_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT hessians_out) {
+    Real second[4][4];
+    for (std::size_t q = 0; q < 4u; ++q) {
+        Real values[4];
+        Real first[4];
+        fill_order3_axis_values_first_second(points[q][0], values, first, second[q]);
+    }
+    for (std::size_t node = 0; node < 4u; ++node) {
+        write_line_hessian_q4_row(hessians_out + node * 9u * output_stride,
+                                  output_stride,
+                                  second[0][node], second[1][node],
+                                  second[2][node], second[3][node]);
+    }
+}
+
+SVMP_LAGRANGE_NOINLINE void evaluate_line_order3_all_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    Real values[4][4];
+    Real first[4][4];
+    Real second[4][4];
+    for (std::size_t q = 0; q < 4u; ++q) {
+        fill_order3_axis_values_first_second(points[q][0], values[q], first[q], second[q]);
+    }
+    for (std::size_t node = 0; node < 4u; ++node) {
+        Real* value_row = values_out + node * output_stride;
+        value_row[0] = values[0][node];
+        value_row[1] = values[1][node];
+        value_row[2] = values[2][node];
+        value_row[3] = values[3][node];
+        write_line_gradient_q4_row(gradients_out + node * 3u * output_stride,
+                                   output_stride,
+                                   first[0][node], first[1][node],
+                                   first[2][node], first[3][node]);
+        write_line_hessian_q4_row(hessians_out + node * 9u * output_stride,
+                                  output_stride,
+                                  second[0][node], second[1][node],
+                                  second[2][node], second[3][node]);
+    }
+}
+
+inline void fill_order3_axis_values_first(Real x,
+                                          Real* SVMP_RESTRICT values,
+                                          Real* SVMP_RESTRICT first) {
+    fill_order3_axis_values(x, values);
+    const Real x2 = x * x;
+    first[0] = Real(-9.0 / 16.0) * (Real(3) * x2 - Real(2) * x - Real(1.0 / 9.0));
+    first[1] = Real( 9.0 / 16.0) * (Real(3) * x2 + Real(2) * x - Real(1.0 / 9.0));
+    first[2] = Real(27.0 / 16.0) * (Real(3) * x2 - Real(2.0 / 3.0) * x - Real(1));
+    first[3] = Real(-27.0 / 16.0) * (Real(3) * x2 + Real(2.0 / 3.0) * x - Real(1));
+}
+
+inline void fill_order3_axis_values_first_second(Real x,
+                                                 Real* SVMP_RESTRICT values,
+                                                 Real* SVMP_RESTRICT first,
+                                                 Real* SVMP_RESTRICT second) {
+    fill_order3_axis_values_first(x, values, first);
+    second[0] = Real(-9.0 / 16.0) * (Real(6) * x - Real(2));
+    second[1] = Real( 9.0 / 16.0) * (Real(6) * x + Real(2));
+    second[2] = Real(27.0 / 16.0) * (Real(6) * x - Real(2.0 / 3.0));
+    second[3] = Real(-27.0 / 16.0) * (Real(6) * x + Real(2.0 / 3.0));
+}
+
+inline void write_quad_order3_value_row_q4(Real* SVMP_RESTRICT row,
+                                           const Real lx[4][4],
+                                           const Real ly[4][4],
+                                           std::size_t i,
+                                           std::size_t j) {
+    row[0] = lx[0][i] * ly[0][j];
+    row[1] = lx[1][i] * ly[1][j];
+    row[2] = lx[2][i] * ly[2][j];
+    row[3] = lx[3][i] * ly[3][j];
+}
+
+void evaluate_quad_order3_values_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    if (output_stride == 4u) {
+        Real* row0 = values_out + 0u * 4u;
+        Real* row1 = values_out + 1u * 4u;
+        Real* row2 = values_out + 2u * 4u;
+        Real* row3 = values_out + 3u * 4u;
+        Real* row4 = values_out + 4u * 4u;
+        Real* row5 = values_out + 5u * 4u;
+        Real* row6 = values_out + 6u * 4u;
+        Real* row7 = values_out + 7u * 4u;
+        Real* row8 = values_out + 8u * 4u;
+        Real* row9 = values_out + 9u * 4u;
+        Real* row10 = values_out + 10u * 4u;
+        Real* row11 = values_out + 11u * 4u;
+        Real* row12 = values_out + 12u * 4u;
+        Real* row13 = values_out + 13u * 4u;
+        Real* row14 = values_out + 14u * 4u;
+        Real* row15 = values_out + 15u * 4u;
+
+        auto write_q = [&](std::size_t q) {
+            const auto& xi = points[q];
+            Real x0;
+            Real x1;
+            Real x2;
+            Real x3;
+            Real y0;
+            Real y1;
+            Real y2;
+            Real y3;
+            fill_order3_axis_value_scalars(xi[0], x0, x1, x2, x3);
+            fill_order3_axis_value_scalars(xi[1], y0, y1, y2, y3);
+            row0[q] = x0 * y0;
+            row1[q] = x1 * y0;
+            row2[q] = x1 * y1;
+            row3[q] = x0 * y1;
+            row4[q] = x2 * y0;
+            row5[q] = x3 * y0;
+            row6[q] = x1 * y2;
+            row7[q] = x1 * y3;
+            row8[q] = x3 * y1;
+            row9[q] = x2 * y1;
+            row10[q] = x0 * y3;
+            row11[q] = x0 * y2;
+            row12[q] = x2 * y2;
+            row13[q] = x3 * y2;
+            row14[q] = x2 * y3;
+            row15[q] = x3 * y3;
+        };
+
+        write_q(0u);
+        write_q(1u);
+        write_q(2u);
+        write_q(3u);
+        return;
+    }
+
+    Real lx[4][4];
+    Real ly[4][4];
+    for (std::size_t q = 0; q < 4u; ++q) {
+        const auto& xi = points[q];
+        fill_order3_axis_values(xi[0], lx[q]);
+        fill_order3_axis_values(xi[1], ly[q]);
+    }
+
+    write_quad_order3_value_row_q4(values_out + 0u * output_stride, lx, ly, 0u, 0u);
+    write_quad_order3_value_row_q4(values_out + 1u * output_stride, lx, ly, 1u, 0u);
+    write_quad_order3_value_row_q4(values_out + 2u * output_stride, lx, ly, 1u, 1u);
+    write_quad_order3_value_row_q4(values_out + 3u * output_stride, lx, ly, 0u, 1u);
+    write_quad_order3_value_row_q4(values_out + 4u * output_stride, lx, ly, 2u, 0u);
+    write_quad_order3_value_row_q4(values_out + 5u * output_stride, lx, ly, 3u, 0u);
+    write_quad_order3_value_row_q4(values_out + 6u * output_stride, lx, ly, 1u, 2u);
+    write_quad_order3_value_row_q4(values_out + 7u * output_stride, lx, ly, 1u, 3u);
+    write_quad_order3_value_row_q4(values_out + 8u * output_stride, lx, ly, 3u, 1u);
+    write_quad_order3_value_row_q4(values_out + 9u * output_stride, lx, ly, 2u, 1u);
+    write_quad_order3_value_row_q4(values_out + 10u * output_stride, lx, ly, 0u, 3u);
+    write_quad_order3_value_row_q4(values_out + 11u * output_stride, lx, ly, 0u, 2u);
+    write_quad_order3_value_row_q4(values_out + 12u * output_stride, lx, ly, 2u, 2u);
+    write_quad_order3_value_row_q4(values_out + 13u * output_stride, lx, ly, 3u, 2u);
+    write_quad_order3_value_row_q4(values_out + 14u * output_stride, lx, ly, 2u, 3u);
+    write_quad_order3_value_row_q4(values_out + 15u * output_stride, lx, ly, 3u, 3u);
+}
+
+void evaluate_quad_order3_values_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    if (points.size() == 4u) {
+        evaluate_quad_order3_values_q4(points, output_stride, values_out);
+        return;
+    }
+
+    Real* row0 = values_out + 0u * output_stride;
+    Real* row1 = values_out + 1u * output_stride;
+    Real* row2 = values_out + 2u * output_stride;
+    Real* row3 = values_out + 3u * output_stride;
+    Real* row4 = values_out + 4u * output_stride;
+    Real* row5 = values_out + 5u * output_stride;
+    Real* row6 = values_out + 6u * output_stride;
+    Real* row7 = values_out + 7u * output_stride;
+    Real* row8 = values_out + 8u * output_stride;
+    Real* row9 = values_out + 9u * output_stride;
+    Real* row10 = values_out + 10u * output_stride;
+    Real* row11 = values_out + 11u * output_stride;
+    Real* row12 = values_out + 12u * output_stride;
+    Real* row13 = values_out + 13u * output_stride;
+    Real* row14 = values_out + 14u * output_stride;
+    Real* row15 = values_out + 15u * output_stride;
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        Real lx[4];
+        Real ly[4];
+        fill_order3_axis_values(xi[0], lx);
+        fill_order3_axis_values(xi[1], ly);
+        row0[q] = lx[0] * ly[0];
+        row1[q] = lx[1] * ly[0];
+        row2[q] = lx[1] * ly[1];
+        row3[q] = lx[0] * ly[1];
+        row4[q] = lx[2] * ly[0];
+        row5[q] = lx[3] * ly[0];
+        row6[q] = lx[1] * ly[2];
+        row7[q] = lx[1] * ly[3];
+        row8[q] = lx[3] * ly[1];
+        row9[q] = lx[2] * ly[1];
+        row10[q] = lx[0] * ly[3];
+        row11[q] = lx[0] * ly[2];
+        row12[q] = lx[2] * ly[2];
+        row13[q] = lx[3] * ly[2];
+        row14[q] = lx[2] * ly[3];
+        row15[q] = lx[3] * ly[3];
+    }
+}
+
+template <std::size_t N>
+inline void write_quad_gradient_row_q4(
+    Real* SVMP_RESTRICT row,
+    std::size_t output_stride,
+    const Real (&lx)[4][N],
+    const Real (&ly)[4][N],
+    const Real (&dx)[4][N],
+    const Real (&dy)[4][N],
+    std::size_t i,
+    std::size_t j) {
+    row[0u] = dx[0][i] * ly[0][j];
+    row[1u] = dx[1][i] * ly[1][j];
+    row[2u] = dx[2][i] * ly[2][j];
+    row[3u] = dx[3][i] * ly[3][j];
+    row[output_stride + 0u] = lx[0][i] * dy[0][j];
+    row[output_stride + 1u] = lx[1][i] * dy[1][j];
+    row[output_stride + 2u] = lx[2][i] * dy[2][j];
+    row[output_stride + 3u] = lx[3][i] * dy[3][j];
+    row[2u * output_stride + 0u] = Real(0);
+    row[2u * output_stride + 1u] = Real(0);
+    row[2u * output_stride + 2u] = Real(0);
+    row[2u * output_stride + 3u] = Real(0);
+}
+
+inline void fill_order4_axis_values_first(Real x,
+                                          Real* SVMP_RESTRICT values,
+                                          Real* SVMP_RESTRICT first) {
+    const Real r = (x + Real(1)) * Real(2);
+    const Real r2 = r * r;
+    const Real r3 = r2 * r;
+    const Real f0 = r;
+    const Real f1 = r - Real(1);
+    const Real f2 = r - Real(2);
+    const Real f3 = r - Real(3);
+    const Real f4 = r - Real(4);
+    const Real f01 = f0 * f1;
+    const Real f12 = f1 * f2;
+    const Real f23 = f2 * f3;
+    const Real f34 = f3 * f4;
+
+    values[0] = (f12 * f34) / Real(24);
+    values[1] = (f01 * f23) / Real(24);
+    values[2] = -(f0 * f2 * f34) / Real(6);
+    values[3] = (f01 * f34) / Real(4);
+    values[4] = -(f01 * f2 * f4) / Real(6);
+
+    first[0] = (Real(4) * r3 - Real(30) * r2 + Real(70) * r - Real(50)) / Real(12);
+    first[1] = (Real(4) * r3 - Real(18) * r2 + Real(22) * r - Real(6)) / Real(12);
+    first[2] = (-Real(4) * r3 + Real(27) * r2 - Real(52) * r + Real(24)) / Real(3);
+    first[3] = Real(2) * r3 - Real(12) * r2 + Real(19) * r - Real(6);
+    first[4] = (-Real(4) * r3 + Real(21) * r2 - Real(28) * r + Real(8)) / Real(3);
+}
+
+SVMP_LAGRANGE_NOINLINE void evaluate_quad_order3_gradients_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    constexpr auto node_axes = detail::make_quad_tensor_node_axes<3>();
+
+    Real lx[4][4];
+    Real ly[4][4];
+    Real dx[4][4];
+    Real dy[4][4];
+    for (std::size_t q = 0; q < 4u; ++q) {
+        const auto& xi = points[q];
+        fill_order3_axis_values_first(xi[0], lx[q], dx[q]);
+        fill_order3_axis_values_first(xi[1], ly[q], dy[q]);
+    }
+
+    for (std::size_t node = 0; node < node_axes.size(); ++node) {
+        const auto& axes = node_axes[node];
+        write_quad_gradient_row_q4(
+            gradients_out + node * 3u * output_stride,
+            output_stride,
+            lx,
+            ly,
+            dx,
+            dy,
+            axes[0],
+            axes[1]);
+    }
+}
+
+SVMP_LAGRANGE_NOINLINE void evaluate_quad_order4_gradients_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    constexpr auto node_axes = detail::make_quad_tensor_node_axes<4>();
+
+    Real lx[4][5];
+    Real ly[4][5];
+    Real dx[4][5];
+    Real dy[4][5];
+    for (std::size_t q = 0; q < 4u; ++q) {
+        const auto& xi = points[q];
+        fill_order4_axis_values_first(xi[0], lx[q], dx[q]);
+        fill_order4_axis_values_first(xi[1], ly[q], dy[q]);
+    }
+
+    for (std::size_t node = 0; node < node_axes.size(); ++node) {
+        const auto& axes = node_axes[node];
+        write_quad_gradient_row_q4(
+            gradients_out + node * 3u * output_stride,
+            output_stride,
+            lx,
+            ly,
+            dx,
+            dy,
+            axes[0],
+            axes[1]);
+    }
+}
+
+void evaluate_quad_order3_gradients_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    if (points.size() == 4u) {
+        evaluate_quad_order3_gradients_q4(points, output_stride, gradients_out);
+        return;
+    }
+
+    Real* row0 = gradients_out + 0u * 3u * output_stride;
+    Real* row1 = gradients_out + 1u * 3u * output_stride;
+    Real* row2 = gradients_out + 2u * 3u * output_stride;
+    Real* row3 = gradients_out + 3u * 3u * output_stride;
+    Real* row4 = gradients_out + 4u * 3u * output_stride;
+    Real* row5 = gradients_out + 5u * 3u * output_stride;
+    Real* row6 = gradients_out + 6u * 3u * output_stride;
+    Real* row7 = gradients_out + 7u * 3u * output_stride;
+    Real* row8 = gradients_out + 8u * 3u * output_stride;
+    Real* row9 = gradients_out + 9u * 3u * output_stride;
+    Real* row10 = gradients_out + 10u * 3u * output_stride;
+    Real* row11 = gradients_out + 11u * 3u * output_stride;
+    Real* row12 = gradients_out + 12u * 3u * output_stride;
+    Real* row13 = gradients_out + 13u * 3u * output_stride;
+    Real* row14 = gradients_out + 14u * 3u * output_stride;
+    Real* row15 = gradients_out + 15u * 3u * output_stride;
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        Real lx[4];
+        Real ly[4];
+        Real dx[4];
+        Real dy[4];
+        fill_order3_axis_values_first(xi[0], lx, dx);
+        fill_order3_axis_values_first(xi[1], ly, dy);
+        write_quad_order2_gradient_q(row0, output_stride, q, dx[0] * ly[0], lx[0] * dy[0]);
+        write_quad_order2_gradient_q(row1, output_stride, q, dx[1] * ly[0], lx[1] * dy[0]);
+        write_quad_order2_gradient_q(row2, output_stride, q, dx[1] * ly[1], lx[1] * dy[1]);
+        write_quad_order2_gradient_q(row3, output_stride, q, dx[0] * ly[1], lx[0] * dy[1]);
+        write_quad_order2_gradient_q(row4, output_stride, q, dx[2] * ly[0], lx[2] * dy[0]);
+        write_quad_order2_gradient_q(row5, output_stride, q, dx[3] * ly[0], lx[3] * dy[0]);
+        write_quad_order2_gradient_q(row6, output_stride, q, dx[1] * ly[2], lx[1] * dy[2]);
+        write_quad_order2_gradient_q(row7, output_stride, q, dx[1] * ly[3], lx[1] * dy[3]);
+        write_quad_order2_gradient_q(row8, output_stride, q, dx[3] * ly[1], lx[3] * dy[1]);
+        write_quad_order2_gradient_q(row9, output_stride, q, dx[2] * ly[1], lx[2] * dy[1]);
+        write_quad_order2_gradient_q(row10, output_stride, q, dx[0] * ly[3], lx[0] * dy[3]);
+        write_quad_order2_gradient_q(row11, output_stride, q, dx[0] * ly[2], lx[0] * dy[2]);
+        write_quad_order2_gradient_q(row12, output_stride, q, dx[2] * ly[2], lx[2] * dy[2]);
+        write_quad_order2_gradient_q(row13, output_stride, q, dx[3] * ly[2], lx[3] * dy[2]);
+        write_quad_order2_gradient_q(row14, output_stride, q, dx[2] * ly[3], lx[2] * dy[3]);
+        write_quad_order2_gradient_q(row15, output_stride, q, dx[3] * ly[3], lx[3] * dy[3]);
+    }
+}
+
+void evaluate_quad_order3_hessians_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT hessians_out) {
+    Real* row0 = hessians_out + 0u * 9u * output_stride;
+    Real* row1 = hessians_out + 1u * 9u * output_stride;
+    Real* row2 = hessians_out + 2u * 9u * output_stride;
+    Real* row3 = hessians_out + 3u * 9u * output_stride;
+    Real* row4 = hessians_out + 4u * 9u * output_stride;
+    Real* row5 = hessians_out + 5u * 9u * output_stride;
+    Real* row6 = hessians_out + 6u * 9u * output_stride;
+    Real* row7 = hessians_out + 7u * 9u * output_stride;
+    Real* row8 = hessians_out + 8u * 9u * output_stride;
+    Real* row9 = hessians_out + 9u * 9u * output_stride;
+    Real* row10 = hessians_out + 10u * 9u * output_stride;
+    Real* row11 = hessians_out + 11u * 9u * output_stride;
+    Real* row12 = hessians_out + 12u * 9u * output_stride;
+    Real* row13 = hessians_out + 13u * 9u * output_stride;
+    Real* row14 = hessians_out + 14u * 9u * output_stride;
+    Real* row15 = hessians_out + 15u * 9u * output_stride;
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        Real lx[4];
+        Real ly[4];
+        Real dx[4];
+        Real dy[4];
+        Real hx[4];
+        Real hy[4];
+        fill_order3_axis_values_first_second(xi[0], lx, dx, hx);
+        fill_order3_axis_values_first_second(xi[1], ly, dy, hy);
+        write_quad_order2_hessian_q(row0, output_stride, q, hx[0] * ly[0], dx[0] * dy[0], lx[0] * hy[0]);
+        write_quad_order2_hessian_q(row1, output_stride, q, hx[1] * ly[0], dx[1] * dy[0], lx[1] * hy[0]);
+        write_quad_order2_hessian_q(row2, output_stride, q, hx[1] * ly[1], dx[1] * dy[1], lx[1] * hy[1]);
+        write_quad_order2_hessian_q(row3, output_stride, q, hx[0] * ly[1], dx[0] * dy[1], lx[0] * hy[1]);
+        write_quad_order2_hessian_q(row4, output_stride, q, hx[2] * ly[0], dx[2] * dy[0], lx[2] * hy[0]);
+        write_quad_order2_hessian_q(row5, output_stride, q, hx[3] * ly[0], dx[3] * dy[0], lx[3] * hy[0]);
+        write_quad_order2_hessian_q(row6, output_stride, q, hx[1] * ly[2], dx[1] * dy[2], lx[1] * hy[2]);
+        write_quad_order2_hessian_q(row7, output_stride, q, hx[1] * ly[3], dx[1] * dy[3], lx[1] * hy[3]);
+        write_quad_order2_hessian_q(row8, output_stride, q, hx[3] * ly[1], dx[3] * dy[1], lx[3] * hy[1]);
+        write_quad_order2_hessian_q(row9, output_stride, q, hx[2] * ly[1], dx[2] * dy[1], lx[2] * hy[1]);
+        write_quad_order2_hessian_q(row10, output_stride, q, hx[0] * ly[3], dx[0] * dy[3], lx[0] * hy[3]);
+        write_quad_order2_hessian_q(row11, output_stride, q, hx[0] * ly[2], dx[0] * dy[2], lx[0] * hy[2]);
+        write_quad_order2_hessian_q(row12, output_stride, q, hx[2] * ly[2], dx[2] * dy[2], lx[2] * hy[2]);
+        write_quad_order2_hessian_q(row13, output_stride, q, hx[3] * ly[2], dx[3] * dy[2], lx[3] * hy[2]);
+        write_quad_order2_hessian_q(row14, output_stride, q, hx[2] * ly[3], dx[2] * dy[3], lx[2] * hy[3]);
+        write_quad_order2_hessian_q(row15, output_stride, q, hx[3] * ly[3], dx[3] * dy[3], lx[3] * hy[3]);
+    }
+}
+
+template <std::size_t Q>
+inline void write_quad_order3_all_q4(
+    std::size_t output_stride,
+    std::size_t i,
+    std::size_t j,
+    const Real lx[4][4],
+    const Real ly[4][4],
+    const Real dx[4][4],
+    const Real dy[4][4],
+    const Real hx[4][4],
+    const Real hy[4][4],
+    Real* SVMP_RESTRICT value_row,
+    Real* SVMP_RESTRICT grad_row,
+    Real* SVMP_RESTRICT hess_row) {
+    const Real xv = lx[Q][i];
+    const Real yv = ly[Q][j];
+    const Real xd = dx[Q][i];
+    const Real yd = dy[Q][j];
+    const Real hxy = xd * yd;
+
+    value_row[Q] = xv * yv;
+    grad_row[0u * output_stride + Q] = xd * yv;
+    grad_row[1u * output_stride + Q] = xv * yd;
+    grad_row[2u * output_stride + Q] = Real(0);
+    hess_row[0u * output_stride + Q] = hx[Q][i] * yv;
+    hess_row[4u * output_stride + Q] = xv * hy[Q][j];
+    hess_row[8u * output_stride + Q] = Real(0);
+    hess_row[1u * output_stride + Q] = hxy;
+    hess_row[3u * output_stride + Q] = hxy;
+    hess_row[2u * output_stride + Q] = Real(0);
+    hess_row[6u * output_stride + Q] = Real(0);
+    hess_row[5u * output_stride + Q] = Real(0);
+    hess_row[7u * output_stride + Q] = Real(0);
+}
+
+void evaluate_quad_order3_all_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    constexpr auto node_axes = detail::make_quad_tensor_node_axes<3>();
+
+    Real lx[4][4];
+    Real ly[4][4];
+    Real dx[4][4];
+    Real dy[4][4];
+    Real hx[4][4];
+    Real hy[4][4];
+    for (std::size_t q = 0; q < 4u; ++q) {
+        const auto& xi = points[q];
+        fill_order3_axis_values_first_second(xi[0], lx[q], dx[q], hx[q]);
+        fill_order3_axis_values_first_second(xi[1], ly[q], dy[q], hy[q]);
+    }
+
+    for (std::size_t node = 0; node < node_axes.size(); ++node) {
+        const auto& axes = node_axes[node];
+        const std::size_t i = axes[0];
+        const std::size_t j = axes[1];
+        Real* value_row = values_out + node * output_stride;
+        Real* grad_row = gradients_out + node * 3u * output_stride;
+        Real* hess_row = hessians_out + node * 9u * output_stride;
+        write_quad_order3_all_q4<0u>(
+            output_stride, i, j, lx, ly, dx, dy, hx, hy, value_row, grad_row, hess_row);
+        write_quad_order3_all_q4<1u>(
+            output_stride, i, j, lx, ly, dx, dy, hx, hy, value_row, grad_row, hess_row);
+        write_quad_order3_all_q4<2u>(
+            output_stride, i, j, lx, ly, dx, dy, hx, hy, value_row, grad_row, hess_row);
+        write_quad_order3_all_q4<3u>(
+            output_stride, i, j, lx, ly, dx, dy, hx, hy, value_row, grad_row, hess_row);
+    }
+}
+
+void evaluate_hex_order3_values_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    constexpr auto node_axes = detail::make_hex_tensor_node_axes<3>();
+
+    Real lx[4][4];
+    Real ly[4][4];
+    Real lz[4][4];
+    for (std::size_t q = 0; q < 4u; ++q) {
+        const auto& xi = points[q];
+        fill_order3_axis_values(xi[0], lx[q]);
+        fill_order3_axis_values(xi[1], ly[q]);
+        fill_order3_axis_values(xi[2], lz[q]);
+    }
+
+    for (std::size_t node = 0; node < node_axes.size(); ++node) {
+        const auto& axes = node_axes[node];
+        const std::size_t i = axes[0];
+        const std::size_t j = axes[1];
+        const std::size_t k = axes[2];
+        Real* row = values_out + node * output_stride;
+        row[0] = lx[0][i] * ly[0][j] * lz[0][k];
+        row[1] = lx[1][i] * ly[1][j] * lz[1][k];
+        row[2] = lx[2][i] * ly[2][j] * lz[2][k];
+        row[3] = lx[3][i] * ly[3][j] * lz[3][k];
+    }
+}
+
+void evaluate_hex_order3_gradients_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    constexpr auto node_axes = detail::make_hex_tensor_node_axes<3>();
+
+    Real lx[4][4];
+    Real ly[4][4];
+    Real lz[4][4];
+    Real dx[4][4];
+    Real dy[4][4];
+    Real dz[4][4];
+    for (std::size_t q = 0; q < 4u; ++q) {
+        const auto& xi = points[q];
+        fill_order3_axis_values_first(xi[0], lx[q], dx[q]);
+        fill_order3_axis_values_first(xi[1], ly[q], dy[q]);
+        fill_order3_axis_values_first(xi[2], lz[q], dz[q]);
+    }
+
+    for (std::size_t node = 0; node < node_axes.size(); ++node) {
+        const auto& axes = node_axes[node];
+        const std::size_t i = axes[0];
+        const std::size_t j = axes[1];
+        const std::size_t k = axes[2];
+        Real* row = gradients_out + node * 3u * output_stride;
+        row[0] = dx[0][i] * ly[0][j] * lz[0][k];
+        row[1] = dx[1][i] * ly[1][j] * lz[1][k];
+        row[2] = dx[2][i] * ly[2][j] * lz[2][k];
+        row[3] = dx[3][i] * ly[3][j] * lz[3][k];
+        row[output_stride + 0u] = lx[0][i] * dy[0][j] * lz[0][k];
+        row[output_stride + 1u] = lx[1][i] * dy[1][j] * lz[1][k];
+        row[output_stride + 2u] = lx[2][i] * dy[2][j] * lz[2][k];
+        row[output_stride + 3u] = lx[3][i] * dy[3][j] * lz[3][k];
+        row[2u * output_stride + 0u] = lx[0][i] * ly[0][j] * dz[0][k];
+        row[2u * output_stride + 1u] = lx[1][i] * ly[1][j] * dz[1][k];
+        row[2u * output_stride + 2u] = lx[2][i] * ly[2][j] * dz[2][k];
+        row[2u * output_stride + 3u] = lx[3][i] * ly[3][j] * dz[3][k];
+    }
+}
+
+template <std::size_t Q, bool WriteValue, bool WriteGradient>
+inline void write_hex_order3_q4_hessian_outputs(
+    std::size_t output_stride,
+    std::size_t i,
+    std::size_t j,
+    std::size_t k,
+    const Real lx[4][4],
+    const Real ly[4][4],
+    const Real lz[4][4],
+    const Real dx[4][4],
+    const Real dy[4][4],
+    const Real dz[4][4],
+    const Real hx[4][4],
+    const Real hy[4][4],
+    const Real hz[4][4],
+    Real* SVMP_RESTRICT value_row,
+    Real* SVMP_RESTRICT grad_row,
+    Real* SVMP_RESTRICT hess_row) {
+    const Real xv = lx[Q][i];
+    const Real yv = ly[Q][j];
+    const Real zv = lz[Q][k];
+    const Real yz = yv * zv;
+
+    if constexpr (WriteValue) {
+        value_row[Q] = xv * yz;
+    }
+
+    const Real xd = dx[Q][i];
+    const Real yd = dy[Q][j];
+    const Real zd = dz[Q][k];
+    const Real yd_z = yd * zv;
+    const Real yv_zd = yv * zd;
+
+    if constexpr (WriteGradient) {
+        grad_row[0u * output_stride + Q] = xd * yz;
+        grad_row[1u * output_stride + Q] = xv * yd_z;
+        grad_row[2u * output_stride + Q] = xv * yv_zd;
+    }
+
+    const Real hxy = xd * yd_z;
+    const Real hxz = xd * yv_zd;
+    const Real hyz = xv * yd * zd;
+    hess_row[0u * output_stride + Q] = hx[Q][i] * yz;
+    hess_row[4u * output_stride + Q] = xv * hy[Q][j] * zv;
+    hess_row[8u * output_stride + Q] = xv * yv * hz[Q][k];
+    hess_row[1u * output_stride + Q] = hxy;
+    hess_row[3u * output_stride + Q] = hxy;
+    hess_row[2u * output_stride + Q] = hxz;
+    hess_row[6u * output_stride + Q] = hxz;
+    hess_row[5u * output_stride + Q] = hyz;
+    hess_row[7u * output_stride + Q] = hyz;
+}
+
+template <bool WriteValue, bool WriteGradient>
+void evaluate_hex_order3_q4_hessian_outputs(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    constexpr auto node_axes = detail::make_hex_tensor_node_axes<3>();
+
+    Real lx[4][4];
+    Real ly[4][4];
+    Real lz[4][4];
+    Real dx[4][4];
+    Real dy[4][4];
+    Real dz[4][4];
+    Real hx[4][4];
+    Real hy[4][4];
+    Real hz[4][4];
+    for (std::size_t q = 0; q < 4u; ++q) {
+        const auto& xi = points[q];
+        fill_order3_axis_values_first_second(xi[0], lx[q], dx[q], hx[q]);
+        fill_order3_axis_values_first_second(xi[1], ly[q], dy[q], hy[q]);
+        fill_order3_axis_values_first_second(xi[2], lz[q], dz[q], hz[q]);
+    }
+
+    for (std::size_t node = 0; node < node_axes.size(); ++node) {
+        const auto& axes = node_axes[node];
+        const std::size_t i = axes[0];
+        const std::size_t j = axes[1];
+        const std::size_t k = axes[2];
+        Real* value_row = values_out ? values_out + node * output_stride : nullptr;
+        Real* grad_row = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
+        Real* hess_row = hessians_out + node * 9u * output_stride;
+        write_hex_order3_q4_hessian_outputs<0u, WriteValue, WriteGradient>(
+            output_stride, i, j, k, lx, ly, lz, dx, dy, dz, hx, hy, hz,
+            value_row, grad_row, hess_row);
+        write_hex_order3_q4_hessian_outputs<1u, WriteValue, WriteGradient>(
+            output_stride, i, j, k, lx, ly, lz, dx, dy, dz, hx, hy, hz,
+            value_row, grad_row, hess_row);
+        write_hex_order3_q4_hessian_outputs<2u, WriteValue, WriteGradient>(
+            output_stride, i, j, k, lx, ly, lz, dx, dy, dz, hx, hy, hz,
+            value_row, grad_row, hess_row);
+        write_hex_order3_q4_hessian_outputs<3u, WriteValue, WriteGradient>(
+            output_stride, i, j, k, lx, ly, lz, dx, dy, dz, hx, hy, hz,
+            value_row, grad_row, hess_row);
+    }
+}
+
+void evaluate_hex_order3_hessians_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT hessians_out) {
+    evaluate_hex_order3_q4_hessian_outputs<false, false>(
+        points, output_stride, nullptr, nullptr, hessians_out);
+}
+
+void evaluate_hex_order3_all_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    evaluate_hex_order3_q4_hessian_outputs<true, true>(
+        points, output_stride, values_out, gradients_out, hessians_out);
+}
+
+void evaluate_hex_order2_values_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    Real* row0 = values_out + 0u * output_stride;
+    Real* row1 = values_out + 1u * output_stride;
+    Real* row2 = values_out + 2u * output_stride;
+    Real* row3 = values_out + 3u * output_stride;
+    Real* row4 = values_out + 4u * output_stride;
+    Real* row5 = values_out + 5u * output_stride;
+    Real* row6 = values_out + 6u * output_stride;
+    Real* row7 = values_out + 7u * output_stride;
+    Real* row8 = values_out + 8u * output_stride;
+    Real* row9 = values_out + 9u * output_stride;
+    Real* row10 = values_out + 10u * output_stride;
+    Real* row11 = values_out + 11u * output_stride;
+    Real* row12 = values_out + 12u * output_stride;
+    Real* row13 = values_out + 13u * output_stride;
+    Real* row14 = values_out + 14u * output_stride;
+    Real* row15 = values_out + 15u * output_stride;
+    Real* row16 = values_out + 16u * output_stride;
+    Real* row17 = values_out + 17u * output_stride;
+    Real* row18 = values_out + 18u * output_stride;
+    Real* row19 = values_out + 19u * output_stride;
+    Real* row20 = values_out + 20u * output_stride;
+    Real* row21 = values_out + 21u * output_stride;
+    Real* row22 = values_out + 22u * output_stride;
+    Real* row23 = values_out + 23u * output_stride;
+    Real* row24 = values_out + 24u * output_stride;
+    Real* row25 = values_out + 25u * output_stride;
+    Real* row26 = values_out + 26u * output_stride;
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        const Real x = xi[0];
+        const Real y = xi[1];
+        const Real z = xi[2];
+        const Real x0 = x * (x - Real(1)) * Real(0.5);
+        const Real x1 = x * (x + Real(1)) * Real(0.5);
+        const Real x2 = Real(1) - x * x;
+        const Real y0 = y * (y - Real(1)) * Real(0.5);
+        const Real y1 = y * (y + Real(1)) * Real(0.5);
+        const Real y2 = Real(1) - y * y;
+        const Real z0 = z * (z - Real(1)) * Real(0.5);
+        const Real z1 = z * (z + Real(1)) * Real(0.5);
+        const Real z2 = Real(1) - z * z;
+        const Real x0y0 = x0 * y0;
+        const Real x1y0 = x1 * y0;
+        const Real x1y1 = x1 * y1;
+        const Real x0y1 = x0 * y1;
+        const Real x2y0 = x2 * y0;
+        const Real x1y2 = x1 * y2;
+        const Real x2y1 = x2 * y1;
+        const Real x0y2 = x0 * y2;
+        const Real x2y2 = x2 * y2;
+
+        row0[q] = x0y0 * z0;
+        row1[q] = x1y0 * z0;
+        row2[q] = x1y1 * z0;
+        row3[q] = x0y1 * z0;
+        row4[q] = x0y0 * z1;
+        row5[q] = x1y0 * z1;
+        row6[q] = x1y1 * z1;
+        row7[q] = x0y1 * z1;
+        row8[q] = x2y0 * z0;
+        row9[q] = x1y2 * z0;
+        row10[q] = x2y1 * z0;
+        row11[q] = x0y2 * z0;
+        row12[q] = x2y0 * z1;
+        row13[q] = x1y2 * z1;
+        row14[q] = x2y1 * z1;
+        row15[q] = x0y2 * z1;
+        row16[q] = x0y0 * z2;
+        row17[q] = x1y0 * z2;
+        row18[q] = x1y1 * z2;
+        row19[q] = x0y1 * z2;
+        row20[q] = x2y2 * z0;
+        row21[q] = x2y2 * z1;
+        row22[q] = x2y0 * z2;
+        row23[q] = x1y2 * z2;
+        row24[q] = x2y1 * z2;
+        row25[q] = x0y2 * z2;
+        row26[q] = x2y2 * z2;
+    }
+}
+
+inline void fill_order2_axis_values_first(Real x,
+                                          Real* SVMP_RESTRICT values,
+                                          Real* SVMP_RESTRICT first) {
+    values[0] = x * (x - Real(1)) * Real(0.5);
+    values[1] = x * (x + Real(1)) * Real(0.5);
+    values[2] = Real(1) - x * x;
+    first[0] = x - Real(0.5);
+    first[1] = x + Real(0.5);
+    first[2] = Real(-2) * x;
+}
+
+inline void fill_order2_axis_values_first_second(Real x,
+                                                 Real* SVMP_RESTRICT values,
+                                                 Real* SVMP_RESTRICT first,
+                                                 Real* SVMP_RESTRICT second) {
+    fill_order2_axis_values_first(x, values, first);
+    second[0] = Real(1);
+    second[1] = Real(1);
+    second[2] = Real(-2);
+}
+
+template <std::size_t Q>
+inline void write_hex_order2_hessian_q4(
+    std::size_t output_stride,
+    std::size_t i,
+    std::size_t j,
+    std::size_t k,
+    const Real lx[4][3],
+    const Real ly[4][3],
+    const Real lz[4][3],
+    const Real dx[4][3],
+    const Real dy[4][3],
+    const Real dz[4][3],
+    const Real hx[4][3],
+    const Real hy[4][3],
+    const Real hz[4][3],
+    Real* SVMP_RESTRICT hess_row) {
+    const Real xv = lx[Q][i];
+    const Real yv = ly[Q][j];
+    const Real zv = lz[Q][k];
+    const Real yz = yv * zv;
+    const Real xd = dx[Q][i];
+    const Real yd = dy[Q][j];
+    const Real zd = dz[Q][k];
+    const Real yd_z = yd * zv;
+    const Real yv_zd = yv * zd;
+    const Real hxy = xd * yd_z;
+    const Real hxz = xd * yv_zd;
+    const Real hyz = xv * yd * zd;
+    hess_row[0u * output_stride + Q] = hx[Q][i] * yz;
+    hess_row[4u * output_stride + Q] = xv * hy[Q][j] * zv;
+    hess_row[8u * output_stride + Q] = xv * yv * hz[Q][k];
+    hess_row[1u * output_stride + Q] = hxy;
+    hess_row[3u * output_stride + Q] = hxy;
+    hess_row[2u * output_stride + Q] = hxz;
+    hess_row[6u * output_stride + Q] = hxz;
+    hess_row[5u * output_stride + Q] = hyz;
+    hess_row[7u * output_stride + Q] = hyz;
+}
+
+void evaluate_hex_order2_hessians_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT hessians_out) {
+    constexpr auto node_axes = detail::make_hex_tensor_node_axes<2>();
+
+    Real lx[4][3];
+    Real ly[4][3];
+    Real lz[4][3];
+    Real dx[4][3];
+    Real dy[4][3];
+    Real dz[4][3];
+    Real hx[4][3];
+    Real hy[4][3];
+    Real hz[4][3];
+    for (std::size_t q = 0; q < 4u; ++q) {
+        const auto& xi = points[q];
+        fill_order2_axis_values_first_second(xi[0], lx[q], dx[q], hx[q]);
+        fill_order2_axis_values_first_second(xi[1], ly[q], dy[q], hy[q]);
+        fill_order2_axis_values_first_second(xi[2], lz[q], dz[q], hz[q]);
+    }
+
+    for (std::size_t node = 0; node < node_axes.size(); ++node) {
+        const auto& axes = node_axes[node];
+        const std::size_t i = axes[0];
+        const std::size_t j = axes[1];
+        const std::size_t k = axes[2];
+        Real* hess_row = hessians_out + node * 9u * output_stride;
+        write_hex_order2_hessian_q4<0u>(
+            output_stride, i, j, k, lx, ly, lz, dx, dy, dz, hx, hy, hz, hess_row);
+        write_hex_order2_hessian_q4<1u>(
+            output_stride, i, j, k, lx, ly, lz, dx, dy, dz, hx, hy, hz, hess_row);
+        write_hex_order2_hessian_q4<2u>(
+            output_stride, i, j, k, lx, ly, lz, dx, dy, dz, hx, hy, hz, hess_row);
+        write_hex_order2_hessian_q4<3u>(
+            output_stride, i, j, k, lx, ly, lz, dx, dy, dz, hx, hy, hz, hess_row);
+    }
+}
+
+template <std::size_t Q>
+inline void write_quad_order2_all_q4(
+    std::size_t output_stride,
+    std::size_t i,
+    std::size_t j,
+    const Real lx[4][3],
+    const Real ly[4][3],
+    const Real dx[4][3],
+    const Real dy[4][3],
+    const Real hx[4][3],
+    const Real hy[4][3],
+    Real* SVMP_RESTRICT value_row,
+    Real* SVMP_RESTRICT grad_row,
+    Real* SVMP_RESTRICT hess_row) {
+    const Real xv = lx[Q][i];
+    const Real yv = ly[Q][j];
+    const Real xd = dx[Q][i];
+    const Real yd = dy[Q][j];
+    const Real hxy = xd * yd;
+
+    value_row[Q] = xv * yv;
+    grad_row[0u * output_stride + Q] = xd * yv;
+    grad_row[1u * output_stride + Q] = xv * yd;
+    grad_row[2u * output_stride + Q] = Real(0);
+    hess_row[0u * output_stride + Q] = hx[Q][i] * yv;
+    hess_row[4u * output_stride + Q] = xv * hy[Q][j];
+    hess_row[8u * output_stride + Q] = Real(0);
+    hess_row[1u * output_stride + Q] = hxy;
+    hess_row[3u * output_stride + Q] = hxy;
+    hess_row[2u * output_stride + Q] = Real(0);
+    hess_row[6u * output_stride + Q] = Real(0);
+    hess_row[5u * output_stride + Q] = Real(0);
+    hess_row[7u * output_stride + Q] = Real(0);
+}
+
+void evaluate_quad_order2_all_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    constexpr auto node_axes = detail::make_quad_tensor_node_axes<2>();
+
+    Real lx[4][3];
+    Real ly[4][3];
+    Real dx[4][3];
+    Real dy[4][3];
+    Real hx[4][3];
+    Real hy[4][3];
+    for (std::size_t q = 0; q < 4u; ++q) {
+        const auto& xi = points[q];
+        fill_order2_axis_values_first_second(xi[0], lx[q], dx[q], hx[q]);
+        fill_order2_axis_values_first_second(xi[1], ly[q], dy[q], hy[q]);
+    }
+
+    for (std::size_t node = 0; node < node_axes.size(); ++node) {
+        const auto& axes = node_axes[node];
+        const std::size_t i = axes[0];
+        const std::size_t j = axes[1];
+        Real* value_row = values_out + node * output_stride;
+        Real* grad_row = gradients_out + node * 3u * output_stride;
+        Real* hess_row = hessians_out + node * 9u * output_stride;
+        write_quad_order2_all_q4<0u>(
+            output_stride, i, j, lx, ly, dx, dy, hx, hy, value_row, grad_row, hess_row);
+        write_quad_order2_all_q4<1u>(
+            output_stride, i, j, lx, ly, dx, dy, hx, hy, value_row, grad_row, hess_row);
+        write_quad_order2_all_q4<2u>(
+            output_stride, i, j, lx, ly, dx, dy, hx, hy, value_row, grad_row, hess_row);
+        write_quad_order2_all_q4<3u>(
+            output_stride, i, j, lx, ly, dx, dy, hx, hy, value_row, grad_row, hess_row);
+    }
+}
+
+void evaluate_hex_order2_gradients_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    constexpr std::array<std::array<std::size_t, 3>, 27> node_axes = {{
+        {{0u, 0u, 0u}}, {{1u, 0u, 0u}}, {{1u, 1u, 0u}}, {{0u, 1u, 0u}},
+        {{0u, 0u, 1u}}, {{1u, 0u, 1u}}, {{1u, 1u, 1u}}, {{0u, 1u, 1u}},
+        {{2u, 0u, 0u}}, {{1u, 2u, 0u}}, {{2u, 1u, 0u}}, {{0u, 2u, 0u}},
+        {{2u, 0u, 1u}}, {{1u, 2u, 1u}}, {{2u, 1u, 1u}}, {{0u, 2u, 1u}},
+        {{0u, 0u, 2u}}, {{1u, 0u, 2u}}, {{1u, 1u, 2u}}, {{0u, 1u, 2u}},
+        {{2u, 2u, 0u}}, {{2u, 2u, 1u}}, {{2u, 0u, 2u}}, {{1u, 2u, 2u}},
+        {{2u, 1u, 2u}}, {{0u, 2u, 2u}}, {{2u, 2u, 2u}},
+    }};
+
+    Real lx[4][3];
+    Real ly[4][3];
+    Real lz[4][3];
+    Real dx[4][3];
+    Real dy[4][3];
+    Real dz[4][3];
+    for (std::size_t q = 0; q < 4u; ++q) {
+        const auto& xi = points[q];
+        fill_order2_axis_values_first(xi[0], lx[q], dx[q]);
+        fill_order2_axis_values_first(xi[1], ly[q], dy[q]);
+        fill_order2_axis_values_first(xi[2], lz[q], dz[q]);
+    }
+
+    for (std::size_t node = 0; node < node_axes.size(); ++node) {
+        const auto& axes = node_axes[node];
+        const std::size_t i = axes[0];
+        const std::size_t j = axes[1];
+        const std::size_t k = axes[2];
+        Real* row = gradients_out + node * 3u * output_stride;
+        row[0] = dx[0][i] * ly[0][j] * lz[0][k];
+        row[1] = dx[1][i] * ly[1][j] * lz[1][k];
+        row[2] = dx[2][i] * ly[2][j] * lz[2][k];
+        row[3] = dx[3][i] * ly[3][j] * lz[3][k];
+        row[output_stride + 0u] = lx[0][i] * dy[0][j] * lz[0][k];
+        row[output_stride + 1u] = lx[1][i] * dy[1][j] * lz[1][k];
+        row[output_stride + 2u] = lx[2][i] * dy[2][j] * lz[2][k];
+        row[output_stride + 3u] = lx[3][i] * dy[3][j] * lz[3][k];
+        row[2u * output_stride + 0u] = lx[0][i] * ly[0][j] * dz[0][k];
+        row[2u * output_stride + 1u] = lx[1][i] * ly[1][j] * dz[1][k];
+        row[2u * output_stride + 2u] = lx[2][i] * ly[2][j] * dz[2][k];
+        row[2u * output_stride + 3u] = lx[3][i] * ly[3][j] * dz[3][k];
+    }
+}
+
+template<typename FastBasis>
+void evaluate_constant_fast_hessians_strided(
+    std::size_t num_qpts,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT hessians_out) {
+    std::array<Hessian, FastBasis::n_dofs> fast_hessians{};
+    FastBasis::evaluate_hessians(math::Vector<Real, 3>{}, fast_hessians);
+    for (std::size_t i = 0; i < fast_hessians.size(); ++i) {
+        const Hessian& hessian = fast_hessians[i];
+        Real* H = hessians_out + i * 9u * output_stride;
+        const Real h00 = hessian(0, 0);
+        const Real h01 = hessian(0, 1);
+        const Real h02 = hessian(0, 2);
+        const Real h10 = hessian(1, 0);
+        const Real h11 = hessian(1, 1);
+        const Real h12 = hessian(1, 2);
+        const Real h20 = hessian(2, 0);
+        const Real h21 = hessian(2, 1);
+        const Real h22 = hessian(2, 2);
+        for (std::size_t q = 0; q < num_qpts; ++q) {
+            H[0u * output_stride + q] = h00;
+            H[1u * output_stride + q] = h01;
+            H[2u * output_stride + q] = h02;
+            H[3u * output_stride + q] = h10;
+            H[4u * output_stride + q] = h11;
+            H[5u * output_stride + q] = h12;
+            H[6u * output_stride + q] = h20;
+            H[7u * output_stride + q] = h21;
+            H[8u * output_stride + q] = h22;
+        }
+    }
+}
+
+template<typename FastBasis>
+void evaluate_fast_outputs_with_constant_hessians_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        if (values_out != nullptr) {
+            std::array<Real, FastBasis::n_dofs> fast_values{};
+            FastBasis::evaluate(xi, fast_values);
+            for (std::size_t i = 0; i < fast_values.size(); ++i) {
+                values_out[i * output_stride + q] = fast_values[i];
+            }
+        }
+        if (gradients_out != nullptr) {
+            std::array<Gradient, FastBasis::n_dofs> fast_gradients{};
+            FastBasis::evaluate_gradients(xi, fast_gradients);
+            for (std::size_t i = 0; i < fast_gradients.size(); ++i) {
+                Real* g = gradients_out + i * 3u * output_stride;
+                g[0u * output_stride + q] = fast_gradients[i][0];
+                g[1u * output_stride + q] = fast_gradients[i][1];
+                g[2u * output_stride + q] = fast_gradients[i][2];
+            }
+        }
+    }
+    evaluate_constant_fast_hessians_strided<FastBasis>(
+        points.size(), output_stride, hessians_out);
+}
+
+template<int Order>
+void evaluate_wedge_fast_outputs_strided(
+    const std::vector<std::array<std::size_t, 2>>& wedge_indices,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    static_assert(Order >= 1 && Order <= 2,
+                  "wedge fast outputs rely on low-order public triangle ordering");
+    using TriFast = LagrangeTriFast<Order>;
+    constexpr std::size_t axis_size = static_cast<std::size_t>(Order + 1);
+    const bool need_grad = gradients_out != nullptr;
+    const bool need_hess = hessians_out != nullptr;
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        std::array<Real, TriFast::n_dofs> tri_values{};
+        std::array<Gradient, TriFast::n_dofs> tri_gradients{};
+        std::array<Hessian, TriFast::n_dofs> tri_hessians{};
+        std::array<Real, axis_size> z_values{};
+        std::array<Real, axis_size> z_first{};
+        std::array<Real, axis_size> z_second{};
+
+        TriFast::evaluate(xi, tri_values);
+        if (need_grad || need_hess) {
+            TriFast::evaluate_gradients(xi, tri_gradients);
+        }
+        if (need_hess) {
+            TriFast::evaluate_hessians(xi, tri_hessians);
+            detail::fill_axis_values_first_second<Order>(xi[2], z_values, z_first, z_second);
+        } else if (need_grad) {
+            detail::fill_axis_values_first<Order>(xi[2], z_values, z_first);
+        } else {
+            detail::fill_axis_values<Order>(xi[2], z_values);
+        }
+
+        for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
+            const auto& index = wedge_indices[node];
+            const std::size_t tri = index[0];
+            const std::size_t z = index[1];
+            const Real tri_v = tri_values[tri];
+            const Real zv = z_values[z];
+
+            if (values_out != nullptr) {
+                values_out[node * output_stride + q] = tri_v * zv;
+            }
+
+            if (gradients_out != nullptr) {
+                Real* g = gradients_out + node * 3u * output_stride;
+                const Gradient& tri_g = tri_gradients[tri];
+                g[0u * output_stride + q] = tri_g[0] * zv;
+                g[1u * output_stride + q] = tri_g[1] * zv;
+                g[2u * output_stride + q] = tri_v * z_first[z];
+            }
+
+            if (hessians_out != nullptr) {
+                Real* H = hessians_out + node * 9u * output_stride;
+                const Gradient& tri_g = tri_gradients[tri];
+                const Hessian& tri_H = tri_hessians[tri];
+                const Real zd = z_first[z];
+                const Real hxz = tri_g[0] * zd;
+                const Real hxy = tri_H(0, 1) * zv;
+                const Real hyz = tri_g[1] * zd;
+                H[0u * output_stride + q] = tri_H(0, 0) * zv;
+                H[1u * output_stride + q] = hxy;
+                H[2u * output_stride + q] = hxz;
+                H[3u * output_stride + q] = hxy;
+                H[4u * output_stride + q] = tri_H(1, 1) * zv;
+                H[5u * output_stride + q] = hyz;
+                H[6u * output_stride + q] = hxz;
+                H[7u * output_stride + q] = hyz;
+                H[8u * output_stride + q] = tri_v * z_second[z];
+            }
+        }
+    }
+}
+
+template <int Order>
+inline void fill_triangle_simplex_product_factors(Real lambda, Real* SVMP_RESTRICT factors) {
+    const Real t = static_cast<Real>(Order) * lambda;
+    factors[0] = Real(1);
+    for (int a = 1; a <= Order; ++a) {
+        factors[a] =
+            factors[a - 1] *
+            (t - static_cast<Real>(a - 1)) /
+            static_cast<Real>(a);
+    }
+}
+
+template <int Order>
+SVMP_LAGRANGE_NOINLINE SVMP_LAGRANGE_ALIGN64 bool evaluate_wedge_values_product_q4(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    const std::vector<std::array<std::size_t, 2>>& wedge_indices,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    constexpr std::size_t tri_count =
+        static_cast<std::size_t>((Order + 1) * (Order + 2) / 2);
+    if (simplex_exponents.size() != tri_count || points.size() != 4u) {
+        return false;
+    }
+
+    Real tri_values[4][tri_count];
+    std::array<Real, Order + 1> z_values[4];
+    for (std::size_t q = 0; q < 4u; ++q) {
+        const auto& xi = points[q];
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l0 = Real(1) - l1 - l2;
+        Real f0[Order + 1];
+        Real f1[Order + 1];
+        Real f2[Order + 1];
+        fill_triangle_simplex_product_factors<Order>(l0, f0);
+        fill_triangle_simplex_product_factors<Order>(l1, f1);
+        fill_triangle_simplex_product_factors<Order>(l2, f2);
+        detail::fill_axis_values<Order>(xi[2], z_values[q]);
+
+        for (std::size_t tri = 0; tri < tri_count; ++tri) {
+            const auto& e = simplex_exponents[tri];
+            tri_values[q][tri] =
+                f0[static_cast<std::size_t>(e[0])] *
+                f1[static_cast<std::size_t>(e[1])] *
+                f2[static_cast<std::size_t>(e[2])];
+        }
+    }
+
+    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
+        const auto& index = wedge_indices[node];
+        const std::size_t tri = index[0];
+        const std::size_t z = index[1];
+        Real* SVMP_RESTRICT row = values_out + node * output_stride;
+        row[0] = tri_values[0][tri] * z_values[0][z];
+        row[1] = tri_values[1][tri] * z_values[1][z];
+        row[2] = tri_values[2][tri] * z_values[2][z];
+        row[3] = tri_values[3][tri] * z_values[3][z];
+    }
+    return true;
+}
+
+bool try_evaluate_wedge_values_product_q4(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    const std::vector<std::array<std::size_t, 2>>& wedge_indices,
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    switch (order) {
+        case 4:
+            return evaluate_wedge_values_product_q4<4>(
+                simplex_exponents, wedge_indices, points, output_stride, values_out);
+        case 5:
+            return evaluate_wedge_values_product_q4<5>(
+                simplex_exponents, wedge_indices, points, output_stride, values_out);
+        case 6:
+            return evaluate_wedge_values_product_q4<6>(
+                simplex_exponents, wedge_indices, points, output_stride, values_out);
+        case 7:
+            return evaluate_wedge_values_product_q4<7>(
+                simplex_exponents, wedge_indices, points, output_stride, values_out);
+        case 8:
+            return evaluate_wedge_values_product_q4<8>(
+                simplex_exponents, wedge_indices, points, output_stride, values_out);
+        default:
+            return false;
+    }
+}
+
+void evaluate_wedge_order1_values_q4(
+    const std::vector<std::array<std::size_t, 2>>& wedge_indices,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    Real tri[4][3];
+    Real axis[4][2];
+    for (std::size_t q = 0; q < 4u; ++q) {
+        const auto& xi = points[q];
+        tri[q][0] = Real(1) - xi[0] - xi[1];
+        tri[q][1] = xi[0];
+        tri[q][2] = xi[1];
+        axis[q][0] = (Real(1) - xi[2]) * Real(0.5);
+        axis[q][1] = (Real(1) + xi[2]) * Real(0.5);
+    }
+
+    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
+        const auto& index = wedge_indices[node];
+        const std::size_t tri_node = index[0];
+        const std::size_t axis_node = index[1];
+        Real* row = values_out + node * output_stride;
+        row[0] = tri[0][tri_node] * axis[0][axis_node];
+        row[1] = tri[1][tri_node] * axis[1][axis_node];
+        row[2] = tri[2][tri_node] * axis[2][axis_node];
+        row[3] = tri[3][tri_node] * axis[3][axis_node];
+    }
+}
+
+bool evaluate_wedge_fast_strided(
+    int order,
+    const std::vector<std::array<std::size_t, 2>>& wedge_indices,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    if (order == 3) {
+        return false;
+    }
+    if (order == 1 &&
+        points.size() == 4u &&
+        values_out != nullptr &&
+        gradients_out == nullptr &&
+        hessians_out == nullptr) {
+        evaluate_wedge_order1_values_q4(wedge_indices, points, output_stride, values_out);
+        return true;
+    }
+
+    switch (order) {
+        case 1:
+            evaluate_wedge_fast_outputs_strided<1>(
+                wedge_indices, points, output_stride, values_out, gradients_out, hessians_out);
+            return true;
+        case 2:
+            evaluate_wedge_fast_outputs_strided<2>(
+                wedge_indices, points, output_stride, values_out, gradients_out, hessians_out);
+            return true;
+        default:
+            return false;
+    }
+}
+
+bool evaluate_fixed_lagrange_fast(LagrangeTopology topology,
+                                  int order,
+                                  const math::Vector<Real, 3>& xi,
+                                  std::vector<Real>* values,
+                                  std::vector<Gradient>* gradients,
+                                  std::vector<Hessian>* hessians) {
+    switch (order) {
+        case 1:
+            return evaluate_fixed_lagrange_fast_order<1>(
+                topology, xi, values, gradients, hessians);
+        case 2:
+            return evaluate_fixed_lagrange_fast_order<2>(
+                topology, xi, values, gradients, hessians);
+        case 3:
+            return evaluate_fixed_lagrange_fast_order<3>(
+                topology, xi, values, gradients, hessians);
+        default:
+            return false;
+    }
+}
+
+bool evaluate_fixed_lagrange_fast_strided(LagrangeTopology topology,
+                                          int order,
+                                          const std::vector<math::Vector<Real, 3>>& points,
+                                          std::size_t output_stride,
+                                          Real* SVMP_RESTRICT values_out,
+                                          Real* SVMP_RESTRICT gradients_out,
+                                          Real* SVMP_RESTRICT hessians_out) {
+    if (topology == LagrangeTopology::Line &&
+        points.size() == 4u) {
+        const bool values_only =
+            values_out != nullptr && gradients_out == nullptr && hessians_out == nullptr;
+        const bool gradients_only =
+            values_out == nullptr && gradients_out != nullptr && hessians_out == nullptr;
+        const bool hessians_only =
+            values_out == nullptr && gradients_out == nullptr && hessians_out != nullptr;
+        const bool all_outputs =
+            values_out != nullptr && gradients_out != nullptr && hessians_out != nullptr;
+        if (values_only) {
+            if (order == 1) {
+                evaluate_line_order1_values_q4(points, output_stride, values_out);
+                return true;
+            }
+            if (order == 2) {
+                evaluate_line_order2_values_q4(points, output_stride, values_out);
+                return true;
+            }
+            if (order == 3) {
+                evaluate_line_order3_values_q4(points, output_stride, values_out);
+                return true;
+            }
+        }
+        if (order == 1) {
+            if (gradients_only) {
+                evaluate_line_order1_gradients_q4(output_stride, gradients_out);
+                return true;
+            }
+            if (hessians_only) {
+                evaluate_line_order1_hessians_q4(output_stride, hessians_out);
+                return true;
+            }
+            if (all_outputs) {
+                evaluate_line_order1_all_q4(
+                    points, output_stride, values_out, gradients_out, hessians_out);
+                return true;
+            }
+        }
+        if (order == 2) {
+            if (gradients_only) {
+                evaluate_line_order2_gradients_q4(points, output_stride, gradients_out);
+                return true;
+            }
+            if (hessians_only) {
+                evaluate_line_order2_hessians_q4(output_stride, hessians_out);
+                return true;
+            }
+            if (all_outputs) {
+                evaluate_line_order2_all_q4(
+                    points, output_stride, values_out, gradients_out, hessians_out);
+                return true;
+            }
+        }
+        if (order == 3) {
+            if (gradients_only) {
+                evaluate_line_order3_gradients_q4(points, output_stride, gradients_out);
+                return true;
+            }
+            if (hessians_only) {
+                evaluate_line_order3_hessians_q4(points, output_stride, hessians_out);
+                return true;
+            }
+            if (all_outputs) {
+                evaluate_line_order3_all_q4(
+                    points, output_stride, values_out, gradients_out, hessians_out);
+                return true;
+            }
+        }
+    }
+
+    if (topology == LagrangeTopology::Tetrahedron &&
+        order == 3 &&
+        (gradients_out != nullptr || hessians_out != nullptr)) {
+        return false;
+    }
+    if (topology == LagrangeTopology::Triangle &&
+        order == 3 &&
+        hessians_out != nullptr) {
+        return false;
+    }
+    if (topology == LagrangeTopology::Triangle &&
+        order == 1 &&
+        values_out != nullptr &&
+        gradients_out == nullptr &&
+        hessians_out == nullptr) {
+        evaluate_triangle_order1_values_strided(points, output_stride, values_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Triangle &&
+        order == 1 &&
+        values_out == nullptr &&
+        gradients_out != nullptr &&
+        hessians_out == nullptr) {
+        evaluate_triangle_order1_gradients_strided(points.size(), output_stride, gradients_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Tetrahedron &&
+        order == 1 &&
+        values_out != nullptr &&
+        gradients_out == nullptr &&
+        hessians_out == nullptr) {
+        evaluate_tet_order1_values_strided(points, output_stride, values_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Tetrahedron &&
+        order == 1 &&
+        values_out == nullptr &&
+        gradients_out != nullptr &&
+        hessians_out == nullptr) {
+        evaluate_tet_order1_gradients_strided(points.size(), output_stride, gradients_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Triangle &&
+        order == 1 &&
+        values_out == nullptr &&
+        gradients_out == nullptr &&
+        hessians_out != nullptr) {
+        evaluate_zero_hessians_strided(3u, points.size(), output_stride, hessians_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Tetrahedron &&
+        order == 1 &&
+        values_out == nullptr &&
+        gradients_out == nullptr &&
+        hessians_out != nullptr) {
+        evaluate_zero_hessians_strided(4u, points.size(), output_stride, hessians_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Triangle &&
+        order == 1 &&
+        values_out != nullptr &&
+        gradients_out != nullptr &&
+        hessians_out != nullptr) {
+        evaluate_triangle_order1_values_strided(points, output_stride, values_out);
+        evaluate_triangle_order1_gradients_strided(points.size(), output_stride, gradients_out);
+        evaluate_zero_hessians_strided(3u, points.size(), output_stride, hessians_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Tetrahedron &&
+        order == 1 &&
+        values_out != nullptr &&
+        gradients_out != nullptr &&
+        hessians_out != nullptr) {
+        evaluate_tet_order1_values_strided(points, output_stride, values_out);
+        evaluate_tet_order1_gradients_strided(points.size(), output_stride, gradients_out);
+        evaluate_zero_hessians_strided(4u, points.size(), output_stride, hessians_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Triangle &&
+        order == 2 &&
+        values_out != nullptr &&
+        gradients_out == nullptr &&
+        hessians_out == nullptr) {
+        evaluate_triangle_order2_values_strided(points, output_stride, values_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Triangle &&
+        order == 2 &&
+        values_out == nullptr &&
+        gradients_out != nullptr &&
+        hessians_out == nullptr) {
+        evaluate_triangle_order2_gradients_strided(points, output_stride, gradients_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Triangle &&
+        order == 2 &&
+        points.size() == 4u &&
+        values_out == nullptr &&
+        gradients_out == nullptr &&
+        hessians_out != nullptr) {
+        evaluate_triangle_order2_hessians_q4(output_stride, hessians_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Triangle &&
+        order == 2 &&
+        points.size() == 4u &&
+        values_out != nullptr &&
+        gradients_out != nullptr &&
+        hessians_out != nullptr) {
+        evaluate_triangle_order2_values_strided(points, output_stride, values_out);
+        evaluate_triangle_order2_gradients_strided(points, output_stride, gradients_out);
+        evaluate_triangle_order2_hessians_q4(output_stride, hessians_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Tetrahedron &&
+        order == 2 &&
+        values_out != nullptr &&
+        gradients_out == nullptr &&
+        hessians_out == nullptr) {
+        evaluate_tet_order2_values_strided(points, output_stride, values_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Tetrahedron &&
+        order == 2 &&
+        values_out == nullptr &&
+        gradients_out != nullptr &&
+        hessians_out == nullptr) {
+        evaluate_tet_order2_gradients_strided(points, output_stride, gradients_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Tetrahedron &&
+        order == 2 &&
+        points.size() == 4u &&
+        values_out == nullptr &&
+        gradients_out == nullptr &&
+        hessians_out != nullptr) {
+        evaluate_tet_order2_hessians_q4(output_stride, hessians_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Tetrahedron &&
+        order == 2 &&
+        points.size() == 4u &&
+        values_out != nullptr &&
+        gradients_out != nullptr &&
+        hessians_out != nullptr) {
+        evaluate_tet_order2_values_strided(points, output_stride, values_out);
+        evaluate_tet_order2_gradients_strided(points, output_stride, gradients_out);
+        evaluate_tet_order2_hessians_q4(output_stride, hessians_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Tetrahedron &&
+        order == 3 &&
+        values_out != nullptr &&
+        gradients_out == nullptr &&
+        hessians_out == nullptr) {
+        evaluate_tet_order3_values_strided(points, output_stride, values_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Triangle &&
+        order == 3 &&
+        values_out != nullptr &&
+        gradients_out == nullptr &&
+        hessians_out == nullptr) {
+        evaluate_triangle_order3_values_strided(points, output_stride, values_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Triangle &&
+        order == 3 &&
+        values_out == nullptr &&
+        gradients_out != nullptr &&
+        hessians_out == nullptr) {
+        evaluate_triangle_order3_gradients_strided(points, output_stride, gradients_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Hexahedron &&
+        order == 1 &&
+        values_out != nullptr &&
+        gradients_out == nullptr &&
+        hessians_out == nullptr) {
+        evaluate_hex_order1_values_strided(points, output_stride, values_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Hexahedron &&
+        order == 1 &&
+        values_out == nullptr &&
+        gradients_out != nullptr &&
+        hessians_out == nullptr) {
+        evaluate_hex_order1_outputs_strided<false, true, false>(
+            points, output_stride, values_out, gradients_out, hessians_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Hexahedron &&
+        order == 1 &&
+        values_out == nullptr &&
+        gradients_out == nullptr &&
+        hessians_out != nullptr) {
+        evaluate_hex_order1_outputs_strided<false, false, true>(
+            points, output_stride, values_out, gradients_out, hessians_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Hexahedron &&
+        order == 1 &&
+        values_out != nullptr &&
+        gradients_out != nullptr &&
+        hessians_out != nullptr) {
+        evaluate_hex_order1_outputs_strided<true, true, true>(
+            points, output_stride, values_out, gradients_out, hessians_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Quadrilateral &&
+        order == 1 &&
+        values_out != nullptr &&
+        gradients_out == nullptr &&
+        hessians_out == nullptr) {
+        evaluate_quad_order1_values_strided(points, output_stride, values_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Quadrilateral &&
+        order == 1 &&
+        values_out == nullptr &&
+        gradients_out != nullptr &&
+        hessians_out == nullptr) {
+        evaluate_quad_order1_gradients_strided(points, output_stride, gradients_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Quadrilateral &&
+        order == 1 &&
+        values_out == nullptr &&
+        gradients_out == nullptr &&
+        hessians_out != nullptr) {
+        evaluate_quad_order1_hessians_strided(points.size(), output_stride, hessians_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Quadrilateral &&
+        order == 1 &&
+        points.size() == 4u &&
+        values_out != nullptr &&
+        gradients_out != nullptr &&
+        hessians_out != nullptr) {
+        evaluate_quad_order1_all_q4(points, output_stride, values_out, gradients_out, hessians_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Quadrilateral &&
+        order == 2 &&
+        values_out != nullptr &&
+        gradients_out == nullptr &&
+        hessians_out == nullptr) {
+        evaluate_quad_order2_values_strided(points, output_stride, values_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Quadrilateral &&
+        order == 2 &&
+        values_out == nullptr &&
+        gradients_out != nullptr &&
+        hessians_out == nullptr) {
+        evaluate_quad_order2_gradients_strided(points, output_stride, gradients_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Quadrilateral &&
+        order == 2 &&
+        values_out == nullptr &&
+        gradients_out == nullptr &&
+        hessians_out != nullptr) {
+        evaluate_quad_order2_hessians_strided(points, output_stride, hessians_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Quadrilateral &&
+        order == 2 &&
+        points.size() == 4u &&
+        values_out != nullptr &&
+        gradients_out != nullptr &&
+        hessians_out != nullptr) {
+        evaluate_quad_order2_all_q4(points, output_stride, values_out, gradients_out, hessians_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Quadrilateral &&
+        order == 3 &&
+        values_out != nullptr &&
+        gradients_out == nullptr &&
+        hessians_out == nullptr) {
+        evaluate_quad_order3_values_strided(points, output_stride, values_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Quadrilateral &&
+        order == 3 &&
+        values_out == nullptr &&
+        gradients_out != nullptr &&
+        hessians_out == nullptr) {
+        evaluate_quad_order3_gradients_strided(points, output_stride, gradients_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Quadrilateral &&
+        order == 3 &&
+        values_out == nullptr &&
+        gradients_out == nullptr &&
+        hessians_out != nullptr) {
+        evaluate_quad_order3_hessians_strided(points, output_stride, hessians_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Quadrilateral &&
+        order == 3 &&
+        points.size() == 4u &&
+        values_out != nullptr &&
+        gradients_out != nullptr &&
+        hessians_out != nullptr) {
+        evaluate_quad_order3_all_q4(points, output_stride, values_out, gradients_out, hessians_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Hexahedron &&
+        order == 2 &&
+        values_out != nullptr &&
+        gradients_out == nullptr &&
+        hessians_out == nullptr) {
+        evaluate_hex_order2_values_strided(points, output_stride, values_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Hexahedron &&
+        order == 2 &&
+        points.size() == 4u &&
+        values_out == nullptr &&
+        gradients_out != nullptr &&
+        hessians_out == nullptr) {
+        evaluate_hex_order2_gradients_q4(points, output_stride, gradients_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Hexahedron &&
+        order == 2 &&
+        points.size() == 4u &&
+        values_out == nullptr &&
+        gradients_out == nullptr &&
+        hessians_out != nullptr) {
+        evaluate_hex_order2_hessians_q4(points, output_stride, hessians_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Hexahedron &&
+        order == 2 &&
+        points.size() == 4u &&
+        values_out != nullptr &&
+        gradients_out != nullptr &&
+        hessians_out != nullptr) {
+        evaluate_hex_order2_values_strided(points, output_stride, values_out);
+        evaluate_hex_order2_gradients_q4(points, output_stride, gradients_out);
+        evaluate_hex_order2_hessians_q4(points, output_stride, hessians_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Hexahedron &&
+        order == 3 &&
+        points.size() == 4u &&
+        output_stride == 4u &&
+        hessians_out != nullptr) {
+        return false;
+    }
+    if (topology == LagrangeTopology::Hexahedron &&
+        order == 3 &&
+        points.size() == 4u &&
+        values_out != nullptr &&
+        gradients_out == nullptr &&
+        hessians_out == nullptr) {
+        evaluate_hex_order3_values_q4(points, output_stride, values_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Hexahedron &&
+        order == 3 &&
+        points.size() == 4u &&
+        values_out == nullptr &&
+        gradients_out != nullptr &&
+        hessians_out == nullptr) {
+        evaluate_hex_order3_gradients_q4(points, output_stride, gradients_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Hexahedron &&
+        order == 3 &&
+        points.size() == 4u &&
+        values_out == nullptr &&
+        gradients_out == nullptr &&
+        hessians_out != nullptr) {
+        evaluate_hex_order3_hessians_q4(points, output_stride, hessians_out);
+        return true;
+    }
+    if (topology == LagrangeTopology::Hexahedron &&
+        order == 3 &&
+        points.size() == 4u &&
+        values_out != nullptr &&
+        gradients_out != nullptr &&
+        hessians_out != nullptr) {
+        evaluate_hex_order3_all_q4(points, output_stride, values_out, gradients_out, hessians_out);
+        return true;
+    }
+    if (hessians_out != nullptr && order > 1 &&
+        (topology == LagrangeTopology::Quadrilateral ||
+         topology == LagrangeTopology::Hexahedron)) {
+        return false;
+    }
+    if (hessians_out != nullptr) {
+        const bool hessians_only = values_out == nullptr && gradients_out == nullptr;
+        if (order == 1) {
+            if (topology == LagrangeTopology::Triangle && hessians_only) {
+                evaluate_fast_outputs_with_constant_hessians_strided<LagrangeTriFast<1>>(
+                    points, output_stride, values_out, gradients_out, hessians_out);
+                return true;
+            }
+            if (topology == LagrangeTopology::Tetrahedron) {
+                evaluate_fast_outputs_with_constant_hessians_strided<LagrangeTetFast<1>>(
+                    points, output_stride, values_out, gradients_out, hessians_out);
+                return true;
+            }
+        } else if (order == 2) {
+            if (topology == LagrangeTopology::Triangle && hessians_only) {
+                evaluate_fast_outputs_with_constant_hessians_strided<LagrangeTriFast<2>>(
+                    points, output_stride, values_out, gradients_out, hessians_out);
+                return true;
+            }
+            if (topology == LagrangeTopology::Tetrahedron) {
+                evaluate_fast_outputs_with_constant_hessians_strided<LagrangeTetFast<2>>(
+                    points, output_stride, values_out, gradients_out, hessians_out);
+                return true;
+            }
+        }
+    }
+
+    switch (order) {
+        case 1:
+            return evaluate_fixed_lagrange_fast_strided_order<1>(
+                topology, points, output_stride, values_out, gradients_out, hessians_out);
+        case 2:
+            return evaluate_fixed_lagrange_fast_strided_order<2>(
+                topology, points, output_stride, values_out, gradients_out, hessians_out);
+        case 3:
+            return evaluate_fixed_lagrange_fast_strided_order<3>(
+                topology, points, output_stride, values_out, gradients_out, hessians_out);
+        default:
+            return false;
+    }
+}
+
+bool evaluate_fixed_lagrange_fast_to(LagrangeTopology topology,
+                                     int order,
+                                     const math::Vector<Real, 3>& xi,
+                                     Real* SVMP_RESTRICT values_out,
+                                     Real* SVMP_RESTRICT gradients_out,
+                                     Real* SVMP_RESTRICT hessians_out) {
+    switch (order) {
+        case 1:
+            return evaluate_fixed_lagrange_fast_to_order<1>(
+                topology, xi, values_out, gradients_out, hessians_out);
+        case 2:
+            return evaluate_fixed_lagrange_fast_to_order<2>(
+                topology, xi, values_out, gradients_out, hessians_out);
+        case 3:
+            return evaluate_fixed_lagrange_fast_to_order<3>(
+                topology, xi, values_out, gradients_out, hessians_out);
+        default:
+            return false;
+    }
+}
+
+template<std::size_t N>
+struct AxisMonomialCoefficientTable {
+    std::array<Real, N * N> values{};
+    std::array<Real, N * (N > 1 ? N - 1 : 0)> first{};
+    std::array<Real, N * (N > 2 ? N - 2 : 0)> second{};
+};
+
+template<std::size_t N>
+constexpr AxisMonomialCoefficientTable<N> make_axis_monomial_coefficient_table() {
+    AxisMonomialCoefficientTable<N> table{};
+    std::array<Real, N> nodes{};
+    constexpr int order = static_cast<int>(N) - 1;
+    for (std::size_t i = 0; i < N; ++i) {
+        nodes[i] = detail::equispaced_pm_one_coord(static_cast<int>(i), order);
+    }
+
+    for (std::size_t i = 0; i < N; ++i) {
+        std::array<Real, N> coeffs{};
+        std::array<Real, N> next{};
+        coeffs[0] = Real(1);
+        std::size_t degree = 0;
+        for (std::size_t j = 0; j < N; ++j) {
+            if (j == i) {
+                continue;
+            }
+            next = {};
+            for (std::size_t k = 0; k <= degree; ++k) {
+                next[k] -= nodes[j] * coeffs[k];
+                next[k + 1] += coeffs[k];
+            }
+            coeffs = next;
+            ++degree;
+        }
+
+        Real denominator = Real(1);
+        for (std::size_t j = 0; j < N; ++j) {
+            if (j != i) {
+                denominator *= nodes[i] - nodes[j];
+            }
+        }
+        const Real inv_denominator = Real(1) / denominator;
+        for (std::size_t k = 0; k < N; ++k) {
+            table.values[i * N + k] = coeffs[k] * inv_denominator;
+        }
+        if constexpr (N >= 2) {
+            for (std::size_t k = 1; k < N; ++k) {
+                table.first[i * (N - 1) + (k - 1)] =
+                    static_cast<Real>(k) * table.values[i * N + k];
+            }
+        }
+        if constexpr (N >= 3) {
+            for (std::size_t k = 2; k < N; ++k) {
+                table.second[i * (N - 2) + (k - 2)] =
+                    static_cast<Real>(k * (k - 1)) * table.values[i * N + k];
+            }
+        }
+    }
+
+    return table;
+}
+
+template<std::size_t N>
+void assign_axis_coefficient_table(const AxisMonomialCoefficientTable<N>& table,
+                                   std::vector<Real>& values,
+                                   std::vector<Real>& first,
+                                   std::vector<Real>& second) {
+    assign_array(values, table.values);
+    assign_array(first, table.first);
+    assign_array(second, table.second);
+}
+
+bool assign_precomputed_axis_coefficients(int n_axis,
+                                          std::vector<Real>& values,
+                                          std::vector<Real>& first,
+                                          std::vector<Real>& second) {
+    static constexpr auto kAxisCoefficients1 = make_axis_monomial_coefficient_table<1>();
+    static constexpr auto kAxisCoefficients2 = make_axis_monomial_coefficient_table<2>();
+    static constexpr auto kAxisCoefficients3 = make_axis_monomial_coefficient_table<3>();
+    static constexpr auto kAxisCoefficients4 = make_axis_monomial_coefficient_table<4>();
+    static constexpr auto kAxisCoefficients5 = make_axis_monomial_coefficient_table<5>();
+
+    switch (n_axis) {
+        case 1:
+            assign_axis_coefficient_table(kAxisCoefficients1, values, first, second);
+            return true;
+        case 2:
+            assign_axis_coefficient_table(kAxisCoefficients2, values, first, second);
+            return true;
+        case 3:
+            assign_axis_coefficient_table(kAxisCoefficients3, values, first, second);
+            return true;
+        case 4:
+            assign_axis_coefficient_table(kAxisCoefficients4, values, first, second);
+            return true;
+        case 5:
+            assign_axis_coefficient_table(kAxisCoefficients5, values, first, second);
+            return true;
+        default:
+            return false;
+    }
+}
+
+LagrangeTopologyTraits lagrange_topology_traits(ElementType type) {
+    const auto topo = topology(type);
+    if (topo != LagrangeTopology::Unknown) {
+        return {topo, reference_dimension(type)};
+    }
+
+    throw BasisElementCompatibilityException("Unsupported element type for LagrangeBasis",
+                                             __FILE__, __LINE__, __func__);
+}
+
+std::size_t lattice_index_pm_one(Real coord, int order, const char* context) {
+    if (order <= 0) {
+        if (!coordinate_matches_expected(coord, Real(0))) {
+            throw BasisNodeOrderingException(context, __FILE__, __LINE__, __func__);
+        }
+        return 0;
+    }
+
+    const Real scaled = (coord + Real(1)) * static_cast<Real>(order) / Real(2);
+    const long idx = std::lround(scaled);
+    if (idx < 0 || idx > order ||
+        !coordinate_matches_expected(
+            coord,
+            detail::equispaced_pm_one_coord(static_cast<int>(idx), order))) {
+        throw BasisNodeOrderingException(context, __FILE__, __LINE__, __func__);
+    }
+    return static_cast<std::size_t>(idx);
+}
+
+int simplex_lattice_index(Real coord, int order, const char* context) {
+    if (order <= 0) {
+        if (!coordinate_matches_expected(coord, Real(0)) &&
+            !coordinate_matches_expected(coord, Real(0.25)) &&
+            !coordinate_matches_expected(coord, Real(1) / Real(3))) {
+            throw BasisNodeOrderingException(context, __FILE__, __LINE__, __func__);
+        }
+        return 0;
+    }
+
+    const Real scaled = coord * static_cast<Real>(order);
+    const long idx = std::lround(scaled);
+    const Real reconstructed = static_cast<Real>(idx) / static_cast<Real>(order);
+    if (idx < 0 || idx > order || !coordinate_matches_expected(coord, reconstructed)) {
+        throw BasisNodeOrderingException(context, __FILE__, __LINE__, __func__);
+    }
+    return static_cast<int>(idx);
+}
+
+std::array<int, 4> triangle_exponents_from_public_node(const math::Vector<Real, 3>& node,
+                                                       int order) {
+    if (order == 0) {
+        return {0, 0, 0, 0};
+    }
+
+    const int j = simplex_lattice_index(node[0], order,
+                                        "LagrangeBasis: invalid triangle node coordinate for public ordering");
+    const int k = simplex_lattice_index(node[1], order,
+                                        "LagrangeBasis: invalid triangle node coordinate for public ordering");
+    const int i = order - j - k;
+    if (i < 0) {
+        throw BasisNodeOrderingException("LagrangeBasis: invalid triangle barycentric coordinates for public ordering",
+                                         __FILE__, __LINE__, __func__);
+    }
+    return {i, j, k, 0};
+}
+
+std::array<int, 4> tetrahedron_exponents_from_public_node(const math::Vector<Real, 3>& node,
+                                                          int order) {
+    if (order == 0) {
+        return {0, 0, 0, 0};
+    }
+
+    const int j = simplex_lattice_index(node[0], order,
+                                        "LagrangeBasis: invalid tetrahedron node x-coordinate for public ordering");
+    const int k = simplex_lattice_index(node[1], order,
+                                        "LagrangeBasis: invalid tetrahedron node y-coordinate for public ordering");
+    const int l = simplex_lattice_index(node[2], order,
+                                        "LagrangeBasis: invalid tetrahedron node z-coordinate for public ordering");
+    const int i = order - j - k - l;
+    if (i < 0) {
+        throw BasisNodeOrderingException("LagrangeBasis: invalid tetrahedron barycentric coordinates for public ordering",
+                                         __FILE__, __LINE__, __func__);
+    }
+    return {i, j, k, l};
+}
+
+struct NormalizedLagrangeRequest {
+    ElementType element_type;
+    int order;
+};
+
+// Non-owning view of the per-axis 1D Lagrange basis evaluations
+// (values, first derivative, second derivative), each of length `size`.
+struct AxisBasisEvaluations {
+    const Real* values;
+    const Real* first;
+    const Real* second;
+    std::size_t size;
+};
+
+AxisBasisEvaluations constant_axis_basis() {
+    static const Real kOne[1]  = {Real(1)};
+    static const Real kZero[1] = {Real(0)};
+    return AxisBasisEvaluations{kOne, kZero, kZero, 1};
+}
+
+// Horner-form evaluator for the precomputed 1D Lagrange basis.
+//
+// Inputs are precomputed monomial coefficients of L_i(x), L_i'(x), L_i''(x)
+// (built once at LagrangeBasis construction). Evaluation is purely
+// multiply-add on the coefficients — no divisions and no node-position
+// lookups in the hot path. Templated on N for compile-time loop unrolling
+// and FMA-friendly straight-line code on the common Hex/Quad/Line orders.
+//
+// Layout:
+//   v_coeffs:  N * N entries; row i holds [c_i0, c_i1, ..., c_i(N-1)]
+//              such that L_i(x) = sum_k c_ik * x^k
+//   d_coeffs:  N * (N-1) entries; row i holds derivative coefficients of L_i'(x)
+//   d2_coeffs: N * (N-2) entries; row i holds coefficients of L_i''(x)
+//              (only valid when N >= 3)
+template<int N>
+inline void evaluate_1d_horner_impl(const Real* v_coeffs,
+                                    const Real* d_coeffs,
+                                    const Real* d2_coeffs,
+                                    Real xi,
+                                    Real* values, Real* first, Real* second) {
+    if constexpr (N == 1) {
+        values[0] = v_coeffs[0];
+        if (first)  first[0]  = Real(0);
+        if (second) second[0] = Real(0);
+        return;
+    } else {
+        // Values: degree N-1 polynomials.
+        for (int i = 0; i < N; ++i) {
+            const Real* c = v_coeffs + i * N;
+            Real r = c[N - 1];
+            for (int k = N - 1; k > 0; --k) {
+                r = r * xi + c[k - 1];
+            }
+            values[i] = r;
+        }
+
+        if (!first && !second) return;
+
+        if (first) {
+            // First derivatives: degree N-2 polynomials (per row of d_coeffs).
+            for (int i = 0; i < N; ++i) {
+                const Real* c = d_coeffs + i * (N - 1);
+                Real r = c[N - 2];
+                for (int k = N - 2; k > 0; --k) {
+                    r = r * xi + c[k - 1];
+                }
+                first[i] = r;
+            }
+        }
+
+        if (!second) return;
+
+        if constexpr (N <= 2) {
+            for (int i = 0; i < N; ++i) second[i] = Real(0);
+        } else {
+            // Second derivatives: degree N-3 polynomials (per row of d2_coeffs).
+            for (int i = 0; i < N; ++i) {
+                const Real* c = d2_coeffs + i * (N - 2);
+                Real r = c[N - 3];
+                for (int k = N - 3; k > 0; --k) {
+                    r = r * xi + c[k - 1];
+                }
+                second[i] = r;
+            }
+        }
+    }
+}
+
+void fill_equispaced_barycentric_weights(int n_axis, Real* weights) {
+    const int order = n_axis - 1;
+    Real weight = (order % 2 == 0) ? Real(1) : Real(-1);
+    Real max_abs = Real(0);
+    for (int i = 0; i < n_axis; ++i) {
+        weights[i] = weight;
+        max_abs = std::max(max_abs, std::abs(weight));
+        if (i < order) {
+            weight *= -static_cast<Real>(order - i) / static_cast<Real>(i + 1);
+        }
+    }
+
+    if (max_abs > Real(0)) {
+        const Real inv_scale = Real(1) / max_abs;
+        for (int i = 0; i < n_axis; ++i) {
+            weights[i] *= inv_scale;
+        }
+    }
+}
+
+bool coordinate_matches_axis_node(Real xi, Real node) {
+    return coordinate_matches_expected(xi, node);
+}
+
+struct CompensatedSum {
+    Real sum{Real(0)};
+    Real compensation{Real(0)};
+
+    void add(Real value) noexcept {
+        const Real y = value - compensation;
+        const Real t = sum + y;
+        compensation = (t - sum) - y;
+        sum = t;
+    }
+};
+
+void distribute_residual_by_abs(int n_axis, Real* values, Real residual) {
+    if (values == nullptr || n_axis <= 0 || residual == Real(0)) {
+        return;
+    }
+
+    CompensatedSum abs_sum;
+    int largest_index = 0;
+    Real largest_abs = Real(0);
+    for (int i = 0; i < n_axis; ++i) {
+        const Real magnitude = std::abs(values[i]);
+        abs_sum.add(magnitude);
+        if (magnitude > largest_abs) {
+            largest_abs = magnitude;
+            largest_index = i;
+        }
+    }
+
+    if (abs_sum.sum <= Real(0)) {
+        values[0] += residual;
+        return;
+    }
+
+    const Real inv_abs_sum = Real(1) / abs_sum.sum;
+    CompensatedSum applied;
+    for (int i = 0; i < n_axis; ++i) {
+        const Real correction = residual * std::abs(values[i]) * inv_abs_sum;
+        values[i] += correction;
+        applied.add(correction);
+    }
+    values[largest_index] += residual - applied.sum;
+}
+
+void evaluate_1d_barycentric_runtime(int n_axis,
+                                     Real xi,
+                                     const Real* weights,
+                                     Real* values,
+                                     Real* first,
+                                     Real* second) {
+    const int order = n_axis - 1;
+    BASIS_CHECK_EVAL(weights != nullptr,
+                     "LagrangeBasis: missing cached barycentric weights for runtime axis evaluation");
+
+    int node_index = -1;
+    for (int i = 0; i < n_axis; ++i) {
+        const Real node = detail::equispaced_pm_one_coord(i, order);
+        if (coordinate_matches_axis_node(xi, node)) {
+            node_index = i;
+            break;
+        }
+    }
+
+    if (node_index >= 0) {
+        std::fill(values, values + n_axis, Real(0));
+        values[node_index] = Real(1);
+        if (!first && !second) {
+            return;
+        }
+
+        const Real xk = detail::equispaced_pm_one_coord(node_index, order);
+        const Real wk = weights[static_cast<std::size_t>(node_index)];
+        Real reciprocal_sum = Real(0);
+        if (second) {
+            for (int m = 0; m < n_axis; ++m) {
+                if (m == node_index) {
+                    continue;
+                }
+                const Real xm = detail::equispaced_pm_one_coord(m, order);
+                reciprocal_sum += Real(1) / (xk - xm);
+            }
+        }
+
+        Real first_diagonal = Real(0);
+        Real second_diagonal = Real(0);
+        if (first) {
+            std::fill(first, first + n_axis, Real(0));
+        }
+        if (second) {
+            std::fill(second, second + n_axis, Real(0));
+        }
+
+        for (int j = 0; j < n_axis; ++j) {
+            if (j == node_index) {
+                continue;
+            }
+            const Real xj = detail::equispaced_pm_one_coord(j, order);
+            const Real distance = xk - xj;
+            const Real offdiag_first = weights[static_cast<std::size_t>(j)] / (wk * distance);
+            first_diagonal -= offdiag_first;
+            if (first) {
+                first[j] = offdiag_first;
+            }
+            if (second) {
+                const Real offdiag_second =
+                    Real(2) * offdiag_first * (reciprocal_sum - Real(1) / distance);
+                second[j] = offdiag_second;
+                second_diagonal -= offdiag_second;
+            }
+        }
+        if (first) {
+            first[node_index] = first_diagonal;
+        }
+        if (second) {
+            second[node_index] = second_diagonal;
+        }
+        return;
+    }
+
+    Real sum0 = Real(0);
+    Real sum1 = Real(0);
+    Real sum2 = Real(0);
+    for (int i = 0; i < n_axis; ++i) {
+        const Real node = detail::equispaced_pm_one_coord(i, order);
+        const Real inv_distance = Real(1) / (xi - node);
+        const Real weighted = weights[static_cast<std::size_t>(i)] * inv_distance;
+        sum0 += weighted;
+        sum1 += weighted * inv_distance;
+        sum2 += weighted * inv_distance * inv_distance;
+    }
+
+    const Real inv_sum0 = Real(1) / sum0;
+    const Real first_ratio = sum1 * inv_sum0;
+    const Real second_ratio = sum2 * inv_sum0;
+    const Real first_ratio_sq = first_ratio * first_ratio;
+
+    CompensatedSum value_sum;
+    CompensatedSum first_sum;
+    CompensatedSum second_sum;
+    for (int i = 0; i < n_axis; ++i) {
+        const Real node = detail::equispaced_pm_one_coord(i, order);
+        const Real inv_distance = Real(1) / (xi - node);
+        const Real value = weights[static_cast<std::size_t>(i)] * inv_distance * inv_sum0;
+        values[i] = value;
+        value_sum.add(value);
+        if (first || second) {
+            const Real derivative_factor = first_ratio - inv_distance;
+            if (first) {
+                first[i] = value * derivative_factor;
+                first_sum.add(first[i]);
+            }
+            if (second) {
+                second[i] = value * (derivative_factor * derivative_factor +
+                                     inv_distance * inv_distance -
+                                     Real(2) * second_ratio +
+                                     first_ratio_sq);
+                second_sum.add(second[i]);
+            }
+        }
+    }
+
+    distribute_residual_by_abs(n_axis, values, Real(1) - value_sum.sum);
+    if (first) {
+        distribute_residual_by_abs(n_axis, first, -first_sum.sum);
+    }
+    if (second) {
+        distribute_residual_by_abs(n_axis, second, -second_sum.sum);
+    }
+}
+
+// 1D Lagrange-basis evaluator. Writes n_axis entries to each non-null output
+// buffer. Dispatches to compile-time Horner specializations for sizes 1..9
+// (orders 0..8 — the Lagrange performance sweep) and uses barycentric
+// evaluation above that threshold to avoid high-order monomial conditioning
+// issues.
+void evaluate_1d_basis_to(const Real* v_coeffs,
+                          const Real* d_coeffs,
+                          const Real* d2_coeffs,
+                          const Real* barycentric_weights,
+                          int n_axis, Real xi,
+                          Real* values, Real* first, Real* second) {
+    switch (n_axis) {
+        case 1: evaluate_1d_horner_impl<1>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); return;
+        case 2: evaluate_1d_horner_impl<2>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); return;
+        case 3: evaluate_1d_horner_impl<3>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); return;
+        case 4: evaluate_1d_horner_impl<4>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); return;
+        case 5: evaluate_1d_horner_impl<5>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); return;
+        case 6: evaluate_1d_horner_impl<6>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); return;
+        case 7: evaluate_1d_horner_impl<7>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); return;
+        case 8: evaluate_1d_horner_impl<8>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); return;
+        case 9: evaluate_1d_horner_impl<9>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); return;
+        default:
+            evaluate_1d_barycentric_runtime(n_axis, xi, barycentric_weights, values, first, second);
+            return;
+    }
+}
+
+// Selects which derivative passes are computed by the 1D evaluator.
+enum class AxisDeriv {
+    ValuesOnly,           // skip first and second
+    ValuesAndFirst,       // for gradients
+    ValuesAndFirstAndSecond, // for hessians or fused evaluate_all
+};
+
+// Per-axis storage (values, first derivative, second derivative). Backed by
+// per-thread scratch that grows lazily; subsequent calls reuse capacity with no
+// reallocation.
+struct AxisScratch {
+    std::vector<Real> values;
+    std::vector<Real> first;
+    std::vector<Real> second;
+
+    void reserveFor(std::size_t n) {
+        if (values.size() < n) values.resize(n);
+        if (first.size() < n) first.resize(n);
+        if (second.size() < n) second.resize(n);
+    }
+};
+
+struct AxisBatchScratch {
+    std::vector<Real> values;
+    std::vector<Real> first;
+    std::vector<Real> second;
+
+    void resizeFor(std::size_t count, AxisDeriv level) {
+        if (values.size() < count) values.resize(count);
+        if (level != AxisDeriv::ValuesOnly && first.size() < count) first.resize(count);
+        if (level == AxisDeriv::ValuesAndFirstAndSecond && second.size() < count) second.resize(count);
+    }
+};
+
+template<int Order, bool NeedFirst, bool NeedSecond>
+inline void fill_simplex_factor_sequence_fixed(Real lambda,
+                                               Real* SVMP_RESTRICT phi,
+                                               Real* SVMP_RESTRICT dphi,
+                                               Real* SVMP_RESTRICT d2phi) {
+    static_assert(!NeedSecond || NeedFirst,
+                  "second derivative factors require first-derivative recurrence state");
+    phi[0] = Real(1);
+    if constexpr (NeedFirst) {
+        dphi[0] = Real(0);
+    }
+    if constexpr (NeedSecond) {
+        d2phi[0] = Real(0);
+    }
+
+    const Real t = static_cast<Real>(Order) * lambda;
+    const Real dt_dlambda = static_cast<Real>(Order);
+    Real dphi_dt_prev = Real(0);
+    Real d2phi_dt2_prev = Real(0);
+    for (int a = 1; a <= Order; ++a) {
+        const std::size_t au = static_cast<std::size_t>(a);
+        const Real inv_a = Real(1) / static_cast<Real>(a);
+        const Real s = (t - static_cast<Real>(a - 1)) * inv_a;
+        phi[au] = s * phi[au - 1];
+
+        if constexpr (NeedFirst) {
+            const Real dphi_dt_old = dphi_dt_prev;
+            const Real dphi_dt = inv_a * phi[au - 1] + s * dphi_dt_old;
+            dphi[au] = dt_dlambda * dphi_dt;
+
+            if constexpr (NeedSecond) {
+                const Real d2phi_dt2 = Real(2) * inv_a * dphi_dt_old + s * d2phi_dt2_prev;
+                d2phi[au] = dt_dlambda * dt_dlambda * d2phi_dt2;
+                d2phi_dt2_prev = d2phi_dt2;
+            }
+            dphi_dt_prev = dphi_dt;
+        }
+    }
+}
+
+template<int Order, bool NeedSecond>
+inline void fill_triangle_factors_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    Real (&phi0)[4][Order + 1],
+    Real (&phi1)[4][Order + 1],
+    Real (&phi2)[4][Order + 1],
+    Real (&dphi0)[4][Order + 1],
+    Real (&dphi1)[4][Order + 1],
+    Real (&dphi2)[4][Order + 1],
+    Real (&d2phi0)[4][Order + 1],
+    Real (&d2phi1)[4][Order + 1],
+    Real (&d2phi2)[4][Order + 1]) {
+    for (std::size_t q = 0; q < 4u; ++q) {
+        const auto& xi = points[q];
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l0 = Real(1) - l1 - l2;
+        if constexpr (NeedSecond) {
+            fill_simplex_factor_sequence_fixed<Order, true, true>(
+                l0, phi0[q], dphi0[q], d2phi0[q]);
+            fill_simplex_factor_sequence_fixed<Order, true, true>(
+                l1, phi1[q], dphi1[q], d2phi1[q]);
+            fill_simplex_factor_sequence_fixed<Order, true, true>(
+                l2, phi2[q], dphi2[q], d2phi2[q]);
+        } else {
+            fill_simplex_factor_sequence_fixed<Order, true, false>(
+                l0, phi0[q], dphi0[q], nullptr);
+            fill_simplex_factor_sequence_fixed<Order, true, false>(
+                l1, phi1[q], dphi1[q], nullptr);
+            fill_simplex_factor_sequence_fixed<Order, true, false>(
+                l2, phi2[q], dphi2[q], nullptr);
+        }
+    }
+}
+
+template<std::size_t Q>
+inline void write_wedge_gradient_strided_q(std::size_t tri_stride,
+                                           std::size_t axis_stride,
+                                           std::size_t tri,
+                                           std::size_t z,
+                                           std::size_t output_stride,
+                                           const Real* SVMP_RESTRICT tri_values,
+                                           const Real* SVMP_RESTRICT tri_g,
+                                           const AxisBatchScratch& axis_batch,
+                                           Real* SVMP_RESTRICT g) {
+    const std::size_t tri_q = tri * tri_stride + Q;
+    const std::size_t z_q = Q * axis_stride + z;
+    const Real tri_v = tri_values[tri_q];
+    const Real zv = axis_batch.values[z_q];
+    g[0u * output_stride + Q] = tri_g[0u * tri_stride + Q] * zv;
+    g[1u * output_stride + Q] = tri_g[1u * tri_stride + Q] * zv;
+    g[2u * output_stride + Q] = tri_v * axis_batch.first[z_q];
+}
+
+template<std::size_t Q>
+inline void write_wedge_gradient_stride4_q(std::size_t tri_stride,
+                                           std::size_t axis_stride,
+                                           std::size_t tri,
+                                           std::size_t z,
+                                           const Real* SVMP_RESTRICT tri_values,
+                                           const Real* SVMP_RESTRICT tri_g,
+                                           const AxisBatchScratch& axis_batch,
+                                           Real* SVMP_RESTRICT g) {
+    const std::size_t tri_q = tri * tri_stride + Q;
+    const std::size_t z_q = Q * axis_stride + z;
+    const Real tri_v = tri_values[tri_q];
+    const Real zv = axis_batch.values[z_q];
+    g[Q] = tri_g[0u * tri_stride + Q] * zv;
+    g[4u + Q] = tri_g[1u * tri_stride + Q] * zv;
+    g[8u + Q] = tri_v * axis_batch.first[z_q];
+}
+
+template<std::size_t Q>
+inline void write_wedge_hessian_strided_q(std::size_t tri_stride,
+                                          std::size_t axis_stride,
+                                          std::size_t tri,
+                                          std::size_t z,
+                                          std::size_t output_stride,
+                                          const Real* SVMP_RESTRICT tri_values,
+                                          const Real* SVMP_RESTRICT tri_g,
+                                          const Real* SVMP_RESTRICT tri_H,
+                                          const AxisBatchScratch& axis_batch,
+                                          Real* SVMP_RESTRICT H) {
+    const std::size_t tri_q = tri * tri_stride + Q;
+    const std::size_t z_q = Q * axis_stride + z;
+    const Real tri_v = tri_values[tri_q];
+    const Real zv = axis_batch.values[z_q];
+    const Real zd = axis_batch.first[z_q];
+    const Real tri_gx = tri_g[0u * tri_stride + Q];
+    const Real tri_gy = tri_g[1u * tri_stride + Q];
+    const Real tri_hxx = tri_H[0u * tri_stride + Q];
+    const Real tri_hxy = tri_H[1u * tri_stride + Q];
+    const Real tri_hyy = tri_H[2u * tri_stride + Q];
+    const Real hxz = tri_gx * zd;
+    const Real hxy = tri_hxy * zv;
+    const Real hyz = tri_gy * zd;
+
+    H[0u * output_stride + Q] = tri_hxx * zv;
+    H[1u * output_stride + Q] = hxy;
+    H[2u * output_stride + Q] = hxz;
+    H[3u * output_stride + Q] = hxy;
+    H[4u * output_stride + Q] = tri_hyy * zv;
+    H[5u * output_stride + Q] = hyz;
+    H[6u * output_stride + Q] = hxz;
+    H[7u * output_stride + Q] = hyz;
+    H[8u * output_stride + Q] = tri_v * axis_batch.second[z_q];
+}
+
+template<std::size_t Q>
+inline void write_wedge_hessian_stride4_q(std::size_t tri_stride,
+                                          std::size_t axis_stride,
+                                          std::size_t tri,
+                                          std::size_t z,
+                                          const Real* SVMP_RESTRICT tri_values,
+                                          const Real* SVMP_RESTRICT tri_g,
+                                          const Real* SVMP_RESTRICT tri_H,
+                                          const AxisBatchScratch& axis_batch,
+                                          Real* SVMP_RESTRICT H) {
+    const std::size_t tri_q = tri * tri_stride + Q;
+    const std::size_t z_q = Q * axis_stride + z;
+    const Real tri_v = tri_values[tri_q];
+    const Real zv = axis_batch.values[z_q];
+    const Real zd = axis_batch.first[z_q];
+    const Real tri_gx = tri_g[0u * tri_stride + Q];
+    const Real tri_gy = tri_g[1u * tri_stride + Q];
+    const Real tri_hxx = tri_H[0u * tri_stride + Q];
+    const Real tri_hxy = tri_H[1u * tri_stride + Q];
+    const Real tri_hyy = tri_H[2u * tri_stride + Q];
+    const Real hxz = tri_gx * zd;
+    const Real hxy = tri_hxy * zv;
+    const Real hyz = tri_gy * zd;
+
+    H[Q] = tri_hxx * zv;
+    H[4u + Q] = hxy;
+    H[8u + Q] = hxz;
+    H[12u + Q] = hxy;
+    H[16u + Q] = tri_hyy * zv;
+    H[20u + Q] = hyz;
+    H[24u + Q] = hxz;
+    H[28u + Q] = hyz;
+    H[32u + Q] = tri_v * axis_batch.second[z_q];
+}
+
+template<std::size_t Q>
+inline void write_wedge_all_strided_q(std::size_t tri_stride,
+                                      std::size_t axis_stride,
+                                      std::size_t tri,
+                                      std::size_t z,
+                                      std::size_t output_stride,
+                                      const Real* SVMP_RESTRICT tri_values,
+                                      const Real* SVMP_RESTRICT tri_g,
+                                      const Real* SVMP_RESTRICT tri_H,
+                                      const AxisBatchScratch& axis_batch,
+                                      Real* SVMP_RESTRICT value_row,
+                                      Real* SVMP_RESTRICT g,
+                                      Real* SVMP_RESTRICT H) {
+    const std::size_t tri_q = tri * tri_stride + Q;
+    const std::size_t z_q = Q * axis_stride + z;
+    const Real tri_v = tri_values[tri_q];
+    const Real zv = axis_batch.values[z_q];
+    const Real zd = axis_batch.first[z_q];
+    const Real tri_gx = tri_g[0u * tri_stride + Q];
+    const Real tri_gy = tri_g[1u * tri_stride + Q];
+    const Real tri_hxx = tri_H[0u * tri_stride + Q];
+    const Real tri_hxy = tri_H[1u * tri_stride + Q];
+    const Real tri_hyy = tri_H[2u * tri_stride + Q];
+    const Real hxz = tri_gx * zd;
+    const Real hxy = tri_hxy * zv;
+    const Real hyz = tri_gy * zd;
+
+    value_row[Q] = tri_v * zv;
+    g[0u * output_stride + Q] = tri_gx * zv;
+    g[1u * output_stride + Q] = tri_gy * zv;
+    g[2u * output_stride + Q] = tri_v * zd;
+    H[0u * output_stride + Q] = tri_hxx * zv;
+    H[1u * output_stride + Q] = hxy;
+    H[2u * output_stride + Q] = hxz;
+    H[3u * output_stride + Q] = hxy;
+    H[4u * output_stride + Q] = tri_hyy * zv;
+    H[5u * output_stride + Q] = hyz;
+    H[6u * output_stride + Q] = hxz;
+    H[7u * output_stride + Q] = hyz;
+    H[8u * output_stride + Q] = tri_v * axis_batch.second[z_q];
+}
+
+template<std::size_t Q>
+inline void write_wedge_all_stride4_q(std::size_t tri_stride,
+                                      std::size_t axis_stride,
+                                      std::size_t tri,
+                                      std::size_t z,
+                                      const Real* SVMP_RESTRICT tri_values,
+                                      const Real* SVMP_RESTRICT tri_g,
+                                      const Real* SVMP_RESTRICT tri_H,
+                                      const AxisBatchScratch& axis_batch,
+                                      Real* SVMP_RESTRICT value_row,
+                                      Real* SVMP_RESTRICT g,
+                                      Real* SVMP_RESTRICT H) {
+    const std::size_t tri_q = tri * tri_stride + Q;
+    const std::size_t z_q = Q * axis_stride + z;
+    const Real tri_v = tri_values[tri_q];
+    const Real zv = axis_batch.values[z_q];
+    const Real zd = axis_batch.first[z_q];
+    const Real tri_gx = tri_g[0u * tri_stride + Q];
+    const Real tri_gy = tri_g[1u * tri_stride + Q];
+    const Real tri_hxx = tri_H[0u * tri_stride + Q];
+    const Real tri_hxy = tri_H[1u * tri_stride + Q];
+    const Real tri_hyy = tri_H[2u * tri_stride + Q];
+    const Real hxz = tri_gx * zd;
+    const Real hxy = tri_hxy * zv;
+    const Real hyz = tri_gy * zd;
+
+    value_row[Q] = tri_v * zv;
+    g[Q] = tri_gx * zv;
+    g[4u + Q] = tri_gy * zv;
+    g[8u + Q] = tri_v * zd;
+    H[Q] = tri_hxx * zv;
+    H[4u + Q] = hxy;
+    H[8u + Q] = hxz;
+    H[12u + Q] = hxy;
+    H[16u + Q] = tri_hyy * zv;
+    H[20u + Q] = hyz;
+    H[24u + Q] = hxz;
+    H[28u + Q] = hyz;
+    H[32u + Q] = tri_v * axis_batch.second[z_q];
+}
+
+template<int Order, bool NeedHess>
+bool evaluate_wedge_fused_stride4_q4(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    const std::vector<std::size_t>& wedge_node_by_tri_z,
+    const std::vector<math::Vector<Real, 3>>& points,
+    const AxisBatchScratch& axis_batch,
+    int n_axis,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    static_assert(Order >= 3 && Order <= 8, "fused wedge q4 path covers orders 3..8");
+    const std::size_t tri_count = simplex_exponents.size();
+    const std::size_t z_count = static_cast<std::size_t>(n_axis);
+    if (points.size() != 4u ||
+        z_count != static_cast<std::size_t>(Order + 1) ||
+        wedge_node_by_tri_z.size() != tri_count * z_count) {
+        return false;
+    }
+
+    Real phi0[4][Order + 1];
+    Real phi1[4][Order + 1];
+    Real phi2[4][Order + 1];
+    Real dphi0[4][Order + 1];
+    Real dphi1[4][Order + 1];
+    Real dphi2[4][Order + 1];
+    Real d2phi0[4][Order + 1];
+    Real d2phi1[4][Order + 1];
+    Real d2phi2[4][Order + 1];
+    fill_triangle_factors_q4<Order, NeedHess>(
+        points, phi0, phi1, phi2, dphi0, dphi1, dphi2, d2phi0, d2phi1, d2phi2);
+
+    for (std::size_t tri = 0; tri < tri_count; ++tri) {
+        const auto& e = simplex_exponents[tri];
+        const std::size_t i0 = static_cast<std::size_t>(e[0]);
+        const std::size_t i1 = static_cast<std::size_t>(e[1]);
+        const std::size_t i2 = static_cast<std::size_t>(e[2]);
+
+        Real tri_v[4];
+        Real tri_gx[4];
+        Real tri_gy[4];
+        Real tri_hxx[4];
+        Real tri_hxy[4];
+        Real tri_hyy[4];
+        for (std::size_t q = 0; q < 4u; ++q) {
+            const Real v0 = phi0[q][i0];
+            const Real v1 = phi1[q][i1];
+            const Real v2 = phi2[q][i2];
+            const Real D0 = dphi0[q][i0];
+            const Real D1 = dphi1[q][i1];
+            const Real D2 = dphi2[q][i2];
+            const Real dl0 = D0 * v1 * v2;
+            tri_v[q] = v0 * v1 * v2;
+            tri_gx[q] = v0 * D1 * v2 - dl0;
+            tri_gy[q] = v0 * v1 * D2 - dl0;
+
+            if constexpr (NeedHess) {
+                const Real DD0 = d2phi0[q][i0];
+                const Real DD1 = d2phi1[q][i1];
+                const Real DD2 = d2phi2[q][i2];
+                const Real H00 = DD0 * v1 * v2;
+                const Real H11 = v0 * DD1 * v2;
+                const Real H22 = v0 * v1 * DD2;
+                const Real H01 = D0 * D1 * v2;
+                const Real H02 = D0 * v1 * D2;
+                const Real H12 = v0 * D1 * D2;
+                tri_hxx[q] = H00 - Real(2) * H01 + H11;
+                tri_hxy[q] = H00 - H01 - H02 + H12;
+                tri_hyy[q] = H00 - Real(2) * H02 + H22;
+            }
+        }
+
+        for (std::size_t z = 0; z < z_count; ++z) {
+            const std::size_t node = wedge_node_by_tri_z[tri * z_count + z];
+            Real* SVMP_RESTRICT value_row =
+                values_out != nullptr ? values_out + node * 4u : nullptr;
+            Real* SVMP_RESTRICT g =
+                gradients_out != nullptr ? gradients_out + node * 12u : nullptr;
+            Real* SVMP_RESTRICT H =
+                hessians_out != nullptr ? hessians_out + node * 36u : nullptr;
+
+            const Real z0 = axis_batch.values[z];
+            const Real z1 = axis_batch.values[z_count + z];
+            const Real z2 = axis_batch.values[2u * z_count + z];
+            const Real z3 = axis_batch.values[3u * z_count + z];
+            const Real dz0 = axis_batch.first[z];
+            const Real dz1 = axis_batch.first[z_count + z];
+            const Real dz2 = axis_batch.first[2u * z_count + z];
+            const Real dz3 = axis_batch.first[3u * z_count + z];
+
+            if (value_row != nullptr) {
+                value_row[0] = tri_v[0] * z0;
+                value_row[1] = tri_v[1] * z1;
+                value_row[2] = tri_v[2] * z2;
+                value_row[3] = tri_v[3] * z3;
+            }
+            if (g != nullptr) {
+                g[0] = tri_gx[0] * z0;
+                g[1] = tri_gx[1] * z1;
+                g[2] = tri_gx[2] * z2;
+                g[3] = tri_gx[3] * z3;
+                g[4] = tri_gy[0] * z0;
+                g[5] = tri_gy[1] * z1;
+                g[6] = tri_gy[2] * z2;
+                g[7] = tri_gy[3] * z3;
+                g[8] = tri_v[0] * dz0;
+                g[9] = tri_v[1] * dz1;
+                g[10] = tri_v[2] * dz2;
+                g[11] = tri_v[3] * dz3;
+            }
+            if constexpr (NeedHess) {
+                if (H != nullptr) {
+                    const Real d2z0 = axis_batch.second[z];
+                    const Real d2z1 = axis_batch.second[z_count + z];
+                    const Real d2z2 = axis_batch.second[2u * z_count + z];
+                    const Real d2z3 = axis_batch.second[3u * z_count + z];
+                    const Real hxz0 = tri_gx[0] * dz0;
+                    const Real hxz1 = tri_gx[1] * dz1;
+                    const Real hxz2 = tri_gx[2] * dz2;
+                    const Real hxz3 = tri_gx[3] * dz3;
+                    const Real hyz0 = tri_gy[0] * dz0;
+                    const Real hyz1 = tri_gy[1] * dz1;
+                    const Real hyz2 = tri_gy[2] * dz2;
+                    const Real hyz3 = tri_gy[3] * dz3;
+                    H[0] = tri_hxx[0] * z0;
+                    H[1] = tri_hxx[1] * z1;
+                    H[2] = tri_hxx[2] * z2;
+                    H[3] = tri_hxx[3] * z3;
+                    H[4] = tri_hxy[0] * z0;
+                    H[5] = tri_hxy[1] * z1;
+                    H[6] = tri_hxy[2] * z2;
+                    H[7] = tri_hxy[3] * z3;
+                    H[8] = hxz0;
+                    H[9] = hxz1;
+                    H[10] = hxz2;
+                    H[11] = hxz3;
+                    H[12] = H[4];
+                    H[13] = H[5];
+                    H[14] = H[6];
+                    H[15] = H[7];
+                    H[16] = tri_hyy[0] * z0;
+                    H[17] = tri_hyy[1] * z1;
+                    H[18] = tri_hyy[2] * z2;
+                    H[19] = tri_hyy[3] * z3;
+                    H[20] = hyz0;
+                    H[21] = hyz1;
+                    H[22] = hyz2;
+                    H[23] = hyz3;
+                    H[24] = hxz0;
+                    H[25] = hxz1;
+                    H[26] = hxz2;
+                    H[27] = hxz3;
+                    H[28] = hyz0;
+                    H[29] = hyz1;
+                    H[30] = hyz2;
+                    H[31] = hyz3;
+                    H[32] = tri_v[0] * d2z0;
+                    H[33] = tri_v[1] * d2z1;
+                    H[34] = tri_v[2] * d2z2;
+                    H[35] = tri_v[3] * d2z3;
+                }
+            }
+        }
+    }
+    return true;
+}
+
+template<bool NeedHess>
+bool try_evaluate_wedge_fused_stride4_q4(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    const std::vector<std::size_t>& wedge_node_by_tri_z,
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,
+    const AxisBatchScratch& axis_batch,
+    int n_axis,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    switch (order) {
+        case 3:
+            return evaluate_wedge_fused_stride4_q4<3, NeedHess>(
+                simplex_exponents, wedge_node_by_tri_z, points, axis_batch, n_axis,
+                values_out, gradients_out, hessians_out);
+        case 4:
+            return evaluate_wedge_fused_stride4_q4<4, NeedHess>(
+                simplex_exponents, wedge_node_by_tri_z, points, axis_batch, n_axis,
+                values_out, gradients_out, hessians_out);
+        case 5:
+            return evaluate_wedge_fused_stride4_q4<5, NeedHess>(
+                simplex_exponents, wedge_node_by_tri_z, points, axis_batch, n_axis,
+                values_out, gradients_out, hessians_out);
+        case 6:
+            return evaluate_wedge_fused_stride4_q4<6, NeedHess>(
+                simplex_exponents, wedge_node_by_tri_z, points, axis_batch, n_axis,
+                values_out, gradients_out, hessians_out);
+        case 7:
+            return evaluate_wedge_fused_stride4_q4<7, NeedHess>(
+                simplex_exponents, wedge_node_by_tri_z, points, axis_batch, n_axis,
+                values_out, gradients_out, hessians_out);
+        case 8:
+            return evaluate_wedge_fused_stride4_q4<8, NeedHess>(
+                simplex_exponents, wedge_node_by_tri_z, points, axis_batch, n_axis,
+                values_out, gradients_out, hessians_out);
+        default:
+            return false;
+    }
+}
+
+struct TensorProductTableScratch {
+    std::vector<Real> vv;
+    std::vector<Real> dv;
+    std::vector<Real> vd;
+    std::vector<Real> d2v;
+    std::vector<Real> vd2;
+    std::vector<Real> dd;
+
+    void resizeFor(std::size_t count) {
+        if (vv.size() < count) vv.resize(count);
+        if (dv.size() < count) dv.resize(count);
+        if (vd.size() < count) vd.resize(count);
+        if (d2v.size() < count) d2v.resize(count);
+        if (vd2.size() < count) vd2.resize(count);
+        if (dd.size() < count) dd.resize(count);
+    }
+};
+
+// Caller-provided scratch buffers used by tensor-product evaluation. Three
+// independent axes plus reusable simplex/wedge intermediates.
+struct LagrangeEvaluateScratch {
+    AxisScratch axis_x;
+    AxisScratch axis_y;
+    AxisScratch axis_z;
+    AxisBatchScratch axis_x_batch;
+    AxisBatchScratch axis_y_batch;
+    AxisBatchScratch axis_z_batch;
+    TensorProductTableScratch tensor_tables;
+
+    std::vector<Real> tri_values;
+    std::vector<Gradient> tri_gradients;
+    std::vector<Hessian> tri_hessians;
+    std::vector<Real> tri_gradient_components;
+    std::vector<Real> tri_hessian_components;
+    std::vector<Real> wedge_tri_values_batch;
+    std::vector<Real> wedge_tri_gradient_batch;
+    std::vector<Real> wedge_tri_hessian_batch;
+
+    std::vector<Real> strided_values_tmp;
+    std::vector<Real> strided_gradients_tmp;
+    std::vector<Real> strided_hessians_tmp;
+
+    void prewarm(int max_order, std::size_t max_qpts) {
+        const int clamped_order = std::max(max_order, 0);
+        const std::size_t axis_size = static_cast<std::size_t>(clamped_order) + 1u;
+        const std::size_t axis_batch_size = axis_size * max_qpts;
+        const std::size_t tensor_table_size =
+            axis_size * axis_size * std::max<std::size_t>(max_qpts, 1u);
+        const std::size_t tensor_dofs = tensor_table_size * axis_size;
+        const std::size_t tri_count = axis_size * (axis_size + 1u) / 2u;
+
+        axis_x.reserveFor(axis_size);
+        axis_y.reserveFor(axis_size);
+        axis_z.reserveFor(axis_size);
+        axis_x_batch.resizeFor(axis_batch_size, AxisDeriv::ValuesAndFirstAndSecond);
+        axis_y_batch.resizeFor(axis_batch_size, AxisDeriv::ValuesAndFirstAndSecond);
+        axis_z_batch.resizeFor(axis_batch_size, AxisDeriv::ValuesAndFirstAndSecond);
+        tensor_tables.resizeFor(tensor_table_size);
+        tri_values.reserve(tri_count);
+        tri_gradients.reserve(tri_count);
+        tri_hessians.reserve(tri_count);
+        tri_gradient_components.reserve(tri_count * 3u);
+        tri_hessian_components.reserve(tri_count * 9u);
+        wedge_tri_values_batch.reserve(tri_count * max_qpts);
+        wedge_tri_gradient_batch.reserve(tri_count * 3u * max_qpts);
+        wedge_tri_hessian_batch.reserve(tri_count * 9u * max_qpts);
+        strided_values_tmp.reserve(tensor_dofs);
+        strided_gradients_tmp.reserve(tensor_dofs * 3u);
+        strided_hessians_tmp.reserve(tensor_dofs * 9u);
+    }
+};
+
+LagrangeEvaluateScratch& evaluate_scratch() {
+    // Scratch is intentionally thread-local: assembly and benchmark callers run
+    // evaluation on persistent worker threads, so capacity is reused by thread.
+    static thread_local LagrangeEvaluateScratch s;
+    return s;
+}
+
+// Fill axis scratch and return a non-owning view. Uncomputed slots still have
+// valid pointers to scratch storage (they may hold stale data) — callers must
+// only read the slots they requested via `level`. Common low orders use
+// precomputed Horner coefficients; high orders use barycentric axis evaluation.
+AxisBasisEvaluations fill_axis_scratch(AxisScratch& s,
+                                       const Real* v_coeffs,
+                                       const Real* d_coeffs,
+                                       const Real* d2_coeffs,
+                                       const Real* barycentric_weights,
+                                       int n_axis, Real xi,
+                                       AxisDeriv level) {
+    const std::size_t n = static_cast<std::size_t>(n_axis);
+    s.reserveFor(n);
+    Real* first  = (level == AxisDeriv::ValuesOnly) ? nullptr : s.first.data();
+    Real* second = (level == AxisDeriv::ValuesAndFirstAndSecond) ? s.second.data() : nullptr;
+    evaluate_1d_basis_to(v_coeffs, d_coeffs, d2_coeffs, barycentric_weights,
+                         n_axis, xi, s.values.data(), first, second);
+    return AxisBasisEvaluations{s.values.data(), s.first.data(), s.second.data(), n};
+}
+
+void fill_axis_batch(AxisBatchScratch& scratch,
+                     const std::vector<math::Vector<Real, 3>>& points,
+                     std::size_t component,
+                     const Real* v_coeffs,
+                     const Real* d_coeffs,
+                     const Real* d2_coeffs,
+                     const Real* barycentric_weights,
+                     int n_axis,
+                     AxisDeriv level) {
+    const std::size_t count = points.size() * static_cast<std::size_t>(n_axis);
+    scratch.resizeFor(count, level);
+    Real* first = (level == AxisDeriv::ValuesOnly) ? nullptr : scratch.first.data();
+    Real* second = (level == AxisDeriv::ValuesAndFirstAndSecond) ? scratch.second.data() : nullptr;
+    const std::size_t axis_stride = static_cast<std::size_t>(n_axis);
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        evaluate_1d_basis_to(v_coeffs, d_coeffs, d2_coeffs, barycentric_weights, n_axis,
+                             points[q][component],
+                             scratch.values.data() + q * axis_stride,
+                             first ? first + q * axis_stride : nullptr,
+                             second ? second + q * axis_stride : nullptr);
+    }
+}
+
+// Maximum yz-table footprint that fits comfortably on the stack for the
+// Lagrange performance sweep. Order-8 hex q=4 needs 4*(9x9) entries per table.
+// Higher orders fall back to thread_local heap buffers.
+inline constexpr std::size_t kMaxStackYZ = 384;
+
+struct TensorProductVectorSink {
+    std::vector<Real>* values;
+    std::vector<Gradient>* gradients;
+    std::vector<Hessian>* hessians;
+
+    bool wants_values() const noexcept { return values != nullptr; }
+    bool wants_gradients() const noexcept { return gradients != nullptr; }
+    bool wants_hessians() const noexcept { return hessians != nullptr; }
+
+    void prepare(std::size_t n_nodes) const {
+        if (values)    values->resize(n_nodes);
+        if (gradients) gradients->resize(n_nodes);
+        if (hessians)  hessians->resize(n_nodes);
+    }
+
+    void write_value(std::size_t n, Real value) const {
+        (*values)[n] = value;
+    }
+
+    void write_gradient(std::size_t n, Real dx, Real dy, Real dz) const {
+        auto& g = (*gradients)[n];
+        g[0] = dx;
+        g[1] = dy;
+        g[2] = dz;
+    }
+
+    void write_hessian(std::size_t n,
+                       Real xx,
+                       Real yy,
+                       Real zz,
+                       Real xy,
+                       Real xz,
+                       Real yz) const {
+        (*hessians)[n] = make_symmetric_hessian(xx, yy, zz, xy, xz, yz);
+    }
+};
+
+struct TensorProductRawSink {
+    Real* values;
+    Real* gradients;
+    Real* hessians;
+
+    bool wants_values() const noexcept { return values != nullptr; }
+    bool wants_gradients() const noexcept { return gradients != nullptr; }
+    bool wants_hessians() const noexcept { return hessians != nullptr; }
+
+    void prepare(std::size_t) const {}
+
+    void write_value(std::size_t n, Real value) const {
+        values[n] = value;
+    }
+
+    void write_gradient(std::size_t n, Real dx, Real dy, Real dz) const {
+        Real* g = gradients + n * 3u;
+        g[0] = dx;
+        g[1] = dy;
+        g[2] = dz;
+    }
+
+    void write_hessian(std::size_t n,
+                       Real xx,
+                       Real yy,
+                       Real zz,
+                       Real xy,
+                       Real xz,
+                       Real yz) const {
+        Real* H = hessians + n * 9u;
+        H[0] = xx;
+        H[4] = yy;
+        H[8] = zz;
+        H[1] = xy; H[3] = xy;
+        H[2] = xz; H[6] = xz;
+        H[5] = yz; H[7] = yz;
+    }
+};
+
+// Fused sum-factorized tensor-product evaluator.
+//
+// Precomputes one to six (ny x nz)-shaped tables of partial products
+// `M_xy[j*nz + k]` so that the inner per-node loop performs at most one
+// multiplication per output instead of two. With all three output buffers
+// supplied, this is the fused values + gradients + hessians path that shares
+// every per-axis evaluation.
+//
+// Per-node multiply count (vs. the unfactored variants):
+//   values only       : 1  (was 2)
+//   gradients only    : 3  (was 6)
+//   hessians only     : 6  (was 12)
+//   all three         : 10 (was 20)
+//
+// Dimensional scope: works uniformly for Line/Quadrilateral/Hexahedron with
+// the unused axes' size folded to 1 via constant_axis_basis().
+template <typename Sink>
+void evaluate_tensor_product_factorized_impl(
+    const std::vector<std::array<std::size_t, 3>>& tensor_indices,
+    const AxisBasisEvaluations& x_axis,
+    const AxisBasisEvaluations& y_axis,
+    const AxisBasisEvaluations& z_axis,
+    const Sink& sink) {
+    const std::size_t ny = y_axis.size;
+    const std::size_t nz = z_axis.size;
+    const std::size_t nyz = ny * nz;
+    const bool need_values = sink.wants_values();
+    const bool need_grad = sink.wants_gradients();
+    const bool need_hess = sink.wants_hessians();
+
+    Real Mvv_stack[kMaxStackYZ];
+    Real Mdv_stack[kMaxStackYZ];
+    Real Mvd_stack[kMaxStackYZ];
+    Real Md2v_stack[kMaxStackYZ];
+    Real Mvd2_stack[kMaxStackYZ];
+    Real Mdd_stack[kMaxStackYZ];
+
+    Real* Mvv;
+    Real* Mdv;
+    Real* Mvd;
+    Real* Md2v;
+    Real* Mvd2;
+    Real* Mdd;
+    if (nyz <= kMaxStackYZ) {
+        Mvv = Mvv_stack;
+        Mdv = Mdv_stack;
+        Mvd = Mvd_stack;
+        Md2v = Md2v_stack;
+        Mvd2 = Mvd2_stack;
+        Mdd = Mdd_stack;
+    } else {
+        auto& tables = evaluate_scratch().tensor_tables;
+        tables.resizeFor(nyz);
+        Mvv = tables.vv.data();
+        Mdv = tables.dv.data();
+        Mvd = tables.vd.data();
+        Md2v = tables.d2v.data();
+        Mvd2 = tables.vd2.data();
+        Mdd = tables.dd.data();
+    }
+
+    // M_vv is required by every output (values, ∂ξ, ∂ξ²).
+    for (std::size_t j = 0; j < ny; ++j) {
+        const Real yv = y_axis.values[j];
+        for (std::size_t k = 0; k < nz; ++k) {
+            Mvv[j * nz + k] = yv * z_axis.values[k];
+        }
+    }
+
+    if (need_grad || need_hess) {
+        for (std::size_t j = 0; j < ny; ++j) {
+            const Real yv = y_axis.values[j];
+            const Real yd = y_axis.first[j];
+            for (std::size_t k = 0; k < nz; ++k) {
+                Mdv[j * nz + k] = yd * z_axis.values[k];
+                Mvd[j * nz + k] = yv * z_axis.first[k];
+            }
+        }
+    }
+
+    if (need_hess) {
+        for (std::size_t j = 0; j < ny; ++j) {
+            const Real yv = y_axis.values[j];
+            const Real yd = y_axis.first[j];
+            const Real yd2 = y_axis.second[j];
+            for (std::size_t k = 0; k < nz; ++k) {
+                Md2v[j * nz + k] = yd2 * z_axis.values[k];
+                Mvd2[j * nz + k] = yv  * z_axis.second[k];
+                Mdd[j * nz + k]  = yd  * z_axis.first[k];
+            }
+        }
+    }
+
+    const std::size_t n_nodes = tensor_indices.size();
+    sink.prepare(n_nodes);
+
+    for (std::size_t n = 0; n < n_nodes; ++n) {
+        const auto& idx = tensor_indices[n];
+        const std::size_t i = idx[0];
+        const std::size_t jk = idx[1] * nz + idx[2];
+
+        const Real Lx = x_axis.values[i];
+
+        if (need_values) {
+            sink.write_value(n, Lx * Mvv[jk]);
+        }
+
+        if (need_grad) {
+            const Real dLx = x_axis.first[i];
+            sink.write_gradient(n,
+                                dLx * Mvv[jk],
+                                Lx  * Mdv[jk],
+                                Lx  * Mvd[jk]);
+        }
+
+        if (need_hess) {
+            const Real dLx  = x_axis.first[i];
+            const Real d2Lx = x_axis.second[i];
+            sink.write_hessian(n,
+                               d2Lx * Mvv[jk],
+                               Lx   * Md2v[jk],
+                               Lx   * Mvd2[jk],
+                               dLx  * Mdv[jk],
+                               dLx  * Mvd[jk],
+                               Lx   * Mdd[jk]);
+        }
+    }
+}
+
+void evaluate_tensor_product_factorized(
+    const std::vector<std::array<std::size_t, 3>>& tensor_indices,
+    const AxisBasisEvaluations& x_axis,
+    const AxisBasisEvaluations& y_axis,
+    const AxisBasisEvaluations& z_axis,
+    std::vector<Real>* values_out,
+    std::vector<Gradient>* gradients_out,
+    std::vector<Hessian>* hessians_out) {
+    const TensorProductVectorSink sink{values_out, gradients_out, hessians_out};
+    evaluate_tensor_product_factorized_impl(tensor_indices, x_axis, y_axis, z_axis, sink);
+}
+
+void evaluate_tensor_product_factorized_to(
+    const std::vector<std::array<std::size_t, 3>>& tensor_indices,
+    const AxisBasisEvaluations& x_axis,
+    const AxisBasisEvaluations& y_axis,
+    const AxisBasisEvaluations& z_axis,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    const TensorProductRawSink sink{values_out, gradients_out, hessians_out};
+    evaluate_tensor_product_factorized_impl(tensor_indices, x_axis, y_axis, z_axis, sink);
+}
+
+template <std::size_t Q>
+inline void write_tensor_product_value_strided_q(
+    std::size_t axis_stride,
+    std::size_t nyz,
+    std::size_t i,
+    std::size_t jk,
+    const AxisBatchScratch& x_batch,
+    const Real* SVMP_RESTRICT Mvv,
+    Real* SVMP_RESTRICT value_row) {
+    const std::size_t q_axis = Q * axis_stride;
+    const std::size_t slot = Q * nyz + jk;
+    value_row[Q] = x_batch.values[q_axis + i] * Mvv[slot];
+}
+
+template <std::size_t Q>
+inline void write_tensor_product_hessian_strided_q(
+    std::size_t axis_stride,
+    std::size_t nyz,
+    std::size_t i,
+    std::size_t jk,
+    std::size_t output_stride,
+    const AxisBatchScratch& x_batch,
+    const Real* SVMP_RESTRICT Mvv,
+    const Real* SVMP_RESTRICT Mdv,
+    const Real* SVMP_RESTRICT Mvd,
+    const Real* SVMP_RESTRICT Md2v,
+    const Real* SVMP_RESTRICT Mvd2,
+    const Real* SVMP_RESTRICT Mdd,
+    Real* SVMP_RESTRICT hess_row) {
+    const std::size_t q_axis = Q * axis_stride;
+    const std::size_t slot = Q * nyz + jk;
+    const Real xv = x_batch.values[q_axis + i];
+    const Real xd = x_batch.first[q_axis + i];
+    const Real x2 = x_batch.second[q_axis + i];
+    const Real hxy = xd * Mdv[slot];
+    const Real hxz = xd * Mvd[slot];
+    const Real hyz = xv * Mdd[slot];
+    hess_row[0u * output_stride + Q] = x2 * Mvv[slot];
+    hess_row[4u * output_stride + Q] = xv * Md2v[slot];
+    hess_row[8u * output_stride + Q] = xv * Mvd2[slot];
+    hess_row[1u * output_stride + Q] = hxy;
+    hess_row[3u * output_stride + Q] = hxy;
+    hess_row[2u * output_stride + Q] = hxz;
+    hess_row[6u * output_stride + Q] = hxz;
+    hess_row[5u * output_stride + Q] = hyz;
+    hess_row[7u * output_stride + Q] = hyz;
+}
+
+template <std::size_t Q>
+inline void write_tensor_product_hessian_stride4_q(
+    std::size_t axis_stride,
+    std::size_t nyz,
+    std::size_t i,
+    std::size_t jk,
+    const AxisBatchScratch& x_batch,
+    const Real* SVMP_RESTRICT Mvv,
+    const Real* SVMP_RESTRICT Mdv,
+    const Real* SVMP_RESTRICT Mvd,
+    const Real* SVMP_RESTRICT Md2v,
+    const Real* SVMP_RESTRICT Mvd2,
+    const Real* SVMP_RESTRICT Mdd,
+    Real* SVMP_RESTRICT hess_row) {
+    const std::size_t q_axis = Q * axis_stride;
+    const std::size_t slot = Q * nyz + jk;
+    const Real xv = x_batch.values[q_axis + i];
+    const Real xd = x_batch.first[q_axis + i];
+    const Real x2 = x_batch.second[q_axis + i];
+    const Real hxy = xd * Mdv[slot];
+    const Real hxz = xd * Mvd[slot];
+    const Real hyz = xv * Mdd[slot];
+    hess_row[Q] = x2 * Mvv[slot];
+    hess_row[16u + Q] = xv * Md2v[slot];
+    hess_row[32u + Q] = xv * Mvd2[slot];
+    hess_row[4u + Q] = hxy;
+    hess_row[12u + Q] = hxy;
+    hess_row[8u + Q] = hxz;
+    hess_row[24u + Q] = hxz;
+    hess_row[20u + Q] = hyz;
+    hess_row[28u + Q] = hyz;
+}
+
+template <std::size_t Q>
+inline void write_tensor_product_gradient_strided_q(
+    std::size_t axis_stride,
+    std::size_t nyz,
+    std::size_t i,
+    std::size_t jk,
+    std::size_t output_stride,
+    const AxisBatchScratch& x_batch,
+    const Real* SVMP_RESTRICT Mvv,
+    const Real* SVMP_RESTRICT Mdv,
+    const Real* SVMP_RESTRICT Mvd,
+    Real* SVMP_RESTRICT grad_row) {
+    const std::size_t q_axis = Q * axis_stride;
+    const std::size_t slot = Q * nyz + jk;
+    const Real xv = x_batch.values[q_axis + i];
+    const Real xd = x_batch.first[q_axis + i];
+    grad_row[0u * output_stride + Q] = xd * Mvv[slot];
+    grad_row[1u * output_stride + Q] = xv * Mdv[slot];
+    grad_row[2u * output_stride + Q] = xv * Mvd[slot];
+}
+
+template <std::size_t Q>
+inline void write_tensor_product_gradient_stride4_q(
+    std::size_t axis_stride,
+    std::size_t nyz,
+    std::size_t i,
+    std::size_t jk,
+    const AxisBatchScratch& x_batch,
+    const Real* SVMP_RESTRICT Mvv,
+    const Real* SVMP_RESTRICT Mdv,
+    const Real* SVMP_RESTRICT Mvd,
+    Real* SVMP_RESTRICT grad_row) {
+    const std::size_t q_axis = Q * axis_stride;
+    const std::size_t slot = Q * nyz + jk;
+    const Real xv = x_batch.values[q_axis + i];
+    const Real xd = x_batch.first[q_axis + i];
+    grad_row[Q] = xd * Mvv[slot];
+    grad_row[4u + Q] = xv * Mdv[slot];
+    grad_row[8u + Q] = xv * Mvd[slot];
+}
+
+template <std::size_t Q>
+inline void write_tensor_product_all_strided_q(
+    std::size_t axis_stride,
+    std::size_t nyz,
+    std::size_t i,
+    std::size_t jk,
+    std::size_t output_stride,
+    const AxisBatchScratch& x_batch,
+    const Real* SVMP_RESTRICT Mvv,
+    const Real* SVMP_RESTRICT Mdv,
+    const Real* SVMP_RESTRICT Mvd,
+    const Real* SVMP_RESTRICT Md2v,
+    const Real* SVMP_RESTRICT Mvd2,
+    const Real* SVMP_RESTRICT Mdd,
+    Real* SVMP_RESTRICT value_row,
+    Real* SVMP_RESTRICT grad_row,
+    Real* SVMP_RESTRICT hess_row) {
+    const std::size_t q_axis = Q * axis_stride;
+    const std::size_t slot = Q * nyz + jk;
+    const Real xv = x_batch.values[q_axis + i];
+    const Real xd = x_batch.first[q_axis + i];
+    value_row[Q] = xv * Mvv[slot];
+    grad_row[0u * output_stride + Q] = xd * Mvv[slot];
+    grad_row[1u * output_stride + Q] = xv * Mdv[slot];
+    grad_row[2u * output_stride + Q] = xv * Mvd[slot];
+
+    const Real x2 = x_batch.second[q_axis + i];
+    const Real hxy = xd * Mdv[slot];
+    const Real hxz = xd * Mvd[slot];
+    const Real hyz = xv * Mdd[slot];
+    hess_row[0u * output_stride + Q] = x2 * Mvv[slot];
+    hess_row[4u * output_stride + Q] = xv * Md2v[slot];
+    hess_row[8u * output_stride + Q] = xv * Mvd2[slot];
+    hess_row[1u * output_stride + Q] = hxy;
+    hess_row[3u * output_stride + Q] = hxy;
+    hess_row[2u * output_stride + Q] = hxz;
+    hess_row[6u * output_stride + Q] = hxz;
+    hess_row[5u * output_stride + Q] = hyz;
+    hess_row[7u * output_stride + Q] = hyz;
+}
+
+template <std::size_t Q>
+inline void write_tensor_product_all_stride4_q(
+    std::size_t axis_stride,
+    std::size_t nyz,
+    std::size_t i,
+    std::size_t jk,
+    const AxisBatchScratch& x_batch,
+    const Real* SVMP_RESTRICT Mvv,
+    const Real* SVMP_RESTRICT Mdv,
+    const Real* SVMP_RESTRICT Mvd,
+    const Real* SVMP_RESTRICT Md2v,
+    const Real* SVMP_RESTRICT Mvd2,
+    const Real* SVMP_RESTRICT Mdd,
+    Real* SVMP_RESTRICT value_row,
+    Real* SVMP_RESTRICT grad_row,
+    Real* SVMP_RESTRICT hess_row) {
+    const std::size_t q_axis = Q * axis_stride;
+    const std::size_t slot = Q * nyz + jk;
+    const Real xv = x_batch.values[q_axis + i];
+    const Real xd = x_batch.first[q_axis + i];
+    value_row[Q] = xv * Mvv[slot];
+    grad_row[Q] = xd * Mvv[slot];
+    grad_row[4u + Q] = xv * Mdv[slot];
+    grad_row[8u + Q] = xv * Mvd[slot];
+
+    const Real x2 = x_batch.second[q_axis + i];
+    const Real hxy = xd * Mdv[slot];
+    const Real hxz = xd * Mvd[slot];
+    const Real hyz = xv * Mdd[slot];
+    hess_row[Q] = x2 * Mvv[slot];
+    hess_row[16u + Q] = xv * Md2v[slot];
+    hess_row[32u + Q] = xv * Mvd2[slot];
+    hess_row[4u + Q] = hxy;
+    hess_row[12u + Q] = hxy;
+    hess_row[8u + Q] = hxz;
+    hess_row[24u + Q] = hxz;
+    hess_row[20u + Q] = hyz;
+    hess_row[28u + Q] = hyz;
+}
+
+SVMP_LAGRANGE_NOINLINE SVMP_LAGRANGE_ALIGN64 bool
+evaluate_tensor_product_values_stride4_q4_transposed(
+    const std::vector<std::array<std::size_t, 3>>& tensor_indices,
+    std::size_t axis_stride,
+    const AxisBatchScratch& x_batch,
+    const AxisBatchScratch& y_batch,
+    const AxisBatchScratch& z_batch,
+    Real* SVMP_RESTRICT values_out) {
+    const std::size_t nyz = axis_stride * axis_stride;
+    const std::size_t table_count = 4u * nyz;
+    if (table_count > kMaxStackYZ || values_out == nullptr) {
+        return false;
+    }
+
+    Real Mvv_stack[kMaxStackYZ];
+    for (std::size_t j = 0; j < axis_stride; ++j) {
+        const Real yv0 = y_batch.values[j];
+        const Real yv1 = y_batch.values[axis_stride + j];
+        const Real yv2 = y_batch.values[2u * axis_stride + j];
+        const Real yv3 = y_batch.values[3u * axis_stride + j];
+        for (std::size_t k = 0; k < axis_stride; ++k) {
+            const std::size_t base = (j * axis_stride + k) * 4u;
+            Mvv_stack[base + 0u] = yv0 * z_batch.values[k];
+            Mvv_stack[base + 1u] = yv1 * z_batch.values[axis_stride + k];
+            Mvv_stack[base + 2u] = yv2 * z_batch.values[2u * axis_stride + k];
+            Mvv_stack[base + 3u] = yv3 * z_batch.values[3u * axis_stride + k];
+        }
+    }
+
+    for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+        const auto& idx = tensor_indices[node];
+        const std::size_t i = idx[0];
+        const std::size_t jk = (idx[1] * axis_stride + idx[2]) * 4u;
+        Real* SVMP_RESTRICT value_row = values_out + node * 4u;
+        value_row[0u] = x_batch.values[i] * Mvv_stack[jk + 0u];
+        value_row[1u] = x_batch.values[axis_stride + i] * Mvv_stack[jk + 1u];
+        value_row[2u] = x_batch.values[2u * axis_stride + i] * Mvv_stack[jk + 2u];
+        value_row[3u] = x_batch.values[3u * axis_stride + i] * Mvv_stack[jk + 3u];
+    }
+
+    return true;
+}
+
+SVMP_LAGRANGE_NOINLINE SVMP_LAGRANGE_ALIGN64 bool
+evaluate_tensor_product_gradients_stride4_q4_transposed(
+    const std::vector<std::array<std::size_t, 3>>& tensor_indices,
+    std::size_t axis_stride,
+    const AxisBatchScratch& x_batch,
+    const AxisBatchScratch& y_batch,
+    const AxisBatchScratch& z_batch,
+    Real* SVMP_RESTRICT gradients_out) {
+    const std::size_t nyz = axis_stride * axis_stride;
+    const std::size_t table_count = 4u * nyz;
+    if (table_count > kMaxStackYZ || gradients_out == nullptr) {
+        return false;
+    }
+
+    Real Mvv_stack[kMaxStackYZ];
+    Real Mdv_stack[kMaxStackYZ];
+    Real Mvd_stack[kMaxStackYZ];
+    for (std::size_t j = 0; j < axis_stride; ++j) {
+        const Real yv0 = y_batch.values[j];
+        const Real yv1 = y_batch.values[axis_stride + j];
+        const Real yv2 = y_batch.values[2u * axis_stride + j];
+        const Real yv3 = y_batch.values[3u * axis_stride + j];
+        const Real yd0 = y_batch.first[j];
+        const Real yd1 = y_batch.first[axis_stride + j];
+        const Real yd2 = y_batch.first[2u * axis_stride + j];
+        const Real yd3 = y_batch.first[3u * axis_stride + j];
+        for (std::size_t k = 0; k < axis_stride; ++k) {
+            const std::size_t base = (j * axis_stride + k) * 4u;
+            const Real zv0 = z_batch.values[k];
+            const Real zv1 = z_batch.values[axis_stride + k];
+            const Real zv2 = z_batch.values[2u * axis_stride + k];
+            const Real zv3 = z_batch.values[3u * axis_stride + k];
+            const Real zd0 = z_batch.first[k];
+            const Real zd1 = z_batch.first[axis_stride + k];
+            const Real zd2 = z_batch.first[2u * axis_stride + k];
+            const Real zd3 = z_batch.first[3u * axis_stride + k];
+
+            Mvv_stack[base + 0u] = yv0 * zv0;
+            Mvv_stack[base + 1u] = yv1 * zv1;
+            Mvv_stack[base + 2u] = yv2 * zv2;
+            Mvv_stack[base + 3u] = yv3 * zv3;
+            Mdv_stack[base + 0u] = yd0 * zv0;
+            Mdv_stack[base + 1u] = yd1 * zv1;
+            Mdv_stack[base + 2u] = yd2 * zv2;
+            Mdv_stack[base + 3u] = yd3 * zv3;
+            Mvd_stack[base + 0u] = yv0 * zd0;
+            Mvd_stack[base + 1u] = yv1 * zd1;
+            Mvd_stack[base + 2u] = yv2 * zd2;
+            Mvd_stack[base + 3u] = yv3 * zd3;
+        }
+    }
+
+    for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+        const auto& idx = tensor_indices[node];
+        const std::size_t i = idx[0];
+        const std::size_t jk = (idx[1] * axis_stride + idx[2]) * 4u;
+
+        const Real xv0 = x_batch.values[i];
+        const Real xv1 = x_batch.values[axis_stride + i];
+        const Real xv2 = x_batch.values[2u * axis_stride + i];
+        const Real xv3 = x_batch.values[3u * axis_stride + i];
+        const Real xd0 = x_batch.first[i];
+        const Real xd1 = x_batch.first[axis_stride + i];
+        const Real xd2 = x_batch.first[2u * axis_stride + i];
+        const Real xd3 = x_batch.first[3u * axis_stride + i];
+
+        Real* SVMP_RESTRICT grad_row = gradients_out + node * 12u;
+        grad_row[0u] = xd0 * Mvv_stack[jk + 0u];
+        grad_row[1u] = xd1 * Mvv_stack[jk + 1u];
+        grad_row[2u] = xd2 * Mvv_stack[jk + 2u];
+        grad_row[3u] = xd3 * Mvv_stack[jk + 3u];
+        grad_row[4u] = xv0 * Mdv_stack[jk + 0u];
+        grad_row[5u] = xv1 * Mdv_stack[jk + 1u];
+        grad_row[6u] = xv2 * Mdv_stack[jk + 2u];
+        grad_row[7u] = xv3 * Mdv_stack[jk + 3u];
+        grad_row[8u] = xv0 * Mvd_stack[jk + 0u];
+        grad_row[9u] = xv1 * Mvd_stack[jk + 1u];
+        grad_row[10u] = xv2 * Mvd_stack[jk + 2u];
+        grad_row[11u] = xv3 * Mvd_stack[jk + 3u];
+    }
+
+    return true;
+}
+
+template<bool NeedAllOutputs>
+SVMP_LAGRANGE_NOINLINE SVMP_LAGRANGE_ALIGN64 bool
+evaluate_tensor_product_second_stride4_q4_transposed(
+    const std::vector<std::array<std::size_t, 3>>& tensor_indices,
+    std::size_t axis_stride,
+    const AxisBatchScratch& x_batch,
+    const AxisBatchScratch& y_batch,
+    const AxisBatchScratch& z_batch,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    const std::size_t nyz = axis_stride * axis_stride;
+    const std::size_t table_count = 4u * nyz;
+    if (table_count > kMaxStackYZ || hessians_out == nullptr) {
+        return false;
+    }
+    if constexpr (NeedAllOutputs) {
+        if (values_out == nullptr || gradients_out == nullptr) {
+            return false;
+        }
+    }
+
+    Real Mvv_stack[kMaxStackYZ];
+    Real Mdv_stack[kMaxStackYZ];
+    Real Mvd_stack[kMaxStackYZ];
+    Real Md2v_stack[kMaxStackYZ];
+    Real Mvd2_stack[kMaxStackYZ];
+    Real Mdd_stack[kMaxStackYZ];
+
+    for (std::size_t j = 0; j < axis_stride; ++j) {
+        const Real yv0 = y_batch.values[j];
+        const Real yv1 = y_batch.values[axis_stride + j];
+        const Real yv2 = y_batch.values[2u * axis_stride + j];
+        const Real yv3 = y_batch.values[3u * axis_stride + j];
+        const Real yd0 = y_batch.first[j];
+        const Real yd1 = y_batch.first[axis_stride + j];
+        const Real yd2 = y_batch.first[2u * axis_stride + j];
+        const Real yd3 = y_batch.first[3u * axis_stride + j];
+        const Real y20 = y_batch.second[j];
+        const Real y21 = y_batch.second[axis_stride + j];
+        const Real y22 = y_batch.second[2u * axis_stride + j];
+        const Real y23 = y_batch.second[3u * axis_stride + j];
+
+        for (std::size_t k = 0; k < axis_stride; ++k) {
+            const std::size_t base = (j * axis_stride + k) * 4u;
+            const Real zv0 = z_batch.values[k];
+            const Real zv1 = z_batch.values[axis_stride + k];
+            const Real zv2 = z_batch.values[2u * axis_stride + k];
+            const Real zv3 = z_batch.values[3u * axis_stride + k];
+            const Real zd0 = z_batch.first[k];
+            const Real zd1 = z_batch.first[axis_stride + k];
+            const Real zd2 = z_batch.first[2u * axis_stride + k];
+            const Real zd3 = z_batch.first[3u * axis_stride + k];
+            const Real z20 = z_batch.second[k];
+            const Real z21 = z_batch.second[axis_stride + k];
+            const Real z22 = z_batch.second[2u * axis_stride + k];
+            const Real z23 = z_batch.second[3u * axis_stride + k];
+
+            Mvv_stack[base + 0u] = yv0 * zv0;
+            Mvv_stack[base + 1u] = yv1 * zv1;
+            Mvv_stack[base + 2u] = yv2 * zv2;
+            Mvv_stack[base + 3u] = yv3 * zv3;
+            Mdv_stack[base + 0u] = yd0 * zv0;
+            Mdv_stack[base + 1u] = yd1 * zv1;
+            Mdv_stack[base + 2u] = yd2 * zv2;
+            Mdv_stack[base + 3u] = yd3 * zv3;
+            Mvd_stack[base + 0u] = yv0 * zd0;
+            Mvd_stack[base + 1u] = yv1 * zd1;
+            Mvd_stack[base + 2u] = yv2 * zd2;
+            Mvd_stack[base + 3u] = yv3 * zd3;
+            Md2v_stack[base + 0u] = y20 * zv0;
+            Md2v_stack[base + 1u] = y21 * zv1;
+            Md2v_stack[base + 2u] = y22 * zv2;
+            Md2v_stack[base + 3u] = y23 * zv3;
+            Mvd2_stack[base + 0u] = yv0 * z20;
+            Mvd2_stack[base + 1u] = yv1 * z21;
+            Mvd2_stack[base + 2u] = yv2 * z22;
+            Mvd2_stack[base + 3u] = yv3 * z23;
+            Mdd_stack[base + 0u] = yd0 * zd0;
+            Mdd_stack[base + 1u] = yd1 * zd1;
+            Mdd_stack[base + 2u] = yd2 * zd2;
+            Mdd_stack[base + 3u] = yd3 * zd3;
+        }
+    }
+
+    for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+        const auto& idx = tensor_indices[node];
+        const std::size_t i = idx[0];
+        const std::size_t jk = (idx[1] * axis_stride + idx[2]) * 4u;
+
+        const Real xv0 = x_batch.values[i];
+        const Real xv1 = x_batch.values[axis_stride + i];
+        const Real xv2 = x_batch.values[2u * axis_stride + i];
+        const Real xv3 = x_batch.values[3u * axis_stride + i];
+        const Real xd0 = x_batch.first[i];
+        const Real xd1 = x_batch.first[axis_stride + i];
+        const Real xd2 = x_batch.first[2u * axis_stride + i];
+        const Real xd3 = x_batch.first[3u * axis_stride + i];
+        const Real x20 = x_batch.second[i];
+        const Real x21 = x_batch.second[axis_stride + i];
+        const Real x22 = x_batch.second[2u * axis_stride + i];
+        const Real x23 = x_batch.second[3u * axis_stride + i];
+
+        const Real mvv0 = Mvv_stack[jk + 0u];
+        const Real mvv1 = Mvv_stack[jk + 1u];
+        const Real mvv2 = Mvv_stack[jk + 2u];
+        const Real mvv3 = Mvv_stack[jk + 3u];
+        const Real mdv0 = Mdv_stack[jk + 0u];
+        const Real mdv1 = Mdv_stack[jk + 1u];
+        const Real mdv2 = Mdv_stack[jk + 2u];
+        const Real mdv3 = Mdv_stack[jk + 3u];
+        const Real mvd0 = Mvd_stack[jk + 0u];
+        const Real mvd1 = Mvd_stack[jk + 1u];
+        const Real mvd2 = Mvd_stack[jk + 2u];
+        const Real mvd3 = Mvd_stack[jk + 3u];
+        const Real md2v0 = Md2v_stack[jk + 0u];
+        const Real md2v1 = Md2v_stack[jk + 1u];
+        const Real md2v2 = Md2v_stack[jk + 2u];
+        const Real md2v3 = Md2v_stack[jk + 3u];
+        const Real mvd20 = Mvd2_stack[jk + 0u];
+        const Real mvd21 = Mvd2_stack[jk + 1u];
+        const Real mvd22 = Mvd2_stack[jk + 2u];
+        const Real mvd23 = Mvd2_stack[jk + 3u];
+        const Real mdd0 = Mdd_stack[jk + 0u];
+        const Real mdd1 = Mdd_stack[jk + 1u];
+        const Real mdd2 = Mdd_stack[jk + 2u];
+        const Real mdd3 = Mdd_stack[jk + 3u];
+
+        if constexpr (NeedAllOutputs) {
+            Real* SVMP_RESTRICT value_row = values_out + node * 4u;
+            value_row[0u] = xv0 * mvv0;
+            value_row[1u] = xv1 * mvv1;
+            value_row[2u] = xv2 * mvv2;
+            value_row[3u] = xv3 * mvv3;
+
+            Real* SVMP_RESTRICT grad_row = gradients_out + node * 12u;
+            grad_row[0u] = xd0 * mvv0;
+            grad_row[1u] = xd1 * mvv1;
+            grad_row[2u] = xd2 * mvv2;
+            grad_row[3u] = xd3 * mvv3;
+            grad_row[4u] = xv0 * mdv0;
+            grad_row[5u] = xv1 * mdv1;
+            grad_row[6u] = xv2 * mdv2;
+            grad_row[7u] = xv3 * mdv3;
+            grad_row[8u] = xv0 * mvd0;
+            grad_row[9u] = xv1 * mvd1;
+            grad_row[10u] = xv2 * mvd2;
+            grad_row[11u] = xv3 * mvd3;
+        }
+
+        const Real hxy0 = xd0 * mdv0;
+        const Real hxy1 = xd1 * mdv1;
+        const Real hxy2 = xd2 * mdv2;
+        const Real hxy3 = xd3 * mdv3;
+        const Real hxz0 = xd0 * mvd0;
+        const Real hxz1 = xd1 * mvd1;
+        const Real hxz2 = xd2 * mvd2;
+        const Real hxz3 = xd3 * mvd3;
+        const Real hyz0 = xv0 * mdd0;
+        const Real hyz1 = xv1 * mdd1;
+        const Real hyz2 = xv2 * mdd2;
+        const Real hyz3 = xv3 * mdd3;
+
+        Real* SVMP_RESTRICT hess_row = hessians_out + node * 36u;
+        hess_row[0u] = x20 * mvv0;
+        hess_row[1u] = x21 * mvv1;
+        hess_row[2u] = x22 * mvv2;
+        hess_row[3u] = x23 * mvv3;
+        hess_row[4u] = hxy0;
+        hess_row[5u] = hxy1;
+        hess_row[6u] = hxy2;
+        hess_row[7u] = hxy3;
+        hess_row[8u] = hxz0;
+        hess_row[9u] = hxz1;
+        hess_row[10u] = hxz2;
+        hess_row[11u] = hxz3;
+        hess_row[12u] = hxy0;
+        hess_row[13u] = hxy1;
+        hess_row[14u] = hxy2;
+        hess_row[15u] = hxy3;
+        hess_row[16u] = xv0 * md2v0;
+        hess_row[17u] = xv1 * md2v1;
+        hess_row[18u] = xv2 * md2v2;
+        hess_row[19u] = xv3 * md2v3;
+        hess_row[20u] = hyz0;
+        hess_row[21u] = hyz1;
+        hess_row[22u] = hyz2;
+        hess_row[23u] = hyz3;
+        hess_row[24u] = hxz0;
+        hess_row[25u] = hxz1;
+        hess_row[26u] = hxz2;
+        hess_row[27u] = hxz3;
+        hess_row[28u] = hyz0;
+        hess_row[29u] = hyz1;
+        hess_row[30u] = hyz2;
+        hess_row[31u] = hyz3;
+        hess_row[32u] = xv0 * mvd20;
+        hess_row[33u] = xv1 * mvd21;
+        hess_row[34u] = xv2 * mvd22;
+        hess_row[35u] = xv3 * mvd23;
+    }
+
+    return true;
+}
+
+template<int N>
+constexpr std::size_t line_public_axis_index(std::size_t node) noexcept {
+    return node == 0u ? 0u : (node == 1u ? static_cast<std::size_t>(N - 1) : node - 1u);
+}
+
+template<int N>
+constexpr std::array<Real, N> make_line_axis_inv_denoms() noexcept {
+    std::array<Real, N> inv_denoms{};
+    for (int i = 0; i < N; ++i) {
+        Real denom = Real(1);
+        for (int j = 0; j < N; ++j) {
+            if (j != i) {
+                denom *= static_cast<Real>(i - j);
+            }
+        }
+        inv_denoms[static_cast<std::size_t>(i)] = Real(1) / denom;
+    }
+    return inv_denoms;
+}
+
+template<int N>
+void fill_line_values_product(Real x, Real* SVMP_RESTRICT values) {
+    static constexpr auto inv_denoms = make_line_axis_inv_denoms<N>();
+    const Real p = static_cast<Real>(N - 1);
+    const Real r = (x + Real(1)) * p * Real(0.5);
+    Real prefix[N];
+    Real suffix[N];
+    prefix[0] = Real(1);
+    for (int i = 1; i < N; ++i) {
+        prefix[i] = prefix[i - 1] * (r - static_cast<Real>(i - 1));
+    }
+    suffix[N - 1] = Real(1);
+    for (int i = N - 2; i >= 0; --i) {
+        suffix[i] = suffix[i + 1] * (r - static_cast<Real>(i + 1));
+    }
+    for (int i = 0; i < N; ++i) {
+        const std::size_t slot = static_cast<std::size_t>(i);
+        values[slot] = prefix[i] * suffix[i] * inv_denoms[slot];
+    }
+}
+
+template<int N>
+void fill_line_values_product_derivatives(Real x,
+                                          Real* SVMP_RESTRICT values,
+                                          Real* SVMP_RESTRICT first,
+                                          Real* SVMP_RESTRICT second) {
+    static constexpr auto inv_denoms = make_line_axis_inv_denoms<N>();
+    const Real p = static_cast<Real>(N - 1);
+    const Real drdx = p * Real(0.5);
+    const Real d2rdx2 = drdx * drdx;
+    const Real r = (x + Real(1)) * drdx;
+
+    Real prefix[N + 1];
+    Real prefix_d1[N + 1];
+    Real prefix_d2[N + 1];
+    Real suffix[N + 1];
+    Real suffix_d1[N + 1];
+    Real suffix_d2[N + 1];
+
+    const bool need_second = second != nullptr;
+
+    prefix[0] = Real(1);
+    prefix_d1[0] = Real(0);
+    if (need_second) {
+        prefix_d2[0] = Real(0);
+    }
+    for (int i = 0; i < N; ++i) {
+        const Real factor = r - static_cast<Real>(i);
+        prefix[i + 1] = prefix[i] * factor;
+        prefix_d1[i + 1] = prefix_d1[i] * factor + prefix[i];
+        if (need_second) {
+            prefix_d2[i + 1] = prefix_d2[i] * factor + Real(2) * prefix_d1[i];
+        }
+    }
+
+    suffix[N] = Real(1);
+    suffix_d1[N] = Real(0);
+    if (need_second) {
+        suffix_d2[N] = Real(0);
+    }
+    for (int i = N - 1; i >= 0; --i) {
+        const Real factor = r - static_cast<Real>(i);
+        suffix[i] = suffix[i + 1] * factor;
+        suffix_d1[i] = suffix_d1[i + 1] * factor + suffix[i + 1];
+        if (need_second) {
+            suffix_d2[i] = suffix_d2[i + 1] * factor + Real(2) * suffix_d1[i + 1];
+        }
+    }
+
+    for (int i = 0; i < N; ++i) {
+        const std::size_t slot = static_cast<std::size_t>(i);
+        const Real inv = inv_denoms[slot];
+        const Real pre = prefix[i];
+        const Real suf = suffix[i + 1];
+        const Real pre_d1 = prefix_d1[i];
+        const Real suf_d1 = suffix_d1[i + 1];
+        values[slot] = pre * suf * inv;
+        if (first != nullptr) {
+            first[slot] = (pre_d1 * suf + pre * suf_d1) * inv * drdx;
+        }
+        if (second != nullptr) {
+            const Real d2 =
+                prefix_d2[i] * suf +
+                Real(2) * pre_d1 * suf_d1 +
+                pre * suffix_d2[i + 1];
+            second[slot] = d2 * inv * d2rdx2;
+        }
+    }
+}
+
+template<int N>
+void fill_axis_batch_product_q4(
+    AxisBatchScratch& scratch,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t component,
+    AxisDeriv level) {
+    constexpr std::size_t axis_stride = static_cast<std::size_t>(N);
+    scratch.resizeFor(4u * axis_stride, level);
+    for (std::size_t q = 0; q < 4u; ++q) {
+        Real* values = scratch.values.data() + q * axis_stride;
+        if (level == AxisDeriv::ValuesOnly) {
+            fill_line_values_product<N>(points[q][component], values);
+        } else {
+            Real* first = scratch.first.data() + q * axis_stride;
+            Real* second = level == AxisDeriv::ValuesAndFirstAndSecond
+                ? scratch.second.data() + q * axis_stride
+                : nullptr;
+            fill_line_values_product_derivatives<N>(
+                points[q][component], values, first, second);
+        }
+    }
+}
+
+bool try_fill_axis_batch_product_q4(
+    AxisBatchScratch& scratch,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t component,
+    int n_axis,
+    AxisDeriv level) {
+    switch (n_axis) {
+        case 5:
+            fill_axis_batch_product_q4<5>(scratch, points, component, level);
+            return true;
+        case 6:
+            fill_axis_batch_product_q4<6>(scratch, points, component, level);
+            return true;
+        case 7:
+            fill_axis_batch_product_q4<7>(scratch, points, component, level);
+            return true;
+        case 8:
+            fill_axis_batch_product_q4<8>(scratch, points, component, level);
+            return true;
+        case 9:
+            fill_axis_batch_product_q4<9>(scratch, points, component, level);
+            return true;
+        default:
+            return false;
+    }
+}
+
+template<int N>
+SVMP_LAGRANGE_NOINLINE SVMP_LAGRANGE_ALIGN64 void evaluate_line_values_product_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    Real q0[N];
+    Real q1[N];
+    Real q2[N];
+    Real q3[N];
+    fill_line_values_product<N>(points[0][0], q0);
+    fill_line_values_product<N>(points[1][0], q1);
+    fill_line_values_product<N>(points[2][0], q2);
+    fill_line_values_product<N>(points[3][0], q3);
+
+    for (std::size_t node = 0; node < static_cast<std::size_t>(N); ++node) {
+        const std::size_t i = line_public_axis_index<N>(node);
+        Real* row = values_out + node * output_stride;
+        row[0] = q0[i];
+        row[1] = q1[i];
+        row[2] = q2[i];
+        row[3] = q3[i];
+    }
+}
+
+FE_ALWAYS_INLINE void write_line_order4_values_q(
+    Real x,
+    std::size_t q,
+    Real* SVMP_RESTRICT row0,
+    Real* SVMP_RESTRICT row1,
+    Real* SVMP_RESTRICT row2,
+    Real* SVMP_RESTRICT row3,
+    Real* SVMP_RESTRICT row4) {
+    const Real r = (x + Real(1)) * Real(2);
+    const Real f0 = r;
+    const Real f1 = r - Real(1);
+    const Real f2 = r - Real(2);
+    const Real f3 = r - Real(3);
+    const Real f4 = r - Real(4);
+    const Real f01 = f0 * f1;
+    const Real f12 = f1 * f2;
+    const Real f23 = f2 * f3;
+    const Real f34 = f3 * f4;
+    const Real v0 = (f12 * f34) / Real(24);
+    const Real v1 = -(f0 * f2 * f34) / Real(6);
+    const Real v2 = (f01 * f34) / Real(4);
+    const Real v3 = -(f01 * f2 * f4) / Real(6);
+    const Real v4 = (f01 * f23) / Real(24);
+    row0[q] = v0;
+    row1[q] = v4;
+    row2[q] = v1;
+    row3[q] = v2;
+    row4[q] = v3;
+}
+
+SVMP_LAGRANGE_NOINLINE SVMP_LAGRANGE_ALIGN64 void evaluate_line_order4_values_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    Real* row0 = values_out + 0u * output_stride;
+    Real* row1 = values_out + 1u * output_stride;
+    Real* row2 = values_out + 2u * output_stride;
+    Real* row3 = values_out + 3u * output_stride;
+    Real* row4 = values_out + 4u * output_stride;
+    write_line_order4_values_q(points[0][0], 0u, row0, row1, row2, row3, row4);
+    write_line_order4_values_q(points[1][0], 1u, row0, row1, row2, row3, row4);
+    write_line_order4_values_q(points[2][0], 2u, row0, row1, row2, row3, row4);
+    write_line_order4_values_q(points[3][0], 3u, row0, row1, row2, row3, row4);
+}
+
+SVMP_LAGRANGE_NOINLINE void evaluate_triangle_order1_gradients_strided(
+    std::size_t num_qpts,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    Real* SVMP_RESTRICT row0 = gradients_out + 0u * 3u * output_stride;
+    Real* SVMP_RESTRICT row1 = gradients_out + 1u * 3u * output_stride;
+    Real* SVMP_RESTRICT row2 = gradients_out + 2u * 3u * output_stride;
+
+    for (std::size_t q = 0; q < num_qpts; ++q) {
+        row0[0u * output_stride + q] = Real(-1);
+        row0[1u * output_stride + q] = Real(-1);
+        row0[2u * output_stride + q] = Real(0);
+        row1[0u * output_stride + q] = Real(1);
+        row1[1u * output_stride + q] = Real(0);
+        row1[2u * output_stride + q] = Real(0);
+        row2[0u * output_stride + q] = Real(0);
+        row2[1u * output_stride + q] = Real(1);
+        row2[2u * output_stride + q] = Real(0);
+    }
+}
+
+template<int N>
+SVMP_LAGRANGE_NOINLINE void evaluate_line_hessians_product_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT hessians_out) {
+    Real values[4][N];
+    Real second[4][N];
+    for (std::size_t q = 0; q < 4u; ++q) {
+        fill_line_values_product_derivatives<N>(
+            points[q][0], values[q], nullptr, second[q]);
+    }
+    for (std::size_t node = 0; node < static_cast<std::size_t>(N); ++node) {
+        const std::size_t i = line_public_axis_index<N>(node);
+        write_line_hessian_q4_row(hessians_out + node * 9u * output_stride,
+                                  output_stride,
+                                  second[0][i], second[1][i],
+                                  second[2][i], second[3][i]);
+    }
+}
+
+template<int N>
+SVMP_LAGRANGE_NOINLINE void evaluate_line_all_product_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    Real values[4][N];
+    Real first[4][N];
+    Real second[4][N];
+    for (std::size_t q = 0; q < 4u; ++q) {
+        fill_line_values_product_derivatives<N>(
+            points[q][0], values[q], first[q], second[q]);
+    }
+    for (std::size_t node = 0; node < static_cast<std::size_t>(N); ++node) {
+        const std::size_t i = line_public_axis_index<N>(node);
+        Real* value_row = values_out + node * output_stride;
+        value_row[0] = values[0][i];
+        value_row[1] = values[1][i];
+        value_row[2] = values[2][i];
+        value_row[3] = values[3][i];
+        write_line_gradient_q4_row(gradients_out + node * 3u * output_stride,
+                                   output_stride,
+                                   first[0][i], first[1][i],
+                                   first[2][i], first[3][i]);
+        write_line_hessian_q4_row(hessians_out + node * 9u * output_stride,
+                                  output_stride,
+                                  second[0][i], second[1][i],
+                                  second[2][i], second[3][i]);
+    }
+}
+
+inline void write_quad_product_value_row_q4(
+    Real* SVMP_RESTRICT row,
+    const Real* SVMP_RESTRICT x0,
+    const Real* SVMP_RESTRICT x1,
+    const Real* SVMP_RESTRICT x2,
+    const Real* SVMP_RESTRICT x3,
+    const Real* SVMP_RESTRICT y0,
+    const Real* SVMP_RESTRICT y1,
+    const Real* SVMP_RESTRICT y2,
+    const Real* SVMP_RESTRICT y3,
+    std::size_t i,
+    std::size_t j) {
+    row[0] = x0[i] * y0[j];
+    row[1] = x1[i] * y1[j];
+    row[2] = x2[i] * y2[j];
+    row[3] = x3[i] * y3[j];
+}
+
+template<int N>
+void evaluate_quad_values_product_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    Real x0[N];
+    Real x1[N];
+    Real x2[N];
+    Real x3[N];
+    Real y0[N];
+    Real y1[N];
+    Real y2[N];
+    Real y3[N];
+    fill_line_values_product<N>(points[0][0], x0);
+    fill_line_values_product<N>(points[1][0], x1);
+    fill_line_values_product<N>(points[2][0], x2);
+    fill_line_values_product<N>(points[3][0], x3);
+    fill_line_values_product<N>(points[0][1], y0);
+    fill_line_values_product<N>(points[1][1], y1);
+    fill_line_values_product<N>(points[2][1], y2);
+    fill_line_values_product<N>(points[3][1], y3);
+
+    constexpr std::size_t p = static_cast<std::size_t>(N - 1);
+    std::size_t node = 0u;
+    write_quad_product_value_row_q4(values_out + node++ * output_stride,
+                                    x0, x1, x2, x3, y0, y1, y2, y3, 0u, 0u);
+    write_quad_product_value_row_q4(values_out + node++ * output_stride,
+                                    x0, x1, x2, x3, y0, y1, y2, y3, p, 0u);
+    write_quad_product_value_row_q4(values_out + node++ * output_stride,
+                                    x0, x1, x2, x3, y0, y1, y2, y3, p, p);
+    write_quad_product_value_row_q4(values_out + node++ * output_stride,
+                                    x0, x1, x2, x3, y0, y1, y2, y3, 0u, p);
+
+    for (std::size_t i = 1u; i < p; ++i) {
+        write_quad_product_value_row_q4(values_out + node++ * output_stride,
+                                        x0, x1, x2, x3, y0, y1, y2, y3, i, 0u);
+    }
+    for (std::size_t j = 1u; j < p; ++j) {
+        write_quad_product_value_row_q4(values_out + node++ * output_stride,
+                                        x0, x1, x2, x3, y0, y1, y2, y3, p, j);
+    }
+    for (std::size_t i = p - 1u; i > 0u; --i) {
+        write_quad_product_value_row_q4(values_out + node++ * output_stride,
+                                        x0, x1, x2, x3, y0, y1, y2, y3, i, p);
+    }
+    for (std::size_t j = p - 1u; j > 0u; --j) {
+        write_quad_product_value_row_q4(values_out + node++ * output_stride,
+                                        x0, x1, x2, x3, y0, y1, y2, y3, 0u, j);
+    }
+    for (std::size_t j = 1u; j < p; ++j) {
+        for (std::size_t i = 1u; i < p; ++i) {
+            write_quad_product_value_row_q4(values_out + node++ * output_stride,
+                                            x0, x1, x2, x3, y0, y1, y2, y3, i, j);
+        }
+    }
+}
+
+template<int N>
+void evaluate_quad_derivatives_product_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    const bool need_grad = gradients_out != nullptr;
+    const bool need_hess = hessians_out != nullptr;
+    Real xv[4][N];
+    Real xd[4][N];
+    Real x2[4][N];
+    Real yv[4][N];
+    Real yd[4][N];
+    Real y2[4][N];
+
+    for (std::size_t q = 0; q < 4u; ++q) {
+        fill_line_values_product_derivatives<N>(
+            points[q][0], xv[q], (need_grad || need_hess) ? xd[q] : nullptr,
+            need_hess ? x2[q] : nullptr);
+        fill_line_values_product_derivatives<N>(
+            points[q][1], yv[q], (need_grad || need_hess) ? yd[q] : nullptr,
+            need_hess ? y2[q] : nullptr);
+    }
+
+    constexpr std::size_t p = static_cast<std::size_t>(N - 1);
+    std::size_t node = 0u;
+    auto write_node = [&](std::size_t i, std::size_t j) {
+        Real* value_row = values_out != nullptr ? values_out + node * output_stride : nullptr;
+        Real* grad_row = gradients_out != nullptr ? gradients_out + node * 3u * output_stride : nullptr;
+        Real* hess_row = hessians_out != nullptr ? hessians_out + node * 9u * output_stride : nullptr;
+        if (grad_row != nullptr) {
+            grad_row[2u * output_stride + 0u] = Real(0);
+            grad_row[2u * output_stride + 1u] = Real(0);
+            grad_row[2u * output_stride + 2u] = Real(0);
+            grad_row[2u * output_stride + 3u] = Real(0);
+        }
+        if (hess_row != nullptr) {
+            hess_row[2u * output_stride + 0u] = Real(0);
+            hess_row[2u * output_stride + 1u] = Real(0);
+            hess_row[2u * output_stride + 2u] = Real(0);
+            hess_row[2u * output_stride + 3u] = Real(0);
+            hess_row[5u * output_stride + 0u] = Real(0);
+            hess_row[5u * output_stride + 1u] = Real(0);
+            hess_row[5u * output_stride + 2u] = Real(0);
+            hess_row[5u * output_stride + 3u] = Real(0);
+            hess_row[6u * output_stride + 0u] = Real(0);
+            hess_row[6u * output_stride + 1u] = Real(0);
+            hess_row[6u * output_stride + 2u] = Real(0);
+            hess_row[6u * output_stride + 3u] = Real(0);
+            hess_row[7u * output_stride + 0u] = Real(0);
+            hess_row[7u * output_stride + 1u] = Real(0);
+            hess_row[7u * output_stride + 2u] = Real(0);
+            hess_row[7u * output_stride + 3u] = Real(0);
+            hess_row[8u * output_stride + 0u] = Real(0);
+            hess_row[8u * output_stride + 1u] = Real(0);
+            hess_row[8u * output_stride + 2u] = Real(0);
+            hess_row[8u * output_stride + 3u] = Real(0);
+        }
+        for (std::size_t q = 0; q < 4u; ++q) {
+            const Real x_value = xv[q][i];
+            const Real y_value = yv[q][j];
+            if (value_row != nullptr) {
+                value_row[q] = x_value * y_value;
+            }
+            if (grad_row != nullptr) {
+                grad_row[0u * output_stride + q] = xd[q][i] * y_value;
+                grad_row[1u * output_stride + q] = x_value * yd[q][j];
+            }
+            if (hess_row != nullptr) {
+                const Real hxy = xd[q][i] * yd[q][j];
+                hess_row[0u * output_stride + q] = x2[q][i] * y_value;
+                hess_row[1u * output_stride + q] = hxy;
+                hess_row[3u * output_stride + q] = hxy;
+                hess_row[4u * output_stride + q] = x_value * y2[q][j];
+            }
+        }
+        ++node;
+    };
+
+    write_node(0u, 0u);
+    write_node(p, 0u);
+    write_node(p, p);
+    write_node(0u, p);
+    for (std::size_t i = 1u; i < p; ++i) {
+        write_node(i, 0u);
+    }
+    for (std::size_t j = 1u; j < p; ++j) {
+        write_node(p, j);
+    }
+    for (std::size_t i = p - 1u; i > 0u; --i) {
+        write_node(i, p);
+    }
+    for (std::size_t j = p - 1u; j > 0u; --j) {
+        write_node(0u, j);
+    }
+    for (std::size_t j = 1u; j < p; ++j) {
+        for (std::size_t i = 1u; i < p; ++i) {
+            write_node(i, j);
+        }
+    }
+}
+
+SVMP_LAGRANGE_NOINLINE SVMP_LAGRANGE_ALIGN64 void evaluate_quad_order8_gradients_product_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    constexpr int N = 9;
+    constexpr std::size_t p = 8u;
+    Real xv[4][N];
+    Real xd[4][N];
+    Real yv[4][N];
+    Real yd[4][N];
+
+    for (std::size_t q = 0; q < 4u; ++q) {
+        fill_line_values_product_derivatives<N>(points[q][0], xv[q], xd[q], nullptr);
+        fill_line_values_product_derivatives<N>(points[q][1], yv[q], yd[q], nullptr);
+    }
+
+    std::size_t node = 0u;
+    auto write_node = [&](std::size_t i, std::size_t j) {
+        Real* SVMP_RESTRICT row = gradients_out + node * 3u * output_stride;
+        row[0u] = xd[0][i] * yv[0][j];
+        row[1u] = xd[1][i] * yv[1][j];
+        row[2u] = xd[2][i] * yv[2][j];
+        row[3u] = xd[3][i] * yv[3][j];
+        row[output_stride + 0u] = xv[0][i] * yd[0][j];
+        row[output_stride + 1u] = xv[1][i] * yd[1][j];
+        row[output_stride + 2u] = xv[2][i] * yd[2][j];
+        row[output_stride + 3u] = xv[3][i] * yd[3][j];
+        row[2u * output_stride + 0u] = Real(0);
+        row[2u * output_stride + 1u] = Real(0);
+        row[2u * output_stride + 2u] = Real(0);
+        row[2u * output_stride + 3u] = Real(0);
+        ++node;
+    };
+
+    write_node(0u, 0u);
+    write_node(p, 0u);
+    write_node(p, p);
+    write_node(0u, p);
+    for (std::size_t i = 1u; i < p; ++i) {
+        write_node(i, 0u);
+    }
+    for (std::size_t j = 1u; j < p; ++j) {
+        write_node(p, j);
+    }
+    for (std::size_t i = p - 1u; i > 0u; --i) {
+        write_node(i, p);
+    }
+    for (std::size_t j = p - 1u; j > 0u; --j) {
+        write_node(0u, j);
+    }
+    for (std::size_t j = 1u; j < p; ++j) {
+        for (std::size_t i = 1u; i < p; ++i) {
+            write_node(i, j);
+        }
+    }
+}
+
+template<int N>
+void evaluate_line_gradients_horner_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    const Real* SVMP_RESTRICT d_coeffs,
+    Real* SVMP_RESTRICT gradients_out) {
+    const Real x0 = points[0][0];
+    const Real x1 = points[1][0];
+    const Real x2 = points[2][0];
+    const Real x3 = points[3][0];
+
+    for (std::size_t node = 0; node < static_cast<std::size_t>(N); ++node) {
+        const std::size_t i = line_public_axis_index<N>(node);
+        const Real* c = d_coeffs + i * static_cast<std::size_t>(N - 1);
+        Real r0 = c[N - 2];
+        Real r1 = c[N - 2];
+        Real r2 = c[N - 2];
+        Real r3 = c[N - 2];
+        for (int k = N - 2; k > 0; --k) {
+            const Real ck = c[k - 1];
+            r0 = r0 * x0 + ck;
+            r1 = r1 * x1 + ck;
+            r2 = r2 * x2 + ck;
+            r3 = r3 * x3 + ck;
+        }
+        Real* row = gradients_out + node * 3u * output_stride;
+        row[0] = r0;
+        row[1] = r1;
+        row[2] = r2;
+        row[3] = r3;
+        row[output_stride + 0u] = Real(0);
+        row[output_stride + 1u] = Real(0);
+        row[output_stride + 2u] = Real(0);
+        row[output_stride + 3u] = Real(0);
+        row[2u * output_stride + 0u] = Real(0);
+        row[2u * output_stride + 1u] = Real(0);
+        row[2u * output_stride + 2u] = Real(0);
+        row[2u * output_stride + 3u] = Real(0);
+    }
+}
+
+bool try_evaluate_line_values_horner_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    const Real* SVMP_RESTRICT v_coeffs,
+    int n_axis,
+    Real* SVMP_RESTRICT values_out) {
+    (void)v_coeffs;
+    switch (n_axis) {
+        case 5:
+            evaluate_line_order4_values_q4(points, output_stride, values_out);
+            return true;
+        case 6:
+            evaluate_line_values_product_q4<6>(points, output_stride, values_out);
+            return true;
+        case 7:
+            evaluate_line_values_product_q4<7>(points, output_stride, values_out);
+            return true;
+        case 8:
+            evaluate_line_values_product_q4<8>(points, output_stride, values_out);
+            return true;
+        case 9:
+            evaluate_line_values_product_q4<9>(points, output_stride, values_out);
+            return true;
+        default:
+            return false;
+    }
+}
+
+bool try_evaluate_line_gradients_horner_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    const Real* SVMP_RESTRICT d_coeffs,
+    int n_axis,
+    Real* SVMP_RESTRICT gradients_out) {
+    switch (n_axis) {
+        case 5:
+            evaluate_line_gradients_horner_q4<5>(points, output_stride, d_coeffs, gradients_out);
+            return true;
+        case 6:
+            evaluate_line_gradients_horner_q4<6>(points, output_stride, d_coeffs, gradients_out);
+            return true;
+        case 7:
+            evaluate_line_gradients_horner_q4<7>(points, output_stride, d_coeffs, gradients_out);
+            return true;
+        case 8:
+            evaluate_line_gradients_horner_q4<8>(points, output_stride, d_coeffs, gradients_out);
+            return true;
+        case 9:
+            evaluate_line_gradients_horner_q4<9>(points, output_stride, d_coeffs, gradients_out);
+            return true;
+        default:
+            return false;
+    }
+}
+
+SVMP_LAGRANGE_NOINLINE bool try_evaluate_line_hessians_product_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    int n_axis,
+    Real* SVMP_RESTRICT hessians_out) {
+    switch (n_axis) {
+        case 5:
+            evaluate_line_hessians_product_q4<5>(points, output_stride, hessians_out);
+            return true;
+        case 6:
+            evaluate_line_hessians_product_q4<6>(points, output_stride, hessians_out);
+            return true;
+        case 7:
+            evaluate_line_hessians_product_q4<7>(points, output_stride, hessians_out);
+            return true;
+        case 8:
+            evaluate_line_hessians_product_q4<8>(points, output_stride, hessians_out);
+            return true;
+        case 9:
+            evaluate_line_hessians_product_q4<9>(points, output_stride, hessians_out);
+            return true;
+        default:
+            return false;
+    }
+}
+
+SVMP_LAGRANGE_NOINLINE bool try_evaluate_line_all_product_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    int n_axis,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    switch (n_axis) {
+        case 5:
+            evaluate_line_all_product_q4<5>(
+                points, output_stride, values_out, gradients_out, hessians_out);
+            return true;
+        case 6:
+            evaluate_line_all_product_q4<6>(
+                points, output_stride, values_out, gradients_out, hessians_out);
+            return true;
+        case 7:
+            evaluate_line_all_product_q4<7>(
+                points, output_stride, values_out, gradients_out, hessians_out);
+            return true;
+        case 8:
+            evaluate_line_all_product_q4<8>(
+                points, output_stride, values_out, gradients_out, hessians_out);
+            return true;
+        case 9:
+            evaluate_line_all_product_q4<9>(
+                points, output_stride, values_out, gradients_out, hessians_out);
+            return true;
+        default:
+            return false;
+    }
+}
+
+SVMP_LAGRANGE_NOINLINE bool try_evaluate_quad_values_product_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    int n_axis,
+    Real* SVMP_RESTRICT values_out) {
+    switch (n_axis) {
+        case 5:
+            evaluate_quad_values_product_q4<5>(points, output_stride, values_out);
+            return true;
+        case 6:
+            evaluate_quad_values_product_q4<6>(points, output_stride, values_out);
+            return true;
+        case 7:
+            evaluate_quad_values_product_q4<7>(points, output_stride, values_out);
+            return true;
+        case 8:
+            evaluate_quad_values_product_q4<8>(points, output_stride, values_out);
+            return true;
+        case 9:
+            evaluate_quad_values_product_q4<9>(points, output_stride, values_out);
+            return true;
+        default:
+            return false;
+    }
+}
+
+SVMP_LAGRANGE_NOINLINE bool try_evaluate_quad_derivatives_product_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    int n_axis,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    switch (n_axis) {
+        case 5:
+            evaluate_quad_derivatives_product_q4<5>(
+                points, output_stride, values_out, gradients_out, hessians_out);
+            return true;
+        case 6:
+            evaluate_quad_derivatives_product_q4<6>(
+                points, output_stride, values_out, gradients_out, hessians_out);
+            return true;
+        case 7:
+            evaluate_quad_derivatives_product_q4<7>(
+                points, output_stride, values_out, gradients_out, hessians_out);
+            return true;
+        case 8:
+            evaluate_quad_derivatives_product_q4<8>(
+                points, output_stride, values_out, gradients_out, hessians_out);
+            return true;
+        case 9:
+            evaluate_quad_derivatives_product_q4<9>(
+                points, output_stride, values_out, gradients_out, hessians_out);
+            return true;
+        default:
+            return false;
+    }
+}
+
+void evaluate_tensor_product_points_strided(
+    LagrangeTopology topology,
+    const std::vector<std::array<std::size_t, 3>>& tensor_indices,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    const Real* v_coeffs,
+    const Real* d_coeffs,
+    const Real* d2_coeffs,
+    const Real* barycentric_weights,
+    int n_axis,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    const std::size_t num_qpts = points.size();
+    if (num_qpts == 0 || tensor_indices.empty()) {
+        return;
+    }
+
+    const bool need_grad = gradients_out != nullptr;
+    const bool need_hess = hessians_out != nullptr;
+    const bool values_only = values_out != nullptr && !need_grad && !need_hess;
+    const bool gradients_only = values_out == nullptr && need_grad && !need_hess;
+    const bool hessians_only = values_out == nullptr && gradients_out == nullptr && need_hess;
+    const bool all_outputs = values_out != nullptr && need_grad && need_hess;
+    const AxisDeriv level = need_hess
+        ? AxisDeriv::ValuesAndFirstAndSecond
+        : (need_grad ? AxisDeriv::ValuesAndFirst : AxisDeriv::ValuesOnly);
+
+    if (topology == LagrangeTopology::Line && num_qpts == 4u) {
+        if (values_only &&
+            try_evaluate_line_values_horner_q4(
+                points, output_stride, v_coeffs, n_axis, values_out)) {
+            return;
+        }
+        if (gradients_only &&
+            try_evaluate_line_gradients_horner_q4(
+                points, output_stride, d_coeffs, n_axis, gradients_out)) {
+            return;
+        }
+        if (hessians_only &&
+            try_evaluate_line_hessians_product_q4(
+                points, output_stride, n_axis, hessians_out)) {
+            return;
+        }
+        if (all_outputs &&
+            try_evaluate_line_all_product_q4(
+                points, output_stride, n_axis, values_out, gradients_out, hessians_out)) {
+            return;
+        }
+    }
+    if (topology == LagrangeTopology::Quadrilateral &&
+        values_only &&
+        num_qpts == 4u &&
+        try_evaluate_quad_values_product_q4(points, output_stride, n_axis, values_out)) {
+        return;
+    }
+    if (topology == LagrangeTopology::Quadrilateral &&
+        gradients_only &&
+        num_qpts == 4u &&
+        n_axis == 5) {
+        evaluate_quad_order4_gradients_q4(points, output_stride, gradients_out);
+        return;
+    }
+    if (topology == LagrangeTopology::Quadrilateral &&
+        gradients_only &&
+        num_qpts == 4u &&
+        n_axis == 9) {
+        evaluate_quad_order8_gradients_product_q4(points, output_stride, gradients_out);
+        return;
+    }
+    if (topology == LagrangeTopology::Quadrilateral &&
+        (gradients_only || hessians_only || all_outputs) &&
+        num_qpts == 4u &&
+        try_evaluate_quad_derivatives_product_q4(
+            points, output_stride, n_axis, values_out, gradients_out, hessians_out)) {
+        return;
+    }
+
+    auto& scratch = evaluate_scratch();
+    AxisBatchScratch& x_batch = scratch.axis_x_batch;
+    AxisBatchScratch& y_batch = scratch.axis_y_batch;
+    AxisBatchScratch& z_batch = scratch.axis_z_batch;
+
+    const bool has_y = topology != LagrangeTopology::Line;
+    const bool has_z = topology == LagrangeTopology::Hexahedron;
+    const std::size_t axis_stride = static_cast<std::size_t>(n_axis);
+    const bool use_product_axis_batch =
+        has_z &&
+        gradients_only &&
+        num_qpts == 4u &&
+        n_axis >= 5 &&
+        n_axis <= 9;
+    auto fill_tensor_axis_batch = [&](AxisBatchScratch& batch, std::size_t component) {
+        if (use_product_axis_batch &&
+            try_fill_axis_batch_product_q4(batch, points, component, n_axis, level)) {
+            return;
+        }
+        fill_axis_batch(batch, points, component, v_coeffs, d_coeffs, d2_coeffs,
+                        barycentric_weights, n_axis, level);
+    };
+
+    fill_tensor_axis_batch(x_batch, 0u);
+    if (!has_y) {
+        if (values_only) {
+            if (num_qpts == 4u) {
+                for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+                    const std::size_t i = tensor_indices[node][0];
+                    Real* value_row = values_out + node * output_stride;
+                    value_row[0] = x_batch.values[i];
+                    value_row[1] = x_batch.values[axis_stride + i];
+                    value_row[2] = x_batch.values[2u * axis_stride + i];
+                    value_row[3] = x_batch.values[3u * axis_stride + i];
+                }
+                return;
+            }
+            for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+                const std::size_t i = tensor_indices[node][0];
+                Real* value_row = values_out + node * output_stride;
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    value_row[q] = x_batch.values[q * axis_stride + i];
+                }
+            }
+            return;
+        }
+
+        if (gradients_only) {
+            for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+                const std::size_t i = tensor_indices[node][0];
+                Real* grad_row = gradients_out + node * 3u * output_stride;
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    grad_row[0u * output_stride + q] =
+                        x_batch.first[q * axis_stride + i];
+                    grad_row[1u * output_stride + q] = Real(0);
+                    grad_row[2u * output_stride + q] = Real(0);
+                }
+            }
+            return;
+        }
+
+        for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+            const std::size_t i = tensor_indices[node][0];
+            Real* value_row = values_out ? values_out + node * output_stride : nullptr;
+            Real* grad_row = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
+            Real* hess_row = hessians_out ? hessians_out + node * 9u * output_stride : nullptr;
+
+            for (std::size_t q = 0; q < num_qpts; ++q) {
+                const std::size_t q_axis = q * axis_stride + i;
+                if (value_row != nullptr) {
+                    value_row[q] = x_batch.values[q_axis];
+                }
+                if (need_grad) {
+                    grad_row[0u * output_stride + q] = x_batch.first[q_axis];
+                    grad_row[1u * output_stride + q] = Real(0);
+                    grad_row[2u * output_stride + q] = Real(0);
+                }
+                if (need_hess) {
+                    hess_row[0u * output_stride + q] = x_batch.second[q_axis];
+                    hess_row[1u * output_stride + q] = Real(0);
+                    hess_row[2u * output_stride + q] = Real(0);
+                    hess_row[3u * output_stride + q] = Real(0);
+                    hess_row[4u * output_stride + q] = Real(0);
+                    hess_row[5u * output_stride + q] = Real(0);
+                    hess_row[6u * output_stride + q] = Real(0);
+                    hess_row[7u * output_stride + q] = Real(0);
+                    hess_row[8u * output_stride + q] = Real(0);
+                }
+            }
+        }
+        return;
+    }
+    const bool use_tensor_tables =
+        has_z ||
+        (axis_stride == 8u && !(need_hess && values_out == nullptr && gradients_out == nullptr));
+    if (use_tensor_tables) {
+        fill_tensor_axis_batch(y_batch, 1u);
+    } else if (has_y) {
+        fill_tensor_axis_batch(y_batch, 1u);
+    }
+    if (has_z) {
+        fill_tensor_axis_batch(z_batch, 2u);
+    }
+
+    if (use_tensor_tables) {
+        const std::size_t ny = axis_stride;
+        const std::size_t nz = has_z ? axis_stride : 1u;
+        const std::size_t nyz = ny * nz;
+        const std::size_t table_count = num_qpts * nyz;
+
+        if (has_z && num_qpts == 4u && output_stride == 4u) {
+            if (values_only &&
+                evaluate_tensor_product_values_stride4_q4_transposed(
+                    tensor_indices, axis_stride, x_batch, y_batch, z_batch, values_out)) {
+                return;
+            }
+            if (gradients_only &&
+                evaluate_tensor_product_gradients_stride4_q4_transposed(
+                    tensor_indices, axis_stride, x_batch, y_batch, z_batch, gradients_out)) {
+                return;
+            }
+            if (hessians_only &&
+                evaluate_tensor_product_second_stride4_q4_transposed<false>(
+                    tensor_indices, axis_stride, x_batch, y_batch, z_batch,
+                    nullptr, nullptr, hessians_out)) {
+                return;
+            }
+            if (all_outputs &&
+                evaluate_tensor_product_second_stride4_q4_transposed<true>(
+                    tensor_indices, axis_stride, x_batch, y_batch, z_batch,
+                    values_out, gradients_out, hessians_out)) {
+                return;
+            }
+        }
+
+        Real Mvv_stack[kMaxStackYZ];
+        Real Mdv_stack[kMaxStackYZ];
+        Real Mvd_stack[kMaxStackYZ];
+        Real Md2v_stack[kMaxStackYZ];
+        Real Mvd2_stack[kMaxStackYZ];
+        Real Mdd_stack[kMaxStackYZ];
+
+        Real* Mvv;
+        Real* Mdv;
+        Real* Mvd;
+        Real* Md2v;
+        Real* Mvd2;
+        Real* Mdd;
+        if (table_count <= kMaxStackYZ) {
+            Mvv = Mvv_stack;
+            Mdv = Mdv_stack;
+            Mvd = Mvd_stack;
+            Md2v = Md2v_stack;
+            Mvd2 = Mvd2_stack;
+            Mdd = Mdd_stack;
+        } else {
+            auto& tables = scratch.tensor_tables;
+            tables.resizeFor(table_count);
+            Mvv = tables.vv.data();
+            Mdv = tables.dv.data();
+            Mvd = tables.vd.data();
+            Md2v = tables.d2v.data();
+            Mvd2 = tables.vd2.data();
+            Mdd = tables.dd.data();
+        }
+
+        for (std::size_t q = 0; q < num_qpts; ++q) {
+            const std::size_t q_axis = q * axis_stride;
+            const std::size_t q_table = q * nyz;
+            for (std::size_t j = 0; j < ny; ++j) {
+                const Real yv = y_batch.values[q_axis + j];
+                const Real yd = (need_grad || need_hess) ? y_batch.first[q_axis + j] : Real(0);
+                const Real y2 = need_hess ? y_batch.second[q_axis + j] : Real(0);
+                for (std::size_t k = 0; k < nz; ++k) {
+                    const std::size_t slot = q_table + j * nz + k;
+                    const Real zv = has_z ? z_batch.values[q_axis + k] : Real(1);
+                    Mvv[slot] = yv * zv;
+                    if (need_grad || need_hess) {
+                        const Real zd = has_z ? z_batch.first[q_axis + k] : Real(0);
+                        Mdv[slot] = yd * zv;
+                        Mvd[slot] = yv * zd;
+                    }
+                    if (need_hess) {
+                        const Real zd = has_z ? z_batch.first[q_axis + k] : Real(0);
+                        const Real z2 = has_z ? z_batch.second[q_axis + k] : Real(0);
+                        Md2v[slot] = y2 * zv;
+                        Mvd2[slot] = yv * z2;
+                        Mdd[slot] = yd * zd;
+                    }
+                }
+            }
+        }
+
+        if (values_only) {
+            if (has_z && num_qpts == 4u) {
+                for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+                    const auto& idx = tensor_indices[node];
+                    const std::size_t i = idx[0];
+                    const std::size_t jk = idx[1] * nz + idx[2];
+                    Real* value_row = values_out + node * output_stride;
+
+                    write_tensor_product_value_strided_q<0>(
+                        axis_stride, nyz, i, jk, x_batch, Mvv, value_row);
+                    write_tensor_product_value_strided_q<1>(
+                        axis_stride, nyz, i, jk, x_batch, Mvv, value_row);
+                    write_tensor_product_value_strided_q<2>(
+                        axis_stride, nyz, i, jk, x_batch, Mvv, value_row);
+                    write_tensor_product_value_strided_q<3>(
+                        axis_stride, nyz, i, jk, x_batch, Mvv, value_row);
+                }
+                return;
+            }
+            for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+                const auto& idx = tensor_indices[node];
+                const std::size_t i = idx[0];
+                const std::size_t jk = idx[1] * nz + idx[2];
+                Real* value_row = values_out + node * output_stride;
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    const std::size_t q_axis = q * axis_stride;
+                    const std::size_t slot = q * nyz + jk;
+                    value_row[q] = x_batch.values[q_axis + i] * Mvv[slot];
+                }
+            }
+            return;
+        }
+
+        if (gradients_only) {
+            if (has_z && num_qpts == 4u) {
+                if (output_stride == 4u) {
+                    for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+                        const auto& idx = tensor_indices[node];
+                        const std::size_t i = idx[0];
+                        const std::size_t jk = idx[1] * nz + idx[2];
+                        Real* grad_row = gradients_out + node * 3u * output_stride;
+
+                        write_tensor_product_gradient_stride4_q<0>(
+                            axis_stride, nyz, i, jk, x_batch, Mvv, Mdv, Mvd, grad_row);
+                        write_tensor_product_gradient_stride4_q<1>(
+                            axis_stride, nyz, i, jk, x_batch, Mvv, Mdv, Mvd, grad_row);
+                        write_tensor_product_gradient_stride4_q<2>(
+                            axis_stride, nyz, i, jk, x_batch, Mvv, Mdv, Mvd, grad_row);
+                        write_tensor_product_gradient_stride4_q<3>(
+                            axis_stride, nyz, i, jk, x_batch, Mvv, Mdv, Mvd, grad_row);
+                    }
+                } else {
+                    for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+                        const auto& idx = tensor_indices[node];
+                        const std::size_t i = idx[0];
+                        const std::size_t jk = idx[1] * nz + idx[2];
+                        Real* grad_row = gradients_out + node * 3u * output_stride;
+
+                        write_tensor_product_gradient_strided_q<0>(
+                            axis_stride, nyz, i, jk, output_stride, x_batch,
+                            Mvv, Mdv, Mvd, grad_row);
+                        write_tensor_product_gradient_strided_q<1>(
+                            axis_stride, nyz, i, jk, output_stride, x_batch,
+                            Mvv, Mdv, Mvd, grad_row);
+                        write_tensor_product_gradient_strided_q<2>(
+                            axis_stride, nyz, i, jk, output_stride, x_batch,
+                            Mvv, Mdv, Mvd, grad_row);
+                        write_tensor_product_gradient_strided_q<3>(
+                            axis_stride, nyz, i, jk, output_stride, x_batch,
+                            Mvv, Mdv, Mvd, grad_row);
+                    }
+                }
+                return;
+            }
+
+            for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+                const auto& idx = tensor_indices[node];
+                const std::size_t i = idx[0];
+                const std::size_t jk = idx[1] * nz + idx[2];
+                Real* grad_row = gradients_out + node * 3u * output_stride;
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    const std::size_t q_axis = q * axis_stride;
+                    const std::size_t slot = q * nyz + jk;
+                    const Real xv = x_batch.values[q_axis + i];
+                    const Real xd = x_batch.first[q_axis + i];
+                    grad_row[0u * output_stride + q] = xd * Mvv[slot];
+                    grad_row[1u * output_stride + q] = xv * Mdv[slot];
+                    grad_row[2u * output_stride + q] = xv * Mvd[slot];
+                }
+            }
+            return;
+        }
+
+        if (has_z && num_qpts == 4u && hessians_only) {
+            if (output_stride == 4u) {
+                for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+                    const auto& idx = tensor_indices[node];
+                    const std::size_t i = idx[0];
+                    const std::size_t jk = idx[1] * nz + idx[2];
+                    Real* hess_row = hessians_out + node * 9u * output_stride;
+
+                    write_tensor_product_hessian_stride4_q<0>(
+                        axis_stride, nyz, i, jk, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
+                    write_tensor_product_hessian_stride4_q<1>(
+                        axis_stride, nyz, i, jk, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
+                    write_tensor_product_hessian_stride4_q<2>(
+                        axis_stride, nyz, i, jk, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
+                    write_tensor_product_hessian_stride4_q<3>(
+                        axis_stride, nyz, i, jk, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
+                }
+            } else {
+                for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+                    const auto& idx = tensor_indices[node];
+                    const std::size_t i = idx[0];
+                    const std::size_t jk = idx[1] * nz + idx[2];
+                    Real* hess_row = hessians_out + node * 9u * output_stride;
+
+                    write_tensor_product_hessian_strided_q<0>(
+                        axis_stride, nyz, i, jk, output_stride, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
+                    write_tensor_product_hessian_strided_q<1>(
+                        axis_stride, nyz, i, jk, output_stride, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
+                    write_tensor_product_hessian_strided_q<2>(
+                        axis_stride, nyz, i, jk, output_stride, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
+                    write_tensor_product_hessian_strided_q<3>(
+                        axis_stride, nyz, i, jk, output_stride, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
+                }
+            }
+            return;
+        }
+
+        if (has_z && num_qpts == 4u && all_outputs) {
+            if (output_stride == 4u) {
+                for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+                    const auto& idx = tensor_indices[node];
+                    const std::size_t i = idx[0];
+                    const std::size_t jk = idx[1] * nz + idx[2];
+                    Real* value_row = values_out + node * output_stride;
+                    Real* grad_row = gradients_out + node * 3u * output_stride;
+                    Real* hess_row = hessians_out + node * 9u * output_stride;
+
+                    write_tensor_product_all_stride4_q<0>(
+                        axis_stride, nyz, i, jk, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
+                    write_tensor_product_all_stride4_q<1>(
+                        axis_stride, nyz, i, jk, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
+                    write_tensor_product_all_stride4_q<2>(
+                        axis_stride, nyz, i, jk, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
+                    write_tensor_product_all_stride4_q<3>(
+                        axis_stride, nyz, i, jk, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
+                }
+            } else {
+                for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+                    const auto& idx = tensor_indices[node];
+                    const std::size_t i = idx[0];
+                    const std::size_t jk = idx[1] * nz + idx[2];
+                    Real* value_row = values_out + node * output_stride;
+                    Real* grad_row = gradients_out + node * 3u * output_stride;
+                    Real* hess_row = hessians_out + node * 9u * output_stride;
+
+                    write_tensor_product_all_strided_q<0>(
+                        axis_stride, nyz, i, jk, output_stride, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
+                    write_tensor_product_all_strided_q<1>(
+                        axis_stride, nyz, i, jk, output_stride, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
+                    write_tensor_product_all_strided_q<2>(
+                        axis_stride, nyz, i, jk, output_stride, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
+                    write_tensor_product_all_strided_q<3>(
+                        axis_stride, nyz, i, jk, output_stride, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
+                }
+            }
+            return;
+        }
+
+        for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+            const auto& idx = tensor_indices[node];
+            const std::size_t i = idx[0];
+            const std::size_t jk = idx[1] * nz + idx[2];
+
+            Real* value_row = values_out ? values_out + node * output_stride : nullptr;
+            Real* grad_row = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
+            Real* hess_row = hessians_out ? hessians_out + node * 9u * output_stride : nullptr;
+
+            for (std::size_t q = 0; q < num_qpts; ++q) {
+                const std::size_t q_axis = q * axis_stride;
+                const std::size_t slot = q * nyz + jk;
+                const Real xv = x_batch.values[q_axis + i];
+
+                if (value_row != nullptr) {
+                    value_row[q] = xv * Mvv[slot];
+                }
+
+                if (need_grad) {
+                    const Real xd = x_batch.first[q_axis + i];
+                    grad_row[0u * output_stride + q] = xd * Mvv[slot];
+                    grad_row[1u * output_stride + q] = xv * Mdv[slot];
+                    grad_row[2u * output_stride + q] = xv * Mvd[slot];
+                }
+
+                if (need_hess) {
+                    const Real xd = x_batch.first[q_axis + i];
+                    const Real x2 = x_batch.second[q_axis + i];
+                    const Real hxy = xd * Mdv[slot];
+                    const Real hxz = xd * Mvd[slot];
+                    const Real hyz = xv * Mdd[slot];
+                    hess_row[0u * output_stride + q] = x2 * Mvv[slot];
+                    hess_row[4u * output_stride + q] = xv * Md2v[slot];
+                    hess_row[8u * output_stride + q] = xv * Mvd2[slot];
+                    hess_row[1u * output_stride + q] = hxy;
+                    hess_row[3u * output_stride + q] = hxy;
+                    hess_row[2u * output_stride + q] = hxz;
+                    hess_row[6u * output_stride + q] = hxz;
+                    hess_row[5u * output_stride + q] = hyz;
+                    hess_row[7u * output_stride + q] = hyz;
+                }
+            }
+        }
+        return;
+    }
+
+    for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+        const auto& idx = tensor_indices[node];
+        const std::size_t i = idx[0];
+        const std::size_t j = idx[1];
+
+        Real* value_row = values_out ? values_out + node * output_stride : nullptr;
+        Real* grad_row = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
+        Real* hess_row = hessians_out ? hessians_out + node * 9u * output_stride : nullptr;
+
+        for (std::size_t q = 0; q < num_qpts; ++q) {
+            const std::size_t q_axis = q * axis_stride;
+            const Real xv = x_batch.values[q_axis + i];
+            const Real yv = y_batch.values[q_axis + j];
+
+            if (value_row != nullptr) {
+                value_row[q] = xv * yv;
+            }
+
+            if (need_grad) {
+                const Real xd = x_batch.first[q_axis + i];
+                const Real yd = y_batch.first[q_axis + j];
+                grad_row[0u * output_stride + q] = xd * yv;
+                grad_row[1u * output_stride + q] = xv * yd;
+                grad_row[2u * output_stride + q] = Real(0);
+            }
+
+            if (need_hess) {
+                const Real xd = x_batch.first[q_axis + i];
+                const Real yd = y_batch.first[q_axis + j];
+                const Real x2 = x_batch.second[q_axis + i];
+                const Real y2 = y_batch.second[q_axis + j];
+                const Real hxy = xd * yd;
+
+                hess_row[0u * output_stride + q] = x2 * yv;
+                hess_row[4u * output_stride + q] = xv * y2;
+                hess_row[8u * output_stride + q] = Real(0);
+                hess_row[1u * output_stride + q] = hxy;
+                hess_row[3u * output_stride + q] = hxy;
+                hess_row[2u * output_stride + q] = Real(0);
+                hess_row[6u * output_stride + q] = Real(0);
+                hess_row[5u * output_stride + q] = Real(0);
+                hess_row[7u * output_stride + q] = Real(0);
+            }
+        }
+    }
+}
+
+void evaluate_wedge_points_strided(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    const std::vector<std::array<std::size_t, 2>>& wedge_indices,
+    const std::vector<std::size_t>& wedge_node_by_tri_z,
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    const Real* v_coeffs,
+    const Real* d_coeffs,
+    const Real* d2_coeffs,
+    const Real* barycentric_weights,
+    int n_axis,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    if (points.empty() || wedge_indices.empty()) {
+        return;
+    }
+
+    const bool want_values = values_out != nullptr;
+    const bool need_grad = gradients_out != nullptr;
+    const bool need_hess = hessians_out != nullptr;
+    const bool values_only = want_values && !need_grad && !need_hess;
+    const bool gradients_only = !want_values && need_grad && !need_hess;
+    const bool hessians_only = !want_values && !need_grad && need_hess;
+    const bool all_outputs = want_values && need_grad && need_hess;
+    const bool use_batched_wedge =
+        (values_only && order <= 3) ||
+        (gradients_only && order >= 2) ||
+        (hessians_only && order >= 3) ||
+        (all_outputs && order >= 3);
+    if (values_only &&
+        order >= 4 &&
+        order <= 8 &&
+        try_evaluate_wedge_values_product_q4(
+            simplex_exponents, wedge_indices, order, points, output_stride, values_out)) {
+        return;
+    }
+    const AxisDeriv level = need_hess
+        ? AxisDeriv::ValuesAndFirstAndSecond
+        : (need_grad ? AxisDeriv::ValuesAndFirst : AxisDeriv::ValuesOnly);
+
+    LagrangeEvaluateScratch& scratch = evaluate_scratch();
+    const std::size_t tri_count = simplex_exponents.size();
+    if (use_batched_wedge) {
+        const std::size_t num_qpts = points.size();
+        const std::size_t tri_stride = num_qpts;
+        if (num_qpts == 4u &&
+            output_stride == 4u &&
+            (gradients_only || hessians_only || all_outputs) &&
+            order >= 3 &&
+            order <= 8 &&
+            wedge_node_by_tri_z.size() == tri_count * static_cast<std::size_t>(n_axis)) {
+            const bool use_product_axis_batch =
+                gradients_only &&
+                n_axis >= 5 &&
+                n_axis <= 9;
+            if (!use_product_axis_batch ||
+                !try_fill_axis_batch_product_q4(
+                    scratch.axis_z_batch, points, 2u, n_axis, level)) {
+                fill_axis_batch(scratch.axis_z_batch,
+                                points,
+                                2u,
+                                v_coeffs,
+                                d_coeffs,
+                                d2_coeffs,
+                                barycentric_weights,
+                                n_axis,
+                                level);
+            }
+            if (need_hess) {
+                if (try_evaluate_wedge_fused_stride4_q4<true>(
+                        simplex_exponents, wedge_node_by_tri_z, order, points,
+                        scratch.axis_z_batch, n_axis, values_out, gradients_out, hessians_out)) {
+                    return;
+                }
+            } else if (try_evaluate_wedge_fused_stride4_q4<false>(
+                           simplex_exponents, wedge_node_by_tri_z, order, points,
+                           scratch.axis_z_batch, n_axis, values_out, gradients_out, hessians_out)) {
+                return;
+            }
+        }
+
+        const std::size_t tri_values_size = tri_count * tri_stride;
+        scratch.wedge_tri_values_batch.resize(tri_values_size);
+        if (need_grad || need_hess) {
+            scratch.wedge_tri_gradient_batch.resize(tri_count * 2u * tri_stride);
+        }
+        if (need_hess) {
+            scratch.wedge_tri_hessian_batch.resize(tri_count * 3u * tri_stride);
+        }
+
+        detail::evaluate_triangle_simplex_basis_wedge_components_strided(
+            simplex_exponents,
+            order,
+            points,
+            tri_stride,
+            scratch.wedge_tri_values_batch.data(),
+            (need_grad || need_hess) ? scratch.wedge_tri_gradient_batch.data() : nullptr,
+            need_hess ? scratch.wedge_tri_hessian_batch.data() : nullptr);
+
+        const bool use_product_axis_batch =
+            gradients_only &&
+            points.size() == 4u &&
+            n_axis >= 5 &&
+            n_axis <= 9;
+        if (!use_product_axis_batch ||
+            !try_fill_axis_batch_product_q4(
+                scratch.axis_z_batch, points, 2u, n_axis, level)) {
+            fill_axis_batch(scratch.axis_z_batch,
+                            points,
+                            2u,
+                            v_coeffs,
+                            d_coeffs,
+                            d2_coeffs,
+                            barycentric_weights,
+                            n_axis,
+                            level);
+        }
+
+        const std::size_t axis_stride = static_cast<std::size_t>(n_axis);
+        if (all_outputs) {
+            if (num_qpts == 4u) {
+                if (output_stride == 4u) {
+                    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
+                        const auto& index = wedge_indices[node];
+                        const std::size_t tri = index[0];
+                        const std::size_t z = index[1];
+                        Real* value_row = values_out + node * output_stride;
+                        Real* g = gradients_out + node * 3u * output_stride;
+                        Real* H = hessians_out + node * 9u * output_stride;
+                        const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
+                        const Real* tri_H = scratch.wedge_tri_hessian_batch.data() + tri * 3u * tri_stride;
+                        const Real* tri_values = scratch.wedge_tri_values_batch.data();
+
+                        write_wedge_all_stride4_q<0>(
+                            tri_stride, axis_stride, tri, z,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
+                        write_wedge_all_stride4_q<1>(
+                            tri_stride, axis_stride, tri, z,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
+                        write_wedge_all_stride4_q<2>(
+                            tri_stride, axis_stride, tri, z,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
+                        write_wedge_all_stride4_q<3>(
+                            tri_stride, axis_stride, tri, z,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
+                    }
+                } else {
+                    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
+                        const auto& index = wedge_indices[node];
+                        const std::size_t tri = index[0];
+                        const std::size_t z = index[1];
+                        Real* value_row = values_out + node * output_stride;
+                        Real* g = gradients_out + node * 3u * output_stride;
+                        Real* H = hessians_out + node * 9u * output_stride;
+                        const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
+                        const Real* tri_H = scratch.wedge_tri_hessian_batch.data() + tri * 3u * tri_stride;
+                        const Real* tri_values = scratch.wedge_tri_values_batch.data();
+
+                        write_wedge_all_strided_q<0>(
+                            tri_stride, axis_stride, tri, z, output_stride,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
+                        write_wedge_all_strided_q<1>(
+                            tri_stride, axis_stride, tri, z, output_stride,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
+                        write_wedge_all_strided_q<2>(
+                            tri_stride, axis_stride, tri, z, output_stride,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
+                        write_wedge_all_strided_q<3>(
+                            tri_stride, axis_stride, tri, z, output_stride,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
+                    }
+                }
+                return;
+            }
+
+            for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
+                const auto& index = wedge_indices[node];
+                const std::size_t tri = index[0];
+                const std::size_t z = index[1];
+                Real* value_row = values_out + node * output_stride;
+                Real* g = gradients_out + node * 3u * output_stride;
+                Real* H = hessians_out + node * 9u * output_stride;
+                const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
+                const Real* tri_H = scratch.wedge_tri_hessian_batch.data() + tri * 3u * tri_stride;
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    const std::size_t tri_q = tri * tri_stride + q;
+                    const std::size_t z_q = q * axis_stride + z;
+                    const Real tri_v = scratch.wedge_tri_values_batch[tri_q];
+                    const Real zv = scratch.axis_z_batch.values[z_q];
+                    const Real zd = scratch.axis_z_batch.first[z_q];
+                    const Real tri_gx = tri_g[0u * tri_stride + q];
+                    const Real tri_gy = tri_g[1u * tri_stride + q];
+                    const Real tri_hxx = tri_H[0u * tri_stride + q];
+                    const Real tri_hxy = tri_H[1u * tri_stride + q];
+                    const Real tri_hyy = tri_H[2u * tri_stride + q];
+                    const Real hxz = tri_gx * zd;
+                    const Real hxy = tri_hxy * zv;
+                    const Real hyz = tri_gy * zd;
+
+                    value_row[q] = tri_v * zv;
+                    g[0u * output_stride + q] = tri_gx * zv;
+                    g[1u * output_stride + q] = tri_gy * zv;
+                    g[2u * output_stride + q] = tri_v * zd;
+                    H[0u * output_stride + q] = tri_hxx * zv;
+                    H[1u * output_stride + q] = hxy;
+                    H[2u * output_stride + q] = hxz;
+                    H[3u * output_stride + q] = hxy;
+                    H[4u * output_stride + q] = tri_hyy * zv;
+                    H[5u * output_stride + q] = hyz;
+                    H[6u * output_stride + q] = hxz;
+                    H[7u * output_stride + q] = hyz;
+                    H[8u * output_stride + q] = tri_v * scratch.axis_z_batch.second[z_q];
+                }
+            }
+            return;
+        }
+
+        if (hessians_only) {
+            if (num_qpts == 4u) {
+                if (output_stride == 4u) {
+                    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
+                        const auto& index = wedge_indices[node];
+                        const std::size_t tri = index[0];
+                        const std::size_t z = index[1];
+                        Real* H = hessians_out + node * 9u * output_stride;
+                        const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
+                        const Real* tri_H = scratch.wedge_tri_hessian_batch.data() + tri * 3u * tri_stride;
+                        const Real* tri_values = scratch.wedge_tri_values_batch.data();
+
+                        write_wedge_hessian_stride4_q<0>(
+                            tri_stride, axis_stride, tri, z,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
+                        write_wedge_hessian_stride4_q<1>(
+                            tri_stride, axis_stride, tri, z,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
+                        write_wedge_hessian_stride4_q<2>(
+                            tri_stride, axis_stride, tri, z,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
+                        write_wedge_hessian_stride4_q<3>(
+                            tri_stride, axis_stride, tri, z,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
+                    }
+                } else {
+                    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
+                        const auto& index = wedge_indices[node];
+                        const std::size_t tri = index[0];
+                        const std::size_t z = index[1];
+                        Real* H = hessians_out + node * 9u * output_stride;
+                        const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
+                        const Real* tri_H = scratch.wedge_tri_hessian_batch.data() + tri * 3u * tri_stride;
+                        const Real* tri_values = scratch.wedge_tri_values_batch.data();
+
+                        write_wedge_hessian_strided_q<0>(
+                            tri_stride, axis_stride, tri, z, output_stride,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
+                        write_wedge_hessian_strided_q<1>(
+                            tri_stride, axis_stride, tri, z, output_stride,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
+                        write_wedge_hessian_strided_q<2>(
+                            tri_stride, axis_stride, tri, z, output_stride,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
+                        write_wedge_hessian_strided_q<3>(
+                            tri_stride, axis_stride, tri, z, output_stride,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
+                    }
+                }
+                return;
+            }
+
+            for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
+                const auto& index = wedge_indices[node];
+                const std::size_t tri = index[0];
+                const std::size_t z = index[1];
+                Real* H = hessians_out + node * 9u * output_stride;
+                const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
+                const Real* tri_H = scratch.wedge_tri_hessian_batch.data() + tri * 3u * tri_stride;
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    const std::size_t tri_q = tri * tri_stride + q;
+                    const std::size_t z_q = q * axis_stride + z;
+                    const Real tri_v = scratch.wedge_tri_values_batch[tri_q];
+                    const Real zv = scratch.axis_z_batch.values[z_q];
+                    const Real zd = scratch.axis_z_batch.first[z_q];
+                    const Real tri_gx = tri_g[0u * tri_stride + q];
+                    const Real tri_gy = tri_g[1u * tri_stride + q];
+                    const Real tri_hxx = tri_H[0u * tri_stride + q];
+                    const Real tri_hxy = tri_H[1u * tri_stride + q];
+                    const Real tri_hyy = tri_H[2u * tri_stride + q];
+                    const Real hxz = tri_gx * zd;
+                    const Real hxy = tri_hxy * zv;
+                    const Real hyz = tri_gy * zd;
+
+                    H[0u * output_stride + q] = tri_hxx * zv;
+                    H[1u * output_stride + q] = hxy;
+                    H[2u * output_stride + q] = hxz;
+                    H[3u * output_stride + q] = hxy;
+                    H[4u * output_stride + q] = tri_hyy * zv;
+                    H[5u * output_stride + q] = hyz;
+                    H[6u * output_stride + q] = hxz;
+                    H[7u * output_stride + q] = hyz;
+                    H[8u * output_stride + q] = tri_v * scratch.axis_z_batch.second[z_q];
+                }
+            }
+            return;
+        }
+
+        if (gradients_only) {
+            if (num_qpts == 4u) {
+                if (output_stride == 4u) {
+                    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
+                        const auto& index = wedge_indices[node];
+                        const std::size_t tri = index[0];
+                        const std::size_t z = index[1];
+                        Real* g = gradients_out + node * 3u * output_stride;
+                        const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
+                        const Real* tri_values = scratch.wedge_tri_values_batch.data();
+
+                        write_wedge_gradient_stride4_q<0>(
+                            tri_stride, axis_stride, tri, z,
+                            tri_values, tri_g, scratch.axis_z_batch, g);
+                        write_wedge_gradient_stride4_q<1>(
+                            tri_stride, axis_stride, tri, z,
+                            tri_values, tri_g, scratch.axis_z_batch, g);
+                        write_wedge_gradient_stride4_q<2>(
+                            tri_stride, axis_stride, tri, z,
+                            tri_values, tri_g, scratch.axis_z_batch, g);
+                        write_wedge_gradient_stride4_q<3>(
+                            tri_stride, axis_stride, tri, z,
+                            tri_values, tri_g, scratch.axis_z_batch, g);
+                    }
+                } else {
+                    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
+                        const auto& index = wedge_indices[node];
+                        const std::size_t tri = index[0];
+                        const std::size_t z = index[1];
+                        Real* g = gradients_out + node * 3u * output_stride;
+                        const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
+                        const Real* tri_values = scratch.wedge_tri_values_batch.data();
+
+                        write_wedge_gradient_strided_q<0>(
+                            tri_stride, axis_stride, tri, z, output_stride,
+                            tri_values, tri_g, scratch.axis_z_batch, g);
+                        write_wedge_gradient_strided_q<1>(
+                            tri_stride, axis_stride, tri, z, output_stride,
+                            tri_values, tri_g, scratch.axis_z_batch, g);
+                        write_wedge_gradient_strided_q<2>(
+                            tri_stride, axis_stride, tri, z, output_stride,
+                            tri_values, tri_g, scratch.axis_z_batch, g);
+                        write_wedge_gradient_strided_q<3>(
+                            tri_stride, axis_stride, tri, z, output_stride,
+                            tri_values, tri_g, scratch.axis_z_batch, g);
+                    }
+                }
+                return;
+            }
+
+            for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
+                const auto& index = wedge_indices[node];
+                const std::size_t tri = index[0];
+                const std::size_t z = index[1];
+                Real* g = gradients_out + node * 3u * output_stride;
+                const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    const std::size_t tri_q = tri * tri_stride + q;
+                    const std::size_t z_q = q * axis_stride + z;
+                    const Real tri_v = scratch.wedge_tri_values_batch[tri_q];
+                    const Real zv = scratch.axis_z_batch.values[z_q];
+                    g[0u * output_stride + q] = tri_g[0u * tri_stride + q] * zv;
+                    g[1u * output_stride + q] = tri_g[1u * tri_stride + q] * zv;
+                    g[2u * output_stride + q] = tri_v * scratch.axis_z_batch.first[z_q];
+                }
+            }
+            return;
+        }
+
+        for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
+            const auto& index = wedge_indices[node];
+            const std::size_t tri = index[0];
+            const std::size_t z = index[1];
+            Real* value_row = values_out ? values_out + node * output_stride : nullptr;
+            Real* g = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
+            Real* H = hessians_out ? hessians_out + node * 9u * output_stride : nullptr;
+
+            for (std::size_t q = 0; q < num_qpts; ++q) {
+                const std::size_t tri_q = tri * tri_stride + q;
+                const std::size_t z_q = q * axis_stride + z;
+                const Real tri_v = scratch.wedge_tri_values_batch[tri_q];
+                const Real zv = scratch.axis_z_batch.values[z_q];
+                if (values_out != nullptr) {
+                    value_row[q] = tri_v * zv;
+                }
+
+                if (need_grad) {
+                    const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
+                    g[0u * output_stride + q] = tri_g[0u * tri_stride + q] * zv;
+                    g[1u * output_stride + q] = tri_g[1u * tri_stride + q] * zv;
+                    g[2u * output_stride + q] = tri_v * scratch.axis_z_batch.first[z_q];
+                }
+
+                if (need_hess) {
+                    const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
+                    const Real* tri_H = scratch.wedge_tri_hessian_batch.data() + tri * 3u * tri_stride;
+                    const Real zd = scratch.axis_z_batch.first[z_q];
+                    const Real hxz = tri_g[0u * tri_stride + q] * zd;
+                    const Real hxy = tri_H[1u * tri_stride + q] * zv;
+                    const Real hyz = tri_g[1u * tri_stride + q] * zd;
+                    H[0u * output_stride + q] = tri_H[0u * tri_stride + q] * zv;
+                    H[1u * output_stride + q] = hxy;
+                    H[2u * output_stride + q] = hxz;
+                    H[3u * output_stride + q] = hxy;
+                    H[4u * output_stride + q] = tri_H[2u * tri_stride + q] * zv;
+                    H[5u * output_stride + q] = hyz;
+                    H[6u * output_stride + q] = hxz;
+                    H[7u * output_stride + q] = hyz;
+                    H[8u * output_stride + q] = tri_v * scratch.axis_z_batch.second[z_q];
+                }
+            }
+        }
+
+        return;
+    }
+
+    scratch.tri_values.resize(tri_count);
+    if (need_grad || need_hess) {
+        scratch.tri_gradient_components.resize(tri_count * 3u);
+    }
+    if (need_hess) {
+        scratch.tri_hessian_components.resize(tri_count * 9u);
+    }
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        const AxisBasisEvaluations z_axis =
+            fill_axis_scratch(scratch.axis_z,
+                              v_coeffs,
+                              d_coeffs,
+                              d2_coeffs,
+                              barycentric_weights,
+                              n_axis,
+                              xi[2],
+                              level);
+        detail::evaluate_triangle_simplex_basis_to(
+            simplex_exponents,
+            order,
+            xi,
+            scratch.tri_values.data(),
+            (need_grad || need_hess) ? scratch.tri_gradient_components.data() : nullptr,
+            need_hess ? scratch.tri_hessian_components.data() : nullptr);
+
+        for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
+            const auto& index = wedge_indices[node];
+            const std::size_t tri = index[0];
+            const std::size_t z = index[1];
+            const Real tri_v = scratch.tri_values[tri];
+            const Real zv = z_axis.values[z];
+
+            if (values_out != nullptr) {
+                values_out[node * output_stride + q] = tri_v * zv;
+            }
+
+            if (need_grad) {
+                const Real* tri_g = scratch.tri_gradient_components.data() + tri * 3u;
+                Real* g = gradients_out + node * 3u * output_stride;
+                g[0u * output_stride + q] = tri_g[0] * zv;
+                g[1u * output_stride + q] = tri_g[1] * zv;
+                g[2u * output_stride + q] = tri_v * z_axis.first[z];
+            }
+
+            if (need_hess) {
+                const Real* tri_g = scratch.tri_gradient_components.data() + tri * 3u;
+                const Real* tri_H = scratch.tri_hessian_components.data() + tri * 9u;
+                const Real zd = z_axis.first[z];
+                const Real hxz = tri_g[0] * zd;
+                const Real hxy = tri_H[1] * zv;
+                const Real hyz = tri_g[1] * zd;
+                Real* H = hessians_out + node * 9u * output_stride;
+                H[0u * output_stride + q] = tri_H[0] * zv;
+                H[1u * output_stride + q] = hxy;
+                H[2u * output_stride + q] = hxz;
+                H[3u * output_stride + q] = hxy;
+                H[4u * output_stride + q] = tri_H[4] * zv;
+                H[5u * output_stride + q] = hyz;
+                H[6u * output_stride + q] = hxz;
+                H[7u * output_stride + q] = hyz;
+                H[8u * output_stride + q] = tri_v * z_axis.second[z];
+            }
+        }
+    }
+}
+
+NormalizedLagrangeRequest normalize_lagrange_request(ElementType element_type, int order) {
+    switch (element_type) {
+        case ElementType::Line3:
+            return {ElementType::Line2, std::max(order, 2)};
+        case ElementType::Triangle6:
+            return {ElementType::Triangle3, std::max(order, 2)};
+        case ElementType::Quad9:
+            return {ElementType::Quad4, std::max(order, 2)};
+        case ElementType::Quad8:
+            throw BasisElementCompatibilityException(
+                "Quad8 is a serendipity element; use SerendipityBasis for Quad8",
+                __FILE__, __LINE__, __func__);
+        case ElementType::Tetra10:
+            return {ElementType::Tetra4, std::max(order, 2)};
+        case ElementType::Hex27:
+            return {ElementType::Hex8, std::max(order, 2)};
+        case ElementType::Hex20:
+            throw BasisElementCompatibilityException(
+                "Hex20 is a serendipity element; use SerendipityBasis for Hex20",
+                __FILE__, __LINE__, __func__);
+        case ElementType::Wedge18:
+            return {ElementType::Wedge6, std::max(order, 2)};
+        case ElementType::Wedge15:
+            throw BasisElementCompatibilityException(
+                "Wedge15 is a serendipity element; use SerendipityBasis for Wedge15",
+                __FILE__, __LINE__, __func__);
+        case ElementType::Pyramid13:
+            throw BasisElementCompatibilityException(
+                "Pyramid13 is a serendipity variant; use SerendipityBasis (Pyramid13) or the complete-family Lagrange path via LagrangeBasis (Pyramid5, order >= 2)",
+                __FILE__, __LINE__, __func__);
+        case ElementType::Pyramid14:
+            return {ElementType::Pyramid5, std::max(order, 2)};
+        default:
+            return {element_type, order};
+    }
+}
+
+} // namespace
+
+void prewarm_lagrange_basis_scratch(int max_order, std::size_t max_qpts) {
+    evaluate_scratch().prewarm(max_order, max_qpts);
+}
+
+LagrangeBasis::LagrangeBasis(ElementType type, int order)
+    : element_type_(type), dimension_(0), order_(order) {
+    const NormalizedLagrangeRequest normalized = normalize_lagrange_request(element_type_, order_);
+    element_type_ = normalized.element_type;
+    order_ = normalized.order;
+
+    if (order_ < 0) {
+        throw BasisConfigurationException("LagrangeBasis requires non-negative polynomial order",
+                                          __FILE__, __LINE__, __func__);
+    }
+
+    dimension_ = lagrange_topology_traits(element_type_).dimension;
+
+    init_nodes();
+    init_evaluation_dispatch();
+}
+
+void LagrangeBasis::init_nodes() {
+    nodes_.clear();
+    nodes_1d_.clear();
+    tensor_indices_.clear();
+    simplex_exponents_.clear();
+    wedge_indices_.clear();
+    wedge_node_by_tri_z_.clear();
+    axis_v_coeffs_.clear();
+    axis_d_coeffs_.clear();
+    axis_d2_coeffs_.clear();
+    axis_barycentric_weights_.clear();
+    const auto topology = lagrange_topology_traits(element_type_).topology;
+    topology_id_ = static_cast<int>(topology);
+    switch (topology) {
+        case LagrangeTopology::Point:
+            build_point_nodes();
+            return;
+        case LagrangeTopology::Line:
+            build_tensor_product_nodes(1);
+            compute_axis_monomial_coefficients();
+            return;
+        case LagrangeTopology::Quadrilateral:
+            build_tensor_product_nodes(2);
+            compute_axis_monomial_coefficients();
+            return;
+        case LagrangeTopology::Hexahedron:
+            build_tensor_product_nodes(3);
+            compute_axis_monomial_coefficients();
+            return;
+        case LagrangeTopology::Triangle:
+        case LagrangeTopology::Tetrahedron:
+            build_simplex_nodes();
+            return;
+        case LagrangeTopology::Wedge:
+            build_wedge_nodes();
+            compute_axis_monomial_coefficients();
+            return;
+        case LagrangeTopology::Pyramid:
+            build_pyramid_nodes();
+            return;
+        case LagrangeTopology::Unknown:
+            break;
+    }
+
+    throw BasisElementCompatibilityException("Unsupported element type in LagrangeBasis::init_nodes",
+                                             __FILE__, __LINE__, __func__);
+}
+
+void LagrangeBasis::init_evaluation_dispatch() {
+    const auto topology = static_cast<LagrangeTopology>(topology_id_);
+    switch (topology) {
+        case LagrangeTopology::Point:
+            vector_evaluation_dispatch_ = &LagrangeBasis::evaluate_point_vectors;
+            return;
+        case LagrangeTopology::Line:
+        case LagrangeTopology::Quadrilateral:
+        case LagrangeTopology::Hexahedron:
+            vector_evaluation_dispatch_ = &LagrangeBasis::evaluate_tensor_product_vectors;
+            return;
+        case LagrangeTopology::Triangle:
+            vector_evaluation_dispatch_ = &LagrangeBasis::evaluate_triangle_vectors;
+            return;
+        case LagrangeTopology::Tetrahedron:
+            vector_evaluation_dispatch_ = &LagrangeBasis::evaluate_tetrahedron_vectors;
+            return;
+        case LagrangeTopology::Wedge:
+            vector_evaluation_dispatch_ = &LagrangeBasis::evaluate_wedge_vectors;
+            return;
+        case LagrangeTopology::Pyramid:
+            vector_evaluation_dispatch_ = &LagrangeBasis::evaluate_pyramid_vectors;
+            return;
+        case LagrangeTopology::Unknown:
+            break;
+    }
+    vector_evaluation_dispatch_ = &LagrangeBasis::evaluate_unsupported_vectors;
+}
+
+void LagrangeBasis::compute_axis_monomial_coefficients() {
+    const int N = static_cast<int>(nodes_1d_.size());
+    if (N == 0) return;
+
+    axis_barycentric_weights_.resize(static_cast<std::size_t>(N));
+    fill_equispaced_barycentric_weights(N, axis_barycentric_weights_.data());
+
+    if (assign_precomputed_axis_coefficients(N, axis_v_coeffs_, axis_d_coeffs_, axis_d2_coeffs_)) {
+        return;
+    }
+
+    axis_v_coeffs_.assign(static_cast<std::size_t>(N) * static_cast<std::size_t>(N), Real(0));
+    if (N >= 2) {
+        axis_d_coeffs_.assign(static_cast<std::size_t>(N) * static_cast<std::size_t>(N - 1), Real(0));
+    }
+    if (N >= 3) {
+        axis_d2_coeffs_.assign(static_cast<std::size_t>(N) * static_cast<std::size_t>(N - 2), Real(0));
+    }
+
+    if (N == 1) {
+        axis_v_coeffs_[0] = Real(1);
+        return;
+    }
+
+    // For each L_i, compute monomial coefficients of P_i(x) = prod_{j != i} (x - x_j),
+    // then divide by w_i = prod_{j != i} (x_i - x_j).
+    std::vector<Real> coeffs;
+    coeffs.reserve(static_cast<std::size_t>(N));
+    for (int i = 0; i < N; ++i) {
+        coeffs.assign(1, Real(1));  // start with constant polynomial 1
+        for (int j = 0; j < N; ++j) {
+            if (j == i) continue;
+            // Multiply (x - x_j) into coeffs (in-place via temp).
+            std::vector<Real> next(coeffs.size() + 1, Real(0));
+            for (std::size_t k = 0; k < coeffs.size(); ++k) {
+                next[k]     -= nodes_1d_[static_cast<std::size_t>(j)] * coeffs[k];
+                next[k + 1] += coeffs[k];
+            }
+            coeffs.swap(next);
+        }
+        // Divide by w_i.
+        Real denom = Real(1);
+        for (int j = 0; j < N; ++j) {
+            if (j == i) continue;
+            denom *= (nodes_1d_[static_cast<std::size_t>(i)] - nodes_1d_[static_cast<std::size_t>(j)]);
+        }
+        const Real inv_denom = Real(1) / denom;
+        for (int k = 0; k < N; ++k) {
+            axis_v_coeffs_[static_cast<std::size_t>(i) * static_cast<std::size_t>(N) + static_cast<std::size_t>(k)]
+                = coeffs[static_cast<std::size_t>(k)] * inv_denom;
+        }
+
+        // First derivative coefficients: d/dx (sum_k c_ik * x^k) = sum_{k>=1} k*c_ik * x^(k-1).
+        if (N >= 2) {
+            for (int k = 1; k < N; ++k) {
+                axis_d_coeffs_[static_cast<std::size_t>(i) * static_cast<std::size_t>(N - 1)
+                              + static_cast<std::size_t>(k - 1)]
+                    = static_cast<Real>(k)
+                      * axis_v_coeffs_[static_cast<std::size_t>(i) * static_cast<std::size_t>(N)
+                                       + static_cast<std::size_t>(k)];
+            }
+        }
+
+        // Second derivative coefficients: d^2/dx^2 = sum_{k>=2} k*(k-1)*c_ik * x^(k-2).
+        if (N >= 3) {
+            for (int k = 2; k < N; ++k) {
+                axis_d2_coeffs_[static_cast<std::size_t>(i) * static_cast<std::size_t>(N - 2)
+                              + static_cast<std::size_t>(k - 2)]
+                    = static_cast<Real>(k * (k - 1))
+                      * axis_v_coeffs_[static_cast<std::size_t>(i) * static_cast<std::size_t>(N)
+                                       + static_cast<std::size_t>(k)];
+            }
+        }
+    }
+}
+
+void LagrangeBasis::build_point_nodes() {
+    nodes_.push_back(math::Vector<Real, 3>{Real(0), Real(0), Real(0)});
+}
+
+void LagrangeBasis::init_equispaced_1d_nodes() {
+    nodes_1d_.clear();
+    for (int i = 0; i <= std::max(order_, 0); ++i) {
+        nodes_1d_.push_back(detail::equispaced_pm_one_coord(i, order_));
+    }
+}
+
+void LagrangeBasis::build_tensor_product_nodes(int dimensions) {
+    init_equispaced_1d_nodes();
+
+    if (dimensions < 1 || dimensions > 3) {
+        throw BasisConfigurationException("LagrangeBasis::build_tensor_product_nodes requires dimension 1, 2, or 3",
+                                          __FILE__, __LINE__, __func__);
+    }
+
+    nodes_ = ReferenceNodeLayout::get_lagrange_node_coords(element_type_, order_);
+    tensor_indices_.resize(nodes_.size(), TensorNodeIndex{0u, 0u, 0u});
+    for (std::size_t n = 0; n < nodes_.size(); ++n) {
+        tensor_indices_[n][0] = lattice_index_pm_one(
+            nodes_[n][0], order_,
+            "LagrangeBasis: invalid tensor-product x-coordinate in public node ordering");
+        if (dimensions >= 2) {
+            tensor_indices_[n][1] = lattice_index_pm_one(
+                nodes_[n][1], order_,
+                "LagrangeBasis: invalid tensor-product y-coordinate in public node ordering");
+        }
+        if (dimensions == 3) {
+            tensor_indices_[n][2] = lattice_index_pm_one(
+                nodes_[n][2], order_,
+                "LagrangeBasis: invalid tensor-product z-coordinate in public node ordering");
+        }
+    }
+}
+
+void LagrangeBasis::build_simplex_nodes() {
+    nodes_ = ReferenceNodeLayout::get_lagrange_node_coords(element_type_, order_);
+    const auto topology = static_cast<LagrangeTopology>(topology_id_);
+    simplex_exponents_.clear();
+    simplex_exponents_.reserve(nodes_.size());
+    for (const auto& node : nodes_) {
+        switch (topology) {
+            case LagrangeTopology::Triangle:
+                simplex_exponents_.push_back(triangle_exponents_from_public_node(node, order_));
+                break;
+            case LagrangeTopology::Tetrahedron:
+                simplex_exponents_.push_back(tetrahedron_exponents_from_public_node(node, order_));
+                break;
+            default:
+                throw BasisElementCompatibilityException("LagrangeBasis::build_simplex_nodes requires simplex topology",
+                                                         __FILE__, __LINE__, __func__);
+        }
+    }
+}
+
+void LagrangeBasis::build_wedge_nodes() {
+    init_equispaced_1d_nodes();
+    const auto triangle_nodes = ReferenceNodeLayout::get_lagrange_node_coords(ElementType::Triangle3, order_);
+    simplex_exponents_.clear();
+    simplex_exponents_.reserve(triangle_nodes.size());
+    std::unordered_map<std::array<int, 4>, std::size_t, SimplexExponentHash> triangle_index_by_exponent;
+    triangle_index_by_exponent.reserve(triangle_nodes.size());
+    for (std::size_t tri = 0; tri < triangle_nodes.size(); ++tri) {
+        const auto exponents = triangle_exponents_from_public_node(triangle_nodes[tri], order_);
+        simplex_exponents_.push_back(exponents);
+        const auto inserted = triangle_index_by_exponent.emplace(exponents, tri);
+        if (!inserted.second) {
+            throw BasisNodeOrderingException("LagrangeBasis: duplicate wedge triangle descriptor",
+                                             __FILE__, __LINE__, __func__);
+        }
+    }
+
+    nodes_ = ReferenceNodeLayout::get_lagrange_node_coords(element_type_, order_);
+    wedge_indices_.clear();
+    wedge_indices_.reserve(nodes_.size());
+    const std::size_t z_count = static_cast<std::size_t>(order_ + 1);
+    const std::size_t missing_node = nodes_.size();
+    wedge_node_by_tri_z_.assign(triangle_nodes.size() * z_count, missing_node);
+    for (std::size_t node_index = 0; node_index < nodes_.size(); ++node_index) {
+        const auto& node = nodes_[node_index];
+        const auto exponents = triangle_exponents_from_public_node(node, order_);
+        const auto found = triangle_index_by_exponent.find(exponents);
+        if (found == triangle_index_by_exponent.end()) {
+            throw BasisNodeOrderingException("LagrangeBasis: failed to resolve wedge triangle descriptor in public ordering",
+                                             __FILE__, __LINE__, __func__);
+        }
+        const std::size_t tri = found->second;
+        const std::size_t z =
+            lattice_index_pm_one(node[2], order_,
+                                 "LagrangeBasis: invalid wedge z-coordinate in public node ordering");
+        wedge_indices_.push_back(WedgeNodeIndex{tri, z});
+        wedge_node_by_tri_z_[tri * z_count + z] = node_index;
+    }
+    for (std::size_t entry = 0; entry < wedge_node_by_tri_z_.size(); ++entry) {
+        if (wedge_node_by_tri_z_[entry] == missing_node) {
+            throw BasisNodeOrderingException("LagrangeBasis: incomplete wedge tensor-product node map",
+                                             __FILE__, __LINE__, __func__);
+        }
+    }
+}
+
+void LagrangeBasis::build_pyramid_nodes() {
+    nodes_ = detail::lagrange_pyramid::nodes(order_);
+}
+
+void LagrangeBasis::evaluate_point_vectors(const math::Vector<Real, 3>&,
+                                           std::vector<Real>* values,
+                                           std::vector<Gradient>* gradients,
+                                           std::vector<Hessian>* hessians) const {
+    if (values != nullptr) {
+        values->resize(1u);
+        (*values)[0] = Real(1);
+    }
+    if (gradients != nullptr) {
+        gradients->resize(1u);
+        (*gradients)[0] = Gradient{};
+    }
+    if (hessians != nullptr) {
+        hessians->resize(1u);
+        (*hessians)[0] = Hessian{};
+    }
+}
+
+void LagrangeBasis::evaluate_tensor_product_vectors(const math::Vector<Real, 3>& xi,
+                                                    std::vector<Real>* values,
+                                                    std::vector<Gradient>* gradients,
+                                                    std::vector<Hessian>* hessians) const {
+    const auto topology = static_cast<LagrangeTopology>(topology_id_);
+    if (evaluate_fixed_lagrange_fast(topology, order_, xi, values, gradients, hessians)) {
+        return;
+    }
+
+    const int n_axis = static_cast<int>(nodes_1d_.size());
+    const Real* vc = axis_v_coeffs_.data();
+    const Real* dc = axis_d_coeffs_.data();
+    const Real* d2c = axis_d2_coeffs_.data();
+    const Real* bw = axis_barycentric_weights_.data();
+    const AxisDeriv level = hessians != nullptr ? AxisDeriv::ValuesAndFirstAndSecond
+                           : gradients != nullptr ? AxisDeriv::ValuesAndFirst
+                                                  : AxisDeriv::ValuesOnly;
+
+    LagrangeEvaluateScratch& scratch = evaluate_scratch();
+    const AxisBasisEvaluations x_axis =
+        fill_axis_scratch(scratch.axis_x, vc, dc, d2c, bw, n_axis, xi[0], level);
+    AxisBasisEvaluations y_axis = constant_axis_basis();
+    AxisBasisEvaluations z_axis = constant_axis_basis();
+
+    if (topology != LagrangeTopology::Line) {
+        y_axis = fill_axis_scratch(scratch.axis_y, vc, dc, d2c, bw, n_axis, xi[1], level);
+    }
+    if (topology == LagrangeTopology::Hexahedron) {
+        z_axis = fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], level);
+    }
+
+    evaluate_tensor_product_factorized(tensor_indices_, x_axis, y_axis, z_axis,
+                                       values, gradients, hessians);
+}
+
+void LagrangeBasis::evaluate_triangle_vectors(const math::Vector<Real, 3>& xi,
+                                              std::vector<Real>* values,
+                                              std::vector<Gradient>* gradients,
+                                              std::vector<Hessian>* hessians) const {
+    const auto topology = static_cast<LagrangeTopology>(topology_id_);
+    if (evaluate_fixed_lagrange_fast(topology, order_, xi, values, gradients, hessians)) {
+        return;
+    }
+    detail::evaluate_triangle_simplex_basis(simplex_exponents_, order_, xi,
+                                            values, gradients, hessians);
+}
+
+void LagrangeBasis::evaluate_tetrahedron_vectors(const math::Vector<Real, 3>& xi,
+                                                 std::vector<Real>* values,
+                                                 std::vector<Gradient>* gradients,
+                                                 std::vector<Hessian>* hessians) const {
+    const auto topology = static_cast<LagrangeTopology>(topology_id_);
+    if (evaluate_fixed_lagrange_fast(topology, order_, xi, values, gradients, hessians)) {
+        return;
+    }
+    detail::evaluate_tetrahedron_simplex_basis(simplex_exponents_, order_, xi,
+                                               values, gradients, hessians);
+}
+
+void LagrangeBasis::evaluate_wedge_vectors(const math::Vector<Real, 3>& xi,
+                                           std::vector<Real>* values,
+                                           std::vector<Gradient>* gradients,
+                                           std::vector<Hessian>* hessians) const {
+    const int n_axis = static_cast<int>(nodes_1d_.size());
+    const Real* vc = axis_v_coeffs_.data();
+    const Real* dc = axis_d_coeffs_.data();
+    const Real* d2c = axis_d2_coeffs_.data();
+    const Real* bw = axis_barycentric_weights_.data();
+    const AxisDeriv level = hessians != nullptr ? AxisDeriv::ValuesAndFirstAndSecond
+                           : gradients != nullptr ? AxisDeriv::ValuesAndFirst
+                                                  : AxisDeriv::ValuesOnly;
+
+    LagrangeEvaluateScratch& scratch = evaluate_scratch();
+    const AxisBasisEvaluations z_axis =
+        fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], level);
+
+    if (hessians != nullptr) {
+        detail::evaluate_triangle_simplex_basis(
+            simplex_exponents_, order_, xi,
+            &scratch.tri_values, &scratch.tri_gradients, &scratch.tri_hessians);
+    } else if (gradients != nullptr) {
+        detail::evaluate_triangle_simplex_basis(
+            simplex_exponents_, order_, xi,
+            &scratch.tri_values, &scratch.tri_gradients, nullptr);
+    } else {
+        detail::evaluate_triangle_simplex_basis(
+            simplex_exponents_, order_, xi,
+            &scratch.tri_values, nullptr, nullptr);
+    }
+
+    const std::size_t n_nodes = wedge_indices_.size();
+    if (values != nullptr) {
+        values->resize(n_nodes);
+    }
+    if (gradients != nullptr) {
+        gradients->resize(n_nodes);
+    }
+    if (hessians != nullptr) {
+        hessians->resize(n_nodes);
+    }
+
+    for (std::size_t n = 0; n < n_nodes; ++n) {
+        const auto& index = wedge_indices_[n];
+        const std::size_t tri_idx = index[0];
+        const std::size_t z_idx = index[1];
+        const Real zv = z_axis.values[z_idx];
+        const Real tri_v = scratch.tri_values[tri_idx];
+
+        if (values != nullptr) {
+            (*values)[n] = tri_v * zv;
+        }
+        if (gradients != nullptr) {
+            const Real zd = z_axis.first[z_idx];
+            (*gradients)[n][0] = scratch.tri_gradients[tri_idx][0] * zv;
+            (*gradients)[n][1] = scratch.tri_gradients[tri_idx][1] * zv;
+            (*gradients)[n][2] = tri_v * zd;
+        }
+        if (hessians != nullptr) {
+            const Real zd = z_axis.first[z_idx];
+            const Real zd2 = z_axis.second[z_idx];
+            Hessian H{};
+            H(0, 0) = scratch.tri_hessians[tri_idx](0, 0) * zv;
+            H(1, 1) = scratch.tri_hessians[tri_idx](1, 1) * zv;
+            H(0, 1) = scratch.tri_hessians[tri_idx](0, 1) * zv;
+            H(1, 0) = H(0, 1);
+            H(2, 2) = tri_v * zd2;
+            H(0, 2) = scratch.tri_gradients[tri_idx][0] * zd;
+            H(2, 0) = H(0, 2);
+            H(1, 2) = scratch.tri_gradients[tri_idx][1] * zd;
+            H(2, 1) = H(1, 2);
+            (*hessians)[n] = H;
+        }
+    }
+}
+
+void LagrangeBasis::evaluate_pyramid_vectors(const math::Vector<Real, 3>& xi,
+                                             std::vector<Real>* values,
+                                             std::vector<Gradient>* gradients,
+                                             std::vector<Hessian>* hessians) const {
+    if (values != nullptr && gradients != nullptr && hessians != nullptr) {
+        detail::lagrange_pyramid::evaluate_all(order_, xi, *values, *gradients, *hessians);
+        return;
+    }
+    if (values != nullptr) {
+        detail::lagrange_pyramid::evaluate_values(order_, xi, *values);
+    }
+    if (gradients != nullptr) {
+        detail::lagrange_pyramid::evaluate_gradients(order_, xi, *gradients);
+    }
+    if (hessians != nullptr) {
+        detail::lagrange_pyramid::evaluate_hessians(order_, xi, *hessians);
+    }
+}
+
+void LagrangeBasis::evaluate_unsupported_vectors(const math::Vector<Real, 3>&,
+                                                 std::vector<Real>*,
+                                                 std::vector<Gradient>*,
+                                                 std::vector<Hessian>*) const {
+    throw BasisEvaluationException("Unsupported element in LagrangeBasis vector evaluation",
+                                   __FILE__, __LINE__, __func__);
+}
+
+void LagrangeBasis::evaluate_values(const math::Vector<Real, 3>& xi,
+                                    std::vector<Real>& values) const {
+    (this->*vector_evaluation_dispatch_)(xi, &values, nullptr, nullptr);
+}
+
+void LagrangeBasis::evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                       std::vector<Gradient>& gradients) const {
+    (this->*vector_evaluation_dispatch_)(xi, nullptr, &gradients, nullptr);
+}
+
+void LagrangeBasis::evaluate_hessians(const math::Vector<Real, 3>& xi,
+                                      std::vector<Hessian>& hessians) const {
+    (this->*vector_evaluation_dispatch_)(xi, nullptr, nullptr, &hessians);
+}
+
+void LagrangeBasis::evaluate_all(const math::Vector<Real, 3>& xi,
+                                 std::vector<Real>& values,
+                                 std::vector<Gradient>& gradients,
+                                 std::vector<Hessian>& hessians) const {
+    (this->*vector_evaluation_dispatch_)(xi, &values, &gradients, &hessians);
+}
+
+void LagrangeBasis::evaluate_values_to(const math::Vector<Real, 3>& xi,
+                                       Real* SVMP_RESTRICT values_out) const {
+    const auto topology = static_cast<LagrangeTopology>(topology_id_);
+    if (evaluate_fixed_lagrange_fast_to(topology, order_, xi, values_out, nullptr, nullptr)) {
+        return;
+    }
+
+    const int n_axis = static_cast<int>(nodes_1d_.size());
+    const Real* vc = axis_v_coeffs_.data();
+    const Real* dc = axis_d_coeffs_.data();
+    const Real* d2c = axis_d2_coeffs_.data();
+    const Real* bw = axis_barycentric_weights_.data();
+    switch (topology) {
+        case LagrangeTopology::Point:
+            values_out[0] = Real(1);
+            return;
+        case LagrangeTopology::Line:
+        case LagrangeTopology::Quadrilateral:
+        case LagrangeTopology::Hexahedron: {
+            LagrangeEvaluateScratch& scratch = evaluate_scratch();
+            const AxisBasisEvaluations x_axis =
+                fill_axis_scratch(scratch.axis_x, vc, dc, d2c, bw, n_axis, xi[0], AxisDeriv::ValuesOnly);
+            AxisBasisEvaluations y_axis = constant_axis_basis();
+            AxisBasisEvaluations z_axis = constant_axis_basis();
+            if (topology != LagrangeTopology::Line) {
+                y_axis = fill_axis_scratch(scratch.axis_y, vc, dc, d2c, bw, n_axis, xi[1], AxisDeriv::ValuesOnly);
+            }
+            if (topology == LagrangeTopology::Hexahedron) {
+                z_axis = fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], AxisDeriv::ValuesOnly);
+            }
+            evaluate_tensor_product_factorized_to(tensor_indices_, x_axis, y_axis, z_axis,
+                                                  values_out, nullptr, nullptr);
+            return;
+        }
+        case LagrangeTopology::Triangle:
+            detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
+                                                       values_out, nullptr, nullptr);
+            return;
+        case LagrangeTopology::Tetrahedron:
+            detail::evaluate_tetrahedron_simplex_basis_to(simplex_exponents_, order_, xi,
+                                                          values_out, nullptr, nullptr);
+            return;
+        case LagrangeTopology::Wedge: {
+            LagrangeEvaluateScratch& scratch = evaluate_scratch();
+            const AxisBasisEvaluations z_axis =
+                fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], AxisDeriv::ValuesOnly);
+            scratch.tri_values.resize(simplex_exponents_.size());
+            detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
+                                                       scratch.tri_values.data(), nullptr, nullptr);
+            for (std::size_t n = 0; n < wedge_indices_.size(); ++n) {
+                const auto& index = wedge_indices_[n];
+                values_out[n] = scratch.tri_values[index[0]] * z_axis.values[index[1]];
+            }
+            return;
+        }
+        case LagrangeTopology::Pyramid: {
+            detail::lagrange_pyramid::evaluate_values_to(order_, xi, values_out);
+            return;
+        }
+        case LagrangeTopology::Unknown:
+            break;
+    }
+
+    throw BasisEvaluationException("Unsupported element in evaluate_values_to",
+                                   __FILE__, __LINE__, __func__);
+}
+
+void LagrangeBasis::evaluate_gradients_to(const math::Vector<Real, 3>& xi,
+                                          Real* SVMP_RESTRICT gradients_out) const {
+    const auto topology = static_cast<LagrangeTopology>(topology_id_);
+    if (evaluate_fixed_lagrange_fast_to(topology, order_, xi, nullptr, gradients_out, nullptr)) {
+        return;
+    }
+
+    const int n_axis = static_cast<int>(nodes_1d_.size());
+    const Real* vc = axis_v_coeffs_.data();
+    const Real* dc = axis_d_coeffs_.data();
+    const Real* d2c = axis_d2_coeffs_.data();
+    const Real* bw = axis_barycentric_weights_.data();
+    switch (topology) {
+        case LagrangeTopology::Point:
+            gradients_out[0] = Real(0);
+            gradients_out[1] = Real(0);
+            gradients_out[2] = Real(0);
+            return;
+        case LagrangeTopology::Line:
+        case LagrangeTopology::Quadrilateral:
+        case LagrangeTopology::Hexahedron: {
+            LagrangeEvaluateScratch& scratch = evaluate_scratch();
+            const AxisBasisEvaluations x_axis =
+                fill_axis_scratch(scratch.axis_x, vc, dc, d2c, bw, n_axis, xi[0], AxisDeriv::ValuesAndFirst);
+            AxisBasisEvaluations y_axis = constant_axis_basis();
+            AxisBasisEvaluations z_axis = constant_axis_basis();
+            if (topology != LagrangeTopology::Line) {
+                y_axis = fill_axis_scratch(scratch.axis_y, vc, dc, d2c, bw, n_axis, xi[1], AxisDeriv::ValuesAndFirst);
+            }
+            if (topology == LagrangeTopology::Hexahedron) {
+                z_axis = fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], AxisDeriv::ValuesAndFirst);
+            }
+            evaluate_tensor_product_factorized_to(tensor_indices_, x_axis, y_axis, z_axis,
+                                                  nullptr, gradients_out, nullptr);
+            return;
+        }
+        case LagrangeTopology::Triangle:
+            detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
+                                                       nullptr, gradients_out, nullptr);
+            return;
+        case LagrangeTopology::Tetrahedron:
+            detail::evaluate_tetrahedron_simplex_basis_to(simplex_exponents_, order_, xi,
+                                                          nullptr, gradients_out, nullptr);
+            return;
+        case LagrangeTopology::Wedge: {
+            LagrangeEvaluateScratch& scratch = evaluate_scratch();
+            const AxisBasisEvaluations z_axis =
+                fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], AxisDeriv::ValuesAndFirst);
+            const std::size_t tri_count = simplex_exponents_.size();
+            scratch.tri_values.resize(tri_count);
+            scratch.tri_gradient_components.resize(tri_count * 3u);
+            detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
+                                                       scratch.tri_values.data(),
+                                                       scratch.tri_gradient_components.data(),
+                                                       nullptr);
+            for (std::size_t n = 0; n < wedge_indices_.size(); ++n) {
+                const auto& index = wedge_indices_[n];
+                const std::size_t tri = index[0];
+                const std::size_t z = index[1];
+                const Real* tri_g = scratch.tri_gradient_components.data() + tri * 3u;
+                Real* g = gradients_out + n * 3u;
+                g[0] = tri_g[0] * z_axis.values[z];
+                g[1] = tri_g[1] * z_axis.values[z];
+                g[2] = scratch.tri_values[tri] * z_axis.first[z];
+            }
+            return;
+        }
+        case LagrangeTopology::Pyramid: {
+            detail::lagrange_pyramid::evaluate_gradients_to(order_, xi, gradients_out);
+            return;
+        }
+        case LagrangeTopology::Unknown:
+            break;
+    }
+
+    throw BasisEvaluationException("Unsupported element in evaluate_gradients_to",
+                                   __FILE__, __LINE__, __func__);
+}
+
+void LagrangeBasis::evaluate_hessians_to(const math::Vector<Real, 3>& xi,
+                                         Real* SVMP_RESTRICT hessians_out) const {
+    const auto topology = static_cast<LagrangeTopology>(topology_id_);
+    if (evaluate_fixed_lagrange_fast_to(topology, order_, xi, nullptr, nullptr, hessians_out)) {
+        return;
+    }
+
+    const int n_axis = static_cast<int>(nodes_1d_.size());
+    const Real* vc = axis_v_coeffs_.data();
+    const Real* dc = axis_d_coeffs_.data();
+    const Real* d2c = axis_d2_coeffs_.data();
+    const Real* bw = axis_barycentric_weights_.data();
+    switch (topology) {
+        case LagrangeTopology::Point:
+            for (std::size_t i = 0; i < 9; ++i) {
+                hessians_out[i] = Real(0);
+            }
+            return;
+        case LagrangeTopology::Line:
+        case LagrangeTopology::Quadrilateral:
+        case LagrangeTopology::Hexahedron: {
+            LagrangeEvaluateScratch& scratch = evaluate_scratch();
+            const AxisBasisEvaluations x_axis =
+                fill_axis_scratch(scratch.axis_x, vc, dc, d2c, bw, n_axis, xi[0], AxisDeriv::ValuesAndFirstAndSecond);
+            AxisBasisEvaluations y_axis = constant_axis_basis();
+            AxisBasisEvaluations z_axis = constant_axis_basis();
+            if (topology != LagrangeTopology::Line) {
+                y_axis = fill_axis_scratch(scratch.axis_y, vc, dc, d2c, bw, n_axis, xi[1], AxisDeriv::ValuesAndFirstAndSecond);
+            }
+            if (topology == LagrangeTopology::Hexahedron) {
+                z_axis = fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], AxisDeriv::ValuesAndFirstAndSecond);
+            }
+            evaluate_tensor_product_factorized_to(tensor_indices_, x_axis, y_axis, z_axis,
+                                                  nullptr, nullptr, hessians_out);
+            return;
+        }
+        case LagrangeTopology::Triangle:
+            detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
+                                                       nullptr, nullptr, hessians_out);
+            return;
+        case LagrangeTopology::Tetrahedron:
+            detail::evaluate_tetrahedron_simplex_basis_to(simplex_exponents_, order_, xi,
+                                                          nullptr, nullptr, hessians_out);
+            return;
+        case LagrangeTopology::Wedge: {
+            LagrangeEvaluateScratch& scratch = evaluate_scratch();
+            const AxisBasisEvaluations z_axis =
+                fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], AxisDeriv::ValuesAndFirstAndSecond);
+            const std::size_t tri_count = simplex_exponents_.size();
+            scratch.tri_values.resize(tri_count);
+            scratch.tri_gradient_components.resize(tri_count * 3u);
+            scratch.tri_hessian_components.resize(tri_count * 9u);
+            detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
+                                                       scratch.tri_values.data(),
+                                                       scratch.tri_gradient_components.data(),
+                                                       scratch.tri_hessian_components.data());
+            for (std::size_t n = 0; n < wedge_indices_.size(); ++n) {
+                const auto& index = wedge_indices_[n];
+                const std::size_t tri = index[0];
+                const std::size_t z = index[1];
+                const Real zv = z_axis.values[z];
+                const Real zd = z_axis.first[z];
+                const Real zd2 = z_axis.second[z];
+                const Real* tri_g = scratch.tri_gradient_components.data() + tri * 3u;
+                const Real* tri_H = scratch.tri_hessian_components.data() + tri * 9u;
+                const Real hxy = tri_H[1] * zv;
+                const Real hxz = tri_g[0] * zd;
+                const Real hyz = tri_g[1] * zd;
+                Real* H = hessians_out + n * 9u;
+                H[0] = tri_H[0] * zv;
+                H[4] = tri_H[4] * zv;
+                H[1] = hxy;
+                H[3] = hxy;
+                H[8] = scratch.tri_values[tri] * zd2;
+                H[2] = hxz;
+                H[6] = hxz;
+                H[5] = hyz;
+                H[7] = hyz;
+            }
+            return;
+        }
+        case LagrangeTopology::Pyramid: {
+            detail::lagrange_pyramid::evaluate_hessians_to(order_, xi, hessians_out);
+            return;
+        }
+        case LagrangeTopology::Unknown:
+            break;
+    }
+
+    throw BasisEvaluationException("Unsupported element in evaluate_hessians_to",
+                                   __FILE__, __LINE__, __func__);
+}
+
+void LagrangeBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,
+                                    Real* SVMP_RESTRICT values_out,
+                                    Real* SVMP_RESTRICT gradients_out,
+                                    Real* SVMP_RESTRICT hessians_out) const {
+    const auto topology = static_cast<LagrangeTopology>(topology_id_);
+    if (evaluate_fixed_lagrange_fast_to(topology, order_, xi, values_out, gradients_out, hessians_out)) {
+        return;
+    }
+
+    const int n_axis = static_cast<int>(nodes_1d_.size());
+    const Real* vc = axis_v_coeffs_.data();
+    const Real* dc = axis_d_coeffs_.data();
+    const Real* d2c = axis_d2_coeffs_.data();
+    const Real* bw = axis_barycentric_weights_.data();
+    switch (topology) {
+        case LagrangeTopology::Point:
+            values_out[0] = Real(1);
+            gradients_out[0] = Real(0);
+            gradients_out[1] = Real(0);
+            gradients_out[2] = Real(0);
+            for (std::size_t i = 0; i < 9; ++i) {
+                hessians_out[i] = Real(0);
+            }
+            return;
+        case LagrangeTopology::Line:
+        case LagrangeTopology::Quadrilateral:
+        case LagrangeTopology::Hexahedron: {
+            LagrangeEvaluateScratch& scratch = evaluate_scratch();
+            const AxisBasisEvaluations x_axis =
+                fill_axis_scratch(scratch.axis_x, vc, dc, d2c, bw, n_axis, xi[0], AxisDeriv::ValuesAndFirstAndSecond);
+            AxisBasisEvaluations y_axis = constant_axis_basis();
+            AxisBasisEvaluations z_axis = constant_axis_basis();
+            if (topology != LagrangeTopology::Line) {
+                y_axis = fill_axis_scratch(scratch.axis_y, vc, dc, d2c, bw, n_axis, xi[1], AxisDeriv::ValuesAndFirstAndSecond);
+            }
+            if (topology == LagrangeTopology::Hexahedron) {
+                z_axis = fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], AxisDeriv::ValuesAndFirstAndSecond);
+            }
+            evaluate_tensor_product_factorized_to(tensor_indices_, x_axis, y_axis, z_axis,
+                                                  values_out, gradients_out, hessians_out);
+            return;
+        }
+        case LagrangeTopology::Triangle:
+            detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
+                                                       values_out, gradients_out, hessians_out);
+            return;
+        case LagrangeTopology::Tetrahedron:
+            detail::evaluate_tetrahedron_simplex_basis_to(simplex_exponents_, order_, xi,
+                                                          values_out, gradients_out, hessians_out);
+            return;
+        case LagrangeTopology::Wedge: {
+            LagrangeEvaluateScratch& scratch = evaluate_scratch();
+            const AxisBasisEvaluations z_axis =
+                fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], AxisDeriv::ValuesAndFirstAndSecond);
+            const std::size_t tri_count = simplex_exponents_.size();
+            scratch.tri_values.resize(tri_count);
+            scratch.tri_gradient_components.resize(tri_count * 3u);
+            scratch.tri_hessian_components.resize(tri_count * 9u);
+            detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
+                                                       scratch.tri_values.data(),
+                                                       scratch.tri_gradient_components.data(),
+                                                       scratch.tri_hessian_components.data());
+            for (std::size_t n = 0; n < wedge_indices_.size(); ++n) {
+                const auto& index = wedge_indices_[n];
+                const std::size_t tri = index[0];
+                const std::size_t z = index[1];
+                const Real zv = z_axis.values[z];
+                const Real zd = z_axis.first[z];
+                const Real zd2 = z_axis.second[z];
+                const Real tri_v = scratch.tri_values[tri];
+                const Real* tri_g = scratch.tri_gradient_components.data() + tri * 3u;
+                const Real* tri_H = scratch.tri_hessian_components.data() + tri * 9u;
+                const Real hxy = tri_H[1] * zv;
+                const Real hxz = tri_g[0] * zd;
+                const Real hyz = tri_g[1] * zd;
+
+                values_out[n] = tri_v * zv;
+
+                Real* g = gradients_out + n * 3u;
+                g[0] = tri_g[0] * zv;
+                g[1] = tri_g[1] * zv;
+                g[2] = tri_v * zd;
+
+                Real* H = hessians_out + n * 9u;
+                H[0] = tri_H[0] * zv;
+                H[4] = tri_H[4] * zv;
+                H[1] = hxy;
+                H[3] = hxy;
+                H[8] = tri_v * zd2;
+                H[2] = hxz;
+                H[6] = hxz;
+                H[5] = hyz;
+                H[7] = hyz;
+            }
+            return;
+        }
+        case LagrangeTopology::Pyramid: {
+            detail::lagrange_pyramid::evaluate_all_to(
+                order_, xi, values_out, gradients_out, hessians_out);
+            return;
+        }
+        case LagrangeTopology::Unknown:
+            break;
+    }
+
+    throw BasisEvaluationException("Unsupported element in evaluate_all_to",
+                                   __FILE__, __LINE__, __func__);
+}
+
+void LagrangeBasis::evaluate_at_quadrature_points(
+    const std::vector<math::Vector<Real, 3>>& points,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) const {
+    evaluate_at_quadrature_points_strided(points, points.size(), values_out, gradients_out, hessians_out);
+}
+
+void LagrangeBasis::evaluate_at_quadrature_points_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) const {
+    const std::size_t num_qpts = points.size();
+    const std::size_t num_dofs = size();
+    if (output_stride < num_qpts) {
+        throw BasisConfigurationException("LagrangeBasis strided evaluation requires output_stride >= points.size()",
+                                          __FILE__, __LINE__, __func__);
+    }
+    if (values_out == nullptr && gradients_out == nullptr && hessians_out == nullptr) {
+        return;
+    }
+
+    const auto topology = static_cast<LagrangeTopology>(topology_id_);
+    if (evaluate_fixed_lagrange_fast_strided(topology,
+                                             order_,
+                                             points,
+                                             output_stride,
+                                             values_out,
+                                             gradients_out,
+                                             hessians_out)) {
+        return;
+    }
+
+    if (topology == LagrangeTopology::Line ||
+        topology == LagrangeTopology::Quadrilateral ||
+        topology == LagrangeTopology::Hexahedron) {
+        evaluate_tensor_product_points_strided(topology,
+                                               tensor_indices_,
+                                               points,
+                                               output_stride,
+                                               axis_v_coeffs_.data(),
+                                               axis_d_coeffs_.data(),
+                                               axis_d2_coeffs_.data(),
+                                               axis_barycentric_weights_.data(),
+                                               static_cast<int>(nodes_1d_.size()),
+                                               values_out,
+                                               gradients_out,
+                                               hessians_out);
+        return;
+    }
+
+    if (topology == LagrangeTopology::Triangle) {
+        detail::evaluate_triangle_simplex_basis_strided(
+            simplex_exponents_, order_, points, output_stride, values_out, gradients_out, hessians_out);
+        return;
+    }
+
+    if (topology == LagrangeTopology::Tetrahedron) {
+        detail::evaluate_tetrahedron_simplex_basis_strided(
+            simplex_exponents_, order_, points, output_stride, values_out, gradients_out, hessians_out);
+        return;
+    }
+
+    if (topology == LagrangeTopology::Wedge &&
+        evaluate_wedge_fast_strided(order_,
+                                    wedge_indices_,
+                                    points,
+                                    output_stride,
+                                    values_out,
+                                    gradients_out,
+                                    hessians_out)) {
+        return;
+    }
+
+    const bool wedge_scalar_hessian_fallback =
+        topology == LagrangeTopology::Wedge &&
+        values_out == nullptr &&
+        gradients_out == nullptr &&
+        hessians_out != nullptr &&
+        order_ <= 2;
+    if (topology == LagrangeTopology::Wedge && !wedge_scalar_hessian_fallback) {
+        evaluate_wedge_points_strided(simplex_exponents_,
+                                      wedge_indices_,
+                                      wedge_node_by_tri_z_,
+                                      order_,
+                                      points,
+                                      output_stride,
+                                      axis_v_coeffs_.data(),
+                                      axis_d_coeffs_.data(),
+                                      axis_d2_coeffs_.data(),
+                                      axis_barycentric_weights_.data(),
+                                      static_cast<int>(nodes_1d_.size()),
+                                      values_out,
+                                      gradients_out,
+                                      hessians_out);
+        return;
+    }
+
+    if (topology == LagrangeTopology::Pyramid) {
+        detail::lagrange_pyramid::evaluate_at_quadrature_points_strided(
+            order_, points, output_stride, values_out, gradients_out, hessians_out);
+        return;
+    }
+
+    auto& scratch = evaluate_scratch();
+    auto& v_tmp = scratch.strided_values_tmp;
+    auto& g_tmp = scratch.strided_gradients_tmp;
+    auto& h_tmp = scratch.strided_hessians_tmp;
+
+    if (values_out)    v_tmp.resize(num_dofs);
+    if (gradients_out) g_tmp.resize(num_dofs * 3u);
+    if (hessians_out)  h_tmp.resize(num_dofs * 9u);
+
+    for (std::size_t q = 0; q < num_qpts; ++q) {
+        if (values_out && gradients_out && hessians_out) {
+            evaluate_all_to(points[q], v_tmp.data(), g_tmp.data(), h_tmp.data());
+        } else {
+            if (values_out)    evaluate_values_to(points[q], v_tmp.data());
+            if (gradients_out) evaluate_gradients_to(points[q], g_tmp.data());
+            if (hessians_out)  evaluate_hessians_to(points[q], h_tmp.data());
+        }
+
+        if (values_out) {
+            for (std::size_t d = 0; d < num_dofs; ++d) {
+                values_out[d * output_stride + q] = v_tmp[d];
+            }
+        }
+        if (gradients_out) {
+            for (std::size_t d = 0; d < num_dofs; ++d) {
+                gradients_out[(d * 3u + 0u) * output_stride + q] = g_tmp[d * 3u + 0u];
+                gradients_out[(d * 3u + 1u) * output_stride + q] = g_tmp[d * 3u + 1u];
+                gradients_out[(d * 3u + 2u) * output_stride + q] = g_tmp[d * 3u + 2u];
+            }
+        }
+        if (hessians_out) {
+            for (std::size_t d = 0; d < num_dofs; ++d) {
+                scatter_hessian_components_strided(
+                    h_tmp.data() + d * 9u,
+                    hessians_out + d * 9u * output_stride,
+                    output_stride,
+                    q);
+            }
+        }
+    }
+}
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.h b/Code/Source/solver/FE/Basis/LagrangeBasis.h
new file mode 100644
index 000000000..91f7e379c
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.h
@@ -0,0 +1,175 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_BASIS_LAGRANGEBASIS_H
+#define SVMP_FE_BASIS_LAGRANGEBASIS_H
+
+/**
+ * @file LagrangeBasis.h
+ * @brief Nodal Lagrange polynomial basis on reference elements
+ */
+
+#include "BasisFunction.h"
+#include <array>
+#include <cstddef>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+void prewarm_lagrange_basis_scratch(int max_order, std::size_t max_qpts = 0);
+
+/**
+ * @brief Complete nodal H1 Lagrange basis on canonical reference topologies
+ *
+ * Supports arbitrary polynomial order on the canonical complete families:
+ * `Line2`, `Triangle3`, `Quad4`, `Tetra4`, `Hex8`, `Wedge6`, and `Pyramid5`.
+ * Low-order complete-family aliases (`Line3`, `Triangle6`, `Quad9`,
+ * `Tetra10`, `Hex27`, `Wedge18`, `Pyramid14`) normalize to their canonical
+ * topology plus order. Serendipity variants remain intentionally excluded.
+ *
+ * Node locations are generated on canonical reference elements using
+ * equispaced coordinates on tensor-product elements, barycentric grids on
+ * simplices, tensorized triangle-line grids on wedges, and a rational nodal
+ * pyramid construction on `Pyramid5`.
+ *
+ * The evaluator is numerically stabilized for those nodes, but the
+ * interpolation problem itself remains the equispaced Lagrange problem. For
+ * high-order interpolation, especially order >= 4, prefer `SpectralBasis`
+ * (GLL / Warp & Blend nodes) unless exact equispaced nodal placement is part
+ * of the requested discretization.
+ *
+ * For the rational pyramid family, basis values remain exact at the apex.
+ * Gradients and Hessians are analytic on the supported interior reference
+ * domain, but the exact-apex nodal derivative limit is not unique and those
+ * derivative queries throw at the exact apex.
+ */
+class LagrangeBasis : public BasisFunction {
+public:
+    LagrangeBasis(ElementType type, int order);
+
+    BasisType basis_type() const noexcept override { return BasisType::Lagrange; }
+    ElementType element_type() const noexcept override { return element_type_; }
+    int dimension() const noexcept override { return dimension_; }
+    int order() const noexcept override { return order_; }
+    std::size_t size() const noexcept override { return nodes_.size(); }
+    bool cache_identity_is_structural() const noexcept override { return true; }
+
+    const std::vector<math::Vector<Real, 3>>& nodes() const noexcept { return nodes_; }
+
+    void evaluate_values(const math::Vector<Real, 3>& xi,
+                         std::vector<Real>& values) const final;
+    void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                            std::vector<Gradient>& gradients) const final;
+    void evaluate_hessians(const math::Vector<Real, 3>& xi,
+                           std::vector<Hessian>& hessians) const final;
+    void evaluate_all(const math::Vector<Real, 3>& xi,
+                      std::vector<Real>& values,
+                      std::vector<Gradient>& gradients,
+                      std::vector<Hessian>& hessians) const final;
+
+    void evaluate_at_quadrature_points(
+        const std::vector<math::Vector<Real, 3>>& points,
+        Real* SVMP_RESTRICT values_out,
+        Real* SVMP_RESTRICT gradients_out,
+        Real* SVMP_RESTRICT hessians_out) const final;
+    void evaluate_at_quadrature_points_strided(
+        const std::vector<math::Vector<Real, 3>>& points,
+        std::size_t output_stride,
+        Real* SVMP_RESTRICT values_out,
+        Real* SVMP_RESTRICT gradients_out,
+        Real* SVMP_RESTRICT hessians_out) const final;
+
+    // Raw-pointer output API. Caller must pre-size buffers to size().
+    void evaluate_values_to(const math::Vector<Real, 3>& xi, Real* SVMP_RESTRICT values_out) const final;
+    void evaluate_gradients_to(const math::Vector<Real, 3>& xi, Real* SVMP_RESTRICT gradients_out) const final;
+    void evaluate_hessians_to(const math::Vector<Real, 3>& xi, Real* SVMP_RESTRICT hessians_out) const final;
+
+private:
+    using TensorNodeIndex = std::array<std::size_t, 3>;
+    using WedgeNodeIndex = std::array<std::size_t, 2>;
+    using VectorEvaluationDispatch = void (LagrangeBasis::*)(
+        const math::Vector<Real, 3>&,
+        std::vector<Real>*,
+        std::vector<Gradient>*,
+        std::vector<Hessian>*) const;
+
+    // Cached topology encoded as int because the topology enum lives in
+    // the .cpp anon namespace. Set once in init_nodes.
+    int topology_id_ = 0;
+
+    ElementType element_type_;
+    int dimension_;
+    int order_;
+
+    std::vector<Real> nodes_1d_;
+    std::vector<math::Vector<Real, 3>> nodes_;
+    std::vector<TensorNodeIndex> tensor_indices_;
+    std::vector<std::array<int, 4>> simplex_exponents_;
+    std::vector<WedgeNodeIndex> wedge_indices_;
+    std::vector<std::size_t> wedge_node_by_tri_z_;
+
+    // Precomputed Horner-form coefficients of the 1D Lagrange basis.
+    // Layout per axis (n_axis = nodes_1d_.size() = order_+1):
+    //   axis_v_coeffs_[i * n_axis + k] = coeff of x^k in L_i(x), 0 <= i,k < n_axis
+    //   axis_d_coeffs_[i * (n_axis - 1) + k] = coeff of x^k in L_i'(x)
+    //   axis_d2_coeffs_[i * (n_axis - 2) + k] = coeff of x^k in L_i''(x)  (only if n_axis >= 3)
+    // Populated by build_tensor_product_nodes / build_wedge_nodes.
+    std::vector<Real> axis_v_coeffs_;
+    std::vector<Real> axis_d_coeffs_;
+    std::vector<Real> axis_d2_coeffs_;
+    std::vector<Real> axis_barycentric_weights_;
+    VectorEvaluationDispatch vector_evaluation_dispatch_{nullptr};
+
+    void init_nodes();
+    void init_evaluation_dispatch();
+    void build_point_nodes();
+    void build_tensor_product_nodes(int dimensions);
+    void build_simplex_nodes();
+    void build_wedge_nodes();
+    void build_pyramid_nodes();
+    void init_equispaced_1d_nodes();
+    void compute_axis_monomial_coefficients();
+    void evaluate_point_vectors(const math::Vector<Real, 3>& xi,
+                                std::vector<Real>* values,
+                                std::vector<Gradient>* gradients,
+                                std::vector<Hessian>* hessians) const;
+    void evaluate_tensor_product_vectors(const math::Vector<Real, 3>& xi,
+                                         std::vector<Real>* values,
+                                         std::vector<Gradient>* gradients,
+                                         std::vector<Hessian>* hessians) const;
+    void evaluate_triangle_vectors(const math::Vector<Real, 3>& xi,
+                                   std::vector<Real>* values,
+                                   std::vector<Gradient>* gradients,
+                                   std::vector<Hessian>* hessians) const;
+    void evaluate_tetrahedron_vectors(const math::Vector<Real, 3>& xi,
+                                      std::vector<Real>* values,
+                                      std::vector<Gradient>* gradients,
+                                      std::vector<Hessian>* hessians) const;
+    void evaluate_wedge_vectors(const math::Vector<Real, 3>& xi,
+                                std::vector<Real>* values,
+                                std::vector<Gradient>* gradients,
+                                std::vector<Hessian>* hessians) const;
+    void evaluate_pyramid_vectors(const math::Vector<Real, 3>& xi,
+                                  std::vector<Real>* values,
+                                  std::vector<Gradient>* gradients,
+                                  std::vector<Hessian>* hessians) const;
+    void evaluate_unsupported_vectors(const math::Vector<Real, 3>& xi,
+                                      std::vector<Real>* values,
+                                      std::vector<Gradient>* gradients,
+                                      std::vector<Hessian>* hessians) const;
+    void evaluate_all_to(const math::Vector<Real, 3>& xi,
+                         Real* SVMP_RESTRICT values_out,
+                         Real* SVMP_RESTRICT gradients_out,
+                         Real* SVMP_RESTRICT hessians_out) const;
+};
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_LAGRANGEBASIS_H
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasisFast.h b/Code/Source/solver/FE/Basis/LagrangeBasisFast.h
new file mode 100644
index 000000000..5b9faae04
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/LagrangeBasisFast.h
@@ -0,0 +1,1378 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_BASIS_LAGRANGEBASISFAST_H
+#define SVMP_FE_BASIS_LAGRANGEBASISFAST_H
+
+/**
+ * @file LagrangeBasisFast.h
+ * @brief Header-only zero-overhead specializations of the Lagrange basis
+ *
+ * Provides templated static methods for the common nodal Lagrange families
+ * with compile-time-known polynomial order. Callers that know their basis
+ * type and order at compile time use these directly — there is no virtual
+ * dispatch, no std::vector allocation, no scratch lookup, and no topology
+ * switch. The output buffers are stack-allocated std::array, sized at
+ * compile time. The compiler fully unrolls and constant-folds.
+ *
+ * These specializations are an alternative entry point to the runtime path
+ * provided by `LagrangeBasis`. The runtime path remains the canonical API
+ * for generic callers; these specializations serve hot loops that know the
+ * element type.
+ *
+ * Node orderings match `ReferenceNodeLayout::get_lagrange_node_coords(...)` (VTK).
+ */
+
+#include "Types.h"
+#include "Math/Vector.h"
+#include "Math/Matrix.h"
+#include <array>
+#include <cstddef>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+using Gradient = math::Vector<Real, 3>;
+using Hessian  = math::Matrix<Real, 3, 3>;
+
+namespace detail {
+
+constexpr Gradient scaled_gradient(const Gradient& gradient, Real scale) {
+    return Gradient{scale * gradient[0], scale * gradient[1], scale * gradient[2]};
+}
+
+constexpr Gradient p2_edge_gradient(Real left,
+                                    const Gradient& left_gradient,
+                                    Real right,
+                                    const Gradient& right_gradient) {
+    return Gradient{
+        Real(4) * (left_gradient[0] * right + right_gradient[0] * left),
+        Real(4) * (left_gradient[1] * right + right_gradient[1] * left),
+        Real(4) * (left_gradient[2] * right + right_gradient[2] * left),
+    };
+}
+
+constexpr Hessian p2_vertex_hessian(const Gradient& gradient) {
+    Hessian hessian{};
+    for (std::size_t row = 0; row < 3u; ++row) {
+        for (std::size_t col = 0; col < 3u; ++col) {
+            hessian(row, col) = Real(4) * gradient[row] * gradient[col];
+        }
+    }
+    return hessian;
+}
+
+constexpr Hessian p2_edge_hessian(const Gradient& left_gradient,
+                                  const Gradient& right_gradient) {
+    Hessian hessian{};
+    for (std::size_t row = 0; row < 3u; ++row) {
+        for (std::size_t col = 0; col < 3u; ++col) {
+            hessian(row, col) = Real(4) * (
+                left_gradient[row] * right_gradient[col] +
+                right_gradient[row] * left_gradient[col]);
+        }
+    }
+    return hessian;
+}
+
+constexpr std::size_t public_axis_index(int lattice, int order) noexcept {
+    return lattice == 0 ? 0u :
+           lattice == order ? 1u :
+           static_cast<std::size_t>(lattice + 1);
+}
+
+template<int Order>
+constexpr Real public_axis_coord(std::size_t public_index) noexcept {
+    const int lattice = public_index == 0u ? 0 :
+                        public_index == 1u ? Order :
+                        static_cast<int>(public_index) - 1;
+    return Real(-1) + Real(2) * static_cast<Real>(lattice) / static_cast<Real>(Order);
+}
+
+template<int Order>
+constexpr std::array<Real, Order + 1> make_public_axis_nodes() {
+    std::array<Real, Order + 1> nodes{};
+    for (std::size_t i = 0; i < nodes.size(); ++i) {
+        nodes[i] = public_axis_coord<Order>(i);
+    }
+    return nodes;
+}
+
+template<int Order>
+constexpr std::array<Real, Order + 1> make_public_axis_inverse_denominators() {
+    constexpr auto nodes = make_public_axis_nodes<Order>();
+    std::array<Real, Order + 1> inv_denominators{};
+    for (std::size_t i = 0; i < nodes.size(); ++i) {
+        Real denominator = Real(1);
+        for (std::size_t j = 0; j < nodes.size(); ++j) {
+            if (j != i) {
+                denominator *= nodes[i] - nodes[j];
+            }
+        }
+        inv_denominators[i] = Real(1) / denominator;
+    }
+    return inv_denominators;
+}
+
+template<int Order, bool NeedFirst, bool NeedSecond>
+void fill_axis_lagrange(Real x,
+                        std::array<Real, Order + 1>& values,
+                        std::array<Real, Order + 1>* first,
+                        std::array<Real, Order + 1>* second) {
+    constexpr auto nodes = make_public_axis_nodes<Order>();
+    constexpr auto inv_denominators = make_public_axis_inverse_denominators<Order>();
+    for (std::size_t i = 0; i < nodes.size(); ++i) {
+        Real product = Real(1);
+        for (std::size_t j = 0; j < nodes.size(); ++j) {
+            if (j != i) {
+                product *= x - nodes[j];
+            }
+        }
+        values[i] = product * inv_denominators[i];
+
+        if constexpr (NeedFirst) {
+            Real derivative = Real(0);
+            for (std::size_t m = 0; m < nodes.size(); ++m) {
+                if (m == i) {
+                    continue;
+                }
+                Real term = Real(1);
+                for (std::size_t j = 0; j < nodes.size(); ++j) {
+                    if (j != i && j != m) {
+                        term *= x - nodes[j];
+                    }
+                }
+                derivative += term;
+            }
+            (*first)[i] = derivative * inv_denominators[i];
+        }
+
+        if constexpr (NeedSecond) {
+            Real curvature = Real(0);
+            for (std::size_t m = 0; m < nodes.size(); ++m) {
+                if (m == i) {
+                    continue;
+                }
+                for (std::size_t l = 0; l < nodes.size(); ++l) {
+                    if (l == i || l == m) {
+                        continue;
+                    }
+                    Real term = Real(1);
+                    for (std::size_t j = 0; j < nodes.size(); ++j) {
+                        if (j != i && j != m && j != l) {
+                            term *= x - nodes[j];
+                        }
+                    }
+                    curvature += term;
+                }
+            }
+            (*second)[i] = curvature * inv_denominators[i];
+        }
+    }
+}
+
+template<int Order>
+void fill_axis_values(Real x, std::array<Real, Order + 1>& values) {
+    fill_axis_lagrange<Order, false, false>(x, values, nullptr, nullptr);
+}
+
+template<int Order>
+void fill_axis_values_first(Real x,
+                            std::array<Real, Order + 1>& values,
+                            std::array<Real, Order + 1>& first) {
+    fill_axis_lagrange<Order, true, false>(x, values, &first, nullptr);
+}
+
+template<int Order>
+void fill_axis_values_first_second(Real x,
+                                   std::array<Real, Order + 1>& values,
+                                   std::array<Real, Order + 1>& first,
+                                   std::array<Real, Order + 1>& second) {
+    fill_axis_lagrange<Order, true, true>(x, values, &first, &second);
+}
+
+template<int Order>
+constexpr std::array<std::array<std::size_t, 2>, (Order + 1) * (Order + 1)>
+make_quad_tensor_node_axes() {
+    std::array<std::array<std::size_t, 2>, (Order + 1) * (Order + 1)> axes{};
+    std::size_t n = 0;
+
+    axes[n++] = {{0u, 0u}};
+    axes[n++] = {{1u, 0u}};
+    axes[n++] = {{1u, 1u}};
+    axes[n++] = {{0u, 1u}};
+
+    for (int i = 1; i < Order; ++i) {
+        axes[n++] = {{public_axis_index(i, Order), 0u}};
+    }
+    for (int j = 1; j < Order; ++j) {
+        axes[n++] = {{1u, public_axis_index(j, Order)}};
+    }
+    for (int i = Order - 1; i >= 1; --i) {
+        axes[n++] = {{public_axis_index(i, Order), 1u}};
+    }
+    for (int j = Order - 1; j >= 1; --j) {
+        axes[n++] = {{0u, public_axis_index(j, Order)}};
+    }
+
+    for (int j = 1; j < Order; ++j) {
+        for (int i = 1; i < Order; ++i) {
+            axes[n++] = {{public_axis_index(i, Order), public_axis_index(j, Order)}};
+        }
+    }
+
+    return axes;
+}
+
+template<int Order>
+constexpr std::array<std::array<std::size_t, 3>, (Order + 1) * (Order + 1) * (Order + 1)>
+make_hex_tensor_node_axes() {
+    std::array<std::array<std::size_t, 3>, (Order + 1) * (Order + 1) * (Order + 1)> axes{};
+    std::size_t n = 0;
+
+    axes[n++] = {{0u, 0u, 0u}};
+    axes[n++] = {{1u, 0u, 0u}};
+    axes[n++] = {{1u, 1u, 0u}};
+    axes[n++] = {{0u, 1u, 0u}};
+    axes[n++] = {{0u, 0u, 1u}};
+    axes[n++] = {{1u, 0u, 1u}};
+    axes[n++] = {{1u, 1u, 1u}};
+    axes[n++] = {{0u, 1u, 1u}};
+
+    for (int i = 1; i < Order; ++i) {
+        axes[n++] = {{public_axis_index(i, Order), 0u, 0u}};
+    }
+    for (int j = 1; j < Order; ++j) {
+        axes[n++] = {{1u, public_axis_index(j, Order), 0u}};
+    }
+    for (int i = Order - 1; i >= 1; --i) {
+        axes[n++] = {{public_axis_index(i, Order), 1u, 0u}};
+    }
+    for (int j = Order - 1; j >= 1; --j) {
+        axes[n++] = {{0u, public_axis_index(j, Order), 0u}};
+    }
+    for (int i = 1; i < Order; ++i) {
+        axes[n++] = {{public_axis_index(i, Order), 0u, 1u}};
+    }
+    for (int j = 1; j < Order; ++j) {
+        axes[n++] = {{1u, public_axis_index(j, Order), 1u}};
+    }
+    for (int i = Order - 1; i >= 1; --i) {
+        axes[n++] = {{public_axis_index(i, Order), 1u, 1u}};
+    }
+    for (int j = Order - 1; j >= 1; --j) {
+        axes[n++] = {{0u, public_axis_index(j, Order), 1u}};
+    }
+    for (int k = 1; k < Order; ++k) {
+        axes[n++] = {{0u, 0u, public_axis_index(k, Order)}};
+    }
+    for (int k = 1; k < Order; ++k) {
+        axes[n++] = {{1u, 0u, public_axis_index(k, Order)}};
+    }
+    for (int k = 1; k < Order; ++k) {
+        axes[n++] = {{1u, 1u, public_axis_index(k, Order)}};
+    }
+    for (int k = 1; k < Order; ++k) {
+        axes[n++] = {{0u, 1u, public_axis_index(k, Order)}};
+    }
+
+    for (int j = 1; j < Order; ++j) {
+        for (int i = 1; i < Order; ++i) {
+            axes[n++] = {{public_axis_index(i, Order), public_axis_index(j, Order), 0u}};
+        }
+    }
+    for (int j = 1; j < Order; ++j) {
+        for (int i = 1; i < Order; ++i) {
+            axes[n++] = {{public_axis_index(i, Order), public_axis_index(j, Order), 1u}};
+        }
+    }
+    for (int k = 1; k < Order; ++k) {
+        for (int i = 1; i < Order; ++i) {
+            axes[n++] = {{public_axis_index(i, Order), 0u, public_axis_index(k, Order)}};
+        }
+    }
+    for (int k = 1; k < Order; ++k) {
+        for (int j = 1; j < Order; ++j) {
+            axes[n++] = {{1u, public_axis_index(j, Order), public_axis_index(k, Order)}};
+        }
+    }
+    for (int k = 1; k < Order; ++k) {
+        for (int i = Order - 1; i >= 1; --i) {
+            axes[n++] = {{public_axis_index(i, Order), 1u, public_axis_index(k, Order)}};
+        }
+    }
+    for (int k = 1; k < Order; ++k) {
+        for (int j = Order - 1; j >= 1; --j) {
+            axes[n++] = {{0u, public_axis_index(j, Order), public_axis_index(k, Order)}};
+        }
+    }
+
+    for (int k = 1; k < Order; ++k) {
+        for (int j = 1; j < Order; ++j) {
+            for (int i = 1; i < Order; ++i) {
+                axes[n++] = {{public_axis_index(i, Order),
+                              public_axis_index(j, Order),
+                              public_axis_index(k, Order)}};
+            }
+        }
+    }
+
+    return axes;
+}
+
+template<int Order>
+constexpr std::array<std::array<std::size_t, 3>, (Order + 1) * (Order + 2) / 2>
+make_triangle_simplex_exponents() {
+    std::array<std::array<std::size_t, 3>, (Order + 1) * (Order + 2) / 2> exponents{};
+    std::size_t n = 0;
+
+    exponents[n++] = {{static_cast<std::size_t>(Order), 0u, 0u}};
+    exponents[n++] = {{0u, static_cast<std::size_t>(Order), 0u}};
+    exponents[n++] = {{0u, 0u, static_cast<std::size_t>(Order)}};
+
+    for (int m = 1; m < Order; ++m) {
+        exponents[n++] = {{static_cast<std::size_t>(Order - m), static_cast<std::size_t>(m), 0u}};
+    }
+    for (int m = 1; m < Order; ++m) {
+        exponents[n++] = {{0u, static_cast<std::size_t>(Order - m), static_cast<std::size_t>(m)}};
+    }
+    for (int m = 1; m < Order; ++m) {
+        exponents[n++] = {{static_cast<std::size_t>(m), 0u, static_cast<std::size_t>(Order - m)}};
+    }
+
+    for (int c = 1; c <= Order - 2; ++c) {
+        for (int b = 1; b <= Order - c - 1; ++b) {
+            const int a = Order - b - c;
+            exponents[n++] = {{static_cast<std::size_t>(a),
+                               static_cast<std::size_t>(b),
+                               static_cast<std::size_t>(c)}};
+        }
+    }
+
+    return exponents;
+}
+
+template<int Order>
+constexpr std::array<std::array<std::size_t, 4>, (Order + 1) * (Order + 2) * (Order + 3) / 6>
+make_tetrahedron_simplex_exponents() {
+    std::array<std::array<std::size_t, 4>, (Order + 1) * (Order + 2) * (Order + 3) / 6> exponents{};
+    std::size_t n = 0;
+
+    exponents[n++] = {{static_cast<std::size_t>(Order), 0u, 0u, 0u}};
+    exponents[n++] = {{0u, static_cast<std::size_t>(Order), 0u, 0u}};
+    exponents[n++] = {{0u, 0u, static_cast<std::size_t>(Order), 0u}};
+    exponents[n++] = {{0u, 0u, 0u, static_cast<std::size_t>(Order)}};
+
+    constexpr int edges[6][2] = {
+        {0, 1}, {1, 2}, {2, 0}, {0, 3}, {1, 3}, {2, 3}
+    };
+    for (const auto& edge : edges) {
+        for (int m = 1; m < Order; ++m) {
+            std::array<std::size_t, 4> e{};
+            e[static_cast<std::size_t>(edge[0])] = static_cast<std::size_t>(Order - m);
+            e[static_cast<std::size_t>(edge[1])] = static_cast<std::size_t>(m);
+            exponents[n++] = e;
+        }
+    }
+
+    constexpr int faces[4][3] = {
+        {0, 1, 2},
+        {0, 1, 3},
+        {1, 2, 3},
+        {0, 2, 3},
+    };
+    for (const auto& face : faces) {
+        for (int c = 1; c <= Order - 2; ++c) {
+            for (int b = 1; b <= Order - c - 1; ++b) {
+                const int a = Order - b - c;
+                std::array<std::size_t, 4> e{};
+                e[static_cast<std::size_t>(face[0])] = static_cast<std::size_t>(a);
+                e[static_cast<std::size_t>(face[1])] = static_cast<std::size_t>(b);
+                e[static_cast<std::size_t>(face[2])] = static_cast<std::size_t>(c);
+                exponents[n++] = e;
+            }
+        }
+    }
+
+    for (int l = 1; l <= Order - 3; ++l) {
+        for (int k = 1; k <= Order - l - 2; ++k) {
+            for (int j = 1; j <= Order - l - k - 1; ++j) {
+                const int i = Order - j - k - l;
+                exponents[n++] = {{static_cast<std::size_t>(i),
+                                   static_cast<std::size_t>(j),
+                                   static_cast<std::size_t>(k),
+                                   static_cast<std::size_t>(l)}};
+            }
+        }
+    }
+
+    return exponents;
+}
+
+template<int Order, bool NeedFirst, bool NeedSecond>
+void fill_simplex_factor_sequence(Real lambda,
+                                  std::array<Real, Order + 1>& phi,
+                                  std::array<Real, Order + 1>* dphi,
+                                  std::array<Real, Order + 1>* d2phi) {
+    phi[0] = Real(1);
+    if constexpr (NeedFirst) {
+        (*dphi)[0] = Real(0);
+    }
+    if constexpr (NeedSecond) {
+        (*d2phi)[0] = Real(0);
+    }
+
+    const Real t = static_cast<Real>(Order) * lambda;
+    constexpr Real dt_dlambda = static_cast<Real>(Order);
+    Real dphi_dt_prev = Real(0);
+    Real d2phi_dt2_prev = Real(0);
+
+    for (int a = 1; a <= Order; ++a) {
+        const std::size_t au = static_cast<std::size_t>(a);
+        const Real inv_a = Real(1) / static_cast<Real>(a);
+        const Real s = (t - static_cast<Real>(a - 1)) * inv_a;
+        phi[au] = s * phi[au - 1];
+
+        if constexpr (NeedFirst) {
+            const Real dphi_dt = inv_a * phi[au - 1] + s * dphi_dt_prev;
+            (*dphi)[au] = dt_dlambda * dphi_dt;
+
+            if constexpr (NeedSecond) {
+                const Real d2phi_dt2 = Real(2) * inv_a * dphi_dt_prev + s * d2phi_dt2_prev;
+                (*d2phi)[au] = dt_dlambda * dt_dlambda * d2phi_dt2;
+                d2phi_dt2_prev = d2phi_dt2;
+            }
+
+            dphi_dt_prev = dphi_dt;
+        }
+    }
+}
+
+template<int Order>
+void fill_simplex_factor_values(Real lambda, std::array<Real, Order + 1>& phi) {
+    fill_simplex_factor_sequence<Order, false, false>(lambda, phi, nullptr, nullptr);
+}
+
+template<int Order>
+void fill_simplex_factor_values_first(Real lambda,
+                                      std::array<Real, Order + 1>& phi,
+                                      std::array<Real, Order + 1>& dphi) {
+    fill_simplex_factor_sequence<Order, true, false>(lambda, phi, &dphi, nullptr);
+}
+
+template<int Order>
+void fill_simplex_factor_values_first_second(Real lambda,
+                                             std::array<Real, Order + 1>& phi,
+                                             std::array<Real, Order + 1>& dphi,
+                                             std::array<Real, Order + 1>& d2phi) {
+    fill_simplex_factor_sequence<Order, true, true>(lambda, phi, &dphi, &d2phi);
+}
+
+} // namespace detail
+
+// ---------------------------------------------------------------------------
+// LagrangeLineFast<Order>
+// ---------------------------------------------------------------------------
+template<int Order>
+struct LagrangeLineFast;
+
+template<>
+struct LagrangeLineFast<1> {
+    static constexpr int n_dofs = 2;
+
+    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        out[0] = (Real(1) - xi[0]) * Real(0.5);
+        out[1] = (Real(1) + xi[0]) * Real(0.5);
+    }
+
+    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& /*xi*/,
+                                             std::array<Gradient, n_dofs>& out) {
+        out[0] = Gradient{Real(-0.5), Real(0), Real(0)};
+        out[1] = Gradient{Real( 0.5), Real(0), Real(0)};
+    }
+
+    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& /*xi*/,
+                                            std::array<Hessian, n_dofs>& out) {
+        out[0] = Hessian{};
+        out[1] = Hessian{};
+    }
+};
+
+template<>
+struct LagrangeLineFast<2> {
+    static constexpr int n_dofs = 3;
+
+    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        const Real x = xi[0];
+        out[0] = x * (x - Real(1)) * Real(0.5);
+        out[1] = x * (x + Real(1)) * Real(0.5);
+        out[2] = (Real(1) - x) * (Real(1) + x);
+    }
+
+    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                             std::array<Gradient, n_dofs>& out) {
+        const Real x = xi[0];
+        out[0] = Gradient{x - Real(0.5), Real(0), Real(0)};
+        out[1] = Gradient{x + Real(0.5), Real(0), Real(0)};
+        out[2] = Gradient{Real(-2) * x, Real(0), Real(0)};
+    }
+
+    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& /*xi*/,
+                                            std::array<Hessian, n_dofs>& out) {
+        out[0] = Hessian{};
+        out[1] = Hessian{};
+        out[2] = Hessian{};
+        out[0](0, 0) = Real(1);
+        out[1](0, 0) = Real(1);
+        out[2](0, 0) = Real(-2);
+    }
+};
+
+template<>
+struct LagrangeLineFast<3> {
+    static constexpr int n_dofs = 4;
+
+    static void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        detail::fill_axis_values<3>(xi[0], out);
+    }
+
+    static void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                   std::array<Gradient, n_dofs>& out) {
+        std::array<Real, n_dofs> values{};
+        std::array<Real, n_dofs> first{};
+        detail::fill_axis_values_first<3>(xi[0], values, first);
+        for (std::size_t i = 0; i < first.size(); ++i) {
+            out[i] = Gradient{first[i], Real(0), Real(0)};
+        }
+    }
+
+    static void evaluate_hessians(const math::Vector<Real, 3>& xi,
+                                  std::array<Hessian, n_dofs>& out) {
+        std::array<Real, n_dofs> values{};
+        std::array<Real, n_dofs> first{};
+        std::array<Real, n_dofs> second{};
+        detail::fill_axis_values_first_second<3>(xi[0], values, first, second);
+        for (std::size_t i = 0; i < second.size(); ++i) {
+            Hessian H{};
+            H(0, 0) = second[i];
+            out[i] = H;
+        }
+    }
+};
+
+// ---------------------------------------------------------------------------
+// LagrangeQuadFast<Order>
+// ---------------------------------------------------------------------------
+template<int Order>
+struct LagrangeQuadFast;
+
+template<>
+struct LagrangeQuadFast<1> {
+    static constexpr int n_dofs = 4;
+
+    // VTK Quad4 corner ordering: (-,-), (+,-), (+,+), (-,+).
+    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        const Real lx = (Real(1) - xi[0]) * Real(0.5);
+        const Real ly = (Real(1) - xi[1]) * Real(0.5);
+        const Real ux = (Real(1) + xi[0]) * Real(0.5);
+        const Real uy = (Real(1) + xi[1]) * Real(0.5);
+        out[0] = lx * ly;
+        out[1] = ux * ly;
+        out[2] = ux * uy;
+        out[3] = lx * uy;
+    }
+
+    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                             std::array<Gradient, n_dofs>& out) {
+        const Real lx = (Real(1) - xi[0]) * Real(0.5);
+        const Real ly = (Real(1) - xi[1]) * Real(0.5);
+        const Real ux = (Real(1) + xi[0]) * Real(0.5);
+        const Real uy = (Real(1) + xi[1]) * Real(0.5);
+        out[0] = Gradient{Real(-0.5) * ly, Real(-0.5) * lx, Real(0)};
+        out[1] = Gradient{Real( 0.5) * ly, Real(-0.5) * ux, Real(0)};
+        out[2] = Gradient{Real( 0.5) * uy, Real( 0.5) * ux, Real(0)};
+        out[3] = Gradient{Real(-0.5) * uy, Real( 0.5) * lx, Real(0)};
+    }
+
+    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& /*xi*/,
+                                            std::array<Hessian, n_dofs>& out) {
+        out[0] = Hessian{};
+        out[1] = Hessian{};
+        out[2] = Hessian{};
+        out[3] = Hessian{};
+        constexpr Real qrt = Real(0.25);
+        out[0](0, 1) = qrt;  out[0](1, 0) = qrt;
+        out[1](0, 1) = -qrt; out[1](1, 0) = -qrt;
+        out[2](0, 1) = qrt;  out[2](1, 0) = qrt;
+        out[3](0, 1) = -qrt; out[3](1, 0) = -qrt;
+    }
+};
+
+template<>
+struct LagrangeQuadFast<2> {
+    static constexpr int n_dofs = 9;
+
+    static constexpr std::array<std::array<std::size_t, 2>, n_dofs> node_axes = {{
+        {{0u, 0u}}, {{1u, 0u}}, {{1u, 1u}}, {{0u, 1u}},
+        {{2u, 0u}}, {{1u, 2u}}, {{2u, 1u}}, {{0u, 2u}},
+        {{2u, 2u}},
+    }};
+
+    static void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        std::array<Real, LagrangeLineFast<2>::n_dofs> lx{};
+        std::array<Real, LagrangeLineFast<2>::n_dofs> ly{};
+        LagrangeLineFast<2>::evaluate({xi[0], Real(0), Real(0)}, lx);
+        LagrangeLineFast<2>::evaluate({xi[1], Real(0), Real(0)}, ly);
+        for (std::size_t n = 0; n < node_axes.size(); ++n) {
+            out[n] = lx[node_axes[n][0]] * ly[node_axes[n][1]];
+        }
+    }
+
+    static void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                   std::array<Gradient, n_dofs>& out) {
+        std::array<Real, LagrangeLineFast<2>::n_dofs> lx{};
+        std::array<Real, LagrangeLineFast<2>::n_dofs> ly{};
+        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gx{};
+        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gy{};
+        LagrangeLineFast<2>::evaluate({xi[0], Real(0), Real(0)}, lx);
+        LagrangeLineFast<2>::evaluate({xi[1], Real(0), Real(0)}, ly);
+        LagrangeLineFast<2>::evaluate_gradients({xi[0], Real(0), Real(0)}, gx);
+        LagrangeLineFast<2>::evaluate_gradients({xi[1], Real(0), Real(0)}, gy);
+        for (std::size_t n = 0; n < node_axes.size(); ++n) {
+            const auto i = node_axes[n][0];
+            const auto j = node_axes[n][1];
+            out[n] = Gradient{gx[i][0] * ly[j], lx[i] * gy[j][0], Real(0)};
+        }
+    }
+
+    static void evaluate_hessians(const math::Vector<Real, 3>& xi,
+                                  std::array<Hessian, n_dofs>& out) {
+        std::array<Real, LagrangeLineFast<2>::n_dofs> lx{};
+        std::array<Real, LagrangeLineFast<2>::n_dofs> ly{};
+        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gx{};
+        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gy{};
+        std::array<Hessian, LagrangeLineFast<2>::n_dofs> hx{};
+        std::array<Hessian, LagrangeLineFast<2>::n_dofs> hy{};
+        LagrangeLineFast<2>::evaluate({xi[0], Real(0), Real(0)}, lx);
+        LagrangeLineFast<2>::evaluate({xi[1], Real(0), Real(0)}, ly);
+        LagrangeLineFast<2>::evaluate_gradients({xi[0], Real(0), Real(0)}, gx);
+        LagrangeLineFast<2>::evaluate_gradients({xi[1], Real(0), Real(0)}, gy);
+        LagrangeLineFast<2>::evaluate_hessians({xi[0], Real(0), Real(0)}, hx);
+        LagrangeLineFast<2>::evaluate_hessians({xi[1], Real(0), Real(0)}, hy);
+        for (std::size_t n = 0; n < node_axes.size(); ++n) {
+            const auto i = node_axes[n][0];
+            const auto j = node_axes[n][1];
+            Hessian H{};
+            H(0, 0) = hx[i](0, 0) * ly[j];
+            H(1, 1) = lx[i] * hy[j](0, 0);
+            H(0, 1) = gx[i][0] * gy[j][0];
+            H(1, 0) = H(0, 1);
+            out[n] = H;
+        }
+    }
+};
+
+template<>
+struct LagrangeQuadFast<3> {
+    static constexpr int n_dofs = 16;
+
+    static constexpr std::array<std::array<std::size_t, 2>, n_dofs> node_axes =
+        detail::make_quad_tensor_node_axes<3>();
+
+    static void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        std::array<Real, LagrangeLineFast<3>::n_dofs> lx{};
+        std::array<Real, LagrangeLineFast<3>::n_dofs> ly{};
+        detail::fill_axis_values<3>(xi[0], lx);
+        detail::fill_axis_values<3>(xi[1], ly);
+        for (std::size_t n = 0; n < node_axes.size(); ++n) {
+            out[n] = lx[node_axes[n][0]] * ly[node_axes[n][1]];
+        }
+    }
+
+    static void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                   std::array<Gradient, n_dofs>& out) {
+        std::array<Real, LagrangeLineFast<3>::n_dofs> lx{};
+        std::array<Real, LagrangeLineFast<3>::n_dofs> ly{};
+        std::array<Real, LagrangeLineFast<3>::n_dofs> gx{};
+        std::array<Real, LagrangeLineFast<3>::n_dofs> gy{};
+        detail::fill_axis_values_first<3>(xi[0], lx, gx);
+        detail::fill_axis_values_first<3>(xi[1], ly, gy);
+        for (std::size_t n = 0; n < node_axes.size(); ++n) {
+            const auto i = node_axes[n][0];
+            const auto j = node_axes[n][1];
+            out[n] = Gradient{gx[i] * ly[j], lx[i] * gy[j], Real(0)};
+        }
+    }
+
+    static void evaluate_hessians(const math::Vector<Real, 3>& xi,
+                                  std::array<Hessian, n_dofs>& out) {
+        std::array<Real, LagrangeLineFast<3>::n_dofs> lx{};
+        std::array<Real, LagrangeLineFast<3>::n_dofs> ly{};
+        std::array<Real, LagrangeLineFast<3>::n_dofs> gx{};
+        std::array<Real, LagrangeLineFast<3>::n_dofs> gy{};
+        std::array<Real, LagrangeLineFast<3>::n_dofs> hx{};
+        std::array<Real, LagrangeLineFast<3>::n_dofs> hy{};
+        detail::fill_axis_values_first_second<3>(xi[0], lx, gx, hx);
+        detail::fill_axis_values_first_second<3>(xi[1], ly, gy, hy);
+        for (std::size_t n = 0; n < node_axes.size(); ++n) {
+            const auto i = node_axes[n][0];
+            const auto j = node_axes[n][1];
+            Hessian H{};
+            H(0, 0) = hx[i] * ly[j];
+            H(1, 1) = lx[i] * hy[j];
+            H(0, 1) = gx[i] * gy[j];
+            H(1, 0) = H(0, 1);
+            out[n] = H;
+        }
+    }
+};
+
+// ---------------------------------------------------------------------------
+// LagrangeHexFast<Order>
+// ---------------------------------------------------------------------------
+template<int Order>
+struct LagrangeHexFast;
+
+template<>
+struct LagrangeHexFast<1> {
+    static constexpr int n_dofs = 8;
+
+    // VTK Hex8 corner ordering: (-,-,-), (+,-,-), (+,+,-), (-,+,-),
+    //                           (-,-,+), (+,-,+), (+,+,+), (-,+,+).
+    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        const Real lx = (Real(1) - xi[0]) * Real(0.5);
+        const Real ly = (Real(1) - xi[1]) * Real(0.5);
+        const Real lz = (Real(1) - xi[2]) * Real(0.5);
+        const Real ux = (Real(1) + xi[0]) * Real(0.5);
+        const Real uy = (Real(1) + xi[1]) * Real(0.5);
+        const Real uz = (Real(1) + xi[2]) * Real(0.5);
+        // Precompute z-plane partial products (sum factorization).
+        const Real lxly = lx * ly;
+        const Real uxly = ux * ly;
+        const Real uxuy = ux * uy;
+        const Real lxuy = lx * uy;
+        out[0] = lxly * lz;
+        out[1] = uxly * lz;
+        out[2] = uxuy * lz;
+        out[3] = lxuy * lz;
+        out[4] = lxly * uz;
+        out[5] = uxly * uz;
+        out[6] = uxuy * uz;
+        out[7] = lxuy * uz;
+    }
+
+    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                             std::array<Gradient, n_dofs>& out) {
+        const Real lx = (Real(1) - xi[0]) * Real(0.5);
+        const Real ly = (Real(1) - xi[1]) * Real(0.5);
+        const Real lz = (Real(1) - xi[2]) * Real(0.5);
+        const Real ux = (Real(1) + xi[0]) * Real(0.5);
+        const Real uy = (Real(1) + xi[1]) * Real(0.5);
+        const Real uz = (Real(1) + xi[2]) * Real(0.5);
+        // dL_0(x)/dx = -0.5, dL_1(x)/dx = +0.5 along each axis.
+        out[0] = Gradient{Real(-0.5) * ly * lz, Real(-0.5) * lx * lz, Real(-0.5) * lx * ly};
+        out[1] = Gradient{Real( 0.5) * ly * lz, Real(-0.5) * ux * lz, Real(-0.5) * ux * ly};
+        out[2] = Gradient{Real( 0.5) * uy * lz, Real( 0.5) * ux * lz, Real(-0.5) * ux * uy};
+        out[3] = Gradient{Real(-0.5) * uy * lz, Real( 0.5) * lx * lz, Real(-0.5) * lx * uy};
+        out[4] = Gradient{Real(-0.5) * ly * uz, Real(-0.5) * lx * uz, Real( 0.5) * lx * ly};
+        out[5] = Gradient{Real( 0.5) * ly * uz, Real(-0.5) * ux * uz, Real( 0.5) * ux * ly};
+        out[6] = Gradient{Real( 0.5) * uy * uz, Real( 0.5) * ux * uz, Real( 0.5) * ux * uy};
+        out[7] = Gradient{Real(-0.5) * uy * uz, Real( 0.5) * lx * uz, Real( 0.5) * lx * uy};
+    }
+
+    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& xi,
+                                            std::array<Hessian, n_dofs>& out) {
+        const Real lx = (Real(1) - xi[0]) * Real(0.5);
+        const Real ly = (Real(1) - xi[1]) * Real(0.5);
+        const Real lz = (Real(1) - xi[2]) * Real(0.5);
+        const Real ux = (Real(1) + xi[0]) * Real(0.5);
+        const Real uy = (Real(1) + xi[1]) * Real(0.5);
+        const Real uz = (Real(1) + xi[2]) * Real(0.5);
+        const Real ax[8] = {lx, ux, ux, lx, lx, ux, ux, lx};
+        const Real ay[8] = {ly, ly, uy, uy, ly, ly, uy, uy};
+        const Real az[8] = {lz, lz, lz, lz, uz, uz, uz, uz};
+        const int sx[8] = {-1, 1, 1, -1, -1, 1, 1, -1};
+        const int sy[8] = {-1, -1, 1, 1, -1, -1, 1, 1};
+        const int sz[8] = {-1, -1, -1, -1, 1, 1, 1, 1};
+        constexpr Real qrt = Real(0.25);
+        for (std::size_t n = 0; n < static_cast<std::size_t>(n_dofs); ++n) {
+            out[n] = Hessian{};
+            out[n](0, 1) = static_cast<Real>(sx[n] * sy[n]) * qrt * az[n];
+            out[n](1, 0) = out[n](0, 1);
+            out[n](0, 2) = static_cast<Real>(sx[n] * sz[n]) * qrt * ay[n];
+            out[n](2, 0) = out[n](0, 2);
+            out[n](1, 2) = static_cast<Real>(sy[n] * sz[n]) * qrt * ax[n];
+            out[n](2, 1) = out[n](1, 2);
+        }
+    }
+};
+
+template<>
+struct LagrangeHexFast<2> {
+    static constexpr int n_dofs = 27;
+
+    static constexpr std::array<std::array<std::size_t, 3>, n_dofs> node_axes = {{
+        {{0u, 0u, 0u}}, {{1u, 0u, 0u}}, {{1u, 1u, 0u}}, {{0u, 1u, 0u}},
+        {{0u, 0u, 1u}}, {{1u, 0u, 1u}}, {{1u, 1u, 1u}}, {{0u, 1u, 1u}},
+        {{2u, 0u, 0u}}, {{1u, 2u, 0u}}, {{2u, 1u, 0u}}, {{0u, 2u, 0u}},
+        {{2u, 0u, 1u}}, {{1u, 2u, 1u}}, {{2u, 1u, 1u}}, {{0u, 2u, 1u}},
+        {{0u, 0u, 2u}}, {{1u, 0u, 2u}}, {{1u, 1u, 2u}}, {{0u, 1u, 2u}},
+        {{2u, 2u, 0u}}, {{2u, 2u, 1u}}, {{2u, 0u, 2u}}, {{1u, 2u, 2u}},
+        {{2u, 1u, 2u}}, {{0u, 2u, 2u}}, {{2u, 2u, 2u}},
+    }};
+
+    static void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        std::array<Real, LagrangeLineFast<2>::n_dofs> lx{};
+        std::array<Real, LagrangeLineFast<2>::n_dofs> ly{};
+        std::array<Real, LagrangeLineFast<2>::n_dofs> lz{};
+        LagrangeLineFast<2>::evaluate({xi[0], Real(0), Real(0)}, lx);
+        LagrangeLineFast<2>::evaluate({xi[1], Real(0), Real(0)}, ly);
+        LagrangeLineFast<2>::evaluate({xi[2], Real(0), Real(0)}, lz);
+        for (std::size_t n = 0; n < node_axes.size(); ++n) {
+            out[n] = lx[node_axes[n][0]] * ly[node_axes[n][1]] * lz[node_axes[n][2]];
+        }
+    }
+
+    static void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                   std::array<Gradient, n_dofs>& out) {
+        std::array<Real, LagrangeLineFast<2>::n_dofs> lx{};
+        std::array<Real, LagrangeLineFast<2>::n_dofs> ly{};
+        std::array<Real, LagrangeLineFast<2>::n_dofs> lz{};
+        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gx{};
+        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gy{};
+        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gz{};
+        LagrangeLineFast<2>::evaluate({xi[0], Real(0), Real(0)}, lx);
+        LagrangeLineFast<2>::evaluate({xi[1], Real(0), Real(0)}, ly);
+        LagrangeLineFast<2>::evaluate({xi[2], Real(0), Real(0)}, lz);
+        LagrangeLineFast<2>::evaluate_gradients({xi[0], Real(0), Real(0)}, gx);
+        LagrangeLineFast<2>::evaluate_gradients({xi[1], Real(0), Real(0)}, gy);
+        LagrangeLineFast<2>::evaluate_gradients({xi[2], Real(0), Real(0)}, gz);
+        for (std::size_t n = 0; n < node_axes.size(); ++n) {
+            const auto i = node_axes[n][0];
+            const auto j = node_axes[n][1];
+            const auto k = node_axes[n][2];
+            out[n] = Gradient{
+                gx[i][0] * ly[j] * lz[k],
+                lx[i] * gy[j][0] * lz[k],
+                lx[i] * ly[j] * gz[k][0],
+            };
+        }
+    }
+
+    static void evaluate_hessians(const math::Vector<Real, 3>& xi,
+                                  std::array<Hessian, n_dofs>& out) {
+        std::array<Real, LagrangeLineFast<2>::n_dofs> lx{};
+        std::array<Real, LagrangeLineFast<2>::n_dofs> ly{};
+        std::array<Real, LagrangeLineFast<2>::n_dofs> lz{};
+        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gx{};
+        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gy{};
+        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gz{};
+        std::array<Hessian, LagrangeLineFast<2>::n_dofs> hx{};
+        std::array<Hessian, LagrangeLineFast<2>::n_dofs> hy{};
+        std::array<Hessian, LagrangeLineFast<2>::n_dofs> hz{};
+        LagrangeLineFast<2>::evaluate({xi[0], Real(0), Real(0)}, lx);
+        LagrangeLineFast<2>::evaluate({xi[1], Real(0), Real(0)}, ly);
+        LagrangeLineFast<2>::evaluate({xi[2], Real(0), Real(0)}, lz);
+        LagrangeLineFast<2>::evaluate_gradients({xi[0], Real(0), Real(0)}, gx);
+        LagrangeLineFast<2>::evaluate_gradients({xi[1], Real(0), Real(0)}, gy);
+        LagrangeLineFast<2>::evaluate_gradients({xi[2], Real(0), Real(0)}, gz);
+        LagrangeLineFast<2>::evaluate_hessians({xi[0], Real(0), Real(0)}, hx);
+        LagrangeLineFast<2>::evaluate_hessians({xi[1], Real(0), Real(0)}, hy);
+        LagrangeLineFast<2>::evaluate_hessians({xi[2], Real(0), Real(0)}, hz);
+        for (std::size_t n = 0; n < node_axes.size(); ++n) {
+            const auto i = node_axes[n][0];
+            const auto j = node_axes[n][1];
+            const auto k = node_axes[n][2];
+            Hessian H{};
+            H(0, 0) = hx[i](0, 0) * ly[j] * lz[k];
+            H(1, 1) = lx[i] * hy[j](0, 0) * lz[k];
+            H(2, 2) = lx[i] * ly[j] * hz[k](0, 0);
+            H(0, 1) = gx[i][0] * gy[j][0] * lz[k];
+            H(1, 0) = H(0, 1);
+            H(0, 2) = gx[i][0] * ly[j] * gz[k][0];
+            H(2, 0) = H(0, 2);
+            H(1, 2) = lx[i] * gy[j][0] * gz[k][0];
+            H(2, 1) = H(1, 2);
+            out[n] = H;
+        }
+    }
+};
+
+template<>
+struct LagrangeHexFast<3> {
+    static constexpr int n_dofs = 64;
+
+    static constexpr std::array<std::array<std::size_t, 3>, n_dofs> node_axes =
+        detail::make_hex_tensor_node_axes<3>();
+
+    static void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        std::array<Real, LagrangeLineFast<3>::n_dofs> lx{};
+        std::array<Real, LagrangeLineFast<3>::n_dofs> ly{};
+        std::array<Real, LagrangeLineFast<3>::n_dofs> lz{};
+        detail::fill_axis_values<3>(xi[0], lx);
+        detail::fill_axis_values<3>(xi[1], ly);
+        detail::fill_axis_values<3>(xi[2], lz);
+        for (std::size_t n = 0; n < node_axes.size(); ++n) {
+            out[n] = lx[node_axes[n][0]] * ly[node_axes[n][1]] * lz[node_axes[n][2]];
+        }
+    }
+
+    static void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                   std::array<Gradient, n_dofs>& out) {
+        std::array<Real, LagrangeLineFast<3>::n_dofs> lx{};
+        std::array<Real, LagrangeLineFast<3>::n_dofs> ly{};
+        std::array<Real, LagrangeLineFast<3>::n_dofs> lz{};
+        std::array<Real, LagrangeLineFast<3>::n_dofs> gx{};
+        std::array<Real, LagrangeLineFast<3>::n_dofs> gy{};
+        std::array<Real, LagrangeLineFast<3>::n_dofs> gz{};
+        detail::fill_axis_values_first<3>(xi[0], lx, gx);
+        detail::fill_axis_values_first<3>(xi[1], ly, gy);
+        detail::fill_axis_values_first<3>(xi[2], lz, gz);
+        for (std::size_t n = 0; n < node_axes.size(); ++n) {
+            const auto i = node_axes[n][0];
+            const auto j = node_axes[n][1];
+            const auto k = node_axes[n][2];
+            out[n] = Gradient{
+                gx[i] * ly[j] * lz[k],
+                lx[i] * gy[j] * lz[k],
+                lx[i] * ly[j] * gz[k],
+            };
+        }
+    }
+
+    static void evaluate_hessians(const math::Vector<Real, 3>& xi,
+                                  std::array<Hessian, n_dofs>& out) {
+        std::array<Real, LagrangeLineFast<3>::n_dofs> lx{};
+        std::array<Real, LagrangeLineFast<3>::n_dofs> ly{};
+        std::array<Real, LagrangeLineFast<3>::n_dofs> lz{};
+        std::array<Real, LagrangeLineFast<3>::n_dofs> gx{};
+        std::array<Real, LagrangeLineFast<3>::n_dofs> gy{};
+        std::array<Real, LagrangeLineFast<3>::n_dofs> gz{};
+        std::array<Real, LagrangeLineFast<3>::n_dofs> hx{};
+        std::array<Real, LagrangeLineFast<3>::n_dofs> hy{};
+        std::array<Real, LagrangeLineFast<3>::n_dofs> hz{};
+        detail::fill_axis_values_first_second<3>(xi[0], lx, gx, hx);
+        detail::fill_axis_values_first_second<3>(xi[1], ly, gy, hy);
+        detail::fill_axis_values_first_second<3>(xi[2], lz, gz, hz);
+        for (std::size_t n = 0; n < node_axes.size(); ++n) {
+            const auto i = node_axes[n][0];
+            const auto j = node_axes[n][1];
+            const auto k = node_axes[n][2];
+            Hessian H{};
+            H(0, 0) = hx[i] * ly[j] * lz[k];
+            H(1, 1) = lx[i] * hy[j] * lz[k];
+            H(2, 2) = lx[i] * ly[j] * hz[k];
+            H(0, 1) = gx[i] * gy[j] * lz[k];
+            H(1, 0) = H(0, 1);
+            H(0, 2) = gx[i] * ly[j] * gz[k];
+            H(2, 0) = H(0, 2);
+            H(1, 2) = lx[i] * gy[j] * gz[k];
+            H(2, 1) = H(1, 2);
+            out[n] = H;
+        }
+    }
+};
+
+// ---------------------------------------------------------------------------
+// LagrangeTriFast<Order>
+// ---------------------------------------------------------------------------
+template<int Order>
+struct LagrangeTriFast;
+
+template<>
+struct LagrangeTriFast<1> {
+    static constexpr int n_dofs = 3;
+
+    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        out[0] = Real(1) - xi[0] - xi[1];
+        out[1] = xi[0];
+        out[2] = xi[1];
+    }
+
+    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& /*xi*/,
+                                             std::array<Gradient, n_dofs>& out) {
+        out[0] = Gradient{Real(-1), Real(-1), Real(0)};
+        out[1] = Gradient{Real( 1), Real( 0), Real(0)};
+        out[2] = Gradient{Real( 0), Real( 1), Real(0)};
+    }
+
+    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& /*xi*/,
+                                            std::array<Hessian, n_dofs>& out) {
+        out[0] = Hessian{};
+        out[1] = Hessian{};
+        out[2] = Hessian{};
+    }
+};
+
+template<>
+struct LagrangeTriFast<2> {
+    static constexpr int n_dofs = 6;
+
+    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l0 = Real(1) - l1 - l2;
+
+        out[0] = l0 * (Real(2) * l0 - Real(1));
+        out[1] = l1 * (Real(2) * l1 - Real(1));
+        out[2] = l2 * (Real(2) * l2 - Real(1));
+        out[3] = Real(4) * l0 * l1;
+        out[4] = Real(4) * l1 * l2;
+        out[5] = Real(4) * l0 * l2;
+    }
+
+    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                             std::array<Gradient, n_dofs>& out) {
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l0 = Real(1) - l1 - l2;
+        constexpr Gradient g0{Real(-1), Real(-1), Real(0)};
+        constexpr Gradient g1{Real( 1), Real( 0), Real(0)};
+        constexpr Gradient g2{Real( 0), Real( 1), Real(0)};
+
+        out[0] = detail::scaled_gradient(g0, Real(4) * l0 - Real(1));
+        out[1] = detail::scaled_gradient(g1, Real(4) * l1 - Real(1));
+        out[2] = detail::scaled_gradient(g2, Real(4) * l2 - Real(1));
+        out[3] = detail::p2_edge_gradient(l0, g0, l1, g1);
+        out[4] = detail::p2_edge_gradient(l1, g1, l2, g2);
+        out[5] = detail::p2_edge_gradient(l0, g0, l2, g2);
+    }
+
+    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& /*xi*/,
+                                            std::array<Hessian, n_dofs>& out) {
+        constexpr Gradient g0{Real(-1), Real(-1), Real(0)};
+        constexpr Gradient g1{Real( 1), Real( 0), Real(0)};
+        constexpr Gradient g2{Real( 0), Real( 1), Real(0)};
+
+        out[0] = detail::p2_vertex_hessian(g0);
+        out[1] = detail::p2_vertex_hessian(g1);
+        out[2] = detail::p2_vertex_hessian(g2);
+        out[3] = detail::p2_edge_hessian(g0, g1);
+        out[4] = detail::p2_edge_hessian(g1, g2);
+        out[5] = detail::p2_edge_hessian(g0, g2);
+    }
+};
+
+template<>
+struct LagrangeTriFast<3> {
+    static constexpr int n_dofs = 10;
+
+    static constexpr std::array<std::array<std::size_t, 3>, n_dofs> exponents =
+        detail::make_triangle_simplex_exponents<3>();
+
+    static void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l0 = Real(1) - l1 - l2;
+        std::array<Real, 4> phi0{};
+        std::array<Real, 4> phi1{};
+        std::array<Real, 4> phi2{};
+        detail::fill_simplex_factor_values<3>(l0, phi0);
+        detail::fill_simplex_factor_values<3>(l1, phi1);
+        detail::fill_simplex_factor_values<3>(l2, phi2);
+
+        for (std::size_t n = 0; n < exponents.size(); ++n) {
+            const auto& e = exponents[n];
+            out[n] = phi0[e[0]] * phi1[e[1]] * phi2[e[2]];
+        }
+    }
+
+    static void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                   std::array<Gradient, n_dofs>& out) {
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l0 = Real(1) - l1 - l2;
+        std::array<Real, 4> phi0{};
+        std::array<Real, 4> phi1{};
+        std::array<Real, 4> phi2{};
+        std::array<Real, 4> dphi0{};
+        std::array<Real, 4> dphi1{};
+        std::array<Real, 4> dphi2{};
+        detail::fill_simplex_factor_values_first<3>(l0, phi0, dphi0);
+        detail::fill_simplex_factor_values_first<3>(l1, phi1, dphi1);
+        detail::fill_simplex_factor_values_first<3>(l2, phi2, dphi2);
+
+        for (std::size_t n = 0; n < exponents.size(); ++n) {
+            const auto& e = exponents[n];
+            const Real v0 = phi0[e[0]];
+            const Real v1 = phi1[e[1]];
+            const Real v2 = phi2[e[2]];
+            const Real dl0 = dphi0[e[0]] * v1 * v2;
+            const Real dl1 = v0 * dphi1[e[1]] * v2;
+            const Real dl2 = v0 * v1 * dphi2[e[2]];
+            out[n] = Gradient{dl1 - dl0, dl2 - dl0, Real(0)};
+        }
+    }
+
+    static void evaluate_hessians(const math::Vector<Real, 3>& xi,
+                                  std::array<Hessian, n_dofs>& out) {
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l0 = Real(1) - l1 - l2;
+        std::array<Real, 4> phi0{};
+        std::array<Real, 4> phi1{};
+        std::array<Real, 4> phi2{};
+        std::array<Real, 4> dphi0{};
+        std::array<Real, 4> dphi1{};
+        std::array<Real, 4> dphi2{};
+        std::array<Real, 4> d2phi0{};
+        std::array<Real, 4> d2phi1{};
+        std::array<Real, 4> d2phi2{};
+        detail::fill_simplex_factor_values_first_second<3>(l0, phi0, dphi0, d2phi0);
+        detail::fill_simplex_factor_values_first_second<3>(l1, phi1, dphi1, d2phi1);
+        detail::fill_simplex_factor_values_first_second<3>(l2, phi2, dphi2, d2phi2);
+
+        for (std::size_t n = 0; n < exponents.size(); ++n) {
+            const auto& e = exponents[n];
+            const Real v0 = phi0[e[0]];
+            const Real v1 = phi1[e[1]];
+            const Real v2 = phi2[e[2]];
+            const Real D0 = dphi0[e[0]];
+            const Real D1 = dphi1[e[1]];
+            const Real D2 = dphi2[e[2]];
+            const Real H00 = d2phi0[e[0]] * v1 * v2;
+            const Real H11 = v0 * d2phi1[e[1]] * v2;
+            const Real H22 = v0 * v1 * d2phi2[e[2]];
+            const Real H01 = D0 * D1 * v2;
+            const Real H02 = D0 * v1 * D2;
+            const Real H12 = v0 * D1 * D2;
+
+            Hessian H{};
+            H(0, 0) = H00 - Real(2) * H01 + H11;
+            H(1, 1) = H00 - Real(2) * H02 + H22;
+            H(0, 1) = H00 - H01 - H02 + H12;
+            H(1, 0) = H(0, 1);
+            out[n] = H;
+        }
+    }
+};
+
+// ---------------------------------------------------------------------------
+// LagrangeTetFast<Order>
+// ---------------------------------------------------------------------------
+template<int Order>
+struct LagrangeTetFast;
+
+template<>
+struct LagrangeTetFast<1> {
+    static constexpr int n_dofs = 4;
+
+    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        out[0] = Real(1) - xi[0] - xi[1] - xi[2];
+        out[1] = xi[0];
+        out[2] = xi[1];
+        out[3] = xi[2];
+    }
+
+    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& /*xi*/,
+                                             std::array<Gradient, n_dofs>& out) {
+        out[0] = Gradient{Real(-1), Real(-1), Real(-1)};
+        out[1] = Gradient{Real( 1), Real( 0), Real( 0)};
+        out[2] = Gradient{Real( 0), Real( 1), Real( 0)};
+        out[3] = Gradient{Real( 0), Real( 0), Real( 1)};
+    }
+
+    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& /*xi*/,
+                                            std::array<Hessian, n_dofs>& out) {
+        out[0] = Hessian{};
+        out[1] = Hessian{};
+        out[2] = Hessian{};
+        out[3] = Hessian{};
+    }
+};
+
+template<>
+struct LagrangeTetFast<2> {
+    static constexpr int n_dofs = 10;
+
+    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l3 = xi[2];
+        const Real l0 = Real(1) - l1 - l2 - l3;
+
+        out[0] = l0 * (Real(2) * l0 - Real(1));
+        out[1] = l1 * (Real(2) * l1 - Real(1));
+        out[2] = l2 * (Real(2) * l2 - Real(1));
+        out[3] = l3 * (Real(2) * l3 - Real(1));
+        out[4] = Real(4) * l0 * l1;
+        out[5] = Real(4) * l1 * l2;
+        out[6] = Real(4) * l0 * l2;
+        out[7] = Real(4) * l0 * l3;
+        out[8] = Real(4) * l1 * l3;
+        out[9] = Real(4) * l2 * l3;
+    }
+
+    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                             std::array<Gradient, n_dofs>& out) {
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l3 = xi[2];
+        const Real l0 = Real(1) - l1 - l2 - l3;
+        constexpr Gradient g0{Real(-1), Real(-1), Real(-1)};
+        constexpr Gradient g1{Real( 1), Real( 0), Real( 0)};
+        constexpr Gradient g2{Real( 0), Real( 1), Real( 0)};
+        constexpr Gradient g3{Real( 0), Real( 0), Real( 1)};
+
+        out[0] = detail::scaled_gradient(g0, Real(4) * l0 - Real(1));
+        out[1] = detail::scaled_gradient(g1, Real(4) * l1 - Real(1));
+        out[2] = detail::scaled_gradient(g2, Real(4) * l2 - Real(1));
+        out[3] = detail::scaled_gradient(g3, Real(4) * l3 - Real(1));
+        out[4] = detail::p2_edge_gradient(l0, g0, l1, g1);
+        out[5] = detail::p2_edge_gradient(l1, g1, l2, g2);
+        out[6] = detail::p2_edge_gradient(l0, g0, l2, g2);
+        out[7] = detail::p2_edge_gradient(l0, g0, l3, g3);
+        out[8] = detail::p2_edge_gradient(l1, g1, l3, g3);
+        out[9] = detail::p2_edge_gradient(l2, g2, l3, g3);
+    }
+
+    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& /*xi*/,
+                                            std::array<Hessian, n_dofs>& out) {
+        constexpr Gradient g0{Real(-1), Real(-1), Real(-1)};
+        constexpr Gradient g1{Real( 1), Real( 0), Real( 0)};
+        constexpr Gradient g2{Real( 0), Real( 1), Real( 0)};
+        constexpr Gradient g3{Real( 0), Real( 0), Real( 1)};
+
+        out[0] = detail::p2_vertex_hessian(g0);
+        out[1] = detail::p2_vertex_hessian(g1);
+        out[2] = detail::p2_vertex_hessian(g2);
+        out[3] = detail::p2_vertex_hessian(g3);
+        out[4] = detail::p2_edge_hessian(g0, g1);
+        out[5] = detail::p2_edge_hessian(g1, g2);
+        out[6] = detail::p2_edge_hessian(g0, g2);
+        out[7] = detail::p2_edge_hessian(g0, g3);
+        out[8] = detail::p2_edge_hessian(g1, g3);
+        out[9] = detail::p2_edge_hessian(g2, g3);
+    }
+};
+
+template<>
+struct LagrangeTetFast<3> {
+    static constexpr int n_dofs = 20;
+
+    static constexpr std::array<std::array<std::size_t, 4>, n_dofs> exponents =
+        detail::make_tetrahedron_simplex_exponents<3>();
+
+    static void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l3 = xi[2];
+        const Real l0 = Real(1) - l1 - l2 - l3;
+        std::array<Real, 4> phi0{};
+        std::array<Real, 4> phi1{};
+        std::array<Real, 4> phi2{};
+        std::array<Real, 4> phi3{};
+        detail::fill_simplex_factor_values<3>(l0, phi0);
+        detail::fill_simplex_factor_values<3>(l1, phi1);
+        detail::fill_simplex_factor_values<3>(l2, phi2);
+        detail::fill_simplex_factor_values<3>(l3, phi3);
+
+        for (std::size_t n = 0; n < exponents.size(); ++n) {
+            const auto& e = exponents[n];
+            out[n] = phi0[e[0]] * phi1[e[1]] * phi2[e[2]] * phi3[e[3]];
+        }
+    }
+
+    static void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                   std::array<Gradient, n_dofs>& out) {
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l3 = xi[2];
+        const Real l0 = Real(1) - l1 - l2 - l3;
+        std::array<Real, 4> phi0{};
+        std::array<Real, 4> phi1{};
+        std::array<Real, 4> phi2{};
+        std::array<Real, 4> phi3{};
+        std::array<Real, 4> dphi0{};
+        std::array<Real, 4> dphi1{};
+        std::array<Real, 4> dphi2{};
+        std::array<Real, 4> dphi3{};
+        detail::fill_simplex_factor_values_first<3>(l0, phi0, dphi0);
+        detail::fill_simplex_factor_values_first<3>(l1, phi1, dphi1);
+        detail::fill_simplex_factor_values_first<3>(l2, phi2, dphi2);
+        detail::fill_simplex_factor_values_first<3>(l3, phi3, dphi3);
+
+        for (std::size_t n = 0; n < exponents.size(); ++n) {
+            const auto& e = exponents[n];
+            const Real v0 = phi0[e[0]];
+            const Real v1 = phi1[e[1]];
+            const Real v2 = phi2[e[2]];
+            const Real v3 = phi3[e[3]];
+            const Real dl0 = dphi0[e[0]] * v1 * v2 * v3;
+            const Real dl1 = v0 * dphi1[e[1]] * v2 * v3;
+            const Real dl2 = v0 * v1 * dphi2[e[2]] * v3;
+            const Real dl3 = v0 * v1 * v2 * dphi3[e[3]];
+            out[n] = Gradient{dl1 - dl0, dl2 - dl0, dl3 - dl0};
+        }
+    }
+
+    static void evaluate_hessians(const math::Vector<Real, 3>& xi,
+                                  std::array<Hessian, n_dofs>& out) {
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l3 = xi[2];
+        const Real l0 = Real(1) - l1 - l2 - l3;
+        std::array<Real, 4> phi0{};
+        std::array<Real, 4> phi1{};
+        std::array<Real, 4> phi2{};
+        std::array<Real, 4> phi3{};
+        std::array<Real, 4> dphi0{};
+        std::array<Real, 4> dphi1{};
+        std::array<Real, 4> dphi2{};
+        std::array<Real, 4> dphi3{};
+        std::array<Real, 4> d2phi0{};
+        std::array<Real, 4> d2phi1{};
+        std::array<Real, 4> d2phi2{};
+        std::array<Real, 4> d2phi3{};
+        detail::fill_simplex_factor_values_first_second<3>(l0, phi0, dphi0, d2phi0);
+        detail::fill_simplex_factor_values_first_second<3>(l1, phi1, dphi1, d2phi1);
+        detail::fill_simplex_factor_values_first_second<3>(l2, phi2, dphi2, d2phi2);
+        detail::fill_simplex_factor_values_first_second<3>(l3, phi3, dphi3, d2phi3);
+
+        for (std::size_t n = 0; n < exponents.size(); ++n) {
+            const auto& e = exponents[n];
+            const Real v0 = phi0[e[0]];
+            const Real v1 = phi1[e[1]];
+            const Real v2 = phi2[e[2]];
+            const Real v3 = phi3[e[3]];
+            const Real D0 = dphi0[e[0]];
+            const Real D1 = dphi1[e[1]];
+            const Real D2 = dphi2[e[2]];
+            const Real D3 = dphi3[e[3]];
+
+            const Real H00 = d2phi0[e[0]] * v1 * v2 * v3;
+            const Real H11 = v0 * d2phi1[e[1]] * v2 * v3;
+            const Real H22 = v0 * v1 * d2phi2[e[2]] * v3;
+            const Real H33 = v0 * v1 * v2 * d2phi3[e[3]];
+            const Real H01 = D0 * D1 * v2 * v3;
+            const Real H02 = D0 * v1 * D2 * v3;
+            const Real H03 = D0 * v1 * v2 * D3;
+            const Real H12 = v0 * D1 * D2 * v3;
+            const Real H13 = v0 * D1 * v2 * D3;
+            const Real H23 = v0 * v1 * D2 * D3;
+
+            Hessian H{};
+            H(0, 0) = H00 - Real(2) * H01 + H11;
+            H(1, 1) = H00 - Real(2) * H02 + H22;
+            H(2, 2) = H00 - Real(2) * H03 + H33;
+            H(0, 1) = H00 - H01 - H02 + H12;
+            H(1, 0) = H(0, 1);
+            H(0, 2) = H00 - H01 - H03 + H13;
+            H(2, 0) = H(0, 2);
+            H(1, 2) = H00 - H02 - H03 + H23;
+            H(2, 1) = H(1, 2);
+            out[n] = H;
+        }
+    }
+};
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_LAGRANGEBASISFAST_H
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasisPyramid.cpp b/Code/Source/solver/FE/Basis/LagrangeBasisPyramid.cpp
new file mode 100644
index 000000000..4a332621e
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/LagrangeBasisPyramid.cpp
@@ -0,0 +1,2069 @@
+#include "LagrangeBasisPyramid.h"
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "Basis/BasisExceptions.h"
+#include "BasisTolerance.h"
+#include "Math/DenseLinearAlgebra.h"
+#include "Math/DenseTransformKernels.h"
+#include "LagrangeBasisUtility.h"
+#include "PyramidModalBasis.h"
+
+namespace svmp {
+namespace FE {
+namespace basis {
+namespace detail {
+
+class PyramidLagrangeCache {
+public:
+    using ModalTerm = pyramid_modal::Term;
+
+    struct UvPolynomial {
+        using Power = std::pair<int, int>;
+        std::vector<std::pair<Power, Real>> coeffs;
+
+        void add_term(int pu, int pv, Real coeff, Real tol = Real(1e-14)) {
+            if (std::abs(coeff) <= tol) {
+                return;
+            }
+            const auto key = std::make_pair(pu, pv);
+            const auto found = std::lower_bound(
+                coeffs.begin(),
+                coeffs.end(),
+                key,
+                [](const auto& entry, const Power& value) { return entry.first < value; });
+            if (found == coeffs.end() || found->first != key) {
+                coeffs.insert(found, {key, coeff});
+                return;
+            }
+
+            found->second += coeff;
+            if (std::abs(found->second) <= tol) {
+                coeffs.erase(found);
+            }
+        }
+
+        void add_scaled(const UvPolynomial& other, Real scale, Real tol = Real(1e-14)) {
+            if (std::abs(scale) <= tol) {
+                return;
+            }
+            for (const auto& [powers, coeff] : other.coeffs) {
+                add_term(powers.first, powers.second, scale * coeff, tol);
+            }
+        }
+
+        bool empty(Real tol = Real(1e-12)) const {
+            for (const auto& [powers, coeff] : coeffs) {
+                (void)powers;
+                if (std::abs(coeff) > tol) {
+                    return false;
+                }
+            }
+            return true;
+        }
+
+        bool is_constant(Real tol = Real(1e-12)) const {
+            for (const auto& [powers, coeff] : coeffs) {
+                if (std::abs(coeff) <= tol) {
+                    continue;
+                }
+                if (powers.first != 0 || powers.second != 0) {
+                    return false;
+                }
+            }
+            return true;
+        }
+
+        Real constant_value(Real tol = Real(1e-12)) const {
+            Real value = Real(0);
+            for (const auto& [powers, coeff] : coeffs) {
+                if (std::abs(coeff) <= tol) {
+                    continue;
+                }
+                if (powers.first == 0 && powers.second == 0) {
+                    value += coeff;
+                }
+            }
+            return value;
+        }
+    };
+
+    struct ApexSeries {
+        std::vector<std::pair<int, UvPolynomial>> by_power;
+
+        void add_term(int beta, int pu, int pv, Real coeff, Real tol = Real(1e-14)) {
+            const auto found = find_or_insert(beta);
+            found->second.add_term(pu, pv, coeff, tol);
+            if (found->second.empty(tol)) {
+                by_power.erase(found);
+            }
+        }
+
+        void add_scaled(const ApexSeries& other, Real scale, Real tol = Real(1e-14)) {
+            if (std::abs(scale) <= tol) {
+                return;
+            }
+            for (const auto& [beta, poly] : other.by_power) {
+                const auto found = find_or_insert(beta);
+                found->second.add_scaled(poly, scale, tol);
+                if (found->second.empty(tol)) {
+                    by_power.erase(found);
+                }
+            }
+        }
+
+    private:
+        std::vector<std::pair<int, UvPolynomial>>::iterator find_or_insert(int beta) {
+            const auto found = std::lower_bound(
+                by_power.begin(),
+                by_power.end(),
+                beta,
+                [](const auto& entry, int value) { return entry.first < value; });
+            if (found != by_power.end() && found->first == beta) {
+                return found;
+            }
+            return by_power.insert(found, {beta, UvPolynomial{}});
+        }
+    };
+
+    using GradientSeries = std::array<ApexSeries, 3>;
+    using HessianSeries = std::array<std::array<ApexSeries, 3>, 3>;
+
+    enum class ApexLimitKind {
+        Constant,
+        DirectionDependent,
+        Singular,
+    };
+
+    enum class ApexRankStatus {
+        Exact,
+        DirectionDependent,
+        Singular,
+    };
+
+    struct ApexClassification {
+        ApexLimitKind kind{ApexLimitKind::Constant};
+        Real constant_value{0};
+        int leading_power{1};
+    };
+
+    struct ApexData {
+        std::vector<Real> values;
+        std::vector<Gradient> gradients;
+        std::vector<Hessian> hessians;
+        ApexRankStatus gradient_status{ApexRankStatus::Exact};
+        ApexRankStatus hessian_status{ApexRankStatus::Exact};
+    };
+
+    struct OrderData {
+        int order{0};
+        std::vector<math::Vector<Real, 3>> nodes;
+        std::vector<ModalTerm> modal_terms;
+        std::vector<Real> modal_to_nodal;
+        ApexData apex;
+    };
+
+    struct EvaluationScratch {
+        std::vector<Real> modal_values;
+        std::vector<Real> modal_gradient_components;
+        std::vector<Real> modal_hessian_components;
+        std::vector<Gradient> modal_gradients;
+        std::vector<Hessian> modal_hessians;
+        pyramid_modal::EvaluationPoint modal_point;
+
+        void prewarm(std::size_t max_size, std::size_t max_qpts) {
+            const std::size_t batched_size = max_size * std::max<std::size_t>(max_qpts, 1u);
+            modal_values.reserve(batched_size);
+            modal_gradient_components.reserve(batched_size * 3u);
+            modal_hessian_components.reserve(batched_size * 9u);
+            modal_gradients.reserve(max_size);
+            modal_hessians.reserve(max_size);
+        }
+    };
+
+    static EvaluationScratch& evaluation_scratch() {
+        // Scratch is intentionally thread-local: production assembly uses a
+        // persistent worker-thread team, so buffers stay warm on each worker.
+        static thread_local EvaluationScratch scratch;
+        return scratch;
+    }
+
+    static void prewarm_scratch(std::size_t max_size, std::size_t max_qpts) {
+        evaluation_scratch().prewarm(max_size, max_qpts);
+    }
+
+    static bool is_apex_point(const math::Vector<Real, 3>& xi) {
+        const Real tol = apex_coord_tolerance();
+        return std::abs(xi[0]) <= tol &&
+               std::abs(xi[1]) <= tol &&
+               std::abs(Real(1) - xi[2]) <= tol;
+    }
+
+    static bool on_degenerate_top_plane(const math::Vector<Real, 3>& xi) {
+        return basis_near_zero(Real(1) - xi[2]);
+    }
+
+    static void validate_top_plane_query(const math::Vector<Real, 3>& xi) {
+        if (on_degenerate_top_plane(xi) && !is_apex_point(xi)) [[unlikely]] {
+            throw BasisEvaluationException(
+                "Pyramid reference evaluation on the degenerate z=1 plane is only defined at the apex",
+                __FILE__, __LINE__, __func__);
+        }
+    }
+
+    static OrderData build_order_data(int order) {
+        OrderData data;
+        data.order = order;
+
+        data.nodes = build_public_nodes(order);
+        data.modal_terms = pyramid_modal::build_terms(order);
+
+        const std::size_t n = data.nodes.size();
+        if (data.modal_terms.size() != n) {
+            throw BasisConstructionException("LagrangeBasis pyramid modal basis size mismatch",
+                                             __FILE__, __LINE__, __func__);
+        }
+
+        std::vector<Real> vandermonde(n * n, Real(0));
+        for (std::size_t row = 0; row < n; ++row) {
+            pyramid_modal::EvaluationPoint modal_point;
+            pyramid_modal::prepare_evaluation_point(
+                data.modal_terms, data.nodes[row], modal_point);
+            for (std::size_t col = 0; col < n; ++col) {
+                Real value = Real(0);
+                pyramid_modal::evaluate_term(data.modal_terms[col], modal_point, value);
+                vandermonde[row * n + col] = value;
+            }
+        }
+
+        const auto inverse_result = math::invert_dense_matrix_with_diagnostics(
+            std::move(vandermonde),
+            n,
+            "LagrangeBasis pyramid Vandermonde");
+        math::validate_dense_inverse_diagnostics(
+            inverse_result,
+            n,
+            "LagrangeBasis pyramid Vandermonde");
+        const std::vector<Real>& inverse = inverse_result.inverse;
+
+        data.modal_to_nodal.assign(n * n, Real(0));
+        for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
+            for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
+                data.modal_to_nodal[basis_i * n + modal_j] =
+                    inverse[modal_j * n + basis_i];
+            }
+        }
+        data.apex = build_apex_data(data);
+        return data;
+    }
+
+    static bool has_low_order_fast_modal_to_nodal(const OrderData& data) noexcept {
+        return data.order == 1 || data.order == 2;
+    }
+
+    static const OrderData& get(int order) {
+        constexpr int kMaxOnceCachedOrder = 12;
+        if (order >= 0 && order <= kMaxOnceCachedOrder) {
+            static std::array<std::once_flag, kMaxOnceCachedOrder + 1> flags;
+            static std::array<std::unique_ptr<OrderData>, kMaxOnceCachedOrder + 1> cache;
+            const auto idx = static_cast<std::size_t>(order);
+            std::call_once(flags[idx], [idx, order]() {
+                cache[idx] = std::make_unique<OrderData>(build_order_data(order));
+            });
+            return *cache[idx];
+        }
+
+        static std::mutex fallback_mutex;
+        static std::map<int, std::unique_ptr<OrderData>> fallback_cache;
+
+        std::lock_guard<std::mutex> lock(fallback_mutex);
+        const auto found = fallback_cache.find(order);
+        if (found != fallback_cache.end()) {
+            return *found->second;
+        }
+
+        auto data = std::make_unique<OrderData>(build_order_data(order));
+        const auto [it, inserted] = fallback_cache.emplace(order, std::move(data));
+        (void)inserted;
+        return *it->second;
+    }
+
+    static void evaluate_values(const OrderData& data,
+                                const math::Vector<Real, 3>& xi,
+                                std::vector<Real>& values) {
+        validate_top_plane_query(xi);
+        if (is_apex_point(xi)) {
+            values = data.apex.values;
+            return;
+        }
+
+        auto& scratch = evaluation_scratch();
+        auto& modal = scratch.modal_values;
+        auto& modal_point = scratch.modal_point;
+        modal.resize(data.modal_terms.size());
+        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
+        for (std::size_t m = 0; m < data.modal_terms.size(); ++m) {
+            pyramid_modal::evaluate_term(data.modal_terms[m], modal_point, modal[m]);
+        }
+        if (has_low_order_fast_modal_to_nodal(data)) {
+            apply_sparse_basis_to_nodal(data, modal, values);
+        } else {
+            apply_modal_to_nodal(data, modal, values);
+        }
+    }
+
+    static void evaluate_gradients(const OrderData& data,
+                                   const math::Vector<Real, 3>& xi,
+                                   std::vector<Gradient>& gradients) {
+        validate_top_plane_query(xi);
+        if (is_apex_point(xi)) {
+            if (data.apex.gradient_status != ApexRankStatus::Exact) {
+                throw BasisEvaluationException(
+                    apex_status_message("gradient", data.apex.gradient_status),
+                    __FILE__, __LINE__, __func__);
+            }
+            gradients = data.apex.gradients;
+            return;
+        }
+
+        auto& scratch = evaluation_scratch();
+        auto& modal_gradients = scratch.modal_gradients;
+        auto& modal_point = scratch.modal_point;
+        modal_gradients.resize(data.modal_terms.size());
+        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
+        for (std::size_t m = 0; m < data.modal_terms.size(); ++m) {
+            Real value = Real(0);
+            pyramid_modal::evaluate_term(data.modal_terms[m], modal_point, value, &modal_gradients[m]);
+        }
+        if (has_low_order_fast_modal_to_nodal(data)) {
+            apply_sparse_basis_to_nodal(data, modal_gradients, gradients);
+        } else {
+            apply_modal_to_nodal(data, modal_gradients, gradients);
+        }
+    }
+
+    static void evaluate_hessians(const OrderData& data,
+                                  const math::Vector<Real, 3>& xi,
+                                  std::vector<Hessian>& hessians) {
+        validate_top_plane_query(xi);
+        if (is_apex_point(xi)) {
+            if (data.apex.hessian_status != ApexRankStatus::Exact) {
+                throw BasisEvaluationException(
+                    apex_status_message("Hessian", data.apex.hessian_status),
+                    __FILE__, __LINE__, __func__);
+            }
+            hessians = data.apex.hessians;
+            return;
+        }
+
+        auto& scratch = evaluation_scratch();
+        auto& modal_hessians = scratch.modal_hessians;
+        auto& modal_point = scratch.modal_point;
+        modal_hessians.resize(data.modal_terms.size());
+        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
+        for (std::size_t m = 0; m < data.modal_terms.size(); ++m) {
+            Real value = Real(0);
+            pyramid_modal::evaluate_term(data.modal_terms[m], modal_point, value, nullptr, &modal_hessians[m]);
+        }
+        if (has_low_order_fast_modal_to_nodal(data)) {
+            apply_sparse_basis_to_nodal(data, modal_hessians, hessians);
+        } else {
+            apply_modal_to_nodal(data, modal_hessians, hessians);
+        }
+    }
+
+    static void evaluate_all(const OrderData& data,
+                             const math::Vector<Real, 3>& xi,
+                             std::vector<Real>& values,
+                             std::vector<Gradient>& gradients,
+                             std::vector<Hessian>& hessians) {
+        validate_top_plane_query(xi);
+        if (is_apex_point(xi)) {
+            if (data.apex.gradient_status != ApexRankStatus::Exact) {
+                throw BasisEvaluationException(
+                    apex_status_message("gradient", data.apex.gradient_status),
+                    __FILE__, __LINE__, __func__);
+            }
+            if (data.apex.hessian_status != ApexRankStatus::Exact) {
+                throw BasisEvaluationException(
+                    apex_status_message("Hessian", data.apex.hessian_status),
+                    __FILE__, __LINE__, __func__);
+            }
+            values = data.apex.values;
+            gradients = data.apex.gradients;
+            hessians = data.apex.hessians;
+            return;
+        }
+
+        const std::size_t n = data.modal_terms.size();
+        auto& scratch = evaluation_scratch();
+        auto& modal_values = scratch.modal_values;
+        auto& modal_gradients = scratch.modal_gradients;
+        auto& modal_hessians = scratch.modal_hessians;
+        auto& modal_point = scratch.modal_point;
+        modal_values.resize(n);
+        modal_gradients.resize(n);
+        modal_hessians.resize(n);
+        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
+
+        for (std::size_t m = 0; m < n; ++m) {
+            pyramid_modal::evaluate_term(
+                data.modal_terms[m], modal_point, modal_values[m], &modal_gradients[m], &modal_hessians[m]);
+        }
+
+        if (has_low_order_fast_modal_to_nodal(data)) {
+            apply_sparse_basis_to_nodal_all(
+                data, modal_values, modal_gradients, modal_hessians, values, gradients, hessians);
+            return;
+        }
+
+        values.resize(n);
+        gradients.resize(n);
+        hessians.resize(n);
+        for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
+            const Real* row = data.modal_to_nodal.data() + basis_i * n;
+            Gradient gradient{};
+            Hessian hessian{};
+            Real value = Real(0);
+            for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
+                const Real coeff = row[modal_j];
+                value += coeff * modal_values[modal_j];
+
+                const Real* modal_gradient = modal_gradients[modal_j].data();
+                gradient[0] += coeff * modal_gradient[0];
+                gradient[1] += coeff * modal_gradient[1];
+                gradient[2] += coeff * modal_gradient[2];
+
+                const Real* modal_hessian = modal_hessians[modal_j].data();
+                Real* hessian_data = hessian.data();
+                hessian_data[0] += coeff * modal_hessian[0];
+                hessian_data[1] += coeff * modal_hessian[1];
+                hessian_data[2] += coeff * modal_hessian[2];
+                hessian_data[4] += coeff * modal_hessian[4];
+                hessian_data[5] += coeff * modal_hessian[5];
+                hessian_data[8] += coeff * modal_hessian[8];
+            }
+            values[basis_i] = value;
+            gradients[basis_i] = gradient;
+            Real* hessian_data = hessian.data();
+            hessian_data[3] = hessian_data[1];
+            hessian_data[6] = hessian_data[2];
+            hessian_data[7] = hessian_data[5];
+            hessians[basis_i] = hessian;
+        }
+    }
+
+    static void evaluate_values_to(const OrderData& data,
+                                   const math::Vector<Real, 3>& xi,
+                                   Real* SVMP_RESTRICT values_out) {
+        validate_top_plane_query(xi);
+        if (is_apex_point(xi)) {
+            std::copy(data.apex.values.begin(), data.apex.values.end(), values_out);
+            return;
+        }
+
+        auto& scratch = evaluation_scratch();
+        auto& modal = scratch.modal_values;
+        auto& modal_point = scratch.modal_point;
+        modal.resize(data.modal_terms.size());
+        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
+        for (std::size_t m = 0; m < data.modal_terms.size(); ++m) {
+            pyramid_modal::evaluate_term(data.modal_terms[m], modal_point, modal[m]);
+        }
+        if (has_low_order_fast_modal_to_nodal(data)) {
+            apply_sparse_basis_to_nodal_to(data, modal, values_out);
+        } else {
+            apply_modal_to_nodal_to(data, modal, values_out);
+        }
+    }
+
+    static void evaluate_gradients_to(const OrderData& data,
+                                      const math::Vector<Real, 3>& xi,
+                                      Real* SVMP_RESTRICT gradients_out) {
+        validate_top_plane_query(xi);
+        if (is_apex_point(xi)) {
+            if (data.apex.gradient_status != ApexRankStatus::Exact) {
+                throw BasisEvaluationException(
+                    apex_status_message("gradient", data.apex.gradient_status),
+                    __FILE__, __LINE__, __func__);
+            }
+            for (std::size_t i = 0; i < data.apex.gradients.size(); ++i) {
+                gradients_out[i * 3u + 0u] = data.apex.gradients[i][0];
+                gradients_out[i * 3u + 1u] = data.apex.gradients[i][1];
+                gradients_out[i * 3u + 2u] = data.apex.gradients[i][2];
+            }
+            return;
+        }
+
+        auto& scratch = evaluation_scratch();
+        auto& modal_gradients = scratch.modal_gradients;
+        auto& modal_point = scratch.modal_point;
+        modal_gradients.resize(data.modal_terms.size());
+        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
+        for (std::size_t m = 0; m < data.modal_terms.size(); ++m) {
+            Real value = Real(0);
+            pyramid_modal::evaluate_term(data.modal_terms[m], modal_point, value, &modal_gradients[m]);
+        }
+        if (has_low_order_fast_modal_to_nodal(data)) {
+            apply_sparse_basis_to_nodal_to(data, modal_gradients, gradients_out);
+        } else {
+            apply_modal_to_nodal_to(data, modal_gradients, gradients_out);
+        }
+    }
+
+    static void evaluate_hessians_to(const OrderData& data,
+                                     const math::Vector<Real, 3>& xi,
+                                     Real* SVMP_RESTRICT hessians_out) {
+        validate_top_plane_query(xi);
+        if (is_apex_point(xi)) {
+            if (data.apex.hessian_status != ApexRankStatus::Exact) {
+                throw BasisEvaluationException(
+                    apex_status_message("Hessian", data.apex.hessian_status),
+                    __FILE__, __LINE__, __func__);
+            }
+            for (std::size_t i = 0; i < data.apex.hessians.size(); ++i) {
+                store_hessian(data.apex.hessians[i], hessians_out + i * 9u);
+            }
+            return;
+        }
+
+        auto& scratch = evaluation_scratch();
+        auto& modal_hessians = scratch.modal_hessians;
+        auto& modal_point = scratch.modal_point;
+        modal_hessians.resize(data.modal_terms.size());
+        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
+        for (std::size_t m = 0; m < data.modal_terms.size(); ++m) {
+            Real value = Real(0);
+            pyramid_modal::evaluate_term(data.modal_terms[m], modal_point, value, nullptr, &modal_hessians[m]);
+        }
+        if (has_low_order_fast_modal_to_nodal(data)) {
+            apply_sparse_basis_to_nodal_to(data, modal_hessians, hessians_out);
+        } else {
+            apply_modal_to_nodal_to(data, modal_hessians, hessians_out);
+        }
+    }
+
+    static void evaluate_all_to(const OrderData& data,
+                                const math::Vector<Real, 3>& xi,
+                                Real* SVMP_RESTRICT values_out,
+                                Real* SVMP_RESTRICT gradients_out,
+                                Real* SVMP_RESTRICT hessians_out) {
+        validate_top_plane_query(xi);
+        if (is_apex_point(xi)) {
+            if (data.apex.gradient_status != ApexRankStatus::Exact) {
+                throw BasisEvaluationException(
+                    apex_status_message("gradient", data.apex.gradient_status),
+                    __FILE__, __LINE__, __func__);
+            }
+            if (data.apex.hessian_status != ApexRankStatus::Exact) {
+                throw BasisEvaluationException(
+                    apex_status_message("Hessian", data.apex.hessian_status),
+                    __FILE__, __LINE__, __func__);
+            }
+            std::copy(data.apex.values.begin(), data.apex.values.end(), values_out);
+            for (std::size_t i = 0; i < data.apex.gradients.size(); ++i) {
+                gradients_out[i * 3u + 0u] = data.apex.gradients[i][0];
+                gradients_out[i * 3u + 1u] = data.apex.gradients[i][1];
+                gradients_out[i * 3u + 2u] = data.apex.gradients[i][2];
+            }
+            for (std::size_t i = 0; i < data.apex.hessians.size(); ++i) {
+                const Real* hessian = data.apex.hessians[i].data();
+                std::copy(hessian, hessian + 9u, hessians_out + i * 9u);
+            }
+            return;
+        }
+
+        const std::size_t n = data.modal_terms.size();
+        auto& scratch = evaluation_scratch();
+        auto& modal_values = scratch.modal_values;
+        auto& modal_gradients = scratch.modal_gradients;
+        auto& modal_hessians = scratch.modal_hessians;
+        auto& modal_point = scratch.modal_point;
+        modal_values.resize(n);
+        modal_gradients.resize(n);
+        modal_hessians.resize(n);
+        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
+
+        for (std::size_t m = 0; m < n; ++m) {
+            pyramid_modal::evaluate_term(
+                data.modal_terms[m], modal_point, modal_values[m], &modal_gradients[m], &modal_hessians[m]);
+        }
+
+        if (has_low_order_fast_modal_to_nodal(data)) {
+            apply_sparse_basis_to_nodal_all_to(
+                data, modal_values, modal_gradients, modal_hessians, values_out, gradients_out, hessians_out);
+            return;
+        }
+
+        for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
+            const Real* row = data.modal_to_nodal.data() + basis_i * n;
+            Real value = Real(0);
+            Real gradient[3] = {Real(0), Real(0), Real(0)};
+            Real hessian[9] = {};
+            for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
+                const Real coeff = row[modal_j];
+                value += coeff * modal_values[modal_j];
+
+                const Real* modal_gradient = modal_gradients[modal_j].data();
+                gradient[0] += coeff * modal_gradient[0];
+                gradient[1] += coeff * modal_gradient[1];
+                gradient[2] += coeff * modal_gradient[2];
+
+                const Real* modal_hessian = modal_hessians[modal_j].data();
+                hessian[0] += coeff * modal_hessian[0];
+                hessian[1] += coeff * modal_hessian[1];
+                hessian[2] += coeff * modal_hessian[2];
+                hessian[4] += coeff * modal_hessian[4];
+                hessian[5] += coeff * modal_hessian[5];
+                hessian[8] += coeff * modal_hessian[8];
+            }
+
+            values_out[basis_i] = value;
+            Real* gradient_out = gradients_out + basis_i * 3u;
+            gradient_out[0] = gradient[0];
+            gradient_out[1] = gradient[1];
+            gradient_out[2] = gradient[2];
+
+            Real* hessian_out = hessians_out + basis_i * 9u;
+            hessian_out[0] = hessian[0];
+            hessian_out[1] = hessian[1];
+            hessian_out[2] = hessian[2];
+            hessian_out[3] = hessian[1];
+            hessian_out[4] = hessian[4];
+            hessian_out[5] = hessian[5];
+            hessian_out[6] = hessian[2];
+            hessian_out[7] = hessian[5];
+            hessian_out[8] = hessian[8];
+        }
+    }
+
+    static void evaluate_at_quadrature_points_strided(
+        const OrderData& data,
+        const std::vector<math::Vector<Real, 3>>& points,
+        std::size_t output_stride,
+        Real* SVMP_RESTRICT values_out,
+        Real* SVMP_RESTRICT gradients_out,
+        Real* SVMP_RESTRICT hessians_out) {
+        const unsigned mask = (values_out != nullptr ? 1u : 0u) |
+                              (gradients_out != nullptr ? 2u : 0u) |
+                              (hessians_out != nullptr ? 4u : 0u);
+        switch (mask) {
+            case 0u:
+                validate_strided_points(points);
+                return;
+            case 1u:
+                evaluate_at_quadrature_points_strided_impl<true, false, false>(
+                    data, points, output_stride, values_out, gradients_out, hessians_out);
+                return;
+            case 2u:
+                evaluate_at_quadrature_points_strided_impl<false, true, false>(
+                    data, points, output_stride, values_out, gradients_out, hessians_out);
+                return;
+            case 3u:
+                evaluate_at_quadrature_points_strided_impl<true, true, false>(
+                    data, points, output_stride, values_out, gradients_out, hessians_out);
+                return;
+            case 4u:
+                evaluate_at_quadrature_points_strided_impl<false, false, true>(
+                    data, points, output_stride, values_out, gradients_out, hessians_out);
+                return;
+            case 5u:
+                evaluate_at_quadrature_points_strided_impl<true, false, true>(
+                    data, points, output_stride, values_out, gradients_out, hessians_out);
+                return;
+            case 6u:
+                evaluate_at_quadrature_points_strided_impl<false, true, true>(
+                    data, points, output_stride, values_out, gradients_out, hessians_out);
+                return;
+            case 7u:
+                evaluate_at_quadrature_points_strided_impl<true, true, true>(
+                    data, points, output_stride, values_out, gradients_out, hessians_out);
+                return;
+            default:
+                return;
+        }
+    }
+
+private:
+    static void validate_strided_points(const std::vector<math::Vector<Real, 3>>& points) {
+        for (const auto& xi : points) {
+            validate_top_plane_query(xi);
+        }
+    }
+
+    template <bool NeedValues, bool NeedGradients, bool NeedHessians>
+    static void write_apex_strided(const OrderData& data,
+                                   std::size_t q,
+                                   std::size_t output_stride,
+                                   Real* SVMP_RESTRICT values_out,
+                                   Real* SVMP_RESTRICT gradients_out,
+                                   Real* SVMP_RESTRICT hessians_out) {
+        const std::size_t n = data.modal_terms.size();
+        if constexpr (NeedValues) {
+            for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
+                values_out[basis_i * output_stride + q] = data.apex.values[basis_i];
+            }
+        }
+        if constexpr (NeedGradients) {
+            if (data.apex.gradient_status != ApexRankStatus::Exact) {
+                throw BasisEvaluationException(
+                    apex_status_message("gradient", data.apex.gradient_status),
+                    __FILE__, __LINE__, __func__);
+            }
+            for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
+                Real* g = gradients_out + basis_i * 3u * output_stride;
+                g[0u * output_stride + q] = data.apex.gradients[basis_i][0];
+                g[1u * output_stride + q] = data.apex.gradients[basis_i][1];
+                g[2u * output_stride + q] = data.apex.gradients[basis_i][2];
+            }
+        }
+        if constexpr (NeedHessians) {
+            if (data.apex.hessian_status != ApexRankStatus::Exact) {
+                throw BasisEvaluationException(
+                    apex_status_message("Hessian", data.apex.hessian_status),
+                    __FILE__, __LINE__, __func__);
+            }
+            for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
+                const Real* hessian = data.apex.hessians[basis_i].data();
+                Real* H = hessians_out + basis_i * 9u * output_stride;
+                for (std::size_t component = 0; component < 9u; ++component) {
+                    H[component * output_stride + q] = hessian[component];
+                }
+            }
+        }
+    }
+
+    template <int Px,
+              int Py,
+              int Pz,
+              int DenomPower,
+              bool NeedValues,
+              bool NeedGradients,
+              bool NeedHessians>
+    static void fill_low_order_modal_jet(std::size_t modal_i,
+                                         const Real* SVMP_RESTRICT xp,
+                                         const Real* SVMP_RESTRICT yp,
+                                         const Real* SVMP_RESTRICT zp,
+                                         const Real* SVMP_RESTRICT inv_tp,
+                                         Real* SVMP_RESTRICT modal_values,
+                                         Real (*SVMP_RESTRICT modal_gradients)[3],
+                                         Real (*SVMP_RESTRICT modal_hessians)[9]) {
+        const Real xy_base = xp[Px] * yp[Py];
+        const Real base = xy_base * zp[Pz];
+        const Real inv_denom = inv_tp[DenomPower];
+        const Real value = base * inv_denom;
+
+        if constexpr (NeedValues) {
+            modal_values[modal_i] = value;
+        }
+        if constexpr (NeedGradients) {
+            Real* g = modal_gradients[modal_i];
+            if constexpr (Px > 0) {
+                g[0] = static_cast<Real>(Px) * xp[Px - 1] * yp[Py] * zp[Pz] * inv_denom;
+            } else {
+                g[0] = Real(0);
+            }
+            if constexpr (Py > 0) {
+                g[1] = static_cast<Real>(Py) * xp[Px] * yp[Py - 1] * zp[Pz] * inv_denom;
+            } else {
+                g[1] = Real(0);
+            }
+            Real gz = Real(0);
+            if constexpr (Pz > 0) {
+                gz += static_cast<Real>(Pz) * xy_base * zp[Pz - 1] * inv_denom;
+            }
+            if constexpr (DenomPower > 0) {
+                gz += static_cast<Real>(DenomPower) * base * inv_tp[DenomPower + 1];
+            }
+            g[2] = gz;
+        }
+        if constexpr (NeedHessians) {
+            Real* H = modal_hessians[modal_i];
+            if constexpr (Px > 1) {
+                H[0] = static_cast<Real>(Px * (Px - 1)) *
+                       xp[Px - 2] * yp[Py] * zp[Pz] * inv_denom;
+            } else {
+                H[0] = Real(0);
+            }
+            if constexpr (Py > 1) {
+                H[4] = static_cast<Real>(Py * (Py - 1)) *
+                       xp[Px] * yp[Py - 2] * zp[Pz] * inv_denom;
+            } else {
+                H[4] = Real(0);
+            }
+            Real hxy = Real(0);
+            if constexpr (Px > 0 && Py > 0) {
+                hxy = static_cast<Real>(Px * Py) *
+                      xp[Px - 1] * yp[Py - 1] * zp[Pz] * inv_denom;
+            }
+            H[1] = hxy;
+            H[3] = hxy;
+
+            Real hxz = Real(0);
+            if constexpr (Px > 0) {
+                constexpr Real px_real = static_cast<Real>(Px);
+                const Real x_deriv_y = px_real * xp[Px - 1] * yp[Py];
+                if constexpr (Pz > 0) {
+                    hxz += x_deriv_y * static_cast<Real>(Pz) *
+                           zp[Pz - 1] * inv_denom;
+                }
+                if constexpr (DenomPower > 0) {
+                    hxz += x_deriv_y * static_cast<Real>(DenomPower) *
+                           zp[Pz] * inv_tp[DenomPower + 1];
+                }
+            }
+            H[2] = hxz;
+            H[6] = hxz;
+
+            Real hyz = Real(0);
+            if constexpr (Py > 0) {
+                constexpr Real py_real = static_cast<Real>(Py);
+                const Real x_y_deriv = py_real * xp[Px] * yp[Py - 1];
+                if constexpr (Pz > 0) {
+                    hyz += x_y_deriv * static_cast<Real>(Pz) *
+                           zp[Pz - 1] * inv_denom;
+                }
+                if constexpr (DenomPower > 0) {
+                    hyz += x_y_deriv * static_cast<Real>(DenomPower) *
+                           zp[Pz] * inv_tp[DenomPower + 1];
+                }
+            }
+            H[5] = hyz;
+            H[7] = hyz;
+
+            Real hzz = Real(0);
+            if constexpr (Pz > 1) {
+                hzz += static_cast<Real>(Pz * (Pz - 1)) *
+                       xy_base * zp[Pz - 2] * inv_denom;
+            }
+            if constexpr (Pz > 0 && DenomPower > 0) {
+                hzz += static_cast<Real>(2 * Pz * DenomPower) * xy_base *
+                       zp[Pz - 1] * inv_tp[DenomPower + 1];
+            }
+            if constexpr (DenomPower > 0) {
+                hzz += static_cast<Real>(DenomPower * (DenomPower + 1)) *
+                       base * inv_tp[DenomPower + 2];
+            }
+            H[8] = hzz;
+        }
+    }
+
+    template <bool NeedValues, bool NeedGradients, bool NeedHessians>
+    static void evaluate_low_order_modal_jets(const OrderData& data,
+                                              const math::Vector<Real, 3>& xi,
+                                              Real* SVMP_RESTRICT modal_values,
+                                              Real (*SVMP_RESTRICT modal_gradients)[3],
+                                              Real (*SVMP_RESTRICT modal_hessians)[9]) {
+        const Real x = xi[0];
+        const Real y = xi[1];
+        const Real z = xi[2];
+        const Real inv_t = Real(1) / (Real(1) - z);
+        const Real xp[3] = {Real(1), x, x * x};
+        const Real yp[3] = {Real(1), y, y * y};
+        const Real zp[3] = {Real(1), z, z * z};
+        Real inv_tp[5] = {Real(1), inv_t, Real(0), Real(0), Real(0)};
+        inv_tp[2] = inv_tp[1] * inv_t;
+        inv_tp[3] = inv_tp[2] * inv_t;
+        inv_tp[4] = inv_tp[3] * inv_t;
+
+        fill_low_order_modal_jet<0, 0, 0, 0, NeedValues, NeedGradients, NeedHessians>(
+            0u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
+        fill_low_order_modal_jet<1, 0, 0, 0, NeedValues, NeedGradients, NeedHessians>(
+            1u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
+        if (data.order == 1) {
+            fill_low_order_modal_jet<0, 1, 0, 0, NeedValues, NeedGradients, NeedHessians>(
+                2u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
+            fill_low_order_modal_jet<1, 1, 0, 1, NeedValues, NeedGradients, NeedHessians>(
+                3u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
+            fill_low_order_modal_jet<0, 0, 1, 0, NeedValues, NeedGradients, NeedHessians>(
+                4u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
+            return;
+        }
+
+        fill_low_order_modal_jet<2, 0, 0, 0, NeedValues, NeedGradients, NeedHessians>(
+            2u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
+        fill_low_order_modal_jet<0, 1, 0, 0, NeedValues, NeedGradients, NeedHessians>(
+            3u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
+        fill_low_order_modal_jet<1, 1, 0, 1, NeedValues, NeedGradients, NeedHessians>(
+            4u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
+        fill_low_order_modal_jet<2, 1, 0, 1, NeedValues, NeedGradients, NeedHessians>(
+            5u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
+        fill_low_order_modal_jet<0, 2, 0, 0, NeedValues, NeedGradients, NeedHessians>(
+            6u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
+        fill_low_order_modal_jet<1, 2, 0, 1, NeedValues, NeedGradients, NeedHessians>(
+            7u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
+        fill_low_order_modal_jet<2, 2, 0, 2, NeedValues, NeedGradients, NeedHessians>(
+            8u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
+        fill_low_order_modal_jet<0, 0, 1, 0, NeedValues, NeedGradients, NeedHessians>(
+            9u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
+        fill_low_order_modal_jet<1, 0, 1, 0, NeedValues, NeedGradients, NeedHessians>(
+            10u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
+        fill_low_order_modal_jet<0, 1, 1, 0, NeedValues, NeedGradients, NeedHessians>(
+            11u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
+        fill_low_order_modal_jet<1, 1, 1, 1, NeedValues, NeedGradients, NeedHessians>(
+            12u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
+        fill_low_order_modal_jet<0, 0, 2, 0, NeedValues, NeedGradients, NeedHessians>(
+            13u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
+    }
+
+    template <bool NeedValues, bool NeedGradients, bool NeedHessians>
+    static bool try_evaluate_low_order_strided(
+        const OrderData& data,
+        const std::vector<math::Vector<Real, 3>>& points,
+        std::size_t output_stride,
+        Real* SVMP_RESTRICT values_out,
+        Real* SVMP_RESTRICT gradients_out,
+        Real* SVMP_RESTRICT hessians_out) {
+        if (!has_low_order_fast_modal_to_nodal(data)) {
+            return false;
+        }
+        for (const auto& xi : points) {
+            validate_top_plane_query(xi);
+            if (is_apex_point(xi)) {
+                return false;
+            }
+        }
+
+        Real modal_values[14];
+        Real modal_gradients[14][3];
+        Real modal_hessians[14][9];
+        for (std::size_t q = 0; q < points.size(); ++q) {
+            evaluate_low_order_modal_jets<NeedValues, NeedGradients, NeedHessians>(
+                data, points[q], modal_values, modal_gradients, modal_hessians);
+            if constexpr (NeedValues) {
+                apply_low_order_combination(
+                    data,
+                    1u,
+                    [&](std::size_t modal_i, std::size_t) {
+                        return modal_values[modal_i];
+                    },
+                    [&](std::size_t basis_i, std::size_t, Real value) {
+                        values_out[basis_i * output_stride + q] = value;
+                    });
+            }
+            if constexpr (NeedGradients) {
+                apply_low_order_combination(
+                    data,
+                    3u,
+                    [&](std::size_t modal_i, std::size_t component) {
+                        return modal_gradients[modal_i][component];
+                    },
+                    [&](std::size_t basis_i, std::size_t component, Real value) {
+                        gradients_out[basis_i * 3u * output_stride +
+                                      component * output_stride + q] = value;
+                    });
+            }
+            if constexpr (NeedHessians) {
+                apply_low_order_combination(
+                    data,
+                    9u,
+                    [&](std::size_t modal_i, std::size_t component) {
+                        return modal_hessians[modal_i][component];
+                    },
+                    [&](std::size_t basis_i, std::size_t component, Real value) {
+                        hessians_out[basis_i * 9u * output_stride +
+                                     component * output_stride + q] = value;
+                    });
+            }
+        }
+        return true;
+    }
+
+    template <bool NeedValues, bool NeedGradients, bool NeedHessians>
+    static void evaluate_at_quadrature_points_strided_impl(
+        const OrderData& data,
+        const std::vector<math::Vector<Real, 3>>& points,
+        std::size_t output_stride,
+        Real* SVMP_RESTRICT values_out,
+        Real* SVMP_RESTRICT gradients_out,
+        Real* SVMP_RESTRICT hessians_out) {
+        const std::size_t n = data.modal_terms.size();
+        if (points.empty() || n == 0u) {
+            return;
+        }
+        if (try_evaluate_low_order_strided<NeedValues, NeedGradients, NeedHessians>(
+                data, points, output_stride, values_out, gradients_out, hessians_out)) {
+            return;
+        }
+
+        auto& scratch = evaluation_scratch();
+        auto& modal_values = scratch.modal_values;
+        auto& modal_gradients = scratch.modal_gradients;
+        auto& modal_hessians = scratch.modal_hessians;
+        auto& modal_point = scratch.modal_point;
+        if constexpr (NeedValues) {
+            modal_values.resize(n);
+        }
+        if constexpr (NeedGradients) {
+            modal_gradients.resize(n);
+        }
+        if constexpr (NeedHessians) {
+            modal_hessians.resize(n);
+        }
+        const bool use_fast_modal_to_nodal = has_low_order_fast_modal_to_nodal(data);
+
+        if (!use_fast_modal_to_nodal) {
+            bool has_apex_query = false;
+            for (const auto& xi : points) {
+                validate_top_plane_query(xi);
+                has_apex_query = has_apex_query || is_apex_point(xi);
+            }
+
+            if (!has_apex_query) {
+                const std::size_t num_qpts = points.size();
+                if constexpr (NeedValues) {
+                    modal_values.resize(n * num_qpts);
+                }
+                if constexpr (NeedGradients) {
+                    scratch.modal_gradient_components.resize(n * 3u * num_qpts);
+                }
+                if constexpr (NeedHessians) {
+                    scratch.modal_hessian_components.resize(n * 9u * num_qpts);
+                }
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    const auto& xi = points[q];
+                    pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
+                    for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
+                        Real modal_value = Real(0);
+                        Gradient modal_gradient{};
+                        Hessian modal_hessian{};
+                        pyramid_modal::evaluate_term(
+                            data.modal_terms[modal_j],
+                            modal_point,
+                            modal_value,
+                            NeedGradients ? &modal_gradient : nullptr,
+                            NeedHessians ? &modal_hessian : nullptr);
+                        if constexpr (NeedValues) {
+                            modal_values[modal_j * num_qpts + q] = modal_value;
+                        }
+                        if constexpr (NeedGradients) {
+                            for (std::size_t component = 0; component < 3u; ++component) {
+                                scratch.modal_gradient_components[
+                                    (modal_j * 3u + component) * num_qpts + q] =
+                                    modal_gradient[component];
+                            }
+                        }
+                        if constexpr (NeedHessians) {
+                            for (std::size_t component = 0; component < 9u; ++component) {
+                                scratch.modal_hessian_components[
+                                    (modal_j * 9u + component) * num_qpts + q] =
+                                    modal_hessian.data()[component];
+                            }
+                        }
+                    }
+                }
+
+                const Real* transform = data.modal_to_nodal.data();
+                if constexpr (NeedValues) {
+                    math::dense_transform_batched_row_major(
+                        transform,
+                        n,
+                        n,
+                        modal_values.data(),
+                        num_qpts,
+                        values_out,
+                        output_stride,
+                        num_qpts);
+                }
+                if constexpr (NeedGradients) {
+                    for (std::size_t component = 0; component < 3u; ++component) {
+                        math::dense_transform_batched_row_major(
+                            transform,
+                            n,
+                            n,
+                            scratch.modal_gradient_components.data() + component * num_qpts,
+                            3u * num_qpts,
+                            gradients_out + component * output_stride,
+                            3u * output_stride,
+                            num_qpts);
+                    }
+                }
+                if constexpr (NeedHessians) {
+                    for (std::size_t component = 0; component < 9u; ++component) {
+                        math::dense_transform_batched_row_major(
+                            transform,
+                            n,
+                            n,
+                            scratch.modal_hessian_components.data() + component * num_qpts,
+                            9u * num_qpts,
+                            hessians_out + component * output_stride,
+                            9u * output_stride,
+                            num_qpts);
+                    }
+                }
+                return;
+            }
+        }
+
+        for (std::size_t q = 0; q < points.size(); ++q) {
+            const auto& xi = points[q];
+            validate_top_plane_query(xi);
+
+            if (is_apex_point(xi)) {
+                write_apex_strided<NeedValues, NeedGradients, NeedHessians>(
+                    data, q, output_stride, values_out, gradients_out, hessians_out);
+                continue;
+            }
+
+            pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
+            for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
+                Gradient* gradient_out = nullptr;
+                Hessian* hessian_out = nullptr;
+                if constexpr (NeedGradients) {
+                    gradient_out = &modal_gradients[modal_j];
+                }
+                if constexpr (NeedHessians) {
+                    hessian_out = &modal_hessians[modal_j];
+                }
+                if constexpr (NeedValues) {
+                    pyramid_modal::evaluate_term(
+                        data.modal_terms[modal_j],
+                        modal_point,
+                        modal_values[modal_j],
+                        gradient_out,
+                        hessian_out);
+                } else {
+                    Real value = Real(0);
+                    pyramid_modal::evaluate_term(
+                        data.modal_terms[modal_j],
+                        modal_point,
+                        value,
+                        gradient_out,
+                        hessian_out);
+                }
+            }
+
+            if (use_fast_modal_to_nodal) {
+                if constexpr (NeedValues) {
+                    apply_low_order_combination(
+                        data,
+                        1u,
+                        [&](std::size_t modal_i, std::size_t) {
+                            return modal_values[modal_i];
+                        },
+                        [&](std::size_t basis_i, std::size_t, Real value) {
+                            values_out[basis_i * output_stride + q] = value;
+                        });
+                }
+                if constexpr (NeedGradients) {
+                    apply_low_order_combination(
+                        data,
+                        3u,
+                        [&](std::size_t modal_i, std::size_t component) {
+                            return modal_gradients[modal_i][component];
+                        },
+                        [&](std::size_t basis_i, std::size_t component, Real value) {
+                            gradients_out[basis_i * 3u * output_stride +
+                                          component * output_stride + q] = value;
+                        });
+                }
+                if constexpr (NeedHessians) {
+                    apply_low_order_combination(
+                        data,
+                        9u,
+                        [&](std::size_t modal_i, std::size_t component) {
+                            return modal_hessians[modal_i].data()[component];
+                        },
+                        [&](std::size_t basis_i, std::size_t component, Real value) {
+                            hessians_out[basis_i * 9u * output_stride +
+                                         component * output_stride + q] = value;
+                        });
+                }
+                continue;
+            }
+
+            for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
+                const Real* matrix_row = data.modal_to_nodal.data() + basis_i * n;
+                [[maybe_unused]] Real value = Real(0);
+                [[maybe_unused]] std::array<Real, 3> gradient{};
+                [[maybe_unused]] std::array<Real, 9> hessian{};
+
+                for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
+                    const Real coeff = matrix_row[modal_j];
+                    if constexpr (NeedValues) {
+                        value += coeff * modal_values[modal_j];
+                    }
+                    if constexpr (NeedGradients) {
+                        const Real* modal_gradient = modal_gradients[modal_j].data();
+                        gradient[0] += coeff * modal_gradient[0];
+                        gradient[1] += coeff * modal_gradient[1];
+                        gradient[2] += coeff * modal_gradient[2];
+                    }
+                    if constexpr (NeedHessians) {
+                        const Real* modal_hessian = modal_hessians[modal_j].data();
+                        for (std::size_t component = 0; component < 9u; ++component) {
+                            hessian[component] += coeff * modal_hessian[component];
+                        }
+                    }
+                }
+
+                if constexpr (NeedValues) {
+                    values_out[basis_i * output_stride + q] = value;
+                }
+                if constexpr (NeedGradients) {
+                    Real* g = gradients_out + basis_i * 3u * output_stride;
+                    g[0u * output_stride + q] = gradient[0];
+                    g[1u * output_stride + q] = gradient[1];
+                    g[2u * output_stride + q] = gradient[2];
+                }
+                if constexpr (NeedHessians) {
+                    Real* H = hessians_out + basis_i * 9u * output_stride;
+                    for (std::size_t component = 0; component < 9u; ++component) {
+                        H[component * output_stride + q] = hessian[component];
+                    }
+                }
+            }
+        }
+    }
+
+    static Real apex_coord_tolerance() noexcept {
+        return basis_scaled_tolerance();
+    }
+
+    // Coefficient pruning for symbolic apex series, not a reference-coordinate
+    // roundoff test. Keep this strict and separate from BasisTolerance.
+    static constexpr Real kSeriesTolerance = Real(1e-12);
+
+    static Real binomial_coeff(int n, int k) {
+        if (k < 0 || k > n) {
+            return Real(0);
+        }
+        if (k == 0 || k == n) {
+            return Real(1);
+        }
+        k = std::min(k, n - k);
+        Real coeff = Real(1);
+        for (int i = 1; i <= k; ++i) {
+            coeff *= static_cast<Real>(n - (k - i));
+            coeff /= static_cast<Real>(i);
+        }
+        return coeff;
+    }
+
+    static void add_z_expansion(ApexSeries& series,
+                                int z_power,
+                                int beta0,
+                                int pu,
+                                int pv,
+                                Real coeff) {
+        for (int q = 0; q <= z_power; ++q) {
+            const Real z_coeff = coeff * binomial_coeff(z_power, q) *
+                                 ((q % 2 == 0) ? Real(1) : Real(-1));
+            series.add_term(beta0 + q, pu, pv, z_coeff, kSeriesTolerance);
+        }
+    }
+
+    static ApexSeries modal_value_asymptotic(const ModalTerm& term) {
+        ApexSeries series;
+        add_z_expansion(series,
+                        term.pz,
+                        term.px + term.py - term.denom_power,
+                        term.px,
+                        term.py,
+                        Real(1));
+        return series;
+    }
+
+    static GradientSeries modal_gradient_asymptotic(const ModalTerm& term) {
+        GradientSeries gradient_series{};
+
+        if (term.px > 0) {
+            add_z_expansion(gradient_series[0],
+                            term.pz,
+                            term.px - 1 + term.py - term.denom_power,
+                            term.px - 1,
+                            term.py,
+                            static_cast<Real>(term.px));
+        }
+
+        if (term.py > 0) {
+            add_z_expansion(gradient_series[1],
+                            term.pz,
+                            term.px + term.py - 1 - term.denom_power,
+                            term.px,
+                            term.py - 1,
+                            static_cast<Real>(term.py));
+        }
+
+        if (term.pz > 0) {
+            add_z_expansion(gradient_series[2],
+                            term.pz - 1,
+                            term.px + term.py - term.denom_power,
+                            term.px,
+                            term.py,
+                            static_cast<Real>(term.pz));
+        }
+        if (term.denom_power > 0) {
+            add_z_expansion(gradient_series[2],
+                            term.pz,
+                            term.px + term.py - term.denom_power - 1,
+                            term.px,
+                            term.py,
+                            static_cast<Real>(term.denom_power));
+        }
+
+        return gradient_series;
+    }
+
+    static HessianSeries modal_hessian_asymptotic(const ModalTerm& term) {
+        HessianSeries hessian_series{};
+
+        if (term.px > 1) {
+            add_z_expansion(hessian_series[0][0],
+                            term.pz,
+                            term.px - 2 + term.py - term.denom_power,
+                            term.px - 2,
+                            term.py,
+                            static_cast<Real>(term.px * (term.px - 1)));
+        }
+
+        if (term.py > 1) {
+            add_z_expansion(hessian_series[1][1],
+                            term.pz,
+                            term.px + term.py - 2 - term.denom_power,
+                            term.px,
+                            term.py - 2,
+                            static_cast<Real>(term.py * (term.py - 1)));
+        }
+
+        if (term.px > 0 && term.py > 0) {
+            add_z_expansion(hessian_series[0][1],
+                            term.pz,
+                            term.px + term.py - 2 - term.denom_power,
+                            term.px - 1,
+                            term.py - 1,
+                            static_cast<Real>(term.px * term.py));
+            hessian_series[1][0] = hessian_series[0][1];
+        }
+
+        if (term.px > 0 && term.pz > 0) {
+            add_z_expansion(hessian_series[0][2],
+                            term.pz - 1,
+                            term.px - 1 + term.py - term.denom_power,
+                            term.px - 1,
+                            term.py,
+                            static_cast<Real>(term.px * term.pz));
+        }
+        if (term.px > 0 && term.denom_power > 0) {
+            add_z_expansion(hessian_series[0][2],
+                            term.pz,
+                            term.px - 1 + term.py - term.denom_power - 1,
+                            term.px - 1,
+                            term.py,
+                            static_cast<Real>(term.px * term.denom_power));
+        }
+        hessian_series[2][0] = hessian_series[0][2];
+
+        if (term.py > 0 && term.pz > 0) {
+            add_z_expansion(hessian_series[1][2],
+                            term.pz - 1,
+                            term.px + term.py - 1 - term.denom_power,
+                            term.px,
+                            term.py - 1,
+                            static_cast<Real>(term.py * term.pz));
+        }
+        if (term.py > 0 && term.denom_power > 0) {
+            add_z_expansion(hessian_series[1][2],
+                            term.pz,
+                            term.px + term.py - 1 - term.denom_power - 1,
+                            term.px,
+                            term.py - 1,
+                            static_cast<Real>(term.py * term.denom_power));
+        }
+        hessian_series[2][1] = hessian_series[1][2];
+
+        if (term.pz > 1) {
+            add_z_expansion(hessian_series[2][2],
+                            term.pz - 2,
+                            term.px + term.py - term.denom_power,
+                            term.px,
+                            term.py,
+                            static_cast<Real>(term.pz * (term.pz - 1)));
+        }
+        if (term.pz > 0 && term.denom_power > 0) {
+            add_z_expansion(hessian_series[2][2],
+                            term.pz - 1,
+                            term.px + term.py - term.denom_power - 1,
+                            term.px,
+                            term.py,
+                            static_cast<Real>(2 * term.pz * term.denom_power));
+        }
+        if (term.denom_power > 0) {
+            add_z_expansion(hessian_series[2][2],
+                            term.pz,
+                            term.px + term.py - term.denom_power - 2,
+                            term.px,
+                            term.py,
+                            static_cast<Real>(term.denom_power * (term.denom_power + 1)));
+        }
+
+        return hessian_series;
+    }
+
+    static ApexClassification classify_series(const ApexSeries& series) {
+        for (const auto& [beta, poly] : series.by_power) {
+            if (poly.empty(kSeriesTolerance)) {
+                continue;
+            }
+            if (beta < 0) {
+                return {ApexLimitKind::Singular, Real(0), beta};
+            }
+            if (beta > 0) {
+                return {ApexLimitKind::Constant, Real(0), beta};
+            }
+            if (poly.is_constant(kSeriesTolerance)) {
+                return {ApexLimitKind::Constant, poly.constant_value(kSeriesTolerance), beta};
+            }
+            return {ApexLimitKind::DirectionDependent, Real(0), beta};
+        }
+        return {ApexLimitKind::Constant, Real(0), 1};
+    }
+
+    static void accumulate_rank_status(ApexRankStatus& status,
+                                       const ApexClassification& classification) {
+        if (classification.kind == ApexLimitKind::Singular) {
+            status = ApexRankStatus::Singular;
+            return;
+        }
+        if (classification.kind == ApexLimitKind::DirectionDependent &&
+            status != ApexRankStatus::Singular) {
+            status = ApexRankStatus::DirectionDependent;
+        }
+    }
+
+    static std::string apex_status_message(const char* rank,
+                                           ApexRankStatus status) {
+        switch (status) {
+            case ApexRankStatus::DirectionDependent:
+                return std::string("Pyramid rational nodal ") + rank +
+                       " at the exact apex is not uniquely defined under admissible interior approaches";
+            case ApexRankStatus::Singular:
+                return std::string("Pyramid rational nodal ") + rank +
+                       " at the exact apex is singular for this basis family";
+            case ApexRankStatus::Exact:
+                return std::string("Pyramid rational nodal ") + rank +
+                       " apex evaluation unexpectedly reported non-exact status";
+        }
+        return std::string("Pyramid rational nodal ") + rank +
+               " apex evaluation is not available";
+    }
+
+    static ApexData build_apex_data(const OrderData& data) {
+        const std::size_t n = data.modal_terms.size();
+
+        std::vector<ApexSeries> modal_values(n);
+        std::vector<GradientSeries> modal_gradients(n);
+        std::vector<HessianSeries> modal_hessians(n);
+        for (std::size_t m = 0; m < n; ++m) {
+            modal_values[m] = modal_value_asymptotic(data.modal_terms[m]);
+            modal_gradients[m] = modal_gradient_asymptotic(data.modal_terms[m]);
+            modal_hessians[m] = modal_hessian_asymptotic(data.modal_terms[m]);
+        }
+
+        std::vector<ApexSeries> nodal_values(n);
+        std::vector<GradientSeries> nodal_gradients(n);
+        std::vector<HessianSeries> nodal_hessians(n);
+        for (std::size_t i = 0; i < n; ++i) {
+            for (std::size_t m = 0; m < n; ++m) {
+                const Real coeff = data.modal_to_nodal[i * n + m];
+                nodal_values[i].add_scaled(modal_values[m], coeff, kSeriesTolerance);
+                for (int d = 0; d < 3; ++d) {
+                    nodal_gradients[i][static_cast<std::size_t>(d)].add_scaled(
+                        modal_gradients[m][static_cast<std::size_t>(d)], coeff, kSeriesTolerance);
+                }
+                for (int r = 0; r < 3; ++r) {
+                    for (int c = 0; c < 3; ++c) {
+                        nodal_hessians[i][static_cast<std::size_t>(r)][static_cast<std::size_t>(c)]
+                            .add_scaled(
+                                modal_hessians[m][static_cast<std::size_t>(r)][static_cast<std::size_t>(c)],
+                                coeff,
+                                kSeriesTolerance);
+                    }
+                }
+            }
+        }
+
+        ApexData apex;
+        apex.values.assign(n, Real(0));
+        apex.gradients.assign(n, Gradient{});
+        apex.hessians.assign(n, Hessian{});
+
+        for (std::size_t i = 0; i < n; ++i) {
+            const ApexClassification value_class = classify_series(nodal_values[i]);
+            if (value_class.kind != ApexLimitKind::Constant) {
+                throw BasisConstructionException(
+                    "Pyramid nodal value at apex is not uniquely defined for basis index " +
+                    std::to_string(i),
+                    __FILE__, __LINE__, __func__);
+            }
+            apex.values[i] = value_class.constant_value;
+
+            for (int d = 0; d < 3; ++d) {
+                const ApexClassification grad_class = classify_series(
+                    nodal_gradients[i][static_cast<std::size_t>(d)]);
+                accumulate_rank_status(apex.gradient_status, grad_class);
+                if (grad_class.kind == ApexLimitKind::Constant) {
+                    apex.gradients[i][static_cast<std::size_t>(d)] = grad_class.constant_value;
+                }
+            }
+
+            for (int r = 0; r < 3; ++r) {
+                for (int c = 0; c < 3; ++c) {
+                    const ApexClassification hess_class = classify_series(
+                        nodal_hessians[i][static_cast<std::size_t>(r)][static_cast<std::size_t>(c)]);
+                    accumulate_rank_status(apex.hessian_status, hess_class);
+                    if (hess_class.kind == ApexLimitKind::Constant) {
+                        apex.hessians[i](static_cast<std::size_t>(r),
+                                         static_cast<std::size_t>(c)) = hess_class.constant_value;
+                    }
+                }
+            }
+        }
+
+        if (apex.gradient_status != ApexRankStatus::Exact) {
+            apex.gradients.clear();
+        }
+        if (apex.hessian_status != ApexRankStatus::Exact) {
+            apex.hessians.clear();
+        }
+
+        return apex;
+    }
+
+    static std::vector<math::Vector<Real, 3>> build_public_nodes(int order) {
+        if (order == 0) {
+            return {math::Vector<Real, 3>{Real(0), Real(0), Real(0.25)}};
+        }
+
+        std::vector<math::Vector<Real, 3>> nodes;
+        nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 2) * (2 * order + 3) / 6));
+
+        nodes.push_back(math::Vector<Real, 3>{Real(-1), Real(-1), Real(0)});
+        nodes.push_back(math::Vector<Real, 3>{Real(1), Real(-1), Real(0)});
+        nodes.push_back(math::Vector<Real, 3>{Real(1), Real(1), Real(0)});
+        nodes.push_back(math::Vector<Real, 3>{Real(-1), Real(1), Real(0)});
+        nodes.push_back(math::Vector<Real, 3>{Real(0), Real(0), Real(1)});
+
+        for (int m = 1; m < order; ++m) {
+            nodes.push_back(math::Vector<Real, 3>{equispaced_pm_one_coord(m, order), Real(-1), Real(0)});
+        }
+        for (int m = 1; m < order; ++m) {
+            nodes.push_back(math::Vector<Real, 3>{Real(1), equispaced_pm_one_coord(m, order), Real(0)});
+        }
+        for (int m = order - 1; m >= 1; --m) {
+            nodes.push_back(math::Vector<Real, 3>{equispaced_pm_one_coord(m, order), Real(1), Real(0)});
+        }
+        for (int m = order - 1; m >= 1; --m) {
+            nodes.push_back(math::Vector<Real, 3>{Real(-1), equispaced_pm_one_coord(m, order), Real(0)});
+        }
+
+        for (int level = 1; level < order; ++level) {
+            const Real z = static_cast<Real>(level) / static_cast<Real>(order);
+            const Real scale = Real(1) - z;
+            nodes.push_back(math::Vector<Real, 3>{-scale, -scale, z});
+            nodes.push_back(math::Vector<Real, 3>{scale, -scale, z});
+            nodes.push_back(math::Vector<Real, 3>{scale, scale, z});
+            nodes.push_back(math::Vector<Real, 3>{-scale, scale, z});
+        }
+
+        for (int j = 1; j < order; ++j) {
+            for (int i = 1; i < order; ++i) {
+                nodes.push_back(math::Vector<Real, 3>{equispaced_pm_one_coord(i, order),
+                                                      equispaced_pm_one_coord(j, order),
+                                                      Real(0)});
+            }
+        }
+
+        for (int level = 1; level < order - 1; ++level) {
+            const int n = order - level;
+            const Real z = static_cast<Real>(level) / static_cast<Real>(order);
+            const Real scale = Real(1) - z;
+
+            for (int m = 1; m < n; ++m) {
+                const Real s = equispaced_pm_one_coord(m, n) * scale;
+                nodes.push_back(math::Vector<Real, 3>{s, -scale, z});
+            }
+            for (int m = 1; m < n; ++m) {
+                const Real s = equispaced_pm_one_coord(m, n) * scale;
+                nodes.push_back(math::Vector<Real, 3>{scale, s, z});
+            }
+            for (int m = n - 1; m >= 1; --m) {
+                const Real s = equispaced_pm_one_coord(m, n) * scale;
+                nodes.push_back(math::Vector<Real, 3>{s, scale, z});
+            }
+            for (int m = n - 1; m >= 1; --m) {
+                const Real s = equispaced_pm_one_coord(m, n) * scale;
+                nodes.push_back(math::Vector<Real, 3>{-scale, s, z});
+            }
+        }
+
+        for (int level = 1; level < order - 1; ++level) {
+            const int n = order - level;
+            const Real z = static_cast<Real>(level) / static_cast<Real>(order);
+            const Real scale = Real(1) - z;
+            for (int j = 1; j < n; ++j) {
+                for (int i = 1; i < n; ++i) {
+                    nodes.push_back(math::Vector<Real, 3>{equispaced_pm_one_coord(i, n) * scale,
+                                                          equispaced_pm_one_coord(j, n) * scale,
+                                                          z});
+                }
+            }
+        }
+
+        return nodes;
+    }
+
+    struct VectorValueSink {
+        std::vector<Real>& output;
+        void resize(std::size_t n) const { output.resize(n); }
+        void write(std::size_t i, Real value) const { output[i] = value; }
+    };
+
+    struct RawValueSink {
+        Real* output;
+        void resize(std::size_t) const {}
+        void write(std::size_t i, Real value) const { output[i] = value; }
+    };
+
+    struct VectorGradientSink {
+        std::vector<Gradient>& output;
+        void resize(std::size_t n) const { output.resize(n); }
+        void write(std::size_t i, const Gradient& value) const { output[i] = value; }
+    };
+
+    struct RawGradientSink {
+        Real* output;
+        void resize(std::size_t) const {}
+        void write(std::size_t i, const Gradient& value) const {
+            Real* dst = output + i * 3u;
+            dst[0] = value[0];
+            dst[1] = value[1];
+            dst[2] = value[2];
+        }
+    };
+
+    struct VectorHessianSink {
+        std::vector<Hessian>& output;
+        void resize(std::size_t n) const { output.resize(n); }
+        void write(std::size_t i, const Hessian& value) const { output[i] = value; }
+    };
+
+    struct RawHessianSink {
+        Real* output;
+        void resize(std::size_t) const {}
+        void write(std::size_t i, const Hessian& value) const {
+            store_hessian(value, output + i * 9u);
+        }
+    };
+
+    template <typename Get, typename Set>
+    static void apply_order1_combination(std::size_t components,
+                                         const Get& get,
+                                         const Set& set) {
+        for (std::size_t c = 0; c < components; ++c) {
+            const Real m0 = get(0u, c);
+            const Real m1 = get(1u, c);
+            const Real m2 = get(2u, c);
+            const Real m3 = get(3u, c);
+            const Real m4 = get(4u, c);
+            set(0u, c, Real(0.25) * (m0 - m1 - m2 + m3 - m4));
+            set(1u, c, Real(0.25) * (m0 + m1 - m2 - m3 - m4));
+            set(2u, c, Real(0.25) * (m0 + m1 + m2 + m3 - m4));
+            set(3u, c, Real(0.25) * (m0 - m1 + m2 - m3 - m4));
+            set(4u, c, m4);
+        }
+    }
+
+    template <typename Get, typename Set>
+    static void apply_order2_combination(std::size_t components,
+                                         const Get& get,
+                                         const Set& set) {
+        for (std::size_t c = 0; c < components; ++c) {
+            const Real m0 = get(0u, c);
+            const Real m1 = get(1u, c);
+            const Real m2 = get(2u, c);
+            const Real m3 = get(3u, c);
+            const Real m4 = get(4u, c);
+            const Real m5 = get(5u, c);
+            const Real m6 = get(6u, c);
+            const Real m7 = get(7u, c);
+            const Real m8 = get(8u, c);
+            const Real m9 = get(9u, c);
+            const Real m10 = get(10u, c);
+            const Real m11 = get(11u, c);
+            const Real m12 = get(12u, c);
+            const Real m13 = get(13u, c);
+            set(0u, c, Real(0.25) * (m4 - m5 - m7 + m8 - m9 + m10 + m11 - Real(2) * m12 + m13));
+            set(1u, c, Real(0.25) * (-m4 - m5 + m7 + m8 - m9 - m10 + m11 + Real(2) * m12 + m13));
+            set(2u, c, Real(0.25) * (m4 + m5 + m7 + m8 - m9 - m10 - m11 - Real(2) * m12 + m13));
+            set(3u, c, Real(0.25) * (-m4 + m5 - m7 + m8 - m9 + m10 - m11 + Real(2) * m12 + m13));
+            set(4u, c, -m9 + Real(2) * m13);
+            set(5u, c, Real(0.5) * (-m3 + m5 + m6 - m8 + m11));
+            set(6u, c, Real(0.5) * (m1 + m2 - m7 - m8 - m10));
+            set(7u, c, Real(0.5) * (m3 - m5 + m6 - m8 - m11));
+            set(8u, c, Real(0.5) * (-m1 + m2 + m7 - m8 + m10));
+            set(9u, c, m9 - m10 - m11 + m12 - m13);
+            set(10u, c, m9 + m10 - m11 - m12 - m13);
+            set(11u, c, m9 + m10 + m11 + m12 - m13);
+            set(12u, c, m9 - m10 + m11 - m12 - m13);
+            set(13u, c, m0 - m2 - m6 + m8 - Real(2) * m9 + m13);
+        }
+    }
+
+    template <typename Get, typename Set>
+    static void apply_low_order_combination(const OrderData& data,
+                                            std::size_t components,
+                                            const Get& get,
+                                            const Set& set) {
+        if (data.order == 1) {
+            apply_order1_combination(components, get, set);
+            return;
+        }
+        apply_order2_combination(components, get, set);
+    }
+
+    static void apply_sparse_basis_to_nodal(const OrderData& data,
+                                            const std::vector<Real>& modal_values,
+                                            std::vector<Real>& nodal_values) {
+        const std::size_t n = modal_values.size();
+        nodal_values.resize(n);
+        apply_low_order_combination(
+            data,
+            1u,
+            [&](std::size_t modal_i, std::size_t) { return modal_values[modal_i]; },
+            [&](std::size_t basis_i, std::size_t, Real value) { nodal_values[basis_i] = value; });
+    }
+
+    static void apply_sparse_basis_to_nodal_to(const OrderData& data,
+                                               const std::vector<Real>& modal_values,
+                                               Real* SVMP_RESTRICT nodal_values) {
+        apply_low_order_combination(
+            data,
+            1u,
+            [&](std::size_t modal_i, std::size_t) { return modal_values[modal_i]; },
+            [&](std::size_t basis_i, std::size_t, Real value) { nodal_values[basis_i] = value; });
+    }
+
+    static void apply_sparse_basis_to_nodal(const OrderData& data,
+                                            const std::vector<Gradient>& modal_gradients,
+                                            std::vector<Gradient>& nodal_gradients) {
+        const std::size_t n = modal_gradients.size();
+        nodal_gradients.resize(n);
+        apply_low_order_combination(
+            data,
+            3u,
+            [&](std::size_t modal_i, std::size_t component) {
+                return modal_gradients[modal_i][component];
+            },
+            [&](std::size_t basis_i, std::size_t component, Real value) {
+                nodal_gradients[basis_i][component] = value;
+            });
+    }
+
+    static void apply_sparse_basis_to_nodal_to(const OrderData& data,
+                                               const std::vector<Gradient>& modal_gradients,
+                                               Real* SVMP_RESTRICT nodal_gradients) {
+        apply_low_order_combination(
+            data,
+            3u,
+            [&](std::size_t modal_i, std::size_t component) {
+                return modal_gradients[modal_i][component];
+            },
+            [&](std::size_t basis_i, std::size_t component, Real value) {
+                nodal_gradients[basis_i * 3u + component] = value;
+            });
+    }
+
+    static void apply_sparse_basis_to_nodal(const OrderData& data,
+                                            const std::vector<Hessian>& modal_hessians,
+                                            std::vector<Hessian>& nodal_hessians) {
+        const std::size_t n = modal_hessians.size();
+        nodal_hessians.resize(n);
+        apply_low_order_combination(
+            data,
+            9u,
+            [&](std::size_t modal_i, std::size_t component) {
+                return modal_hessians[modal_i].data()[component];
+            },
+            [&](std::size_t basis_i, std::size_t component, Real value) {
+                nodal_hessians[basis_i].data()[component] = value;
+            });
+    }
+
+    static void apply_sparse_basis_to_nodal_to(const OrderData& data,
+                                               const std::vector<Hessian>& modal_hessians,
+                                               Real* SVMP_RESTRICT nodal_hessians) {
+        apply_low_order_combination(
+            data,
+            9u,
+            [&](std::size_t modal_i, std::size_t component) {
+                return modal_hessians[modal_i].data()[component];
+            },
+            [&](std::size_t basis_i, std::size_t component, Real value) {
+                nodal_hessians[basis_i * 9u + component] = value;
+            });
+    }
+
+    static void apply_sparse_basis_to_nodal_all(
+        const OrderData& data,
+        const std::vector<Real>& modal_values,
+        const std::vector<Gradient>& modal_gradients,
+        const std::vector<Hessian>& modal_hessians,
+        std::vector<Real>& nodal_values,
+        std::vector<Gradient>& nodal_gradients,
+        std::vector<Hessian>& nodal_hessians) {
+        const std::size_t n = modal_values.size();
+        nodal_values.resize(n);
+        nodal_gradients.resize(n);
+        nodal_hessians.resize(n);
+        apply_low_order_combination(
+            data,
+            1u,
+            [&](std::size_t modal_i, std::size_t) { return modal_values[modal_i]; },
+            [&](std::size_t basis_i, std::size_t, Real value) { nodal_values[basis_i] = value; });
+        apply_low_order_combination(
+            data,
+            3u,
+            [&](std::size_t modal_i, std::size_t component) {
+                return modal_gradients[modal_i][component];
+            },
+            [&](std::size_t basis_i, std::size_t component, Real value) {
+                nodal_gradients[basis_i][component] = value;
+            });
+        apply_low_order_combination(
+            data,
+            9u,
+            [&](std::size_t modal_i, std::size_t component) {
+                return modal_hessians[modal_i].data()[component];
+            },
+            [&](std::size_t basis_i, std::size_t component, Real value) {
+                nodal_hessians[basis_i].data()[component] = value;
+            });
+    }
+
+    static void apply_sparse_basis_to_nodal_all_to(
+        const OrderData& data,
+        const std::vector<Real>& modal_values,
+        const std::vector<Gradient>& modal_gradients,
+        const std::vector<Hessian>& modal_hessians,
+        Real* SVMP_RESTRICT nodal_values,
+        Real* SVMP_RESTRICT nodal_gradients,
+        Real* SVMP_RESTRICT nodal_hessians) {
+        apply_low_order_combination(
+            data,
+            1u,
+            [&](std::size_t modal_i, std::size_t) { return modal_values[modal_i]; },
+            [&](std::size_t basis_i, std::size_t, Real value) { nodal_values[basis_i] = value; });
+        apply_low_order_combination(
+            data,
+            3u,
+            [&](std::size_t modal_i, std::size_t component) {
+                return modal_gradients[modal_i][component];
+            },
+            [&](std::size_t basis_i, std::size_t component, Real value) {
+                nodal_gradients[basis_i * 3u + component] = value;
+            });
+        apply_low_order_combination(
+            data,
+            9u,
+            [&](std::size_t modal_i, std::size_t component) {
+                return modal_hessians[modal_i].data()[component];
+            },
+            [&](std::size_t basis_i, std::size_t component, Real value) {
+                nodal_hessians[basis_i * 9u + component] = value;
+            });
+    }
+
+    template <typename Sink>
+    // Keep modal transform helpers free of forced-inline attributes unless
+    // compiler-versioned benchmarks and LLVM IR checks show a stable benefit.
+    static void apply_modal_values_to_nodal(const OrderData& data,
+                                            const std::vector<Real>& modal_values,
+                                            const Sink& sink) {
+        const std::size_t n = modal_values.size();
+        sink.resize(n);
+        for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
+            const Real* row = data.modal_to_nodal.data() + basis_i * n;
+            Real value = Real(0);
+            for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
+                value += row[modal_j] * modal_values[modal_j];
+            }
+            sink.write(basis_i, value);
+        }
+    }
+
+    template <typename Sink>
+    static void apply_modal_gradients_to_nodal(const OrderData& data,
+                                               const std::vector<Gradient>& modal_gradients,
+                                               const Sink& sink) {
+        const std::size_t n = modal_gradients.size();
+        sink.resize(n);
+        for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
+            const Real* row = data.modal_to_nodal.data() + basis_i * n;
+            Gradient gradient{};
+            for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
+                const Real coeff = row[modal_j];
+                for (std::size_t component = 0; component < 3u; ++component) {
+                    gradient[component] += coeff * modal_gradients[modal_j][component];
+                }
+            }
+            sink.write(basis_i, gradient);
+        }
+    }
+
+    template <typename Sink>
+    static void apply_modal_hessians_to_nodal(const OrderData& data,
+                                              const std::vector<Hessian>& modal_hessians,
+                                              const Sink& sink) {
+        const std::size_t n = modal_hessians.size();
+        sink.resize(n);
+        for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
+            const Real* matrix_row = data.modal_to_nodal.data() + basis_i * n;
+            Hessian hessian{};
+            for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
+                const Real coeff = matrix_row[modal_j];
+                for (std::size_t row = 0; row < 3u; ++row) {
+                    for (std::size_t col = 0; col < 3u; ++col) {
+                        hessian(row, col) += coeff * modal_hessians[modal_j](row, col);
+                    }
+                }
+            }
+            sink.write(basis_i, hessian);
+        }
+    }
+
+    static void apply_modal_to_nodal(const OrderData& data,
+                                     const std::vector<Real>& modal_values,
+                                     std::vector<Real>& nodal_values) {
+        apply_modal_values_to_nodal(data, modal_values, VectorValueSink{nodal_values});
+    }
+
+    static void apply_modal_to_nodal(const OrderData& data,
+                                     const std::vector<Gradient>& modal_gradients,
+                                     std::vector<Gradient>& nodal_gradients) {
+        apply_modal_gradients_to_nodal(data, modal_gradients, VectorGradientSink{nodal_gradients});
+    }
+
+    static void apply_modal_to_nodal(const OrderData& data,
+                                     const std::vector<Hessian>& modal_hessians,
+                                     std::vector<Hessian>& nodal_hessians) {
+        apply_modal_hessians_to_nodal(data, modal_hessians, VectorHessianSink{nodal_hessians});
+    }
+
+    static void apply_modal_to_nodal_to(const OrderData& data,
+                                        const std::vector<Real>& modal_values,
+                                        Real* nodal_values) {
+        apply_modal_values_to_nodal(data, modal_values, RawValueSink{nodal_values});
+    }
+
+    static void apply_modal_to_nodal_to(const OrderData& data,
+                                        const std::vector<Gradient>& modal_gradients,
+                                        Real* nodal_gradients) {
+        apply_modal_gradients_to_nodal(data, modal_gradients, RawGradientSink{nodal_gradients});
+    }
+
+    static void apply_modal_to_nodal_to(const OrderData& data,
+                                        const std::vector<Hessian>& modal_hessians,
+                                        Real* nodal_hessians) {
+        apply_modal_hessians_to_nodal(data, modal_hessians, RawHessianSink{nodal_hessians});
+    }
+};
+
+namespace lagrange_pyramid {
+
+const std::vector<math::Vector<Real, 3>>& nodes(int order) {
+    return PyramidLagrangeCache::get(order).nodes;
+}
+
+void prewarm_scratch(int order, std::size_t max_qpts) {
+    const auto& data = PyramidLagrangeCache::get(order);
+    PyramidLagrangeCache::prewarm_scratch(data.modal_terms.size(), max_qpts);
+}
+
+void evaluate_values(int order,
+                     const math::Vector<Real, 3>& xi,
+                     std::vector<Real>& values) {
+    const auto& data = PyramidLagrangeCache::get(order);
+    PyramidLagrangeCache::evaluate_values(data, xi, values);
+}
+
+void evaluate_gradients(int order,
+                        const math::Vector<Real, 3>& xi,
+                        std::vector<Gradient>& gradients) {
+    const auto& data = PyramidLagrangeCache::get(order);
+    PyramidLagrangeCache::evaluate_gradients(data, xi, gradients);
+}
+
+void evaluate_hessians(int order,
+                       const math::Vector<Real, 3>& xi,
+                       std::vector<Hessian>& hessians) {
+    const auto& data = PyramidLagrangeCache::get(order);
+    PyramidLagrangeCache::evaluate_hessians(data, xi, hessians);
+}
+
+void evaluate_all(int order,
+                  const math::Vector<Real, 3>& xi,
+                  std::vector<Real>& values,
+                  std::vector<Gradient>& gradients,
+                  std::vector<Hessian>& hessians) {
+    const auto& data = PyramidLagrangeCache::get(order);
+    PyramidLagrangeCache::evaluate_all(data, xi, values, gradients, hessians);
+}
+
+void evaluate_values_to(int order,
+                        const math::Vector<Real, 3>& xi,
+                        Real* SVMP_RESTRICT values_out) {
+    const auto& data = PyramidLagrangeCache::get(order);
+    PyramidLagrangeCache::evaluate_values_to(data, xi, values_out);
+}
+
+void evaluate_gradients_to(int order,
+                           const math::Vector<Real, 3>& xi,
+                           Real* SVMP_RESTRICT gradients_out) {
+    const auto& data = PyramidLagrangeCache::get(order);
+    PyramidLagrangeCache::evaluate_gradients_to(data, xi, gradients_out);
+}
+
+void evaluate_hessians_to(int order,
+                          const math::Vector<Real, 3>& xi,
+                          Real* SVMP_RESTRICT hessians_out) {
+    const auto& data = PyramidLagrangeCache::get(order);
+    PyramidLagrangeCache::evaluate_hessians_to(data, xi, hessians_out);
+}
+
+void evaluate_all_to(int order,
+                     const math::Vector<Real, 3>& xi,
+                     Real* SVMP_RESTRICT values_out,
+                     Real* SVMP_RESTRICT gradients_out,
+                     Real* SVMP_RESTRICT hessians_out) {
+    const auto& data = PyramidLagrangeCache::get(order);
+    PyramidLagrangeCache::evaluate_all_to(data, xi, values_out, gradients_out, hessians_out);
+}
+
+void evaluate_at_quadrature_points_strided(
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    const auto& data = PyramidLagrangeCache::get(order);
+    PyramidLagrangeCache::evaluate_at_quadrature_points_strided(
+        data, points, output_stride, values_out, gradients_out, hessians_out);
+}
+
+} // namespace lagrange_pyramid
+
+} // namespace detail
+} // namespace basis
+} // namespace FE
+} // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasisPyramid.h b/Code/Source/solver/FE/Basis/LagrangeBasisPyramid.h
new file mode 100644
index 000000000..76859501c
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/LagrangeBasisPyramid.h
@@ -0,0 +1,67 @@
+#ifndef SVMP_FE_BASIS_LAGRANGEBASISPYRAMID_H
+#define SVMP_FE_BASIS_LAGRANGEBASISPYRAMID_H
+
+// Private declarations for the rational pyramid Lagrange helper implemented in
+// LagrangeBasisPyramid.cpp. This header is intentionally small so the large
+// construction and apex-classification code stays out of LagrangeBasis.cpp.
+
+#include "BasisFunction.h"
+
+#include <cstddef>
+#include <vector>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+namespace detail {
+namespace lagrange_pyramid {
+
+const std::vector<math::Vector<Real, 3>>& nodes(int order);
+
+void prewarm_scratch(int order, std::size_t max_qpts = 0);
+
+void evaluate_values(int order,
+                     const math::Vector<Real, 3>& xi,
+                     std::vector<Real>& values);
+void evaluate_gradients(int order,
+                        const math::Vector<Real, 3>& xi,
+                        std::vector<Gradient>& gradients);
+void evaluate_hessians(int order,
+                       const math::Vector<Real, 3>& xi,
+                       std::vector<Hessian>& hessians);
+void evaluate_all(int order,
+                  const math::Vector<Real, 3>& xi,
+                  std::vector<Real>& values,
+                  std::vector<Gradient>& gradients,
+                  std::vector<Hessian>& hessians);
+
+void evaluate_values_to(int order,
+                        const math::Vector<Real, 3>& xi,
+                        Real* SVMP_RESTRICT values_out);
+void evaluate_gradients_to(int order,
+                           const math::Vector<Real, 3>& xi,
+                           Real* SVMP_RESTRICT gradients_out);
+void evaluate_hessians_to(int order,
+                          const math::Vector<Real, 3>& xi,
+                          Real* SVMP_RESTRICT hessians_out);
+void evaluate_all_to(int order,
+                     const math::Vector<Real, 3>& xi,
+                     Real* SVMP_RESTRICT values_out,
+                     Real* SVMP_RESTRICT gradients_out,
+                     Real* SVMP_RESTRICT hessians_out);
+
+void evaluate_at_quadrature_points_strided(
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out);
+
+} // namespace lagrange_pyramid
+} // namespace detail
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_LAGRANGEBASISPYRAMID_H
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasisSimplex.cpp b/Code/Source/solver/FE/Basis/LagrangeBasisSimplex.cpp
new file mode 100644
index 000000000..36325576a
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/LagrangeBasisSimplex.cpp
@@ -0,0 +1,2457 @@
+#include "LagrangeBasisSimplex.h"
+
+#include <array>
+#include <vector>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+namespace detail {
+
+// Falling-factorial (equispaced barycentric) Lagrange factors for simplex nodes.
+//
+// For a fixed polynomial order p and barycentric coordinate lambda in [0, 1],
+// define
+//   phi_a(lambda) = product_{m=0}^{a-1} (p * lambda - m) / (a - m), a = 0..p
+// Then for a multi-index (i0, i1, ..., id) with sum i_k = p, the simplex
+// Lagrange basis function is product_k phi_{i_k}(lambda_k), nodal on the
+// barycentric lattice.
+//
+// Output buffers must each be sized to at least p+1 entries; the function
+// writes every output slot (no pre-zero required by the caller).
+template <bool NeedFirst, bool NeedSecond>
+void simplex_lagrange_factor_sequence_impl(int p,
+                                           Real lambda,
+                                           Real* phi,
+                                           Real* dphi,
+                                           Real* d2phi) {
+    static_assert(!NeedSecond || NeedFirst,
+                  "second derivative factors require first-derivative recurrence state");
+
+    phi[0] = Real(1);
+    if constexpr (NeedFirst) {
+        dphi[0] = Real(0);
+    }
+    if constexpr (NeedSecond) {
+        d2phi[0] = Real(0);
+    }
+    if (p == 0) {
+        return;
+    }
+
+    const Real t = static_cast<Real>(p) * lambda;
+    const Real dt_dlambda = static_cast<Real>(p);
+
+    Real dphi_dt_prev = Real(0);
+    Real d2phi_dt2_prev = Real(0);
+
+    for (int a = 1; a <= p; ++a) {
+        const std::size_t au = static_cast<std::size_t>(a);
+        const Real inv_a = Real(1) / static_cast<Real>(a);
+        const Real s = (t - static_cast<Real>(a - 1)) * inv_a;
+
+        phi[au] = s * phi[au - 1];
+
+        if constexpr (NeedFirst) {
+            const Real dphi_dt_old = dphi_dt_prev;
+            const Real dphi_dt = inv_a * phi[au - 1] + s * dphi_dt_old;
+            dphi[au] = dt_dlambda * dphi_dt;
+
+            if constexpr (NeedSecond) {
+                const Real d2phi_dt2 = Real(2) * inv_a * dphi_dt_old + s * d2phi_dt2_prev;
+                d2phi[au] = dt_dlambda * dt_dlambda * d2phi_dt2;
+                d2phi_dt2_prev = d2phi_dt2;
+            }
+
+            dphi_dt_prev = dphi_dt;
+        }
+    }
+}
+
+void simplex_lagrange_factor_sequence(int p,
+                                      Real lambda,
+                                      Real* phi,
+                                      Real* dphi,
+                                      Real* d2phi) {
+    if (d2phi != nullptr) {
+        simplex_lagrange_factor_sequence_impl<true, true>(p, lambda, phi, dphi, d2phi);
+    } else if (dphi != nullptr) {
+        simplex_lagrange_factor_sequence_impl<true, false>(p, lambda, phi, dphi, nullptr);
+    } else {
+        simplex_lagrange_factor_sequence_impl<false, false>(p, lambda, phi, nullptr, nullptr);
+    }
+}
+
+constexpr int kFixedSimplexAxisOrder = 12;
+constexpr std::size_t kFixedSimplexAxisSize =
+    static_cast<std::size_t>(kFixedSimplexAxisOrder + 1);
+constexpr std::size_t kFixedSimplexBatchEntries = 512;
+
+template <int Order>
+inline void simplex_lagrange_factor_values_product(Real lambda,
+                                                   Real* SVMP_RESTRICT values) {
+    static_assert(Order >= 0, "simplex order must be non-negative");
+    values[0] = Real(1);
+    const Real t = static_cast<Real>(Order) * lambda;
+    for (int a = 1; a <= Order; ++a) {
+        const Real inv_a = Real(1) / static_cast<Real>(a);
+        values[a] = values[a - 1] * (t - static_cast<Real>(a - 1)) * inv_a;
+    }
+}
+
+template <int Order>
+void evaluate_triangle_simplex_values_q4(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    static_assert(Order >= 4 && Order <= 8, "specialized simplex path covers orders 4..8");
+
+    Real phi0[4][Order + 1];
+    Real phi1[4][Order + 1];
+    Real phi2[4][Order + 1];
+
+    for (std::size_t q = 0; q < 4u; ++q) {
+        const auto& xi = points[q];
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l0 = Real(1) - l1 - l2;
+        simplex_lagrange_factor_values_product<Order>(l0, phi0[q]);
+        simplex_lagrange_factor_values_product<Order>(l1, phi1[q]);
+        simplex_lagrange_factor_values_product<Order>(l2, phi2[q]);
+    }
+
+    const std::size_t num_nodes = simplex_exponents.size();
+    for (std::size_t node = 0; node < num_nodes; ++node) {
+        const auto& e = simplex_exponents[node];
+        const std::size_t i0 = static_cast<std::size_t>(e[0]);
+        const std::size_t i1 = static_cast<std::size_t>(e[1]);
+        const std::size_t i2 = static_cast<std::size_t>(e[2]);
+        Real* SVMP_RESTRICT row = values_out + node * output_stride;
+        row[0] = phi0[0][i0] * phi1[0][i1] * phi2[0][i2];
+        row[1] = phi0[1][i0] * phi1[1][i1] * phi2[1][i2];
+        row[2] = phi0[2][i0] * phi1[2][i1] * phi2[2][i2];
+        row[3] = phi0[3][i0] * phi1[3][i1] * phi2[3][i2];
+    }
+}
+
+bool try_evaluate_triangle_simplex_values_q4(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    switch (order) {
+    case 4:
+        evaluate_triangle_simplex_values_q4<4>(
+            simplex_exponents, points, output_stride, values_out);
+        return true;
+    case 5:
+        evaluate_triangle_simplex_values_q4<5>(
+            simplex_exponents, points, output_stride, values_out);
+        return true;
+    case 6:
+        evaluate_triangle_simplex_values_q4<6>(
+            simplex_exponents, points, output_stride, values_out);
+        return true;
+    case 7:
+        evaluate_triangle_simplex_values_q4<7>(
+            simplex_exponents, points, output_stride, values_out);
+        return true;
+    case 8:
+        evaluate_triangle_simplex_values_q4<8>(
+            simplex_exponents, points, output_stride, values_out);
+        return true;
+    default:
+        return false;
+    }
+}
+
+template <int Order>
+void evaluate_tetrahedron_simplex_values_q4(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    static_assert(Order >= 4 && Order <= 8, "specialized simplex path covers orders 4..8");
+
+    Real phi0[4][Order + 1];
+    Real phi1[4][Order + 1];
+    Real phi2[4][Order + 1];
+    Real phi3[4][Order + 1];
+
+    for (std::size_t q = 0; q < 4u; ++q) {
+        const auto& xi = points[q];
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l3 = xi[2];
+        const Real l0 = Real(1) - l1 - l2 - l3;
+        simplex_lagrange_factor_values_product<Order>(l0, phi0[q]);
+        simplex_lagrange_factor_values_product<Order>(l1, phi1[q]);
+        simplex_lagrange_factor_values_product<Order>(l2, phi2[q]);
+        simplex_lagrange_factor_values_product<Order>(l3, phi3[q]);
+    }
+
+    const std::size_t num_nodes = simplex_exponents.size();
+    for (std::size_t node = 0; node < num_nodes; ++node) {
+        const auto& e = simplex_exponents[node];
+        const std::size_t i0 = static_cast<std::size_t>(e[0]);
+        const std::size_t i1 = static_cast<std::size_t>(e[1]);
+        const std::size_t i2 = static_cast<std::size_t>(e[2]);
+        const std::size_t i3 = static_cast<std::size_t>(e[3]);
+        Real* SVMP_RESTRICT row = values_out + node * output_stride;
+        row[0] = phi0[0][i0] * phi1[0][i1] * phi2[0][i2] * phi3[0][i3];
+        row[1] = phi0[1][i0] * phi1[1][i1] * phi2[1][i2] * phi3[1][i3];
+        row[2] = phi0[2][i0] * phi1[2][i1] * phi2[2][i2] * phi3[2][i3];
+        row[3] = phi0[3][i0] * phi1[3][i1] * phi2[3][i2] * phi3[3][i3];
+    }
+}
+
+bool try_evaluate_tetrahedron_simplex_values_q4(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    switch (order) {
+    case 4:
+        evaluate_tetrahedron_simplex_values_q4<4>(
+            simplex_exponents, points, output_stride, values_out);
+        return true;
+    case 5:
+        evaluate_tetrahedron_simplex_values_q4<5>(
+            simplex_exponents, points, output_stride, values_out);
+        return true;
+    case 6:
+        evaluate_tetrahedron_simplex_values_q4<6>(
+            simplex_exponents, points, output_stride, values_out);
+        return true;
+    case 7:
+        evaluate_tetrahedron_simplex_values_q4<7>(
+            simplex_exponents, points, output_stride, values_out);
+        return true;
+    case 8:
+        evaluate_tetrahedron_simplex_values_q4<8>(
+            simplex_exponents, points, output_stride, values_out);
+        return true;
+    default:
+        return false;
+    }
+}
+
+template <int Order>
+void evaluate_tetrahedron_simplex_gradients_q4(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    static_assert(Order >= 3 && Order <= 8,
+                  "specialized tetrahedron gradient path covers orders 3..8");
+
+    Real phi0[4][Order + 1];
+    Real phi1[4][Order + 1];
+    Real phi2[4][Order + 1];
+    Real phi3[4][Order + 1];
+    Real dphi0[4][Order + 1];
+    Real dphi1[4][Order + 1];
+    Real dphi2[4][Order + 1];
+    Real dphi3[4][Order + 1];
+
+    for (std::size_t q = 0; q < 4u; ++q) {
+        const auto& xi = points[q];
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l3 = xi[2];
+        const Real l0 = Real(1) - l1 - l2 - l3;
+        simplex_lagrange_factor_sequence_impl<true, false>(
+            Order, l0, phi0[q], dphi0[q], nullptr);
+        simplex_lagrange_factor_sequence_impl<true, false>(
+            Order, l1, phi1[q], dphi1[q], nullptr);
+        simplex_lagrange_factor_sequence_impl<true, false>(
+            Order, l2, phi2[q], dphi2[q], nullptr);
+        simplex_lagrange_factor_sequence_impl<true, false>(
+            Order, l3, phi3[q], dphi3[q], nullptr);
+    }
+
+    const std::size_t num_nodes = simplex_exponents.size();
+    for (std::size_t node = 0; node < num_nodes; ++node) {
+        const auto& e = simplex_exponents[node];
+        const std::size_t i0 = static_cast<std::size_t>(e[0]);
+        const std::size_t i1 = static_cast<std::size_t>(e[1]);
+        const std::size_t i2 = static_cast<std::size_t>(e[2]);
+        const std::size_t i3 = static_cast<std::size_t>(e[3]);
+        Real gx[4];
+        Real gy[4];
+        Real gz[4];
+
+        for (std::size_t q = 0; q < 4u; ++q) {
+            const Real v0 = phi0[q][i0];
+            const Real v1 = phi1[q][i1];
+            const Real v2 = phi2[q][i2];
+            const Real v3 = phi3[q][i3];
+            const Real D0 = dphi0[q][i0];
+            const Real D1 = dphi1[q][i1];
+            const Real D2 = dphi2[q][i2];
+            const Real D3 = dphi3[q][i3];
+            const Real v23 = v2 * v3;
+            const Real v01 = v0 * v1;
+            const Real dl0 = D0 * v1 * v23;
+            gx[q] = v0 * D1 * v23 - dl0;
+            gy[q] = v01 * D2 * v3 - dl0;
+            gz[q] = v01 * v2 * D3 - dl0;
+        }
+
+        Real* SVMP_RESTRICT g = gradients_out + node * 3u * output_stride;
+        g[0u] = gx[0];
+        g[1u] = gx[1];
+        g[2u] = gx[2];
+        g[3u] = gx[3];
+        g[output_stride + 0u] = gy[0];
+        g[output_stride + 1u] = gy[1];
+        g[output_stride + 2u] = gy[2];
+        g[output_stride + 3u] = gy[3];
+        g[2u * output_stride + 0u] = gz[0];
+        g[2u * output_stride + 1u] = gz[1];
+        g[2u * output_stride + 2u] = gz[2];
+        g[2u * output_stride + 3u] = gz[3];
+    }
+}
+
+template <int Order>
+void evaluate_triangle_simplex_gradients_q4(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    static_assert((Order == 2) || (Order >= 4 && Order <= 8),
+                  "specialized simplex path covers order 2 and orders 4..8");
+
+    Real phi0[4][Order + 1];
+    Real phi1[4][Order + 1];
+    Real phi2[4][Order + 1];
+    Real dphi0[4][Order + 1];
+    Real dphi1[4][Order + 1];
+    Real dphi2[4][Order + 1];
+
+    for (std::size_t q = 0; q < 4u; ++q) {
+        const auto& xi = points[q];
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l0 = Real(1) - l1 - l2;
+        simplex_lagrange_factor_sequence_impl<true, false>(
+            Order, l0, phi0[q], dphi0[q], nullptr);
+        simplex_lagrange_factor_sequence_impl<true, false>(
+            Order, l1, phi1[q], dphi1[q], nullptr);
+        simplex_lagrange_factor_sequence_impl<true, false>(
+            Order, l2, phi2[q], dphi2[q], nullptr);
+    }
+
+    const std::size_t num_nodes = simplex_exponents.size();
+    for (std::size_t node = 0; node < num_nodes; ++node) {
+        const auto& e = simplex_exponents[node];
+        const std::size_t i0 = static_cast<std::size_t>(e[0]);
+        const std::size_t i1 = static_cast<std::size_t>(e[1]);
+        const std::size_t i2 = static_cast<std::size_t>(e[2]);
+        Real* SVMP_RESTRICT g = gradients_out + node * 3u * output_stride;
+
+        for (std::size_t q = 0; q < 4u; ++q) {
+            const Real v0 = phi0[q][i0];
+            const Real v1 = phi1[q][i1];
+            const Real v2 = phi2[q][i2];
+            const Real D0 = dphi0[q][i0];
+            const Real D1 = dphi1[q][i1];
+            const Real D2 = dphi2[q][i2];
+            const Real dl0 = D0 * v1 * v2;
+            g[0u * output_stride + q] = v0 * D1 * v2 - dl0;
+            g[1u * output_stride + q] = v0 * v1 * D2 - dl0;
+            g[2u * output_stride + q] = Real(0);
+        }
+    }
+}
+
+bool try_evaluate_triangle_simplex_gradients_q4(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    switch (order) {
+    case 2:
+        evaluate_triangle_simplex_gradients_q4<2>(
+            simplex_exponents, points, output_stride, gradients_out);
+        return true;
+    case 4:
+        evaluate_triangle_simplex_gradients_q4<4>(
+            simplex_exponents, points, output_stride, gradients_out);
+        return true;
+    case 5:
+        evaluate_triangle_simplex_gradients_q4<5>(
+            simplex_exponents, points, output_stride, gradients_out);
+        return true;
+    case 6:
+        evaluate_triangle_simplex_gradients_q4<6>(
+            simplex_exponents, points, output_stride, gradients_out);
+        return true;
+    case 7:
+        evaluate_triangle_simplex_gradients_q4<7>(
+            simplex_exponents, points, output_stride, gradients_out);
+        return true;
+    case 8:
+        evaluate_triangle_simplex_gradients_q4<8>(
+            simplex_exponents, points, output_stride, gradients_out);
+        return true;
+    default:
+        return false;
+    }
+}
+
+template <int Order>
+void evaluate_triangle_simplex_hessian_outputs_q4(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    static_assert(Order >= 2 && Order <= 8, "specialized simplex path covers orders 2..8");
+
+    Real phi0[4][Order + 1];
+    Real phi1[4][Order + 1];
+    Real phi2[4][Order + 1];
+    Real dphi0[4][Order + 1];
+    Real dphi1[4][Order + 1];
+    Real dphi2[4][Order + 1];
+    Real d2phi0[4][Order + 1];
+    Real d2phi1[4][Order + 1];
+    Real d2phi2[4][Order + 1];
+
+    for (std::size_t q = 0; q < 4u; ++q) {
+        const auto& xi = points[q];
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l0 = Real(1) - l1 - l2;
+        simplex_lagrange_factor_sequence_impl<true, true>(
+            Order, l0, phi0[q], dphi0[q], d2phi0[q]);
+        simplex_lagrange_factor_sequence_impl<true, true>(
+            Order, l1, phi1[q], dphi1[q], d2phi1[q]);
+        simplex_lagrange_factor_sequence_impl<true, true>(
+            Order, l2, phi2[q], dphi2[q], d2phi2[q]);
+    }
+
+    const std::size_t num_nodes = simplex_exponents.size();
+    for (std::size_t node = 0; node < num_nodes; ++node) {
+        const auto& e = simplex_exponents[node];
+        const std::size_t i0 = static_cast<std::size_t>(e[0]);
+        const std::size_t i1 = static_cast<std::size_t>(e[1]);
+        const std::size_t i2 = static_cast<std::size_t>(e[2]);
+        Real* SVMP_RESTRICT value_row = values_out ? values_out + node * output_stride : nullptr;
+        Real* SVMP_RESTRICT g = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
+        Real* SVMP_RESTRICT H = hessians_out + node * 9u * output_stride;
+        H[2u * output_stride + 0u] = Real(0);
+        H[2u * output_stride + 1u] = Real(0);
+        H[2u * output_stride + 2u] = Real(0);
+        H[2u * output_stride + 3u] = Real(0);
+        H[5u * output_stride + 0u] = Real(0);
+        H[5u * output_stride + 1u] = Real(0);
+        H[5u * output_stride + 2u] = Real(0);
+        H[5u * output_stride + 3u] = Real(0);
+        H[6u * output_stride + 0u] = Real(0);
+        H[6u * output_stride + 1u] = Real(0);
+        H[6u * output_stride + 2u] = Real(0);
+        H[6u * output_stride + 3u] = Real(0);
+        H[7u * output_stride + 0u] = Real(0);
+        H[7u * output_stride + 1u] = Real(0);
+        H[7u * output_stride + 2u] = Real(0);
+        H[7u * output_stride + 3u] = Real(0);
+        H[8u * output_stride + 0u] = Real(0);
+        H[8u * output_stride + 1u] = Real(0);
+        H[8u * output_stride + 2u] = Real(0);
+        H[8u * output_stride + 3u] = Real(0);
+
+        for (std::size_t q = 0; q < 4u; ++q) {
+            const Real v0 = phi0[q][i0];
+            const Real v1 = phi1[q][i1];
+            const Real v2 = phi2[q][i2];
+            if (value_row != nullptr) {
+                value_row[q] = v0 * v1 * v2;
+            }
+
+            const Real D0 = dphi0[q][i0];
+            const Real D1 = dphi1[q][i1];
+            const Real D2 = dphi2[q][i2];
+            if (g != nullptr) {
+                const Real dl0 = D0 * v1 * v2;
+                g[0u * output_stride + q] = v0 * D1 * v2 - dl0;
+                g[1u * output_stride + q] = v0 * v1 * D2 - dl0;
+                g[2u * output_stride + q] = Real(0);
+            }
+
+            const Real DD0 = d2phi0[q][i0];
+            const Real DD1 = d2phi1[q][i1];
+            const Real DD2 = d2phi2[q][i2];
+            const Real H00 = DD0 * v1 * v2;
+            const Real H11 = v0 * DD1 * v2;
+            const Real H22 = v0 * v1 * DD2;
+            const Real H01 = D0 * D1 * v2;
+            const Real H02 = D0 * v1 * D2;
+            const Real H12 = v0 * D1 * D2;
+            const Real h01 = H00 - H01 - H02 + H12;
+            H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
+            H[1u * output_stride + q] = h01;
+            H[3u * output_stride + q] = h01;
+            H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
+        }
+    }
+}
+
+bool try_evaluate_triangle_simplex_hessian_outputs_q4(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    switch (order) {
+    case 2:
+        evaluate_triangle_simplex_hessian_outputs_q4<2>(
+            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
+        return true;
+    case 3:
+        evaluate_triangle_simplex_hessian_outputs_q4<3>(
+            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
+        return true;
+    case 4:
+        evaluate_triangle_simplex_hessian_outputs_q4<4>(
+            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
+        return true;
+    case 5:
+        evaluate_triangle_simplex_hessian_outputs_q4<5>(
+            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
+        return true;
+    case 6:
+        evaluate_triangle_simplex_hessian_outputs_q4<6>(
+            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
+        return true;
+    case 7:
+        evaluate_triangle_simplex_hessian_outputs_q4<7>(
+            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
+        return true;
+    case 8:
+        evaluate_triangle_simplex_hessian_outputs_q4<8>(
+            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
+        return true;
+    default:
+        return false;
+    }
+}
+
+template <int Order, std::size_t Q>
+inline void write_tetrahedron_simplex_hessian_q4(
+    const Real (&phi0)[4][Order + 1],
+    const Real (&phi1)[4][Order + 1],
+    const Real (&phi2)[4][Order + 1],
+    const Real (&phi3)[4][Order + 1],
+    const Real (&dphi0)[4][Order + 1],
+    const Real (&dphi1)[4][Order + 1],
+    const Real (&dphi2)[4][Order + 1],
+    const Real (&dphi3)[4][Order + 1],
+    const Real (&d2phi0)[4][Order + 1],
+    const Real (&d2phi1)[4][Order + 1],
+    const Real (&d2phi2)[4][Order + 1],
+    const Real (&d2phi3)[4][Order + 1],
+    std::size_t i0,
+    std::size_t i1,
+    std::size_t i2,
+    std::size_t i3,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT H) {
+    const Real v0 = phi0[Q][i0];
+    const Real v1 = phi1[Q][i1];
+    const Real v2 = phi2[Q][i2];
+    const Real v3 = phi3[Q][i3];
+    const Real D0 = dphi0[Q][i0];
+    const Real D1 = dphi1[Q][i1];
+    const Real D2 = dphi2[Q][i2];
+    const Real D3 = dphi3[Q][i3];
+    const Real DD0 = d2phi0[Q][i0];
+    const Real DD1 = d2phi1[Q][i1];
+    const Real DD2 = d2phi2[Q][i2];
+    const Real DD3 = d2phi3[Q][i3];
+    const Real H00 = DD0 * v1 * v2 * v3;
+    const Real H11 = v0 * DD1 * v2 * v3;
+    const Real H22 = v0 * v1 * DD2 * v3;
+    const Real H33 = v0 * v1 * v2 * DD3;
+    const Real H01 = D0 * D1 * v2 * v3;
+    const Real H02 = D0 * v1 * D2 * v3;
+    const Real H03 = D0 * v1 * v2 * D3;
+    const Real H12 = v0 * D1 * D2 * v3;
+    const Real H13 = v0 * D1 * v2 * D3;
+    const Real H23 = v0 * v1 * D2 * D3;
+    const Real h01 = H00 - H01 - H02 + H12;
+    const Real h02 = H00 - H01 - H03 + H13;
+    const Real h12 = H00 - H02 - H03 + H23;
+    H[0u * output_stride + Q] = H00 - Real(2) * H01 + H11;
+    H[1u * output_stride + Q] = h01;
+    H[2u * output_stride + Q] = h02;
+    H[3u * output_stride + Q] = h01;
+    H[4u * output_stride + Q] = H00 - Real(2) * H02 + H22;
+    H[5u * output_stride + Q] = h12;
+    H[6u * output_stride + Q] = h02;
+    H[7u * output_stride + Q] = h12;
+    H[8u * output_stride + Q] = H00 - Real(2) * H03 + H33;
+}
+
+template <int Order, std::size_t Q>
+inline void write_tetrahedron_simplex_hessian_stride4_q(
+    const Real (&phi0)[4][Order + 1],
+    const Real (&phi1)[4][Order + 1],
+    const Real (&phi2)[4][Order + 1],
+    const Real (&phi3)[4][Order + 1],
+    const Real (&dphi0)[4][Order + 1],
+    const Real (&dphi1)[4][Order + 1],
+    const Real (&dphi2)[4][Order + 1],
+    const Real (&dphi3)[4][Order + 1],
+    const Real (&d2phi0)[4][Order + 1],
+    const Real (&d2phi1)[4][Order + 1],
+    const Real (&d2phi2)[4][Order + 1],
+    const Real (&d2phi3)[4][Order + 1],
+    std::size_t i0,
+    std::size_t i1,
+    std::size_t i2,
+    std::size_t i3,
+    Real* SVMP_RESTRICT H) {
+    const Real v0 = phi0[Q][i0];
+    const Real v1 = phi1[Q][i1];
+    const Real v2 = phi2[Q][i2];
+    const Real v3 = phi3[Q][i3];
+    const Real D0 = dphi0[Q][i0];
+    const Real D1 = dphi1[Q][i1];
+    const Real D2 = dphi2[Q][i2];
+    const Real D3 = dphi3[Q][i3];
+    const Real DD0 = d2phi0[Q][i0];
+    const Real DD1 = d2phi1[Q][i1];
+    const Real DD2 = d2phi2[Q][i2];
+    const Real DD3 = d2phi3[Q][i3];
+    const Real v12 = v1 * v2;
+    const Real v13 = v1 * v3;
+    const Real v23 = v2 * v3;
+    const Real v123 = v1 * v23;
+    const Real v023 = v0 * v23;
+    const Real v013 = v0 * v13;
+    const Real v012 = v0 * v12;
+    const Real H00 = DD0 * v123;
+    const Real H11 = DD1 * v023;
+    const Real H22 = DD2 * v013;
+    const Real H33 = DD3 * v012;
+    const Real H01 = D0 * D1 * v23;
+    const Real H02 = D0 * D2 * v13;
+    const Real H03 = D0 * D3 * v12;
+    const Real H12 = D1 * D2 * v0 * v3;
+    const Real H13 = D1 * D3 * v0 * v2;
+    const Real H23 = D2 * D3 * v0 * v1;
+    const Real h01 = H00 - H01 - H02 + H12;
+    const Real h02 = H00 - H01 - H03 + H13;
+    const Real h12 = H00 - H02 - H03 + H23;
+    H[Q] = H00 - Real(2) * H01 + H11;
+    H[4u + Q] = h01;
+    H[8u + Q] = h02;
+    H[12u + Q] = h01;
+    H[16u + Q] = H00 - Real(2) * H02 + H22;
+    H[20u + Q] = h12;
+    H[24u + Q] = h02;
+    H[28u + Q] = h12;
+    H[32u + Q] = H00 - Real(2) * H03 + H33;
+}
+
+template <int Order, std::size_t Q>
+inline void write_tetrahedron_simplex_all_stride4_q(
+    const Real (&phi0)[4][Order + 1],
+    const Real (&phi1)[4][Order + 1],
+    const Real (&phi2)[4][Order + 1],
+    const Real (&phi3)[4][Order + 1],
+    const Real (&dphi0)[4][Order + 1],
+    const Real (&dphi1)[4][Order + 1],
+    const Real (&dphi2)[4][Order + 1],
+    const Real (&dphi3)[4][Order + 1],
+    const Real (&d2phi0)[4][Order + 1],
+    const Real (&d2phi1)[4][Order + 1],
+    const Real (&d2phi2)[4][Order + 1],
+    const Real (&d2phi3)[4][Order + 1],
+    std::size_t i0,
+    std::size_t i1,
+    std::size_t i2,
+    std::size_t i3,
+    Real* SVMP_RESTRICT value_row,
+    Real* SVMP_RESTRICT g,
+    Real* SVMP_RESTRICT H) {
+    const Real v0 = phi0[Q][i0];
+    const Real v1 = phi1[Q][i1];
+    const Real v2 = phi2[Q][i2];
+    const Real v3 = phi3[Q][i3];
+    const Real D0 = dphi0[Q][i0];
+    const Real D1 = dphi1[Q][i1];
+    const Real D2 = dphi2[Q][i2];
+    const Real D3 = dphi3[Q][i3];
+    const Real DD0 = d2phi0[Q][i0];
+    const Real DD1 = d2phi1[Q][i1];
+    const Real DD2 = d2phi2[Q][i2];
+    const Real DD3 = d2phi3[Q][i3];
+    const Real v12 = v1 * v2;
+    const Real v13 = v1 * v3;
+    const Real v23 = v2 * v3;
+    const Real v123 = v1 * v23;
+    const Real v023 = v0 * v23;
+    const Real v013 = v0 * v13;
+    const Real v012 = v0 * v12;
+    const Real dl0 = D0 * v123;
+    const Real H00 = DD0 * v123;
+    const Real H11 = DD1 * v023;
+    const Real H22 = DD2 * v013;
+    const Real H33 = DD3 * v012;
+    const Real H01 = D0 * D1 * v23;
+    const Real H02 = D0 * D2 * v13;
+    const Real H03 = D0 * D3 * v12;
+    const Real H12 = D1 * D2 * v0 * v3;
+    const Real H13 = D1 * D3 * v0 * v2;
+    const Real H23 = D2 * D3 * v0 * v1;
+    const Real h01 = H00 - H01 - H02 + H12;
+    const Real h02 = H00 - H01 - H03 + H13;
+    const Real h12 = H00 - H02 - H03 + H23;
+
+    value_row[Q] = v0 * v123;
+    g[Q] = D1 * v023 - dl0;
+    g[4u + Q] = D2 * v013 - dl0;
+    g[8u + Q] = D3 * v012 - dl0;
+    H[Q] = H00 - Real(2) * H01 + H11;
+    H[4u + Q] = h01;
+    H[8u + Q] = h02;
+    H[12u + Q] = h01;
+    H[16u + Q] = H00 - Real(2) * H02 + H22;
+    H[20u + Q] = h12;
+    H[24u + Q] = h02;
+    H[28u + Q] = h12;
+    H[32u + Q] = H00 - Real(2) * H03 + H33;
+}
+
+template <int Order>
+void evaluate_tetrahedron_simplex_hessian_outputs_q4(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    static_assert(Order >= 2 && Order <= 8, "specialized simplex path covers orders 2..8");
+
+    Real phi0[4][Order + 1];
+    Real phi1[4][Order + 1];
+    Real phi2[4][Order + 1];
+    Real phi3[4][Order + 1];
+    Real dphi0[4][Order + 1];
+    Real dphi1[4][Order + 1];
+    Real dphi2[4][Order + 1];
+    Real dphi3[4][Order + 1];
+    Real d2phi0[4][Order + 1];
+    Real d2phi1[4][Order + 1];
+    Real d2phi2[4][Order + 1];
+    Real d2phi3[4][Order + 1];
+
+    for (std::size_t q = 0; q < 4u; ++q) {
+        const auto& xi = points[q];
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l3 = xi[2];
+        const Real l0 = Real(1) - l1 - l2 - l3;
+        simplex_lagrange_factor_sequence_impl<true, true>(
+            Order, l0, phi0[q], dphi0[q], d2phi0[q]);
+        simplex_lagrange_factor_sequence_impl<true, true>(
+            Order, l1, phi1[q], dphi1[q], d2phi1[q]);
+        simplex_lagrange_factor_sequence_impl<true, true>(
+            Order, l2, phi2[q], dphi2[q], d2phi2[q]);
+        simplex_lagrange_factor_sequence_impl<true, true>(
+            Order, l3, phi3[q], dphi3[q], d2phi3[q]);
+    }
+
+    const std::size_t num_nodes = simplex_exponents.size();
+    if (values_out == nullptr && gradients_out == nullptr) {
+        if (output_stride == 4u) {
+            for (std::size_t node = 0; node < num_nodes; ++node) {
+                const auto& e = simplex_exponents[node];
+                const std::size_t i0 = static_cast<std::size_t>(e[0]);
+                const std::size_t i1 = static_cast<std::size_t>(e[1]);
+                const std::size_t i2 = static_cast<std::size_t>(e[2]);
+                const std::size_t i3 = static_cast<std::size_t>(e[3]);
+                Real* SVMP_RESTRICT H = hessians_out + node * 36u;
+                write_tetrahedron_simplex_hessian_stride4_q<Order, 0>(
+                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
+                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, H);
+                write_tetrahedron_simplex_hessian_stride4_q<Order, 1>(
+                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
+                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, H);
+                write_tetrahedron_simplex_hessian_stride4_q<Order, 2>(
+                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
+                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, H);
+                write_tetrahedron_simplex_hessian_stride4_q<Order, 3>(
+                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
+                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, H);
+            }
+        } else {
+            for (std::size_t node = 0; node < num_nodes; ++node) {
+                const auto& e = simplex_exponents[node];
+                const std::size_t i0 = static_cast<std::size_t>(e[0]);
+                const std::size_t i1 = static_cast<std::size_t>(e[1]);
+                const std::size_t i2 = static_cast<std::size_t>(e[2]);
+                const std::size_t i3 = static_cast<std::size_t>(e[3]);
+                Real* SVMP_RESTRICT H = hessians_out + node * 9u * output_stride;
+                write_tetrahedron_simplex_hessian_q4<Order, 0>(
+                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
+                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, output_stride, H);
+                write_tetrahedron_simplex_hessian_q4<Order, 1>(
+                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
+                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, output_stride, H);
+                write_tetrahedron_simplex_hessian_q4<Order, 2>(
+                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
+                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, output_stride, H);
+                write_tetrahedron_simplex_hessian_q4<Order, 3>(
+                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
+                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, output_stride, H);
+            }
+        }
+        return;
+    }
+
+    if (values_out != nullptr && gradients_out != nullptr) {
+        if (output_stride == 4u) {
+            for (std::size_t node = 0; node < num_nodes; ++node) {
+                const auto& e = simplex_exponents[node];
+                const std::size_t i0 = static_cast<std::size_t>(e[0]);
+                const std::size_t i1 = static_cast<std::size_t>(e[1]);
+                const std::size_t i2 = static_cast<std::size_t>(e[2]);
+                const std::size_t i3 = static_cast<std::size_t>(e[3]);
+                Real* SVMP_RESTRICT value_row = values_out + node * output_stride;
+                Real* SVMP_RESTRICT g = gradients_out + node * 3u * output_stride;
+                Real* SVMP_RESTRICT H = hessians_out + node * 9u * output_stride;
+                write_tetrahedron_simplex_all_stride4_q<Order, 0>(
+                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
+                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, value_row, g, H);
+                write_tetrahedron_simplex_all_stride4_q<Order, 1>(
+                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
+                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, value_row, g, H);
+                write_tetrahedron_simplex_all_stride4_q<Order, 2>(
+                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
+                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, value_row, g, H);
+                write_tetrahedron_simplex_all_stride4_q<Order, 3>(
+                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
+                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, value_row, g, H);
+            }
+            return;
+        }
+
+        for (std::size_t node = 0; node < num_nodes; ++node) {
+            const auto& e = simplex_exponents[node];
+            const std::size_t i0 = static_cast<std::size_t>(e[0]);
+            const std::size_t i1 = static_cast<std::size_t>(e[1]);
+            const std::size_t i2 = static_cast<std::size_t>(e[2]);
+            const std::size_t i3 = static_cast<std::size_t>(e[3]);
+            Real* SVMP_RESTRICT value_row = values_out + node * output_stride;
+            Real* SVMP_RESTRICT g = gradients_out + node * 3u * output_stride;
+            Real* SVMP_RESTRICT H = hessians_out + node * 9u * output_stride;
+
+            for (std::size_t q = 0; q < 4u; ++q) {
+                const Real v0 = phi0[q][i0];
+                const Real v1 = phi1[q][i1];
+                const Real v2 = phi2[q][i2];
+                const Real v3 = phi3[q][i3];
+                const Real D0 = dphi0[q][i0];
+                const Real D1 = dphi1[q][i1];
+                const Real D2 = dphi2[q][i2];
+                const Real D3 = dphi3[q][i3];
+                const Real DD0 = d2phi0[q][i0];
+                const Real DD1 = d2phi1[q][i1];
+                const Real DD2 = d2phi2[q][i2];
+                const Real DD3 = d2phi3[q][i3];
+                const Real v12 = v1 * v2;
+                const Real v13 = v1 * v3;
+                const Real v23 = v2 * v3;
+                const Real v123 = v1 * v23;
+                const Real v023 = v0 * v23;
+                const Real v013 = v0 * v13;
+                const Real v012 = v0 * v12;
+                const Real dl0 = D0 * v123;
+                const Real H00 = DD0 * v123;
+                const Real H11 = DD1 * v023;
+                const Real H22 = DD2 * v013;
+                const Real H33 = DD3 * v012;
+                const Real H01 = D0 * D1 * v23;
+                const Real H02 = D0 * D2 * v13;
+                const Real H03 = D0 * D3 * v12;
+                const Real H12 = D1 * D2 * v0 * v3;
+                const Real H13 = D1 * D3 * v0 * v2;
+                const Real H23 = D2 * D3 * v0 * v1;
+                const Real h01 = H00 - H01 - H02 + H12;
+                const Real h02 = H00 - H01 - H03 + H13;
+                const Real h12 = H00 - H02 - H03 + H23;
+
+                value_row[q] = v0 * v123;
+                g[0u * output_stride + q] = D1 * v023 - dl0;
+                g[1u * output_stride + q] = D2 * v013 - dl0;
+                g[2u * output_stride + q] = D3 * v012 - dl0;
+                H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
+                H[1u * output_stride + q] = h01;
+                H[2u * output_stride + q] = h02;
+                H[3u * output_stride + q] = h01;
+                H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
+                H[5u * output_stride + q] = h12;
+                H[6u * output_stride + q] = h02;
+                H[7u * output_stride + q] = h12;
+                H[8u * output_stride + q] = H00 - Real(2) * H03 + H33;
+            }
+        }
+        return;
+    }
+
+    for (std::size_t node = 0; node < num_nodes; ++node) {
+        const auto& e = simplex_exponents[node];
+        const std::size_t i0 = static_cast<std::size_t>(e[0]);
+        const std::size_t i1 = static_cast<std::size_t>(e[1]);
+        const std::size_t i2 = static_cast<std::size_t>(e[2]);
+        const std::size_t i3 = static_cast<std::size_t>(e[3]);
+        Real* SVMP_RESTRICT value_row = values_out ? values_out + node * output_stride : nullptr;
+        Real* SVMP_RESTRICT g = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
+        Real* SVMP_RESTRICT H = hessians_out + node * 9u * output_stride;
+
+        for (std::size_t q = 0; q < 4u; ++q) {
+            const Real v0 = phi0[q][i0];
+            const Real v1 = phi1[q][i1];
+            const Real v2 = phi2[q][i2];
+            const Real v3 = phi3[q][i3];
+            if (value_row != nullptr) {
+                value_row[q] = v0 * v1 * v2 * v3;
+            }
+
+            const Real D0 = dphi0[q][i0];
+            const Real D1 = dphi1[q][i1];
+            const Real D2 = dphi2[q][i2];
+            const Real D3 = dphi3[q][i3];
+            if (g != nullptr) {
+                const Real dl0 = D0 * v1 * v2 * v3;
+                g[0u * output_stride + q] = v0 * D1 * v2 * v3 - dl0;
+                g[1u * output_stride + q] = v0 * v1 * D2 * v3 - dl0;
+                g[2u * output_stride + q] = v0 * v1 * v2 * D3 - dl0;
+            }
+
+            const Real DD0 = d2phi0[q][i0];
+            const Real DD1 = d2phi1[q][i1];
+            const Real DD2 = d2phi2[q][i2];
+            const Real DD3 = d2phi3[q][i3];
+            const Real H00 = DD0 * v1 * v2 * v3;
+            const Real H11 = v0 * DD1 * v2 * v3;
+            const Real H22 = v0 * v1 * DD2 * v3;
+            const Real H33 = v0 * v1 * v2 * DD3;
+            const Real H01 = D0 * D1 * v2 * v3;
+            const Real H02 = D0 * v1 * D2 * v3;
+            const Real H03 = D0 * v1 * v2 * D3;
+            const Real H12 = v0 * D1 * D2 * v3;
+            const Real H13 = v0 * D1 * v2 * D3;
+            const Real H23 = v0 * v1 * D2 * D3;
+            const Real h01 = H00 - H01 - H02 + H12;
+            const Real h02 = H00 - H01 - H03 + H13;
+            const Real h12 = H00 - H02 - H03 + H23;
+            H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
+            H[1u * output_stride + q] = h01;
+            H[2u * output_stride + q] = h02;
+            H[3u * output_stride + q] = h01;
+            H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
+            H[5u * output_stride + q] = h12;
+            H[6u * output_stride + q] = h02;
+            H[7u * output_stride + q] = h12;
+            H[8u * output_stride + q] = H00 - Real(2) * H03 + H33;
+        }
+    }
+}
+
+bool try_evaluate_tetrahedron_simplex_hessian_outputs_q4(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    switch (order) {
+    case 2:
+        evaluate_tetrahedron_simplex_hessian_outputs_q4<2>(
+            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
+        return true;
+    case 3:
+        evaluate_tetrahedron_simplex_hessian_outputs_q4<3>(
+            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
+        return true;
+    case 4:
+        evaluate_tetrahedron_simplex_hessian_outputs_q4<4>(
+            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
+        return true;
+    case 5:
+        evaluate_tetrahedron_simplex_hessian_outputs_q4<5>(
+            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
+        return true;
+    case 6:
+        evaluate_tetrahedron_simplex_hessian_outputs_q4<6>(
+            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
+        return true;
+    case 7:
+        evaluate_tetrahedron_simplex_hessian_outputs_q4<7>(
+            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
+        return true;
+    case 8:
+        evaluate_tetrahedron_simplex_hessian_outputs_q4<8>(
+            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
+        return true;
+    default:
+        return false;
+    }
+}
+
+// Per-thread scratch space for simplex factor sequences. Common low orders use
+// fixed storage; higher orders fall back to dynamic vectors.
+struct SimplexAxisScratch {
+    std::size_t size{0};
+    std::array<Real, kFixedSimplexAxisSize> phi_fixed{};
+    std::array<Real, kFixedSimplexAxisSize> dphi_fixed{};
+    std::array<Real, kFixedSimplexAxisSize> d2phi_fixed{};
+    std::vector<Real> phi_dynamic;
+    std::vector<Real> dphi_dynamic;
+    std::vector<Real> d2phi_dynamic;
+
+    void reserveFor(std::size_t n) {
+        size = n;
+        if (n <= kFixedSimplexAxisSize) {
+            return;
+        }
+        if (phi_dynamic.size() < n) phi_dynamic.resize(n);
+        if (dphi_dynamic.size() < n) dphi_dynamic.resize(n);
+        if (d2phi_dynamic.size() < n) d2phi_dynamic.resize(n);
+    }
+
+    Real* phi() noexcept {
+        return size <= kFixedSimplexAxisSize ? phi_fixed.data() : phi_dynamic.data();
+    }
+
+    Real* dphi() noexcept {
+        return size <= kFixedSimplexAxisSize ? dphi_fixed.data() : dphi_dynamic.data();
+    }
+
+    Real* d2phi() noexcept {
+        return size <= kFixedSimplexAxisSize ? d2phi_fixed.data() : d2phi_dynamic.data();
+    }
+
+    const Real* phi() const noexcept {
+        return size <= kFixedSimplexAxisSize ? phi_fixed.data() : phi_dynamic.data();
+    }
+
+    const Real* dphi() const noexcept {
+        return size <= kFixedSimplexAxisSize ? dphi_fixed.data() : dphi_dynamic.data();
+    }
+
+    const Real* d2phi() const noexcept {
+        return size <= kFixedSimplexAxisSize ? d2phi_fixed.data() : d2phi_dynamic.data();
+    }
+};
+
+SimplexAxisScratch& simplex_axis_scratch_slot(int slot) {
+    thread_local SimplexAxisScratch s[4];
+    return s[slot];
+}
+
+struct SimplexVectorSink {
+    std::vector<Real>* values;
+    std::vector<Gradient>* gradients;
+    std::vector<Hessian>* hessians;
+
+    bool wants_values() const noexcept { return values != nullptr; }
+    bool wants_gradients() const noexcept { return gradients != nullptr; }
+    bool wants_hessians() const noexcept { return hessians != nullptr; }
+
+    void prepare(std::size_t n_nodes) const {
+        if (values)    values->resize(n_nodes);
+        if (gradients) gradients->resize(n_nodes);
+        if (hessians)  hessians->resize(n_nodes);
+    }
+
+    void write_value(std::size_t n, Real value) const {
+        (*values)[n] = value;
+    }
+
+    void write_gradient(std::size_t n, Real x, Real y, Real z) const {
+        auto& gradient = (*gradients)[n];
+        gradient[0] = x;
+        gradient[1] = y;
+        gradient[2] = z;
+    }
+
+    void write_hessian(std::size_t n,
+                       Real xx,
+                       Real yy,
+                       Real zz,
+                       Real xy,
+                       Real xz,
+                       Real yz) const {
+        Hessian hessian{};
+        hessian(0, 0) = xx;
+        hessian(1, 1) = yy;
+        hessian(2, 2) = zz;
+        hessian(0, 1) = xy; hessian(1, 0) = xy;
+        hessian(0, 2) = xz; hessian(2, 0) = xz;
+        hessian(1, 2) = yz; hessian(2, 1) = yz;
+        (*hessians)[n] = hessian;
+    }
+};
+
+struct SimplexRawSink {
+    Real* values;
+    Real* gradients;
+    Real* hessians;
+
+    bool wants_values() const noexcept { return values != nullptr; }
+    bool wants_gradients() const noexcept { return gradients != nullptr; }
+    bool wants_hessians() const noexcept { return hessians != nullptr; }
+
+    void prepare(std::size_t) const {}
+
+    void write_value(std::size_t n, Real value) const {
+        values[n] = value;
+    }
+
+    void write_gradient(std::size_t n, Real x, Real y, Real z) const {
+        Real* gradient = gradients + n * 3u;
+        gradient[0] = x;
+        gradient[1] = y;
+        gradient[2] = z;
+    }
+
+    void write_hessian(std::size_t n,
+                       Real xx,
+                       Real yy,
+                       Real zz,
+                       Real xy,
+                       Real xz,
+                       Real yz) const {
+        Real* hessian = hessians + n * 9u;
+        hessian[0] = xx;
+        hessian[1] = xy;
+        hessian[2] = xz;
+        hessian[3] = xy;
+        hessian[4] = yy;
+        hessian[5] = yz;
+        hessian[6] = xz;
+        hessian[7] = yz;
+        hessian[8] = zz;
+    }
+};
+
+template <typename Sink>
+void evaluate_triangle_simplex_basis_impl(const std::vector<std::array<int, 4>>& simplex_exponents,
+                                          int order,
+                                          const math::Vector<Real, 3>& xi,
+                                          const Sink& sink) {
+    const Real l1 = xi[0];
+    const Real l2 = xi[1];
+    const Real l0 = Real(1) - l1 - l2;
+
+    const std::size_t n = static_cast<std::size_t>(order + 1);
+    SimplexAxisScratch& s0 = simplex_axis_scratch_slot(0);
+    SimplexAxisScratch& s1 = simplex_axis_scratch_slot(1);
+    SimplexAxisScratch& s2 = simplex_axis_scratch_slot(2);
+    s0.reserveFor(n);
+    s1.reserveFor(n);
+    s2.reserveFor(n);
+
+    const std::size_t num_nodes = simplex_exponents.size();
+    sink.prepare(num_nodes);
+    const bool need_values = sink.wants_values();
+    const bool need_gradients = sink.wants_gradients();
+    const bool need_hessians = sink.wants_hessians();
+    Real* d0_out = (need_gradients || need_hessians) ? s0.dphi() : nullptr;
+    Real* d1_out = (need_gradients || need_hessians) ? s1.dphi() : nullptr;
+    Real* d2_out = (need_gradients || need_hessians) ? s2.dphi() : nullptr;
+    Real* d20_out = need_hessians ? s0.d2phi() : nullptr;
+    Real* d21_out = need_hessians ? s1.d2phi() : nullptr;
+    Real* d22_out = need_hessians ? s2.d2phi() : nullptr;
+
+    simplex_lagrange_factor_sequence(order, l0, s0.phi(), d0_out, d20_out);
+    simplex_lagrange_factor_sequence(order, l1, s1.phi(), d1_out, d21_out);
+    simplex_lagrange_factor_sequence(order, l2, s2.phi(), d2_out, d22_out);
+    const Real* phi0 = s0.phi();
+    const Real* phi1 = s1.phi();
+    const Real* phi2 = s2.phi();
+    const Real* dphi0 = s0.dphi();
+    const Real* dphi1 = s1.dphi();
+    const Real* dphi2 = s2.dphi();
+    const Real* d2phi0 = s0.d2phi();
+    const Real* d2phi1 = s1.d2phi();
+    const Real* d2phi2 = s2.d2phi();
+
+    for (std::size_t n_idx = 0; n_idx < num_nodes; ++n_idx) {
+        const auto& e = simplex_exponents[n_idx];
+        const std::size_t i0 = static_cast<std::size_t>(e[0]);
+        const std::size_t i1 = static_cast<std::size_t>(e[1]);
+        const std::size_t i2 = static_cast<std::size_t>(e[2]);
+
+        const Real v0 = phi0[i0];
+        const Real v1 = phi1[i1];
+        const Real v2 = phi2[i2];
+        if (need_values) {
+            sink.write_value(n_idx, v0 * v1 * v2);
+        }
+        if (!need_gradients && !need_hessians) {
+            continue;
+        }
+
+        const Real D0 = dphi0[i0];
+        const Real D1 = dphi1[i1];
+        const Real D2 = dphi2[i2];
+
+        if (need_gradients) {
+            const Real dl0 = D0 * v1 * v2;
+            const Real dl1 = v0 * D1 * v2;
+            const Real dl2 = v0 * v1 * D2;
+            sink.write_gradient(n_idx, dl1 - dl0, dl2 - dl0, Real(0));
+        }
+
+        if (need_hessians) {
+            const Real DD0 = d2phi0[i0];
+            const Real DD1 = d2phi1[i1];
+            const Real DD2 = d2phi2[i2];
+
+            const Real H00 = DD0 * v1 * v2;
+            const Real H11 = v0 * DD1 * v2;
+            const Real H22 = v0 * v1 * DD2;
+            const Real H01 = D0 * D1 * v2;
+            const Real H02 = D0 * v1 * D2;
+            const Real H12 = v0 * D1 * D2;
+
+            sink.write_hessian(n_idx,
+                               H00 - Real(2) * H01 + H11,
+                               H00 - Real(2) * H02 + H22,
+                               Real(0),
+                               H00 - H01 - H02 + H12,
+                               Real(0),
+                               Real(0));
+        }
+    }
+}
+
+void evaluate_triangle_simplex_basis(const std::vector<std::array<int, 4>>& simplex_exponents,
+                                     int order,
+                                     const math::Vector<Real, 3>& xi,
+                                     std::vector<Real>* values,
+                                     std::vector<Gradient>* gradients,
+                                     std::vector<Hessian>* hessians) {
+    const SimplexVectorSink sink{values, gradients, hessians};
+    evaluate_triangle_simplex_basis_impl(simplex_exponents, order, xi, sink);
+}
+
+void evaluate_triangle_simplex_basis_to(const std::vector<std::array<int, 4>>& simplex_exponents,
+                                        int order,
+                                        const math::Vector<Real, 3>& xi,
+                                        Real* SVMP_RESTRICT values_out,
+                                        Real* SVMP_RESTRICT gradients_out,
+                                        Real* SVMP_RESTRICT hessians_out) {
+    const SimplexRawSink sink{values_out, gradients_out, hessians_out};
+    evaluate_triangle_simplex_basis_impl(simplex_exponents, order, xi, sink);
+}
+
+void evaluate_triangle_simplex_basis_strided(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    const std::size_t num_nodes = simplex_exponents.size();
+    if (points.empty() || num_nodes == 0u) {
+        return;
+    }
+
+    const std::size_t sequence_size = static_cast<std::size_t>(order + 1);
+    const std::size_t num_qpts = points.size();
+    const bool need_gradients = gradients_out != nullptr;
+    const bool need_hessians = hessians_out != nullptr;
+    if (num_qpts == 4u &&
+        values_out != nullptr &&
+        !need_gradients &&
+        !need_hessians &&
+        try_evaluate_triangle_simplex_values_q4(
+            simplex_exponents, order, points, output_stride, values_out)) {
+        return;
+    }
+    if (num_qpts == 4u &&
+        values_out == nullptr &&
+        need_gradients &&
+        !need_hessians &&
+        try_evaluate_triangle_simplex_gradients_q4(
+            simplex_exponents, order, points, output_stride, gradients_out)) {
+        return;
+    }
+    if (num_qpts == 4u &&
+        need_hessians &&
+        try_evaluate_triangle_simplex_hessian_outputs_q4(
+            simplex_exponents, order, points, output_stride,
+            values_out, gradients_out, hessians_out)) {
+        return;
+    }
+    const std::size_t batch_entries = sequence_size * num_qpts;
+    if (batch_entries <= kFixedSimplexBatchEntries) {
+        if (values_out != nullptr && gradients_out == nullptr && hessians_out == nullptr) {
+            std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
+
+            for (std::size_t q = 0; q < num_qpts; ++q) {
+                const auto& xi = points[q];
+                const Real l1 = xi[0];
+                const Real l2 = xi[1];
+                const Real l0 = Real(1) - l1 - l2;
+                const std::size_t offset = q * sequence_size;
+                simplex_lagrange_factor_sequence(
+                    order, l0, phi0_batch.data() + offset, nullptr, nullptr);
+                simplex_lagrange_factor_sequence(
+                    order, l1, phi1_batch.data() + offset, nullptr, nullptr);
+                simplex_lagrange_factor_sequence(
+                    order, l2, phi2_batch.data() + offset, nullptr, nullptr);
+            }
+
+            for (std::size_t node = 0; node < num_nodes; ++node) {
+                const auto& e = simplex_exponents[node];
+                const std::size_t i0 = static_cast<std::size_t>(e[0]);
+                const std::size_t i1 = static_cast<std::size_t>(e[1]);
+                const std::size_t i2 = static_cast<std::size_t>(e[2]);
+                Real* value_row = values_out + node * output_stride;
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    const std::size_t offset = q * sequence_size;
+                    value_row[q] =
+                        phi0_batch[offset + i0] *
+                        phi1_batch[offset + i1] *
+                        phi2_batch[offset + i2];
+                }
+            }
+            return;
+        }
+
+        if (values_out == nullptr && gradients_out != nullptr && hessians_out == nullptr) {
+            std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
+
+            for (std::size_t q = 0; q < num_qpts; ++q) {
+                const auto& xi = points[q];
+                const Real l1 = xi[0];
+                const Real l2 = xi[1];
+                const Real l0 = Real(1) - l1 - l2;
+                const std::size_t offset = q * sequence_size;
+                simplex_lagrange_factor_sequence(
+                    order, l0, phi0_batch.data() + offset, dphi0_batch.data() + offset, nullptr);
+                simplex_lagrange_factor_sequence(
+                    order, l1, phi1_batch.data() + offset, dphi1_batch.data() + offset, nullptr);
+                simplex_lagrange_factor_sequence(
+                    order, l2, phi2_batch.data() + offset, dphi2_batch.data() + offset, nullptr);
+            }
+
+            for (std::size_t node = 0; node < num_nodes; ++node) {
+                const auto& e = simplex_exponents[node];
+                const std::size_t i0 = static_cast<std::size_t>(e[0]);
+                const std::size_t i1 = static_cast<std::size_t>(e[1]);
+                const std::size_t i2 = static_cast<std::size_t>(e[2]);
+                Real* g = gradients_out + node * 3u * output_stride;
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    const std::size_t offset = q * sequence_size;
+                    const Real v0 = phi0_batch[offset + i0];
+                    const Real v1 = phi1_batch[offset + i1];
+                    const Real v2 = phi2_batch[offset + i2];
+                    const Real D0 = dphi0_batch[offset + i0];
+                    const Real D1 = dphi1_batch[offset + i1];
+                    const Real D2 = dphi2_batch[offset + i2];
+                    const Real dl0 = D0 * v1 * v2;
+                    g[0u * output_stride + q] = v0 * D1 * v2 - dl0;
+                    g[1u * output_stride + q] = v0 * v1 * D2 - dl0;
+                    g[2u * output_stride + q] = Real(0);
+                }
+            }
+            return;
+        }
+
+        if (order >= 4 &&
+            values_out == nullptr &&
+            gradients_out == nullptr &&
+            hessians_out != nullptr) {
+            std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
+            std::array<Real, kFixedSimplexBatchEntries> d2phi0_batch;
+            std::array<Real, kFixedSimplexBatchEntries> d2phi1_batch;
+            std::array<Real, kFixedSimplexBatchEntries> d2phi2_batch;
+
+            for (std::size_t q = 0; q < num_qpts; ++q) {
+                const auto& xi = points[q];
+                const Real l1 = xi[0];
+                const Real l2 = xi[1];
+                const Real l0 = Real(1) - l1 - l2;
+                const std::size_t offset = q * sequence_size;
+                simplex_lagrange_factor_sequence(
+                    order, l0, phi0_batch.data() + offset,
+                    dphi0_batch.data() + offset, d2phi0_batch.data() + offset);
+                simplex_lagrange_factor_sequence(
+                    order, l1, phi1_batch.data() + offset,
+                    dphi1_batch.data() + offset, d2phi1_batch.data() + offset);
+                simplex_lagrange_factor_sequence(
+                    order, l2, phi2_batch.data() + offset,
+                    dphi2_batch.data() + offset, d2phi2_batch.data() + offset);
+            }
+
+            for (std::size_t node = 0; node < num_nodes; ++node) {
+                const auto& e = simplex_exponents[node];
+                const std::size_t i0 = static_cast<std::size_t>(e[0]);
+                const std::size_t i1 = static_cast<std::size_t>(e[1]);
+                const std::size_t i2 = static_cast<std::size_t>(e[2]);
+                Real* H = hessians_out + node * 9u * output_stride;
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    const std::size_t offset = q * sequence_size;
+                    const Real v0 = phi0_batch[offset + i0];
+                    const Real v1 = phi1_batch[offset + i1];
+                    const Real v2 = phi2_batch[offset + i2];
+                    const Real D0 = dphi0_batch[offset + i0];
+                    const Real D1 = dphi1_batch[offset + i1];
+                    const Real D2 = dphi2_batch[offset + i2];
+                    const Real DD0 = d2phi0_batch[offset + i0];
+                    const Real DD1 = d2phi1_batch[offset + i1];
+                    const Real DD2 = d2phi2_batch[offset + i2];
+                    const Real H00 = DD0 * v1 * v2;
+                    const Real H11 = v0 * DD1 * v2;
+                    const Real H22 = v0 * v1 * DD2;
+                    const Real H01 = D0 * D1 * v2;
+                    const Real H02 = D0 * v1 * D2;
+                    const Real H12 = v0 * D1 * D2;
+                    const Real h01 = H00 - H01 - H02 + H12;
+
+                    H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
+                    H[1u * output_stride + q] = h01;
+                    H[2u * output_stride + q] = Real(0);
+                    H[3u * output_stride + q] = h01;
+                    H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
+                    H[5u * output_stride + q] = Real(0);
+                    H[6u * output_stride + q] = Real(0);
+                    H[7u * output_stride + q] = Real(0);
+                    H[8u * output_stride + q] = Real(0);
+                }
+            }
+            return;
+        }
+
+        std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
+        std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
+        std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
+        std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
+        std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
+        std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
+        std::array<Real, kFixedSimplexBatchEntries> d2phi0_batch;
+        std::array<Real, kFixedSimplexBatchEntries> d2phi1_batch;
+        std::array<Real, kFixedSimplexBatchEntries> d2phi2_batch;
+
+        for (std::size_t q = 0; q < num_qpts; ++q) {
+            const auto& xi = points[q];
+            const Real l1 = xi[0];
+            const Real l2 = xi[1];
+            const Real l0 = Real(1) - l1 - l2;
+            const std::size_t offset = q * sequence_size;
+            Real* d0_out = (need_gradients || need_hessians) ? dphi0_batch.data() + offset : nullptr;
+            Real* d1_out = (need_gradients || need_hessians) ? dphi1_batch.data() + offset : nullptr;
+            Real* d2_out = (need_gradients || need_hessians) ? dphi2_batch.data() + offset : nullptr;
+            Real* d20_out = need_hessians ? d2phi0_batch.data() + offset : nullptr;
+            Real* d21_out = need_hessians ? d2phi1_batch.data() + offset : nullptr;
+            Real* d22_out = need_hessians ? d2phi2_batch.data() + offset : nullptr;
+            simplex_lagrange_factor_sequence(order, l0, phi0_batch.data() + offset, d0_out, d20_out);
+            simplex_lagrange_factor_sequence(order, l1, phi1_batch.data() + offset, d1_out, d21_out);
+            simplex_lagrange_factor_sequence(order, l2, phi2_batch.data() + offset, d2_out, d22_out);
+        }
+
+        for (std::size_t node = 0; node < num_nodes; ++node) {
+            const auto& e = simplex_exponents[node];
+            const std::size_t i0 = static_cast<std::size_t>(e[0]);
+            const std::size_t i1 = static_cast<std::size_t>(e[1]);
+            const std::size_t i2 = static_cast<std::size_t>(e[2]);
+            Real* value_row = values_out ? values_out + node * output_stride : nullptr;
+            Real* g = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
+            Real* H = hessians_out ? hessians_out + node * 9u * output_stride : nullptr;
+
+            for (std::size_t q = 0; q < num_qpts; ++q) {
+                const std::size_t offset = q * sequence_size;
+                const Real v0 = phi0_batch[offset + i0];
+                const Real v1 = phi1_batch[offset + i1];
+                const Real v2 = phi2_batch[offset + i2];
+                if (value_row != nullptr) {
+                    value_row[q] = v0 * v1 * v2;
+                }
+                if (!need_gradients && !need_hessians) {
+                    continue;
+                }
+
+                const Real D0 = dphi0_batch[offset + i0];
+                const Real D1 = dphi1_batch[offset + i1];
+                const Real D2 = dphi2_batch[offset + i2];
+
+                if (gradients_out != nullptr) {
+                    const Real dl0 = D0 * v1 * v2;
+                    const Real dl1 = v0 * D1 * v2;
+                    const Real dl2 = v0 * v1 * D2;
+                    g[0u * output_stride + q] = dl1 - dl0;
+                    g[1u * output_stride + q] = dl2 - dl0;
+                    g[2u * output_stride + q] = Real(0);
+                }
+
+                if (hessians_out != nullptr) {
+                    const Real DD0 = d2phi0_batch[offset + i0];
+                    const Real DD1 = d2phi1_batch[offset + i1];
+                    const Real DD2 = d2phi2_batch[offset + i2];
+                    const Real H00 = DD0 * v1 * v2;
+                    const Real H11 = v0 * DD1 * v2;
+                    const Real H22 = v0 * v1 * DD2;
+                    const Real H01 = D0 * D1 * v2;
+                    const Real H02 = D0 * v1 * D2;
+                    const Real H12 = v0 * D1 * D2;
+                    const Real h01 = H00 - H01 - H02 + H12;
+                    H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
+                    H[1u * output_stride + q] = h01;
+                    H[2u * output_stride + q] = Real(0);
+                    H[3u * output_stride + q] = h01;
+                    H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
+                    H[5u * output_stride + q] = Real(0);
+                    H[6u * output_stride + q] = Real(0);
+                    H[7u * output_stride + q] = Real(0);
+                    H[8u * output_stride + q] = Real(0);
+                }
+            }
+        }
+        return;
+    }
+
+    SimplexAxisScratch& s0 = simplex_axis_scratch_slot(0);
+    SimplexAxisScratch& s1 = simplex_axis_scratch_slot(1);
+    SimplexAxisScratch& s2 = simplex_axis_scratch_slot(2);
+    s0.reserveFor(sequence_size);
+    s1.reserveFor(sequence_size);
+    s2.reserveFor(sequence_size);
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l0 = Real(1) - l1 - l2;
+
+        Real* d0_out = (need_gradients || need_hessians) ? s0.dphi() : nullptr;
+        Real* d1_out = (need_gradients || need_hessians) ? s1.dphi() : nullptr;
+        Real* d2_out = (need_gradients || need_hessians) ? s2.dphi() : nullptr;
+        Real* d20_out = need_hessians ? s0.d2phi() : nullptr;
+        Real* d21_out = need_hessians ? s1.d2phi() : nullptr;
+        Real* d22_out = need_hessians ? s2.d2phi() : nullptr;
+
+        simplex_lagrange_factor_sequence(order, l0, s0.phi(), d0_out, d20_out);
+        simplex_lagrange_factor_sequence(order, l1, s1.phi(), d1_out, d21_out);
+        simplex_lagrange_factor_sequence(order, l2, s2.phi(), d2_out, d22_out);
+        const Real* phi0 = s0.phi();
+        const Real* phi1 = s1.phi();
+        const Real* phi2 = s2.phi();
+        const Real* dphi0 = s0.dphi();
+        const Real* dphi1 = s1.dphi();
+        const Real* dphi2 = s2.dphi();
+        const Real* d2phi0 = s0.d2phi();
+        const Real* d2phi1 = s1.d2phi();
+        const Real* d2phi2 = s2.d2phi();
+
+        for (std::size_t node = 0; node < num_nodes; ++node) {
+            const auto& e = simplex_exponents[node];
+            const std::size_t i0 = static_cast<std::size_t>(e[0]);
+            const std::size_t i1 = static_cast<std::size_t>(e[1]);
+            const std::size_t i2 = static_cast<std::size_t>(e[2]);
+
+            const Real v0 = phi0[i0];
+            const Real v1 = phi1[i1];
+            const Real v2 = phi2[i2];
+            const Real value = v0 * v1 * v2;
+            if (values_out != nullptr) {
+                values_out[node * output_stride + q] = value;
+            }
+            if (!need_gradients && !need_hessians) {
+                continue;
+            }
+
+            const Real D0 = dphi0[i0];
+            const Real D1 = dphi1[i1];
+            const Real D2 = dphi2[i2];
+
+            if (gradients_out != nullptr) {
+                const Real dl0 = D0 * v1 * v2;
+                const Real dl1 = v0 * D1 * v2;
+                const Real dl2 = v0 * v1 * D2;
+                Real* g = gradients_out + node * 3u * output_stride;
+                g[0u * output_stride + q] = dl1 - dl0;
+                g[1u * output_stride + q] = dl2 - dl0;
+                g[2u * output_stride + q] = Real(0);
+            }
+
+            if (hessians_out != nullptr) {
+                const Real DD0 = d2phi0[i0];
+                const Real DD1 = d2phi1[i1];
+                const Real DD2 = d2phi2[i2];
+
+                const Real H00 = DD0 * v1 * v2;
+                const Real H11 = v0 * DD1 * v2;
+                const Real H22 = v0 * v1 * DD2;
+                const Real H01 = D0 * D1 * v2;
+                const Real H02 = D0 * v1 * D2;
+                const Real H12 = v0 * D1 * D2;
+
+                Real* H = hessians_out + node * 9u * output_stride;
+                const Real h01 = H00 - H01 - H02 + H12;
+                H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
+                H[1u * output_stride + q] = h01;
+                H[2u * output_stride + q] = Real(0);
+                H[3u * output_stride + q] = h01;
+                H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
+                H[5u * output_stride + q] = Real(0);
+                H[6u * output_stride + q] = Real(0);
+                H[7u * output_stride + q] = Real(0);
+                H[8u * output_stride + q] = Real(0);
+            }
+        }
+    }
+}
+
+void evaluate_triangle_simplex_basis_wedge_components_strided(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_xy_out,
+    Real* SVMP_RESTRICT hessians_xx_xy_yy_out) {
+    const std::size_t num_nodes = simplex_exponents.size();
+    if (points.empty() || num_nodes == 0u) {
+        return;
+    }
+
+    const std::size_t sequence_size = static_cast<std::size_t>(order + 1);
+    const std::size_t num_qpts = points.size();
+    const bool need_gradients = gradients_xy_out != nullptr;
+    const bool need_hessians = hessians_xx_xy_yy_out != nullptr;
+    const std::size_t batch_entries = sequence_size * num_qpts;
+
+    if (batch_entries <= kFixedSimplexBatchEntries) {
+        if (values_out != nullptr &&
+            gradients_xy_out != nullptr &&
+            hessians_xx_xy_yy_out == nullptr) {
+            std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
+
+            for (std::size_t q = 0; q < num_qpts; ++q) {
+                const auto& xi = points[q];
+                const Real l1 = xi[0];
+                const Real l2 = xi[1];
+                const Real l0 = Real(1) - l1 - l2;
+                const std::size_t offset = q * sequence_size;
+                simplex_lagrange_factor_sequence(
+                    order, l0, phi0_batch.data() + offset, dphi0_batch.data() + offset, nullptr);
+                simplex_lagrange_factor_sequence(
+                    order, l1, phi1_batch.data() + offset, dphi1_batch.data() + offset, nullptr);
+                simplex_lagrange_factor_sequence(
+                    order, l2, phi2_batch.data() + offset, dphi2_batch.data() + offset, nullptr);
+            }
+
+            for (std::size_t node = 0; node < num_nodes; ++node) {
+                const auto& e = simplex_exponents[node];
+                const std::size_t i0 = static_cast<std::size_t>(e[0]);
+                const std::size_t i1 = static_cast<std::size_t>(e[1]);
+                const std::size_t i2 = static_cast<std::size_t>(e[2]);
+                Real* value_row = values_out + node * output_stride;
+                Real* g = gradients_xy_out + node * 2u * output_stride;
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    const std::size_t offset = q * sequence_size;
+                    const Real v0 = phi0_batch[offset + i0];
+                    const Real v1 = phi1_batch[offset + i1];
+                    const Real v2 = phi2_batch[offset + i2];
+                    const Real D0 = dphi0_batch[offset + i0];
+                    const Real D1 = dphi1_batch[offset + i1];
+                    const Real D2 = dphi2_batch[offset + i2];
+                    const Real dl0 = D0 * v1 * v2;
+                    value_row[q] = v0 * v1 * v2;
+                    g[0u * output_stride + q] = v0 * D1 * v2 - dl0;
+                    g[1u * output_stride + q] = v0 * v1 * D2 - dl0;
+                }
+            }
+            return;
+        }
+
+        if (values_out != nullptr &&
+            gradients_xy_out != nullptr &&
+            hessians_xx_xy_yy_out != nullptr) {
+            std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
+            std::array<Real, kFixedSimplexBatchEntries> d2phi0_batch;
+            std::array<Real, kFixedSimplexBatchEntries> d2phi1_batch;
+            std::array<Real, kFixedSimplexBatchEntries> d2phi2_batch;
+
+            for (std::size_t q = 0; q < num_qpts; ++q) {
+                const auto& xi = points[q];
+                const Real l1 = xi[0];
+                const Real l2 = xi[1];
+                const Real l0 = Real(1) - l1 - l2;
+                const std::size_t offset = q * sequence_size;
+                simplex_lagrange_factor_sequence_impl<true, true>(
+                    order, l0, phi0_batch.data() + offset,
+                    dphi0_batch.data() + offset, d2phi0_batch.data() + offset);
+                simplex_lagrange_factor_sequence_impl<true, true>(
+                    order, l1, phi1_batch.data() + offset,
+                    dphi1_batch.data() + offset, d2phi1_batch.data() + offset);
+                simplex_lagrange_factor_sequence_impl<true, true>(
+                    order, l2, phi2_batch.data() + offset,
+                    dphi2_batch.data() + offset, d2phi2_batch.data() + offset);
+            }
+
+            for (std::size_t node = 0; node < num_nodes; ++node) {
+                const auto& e = simplex_exponents[node];
+                const std::size_t i0 = static_cast<std::size_t>(e[0]);
+                const std::size_t i1 = static_cast<std::size_t>(e[1]);
+                const std::size_t i2 = static_cast<std::size_t>(e[2]);
+                Real* SVMP_RESTRICT value_row = values_out + node * output_stride;
+                Real* SVMP_RESTRICT g = gradients_xy_out + node * 2u * output_stride;
+                Real* SVMP_RESTRICT H = hessians_xx_xy_yy_out + node * 3u * output_stride;
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    const std::size_t offset = q * sequence_size;
+                    const Real v0 = phi0_batch[offset + i0];
+                    const Real v1 = phi1_batch[offset + i1];
+                    const Real v2 = phi2_batch[offset + i2];
+                    const Real D0 = dphi0_batch[offset + i0];
+                    const Real D1 = dphi1_batch[offset + i1];
+                    const Real D2 = dphi2_batch[offset + i2];
+                    const Real dl0 = D0 * v1 * v2;
+                    const Real dl1 = v0 * D1 * v2;
+                    const Real dl2 = v0 * v1 * D2;
+                    const Real DD0 = d2phi0_batch[offset + i0];
+                    const Real DD1 = d2phi1_batch[offset + i1];
+                    const Real DD2 = d2phi2_batch[offset + i2];
+                    const Real H00 = DD0 * v1 * v2;
+                    const Real H11 = v0 * DD1 * v2;
+                    const Real H22 = v0 * v1 * DD2;
+                    const Real H01 = D0 * D1 * v2;
+                    const Real H02 = D0 * v1 * D2;
+                    const Real H12 = v0 * D1 * D2;
+
+                    value_row[q] = v0 * v1 * v2;
+                    g[0u * output_stride + q] = dl1 - dl0;
+                    g[1u * output_stride + q] = dl2 - dl0;
+                    H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
+                    H[1u * output_stride + q] = H00 - H01 - H02 + H12;
+                    H[2u * output_stride + q] = H00 - Real(2) * H02 + H22;
+                }
+            }
+            return;
+        }
+
+        std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
+        std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
+        std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
+        std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
+        std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
+        std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
+        std::array<Real, kFixedSimplexBatchEntries> d2phi0_batch;
+        std::array<Real, kFixedSimplexBatchEntries> d2phi1_batch;
+        std::array<Real, kFixedSimplexBatchEntries> d2phi2_batch;
+
+        for (std::size_t q = 0; q < num_qpts; ++q) {
+            const auto& xi = points[q];
+            const Real l1 = xi[0];
+            const Real l2 = xi[1];
+            const Real l0 = Real(1) - l1 - l2;
+            const std::size_t offset = q * sequence_size;
+            Real* d0_out = (need_gradients || need_hessians) ? dphi0_batch.data() + offset : nullptr;
+            Real* d1_out = (need_gradients || need_hessians) ? dphi1_batch.data() + offset : nullptr;
+            Real* d2_out = (need_gradients || need_hessians) ? dphi2_batch.data() + offset : nullptr;
+            Real* d20_out = need_hessians ? d2phi0_batch.data() + offset : nullptr;
+            Real* d21_out = need_hessians ? d2phi1_batch.data() + offset : nullptr;
+            Real* d22_out = need_hessians ? d2phi2_batch.data() + offset : nullptr;
+            simplex_lagrange_factor_sequence(order, l0, phi0_batch.data() + offset, d0_out, d20_out);
+            simplex_lagrange_factor_sequence(order, l1, phi1_batch.data() + offset, d1_out, d21_out);
+            simplex_lagrange_factor_sequence(order, l2, phi2_batch.data() + offset, d2_out, d22_out);
+        }
+
+        for (std::size_t node = 0; node < num_nodes; ++node) {
+            const auto& e = simplex_exponents[node];
+            const std::size_t i0 = static_cast<std::size_t>(e[0]);
+            const std::size_t i1 = static_cast<std::size_t>(e[1]);
+            const std::size_t i2 = static_cast<std::size_t>(e[2]);
+            Real* value_row = values_out ? values_out + node * output_stride : nullptr;
+            Real* g = gradients_xy_out ? gradients_xy_out + node * 2u * output_stride : nullptr;
+            Real* H = hessians_xx_xy_yy_out ? hessians_xx_xy_yy_out + node * 3u * output_stride : nullptr;
+
+            for (std::size_t q = 0; q < num_qpts; ++q) {
+                const std::size_t offset = q * sequence_size;
+                const Real v0 = phi0_batch[offset + i0];
+                const Real v1 = phi1_batch[offset + i1];
+                const Real v2 = phi2_batch[offset + i2];
+                if (value_row != nullptr) {
+                    value_row[q] = v0 * v1 * v2;
+                }
+                if (!need_gradients && !need_hessians) {
+                    continue;
+                }
+
+                const Real D0 = dphi0_batch[offset + i0];
+                const Real D1 = dphi1_batch[offset + i1];
+                const Real D2 = dphi2_batch[offset + i2];
+                const Real dl0 = D0 * v1 * v2;
+                const Real dl1 = v0 * D1 * v2;
+                const Real dl2 = v0 * v1 * D2;
+
+                if (gradients_xy_out != nullptr) {
+                    g[0u * output_stride + q] = dl1 - dl0;
+                    g[1u * output_stride + q] = dl2 - dl0;
+                }
+
+                if (hessians_xx_xy_yy_out != nullptr) {
+                    const Real DD0 = d2phi0_batch[offset + i0];
+                    const Real DD1 = d2phi1_batch[offset + i1];
+                    const Real DD2 = d2phi2_batch[offset + i2];
+                    const Real H00 = DD0 * v1 * v2;
+                    const Real H11 = v0 * DD1 * v2;
+                    const Real H22 = v0 * v1 * DD2;
+                    const Real H01 = D0 * D1 * v2;
+                    const Real H02 = D0 * v1 * D2;
+                    const Real H12 = v0 * D1 * D2;
+                    H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
+                    H[1u * output_stride + q] = H00 - H01 - H02 + H12;
+                    H[2u * output_stride + q] = H00 - Real(2) * H02 + H22;
+                }
+            }
+        }
+        return;
+    }
+
+    SimplexAxisScratch& s0 = simplex_axis_scratch_slot(0);
+    SimplexAxisScratch& s1 = simplex_axis_scratch_slot(1);
+    SimplexAxisScratch& s2 = simplex_axis_scratch_slot(2);
+    s0.reserveFor(sequence_size);
+    s1.reserveFor(sequence_size);
+    s2.reserveFor(sequence_size);
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l0 = Real(1) - l1 - l2;
+
+        Real* d0_out = (need_gradients || need_hessians) ? s0.dphi() : nullptr;
+        Real* d1_out = (need_gradients || need_hessians) ? s1.dphi() : nullptr;
+        Real* d2_out = (need_gradients || need_hessians) ? s2.dphi() : nullptr;
+        Real* d20_out = need_hessians ? s0.d2phi() : nullptr;
+        Real* d21_out = need_hessians ? s1.d2phi() : nullptr;
+        Real* d22_out = need_hessians ? s2.d2phi() : nullptr;
+        simplex_lagrange_factor_sequence(order, l0, s0.phi(), d0_out, d20_out);
+        simplex_lagrange_factor_sequence(order, l1, s1.phi(), d1_out, d21_out);
+        simplex_lagrange_factor_sequence(order, l2, s2.phi(), d2_out, d22_out);
+
+        const Real* phi0 = s0.phi();
+        const Real* phi1 = s1.phi();
+        const Real* phi2 = s2.phi();
+        const Real* dphi0 = s0.dphi();
+        const Real* dphi1 = s1.dphi();
+        const Real* dphi2 = s2.dphi();
+        const Real* d2phi0 = s0.d2phi();
+        const Real* d2phi1 = s1.d2phi();
+        const Real* d2phi2 = s2.d2phi();
+
+        for (std::size_t node = 0; node < num_nodes; ++node) {
+            const auto& e = simplex_exponents[node];
+            const std::size_t i0 = static_cast<std::size_t>(e[0]);
+            const std::size_t i1 = static_cast<std::size_t>(e[1]);
+            const std::size_t i2 = static_cast<std::size_t>(e[2]);
+            const Real v0 = phi0[i0];
+            const Real v1 = phi1[i1];
+            const Real v2 = phi2[i2];
+
+            if (values_out != nullptr) {
+                values_out[node * output_stride + q] = v0 * v1 * v2;
+            }
+            if (!need_gradients && !need_hessians) {
+                continue;
+            }
+
+            const Real D0 = dphi0[i0];
+            const Real D1 = dphi1[i1];
+            const Real D2 = dphi2[i2];
+            const Real dl0 = D0 * v1 * v2;
+            const Real dl1 = v0 * D1 * v2;
+            const Real dl2 = v0 * v1 * D2;
+
+            if (gradients_xy_out != nullptr) {
+                Real* g = gradients_xy_out + node * 2u * output_stride;
+                g[0u * output_stride + q] = dl1 - dl0;
+                g[1u * output_stride + q] = dl2 - dl0;
+            }
+
+            if (hessians_xx_xy_yy_out != nullptr) {
+                const Real DD0 = d2phi0[i0];
+                const Real DD1 = d2phi1[i1];
+                const Real DD2 = d2phi2[i2];
+                const Real H00 = DD0 * v1 * v2;
+                const Real H11 = v0 * DD1 * v2;
+                const Real H22 = v0 * v1 * DD2;
+                const Real H01 = D0 * D1 * v2;
+                const Real H02 = D0 * v1 * D2;
+                const Real H12 = v0 * D1 * D2;
+                Real* H = hessians_xx_xy_yy_out + node * 3u * output_stride;
+                H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
+                H[1u * output_stride + q] = H00 - H01 - H02 + H12;
+                H[2u * output_stride + q] = H00 - Real(2) * H02 + H22;
+            }
+        }
+    }
+}
+
+template <typename Sink>
+void evaluate_tetrahedron_simplex_basis_impl(const std::vector<std::array<int, 4>>& simplex_exponents,
+                                             int order,
+                                             const math::Vector<Real, 3>& xi,
+                                             const Sink& sink) {
+    const Real l1 = xi[0];
+    const Real l2 = xi[1];
+    const Real l3 = xi[2];
+    const Real l0 = Real(1) - l1 - l2 - l3;
+
+    const std::size_t n = static_cast<std::size_t>(order + 1);
+    SimplexAxisScratch& s0 = simplex_axis_scratch_slot(0);
+    SimplexAxisScratch& s1 = simplex_axis_scratch_slot(1);
+    SimplexAxisScratch& s2 = simplex_axis_scratch_slot(2);
+    SimplexAxisScratch& s3 = simplex_axis_scratch_slot(3);
+    s0.reserveFor(n);
+    s1.reserveFor(n);
+    s2.reserveFor(n);
+    s3.reserveFor(n);
+
+    const std::size_t num_nodes = simplex_exponents.size();
+    sink.prepare(num_nodes);
+    const bool need_values = sink.wants_values();
+    const bool need_gradients = sink.wants_gradients();
+    const bool need_hessians = sink.wants_hessians();
+    Real* d0_out = (need_gradients || need_hessians) ? s0.dphi() : nullptr;
+    Real* d1_out = (need_gradients || need_hessians) ? s1.dphi() : nullptr;
+    Real* d2_out = (need_gradients || need_hessians) ? s2.dphi() : nullptr;
+    Real* d3_out = (need_gradients || need_hessians) ? s3.dphi() : nullptr;
+    Real* d20_out = need_hessians ? s0.d2phi() : nullptr;
+    Real* d21_out = need_hessians ? s1.d2phi() : nullptr;
+    Real* d22_out = need_hessians ? s2.d2phi() : nullptr;
+    Real* d23_out = need_hessians ? s3.d2phi() : nullptr;
+
+    simplex_lagrange_factor_sequence(order, l0, s0.phi(), d0_out, d20_out);
+    simplex_lagrange_factor_sequence(order, l1, s1.phi(), d1_out, d21_out);
+    simplex_lagrange_factor_sequence(order, l2, s2.phi(), d2_out, d22_out);
+    simplex_lagrange_factor_sequence(order, l3, s3.phi(), d3_out, d23_out);
+    const Real* phi0 = s0.phi();
+    const Real* phi1 = s1.phi();
+    const Real* phi2 = s2.phi();
+    const Real* phi3 = s3.phi();
+    const Real* dphi0 = s0.dphi();
+    const Real* dphi1 = s1.dphi();
+    const Real* dphi2 = s2.dphi();
+    const Real* dphi3 = s3.dphi();
+    const Real* d2phi0 = s0.d2phi();
+    const Real* d2phi1 = s1.d2phi();
+    const Real* d2phi2 = s2.d2phi();
+    const Real* d2phi3 = s3.d2phi();
+
+    for (std::size_t n_idx = 0; n_idx < num_nodes; ++n_idx) {
+        const auto& e = simplex_exponents[n_idx];
+        const std::size_t i0 = static_cast<std::size_t>(e[0]);
+        const std::size_t i1 = static_cast<std::size_t>(e[1]);
+        const std::size_t i2 = static_cast<std::size_t>(e[2]);
+        const std::size_t i3 = static_cast<std::size_t>(e[3]);
+
+        const Real v0 = phi0[i0];
+        const Real v1 = phi1[i1];
+        const Real v2 = phi2[i2];
+        const Real v3 = phi3[i3];
+        if (need_values) {
+            sink.write_value(n_idx, v0 * v1 * v2 * v3);
+        }
+        if (!need_gradients && !need_hessians) {
+            continue;
+        }
+
+        const Real D0 = dphi0[i0];
+        const Real D1 = dphi1[i1];
+        const Real D2 = dphi2[i2];
+        const Real D3 = dphi3[i3];
+
+        if (need_gradients) {
+            const Real dl0 = D0 * v1 * v2 * v3;
+            const Real dl1 = v0 * D1 * v2 * v3;
+            const Real dl2 = v0 * v1 * D2 * v3;
+            const Real dl3 = v0 * v1 * v2 * D3;
+            sink.write_gradient(n_idx, dl1 - dl0, dl2 - dl0, dl3 - dl0);
+        }
+
+        if (need_hessians) {
+            const Real DD0 = d2phi0[i0];
+            const Real DD1 = d2phi1[i1];
+            const Real DD2 = d2phi2[i2];
+            const Real DD3 = d2phi3[i3];
+
+            const Real H00 = DD0 * v1 * v2 * v3;
+            const Real H11 = v0 * DD1 * v2 * v3;
+            const Real H22 = v0 * v1 * DD2 * v3;
+            const Real H33 = v0 * v1 * v2 * DD3;
+
+            const Real H01 = D0 * D1 * v2 * v3;
+            const Real H02 = D0 * v1 * D2 * v3;
+            const Real H03 = D0 * v1 * v2 * D3;
+            const Real H12 = v0 * D1 * D2 * v3;
+            const Real H13 = v0 * D1 * v2 * D3;
+            const Real H23 = v0 * v1 * D2 * D3;
+
+            sink.write_hessian(n_idx,
+                               H00 - Real(2) * H01 + H11,
+                               H00 - Real(2) * H02 + H22,
+                               H00 - Real(2) * H03 + H33,
+                               H00 - H01 - H02 + H12,
+                               H00 - H01 - H03 + H13,
+                               H00 - H02 - H03 + H23);
+        }
+    }
+}
+
+void evaluate_tetrahedron_simplex_basis(const std::vector<std::array<int, 4>>& simplex_exponents,
+                                        int order,
+                                        const math::Vector<Real, 3>& xi,
+                                        std::vector<Real>* values,
+                                        std::vector<Gradient>* gradients,
+                                        std::vector<Hessian>* hessians) {
+    const SimplexVectorSink sink{values, gradients, hessians};
+    evaluate_tetrahedron_simplex_basis_impl(simplex_exponents, order, xi, sink);
+}
+
+void evaluate_tetrahedron_simplex_basis_to(const std::vector<std::array<int, 4>>& simplex_exponents,
+                                           int order,
+                                           const math::Vector<Real, 3>& xi,
+                                           Real* SVMP_RESTRICT values_out,
+                                           Real* SVMP_RESTRICT gradients_out,
+                                           Real* SVMP_RESTRICT hessians_out) {
+    const SimplexRawSink sink{values_out, gradients_out, hessians_out};
+    evaluate_tetrahedron_simplex_basis_impl(simplex_exponents, order, xi, sink);
+}
+
+void evaluate_tetrahedron_simplex_basis_strided(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    const std::size_t num_nodes = simplex_exponents.size();
+    if (points.empty() || num_nodes == 0u) {
+        return;
+    }
+
+    const std::size_t sequence_size = static_cast<std::size_t>(order + 1);
+    const std::size_t num_qpts = points.size();
+    const bool need_gradients = gradients_out != nullptr;
+    const bool need_hessians = hessians_out != nullptr;
+    if (num_qpts == 4u &&
+        values_out != nullptr &&
+        !need_gradients &&
+        !need_hessians &&
+        try_evaluate_tetrahedron_simplex_values_q4(
+            simplex_exponents, order, points, output_stride, values_out)) {
+        return;
+    }
+    if (num_qpts == 4u &&
+        values_out == nullptr &&
+        need_gradients &&
+        !need_hessians) {
+        switch (order) {
+        case 3:
+            evaluate_tetrahedron_simplex_gradients_q4<3>(
+                simplex_exponents, points, output_stride, gradients_out);
+            return;
+        case 4:
+            evaluate_tetrahedron_simplex_gradients_q4<4>(
+                simplex_exponents, points, output_stride, gradients_out);
+            return;
+        case 5:
+            evaluate_tetrahedron_simplex_gradients_q4<5>(
+                simplex_exponents, points, output_stride, gradients_out);
+            return;
+        case 6:
+            evaluate_tetrahedron_simplex_gradients_q4<6>(
+                simplex_exponents, points, output_stride, gradients_out);
+            return;
+        case 7:
+            evaluate_tetrahedron_simplex_gradients_q4<7>(
+                simplex_exponents, points, output_stride, gradients_out);
+            return;
+        case 8:
+            evaluate_tetrahedron_simplex_gradients_q4<8>(
+                simplex_exponents, points, output_stride, gradients_out);
+            return;
+        default:
+            break;
+        }
+    }
+    if (num_qpts == 4u &&
+        need_hessians &&
+        try_evaluate_tetrahedron_simplex_hessian_outputs_q4(
+            simplex_exponents, order, points, output_stride,
+            values_out, gradients_out, hessians_out)) {
+        return;
+    }
+    const std::size_t batch_entries = sequence_size * num_qpts;
+    if (batch_entries <= kFixedSimplexBatchEntries) {
+        if (values_out != nullptr && gradients_out == nullptr && hessians_out == nullptr) {
+            std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi3_batch;
+
+            for (std::size_t q = 0; q < num_qpts; ++q) {
+                const auto& xi = points[q];
+                const Real l1 = xi[0];
+                const Real l2 = xi[1];
+                const Real l3 = xi[2];
+                const Real l0 = Real(1) - l1 - l2 - l3;
+                const std::size_t offset = q * sequence_size;
+                simplex_lagrange_factor_sequence(
+                    order, l0, phi0_batch.data() + offset, nullptr, nullptr);
+                simplex_lagrange_factor_sequence(
+                    order, l1, phi1_batch.data() + offset, nullptr, nullptr);
+                simplex_lagrange_factor_sequence(
+                    order, l2, phi2_batch.data() + offset, nullptr, nullptr);
+                simplex_lagrange_factor_sequence(
+                    order, l3, phi3_batch.data() + offset, nullptr, nullptr);
+            }
+
+            for (std::size_t node = 0; node < num_nodes; ++node) {
+                const auto& e = simplex_exponents[node];
+                const std::size_t i0 = static_cast<std::size_t>(e[0]);
+                const std::size_t i1 = static_cast<std::size_t>(e[1]);
+                const std::size_t i2 = static_cast<std::size_t>(e[2]);
+                const std::size_t i3 = static_cast<std::size_t>(e[3]);
+                Real* value_row = values_out + node * output_stride;
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    const std::size_t offset = q * sequence_size;
+                    value_row[q] =
+                        phi0_batch[offset + i0] *
+                        phi1_batch[offset + i1] *
+                        phi2_batch[offset + i2] *
+                        phi3_batch[offset + i3];
+                }
+            }
+            return;
+        }
+
+        if (values_out == nullptr && gradients_out != nullptr && hessians_out == nullptr) {
+            std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi3_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi3_batch;
+
+            for (std::size_t q = 0; q < num_qpts; ++q) {
+                const auto& xi = points[q];
+                const Real l1 = xi[0];
+                const Real l2 = xi[1];
+                const Real l3 = xi[2];
+                const Real l0 = Real(1) - l1 - l2 - l3;
+                const std::size_t offset = q * sequence_size;
+                simplex_lagrange_factor_sequence(
+                    order, l0, phi0_batch.data() + offset, dphi0_batch.data() + offset, nullptr);
+                simplex_lagrange_factor_sequence(
+                    order, l1, phi1_batch.data() + offset, dphi1_batch.data() + offset, nullptr);
+                simplex_lagrange_factor_sequence(
+                    order, l2, phi2_batch.data() + offset, dphi2_batch.data() + offset, nullptr);
+                simplex_lagrange_factor_sequence(
+                    order, l3, phi3_batch.data() + offset, dphi3_batch.data() + offset, nullptr);
+            }
+
+            for (std::size_t node = 0; node < num_nodes; ++node) {
+                const auto& e = simplex_exponents[node];
+                const std::size_t i0 = static_cast<std::size_t>(e[0]);
+                const std::size_t i1 = static_cast<std::size_t>(e[1]);
+                const std::size_t i2 = static_cast<std::size_t>(e[2]);
+                const std::size_t i3 = static_cast<std::size_t>(e[3]);
+                Real* g = gradients_out + node * 3u * output_stride;
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    const std::size_t offset = q * sequence_size;
+                    const Real v0 = phi0_batch[offset + i0];
+                    const Real v1 = phi1_batch[offset + i1];
+                    const Real v2 = phi2_batch[offset + i2];
+                    const Real v3 = phi3_batch[offset + i3];
+                    const Real D0 = dphi0_batch[offset + i0];
+                    const Real D1 = dphi1_batch[offset + i1];
+                    const Real D2 = dphi2_batch[offset + i2];
+                    const Real D3 = dphi3_batch[offset + i3];
+                    const Real v23 = v2 * v3;
+                    const Real dl0 = D0 * v1 * v23;
+                    g[0u * output_stride + q] = v0 * D1 * v23 - dl0;
+                    g[1u * output_stride + q] = v0 * v1 * D2 * v3 - dl0;
+                    g[2u * output_stride + q] = v0 * v1 * v2 * D3 - dl0;
+                }
+            }
+            return;
+        }
+
+        std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
+        std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
+        std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
+        std::array<Real, kFixedSimplexBatchEntries> phi3_batch;
+        std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
+        std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
+        std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
+        std::array<Real, kFixedSimplexBatchEntries> dphi3_batch;
+        std::array<Real, kFixedSimplexBatchEntries> d2phi0_batch;
+        std::array<Real, kFixedSimplexBatchEntries> d2phi1_batch;
+        std::array<Real, kFixedSimplexBatchEntries> d2phi2_batch;
+        std::array<Real, kFixedSimplexBatchEntries> d2phi3_batch;
+
+        for (std::size_t q = 0; q < num_qpts; ++q) {
+            const auto& xi = points[q];
+            const Real l1 = xi[0];
+            const Real l2 = xi[1];
+            const Real l3 = xi[2];
+            const Real l0 = Real(1) - l1 - l2 - l3;
+            const std::size_t offset = q * sequence_size;
+            Real* d0_out = (need_gradients || need_hessians) ? dphi0_batch.data() + offset : nullptr;
+            Real* d1_out = (need_gradients || need_hessians) ? dphi1_batch.data() + offset : nullptr;
+            Real* d2_out = (need_gradients || need_hessians) ? dphi2_batch.data() + offset : nullptr;
+            Real* d3_out = (need_gradients || need_hessians) ? dphi3_batch.data() + offset : nullptr;
+            Real* d20_out = need_hessians ? d2phi0_batch.data() + offset : nullptr;
+            Real* d21_out = need_hessians ? d2phi1_batch.data() + offset : nullptr;
+            Real* d22_out = need_hessians ? d2phi2_batch.data() + offset : nullptr;
+            Real* d23_out = need_hessians ? d2phi3_batch.data() + offset : nullptr;
+            simplex_lagrange_factor_sequence(order, l0, phi0_batch.data() + offset, d0_out, d20_out);
+            simplex_lagrange_factor_sequence(order, l1, phi1_batch.data() + offset, d1_out, d21_out);
+            simplex_lagrange_factor_sequence(order, l2, phi2_batch.data() + offset, d2_out, d22_out);
+            simplex_lagrange_factor_sequence(order, l3, phi3_batch.data() + offset, d3_out, d23_out);
+        }
+
+        for (std::size_t node = 0; node < num_nodes; ++node) {
+            const auto& e = simplex_exponents[node];
+            const std::size_t i0 = static_cast<std::size_t>(e[0]);
+            const std::size_t i1 = static_cast<std::size_t>(e[1]);
+            const std::size_t i2 = static_cast<std::size_t>(e[2]);
+            const std::size_t i3 = static_cast<std::size_t>(e[3]);
+            Real* value_row = values_out ? values_out + node * output_stride : nullptr;
+            Real* g = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
+            Real* H = hessians_out ? hessians_out + node * 9u * output_stride : nullptr;
+
+            for (std::size_t q = 0; q < num_qpts; ++q) {
+                const std::size_t offset = q * sequence_size;
+                const Real v0 = phi0_batch[offset + i0];
+                const Real v1 = phi1_batch[offset + i1];
+                const Real v2 = phi2_batch[offset + i2];
+                const Real v3 = phi3_batch[offset + i3];
+                if (value_row != nullptr) {
+                    value_row[q] = v0 * v1 * v2 * v3;
+                }
+                if (!need_gradients && !need_hessians) {
+                    continue;
+                }
+
+                const Real D0 = dphi0_batch[offset + i0];
+                const Real D1 = dphi1_batch[offset + i1];
+                const Real D2 = dphi2_batch[offset + i2];
+                const Real D3 = dphi3_batch[offset + i3];
+
+                if (gradients_out != nullptr) {
+                    const Real dl0 = D0 * v1 * v2 * v3;
+                    const Real dl1 = v0 * D1 * v2 * v3;
+                    const Real dl2 = v0 * v1 * D2 * v3;
+                    const Real dl3 = v0 * v1 * v2 * D3;
+                    g[0u * output_stride + q] = dl1 - dl0;
+                    g[1u * output_stride + q] = dl2 - dl0;
+                    g[2u * output_stride + q] = dl3 - dl0;
+                }
+
+                if (hessians_out != nullptr) {
+                    const Real DD0 = d2phi0_batch[offset + i0];
+                    const Real DD1 = d2phi1_batch[offset + i1];
+                    const Real DD2 = d2phi2_batch[offset + i2];
+                    const Real DD3 = d2phi3_batch[offset + i3];
+                    const Real H00 = DD0 * v1 * v2 * v3;
+                    const Real H11 = v0 * DD1 * v2 * v3;
+                    const Real H22 = v0 * v1 * DD2 * v3;
+                    const Real H33 = v0 * v1 * v2 * DD3;
+                    const Real H01 = D0 * D1 * v2 * v3;
+                    const Real H02 = D0 * v1 * D2 * v3;
+                    const Real H03 = D0 * v1 * v2 * D3;
+                    const Real H12 = v0 * D1 * D2 * v3;
+                    const Real H13 = v0 * D1 * v2 * D3;
+                    const Real H23 = v0 * v1 * D2 * D3;
+                    const Real h01 = H00 - H01 - H02 + H12;
+                    const Real h02 = H00 - H01 - H03 + H13;
+                    const Real h12 = H00 - H02 - H03 + H23;
+                    H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
+                    H[1u * output_stride + q] = h01;
+                    H[2u * output_stride + q] = h02;
+                    H[3u * output_stride + q] = h01;
+                    H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
+                    H[5u * output_stride + q] = h12;
+                    H[6u * output_stride + q] = h02;
+                    H[7u * output_stride + q] = h12;
+                    H[8u * output_stride + q] = H00 - Real(2) * H03 + H33;
+                }
+            }
+        }
+        return;
+    }
+
+    SimplexAxisScratch& s0 = simplex_axis_scratch_slot(0);
+    SimplexAxisScratch& s1 = simplex_axis_scratch_slot(1);
+    SimplexAxisScratch& s2 = simplex_axis_scratch_slot(2);
+    SimplexAxisScratch& s3 = simplex_axis_scratch_slot(3);
+    s0.reserveFor(sequence_size);
+    s1.reserveFor(sequence_size);
+    s2.reserveFor(sequence_size);
+    s3.reserveFor(sequence_size);
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l3 = xi[2];
+        const Real l0 = Real(1) - l1 - l2 - l3;
+
+        Real* d0_out = (need_gradients || need_hessians) ? s0.dphi() : nullptr;
+        Real* d1_out = (need_gradients || need_hessians) ? s1.dphi() : nullptr;
+        Real* d2_out = (need_gradients || need_hessians) ? s2.dphi() : nullptr;
+        Real* d3_out = (need_gradients || need_hessians) ? s3.dphi() : nullptr;
+        Real* d20_out = need_hessians ? s0.d2phi() : nullptr;
+        Real* d21_out = need_hessians ? s1.d2phi() : nullptr;
+        Real* d22_out = need_hessians ? s2.d2phi() : nullptr;
+        Real* d23_out = need_hessians ? s3.d2phi() : nullptr;
+
+        simplex_lagrange_factor_sequence(order, l0, s0.phi(), d0_out, d20_out);
+        simplex_lagrange_factor_sequence(order, l1, s1.phi(), d1_out, d21_out);
+        simplex_lagrange_factor_sequence(order, l2, s2.phi(), d2_out, d22_out);
+        simplex_lagrange_factor_sequence(order, l3, s3.phi(), d3_out, d23_out);
+        const Real* phi0 = s0.phi();
+        const Real* phi1 = s1.phi();
+        const Real* phi2 = s2.phi();
+        const Real* phi3 = s3.phi();
+        const Real* dphi0 = s0.dphi();
+        const Real* dphi1 = s1.dphi();
+        const Real* dphi2 = s2.dphi();
+        const Real* dphi3 = s3.dphi();
+        const Real* d2phi0 = s0.d2phi();
+        const Real* d2phi1 = s1.d2phi();
+        const Real* d2phi2 = s2.d2phi();
+        const Real* d2phi3 = s3.d2phi();
+
+        for (std::size_t node = 0; node < num_nodes; ++node) {
+            const auto& e = simplex_exponents[node];
+            const std::size_t i0 = static_cast<std::size_t>(e[0]);
+            const std::size_t i1 = static_cast<std::size_t>(e[1]);
+            const std::size_t i2 = static_cast<std::size_t>(e[2]);
+            const std::size_t i3 = static_cast<std::size_t>(e[3]);
+
+            const Real v0 = phi0[i0];
+            const Real v1 = phi1[i1];
+            const Real v2 = phi2[i2];
+            const Real v3 = phi3[i3];
+            if (values_out != nullptr) {
+                values_out[node * output_stride + q] = v0 * v1 * v2 * v3;
+            }
+            if (!need_gradients && !need_hessians) {
+                continue;
+            }
+
+            const Real D0 = dphi0[i0];
+            const Real D1 = dphi1[i1];
+            const Real D2 = dphi2[i2];
+            const Real D3 = dphi3[i3];
+
+            if (gradients_out != nullptr) {
+                const Real dl0 = D0 * v1 * v2 * v3;
+                const Real dl1 = v0 * D1 * v2 * v3;
+                const Real dl2 = v0 * v1 * D2 * v3;
+                const Real dl3 = v0 * v1 * v2 * D3;
+                Real* g = gradients_out + node * 3u * output_stride;
+                g[0u * output_stride + q] = dl1 - dl0;
+                g[1u * output_stride + q] = dl2 - dl0;
+                g[2u * output_stride + q] = dl3 - dl0;
+            }
+
+            if (hessians_out != nullptr) {
+                const Real DD0 = d2phi0[i0];
+                const Real DD1 = d2phi1[i1];
+                const Real DD2 = d2phi2[i2];
+                const Real DD3 = d2phi3[i3];
+
+                const Real H00 = DD0 * v1 * v2 * v3;
+                const Real H11 = v0 * DD1 * v2 * v3;
+                const Real H22 = v0 * v1 * DD2 * v3;
+                const Real H33 = v0 * v1 * v2 * DD3;
+
+                const Real H01 = D0 * D1 * v2 * v3;
+                const Real H02 = D0 * v1 * D2 * v3;
+                const Real H03 = D0 * v1 * v2 * D3;
+                const Real H12 = v0 * D1 * D2 * v3;
+                const Real H13 = v0 * D1 * v2 * D3;
+                const Real H23 = v0 * v1 * D2 * D3;
+
+                const Real h01 = H00 - H01 - H02 + H12;
+                const Real h02 = H00 - H01 - H03 + H13;
+                const Real h12 = H00 - H02 - H03 + H23;
+
+                Real* H = hessians_out + node * 9u * output_stride;
+                H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
+                H[1u * output_stride + q] = h01;
+                H[2u * output_stride + q] = h02;
+                H[3u * output_stride + q] = h01;
+                H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
+                H[5u * output_stride + q] = h12;
+                H[6u * output_stride + q] = h02;
+                H[7u * output_stride + q] = h12;
+                H[8u * output_stride + q] = H00 - Real(2) * H03 + H33;
+            }
+        }
+    }
+}
+
+} // namespace detail
+} // namespace basis
+} // namespace FE
+} // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasisSimplex.h b/Code/Source/solver/FE/Basis/LagrangeBasisSimplex.h
new file mode 100644
index 000000000..19cf725bd
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/LagrangeBasisSimplex.h
@@ -0,0 +1,78 @@
+#ifndef SVMP_FE_BASIS_LAGRANGEBASISSIMPLEX_H
+#define SVMP_FE_BASIS_LAGRANGEBASISSIMPLEX_H
+
+// Private declarations for simplex Lagrange evaluation helpers implemented in
+// LagrangeBasisSimplex.cpp.
+
+#include "BasisFunction.h"
+
+#include <array>
+#include <cstddef>
+#include <vector>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+namespace detail {
+
+void evaluate_triangle_simplex_basis(const std::vector<std::array<int, 4>>& simplex_exponents,
+                                     int order,
+                                     const math::Vector<Real, 3>& xi,
+                                     std::vector<Real>* values,
+                                     std::vector<Gradient>* gradients,
+                                     std::vector<Hessian>* hessians);
+
+void evaluate_triangle_simplex_basis_to(const std::vector<std::array<int, 4>>& simplex_exponents,
+                                        int order,
+                                        const math::Vector<Real, 3>& xi,
+                                        Real* SVMP_RESTRICT values_out,
+                                        Real* SVMP_RESTRICT gradients_out,
+                                        Real* SVMP_RESTRICT hessians_out);
+
+void evaluate_triangle_simplex_basis_strided(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out);
+
+void evaluate_triangle_simplex_basis_wedge_components_strided(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_xy_out,
+    Real* SVMP_RESTRICT hessians_xx_xy_yy_out);
+
+void evaluate_tetrahedron_simplex_basis(const std::vector<std::array<int, 4>>& simplex_exponents,
+                                        int order,
+                                        const math::Vector<Real, 3>& xi,
+                                        std::vector<Real>* values,
+                                        std::vector<Gradient>* gradients,
+                                        std::vector<Hessian>* hessians);
+
+void evaluate_tetrahedron_simplex_basis_to(const std::vector<std::array<int, 4>>& simplex_exponents,
+                                           int order,
+                                           const math::Vector<Real, 3>& xi,
+                                           Real* SVMP_RESTRICT values_out,
+                                           Real* SVMP_RESTRICT gradients_out,
+                                           Real* SVMP_RESTRICT hessians_out);
+
+void evaluate_tetrahedron_simplex_basis_strided(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out);
+
+} // namespace detail
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_LAGRANGEBASISSIMPLEX_H
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasisUtility.h b/Code/Source/solver/FE/Basis/LagrangeBasisUtility.h
new file mode 100644
index 000000000..e622de1c6
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/LagrangeBasisUtility.h
@@ -0,0 +1,25 @@
+#ifndef SVMP_FE_BASIS_LAGRANGEBASISUTILITY_H
+#define SVMP_FE_BASIS_LAGRANGEBASISUTILITY_H
+
+// Private helper for LagrangeBasis internals.
+// This header is only intended to be included after the FE basis scalar types
+// are already available.
+
+namespace svmp {
+namespace FE {
+namespace basis {
+namespace detail {
+
+inline constexpr Real equispaced_pm_one_coord(int i, int order) {
+    if (order <= 0) {
+        return Real(0);
+    }
+    return Real(-1) + Real(2) * static_cast<Real>(i) / static_cast<Real>(order);
+}
+
+} // namespace detail
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_LAGRANGEBASISUTILITY_H
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
new file mode 100644
index 000000000..20f743916
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
@@ -0,0 +1,818 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#include "NodeOrderingConventions.h"
+#include "Basis/BasisExceptions.h"
+#include "Basis/BasisTraits.h"
+
+#include <array>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+namespace {
+
+using Point = math::Vector<Real, 3>;
+using RawPoint = std::array<Real, 3>;
+
+template<std::size_t N>
+using NodeTable = std::array<RawPoint, N>;
+
+struct NodeTableView {
+    const RawPoint* data{nullptr};
+    std::size_t size{0};
+};
+
+inline constexpr NodeTable<2> kLine2Nodes = {{
+    {Real(-1), Real(0), Real(0)},
+    {Real(1), Real(0), Real(0)},
+}};
+
+inline constexpr NodeTable<3> kLine3Nodes = {{
+    {Real(-1), Real(0), Real(0)},
+    {Real(1), Real(0), Real(0)},
+    {Real(0), Real(0), Real(0)},
+}};
+
+inline constexpr NodeTable<3> kTriangle3Nodes = {{
+    {Real(0), Real(0), Real(0)},
+    {Real(1), Real(0), Real(0)},
+    {Real(0), Real(1), Real(0)},
+}};
+
+inline constexpr NodeTable<6> kTriangle6Nodes = {{
+    {Real(0), Real(0), Real(0)},
+    {Real(1), Real(0), Real(0)},
+    {Real(0), Real(1), Real(0)},
+    {Real(0.5), Real(0), Real(0)},
+    {Real(0.5), Real(0.5), Real(0)},
+    {Real(0), Real(0.5), Real(0)},
+}};
+
+inline constexpr NodeTable<4> kQuad4Nodes = {{
+    {Real(-1), Real(-1), Real(0)},
+    {Real(1), Real(-1), Real(0)},
+    {Real(1), Real(1), Real(0)},
+    {Real(-1), Real(1), Real(0)},
+}};
+
+inline constexpr NodeTable<9> kQuad9Nodes = {{
+    {Real(-1), Real(-1), Real(0)},
+    {Real(1), Real(-1), Real(0)},
+    {Real(1), Real(1), Real(0)},
+    {Real(-1), Real(1), Real(0)},
+    {Real(0), Real(-1), Real(0)},
+    {Real(1), Real(0), Real(0)},
+    {Real(0), Real(1), Real(0)},
+    {Real(-1), Real(0), Real(0)},
+    {Real(0), Real(0), Real(0)},
+}};
+
+inline constexpr NodeTable<8> kQuad8Nodes = {{
+    {Real(-1), Real(-1), Real(0)},
+    {Real(1), Real(-1), Real(0)},
+    {Real(1), Real(1), Real(0)},
+    {Real(-1), Real(1), Real(0)},
+    {Real(0), Real(-1), Real(0)},
+    {Real(1), Real(0), Real(0)},
+    {Real(0), Real(1), Real(0)},
+    {Real(-1), Real(0), Real(0)},
+}};
+
+inline constexpr NodeTable<4> kTetra4Nodes = {{
+    {Real(0), Real(0), Real(0)},
+    {Real(1), Real(0), Real(0)},
+    {Real(0), Real(1), Real(0)},
+    {Real(0), Real(0), Real(1)},
+}};
+
+inline constexpr NodeTable<10> kTetra10Nodes = {{
+    {Real(0), Real(0), Real(0)},
+    {Real(1), Real(0), Real(0)},
+    {Real(0), Real(1), Real(0)},
+    {Real(0), Real(0), Real(1)},
+    {Real(0.5), Real(0), Real(0)},
+    {Real(0.5), Real(0.5), Real(0)},
+    {Real(0), Real(0.5), Real(0)},
+    {Real(0), Real(0), Real(0.5)},
+    {Real(0.5), Real(0), Real(0.5)},
+    {Real(0), Real(0.5), Real(0.5)},
+}};
+
+inline constexpr NodeTable<8> kHex8Nodes = {{
+    {Real(-1), Real(-1), Real(-1)},
+    {Real(1), Real(-1), Real(-1)},
+    {Real(1), Real(1), Real(-1)},
+    {Real(-1), Real(1), Real(-1)},
+    {Real(-1), Real(-1), Real(1)},
+    {Real(1), Real(-1), Real(1)},
+    {Real(1), Real(1), Real(1)},
+    {Real(-1), Real(1), Real(1)},
+}};
+
+inline constexpr NodeTable<27> kHex27Nodes = {{
+    {Real(-1), Real(-1), Real(-1)},
+    {Real(1), Real(-1), Real(-1)},
+    {Real(1), Real(1), Real(-1)},
+    {Real(-1), Real(1), Real(-1)},
+    {Real(-1), Real(-1), Real(1)},
+    {Real(1), Real(-1), Real(1)},
+    {Real(1), Real(1), Real(1)},
+    {Real(-1), Real(1), Real(1)},
+    {Real(0), Real(-1), Real(-1)},
+    {Real(1), Real(0), Real(-1)},
+    {Real(0), Real(1), Real(-1)},
+    {Real(-1), Real(0), Real(-1)},
+    {Real(0), Real(-1), Real(1)},
+    {Real(1), Real(0), Real(1)},
+    {Real(0), Real(1), Real(1)},
+    {Real(-1), Real(0), Real(1)},
+    {Real(-1), Real(-1), Real(0)},
+    {Real(1), Real(-1), Real(0)},
+    {Real(1), Real(1), Real(0)},
+    {Real(-1), Real(1), Real(0)},
+    {Real(0), Real(0), Real(-1)},
+    {Real(0), Real(0), Real(1)},
+    {Real(0), Real(-1), Real(0)},
+    {Real(1), Real(0), Real(0)},
+    {Real(0), Real(1), Real(0)},
+    {Real(-1), Real(0), Real(0)},
+    {Real(0), Real(0), Real(0)},
+}};
+
+inline constexpr NodeTable<20> kHex20Nodes = {{
+    {Real(-1), Real(-1), Real(-1)},
+    {Real(1), Real(-1), Real(-1)},
+    {Real(1), Real(1), Real(-1)},
+    {Real(-1), Real(1), Real(-1)},
+    {Real(-1), Real(-1), Real(1)},
+    {Real(1), Real(-1), Real(1)},
+    {Real(1), Real(1), Real(1)},
+    {Real(-1), Real(1), Real(1)},
+    {Real(0), Real(-1), Real(-1)},
+    {Real(1), Real(0), Real(-1)},
+    {Real(0), Real(1), Real(-1)},
+    {Real(-1), Real(0), Real(-1)},
+    {Real(0), Real(-1), Real(1)},
+    {Real(1), Real(0), Real(1)},
+    {Real(0), Real(1), Real(1)},
+    {Real(-1), Real(0), Real(1)},
+    {Real(-1), Real(-1), Real(0)},
+    {Real(1), Real(-1), Real(0)},
+    {Real(1), Real(1), Real(0)},
+    {Real(-1), Real(1), Real(0)},
+}};
+
+// Mesh uses conventional Hex20 ordering: corners first, then edge midpoints in
+// {bottom, top, vertical} groups. The quadratic Hex20 serendipity polynomial
+// table uses an axis-grouped edge order. This maps public mesh/reference index
+// to the internal polynomial-table index.
+constexpr std::array<std::size_t, 20> kHex20MeshToBasisOrder = {
+    0, 1, 2, 3, 4, 5, 6, 7,
+    8, 13, 10, 12,
+    9, 15, 11, 14,
+    16, 17, 19, 18
+};
+
+inline constexpr NodeTable<6> kWedge6Nodes = {{
+    {Real(0), Real(0), Real(-1)},
+    {Real(1), Real(0), Real(-1)},
+    {Real(0), Real(1), Real(-1)},
+    {Real(0), Real(0), Real(1)},
+    {Real(1), Real(0), Real(1)},
+    {Real(0), Real(1), Real(1)},
+}};
+
+inline constexpr NodeTable<18> kWedge18Nodes = {{
+    {Real(0), Real(0), Real(-1)},
+    {Real(1), Real(0), Real(-1)},
+    {Real(0), Real(1), Real(-1)},
+    {Real(0), Real(0), Real(1)},
+    {Real(1), Real(0), Real(1)},
+    {Real(0), Real(1), Real(1)},
+    {Real(0.5), Real(0), Real(-1)},
+    {Real(0.5), Real(0.5), Real(-1)},
+    {Real(0), Real(0.5), Real(-1)},
+    {Real(0.5), Real(0), Real(1)},
+    {Real(0.5), Real(0.5), Real(1)},
+    {Real(0), Real(0.5), Real(1)},
+    {Real(0), Real(0), Real(0)},
+    {Real(1), Real(0), Real(0)},
+    {Real(0), Real(1), Real(0)},
+    {Real(0.5), Real(0), Real(0)},
+    {Real(0.5), Real(0.5), Real(0)},
+    {Real(0), Real(0.5), Real(0)},
+}};
+
+inline constexpr NodeTable<15> kWedge15Nodes = {{
+    {Real(0), Real(0), Real(-1)},
+    {Real(1), Real(0), Real(-1)},
+    {Real(0), Real(1), Real(-1)},
+    {Real(0), Real(0), Real(1)},
+    {Real(1), Real(0), Real(1)},
+    {Real(0), Real(1), Real(1)},
+    {Real(0.5), Real(0), Real(-1)},
+    {Real(0.5), Real(0.5), Real(-1)},
+    {Real(0), Real(0.5), Real(-1)},
+    {Real(0.5), Real(0), Real(1)},
+    {Real(0.5), Real(0.5), Real(1)},
+    {Real(0), Real(0.5), Real(1)},
+    {Real(0), Real(0), Real(0)},
+    {Real(1), Real(0), Real(0)},
+    {Real(0), Real(1), Real(0)},
+}};
+
+inline constexpr NodeTable<5> kPyramid5Nodes = {{
+    {Real(-1), Real(-1), Real(0)},
+    {Real(1), Real(-1), Real(0)},
+    {Real(1), Real(1), Real(0)},
+    {Real(-1), Real(1), Real(0)},
+    {Real(0), Real(0), Real(1)},
+}};
+
+inline constexpr NodeTable<14> kPyramid14Nodes = {{
+    {Real(-1), Real(-1), Real(0)},
+    {Real(1), Real(-1), Real(0)},
+    {Real(1), Real(1), Real(0)},
+    {Real(-1), Real(1), Real(0)},
+    {Real(0), Real(0), Real(1)},
+    {Real(0), Real(-1), Real(0)},
+    {Real(1), Real(0), Real(0)},
+    {Real(0), Real(1), Real(0)},
+    {Real(-1), Real(0), Real(0)},
+    {Real(-0.5), Real(-0.5), Real(0.5)},
+    {Real(0.5), Real(-0.5), Real(0.5)},
+    {Real(0.5), Real(0.5), Real(0.5)},
+    {Real(-0.5), Real(0.5), Real(0.5)},
+    {Real(0), Real(0), Real(0)},
+}};
+
+inline constexpr NodeTable<13> kPyramid13Nodes = {{
+    {Real(-1), Real(-1), Real(0)},
+    {Real(1), Real(-1), Real(0)},
+    {Real(1), Real(1), Real(0)},
+    {Real(-1), Real(1), Real(0)},
+    {Real(0), Real(0), Real(1)},
+    {Real(0), Real(-1), Real(0)},
+    {Real(1), Real(0), Real(0)},
+    {Real(0), Real(1), Real(0)},
+    {Real(-1), Real(0), Real(0)},
+    {Real(-0.5), Real(-0.5), Real(0.5)},
+    {Real(0.5), Real(-0.5), Real(0.5)},
+    {Real(0.5), Real(0.5), Real(0.5)},
+    {Real(-0.5), Real(0.5), Real(0.5)},
+}};
+
+template<std::size_t N>
+constexpr NodeTableView view(const NodeTable<N>& table) noexcept {
+    return NodeTableView{table.data(), table.size()};
+}
+
+Point to_point(const RawPoint& raw) {
+    return Point{raw[0], raw[1], raw[2]};
+}
+
+constexpr NodeTableView fixed_node_table(ElementType elem_type) noexcept {
+    switch (elem_type) {
+        case ElementType::Line2:     return view(kLine2Nodes);
+        case ElementType::Line3:     return view(kLine3Nodes);
+        case ElementType::Triangle3: return view(kTriangle3Nodes);
+        case ElementType::Triangle6: return view(kTriangle6Nodes);
+        case ElementType::Quad4:     return view(kQuad4Nodes);
+        case ElementType::Quad8:     return view(kQuad8Nodes);
+        case ElementType::Quad9:     return view(kQuad9Nodes);
+        case ElementType::Tetra4:    return view(kTetra4Nodes);
+        case ElementType::Tetra10:   return view(kTetra10Nodes);
+        case ElementType::Hex8:      return view(kHex8Nodes);
+        case ElementType::Hex20:     return view(kHex20Nodes);
+        case ElementType::Hex27:     return view(kHex27Nodes);
+        case ElementType::Wedge6:    return view(kWedge6Nodes);
+        case ElementType::Wedge15:   return view(kWedge15Nodes);
+        case ElementType::Wedge18:   return view(kWedge18Nodes);
+        case ElementType::Pyramid5:  return view(kPyramid5Nodes);
+        case ElementType::Pyramid13: return view(kPyramid13Nodes);
+        case ElementType::Pyramid14: return view(kPyramid14Nodes);
+        default:                     return {};
+    }
+}
+
+constexpr NodeTableView fixed_complete_lagrange_table(ElementType canonical_type,
+                                                      int order) noexcept {
+    switch (canonical_type) {
+        case ElementType::Line2:
+            return order == 1 ? view(kLine2Nodes) :
+                   order == 2 ? view(kLine3Nodes) : NodeTableView{};
+        case ElementType::Triangle3:
+            return order == 1 ? view(kTriangle3Nodes) :
+                   order == 2 ? view(kTriangle6Nodes) : NodeTableView{};
+        case ElementType::Quad4:
+            return order == 1 ? view(kQuad4Nodes) :
+                   order == 2 ? view(kQuad9Nodes) : NodeTableView{};
+        case ElementType::Tetra4:
+            return order == 1 ? view(kTetra4Nodes) :
+                   order == 2 ? view(kTetra10Nodes) : NodeTableView{};
+        case ElementType::Hex8:
+            return order == 1 ? view(kHex8Nodes) :
+                   order == 2 ? view(kHex27Nodes) : NodeTableView{};
+        case ElementType::Wedge6:
+            return order == 1 ? view(kWedge6Nodes) :
+                   order == 2 ? view(kWedge18Nodes) : NodeTableView{};
+        case ElementType::Pyramid5:
+            return order == 1 ? view(kPyramid5Nodes) :
+                   order == 2 ? view(kPyramid14Nodes) : NodeTableView{};
+        default:
+            return {};
+    }
+}
+
+Real line_coord_pm_one(int i, int order) {
+    if (order <= 0) {
+        return Real(0);
+    }
+    return Real(-1) + Real(2) * static_cast<Real>(i) / static_cast<Real>(order);
+}
+
+Real line_coord_zero_one(int i, int order) {
+    if (order <= 0) {
+        return Real(0);
+    }
+    return static_cast<Real>(i) / static_cast<Real>(order);
+}
+
+void append_triangle_face_interior(std::vector<Point>& nodes,
+                                   const Point& v0,
+                                   const Point& v1,
+                                   const Point& v2,
+                                   int order) {
+    for (int c = 1; c <= order - 2; ++c) {
+        for (int b = 1; b <= order - c - 1; ++b) {
+            const int a = order - b - c;
+            const Real la = static_cast<Real>(a) / static_cast<Real>(order);
+            const Real lb = static_cast<Real>(b) / static_cast<Real>(order);
+            const Real lc = static_cast<Real>(c) / static_cast<Real>(order);
+            nodes.push_back(v0 * la + v1 * lb + v2 * lc);
+        }
+    }
+}
+
+std::vector<Point> generate_line_nodes(int order) {
+    if (order == 0) {
+        return {Point{Real(0), Real(0), Real(0)}};
+    }
+
+    std::vector<Point> nodes;
+    nodes.reserve(static_cast<std::size_t>(order + 1));
+    nodes.push_back(Point{Real(-1), Real(0), Real(0)});
+    nodes.push_back(Point{Real(1), Real(0), Real(0)});
+    for (int i = 1; i < order; ++i) {
+        nodes.push_back(Point{line_coord_pm_one(i, order), Real(0), Real(0)});
+    }
+    return nodes;
+}
+
+std::vector<Point> generate_triangle_nodes(int order) {
+    if (order == 0) {
+        return {Point{Real(1) / Real(3), Real(1) / Real(3), Real(0)}};
+    }
+
+    std::vector<Point> nodes;
+    nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 2) / 2));
+
+    nodes.push_back(Point{Real(0), Real(0), Real(0)});
+    nodes.push_back(Point{Real(1), Real(0), Real(0)});
+    nodes.push_back(Point{Real(0), Real(1), Real(0)});
+
+    for (int m = 1; m < order; ++m) {
+        nodes.push_back(Point{line_coord_zero_one(m, order), Real(0), Real(0)});
+    }
+    for (int m = 1; m < order; ++m) {
+        nodes.push_back(Point{line_coord_zero_one(order - m, order),
+                              line_coord_zero_one(m, order), Real(0)});
+    }
+    for (int m = 1; m < order; ++m) {
+        nodes.push_back(Point{Real(0), line_coord_zero_one(order - m, order), Real(0)});
+    }
+
+    append_triangle_face_interior(
+        nodes,
+        Point{Real(0), Real(0), Real(0)},
+        Point{Real(1), Real(0), Real(0)},
+        Point{Real(0), Real(1), Real(0)},
+        order);
+
+    return nodes;
+}
+
+std::vector<Point> generate_quad_nodes(int order) {
+    if (order == 0) {
+        return {Point{Real(0), Real(0), Real(0)}};
+    }
+
+    std::vector<Point> nodes;
+    nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 1)));
+
+    nodes.push_back(Point{Real(-1), Real(-1), Real(0)});
+    nodes.push_back(Point{Real(1), Real(-1), Real(0)});
+    nodes.push_back(Point{Real(1), Real(1), Real(0)});
+    nodes.push_back(Point{Real(-1), Real(1), Real(0)});
+
+    for (int i = 1; i < order; ++i) {
+        nodes.push_back(Point{line_coord_pm_one(i, order), Real(-1), Real(0)});
+    }
+    for (int j = 1; j < order; ++j) {
+        nodes.push_back(Point{Real(1), line_coord_pm_one(j, order), Real(0)});
+    }
+    for (int i = order - 1; i >= 1; --i) {
+        nodes.push_back(Point{line_coord_pm_one(i, order), Real(1), Real(0)});
+    }
+    for (int j = order - 1; j >= 1; --j) {
+        nodes.push_back(Point{Real(-1), line_coord_pm_one(j, order), Real(0)});
+    }
+
+    for (int j = 1; j < order; ++j) {
+        for (int i = 1; i < order; ++i) {
+            nodes.push_back(Point{line_coord_pm_one(i, order), line_coord_pm_one(j, order), Real(0)});
+        }
+    }
+
+    return nodes;
+}
+
+std::vector<Point> generate_tetra_nodes(int order) {
+    if (order == 0) {
+        return {Point{Real(0.25), Real(0.25), Real(0.25)}};
+    }
+
+    std::vector<Point> nodes;
+    nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 2) * (order + 3) / 6));
+
+    const Point verts[] = {
+        Point{Real(0), Real(0), Real(0)},
+        Point{Real(1), Real(0), Real(0)},
+        Point{Real(0), Real(1), Real(0)},
+        Point{Real(0), Real(0), Real(1)},
+    };
+    for (const auto& v : verts) {
+        nodes.push_back(v);
+    }
+
+    const int edges[6][2] = {
+        {0, 1}, {1, 2}, {2, 0}, {0, 3}, {1, 3}, {2, 3}
+    };
+    for (const auto& edge : edges) {
+        for (int m = 1; m < order; ++m) {
+            const Real t = static_cast<Real>(m) / static_cast<Real>(order);
+            nodes.push_back(verts[edge[0]] * (Real(1) - t) + verts[edge[1]] * t);
+        }
+    }
+
+    const int faces[4][3] = {
+        {0, 1, 2},
+        {0, 1, 3},
+        {1, 2, 3},
+        {0, 2, 3},
+    };
+    for (const auto& face : faces) {
+        append_triangle_face_interior(
+            nodes,
+            verts[face[0]],
+            verts[face[1]],
+            verts[face[2]],
+            order);
+    }
+
+    for (int l = 1; l <= order - 3; ++l) {
+        for (int k = 1; k <= order - l - 2; ++k) {
+            for (int j = 1; j <= order - l - k - 1; ++j) {
+                const Real x = static_cast<Real>(j) / static_cast<Real>(order);
+                const Real y = static_cast<Real>(k) / static_cast<Real>(order);
+                const Real z = static_cast<Real>(l) / static_cast<Real>(order);
+                nodes.push_back(Point{x, y, z});
+            }
+        }
+    }
+
+    return nodes;
+}
+
+std::vector<Point> generate_hex_nodes(int order) {
+    if (order == 0) {
+        return {Point{Real(0), Real(0), Real(0)}};
+    }
+
+    std::vector<Point> nodes;
+    nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 1) * (order + 1)));
+
+    const Point verts[] = {
+        Point{Real(-1), Real(-1), Real(-1)},
+        Point{Real(1), Real(-1), Real(-1)},
+        Point{Real(1), Real(1), Real(-1)},
+        Point{Real(-1), Real(1), Real(-1)},
+        Point{Real(-1), Real(-1), Real(1)},
+        Point{Real(1), Real(-1), Real(1)},
+        Point{Real(1), Real(1), Real(1)},
+        Point{Real(-1), Real(1), Real(1)},
+    };
+    for (const auto& v : verts) {
+        nodes.push_back(v);
+    }
+
+    const int edges[12][2] = {
+        {0, 1}, {1, 2}, {2, 3}, {3, 0},
+        {4, 5}, {5, 6}, {6, 7}, {7, 4},
+        {0, 4}, {1, 5}, {2, 6}, {3, 7},
+    };
+    for (const auto& edge : edges) {
+        for (int m = 1; m < order; ++m) {
+            const Real t = static_cast<Real>(m) / static_cast<Real>(order);
+            nodes.push_back(verts[edge[0]] * (Real(1) - t) + verts[edge[1]] * t);
+        }
+    }
+
+    for (int j = 1; j < order; ++j) {
+        for (int i = 1; i < order; ++i) {
+            nodes.push_back(Point{line_coord_pm_one(i, order), line_coord_pm_one(j, order), Real(-1)});
+        }
+    }
+    for (int j = 1; j < order; ++j) {
+        for (int i = 1; i < order; ++i) {
+            nodes.push_back(Point{line_coord_pm_one(i, order), line_coord_pm_one(j, order), Real(1)});
+        }
+    }
+    for (int k = 1; k < order; ++k) {
+        for (int i = 1; i < order; ++i) {
+            nodes.push_back(Point{line_coord_pm_one(i, order), Real(-1), line_coord_pm_one(k, order)});
+        }
+    }
+    for (int k = 1; k < order; ++k) {
+        for (int j = 1; j < order; ++j) {
+            nodes.push_back(Point{Real(1), line_coord_pm_one(j, order), line_coord_pm_one(k, order)});
+        }
+    }
+    for (int k = 1; k < order; ++k) {
+        for (int i = order - 1; i >= 1; --i) {
+            nodes.push_back(Point{line_coord_pm_one(i, order), Real(1), line_coord_pm_one(k, order)});
+        }
+    }
+    for (int k = 1; k < order; ++k) {
+        for (int j = order - 1; j >= 1; --j) {
+            nodes.push_back(Point{Real(-1), line_coord_pm_one(j, order), line_coord_pm_one(k, order)});
+        }
+    }
+
+    for (int k = 1; k < order; ++k) {
+        for (int j = 1; j < order; ++j) {
+            for (int i = 1; i < order; ++i) {
+                nodes.push_back(Point{line_coord_pm_one(i, order),
+                                      line_coord_pm_one(j, order),
+                                      line_coord_pm_one(k, order)});
+            }
+        }
+    }
+
+    return nodes;
+}
+
+std::vector<Point> generate_wedge_nodes(int order) {
+    if (order == 0) {
+        return {Point{Real(1) / Real(3), Real(1) / Real(3), Real(0)}};
+    }
+
+    std::vector<Point> nodes;
+    nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 1) * (order + 2) / 2));
+
+    const Point verts[] = {
+        Point{Real(0), Real(0), Real(-1)},
+        Point{Real(1), Real(0), Real(-1)},
+        Point{Real(0), Real(1), Real(-1)},
+        Point{Real(0), Real(0), Real(1)},
+        Point{Real(1), Real(0), Real(1)},
+        Point{Real(0), Real(1), Real(1)},
+    };
+    for (const auto& v : verts) {
+        nodes.push_back(v);
+    }
+
+    const int edges[9][2] = {
+        {0, 1}, {1, 2}, {2, 0},
+        {3, 4}, {4, 5}, {5, 3},
+        {0, 3}, {1, 4}, {2, 5},
+    };
+    for (const auto& edge : edges) {
+        for (int m = 1; m < order; ++m) {
+            const Real t = static_cast<Real>(m) / static_cast<Real>(order);
+            nodes.push_back(verts[edge[0]] * (Real(1) - t) + verts[edge[1]] * t);
+        }
+    }
+
+    append_triangle_face_interior(
+        nodes, verts[0], verts[1], verts[2], order);
+    append_triangle_face_interior(
+        nodes, verts[3], verts[4], verts[5], order);
+
+    for (int r = 1; r < order; ++r) {
+        const Real z = line_coord_pm_one(r, order);
+        for (int m = 1; m < order; ++m) {
+            const Real t = static_cast<Real>(m) / static_cast<Real>(order);
+            nodes.push_back(Point{t, Real(0), z});
+        }
+        for (int m = 1; m < order; ++m) {
+            const Real t = static_cast<Real>(m) / static_cast<Real>(order);
+            nodes.push_back(Point{Real(1) - t, t, z});
+        }
+        for (int m = 1; m < order; ++m) {
+            const Real t = static_cast<Real>(m) / static_cast<Real>(order);
+            nodes.push_back(Point{Real(0), Real(1) - t, z});
+        }
+    }
+
+    for (int r = 1; r < order; ++r) {
+        const Real z = line_coord_pm_one(r, order);
+        for (int c = 1; c <= order - 2; ++c) {
+            for (int b = 1; b <= order - c - 1; ++b) {
+                const Real x = static_cast<Real>(b) / static_cast<Real>(order);
+                const Real y = static_cast<Real>(c) / static_cast<Real>(order);
+                nodes.push_back(Point{x, y, z});
+            }
+        }
+    }
+
+    return nodes;
+}
+
+std::vector<Point> generate_pyramid_nodes(int order) {
+    if (order == 0) {
+        return {Point{Real(0), Real(0), Real(0.25)}};
+    }
+
+    std::vector<Point> nodes;
+    nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 2) * (2 * order + 3) / 6));
+
+    nodes.push_back(Point{Real(-1), Real(-1), Real(0)});
+    nodes.push_back(Point{Real(1), Real(-1), Real(0)});
+    nodes.push_back(Point{Real(1), Real(1), Real(0)});
+    nodes.push_back(Point{Real(-1), Real(1), Real(0)});
+    nodes.push_back(Point{Real(0), Real(0), Real(1)});
+
+    for (int m = 1; m < order; ++m) {
+        nodes.push_back(Point{line_coord_pm_one(m, order), Real(-1), Real(0)});
+    }
+    for (int m = 1; m < order; ++m) {
+        nodes.push_back(Point{Real(1), line_coord_pm_one(m, order), Real(0)});
+    }
+    for (int m = order - 1; m >= 1; --m) {
+        nodes.push_back(Point{line_coord_pm_one(m, order), Real(1), Real(0)});
+    }
+    for (int m = order - 1; m >= 1; --m) {
+        nodes.push_back(Point{Real(-1), line_coord_pm_one(m, order), Real(0)});
+    }
+
+    for (int level = 1; level < order; ++level) {
+        const Real z = static_cast<Real>(level) / static_cast<Real>(order);
+        const Real scale = Real(1) - z;
+        nodes.push_back(Point{-scale, -scale, z});
+        nodes.push_back(Point{scale, -scale, z});
+        nodes.push_back(Point{scale, scale, z});
+        nodes.push_back(Point{-scale, scale, z});
+    }
+
+    for (int j = 1; j < order; ++j) {
+        for (int i = 1; i < order; ++i) {
+            nodes.push_back(Point{line_coord_pm_one(i, order), line_coord_pm_one(j, order), Real(0)});
+        }
+    }
+
+    for (int level = 1; level < order - 1; ++level) {
+        const int n = order - level;
+        const Real z = static_cast<Real>(level) / static_cast<Real>(order);
+        const Real scale = Real(1) - z;
+
+        for (int m = 1; m < n; ++m) {
+            const Real s = line_coord_pm_one(m, n) * scale;
+            nodes.push_back(Point{s, -scale, z});
+        }
+        for (int m = 1; m < n; ++m) {
+            const Real s = line_coord_pm_one(m, n) * scale;
+            nodes.push_back(Point{scale, s, z});
+        }
+        for (int m = n - 1; m >= 1; --m) {
+            const Real s = line_coord_pm_one(m, n) * scale;
+            nodes.push_back(Point{s, scale, z});
+        }
+        for (int m = n - 1; m >= 1; --m) {
+            const Real s = line_coord_pm_one(m, n) * scale;
+            nodes.push_back(Point{-scale, s, z});
+        }
+    }
+
+    for (int level = 1; level < order - 1; ++level) {
+        const int n = order - level;
+        const Real z = static_cast<Real>(level) / static_cast<Real>(order);
+        const Real scale = Real(1) - z;
+        for (int j = 1; j < n; ++j) {
+            for (int i = 1; i < n; ++i) {
+                nodes.push_back(Point{line_coord_pm_one(i, n) * scale,
+                                      line_coord_pm_one(j, n) * scale,
+                                      z});
+            }
+        }
+    }
+
+    return nodes;
+}
+
+} // namespace
+
+math::Vector<Real, 3> ReferenceNodeLayout::get_node_coords(ElementType elem_type,
+                                                     std::size_t local_node) {
+    const auto table = fixed_node_table(elem_type);
+    if (table.data != nullptr && local_node < table.size) {
+        return to_point(table.data[local_node]);
+    }
+
+    throw BasisNodeOrderingException("Invalid element type or node index in ReferenceNodeLayout::get_node_coords",
+                                     __FILE__, __LINE__, __func__);
+}
+
+std::size_t ReferenceNodeLayout::num_nodes(ElementType elem_type) {
+    const auto table = fixed_node_table(elem_type);
+    if (table.data != nullptr) {
+        return table.size;
+    }
+
+    throw BasisNodeOrderingException("Unknown element type in ReferenceNodeLayout::num_nodes",
+                                     __FILE__, __LINE__, __func__);
+}
+
+std::vector<math::Vector<Real, 3>>
+ReferenceNodeLayout::get_lagrange_node_coords(ElementType canonical_type, int order) {
+    if (order < 0) {
+        throw BasisNodeOrderingException("ReferenceNodeLayout::get_lagrange_node_coords requires non-negative order",
+                                         __FILE__, __LINE__, __func__);
+    }
+
+    const ElementType type = canonical_lagrange_type(canonical_type);
+    const auto fixed_table = fixed_complete_lagrange_table(type, order);
+    if (fixed_table.data != nullptr) {
+        std::vector<Point> nodes;
+        nodes.reserve(fixed_table.size);
+        for (std::size_t i = 0; i < fixed_table.size; ++i) {
+            nodes.push_back(to_point(fixed_table.data[i]));
+        }
+        return nodes;
+    }
+
+    switch (type) {
+        case ElementType::Point1:
+            return {Point{Real(0), Real(0), Real(0)}};
+        case ElementType::Line2:
+            return generate_line_nodes(order);
+        case ElementType::Triangle3:
+            return generate_triangle_nodes(order);
+        case ElementType::Quad4:
+            return generate_quad_nodes(order);
+        case ElementType::Tetra4:
+            return generate_tetra_nodes(order);
+        case ElementType::Hex8:
+            return generate_hex_nodes(order);
+        case ElementType::Wedge6:
+            return generate_wedge_nodes(order);
+        case ElementType::Pyramid5:
+            return generate_pyramid_nodes(order);
+        case ElementType::Quad8:
+        case ElementType::Hex20:
+        case ElementType::Wedge15:
+        case ElementType::Pyramid13:
+            throw BasisNodeOrderingException("ReferenceNodeLayout::get_lagrange_node_coords does not support serendipity topologies",
+                                             __FILE__, __LINE__, __func__);
+        default:
+            throw BasisNodeOrderingException("ReferenceNodeLayout::get_lagrange_node_coords: unsupported topology",
+                                             __FILE__, __LINE__, __func__);
+    }
+}
+
+std::span<const std::size_t> ReferenceNodeLayout::mesh_to_basis_ordering(ElementType elem_type) {
+    if (elem_type == ElementType::Hex20) {
+        return std::span<const std::size_t>(
+            kHex20MeshToBasisOrder.data(),
+            kHex20MeshToBasisOrder.size());
+    }
+    return {};
+}
+
+bool ReferenceNodeLayout::is_simplex(ElementType elem_type) {
+    return svmp::FE::basis::is_simplex(elem_type);
+}
+
+bool ReferenceNodeLayout::is_tensor_product(ElementType elem_type) {
+    return svmp::FE::basis::is_tensor_product(elem_type);
+}
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
new file mode 100644
index 000000000..52af4d932
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
@@ -0,0 +1,538 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_BASIS_NODEORDERINGCONVENTIONS_H
+#define SVMP_FE_BASIS_NODEORDERINGCONVENTIONS_H
+
+#include "Types.h"
+#include "Math/Vector.h"
+#include <cstddef>
+#include <vector>
+
+/**
+ * @file NodeOrderingConventions.h
+ * @brief Documentation of node ordering conventions for all element types
+ *
+ * This file provides comprehensive documentation of the node ordering
+ * conventions used throughout the FE library. These orderings are consistent
+ * with VTK conventions and must be matched exactly when interfacing with
+ * the Mesh library.
+ *
+ * IMPORTANT: The FE library (Basis, Quadrature, Geometry) uses "node" to refer
+ * to degrees of freedom locations on reference elements. The Mesh library uses
+ * "vertex" for geometry vertices and "cell" for mesh elements. When interfacing
+ * between the two, ensure consistent ordering.
+ *
+ * Reference Element Conventions:
+ * - Line:       xi in [-1, 1]
+ * - Quad:       (xi, eta) in [-1, 1] x [-1, 1]
+ * - Hex:        (xi, eta, zeta) in [-1, 1]^3
+ * - Triangle:   (xi, eta) in simplex with vertices (0,0), (1,0), (0,1)
+ * - Tetrahedron: (xi, eta, zeta) in simplex with vertices
+ *                (0,0,0), (1,0,0), (0,1,0), (0,0,1)
+ * - Wedge:      Triangle base x line height, zeta in [-1, 1]
+ * - Pyramid:    Quad base at z=0, apex at (0, 0, 1)
+ *
+ *
+ * =============================================================================
+ * 1D ELEMENTS
+ * =============================================================================
+ *
+ * Line2 (Linear Line)
+ * -------------------
+ *   0---------1
+ *   |         |
+ *  xi=-1     xi=+1
+ *
+ * Node 0: xi = -1
+ * Node 1: xi = +1
+ *
+ *
+ * Line3 (Quadratic Line)
+ * ----------------------
+ *   0----2----1
+ *   |    |    |
+ *  xi=-1 0   xi=+1
+ *
+ * Node 0: xi = -1
+ * Node 1: xi = +1
+ * Node 2: xi =  0 (mid-edge)
+ *
+ *
+ * =============================================================================
+ * 2D QUADRILATERAL ELEMENTS
+ * =============================================================================
+ *
+ * Quad4 (Bilinear Quadrilateral)
+ * ------------------------------
+ *
+ *   3-----------2
+ *   |           |
+ *   |           |
+ *   |           |
+ *   0-----------1
+ *
+ * Node 0: (xi, eta) = (-1, -1)
+ * Node 1: (xi, eta) = (+1, -1)
+ * Node 2: (xi, eta) = (+1, +1)
+ * Node 3: (xi, eta) = (-1, +1)
+ *
+ *
+ * Quad8 (Serendipity Quadrilateral)
+ * ---------------------------------
+ *
+ *   3-----6-----2
+ *   |           |
+ *   7           5
+ *   |           |
+ *   0-----4-----1
+ *
+ * Corners (same as Quad4):
+ *   Node 0: (-1, -1)
+ *   Node 1: (+1, -1)
+ *   Node 2: (+1, +1)
+ *   Node 3: (-1, +1)
+ *
+ * Mid-edge nodes:
+ *   Node 4: ( 0, -1)  (edge 0-1)
+ *   Node 5: (+1,  0)  (edge 1-2)
+ *   Node 6: ( 0, +1)  (edge 2-3)
+ *   Node 7: (-1,  0)  (edge 3-0)
+ *
+ *
+ * Quad9 (Biquadratic Quadrilateral)
+ * ---------------------------------
+ *
+ *   3-----6-----2
+ *   |           |
+ *   7     8     5
+ *   |           |
+ *   0-----4-----1
+ *
+ * Same as Quad8 plus:
+ *   Node 8: (0, 0)  (center)
+ *
+ *
+ * =============================================================================
+ * 3D HEXAHEDRAL ELEMENTS
+ * =============================================================================
+ *
+ * Hex8 (Trilinear Hexahedron)
+ * ---------------------------
+ *
+ *        7-----------6
+ *       /|          /|
+ *      / |         / |
+ *     4-----------5  |
+ *     |  |        |  |
+ *     |  3--------|--2
+ *     | /         | /
+ *     |/          |/
+ *     0-----------1
+ *
+ * Bottom face (zeta = -1):
+ *   Node 0: (xi, eta, zeta) = (-1, -1, -1)
+ *   Node 1: (xi, eta, zeta) = (+1, -1, -1)
+ *   Node 2: (xi, eta, zeta) = (+1, +1, -1)
+ *   Node 3: (xi, eta, zeta) = (-1, +1, -1)
+ *
+ * Top face (zeta = +1):
+ *   Node 4: (xi, eta, zeta) = (-1, -1, +1)
+ *   Node 5: (xi, eta, zeta) = (+1, -1, +1)
+ *   Node 6: (xi, eta, zeta) = (+1, +1, +1)
+ *   Node 7: (xi, eta, zeta) = (-1, +1, +1)
+ *
+ *
+ * Hex20 (Serendipity Hexahedron)
+ * ------------------------------
+ *
+ *        7-----14-----6
+ *       /|           /|
+ *     15 |         13 |
+ *     /  19        /  18
+ *    4-----12-----5   |
+ *    |   |        |   |
+ *    |   3-----10-|---2
+ *   16  /        17  /
+ *    | 11         | 9
+ *    |/           |/
+ *    0------8-----1
+ *
+ * Corners (same as Hex8): Nodes 0-7
+ *
+ * Mid-edge nodes on bottom face (zeta = -1):
+ *   Node 8:  ( 0, -1, -1)  (edge 0-1)
+ *   Node 9:  (+1,  0, -1)  (edge 1-2)
+ *   Node 10: ( 0, +1, -1)  (edge 2-3)
+ *   Node 11: (-1,  0, -1)  (edge 3-0)
+ *
+ * Mid-edge nodes on top face (zeta = +1):
+ *   Node 12: ( 0, -1, +1)  (edge 4-5)
+ *   Node 13: (+1,  0, +1)  (edge 5-6)
+ *   Node 14: ( 0, +1, +1)  (edge 6-7)
+ *   Node 15: (-1,  0, +1)  (edge 7-4)
+ *
+ * Mid-edge nodes on vertical edges:
+ *   Node 16: (-1, -1,  0)  (edge 0-4)
+ *   Node 17: (+1, -1,  0)  (edge 1-5)
+ *   Node 18: (+1, +1,  0)  (edge 2-6)
+ *   Node 19: (-1, +1,  0)  (edge 3-7)
+ *
+ *
+ * Hex27 (Triquadratic Hexahedron)
+ * -------------------------------
+ * Same as Hex20 plus face-center and body-center nodes:
+ *
+ * Face centers:
+ *   Node 20: ( 0,  0, -1)  (bottom face)
+ *   Node 21: ( 0,  0, +1)  (top face)
+ *   Node 22: ( 0, -1,  0)  (front face)
+ *   Node 23: (+1,  0,  0)  (right face)
+ *   Node 24: ( 0, +1,  0)  (back face)
+ *   Node 25: (-1,  0,  0)  (left face)
+ *
+ * Body center:
+ *   Node 26: (0, 0, 0)
+ *
+ *
+ * =============================================================================
+ * 2D TRIANGULAR ELEMENTS
+ * =============================================================================
+ *
+ * Triangle3 (Linear Triangle)
+ * ---------------------------
+ *
+ *   2
+ *   |\
+ *   | \
+ *   |  \
+ *   |   \
+ *   0----1
+ *
+ * Reference: (xi, eta) simplex with vertices at:
+ *   Node 0: (xi, eta) = (0, 0)
+ *   Node 1: (xi, eta) = (1, 0)
+ *   Node 2: (xi, eta) = (0, 1)
+ *
+ *
+ * Triangle6 (Quadratic Triangle)
+ * ------------------------------
+ *
+ *   2
+ *   |\
+ *   | \
+ *   5  4
+ *   |   \
+ *   0--3--1
+ *
+ * Corners: Nodes 0-2 (same as Triangle3)
+ *
+ * Mid-edge nodes:
+ *   Node 3: (0.5,   0)  (edge 0-1)
+ *   Node 4: (0.5, 0.5)  (edge 1-2)
+ *   Node 5: (  0, 0.5)  (edge 2-0)
+ *
+ *
+ * =============================================================================
+ * 3D TETRAHEDRAL ELEMENTS
+ * =============================================================================
+ *
+ * Tetrahedron4 (Linear Tetrahedron)
+ * ---------------------------------
+ *
+ *             3
+ *            /|\
+ *           / | \
+ *          /  |  \
+ *         /   |   \
+ *        /    |    \
+ *       0-----|-----2
+ *        \    |    /
+ *         \   |   /
+ *          \  |  /
+ *           \ | /
+ *            \|/
+ *             1
+ *
+ * Reference: (xi, eta, zeta) simplex with vertices at:
+ *   Node 0: (0, 0, 0)
+ *   Node 1: (1, 0, 0)
+ *   Node 2: (0, 1, 0)
+ *   Node 3: (0, 0, 1)
+ *
+ *
+ * Tetrahedron10 (Quadratic Tetrahedron)
+ * -------------------------------------
+ * Corners: Nodes 0-3 (same as Tet4)
+ *
+ * Mid-edge nodes:
+ *   Node 4: (0.5,   0,   0)  (edge 0-1)
+ *   Node 5: (0.5, 0.5,   0)  (edge 1-2)
+ *   Node 6: (  0, 0.5,   0)  (edge 2-0)
+ *   Node 7: (  0,   0, 0.5)  (edge 0-3)
+ *   Node 8: (0.5,   0, 0.5)  (edge 1-3)
+ *   Node 9: (  0, 0.5, 0.5)  (edge 2-3)
+ *
+ *
+ * =============================================================================
+ * 3D WEDGE (PRISM) ELEMENTS
+ * =============================================================================
+ *
+ * Wedge6 (Linear Wedge)
+ * ---------------------
+ *
+ *         5
+ *        /|\
+ *       / | \
+ *      /  |  \
+ *     3---|---4
+ *     |   2   |
+ *     |  / \  |
+ *     | /   \ |
+ *     |/     \|
+ *     0-------1
+ *
+ * Reference: Triangle base at zeta = -1, top at zeta = +1
+ *
+ * Bottom face (zeta = -1):
+ *   Node 0: (0, 0, -1)
+ *   Node 1: (1, 0, -1)
+ *   Node 2: (0, 1, -1)
+ *
+ * Top face (zeta = +1):
+ *   Node 3: (0, 0, +1)
+ *   Node 4: (1, 0, +1)
+ *   Node 5: (0, 1, +1)
+ *
+ *
+ * Wedge15 (Quadratic Wedge)
+ * -------------------------
+ * Corners: Nodes 0-5 (same as Wedge6)
+ *
+ * Mid-edge nodes on bottom face:
+ *   Node 6:  (0.5,   0, -1)  (edge 0-1)
+ *   Node 7:  (0.5, 0.5, -1)  (edge 1-2)
+ *   Node 8:  (  0, 0.5, -1)  (edge 2-0)
+ *
+ * Mid-edge nodes on top face:
+ *   Node 9:  (0.5,   0, +1)  (edge 3-4)
+ *   Node 10: (0.5, 0.5, +1)  (edge 4-5)
+ *   Node 11: (  0, 0.5, +1)  (edge 5-3)
+ *
+ * Mid-edge nodes on vertical edges:
+ *   Node 12: (0, 0, 0)  (edge 0-3)
+ *   Node 13: (1, 0, 0)  (edge 1-4)
+ *   Node 14: (0, 1, 0)  (edge 2-5)
+ *
+ *
+ * Wedge18 (Complete Quadratic Wedge)
+ * ----------------------------------
+ * Corners and mid-edges: Nodes 0-14 (same as Wedge15)
+ *
+ * Face-center nodes on quadrilateral faces:
+ *   Node 15: (0.5, 0.0, 0.0)  (face with vertices 0-1-4-3, y = 0)
+ *   Node 16: (0.5, 0.5, 0.0)  (face with vertices 1-2-5-4, x + y = 1)
+ *   Node 17: (0.0, 0.5, 0.0)  (face with vertices 2-0-3-5, x = 0)
+ *
+ *
+ * =============================================================================
+ * 3D PYRAMID ELEMENTS
+ * =============================================================================
+ *
+ * Pyramid5 (Linear Pyramid)
+ * -------------------------
+ *
+ *           4
+ *          /|\
+ *         / | \
+ *        /  |  \
+ *       /   |   \
+ *      3----|----2
+ *      |    |    |
+ *      |    +    |   (apex projects to center of base)
+ *      |         |
+ *      0---------1
+ *
+ * Reference: Quad base in xi-eta plane at zeta = 0, apex at zeta = 1
+ *
+ * Base (zeta = 0):
+ *   Node 0: (-1, -1, 0)
+ *   Node 1: (+1, -1, 0)
+ *   Node 2: (+1, +1, 0)
+ *   Node 3: (-1, +1, 0)
+ *
+ * Apex:
+ *   Node 4: (0, 0, 1)
+ *
+ *
+ * Pyramid13 (Quadratic Pyramid)
+ * -----------------------------
+ * Corners: Nodes 0-4 (same as Pyramid5)
+ *
+ * Mid-edge nodes on base:
+ *   Node 5: ( 0, -1, 0)  (edge 0-1)
+ *   Node 6: (+1,  0, 0)  (edge 1-2)
+ *   Node 7: ( 0, +1, 0)  (edge 2-3)
+ *   Node 8: (-1,  0, 0)  (edge 3-0)
+ *
+ * Mid-edge nodes to apex:
+ *   Node 9:  (-0.5, -0.5, 0.5)  (edge 0-4)
+ *   Node 10: (+0.5, -0.5, 0.5)  (edge 1-4)
+ *   Node 11: (+0.5, +0.5, 0.5)  (edge 2-4)
+ *   Node 12: (-0.5, +0.5, 0.5)  (edge 3-4)
+ *
+ *
+ * Pyramid14 (Quadratic Rational Pyramid)
+ * --------------------------------------
+ *
+ * This retained low-order compatibility layout matches the generated
+ * complete-family quadratic Lagrange ordering for the reference pyramid with
+ * base (-1,-1,0)..(1,1,0) and apex at (0,0,1). Nodes 0-12 coincide with the
+ * Pyramid13 layout; node 13 is the base center.
+ *
+ *   Base corners (same as Pyramid5):
+ *     Node 0: (-1, -1, 0)
+ *     Node 1: (+1, -1, 0)
+ *     Node 2: (+1, +1, 0)
+ *     Node 3: (-1, +1, 0)
+ *
+ *   Apex:
+ *     Node 4: (0, 0, 1)
+ *
+ *   Base mid-edges (same as Pyramid13):
+ *     Node 5:  ( 0, -1, 0)   (edge 0-1)
+ *     Node 6:  (+1,  0, 0)   (edge 1-2)
+ *     Node 7:  ( 0, +1, 0)   (edge 2-3)
+ *     Node 8:  (-1,  0, 0)   (edge 3-0)
+ *
+ *   Mid-edges to apex (same as Pyramid13):
+ *     Node 9:  (-0.5, -0.5, 0.5)  (edge 0-4)
+ *     Node 10: (+0.5, -0.5, 0.5)  (edge 1-4)
+ *     Node 11: (+0.5, +0.5, 0.5)  (edge 2-4)
+ *     Node 12: (-0.5, +0.5, 0.5)  (edge 3-4)
+ *
+ *   Base center:
+ *     Node 13: (0, 0, 0)
+ *
+ *
+ * =============================================================================
+ * NOTES ON VTK COMPATIBILITY
+ * =============================================================================
+ *
+ * The node orderings above are consistent with VTK cell types:
+ *
+ *   VTK_LINE           (3)  -> Line2
+ *   VTK_QUADRATIC_EDGE (21) -> Line3
+ *   VTK_TRIANGLE       (5)  -> Triangle3
+ *   VTK_QUADRATIC_TRIANGLE (22) -> Triangle6
+ *   VTK_QUAD           (9)  -> Quad4
+ *   VTK_QUADRATIC_QUAD (23) -> Quad8
+ *   VTK_BIQUADRATIC_QUAD (28) -> Quad9
+ *   VTK_TETRA          (10) -> Tetrahedron4
+ *   VTK_QUADRATIC_TETRA (24) -> Tetrahedron10
+ *   VTK_HEXAHEDRON     (12) -> Hex8
+ *   VTK_QUADRATIC_HEXAHEDRON (25) -> Hex20
+ *   VTK_TRIQUADRATIC_HEXAHEDRON (29) -> Hex27
+ *   VTK_WEDGE          (13) -> Wedge6
+ *   VTK_QUADRATIC_WEDGE (26) -> Wedge15
+ *   VTK_BIQUADRATIC_QUADRATIC_WEDGE (32) -> Wedge18
+ *   VTK_PYRAMID        (14) -> Pyramid5
+ *   VTK_QUADRATIC_PYRAMID (27) -> Pyramid13
+ *
+ *
+ * =============================================================================
+ * BARYCENTRIC COORDINATES
+ * =============================================================================
+ *
+ * For simplex elements, barycentric coordinates (lambda_0, ..., lambda_n)
+ * satisfy sum(lambda_i) = 1.
+ *
+ * Triangle:
+ *   lambda_0 = 1 - xi - eta
+ *   lambda_1 = xi
+ *   lambda_2 = eta
+ *
+ * Tetrahedron:
+ *   lambda_0 = 1 - xi - eta - zeta
+ *   lambda_1 = xi
+ *   lambda_2 = eta
+ *   lambda_3 = zeta
+ *
+ */
+
+#include <span>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+/**
+ * @brief Basis-side reference node coordinate queries
+ *
+ * This is intentionally named differently from `svmp::NodeOrdering` in Mesh,
+ * which handles mesh-format permutations rather than reference basis layouts.
+ */
+class ReferenceNodeLayout {
+public:
+    /**
+     * @brief Get reference coordinates for a node
+     * @param elem_type Element type
+     * @param local_node Local node index (0-based)
+     *
+     * Complete-family low-order Lagrange aliases (`Line2/3`, `Triangle3/6`,
+     * `Quad4/9`, `Tetra4/10`, `Hex8/27`, `Wedge6/18`, `Pyramid5/14`) are
+     * served by the generated arbitrary-order Lagrange ordering path. Explicit
+     * hard-coded tables remain only for serendipity-only enums such as
+     * `Quad8`, `Hex20`, `Wedge15`, and `Pyramid13`.
+     *
+     * @return Reference coordinates (xi, eta, zeta)
+     */
+    static math::Vector<Real, 3> get_node_coords(ElementType elem_type, std::size_t local_node);
+
+    /**
+     * @brief Get number of nodes for an element type
+     *
+     * The low-order complete-family Lagrange aliases share the same generated
+     * ordering path used by `get_node_coords`.
+     */
+    static std::size_t num_nodes(ElementType elem_type);
+
+    /**
+     * @brief Generate complete-family Lagrange node coordinates for a canonical topology and order
+     *
+     * This covers arbitrary-order complete nodal Lagrange spaces on the
+     * canonical topologies `Line2`, `Triangle3`, `Quad4`, `Tetra4`, `Hex8`,
+     * `Wedge6`, and `Pyramid5`. Serendipity variants are intentionally
+     * excluded.
+     */
+    static std::vector<math::Vector<Real, 3>>
+    get_lagrange_node_coords(ElementType canonical_type, int order);
+
+    /**
+     * @brief Optional mapping from mesh/reference node order to internal basis order
+     *
+     * Returns an empty span when the public node order is already the basis
+     * table order or no special mapping is registered.
+     */
+    static std::span<const std::size_t> mesh_to_basis_ordering(ElementType elem_type);
+
+    /**
+     * @brief Check if element is a simplex (triangle, tetrahedron)
+     */
+    static bool is_simplex(ElementType elem_type);
+
+    /**
+     * @brief Check if element uses tensor-product topology
+     */
+    static bool is_tensor_product(ElementType elem_type);
+};
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_NODEORDERINGCONVENTIONS_H
diff --git a/Code/Source/solver/FE/Basis/PyramidModalBasis.h b/Code/Source/solver/FE/Basis/PyramidModalBasis.h
new file mode 100644
index 000000000..1ecdae282
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/PyramidModalBasis.h
@@ -0,0 +1,265 @@
+#ifndef SVMP_FE_BASIS_PYRAMIDMODALBASIS_H
+#define SVMP_FE_BASIS_PYRAMIDMODALBASIS_H
+
+// Shared rational/modal pyramid helpers for scalar complete-family and spectral
+// pyramid bases. The degenerate z=1 top plane is evaluated by its apex limit;
+// callers that reject non-apex top-plane queries must validate before calling.
+
+#include "BasisFunction.h"
+#include "BasisTolerance.h"
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+namespace pyramid_modal {
+
+struct Term {
+    int px{0};
+    int py{0};
+    int pz{0};
+    int denom_power{0};
+};
+
+struct EvaluationPoint {
+    Real x{Real(0)};
+    Real y{Real(0)};
+    Real z{Real(0)};
+    Real t{Real(1)};
+    bool top_plane{false};
+    std::vector<Real> x_powers;
+    std::vector<Real> y_powers;
+    std::vector<Real> z_powers;
+    std::vector<Real> t_powers;
+};
+
+inline std::vector<Term> build_terms(int order) {
+    std::vector<Term> terms;
+    terms.reserve(static_cast<std::size_t>((order + 1) * (order + 2) *
+                                           (2 * order + 3) / 6));
+    for (int pz = 0; pz <= order; ++pz) {
+        const int n = order - pz;
+        for (int py = 0; py <= n; ++py) {
+            for (int px = 0; px <= n; ++px) {
+                terms.push_back(Term{px, py, pz, std::min(px, py)});
+            }
+        }
+    }
+    return terms;
+}
+
+inline bool on_degenerate_top_plane(const math::Vector<Real, 3>& xi,
+                                    Real tolerance = detail::basis_scaled_tolerance()) {
+    return std::abs(Real(1) - xi[2]) <= tolerance;
+}
+
+inline void fill_powers(Real base, int max_power, std::vector<Real>& powers) {
+    powers.assign(static_cast<std::size_t>(max_power + 1), Real(1));
+    for (int p = 1; p <= max_power; ++p) {
+        powers[static_cast<std::size_t>(p)] =
+            powers[static_cast<std::size_t>(p - 1)] * base;
+    }
+}
+
+inline void prepare_evaluation_point(const math::Vector<Real, 3>& xi,
+                                     int max_px,
+                                     int max_py,
+                                     int max_pz,
+                                     int max_denom_power,
+                                     EvaluationPoint& point) {
+    point.x = xi[0];
+    point.y = xi[1];
+    point.z = xi[2];
+    point.t = Real(1) - point.z;
+    point.top_plane = on_degenerate_top_plane(xi);
+
+    fill_powers(point.x, std::max(max_px, 0), point.x_powers);
+    fill_powers(point.y, std::max(max_py, 0), point.y_powers);
+    fill_powers(point.z, std::max(max_pz, 0), point.z_powers);
+    if (point.top_plane) [[unlikely]] {
+        point.t_powers.assign(1u, Real(1));
+    } else {
+        fill_powers(point.t, std::max(max_denom_power + 2, 0), point.t_powers);
+    }
+}
+
+inline void prepare_evaluation_point(const std::vector<Term>& terms,
+                                     const math::Vector<Real, 3>& xi,
+                                     EvaluationPoint& point) {
+    int max_px = 0;
+    int max_py = 0;
+    int max_pz = 0;
+    int max_denom_power = 0;
+    for (const Term& term : terms) {
+        max_px = std::max(max_px, term.px);
+        max_py = std::max(max_py, term.py);
+        max_pz = std::max(max_pz, term.pz);
+        max_denom_power = std::max(max_denom_power, term.denom_power);
+    }
+    prepare_evaluation_point(xi, max_px, max_py, max_pz, max_denom_power, point);
+}
+
+inline void evaluate_term(const Term& term,
+                          const EvaluationPoint& point,
+                          Real& value,
+                          Gradient* gradient = nullptr,
+                          Hessian* hessian = nullptr) {
+    const auto pow_x = [&](int p) -> Real {
+        return point.x_powers[static_cast<std::size_t>(p)];
+    };
+    const auto pow_y = [&](int p) -> Real {
+        return point.y_powers[static_cast<std::size_t>(p)];
+    };
+    const auto pow_z = [&](int p) -> Real {
+        return point.z_powers[static_cast<std::size_t>(p)];
+    };
+    const auto pow_t = [&](int p) -> Real {
+        return point.t_powers[static_cast<std::size_t>(p)];
+    };
+
+    if (point.top_plane) [[unlikely]] {
+        if (term.px == 0 && term.py == 0) {
+            value = pow_z(term.pz);
+        } else {
+            value = Real(0);
+        }
+        if (gradient != nullptr) {
+            *gradient = Gradient{};
+            if (term.px == 0 && term.py == 0 && term.pz > 0) {
+                (*gradient)[2] = static_cast<Real>(term.pz) * pow_z(term.pz - 1);
+            }
+        }
+        if (hessian != nullptr) {
+            *hessian = Hessian{};
+            if (term.px == 0 && term.py == 0 && term.pz > 1) {
+                (*hessian)(2, 2) =
+                    static_cast<Real>(term.pz * (term.pz - 1)) *
+                    pow_z(term.pz - 2);
+            }
+        }
+        return;
+    }
+
+    const Real base = pow_x(term.px) * pow_y(term.py) * pow_z(term.pz);
+    const Real denom = pow_t(term.denom_power);
+    value = base / denom;
+
+    if (gradient != nullptr) {
+        *gradient = Gradient{};
+        if (term.px > 0) {
+            (*gradient)[0] =
+                static_cast<Real>(term.px) * pow_x(term.px - 1) *
+                pow_y(term.py) * pow_z(term.pz) / denom;
+        }
+        if (term.py > 0) {
+            (*gradient)[1] =
+                static_cast<Real>(term.py) * pow_x(term.px) *
+                pow_y(term.py - 1) * pow_z(term.pz) / denom;
+        }
+
+        Real gz = Real(0);
+        if (term.pz > 0) {
+            gz += static_cast<Real>(term.pz) * pow_x(term.px) *
+                  pow_y(term.py) * pow_z(term.pz - 1) / denom;
+        }
+        if (term.denom_power > 0) {
+            gz += static_cast<Real>(term.denom_power) * base / pow_t(term.denom_power + 1);
+        }
+        (*gradient)[2] = gz;
+    }
+
+    if (hessian == nullptr) {
+        return;
+    }
+
+    *hessian = Hessian{};
+    if (term.px > 1) {
+        (*hessian)(0, 0) =
+            static_cast<Real>(term.px * (term.px - 1)) *
+            pow_x(term.px - 2) * pow_y(term.py) * pow_z(term.pz) / denom;
+    }
+    if (term.py > 1) {
+        (*hessian)(1, 1) =
+            static_cast<Real>(term.py * (term.py - 1)) *
+            pow_x(term.px) * pow_y(term.py - 2) * pow_z(term.pz) / denom;
+    }
+    if (term.px > 0 && term.py > 0) {
+        const Real hxy =
+            static_cast<Real>(term.px * term.py) *
+            pow_x(term.px - 1) * pow_y(term.py - 1) * pow_z(term.pz) / denom;
+        (*hessian)(0, 1) = hxy;
+        (*hessian)(1, 0) = hxy;
+    }
+
+    if (term.px > 0) {
+        Real hxz =
+            static_cast<Real>(term.px) * pow_x(term.px - 1) *
+            pow_y(term.py) / denom;
+        if (term.pz > 0) {
+            hxz *= static_cast<Real>(term.pz) * pow_z(term.pz - 1);
+        } else {
+            hxz = Real(0);
+        }
+        if (term.denom_power > 0) {
+            hxz += static_cast<Real>(term.px * term.denom_power) *
+                   pow_x(term.px - 1) * pow_y(term.py) *
+                   pow_z(term.pz) / pow_t(term.denom_power + 1);
+        }
+        (*hessian)(0, 2) = hxz;
+        (*hessian)(2, 0) = hxz;
+    }
+
+    if (term.py > 0) {
+        Real hyz =
+            static_cast<Real>(term.py) * pow_x(term.px) *
+            pow_y(term.py - 1) / denom;
+        if (term.pz > 0) {
+            hyz *= static_cast<Real>(term.pz) * pow_z(term.pz - 1);
+        } else {
+            hyz = Real(0);
+        }
+        if (term.denom_power > 0) {
+            hyz += static_cast<Real>(term.py * term.denom_power) *
+                   pow_x(term.px) * pow_y(term.py - 1) *
+                   pow_z(term.pz) / pow_t(term.denom_power + 1);
+        }
+        (*hessian)(1, 2) = hyz;
+        (*hessian)(2, 1) = hyz;
+    }
+
+    Real hzz = Real(0);
+    if (term.pz > 1) {
+        hzz += static_cast<Real>(term.pz * (term.pz - 1)) *
+               pow_x(term.px) * pow_y(term.py) * pow_z(term.pz - 2) / denom;
+    }
+    if (term.pz > 0 && term.denom_power > 0) {
+        hzz += static_cast<Real>(2 * term.pz * term.denom_power) *
+               pow_x(term.px) * pow_y(term.py) *
+               pow_z(term.pz - 1) / pow_t(term.denom_power + 1);
+    }
+    if (term.denom_power > 0) {
+        hzz += static_cast<Real>(term.denom_power * (term.denom_power + 1)) *
+               base / pow_t(term.denom_power + 2);
+    }
+    (*hessian)(2, 2) = hzz;
+}
+
+inline void evaluate_term(const Term& term,
+                          const math::Vector<Real, 3>& xi,
+                          Real& value,
+                          Gradient* gradient = nullptr,
+                          Hessian* hessian = nullptr) {
+    EvaluationPoint point;
+    prepare_evaluation_point(
+        xi, term.px, term.py, term.pz, term.denom_power, point);
+    evaluate_term(term, point, value, gradient, hessian);
+}
+
+} // namespace pyramid_modal
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_PYRAMIDMODALBASIS_H
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
new file mode 100644
index 000000000..309fd18be
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -0,0 +1,882 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#include "SerendipityBasis.h"
+#include "LagrangeBasis.h"
+#include "NodeOrderingConventions.h"
+#include "Math/DenseLinearAlgebra.h"
+#include "Math/IntegerMath.h"
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <span>
+#include <string>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+using math::pow_int;
+
+namespace {
+using Vec3 = math::Vector<Real, 3>;
+
+int quad_serendipity_superlinear_degree(int ax, int ay) {
+    return (ax > 1 ? ax : 0) + (ay > 1 ? ay : 0);
+}
+
+std::vector<std::array<int, 2>> quad_serendipity_exponents(int order) {
+    std::vector<std::array<int, 2>> exponents;
+    for (int ay = 0; ay <= order; ++ay) {
+        for (int ax = 0; ax <= order; ++ax) {
+            if (quad_serendipity_superlinear_degree(ax, ay) <= order) {
+                exponents.push_back({ax, ay});
+            }
+        }
+    }
+    return exponents;
+}
+
+std::vector<Vec3> quad_serendipity_nodes(int order, std::size_t total_size) {
+    std::vector<Vec3> nodes;
+    if (order <= 0) {
+        return nodes;
+    }
+
+    const Real inv_order = Real(1) / Real(order);
+
+    nodes.push_back(Vec3{Real(-1), Real(-1), Real(0)});
+    nodes.push_back(Vec3{Real(1),  Real(-1), Real(0)});
+    nodes.push_back(Vec3{Real(1),  Real(1),  Real(0)});
+    nodes.push_back(Vec3{Real(-1), Real(1),  Real(0)});
+
+    for (int i = 1; i < order; ++i) {
+        nodes.push_back(Vec3{Real(-1) + Real(2 * i) * inv_order, Real(-1), Real(0)});
+    }
+    for (int i = 1; i < order; ++i) {
+        nodes.push_back(Vec3{Real(1), Real(-1) + Real(2 * i) * inv_order, Real(0)});
+    }
+    for (int i = 1; i < order; ++i) {
+        nodes.push_back(Vec3{Real(1) - Real(2 * i) * inv_order, Real(1), Real(0)});
+    }
+    for (int i = 1; i < order; ++i) {
+        nodes.push_back(Vec3{Real(-1), Real(1) - Real(2 * i) * inv_order, Real(0)});
+    }
+
+    if (nodes.size() > total_size) {
+        throw BasisConstructionException(
+            "SerendipityBasis: quadrilateral serendipity boundary nodes exceed requested size",
+            __FILE__, __LINE__, __func__);
+    }
+
+    const std::size_t interior_count = total_size - nodes.size();
+    if (interior_count == 0u) {
+        return nodes;
+    }
+
+    std::vector<Vec3> interior_candidates;
+    interior_candidates.reserve(static_cast<std::size_t>((order - 1) * (order - 1)));
+    for (int j = 1; j < order; ++j) {
+        for (int i = 1; i < order; ++i) {
+            interior_candidates.push_back(
+                Vec3{Real(-1) + Real(2 * i) * inv_order,
+                     Real(-1) + Real(2 * j) * inv_order,
+                     Real(0)});
+        }
+    }
+
+    std::sort(interior_candidates.begin(), interior_candidates.end(),
+              [](const Vec3& a, const Vec3& b) {
+                  const Real a_linf = std::max(std::abs(a[0]), std::abs(a[1]));
+                  const Real b_linf = std::max(std::abs(b[0]), std::abs(b[1]));
+                  if (a_linf != b_linf) {
+                      return a_linf < b_linf;
+                  }
+
+                  const Real a_l1 = std::abs(a[0]) + std::abs(a[1]);
+                  const Real b_l1 = std::abs(b[0]) + std::abs(b[1]);
+                  if (a_l1 != b_l1) {
+                      return a_l1 < b_l1;
+                  }
+
+                  if (a[1] != b[1]) {
+                      return a[1] < b[1];
+                  }
+                  return a[0] < b[0];
+              });
+
+    if (interior_count > interior_candidates.size()) {
+        throw BasisConstructionException(
+            "SerendipityBasis: insufficient quadrilateral interior nodes for requested serendipity order",
+            __FILE__, __LINE__, __func__);
+    }
+
+    nodes.insert(nodes.end(),
+                 interior_candidates.begin(),
+                 interior_candidates.begin() + static_cast<std::ptrdiff_t>(interior_count));
+    return nodes;
+}
+
+std::vector<Real> invert_dense_matrix(std::vector<Real> matrix, int n, const char* label) {
+    return math::invert_dense_matrix(
+        std::move(matrix),
+        static_cast<std::size_t>(n),
+        std::string("SerendipityBasis interpolation matrix for ") + label);
+}
+
+std::vector<Real> quad_serendipity_inverse_vandermonde(
+    std::span<const Vec3> nodes,
+    std::span<const std::array<int, 2>> exponents,
+    int order) {
+    const int n = static_cast<int>(nodes.size());
+    if (n == 0 || exponents.size() != nodes.size()) {
+        throw BasisConstructionException(
+            "SerendipityBasis: invalid quadrilateral serendipity interpolation setup",
+            __FILE__, __LINE__, __func__);
+    }
+
+    std::vector<Real> vandermonde(static_cast<std::size_t>(n * n), Real(0));
+    auto idx = [n](int row, int col) -> std::size_t {
+        return static_cast<std::size_t>(row * n + col);
+    };
+
+    for (int row = 0; row < n; ++row) {
+        const Real x = nodes[static_cast<std::size_t>(row)][0];
+        const Real y = nodes[static_cast<std::size_t>(row)][1];
+        for (int col = 0; col < n; ++col) {
+            const auto [ax, ay] = exponents[static_cast<std::size_t>(col)];
+            vandermonde[idx(row, col)] = pow_int(x, ax) * pow_int(y, ay);
+        }
+    }
+
+    const std::string label = "Quad order " + std::to_string(order);
+    return invert_dense_matrix(std::move(vandermonde), n, label.c_str());
+}
+constexpr std::array<Real, 13> kPyramid13CenterRedistribution = {
+    Real(-0.25), Real(-0.25), Real(-0.25), Real(-0.25),
+    Real(0),
+    Real(0.5), Real(0.5), Real(0.5), Real(0.5),
+    Real(0), Real(0), Real(0), Real(0)
+};
+
+constexpr std::array<std::array<int, 3>, 15> kWedge15MonomialExponents = {{
+    {{0, 0, 0}},
+    {{0, 0, 1}},
+    {{0, 0, 2}},
+    {{0, 1, 0}},
+    {{0, 1, 1}},
+    {{0, 1, 2}},
+    {{0, 2, 0}},
+    {{0, 2, 1}},
+    {{1, 0, 0}},
+    {{1, 0, 1}},
+    {{1, 0, 2}},
+    {{1, 1, 0}},
+    {{1, 1, 1}},
+    {{2, 0, 0}},
+    {{2, 0, 1}}
+}};
+
+constexpr std::array<std::array<Real, 15>, 15> kWedge15Coefficients = {{
+    {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0}},
+    {{-0.5, 0, 0, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}},
+    {{0.5, -0, -0, 0.5, -0, -0, -0, -0, -0, -0, -0, -0, -1, -0, -0}},
+    {{-1, 0, -1, -1, 0, -1, 0, 0, 2, 0, 0, 2, -1, 0, 1}},
+    {{1.5, 0, 0.5, -1.5, 0, -0.5, 0, 0, -2, 0, 0, 2, 0, 0, 0}},
+    {{-0.5, -0, 0.5, -0.5, -0, 0.5, -0, -0, -0, -0, -0, -0, 1, -0, -1}},
+    {{1, 0, 1, 1, 0, 1, 0, 0, -2, 0, 0, -2, 0, 0, 0}},
+    {{-1, 0, -1, 1, 0, 1, 0, 0, 2, 0, 0, -2, 0, 0, 0}},
+    {{-1, -1, 0, -1, -1, 0, 2, 0, 0, 2, 0, 0, -1, 1, 0}},
+    {{1.5, 0.5, 0, -1.5, -0.5, 0, -2, 0, 0, 2, 0, 0, 0, 0, 0}},
+    {{-0.5, 0.5, -0, -0.5, 0.5, -0, -0, -0, -0, -0, -0, -0, 1, -1, -0}},
+    {{2, 0, -0, 2, 0, -0, -2, 2, -2, -2, 2, -2, -0, -0, -0}},
+    {{-2, 0, 0, 2, 0, 0, 2, -2, 2, -2, 2, -2, 0, 0, 0}},
+    {{1, 1, -0, 1, 1, -0, -2, -0, -0, -2, -0, -0, -0, -0, -0}},
+    {{-1, -1, -0, 1, 1, -0, 2, -0, -0, -2, -0, -0, -0, -0, -0}}
+}};
+
+static const int hex20_monomial_exponents[20][3] = {
+    {0, 0, 0}, {0, 0, 1}, {0, 0, 2}, {0, 1, 0}, {0, 1, 1},
+    {0, 1, 2}, {0, 2, 0}, {0, 2, 1}, {1, 0, 0}, {1, 0, 1},
+    {1, 0, 2}, {1, 1, 0}, {1, 1, 1}, {1, 1, 2}, {1, 2, 0},
+    {1, 2, 1}, {2, 0, 0}, {2, 0, 1}, {2, 1, 0}, {2, 1, 1}
+};
+
+static const Real hex20_coeffs[20][20] = {
+    {-0.25, -0.25, -0.25, -0.25, -0.25, -0.25, -0.25, -0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25},
+    {0.125, 0.125, 0.125, 0.125, -0.125, -0.125, -0.125, -0.125, -0.25, 0.25, -0.25, 0.25, -0.25, -0.25, 0.25, 0.25, 0, 0, 0, 0},
+    {0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0, 0, 0, 0, 0, 0, 0, 0, -0.25, -0.25, -0.25, -0.25},
+    {0.125, 0.125, -0.125, -0.125, 0.125, 0.125, -0.125, -0.125, -0.25, -0.25, 0.25, 0.25, 0, 0, 0, 0, -0.25, -0.25, 0.25, 0.25},
+    {0, 0, 0, 0, 0, 0, 0, 0, 0.25, -0.25, -0.25, 0.25, 0, 0, 0, 0, 0, 0, 0, 0},
+    {-0.125, -0.125, 0.125, 0.125, -0.125, -0.125, 0.125, 0.125, 0, 0, 0, 0, 0, 0, 0, 0, 0.25, 0.25, -0.25, -0.25},
+    {0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0, 0, 0, 0, -0.25, -0.25, -0.25, -0.25, 0, 0, 0, 0},
+    {-0.125, -0.125, -0.125, -0.125, 0.125, 0.125, 0.125, 0.125, 0, 0, 0, 0, 0.25, 0.25, -0.25, -0.25, 0, 0, 0, 0},
+    {0.125, -0.125, -0.125, 0.125, 0.125, -0.125, -0.125, 0.125, 0, 0, 0, 0, -0.25, 0.25, -0.25, 0.25, -0.25, 0.25, -0.25, 0.25},
+    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.25, -0.25, -0.25, 0.25, 0, 0, 0, 0},
+    {-0.125, 0.125, 0.125, -0.125, -0.125, 0.125, 0.125, -0.125, 0, 0, 0, 0, 0, 0, 0, 0, 0.25, -0.25, 0.25, -0.25},
+    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.25, -0.25, -0.25, 0.25},
+    {-0.125, 0.125, -0.125, 0.125, 0.125, -0.125, 0.125, -0.125, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    {0.125, -0.125, 0.125, -0.125, 0.125, -0.125, 0.125, -0.125, 0, 0, 0, 0, 0, 0, 0, 0, -0.25, 0.25, 0.25, -0.25},
+    {-0.125, 0.125, 0.125, -0.125, -0.125, 0.125, 0.125, -0.125, 0, 0, 0, 0, 0.25, -0.25, 0.25, -0.25, 0, 0, 0, 0},
+    {0.125, -0.125, -0.125, 0.125, -0.125, 0.125, 0.125, -0.125, 0, 0, 0, 0, -0.25, 0.25, 0.25, -0.25, 0, 0, 0, 0},
+    {0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, -0.25, -0.25, -0.25, -0.25, 0, 0, 0, 0, 0, 0, 0, 0},
+    {-0.125, -0.125, -0.125, -0.125, 0.125, 0.125, 0.125, 0.125, 0.25, -0.25, 0.25, -0.25, 0, 0, 0, 0, 0, 0, 0, 0},
+    {-0.125, -0.125, 0.125, 0.125, -0.125, -0.125, 0.125, 0.125, 0.25, 0.25, -0.25, -0.25, 0, 0, 0, 0, 0, 0, 0, 0},
+    {0.125, 0.125, -0.125, -0.125, -0.125, -0.125, 0.125, 0.125, -0.25, 0.25, 0.25, -0.25, 0, 0, 0, 0, 0, 0, 0, 0}
+};
+
+inline std::array<Real, 3> quadratic_powers(Real x) {
+    return {Real(1), x, x * x};
+}
+
+void eval_hex20_internal(Real r, Real s, Real t, Real* internal_vals) {
+    const auto rp = quadratic_powers(r);
+    const auto sp = quadratic_powers(s);
+    const auto tp = quadratic_powers(t);
+    Real phi[20];
+    for (int j = 0; j < 20; ++j) {
+        const int a = hex20_monomial_exponents[j][0];
+        const int b = hex20_monomial_exponents[j][1];
+        const int c = hex20_monomial_exponents[j][2];
+        phi[j] = rp[static_cast<std::size_t>(a)] *
+                 sp[static_cast<std::size_t>(b)] *
+                 tp[static_cast<std::size_t>(c)];
+    }
+    for (int i = 0; i < 20; ++i) {
+        Real v = Real(0);
+        for (int j = 0; j < 20; ++j) {
+            v += hex20_coeffs[j][i] * phi[j];
+        }
+        internal_vals[i] = v;
+    }
+}
+
+void eval_hex20_grad_internal(Real r, Real s, Real t, Gradient* internal_grads) {
+    const auto rp = quadratic_powers(r);
+    const auto sp = quadratic_powers(s);
+    const auto tp = quadratic_powers(t);
+    Real dphi_dr[20], dphi_ds[20], dphi_dt[20];
+    for (int j = 0; j < 20; ++j) {
+        const int a = hex20_monomial_exponents[j][0];
+        const int b = hex20_monomial_exponents[j][1];
+        const int c = hex20_monomial_exponents[j][2];
+
+        dphi_dr[j] = (a > 0) ? Real(a) * rp[static_cast<std::size_t>(a - 1)] *
+                                    sp[static_cast<std::size_t>(b)] *
+                                    tp[static_cast<std::size_t>(c)]
+                              : Real(0);
+        dphi_ds[j] = (b > 0) ? rp[static_cast<std::size_t>(a)] *
+                                    Real(b) * sp[static_cast<std::size_t>(b - 1)] *
+                                    tp[static_cast<std::size_t>(c)]
+                              : Real(0);
+        dphi_dt[j] = (c > 0) ? rp[static_cast<std::size_t>(a)] *
+                                    sp[static_cast<std::size_t>(b)] *
+                                    Real(c) * tp[static_cast<std::size_t>(c - 1)]
+                              : Real(0);
+    }
+
+    for (int i = 0; i < 20; ++i) {
+        Real gr = Real(0), gs = Real(0), gt = Real(0);
+        for (int j = 0; j < 20; ++j) {
+            gr += hex20_coeffs[j][i] * dphi_dr[j];
+            gs += hex20_coeffs[j][i] * dphi_ds[j];
+            gt += hex20_coeffs[j][i] * dphi_dt[j];
+        }
+        internal_grads[i][0] = gr;
+        internal_grads[i][1] = gs;
+        internal_grads[i][2] = gt;
+    }
+}
+
+void eval_hex20_hess_internal(Real r, Real s, Real t, Hessian* internal_hessians) {
+    const auto rp = quadratic_powers(r);
+    const auto sp = quadratic_powers(s);
+    const auto tp = quadratic_powers(t);
+    Real d2phi_drr[20], d2phi_dss[20], d2phi_dtt[20];
+    Real d2phi_drs[20], d2phi_drt[20], d2phi_dst[20];
+    for (int j = 0; j < 20; ++j) {
+        const int a = hex20_monomial_exponents[j][0];
+        const int b = hex20_monomial_exponents[j][1];
+        const int c = hex20_monomial_exponents[j][2];
+
+        d2phi_drr[j] = (a > 1) ? Real(a * (a - 1)) *
+                                      rp[static_cast<std::size_t>(a - 2)] *
+                                      sp[static_cast<std::size_t>(b)] *
+                                      tp[static_cast<std::size_t>(c)]
+                                : Real(0);
+        d2phi_dss[j] = (b > 1) ? rp[static_cast<std::size_t>(a)] *
+                                      Real(b * (b - 1)) *
+                                      sp[static_cast<std::size_t>(b - 2)] *
+                                      tp[static_cast<std::size_t>(c)]
+                                : Real(0);
+        d2phi_dtt[j] = (c > 1) ? rp[static_cast<std::size_t>(a)] *
+                                      sp[static_cast<std::size_t>(b)] *
+                                      Real(c * (c - 1)) *
+                                      tp[static_cast<std::size_t>(c - 2)]
+                                : Real(0);
+        d2phi_drs[j] = (a > 0 && b > 0) ? Real(a * b) *
+                                              rp[static_cast<std::size_t>(a - 1)] *
+                                              sp[static_cast<std::size_t>(b - 1)] *
+                                              tp[static_cast<std::size_t>(c)]
+                                        : Real(0);
+        d2phi_drt[j] = (a > 0 && c > 0) ? Real(a * c) *
+                                              rp[static_cast<std::size_t>(a - 1)] *
+                                              sp[static_cast<std::size_t>(b)] *
+                                              tp[static_cast<std::size_t>(c - 1)]
+                                        : Real(0);
+        d2phi_dst[j] = (b > 0 && c > 0) ? rp[static_cast<std::size_t>(a)] *
+                                              Real(b * c) *
+                                              sp[static_cast<std::size_t>(b - 1)] *
+                                              tp[static_cast<std::size_t>(c - 1)]
+                                        : Real(0);
+    }
+
+    for (int i = 0; i < 20; ++i) {
+        Hessian H{};
+        for (int j = 0; j < 20; ++j) {
+            H(0, 0) += hex20_coeffs[j][i] * d2phi_drr[j];
+            H(1, 1) += hex20_coeffs[j][i] * d2phi_dss[j];
+            H(2, 2) += hex20_coeffs[j][i] * d2phi_dtt[j];
+            H(0, 1) += hex20_coeffs[j][i] * d2phi_drs[j];
+            H(0, 2) += hex20_coeffs[j][i] * d2phi_drt[j];
+            H(1, 2) += hex20_coeffs[j][i] * d2phi_dst[j];
+        }
+        H(1, 0) = H(0, 1);
+        H(2, 0) = H(0, 2);
+        H(2, 1) = H(1, 2);
+        internal_hessians[i] = H;
+    }
+}
+
+void eval_wedge15_polynomial(Real r,
+                             Real s,
+                             Real t,
+                             Real* values,
+                             Gradient* gradients,
+                             Hessian* hessians) {
+    Real phi[15]{};
+    Real dr[15]{};
+    Real ds[15]{};
+    Real dt[15]{};
+    Real drr[15]{};
+    Real dss[15]{};
+    Real dtt[15]{};
+    Real drs[15]{};
+    Real drt[15]{};
+    Real dst[15]{};
+
+    const auto rp = quadratic_powers(r);
+    const auto sp = quadratic_powers(s);
+    const auto tp = quadratic_powers(t);
+
+    for (int j = 0; j < 15; ++j) {
+        const auto& exponent = kWedge15MonomialExponents[static_cast<std::size_t>(j)];
+        const int a = exponent[0];
+        const int b = exponent[1];
+        const int c = exponent[2];
+        const auto ar = static_cast<std::size_t>(a);
+        const auto bs = static_cast<std::size_t>(b);
+        const auto ct = static_cast<std::size_t>(c);
+
+        const Real ra = rp[ar];
+        const Real sb = sp[bs];
+        const Real tc = tp[ct];
+
+        if (values) {
+            phi[j] = ra * sb * tc;
+        }
+        if (gradients) {
+            dr[j] = (a > 0) ? Real(a) * rp[ar - 1u] * sb * tc : Real(0);
+            ds[j] = (b > 0) ? ra * Real(b) * sp[bs - 1u] * tc : Real(0);
+            dt[j] = (c > 0) ? ra * sb * Real(c) * tp[ct - 1u] : Real(0);
+        }
+        if (hessians) {
+            drr[j] = (a > 1) ? Real(a * (a - 1)) * rp[ar - 2u] * sb * tc : Real(0);
+            dss[j] = (b > 1) ? ra * Real(b * (b - 1)) * sp[bs - 2u] * tc : Real(0);
+            dtt[j] = (c > 1) ? ra * sb * Real(c * (c - 1)) * tp[ct - 2u] : Real(0);
+            drs[j] = (a > 0 && b > 0) ? Real(a * b) * rp[ar - 1u] * sp[bs - 1u] * tc : Real(0);
+            drt[j] = (a > 0 && c > 0) ? Real(a * c) * rp[ar - 1u] * sb * tp[ct - 1u] : Real(0);
+            dst[j] = (b > 0 && c > 0) ? ra * Real(b * c) * sp[bs - 1u] * tp[ct - 1u] : Real(0);
+        }
+    }
+
+    for (int i = 0; i < 15; ++i) {
+        Real value = Real(0);
+        Real gr = Real(0);
+        Real gs = Real(0);
+        Real gt = Real(0);
+        Hessian H{};
+        for (int j = 0; j < 15; ++j) {
+            const Real coefficient =
+                kWedge15Coefficients[static_cast<std::size_t>(j)][static_cast<std::size_t>(i)];
+            if (values) {
+                value += coefficient * phi[j];
+            }
+            if (gradients) {
+                gr += coefficient * dr[j];
+                gs += coefficient * ds[j];
+                gt += coefficient * dt[j];
+            }
+            if (hessians) {
+                H(0, 0) += coefficient * drr[j];
+                H(1, 1) += coefficient * dss[j];
+                H(2, 2) += coefficient * dtt[j];
+                H(0, 1) += coefficient * drs[j];
+                H(0, 2) += coefficient * drt[j];
+                H(1, 2) += coefficient * dst[j];
+            }
+        }
+
+        const std::size_t index = static_cast<std::size_t>(i);
+        if (values) {
+            values[index] = value;
+        }
+        if (gradients) {
+            gradients[index][0] = gr;
+            gradients[index][1] = gs;
+            gradients[index][2] = gt;
+        }
+        if (hessians) {
+            H(1, 0) = H(0, 1);
+            H(2, 0) = H(0, 2);
+            H(2, 1) = H(1, 2);
+            hessians[index] = H;
+        }
+    }
+}
+
+} // namespace
+
+SerendipityBasis::SerendipityBasis(ElementType type, int order, bool geometry_mode)
+    : element_type_(type), dimension_(0), order_(order), size_(0), geometry_mode_(geometry_mode) {
+    if (type == ElementType::Quad4 || type == ElementType::Quad8) {
+        dimension_ = 2;
+        if (order_ < 1) {
+            order_ = 1;
+        }
+        if (type == ElementType::Quad8 && order_ != 2) {
+            throw BasisConfigurationException(
+                "SerendipityBasis: Quad8 is only valid for quadratic order 2; use Quad4 for higher-order quadrilateral serendipity",
+                __FILE__, __LINE__, __func__);
+        }
+        quad_monomial_exponents_ = quad_serendipity_exponents(order_);
+        size_ = quad_monomial_exponents_.size();
+        nodes_ = quad_serendipity_nodes(order_, size_);
+        if (nodes_.size() != size_) {
+            throw BasisConstructionException(
+                "SerendipityBasis: quadrilateral serendipity setup produced inconsistent sizes",
+                __FILE__, __LINE__, __func__);
+        }
+        quad_inv_vandermonde_ = quad_serendipity_inverse_vandermonde(nodes_, quad_monomial_exponents_, order_);
+    } else if (type == ElementType::Hex8 || type == ElementType::Hex20) {
+        dimension_ = 3;
+        if (order_ < 1) order_ = 1;
+        if (order_ == 1) {
+            size_ = 8;
+        } else if (order_ == 2) {
+            size_ = 20;
+        } else {
+            throw BasisConfigurationException(
+                "SerendipityBasis supports up to quadratic on hexahedra",
+                __FILE__, __LINE__, __func__);
+        }
+    } else if (type == ElementType::Wedge15) {
+        dimension_ = 3;
+        if (order_ < 2) {
+            order_ = 2;
+        }
+        if (order_ == 2) {
+            size_ = 15;
+        } else {
+            throw BasisConfigurationException(
+                "SerendipityBasis supports up to quadratic on wedge15",
+                __FILE__, __LINE__, __func__);
+        }
+    } else if (type == ElementType::Pyramid13) {
+        dimension_ = 3;
+        if (order_ < 2) {
+            order_ = 2;
+        }
+        if (order_ == 2) {
+            size_ = 13;
+        } else {
+            throw BasisConfigurationException(
+                "SerendipityBasis supports up to quadratic on pyramid13",
+                __FILE__, __LINE__, __func__);
+        }
+    } else {
+        throw BasisElementCompatibilityException("SerendipityBasis supports Quad4/Quad8, Hex8/Hex20, Wedge15, and Pyramid13 elements",
+                                                 __FILE__, __LINE__, __func__);
+    }
+
+    if (nodes_.empty()) {
+        nodes_.reserve(size_);
+        for (std::size_t i = 0; i < size_; ++i) {
+            nodes_.push_back(ReferenceNodeLayout::get_node_coords(element_type_, i));
+        }
+    }
+}
+
+bool SerendipityBasis::cache_identity_words(std::vector<std::uint64_t>& words) const {
+    words.push_back(0x736572656e646970ULL);
+    words.push_back(static_cast<std::uint64_t>(basis_type()));
+    words.push_back(static_cast<std::uint64_t>(element_type_));
+    words.push_back(static_cast<std::uint64_t>(dimension_));
+    words.push_back(static_cast<std::uint64_t>(order_));
+    words.push_back(static_cast<std::uint64_t>(size_));
+    words.push_back(geometry_mode_ ? 1u : 0u);
+    return true;
+}
+
+void SerendipityBasis::evaluate_values(const math::Vector<Real, 3>& xi,
+                                       std::vector<Real>& values) const {
+    values.assign(size_, Real(0));
+    const Real x = xi[0];
+    const Real y = xi[1];
+    const Real z = xi[2];
+
+    if (dimension_ == 2) {
+        if (quad_monomial_exponents_.size() != size_ ||
+            quad_inv_vandermonde_.size() != size_ * size_) {
+            throw BasisEvaluationException(
+                "SerendipityBasis: quadrilateral interpolation tables are not initialized for value evaluation",
+                __FILE__, __LINE__, __func__);
+        }
+
+        std::vector<Real> monomials(size_, Real(0));
+        for (std::size_t j = 0; j < size_; ++j) {
+            const auto [ax, ay] = quad_monomial_exponents_[j];
+            monomials[j] = pow_int(x, ax) * pow_int(y, ay);
+        }
+
+        for (std::size_t i = 0; i < size_; ++i) {
+            Real value = Real(0);
+            for (std::size_t j = 0; j < size_; ++j) {
+                value += monomials[j] * quad_inv_vandermonde_[j * size_ + i];
+            }
+            values[i] = value;
+        }
+        return;
+    }
+
+    if (dimension_ == 3 && order_ == 1) {
+        // Hex8 trilinear shape functions
+        const Real r = x;
+        const Real s = y;
+        const Real t = z;
+        values[0] = Real(0.125) * (Real(1) - r) * (Real(1) - s) * (Real(1) - t);
+        values[1] = Real(0.125) * (Real(1) + r) * (Real(1) - s) * (Real(1) - t);
+        values[2] = Real(0.125) * (Real(1) + r) * (Real(1) + s) * (Real(1) - t);
+        values[3] = Real(0.125) * (Real(1) - r) * (Real(1) + s) * (Real(1) - t);
+        values[4] = Real(0.125) * (Real(1) - r) * (Real(1) - s) * (Real(1) + t);
+        values[5] = Real(0.125) * (Real(1) + r) * (Real(1) - s) * (Real(1) + t);
+        values[6] = Real(0.125) * (Real(1) + r) * (Real(1) + s) * (Real(1) + t);
+        values[7] = Real(0.125) * (Real(1) - r) * (Real(1) + s) * (Real(1) + t);
+        return;
+    }
+
+    const Real r = x;
+    const Real s = y;
+    const Real t = z;
+
+    if (geometry_mode_ && element_type_ == ElementType::Hex20) {
+        // Hex20 geometry mode: use trilinear Hex8 shape functions on corners, edges zero.
+        values[0] = Real(0.125) * (Real(1) - r) * (Real(1) - s) * (Real(1) - t);
+        values[1] = Real(0.125) * (Real(1) + r) * (Real(1) - s) * (Real(1) - t);
+        values[2] = Real(0.125) * (Real(1) + r) * (Real(1) + s) * (Real(1) - t);
+        values[3] = Real(0.125) * (Real(1) - r) * (Real(1) + s) * (Real(1) - t);
+        values[4] = Real(0.125) * (Real(1) - r) * (Real(1) - s) * (Real(1) + t);
+        values[5] = Real(0.125) * (Real(1) + r) * (Real(1) - s) * (Real(1) + t);
+        values[6] = Real(0.125) * (Real(1) + r) * (Real(1) + s) * (Real(1) + t);
+        values[7] = Real(0.125) * (Real(1) - r) * (Real(1) + s) * (Real(1) + t);
+        for (std::size_t i = 8; i < 20; ++i) {
+            values[i] = Real(0);
+        }
+        return;
+    }
+
+    if (element_type_ == ElementType::Hex20) {
+        Real internal_vals[20];
+        eval_hex20_internal(r, s, t, internal_vals);
+        const auto mesh_to_basis = ReferenceNodeLayout::mesh_to_basis_ordering(element_type_);
+        BASIS_CHECK_EVAL(mesh_to_basis.size() == size_,
+                         "Hex20 mesh-to-basis ordering is not registered");
+        for (std::size_t i = 0; i < 20; ++i) {
+            values[i] = internal_vals[mesh_to_basis[i]];
+        }
+        return;
+    }
+
+    if (element_type_ == ElementType::Wedge15) {
+        eval_wedge15_polynomial(r, s, t, values.data(), nullptr, nullptr);
+        return;
+    }
+
+    if (element_type_ == ElementType::Pyramid13) {
+        static const LagrangeBasis parent(ElementType::Pyramid14, 2);
+        std::array<Real, 14> parent_values{};
+        parent.evaluate_values_to(xi, parent_values.data());
+        for (std::size_t i = 0; i < 13; ++i) {
+            values[i] = parent_values[i] + kPyramid13CenterRedistribution[i] * parent_values[13];
+        }
+        return;
+    }
+}
+
+void SerendipityBasis::evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                          std::vector<Gradient>& gradients) const {
+    gradients.assign(size_, Gradient{});
+
+    const Real x = xi[0];
+    const Real y = xi[1];
+    const Real z = xi[2];
+
+    if (dimension_ == 2) {
+        if (quad_monomial_exponents_.size() != size_ ||
+            quad_inv_vandermonde_.size() != size_ * size_) {
+            throw BasisEvaluationException(
+                "SerendipityBasis: quadrilateral interpolation tables are not initialized for gradient evaluation",
+                __FILE__, __LINE__, __func__);
+        }
+
+        std::vector<Real> dmon_dx(size_, Real(0));
+        std::vector<Real> dmon_dy(size_, Real(0));
+        for (std::size_t j = 0; j < size_; ++j) {
+            const auto [ax, ay] = quad_monomial_exponents_[j];
+            dmon_dx[j] = (ax > 0) ? Real(ax) * pow_int(x, ax - 1) * pow_int(y, ay) : Real(0);
+            dmon_dy[j] = (ay > 0) ? pow_int(x, ax) * Real(ay) * pow_int(y, ay - 1) : Real(0);
+        }
+
+        for (std::size_t i = 0; i < size_; ++i) {
+            Real gx = Real(0);
+            Real gy = Real(0);
+            for (std::size_t j = 0; j < size_; ++j) {
+                const Real coeff = quad_inv_vandermonde_[j * size_ + i];
+                gx += dmon_dx[j] * coeff;
+                gy += dmon_dy[j] * coeff;
+            }
+            gradients[i][0] = gx;
+            gradients[i][1] = gy;
+        }
+        return;
+    }
+
+    // 3D linear hex (Hex8)
+    if (dimension_ == 3 && order_ == 1) {
+        const Real r = x, s = y, t = z;
+        gradients[0][0] = -Real(0.125) * (Real(1) - s) * (Real(1) - t);
+        gradients[0][1] = -Real(0.125) * (Real(1) - r) * (Real(1) - t);
+        gradients[0][2] = -Real(0.125) * (Real(1) - r) * (Real(1) - s);
+
+        gradients[1][0] =  Real(0.125) * (Real(1) - s) * (Real(1) - t);
+        gradients[1][1] = -Real(0.125) * (Real(1) + r) * (Real(1) - t);
+        gradients[1][2] = -Real(0.125) * (Real(1) + r) * (Real(1) - s);
+
+        gradients[2][0] =  Real(0.125) * (Real(1) + s) * (Real(1) - t);
+        gradients[2][1] =  Real(0.125) * (Real(1) + r) * (Real(1) - t);
+        gradients[2][2] = -Real(0.125) * (Real(1) + r) * (Real(1) + s);
+
+        gradients[3][0] = -Real(0.125) * (Real(1) + s) * (Real(1) - t);
+        gradients[3][1] =  Real(0.125) * (Real(1) - r) * (Real(1) - t);
+        gradients[3][2] = -Real(0.125) * (Real(1) - r) * (Real(1) + s);
+
+        gradients[4][0] = -Real(0.125) * (Real(1) - s) * (Real(1) + t);
+        gradients[4][1] = -Real(0.125) * (Real(1) - r) * (Real(1) + t);
+        gradients[4][2] =  Real(0.125) * (Real(1) - r) * (Real(1) - s);
+
+        gradients[5][0] =  Real(0.125) * (Real(1) - s) * (Real(1) + t);
+        gradients[5][1] = -Real(0.125) * (Real(1) + r) * (Real(1) + t);
+        gradients[5][2] =  Real(0.125) * (Real(1) + r) * (Real(1) - s);
+
+        gradients[6][0] =  Real(0.125) * (Real(1) + s) * (Real(1) + t);
+        gradients[6][1] =  Real(0.125) * (Real(1) + r) * (Real(1) + t);
+        gradients[6][2] =  Real(0.125) * (Real(1) + r) * (Real(1) + s);
+
+        gradients[7][0] = -Real(0.125) * (Real(1) + s) * (Real(1) + t);
+        gradients[7][1] =  Real(0.125) * (Real(1) - r) * (Real(1) + t);
+        gradients[7][2] =  Real(0.125) * (Real(1) - r) * (Real(1) + s);
+        return;
+    }
+
+    // Hex20 geometry mode: use Hex8 gradients
+    if (dimension_ == 3 && order_ == 2 && geometry_mode_ &&
+        (element_type_ == ElementType::Hex20 || element_type_ == ElementType::Quad8)) {
+        const Real r = x, s = y, t = z;
+        gradients[0][0] = -Real(0.125) * (Real(1) - s) * (Real(1) - t);
+        gradients[0][1] = -Real(0.125) * (Real(1) - r) * (Real(1) - t);
+        gradients[0][2] = -Real(0.125) * (Real(1) - r) * (Real(1) - s);
+
+        gradients[1][0] =  Real(0.125) * (Real(1) - s) * (Real(1) - t);
+        gradients[1][1] = -Real(0.125) * (Real(1) + r) * (Real(1) - t);
+        gradients[1][2] = -Real(0.125) * (Real(1) + r) * (Real(1) - s);
+
+        gradients[2][0] =  Real(0.125) * (Real(1) + s) * (Real(1) - t);
+        gradients[2][1] =  Real(0.125) * (Real(1) + r) * (Real(1) - t);
+        gradients[2][2] = -Real(0.125) * (Real(1) + r) * (Real(1) + s);
+
+        gradients[3][0] = -Real(0.125) * (Real(1) + s) * (Real(1) - t);
+        gradients[3][1] =  Real(0.125) * (Real(1) - r) * (Real(1) - t);
+        gradients[3][2] = -Real(0.125) * (Real(1) - r) * (Real(1) + s);
+
+        gradients[4][0] = -Real(0.125) * (Real(1) - s) * (Real(1) + t);
+        gradients[4][1] = -Real(0.125) * (Real(1) - r) * (Real(1) + t);
+        gradients[4][2] =  Real(0.125) * (Real(1) - r) * (Real(1) - s);
+
+        gradients[5][0] =  Real(0.125) * (Real(1) - s) * (Real(1) + t);
+        gradients[5][1] = -Real(0.125) * (Real(1) + r) * (Real(1) + t);
+        gradients[5][2] =  Real(0.125) * (Real(1) + r) * (Real(1) - s);
+
+        gradients[6][0] =  Real(0.125) * (Real(1) + s) * (Real(1) + t);
+        gradients[6][1] =  Real(0.125) * (Real(1) + r) * (Real(1) + t);
+        gradients[6][2] =  Real(0.125) * (Real(1) + r) * (Real(1) + s);
+
+        gradients[7][0] = -Real(0.125) * (Real(1) + s) * (Real(1) + t);
+        gradients[7][1] =  Real(0.125) * (Real(1) - r) * (Real(1) + t);
+        gradients[7][2] =  Real(0.125) * (Real(1) - r) * (Real(1) + s);
+        // Edge-node gradients remain zero
+        return;
+    }
+
+    // Hex20 analytical gradients using monomial differentiation
+    if (element_type_ == ElementType::Hex20 && order_ == 2) {
+        const Real r = x, s = y, t = z;
+        Gradient internal_grads[20];
+        eval_hex20_grad_internal(r, s, t, internal_grads);
+        const auto mesh_to_basis = ReferenceNodeLayout::mesh_to_basis_ordering(element_type_);
+        BASIS_CHECK_EVAL(mesh_to_basis.size() == size_,
+                         "Hex20 mesh-to-basis ordering is not registered");
+        for (std::size_t i = 0; i < 20; ++i) {
+            gradients[i] = internal_grads[mesh_to_basis[i]];
+        }
+        return;
+    }
+
+    // Wedge15 analytical gradients using monomial differentiation
+    if (element_type_ == ElementType::Wedge15 && order_ == 2) {
+        eval_wedge15_polynomial(x, y, z, nullptr, gradients.data(), nullptr);
+        return;
+    }
+
+    if (element_type_ == ElementType::Pyramid13) {
+        static const LagrangeBasis parent(ElementType::Pyramid14, 2);
+        std::array<Real, 14u * 3u> parent_gradients{};
+        // Pyramid13 inherits the complete-family pyramid apex contract from the
+        // parent basis rather than introducing a separate regularized path.
+        parent.evaluate_gradients_to(xi, parent_gradients.data());
+        const auto parent_gradient = [&](std::size_t node, std::size_t component) {
+            return parent_gradients[node * 3u + component];
+        };
+        for (std::size_t i = 0; i < 13; ++i) {
+            for (std::size_t c = 0; c < 3u; ++c) {
+                gradients[i][c] =
+                    parent_gradient(i, c) +
+                    kPyramid13CenterRedistribution[i] * parent_gradient(13u, c);
+            }
+        }
+        return;
+    }
+
+    throw BasisEvaluationException("SerendipityBasis::evaluate_gradients: unsupported serendipity configuration",
+                                   __FILE__, __LINE__, __func__);
+}
+
+void SerendipityBasis::evaluate_hessians(const math::Vector<Real, 3>& xi,
+                                         std::vector<Hessian>& hessians) const {
+    hessians.assign(size_, Hessian{});
+    const Real x = xi[0];
+    const Real y = xi[1];
+    const Real z = xi[2];
+
+    if (dimension_ == 2) {
+        if (quad_monomial_exponents_.size() != size_ ||
+            quad_inv_vandermonde_.size() != size_ * size_) {
+            throw BasisEvaluationException(
+                "SerendipityBasis: quadrilateral interpolation tables are not initialized for Hessian evaluation",
+                __FILE__, __LINE__, __func__);
+        }
+
+        std::vector<Real> dxx(size_, Real(0));
+        std::vector<Real> dxy(size_, Real(0));
+        std::vector<Real> dyy(size_, Real(0));
+        for (std::size_t j = 0; j < size_; ++j) {
+            const auto [ax, ay] = quad_monomial_exponents_[j];
+            dxx[j] = (ax > 1) ? Real(ax * (ax - 1)) * pow_int(x, ax - 2) * pow_int(y, ay) : Real(0);
+            dxy[j] = (ax > 0 && ay > 0) ? Real(ax * ay) * pow_int(x, ax - 1) * pow_int(y, ay - 1) : Real(0);
+            dyy[j] = (ay > 1) ? Real(ay * (ay - 1)) * pow_int(x, ax) * pow_int(y, ay - 2) : Real(0);
+        }
+
+        for (std::size_t i = 0; i < size_; ++i) {
+            for (std::size_t j = 0; j < size_; ++j) {
+                const Real coeff = quad_inv_vandermonde_[j * size_ + i];
+                hessians[i](0, 0) += dxx[j] * coeff;
+                hessians[i](0, 1) += dxy[j] * coeff;
+                hessians[i](1, 1) += dyy[j] * coeff;
+            }
+            hessians[i](1, 0) = hessians[i](0, 1);
+        }
+        return;
+    }
+
+    if (element_type_ == ElementType::Hex8 && order_ == 1) {
+        static const LagrangeBasis parent(ElementType::Hex8, 1);
+        parent.evaluate_hessians(xi, hessians);
+        return;
+    }
+
+    if (geometry_mode_ && element_type_ == ElementType::Hex20) {
+        static const LagrangeBasis parent(ElementType::Hex8, 1);
+        std::array<Real, 8u * 9u> parent_hessians{};
+        parent.evaluate_hessians_to(xi, parent_hessians.data());
+        for (std::size_t i = 0; i < 8; ++i) {
+            for (std::size_t r = 0; r < 3; ++r) {
+                for (std::size_t c = 0; c < 3; ++c) {
+                    hessians[i](r, c) = parent_hessians[i * 9u + r * 3u + c];
+                }
+            }
+        }
+        return;
+    }
+
+    if (element_type_ == ElementType::Hex20 && order_ == 2) {
+        Hessian internal_hessians[20];
+        eval_hex20_hess_internal(x, y, z, internal_hessians);
+        const auto mesh_to_basis = ReferenceNodeLayout::mesh_to_basis_ordering(element_type_);
+        BASIS_CHECK_EVAL(mesh_to_basis.size() == size_,
+                         "Hex20 mesh-to-basis ordering is not registered");
+        for (std::size_t i = 0; i < 20; ++i) {
+            hessians[i] = internal_hessians[mesh_to_basis[i]];
+        }
+        return;
+    }
+
+    if (element_type_ == ElementType::Wedge15 && order_ == 2) {
+        eval_wedge15_polynomial(x, y, z, nullptr, nullptr, hessians.data());
+        return;
+    }
+
+    if (element_type_ == ElementType::Pyramid13) {
+        static const LagrangeBasis parent(ElementType::Pyramid14, 2);
+        std::array<Real, 14u * 9u> parent_hessians{};
+        // Pyramid13 inherits the complete-family pyramid apex contract from the
+        // parent basis rather than introducing a separate regularized path.
+        parent.evaluate_hessians_to(xi, parent_hessians.data());
+        const Hessian center_hessian = load_hessian(parent_hessians.data() + 13u * 9u);
+        for (std::size_t i = 0; i < 13; ++i) {
+            hessians[i] = load_hessian(parent_hessians.data() + i * 9u);
+            add_scaled_hessian(hessians[i], center_hessian, kPyramid13CenterRedistribution[i]);
+        }
+        return;
+    }
+
+    throw BasisEvaluationException("SerendipityBasis::evaluate_hessians: unsupported serendipity configuration",
+                                   __FILE__, __LINE__, __func__);
+}
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.h b/Code/Source/solver/FE/Basis/SerendipityBasis.h
new file mode 100644
index 000000000..98c01415a
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.h
@@ -0,0 +1,70 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_BASIS_SERENDIPITYBASIS_H
+#define SVMP_FE_BASIS_SERENDIPITYBASIS_H
+
+/**
+ * @file SerendipityBasis.h
+ * @brief Reduced-degree-of-freedom serendipity bases
+ *
+ * `Pyramid13` inherits its apex contract from the complete-family rational
+ * pyramid basis: values remain exact at the apex, while exact-apex gradient
+ * and Hessian queries throw because the inherited nodal derivative limit is
+ * not unique.
+ */
+
+#include "BasisFunction.h"
+
+#include <array>
+#include <cstdint>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+class SerendipityBasis : public BasisFunction {
+public:
+    SerendipityBasis(ElementType type, int order, bool geometry_mode = false);
+
+    BasisType basis_type() const noexcept override { return BasisType::Serendipity; }
+    ElementType element_type() const noexcept override { return element_type_; }
+    int dimension() const noexcept override { return dimension_; }
+    int order() const noexcept override { return order_; }
+    std::size_t size() const noexcept override { return size_; }
+    const std::vector<math::Vector<Real, 3>>& nodes() const noexcept { return nodes_; }
+    bool cache_identity_words(std::vector<std::uint64_t>& words) const override;
+
+    void evaluate_values(const math::Vector<Real, 3>& xi,
+                         std::vector<Real>& values) const override;
+
+    void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                            std::vector<Gradient>& gradients) const override;
+
+    void evaluate_hessians(const math::Vector<Real, 3>& xi,
+                           std::vector<Hessian>& hessians) const override;
+
+private:
+    ElementType element_type_;
+    int dimension_;
+    int order_;
+    std::size_t size_;
+    std::vector<math::Vector<Real, 3>> nodes_;
+    std::vector<std::array<int, 2>> quad_monomial_exponents_;
+    // Row-major inverse Vandermonde, indexed as [monomial, basis].
+    std::vector<Real> quad_inv_vandermonde_;
+
+    // When true, this basis is used purely for geometry mapping and may use
+    // reduced polynomial order (e.g., Hex20 geometry as Hex8).
+    bool geometry_mode_;
+};
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_SERENDIPITYBASIS_H
diff --git a/Code/Source/solver/FE/Basis/VectorBasis.h b/Code/Source/solver/FE/Basis/VectorBasis.h
new file mode 100644
index 000000000..d442c2160
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/VectorBasis.h
@@ -0,0 +1,255 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_BASIS_VECTORBASIS_H
+#define SVMP_FE_BASIS_VECTORBASIS_H
+
+/**
+ * @file VectorBasis.h
+ * @brief Vector-valued bases for H(div) and H(curl) conforming spaces
+ */
+
+#include "BasisFunction.h"
+#include "VectorBasisModalPolynomial.h"
+#include <array>
+#include <cstddef>
+#include <vector>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+/**
+ * @brief DOF entity type for vector-valued basis functions
+ */
+enum class DofEntity {
+    Vertex,   ///< DOF associated with a vertex
+    Edge,     ///< DOF associated with an edge (tangential moments for H(curl))
+    Face,     ///< DOF associated with a face (normal moments for H(div), tangential for H(curl))
+    Interior  ///< DOF associated with element interior
+};
+
+/**
+ * @brief DOF association metadata for a single DOF
+ */
+struct DofAssociation {
+    DofEntity entity_type{DofEntity::Interior};
+    int entity_id{-1};      ///< Local index of the entity (edge/face/vertex)
+    int moment_index{0};    ///< Index within the entity's moment space
+};
+
+struct SparseModalCoefficientMatrix {
+    std::size_t rows{0};
+    std::size_t cols{0};
+    std::vector<std::size_t> row_offsets;
+    std::vector<std::size_t> dofs;
+    std::vector<Real> coefficients;
+};
+
+class VectorBasisFunction : public BasisFunction {
+public:
+    bool is_vector_valued() const noexcept override { return true; }
+    bool supports_vector_jacobians() const noexcept override { return true; }
+    void evaluate_values(const math::Vector<Real, 3>&,
+                         std::vector<Real>&) const override {
+        throw BasisEvaluationException("Vector basis uses evaluate_vector_values",
+                                       __FILE__, __LINE__, __func__);
+    }
+
+    void evaluate_vector_at_quadrature_points_strided(
+        const std::vector<math::Vector<Real, 3>>& points,
+        std::size_t output_stride,
+        Real* SVMP_RESTRICT values_out,
+        Real* SVMP_RESTRICT jacobians_out,
+        Real* SVMP_RESTRICT curls_out,
+        Real* SVMP_RESTRICT divergence_out) const override;
+
+    /**
+     * @brief Get DOF association metadata for all basis functions
+     *
+     * Returns a vector of size(), where each entry describes which
+     * geometric entity (vertex/edge/face/interior) the corresponding
+     * DOF is associated with. This is essential for orientation-aware
+     * assembly of H(div) and H(curl) spaces.
+     */
+    virtual std::vector<DofAssociation> dof_associations() const {
+        // Default: all interior DOFs (subclasses should override)
+        std::vector<DofAssociation> result(size());
+        for (std::size_t i = 0; i < size(); ++i) {
+            result[i].entity_type = DofEntity::Interior;
+            result[i].entity_id = 0;
+            result[i].moment_index = static_cast<int>(i);
+        }
+        return result;
+    }
+};
+
+/**
+ * @brief Raviart-Thomas H(div) basis on supported element families
+ */
+class RaviartThomasBasis : public VectorBasisFunction {
+public:
+    RaviartThomasBasis(ElementType type, int order = 0);
+
+    BasisType basis_type() const noexcept override { return BasisType::RaviartThomas; }
+    ElementType element_type() const noexcept override { return element_type_; }
+    int dimension() const noexcept override { return dimension_; }
+    int order() const noexcept override { return order_; }
+    std::size_t size() const noexcept override { return size_; }
+    bool cache_identity_is_structural() const noexcept override { return true; }
+
+    void evaluate_vector_values(const math::Vector<Real, 3>& xi,
+                                std::vector<math::Vector<Real, 3>>& values) const override;
+    void evaluate_vector_jacobians(const math::Vector<Real, 3>& xi,
+                                   std::vector<VectorJacobian>& jacobians) const override;
+    void evaluate_divergence(const math::Vector<Real, 3>& xi,
+                             std::vector<Real>& divergence) const override;
+    bool supports_divergence() const noexcept override { return true; }
+    void evaluate_vector_at_quadrature_points_strided(
+        const std::vector<math::Vector<Real, 3>>& points,
+        std::size_t output_stride,
+        Real* SVMP_RESTRICT values_out,
+        Real* SVMP_RESTRICT jacobians_out,
+        Real* SVMP_RESTRICT curls_out,
+        Real* SVMP_RESTRICT divergence_out) const override;
+
+    /// Get DOF associations (face/edge DOFs for 2D, face DOFs for 3D H(div))
+    std::vector<DofAssociation> dof_associations() const override;
+
+private:
+    using ModalTerm = VectorBasisModalTerm;
+    using ModalPolynomial = VectorBasisModalPolynomial;
+    using SeedJacobianEvaluator = void (*)(
+        const math::Vector<Real, 3>&,
+        std::vector<VectorJacobian>&);
+
+    ElementType element_type_;
+    int dimension_;
+    int order_;
+    std::size_t size_{0};
+
+    bool nodal_generated_{false};
+    bool use_transformed_direct_seed_{false};  ///< True for wedge/pyramid RT(k=1,2) transformed from direct seed functions
+    std::vector<int> transformed_seed_indices_;
+    std::vector<std::array<int, 4>> transformed_monomial_candidates_; ///< {component, px, py, pz}
+    std::vector<ModalPolynomial> monomials_;
+    std::array<int, 3> modal_power_limits_{{0, 0, 0}};
+    std::array<int, 3> transformed_power_limits_{{0, 0, 0}};
+    SeedJacobianEvaluator transformed_seed_jacobian_evaluator_{nullptr};
+    // Sparse coefficients for nodal basis in modal monomial basis:
+    //   phi_j = sum_p c(p,j) * modal_p.
+    // Rows index modal functions; entries target nodal DOFs.
+    SparseModalCoefficientMatrix modal_sparse_coeffs_;
+    SparseModalCoefficientMatrix transformed_sparse_coeffs_;
+};
+
+/**
+ * @brief First-kind Nedelec H(curl) basis on supported element families
+ */
+class NedelecBasis : public VectorBasisFunction {
+public:
+    NedelecBasis(ElementType type, int order = 0);
+
+    BasisType basis_type() const noexcept override { return BasisType::Nedelec; }
+    ElementType element_type() const noexcept override { return element_type_; }
+    int dimension() const noexcept override { return dimension_; }
+    int order() const noexcept override { return order_; }
+    std::size_t size() const noexcept override { return size_; }
+    bool cache_identity_is_structural() const noexcept override { return true; }
+
+    void evaluate_vector_values(const math::Vector<Real, 3>& xi,
+                                std::vector<math::Vector<Real, 3>>& values) const override;
+    void evaluate_vector_jacobians(const math::Vector<Real, 3>& xi,
+                                   std::vector<VectorJacobian>& jacobians) const override;
+    void evaluate_curl(const math::Vector<Real, 3>& xi,
+                       std::vector<math::Vector<Real, 3>>& curl) const override;
+    bool supports_curl() const noexcept override { return true; }
+    void evaluate_vector_at_quadrature_points_strided(
+        const std::vector<math::Vector<Real, 3>>& points,
+        std::size_t output_stride,
+        Real* SVMP_RESTRICT values_out,
+        Real* SVMP_RESTRICT jacobians_out,
+        Real* SVMP_RESTRICT curls_out,
+        Real* SVMP_RESTRICT divergence_out) const override;
+
+    /// Get DOF associations (edge DOFs for H(curl), face DOFs for 3D interior)
+    std::vector<DofAssociation> dof_associations() const override;
+
+private:
+    using ModalTerm = VectorBasisModalTerm;
+    using ModalPolynomial = VectorBasisModalPolynomial;
+    using SeedJacobianEvaluator = void (*)(
+        const math::Vector<Real, 3>&,
+        std::vector<VectorJacobian>&);
+
+    ElementType element_type_;
+    int dimension_;
+    int order_;
+    std::size_t size_{0};
+
+    bool nodal_generated_{false};
+    bool use_transformed_direct_seed_{false};  ///< True for wedge/pyramid ND(k=1,2) transformed from direct seed/candidate functions
+    std::vector<std::array<int, 4>> transformed_monomial_candidates_; ///< {component, px, py, pz}
+    std::vector<ModalPolynomial> monomials_;
+    SparseModalCoefficientMatrix modal_sparse_coeffs_;
+    SparseModalCoefficientMatrix transformed_sparse_coeffs_;
+    std::array<int, 3> modal_power_limits_{{0, 0, 0}};
+    std::array<int, 3> transformed_power_limits_{{0, 0, 0}};
+    SeedJacobianEvaluator transformed_seed_jacobian_evaluator_{nullptr};
+};
+
+/**
+ * @brief Brezzi-Douglas-Marini basis (simple linear variant)
+ */
+class BDMBasis : public VectorBasisFunction {
+public:
+    BDMBasis(ElementType type, int order = 1);
+
+    BasisType basis_type() const noexcept override { return BasisType::BDM; }
+    ElementType element_type() const noexcept override { return element_type_; }
+    int dimension() const noexcept override { return dimension_; }
+    int order() const noexcept override { return order_; }
+    std::size_t size() const noexcept override { return size_; }
+    bool cache_identity_is_structural() const noexcept override { return true; }
+
+    void evaluate_vector_values(const math::Vector<Real, 3>& xi,
+                                std::vector<math::Vector<Real, 3>>& values) const override;
+    void evaluate_vector_jacobians(const math::Vector<Real, 3>& xi,
+                                   std::vector<VectorJacobian>& jacobians) const override;
+    void evaluate_divergence(const math::Vector<Real, 3>& xi,
+                             std::vector<Real>& divergence) const override;
+    bool supports_divergence() const noexcept override { return true; }
+    void evaluate_vector_at_quadrature_points_strided(
+        const std::vector<math::Vector<Real, 3>>& points,
+        std::size_t output_stride,
+        Real* SVMP_RESTRICT values_out,
+        Real* SVMP_RESTRICT jacobians_out,
+        Real* SVMP_RESTRICT curls_out,
+        Real* SVMP_RESTRICT divergence_out) const override;
+
+    /// Get DOF associations (face/edge DOFs for H(div))
+    std::vector<DofAssociation> dof_associations() const override;
+
+private:
+    using ModalTerm = VectorBasisModalTerm;
+    using ModalPolynomial = VectorBasisModalPolynomial;
+
+    ElementType element_type_;
+    int dimension_;
+    int order_;
+    std::size_t size_{0};
+    bool nodal_generated_{false};
+    std::vector<ModalPolynomial> monomials_;
+    SparseModalCoefficientMatrix modal_sparse_coeffs_;
+    std::array<int, 3> modal_power_limits_{{0, 0, 0}};
+};
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_VECTORBASIS_H
diff --git a/Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.cpp b/Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.cpp
new file mode 100644
index 000000000..7ec848633
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.cpp
@@ -0,0 +1,593 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#include "VectorBasisEvaluationHelpers.h"
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <string>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+namespace detail {
+namespace vector_common {
+
+VectorBasisScratch& vector_basis_scratch() {
+    // Scratch is intentionally thread-local: production assembly uses a
+    // persistent worker-thread team, so buffers stay warm on each worker.
+    static thread_local VectorBasisScratch scratch;
+    return scratch;
+}
+
+void prewarm_vector_basis_scratch(std::size_t max_size, std::size_t max_qpts) {
+    vector_basis_scratch().prewarm(max_size, max_qpts);
+}
+
+void fill_powers(Real x, int max_p, std::vector<Real>& out) {
+    BASIS_CHECK_CONSTRUCTION(max_p >= 0, "powers: negative max_p");
+    out.assign(static_cast<std::size_t>(max_p + 1), Real(1));
+    for (int i = 1; i <= max_p; ++i) {
+        out[static_cast<std::size_t>(i)] =
+            out[static_cast<std::size_t>(i - 1)] * x;
+    }
+}
+
+void fill_power_tables(const Vec3& xi,
+                       const std::array<int, 3>& limits,
+                       VectorBasisScratch& scratch) {
+    fill_powers(xi[0], limits[0], scratch.px);
+    fill_powers(xi[1], limits[1], scratch.py);
+    fill_powers(xi[2], limits[2], scratch.pz);
+}
+
+namespace {
+
+constexpr Real kSparseCoefficientRelativeTolerance =
+    Real(256) * std::numeric_limits<Real>::epsilon();
+
+void fill_batched_axis_powers(const std::vector<Vec3>& points,
+                              std::size_t axis,
+                              int max_power,
+                              std::vector<Real>& out) {
+    BASIS_CHECK_CONSTRUCTION(max_power >= 0, "batched powers: negative max_p");
+    const std::size_t num_qpts = points.size();
+    out.assign(static_cast<std::size_t>(max_power + 1) * num_qpts, Real(1));
+    if (num_qpts == 0 || max_power == 0) {
+        return;
+    }
+
+    Real* first_power = out.data() + num_qpts;
+    for (std::size_t q = 0; q < num_qpts; ++q) {
+        first_power[q] = points[q][axis];
+    }
+    for (int power = 2; power <= max_power; ++power) {
+        const Real* previous =
+            out.data() + static_cast<std::size_t>(power - 1) * num_qpts;
+        Real* current = out.data() + static_cast<std::size_t>(power) * num_qpts;
+        for (std::size_t q = 0; q < num_qpts; ++q) {
+            current[q] = previous[q] * points[q][axis];
+        }
+    }
+}
+
+} // namespace
+
+void fill_batched_power_tables(const std::vector<Vec3>& points,
+                               const std::array<int, 3>& limits,
+                               VectorBasisScratch& scratch) {
+    fill_batched_axis_powers(points, 0u, limits[0], scratch.batched_px);
+    fill_batched_axis_powers(points, 1u, limits[1], scratch.batched_py);
+    fill_batched_axis_powers(points, 2u, limits[2], scratch.batched_pz);
+}
+
+void validate_vector_strided_outputs(std::size_t num_qpts,
+                                     std::size_t output_stride,
+                                     const char* family_name) {
+    if (output_stride < num_qpts) {
+        throw BasisConfigurationException(
+            std::string(family_name) +
+                " strided vector evaluation requires output_stride >= points.size()",
+            __FILE__, __LINE__, __func__);
+    }
+}
+
+void zero_active_strided_rows(Real* output,
+                              std::size_t rows,
+                              std::size_t output_stride,
+                              std::size_t num_qpts) {
+    for (std::size_t row = 0; row < rows; ++row) {
+        std::fill_n(output + row * output_stride, num_qpts, Real(0));
+    }
+}
+
+SparseModalCoefficientMatrix build_sparse_modal_coefficients(
+    const std::vector<Real>& dense_coefficients,
+    std::size_t rows,
+    std::size_t cols) {
+    BASIS_CHECK_CONSTRUCTION(dense_coefficients.size() == rows * cols,
+                 "build_sparse_modal_coefficients: dense coefficient size mismatch");
+
+    SparseModalCoefficientMatrix sparse;
+    sparse.rows = rows;
+    sparse.cols = cols;
+    sparse.row_offsets.reserve(rows + 1u);
+    sparse.row_offsets.push_back(0u);
+
+    Real max_abs = Real(0);
+    for (const Real coefficient : dense_coefficients) {
+        max_abs = std::max(max_abs, std::abs(coefficient));
+    }
+    const Real prune_threshold = kSparseCoefficientRelativeTolerance * max_abs;
+
+    for (std::size_t row = 0; row < rows; ++row) {
+        const Real* dense_row = dense_coefficients.data() + row * cols;
+        for (std::size_t col = 0; col < cols; ++col) {
+            const Real coefficient = dense_row[col];
+            if (std::abs(coefficient) > prune_threshold) {
+                sparse.dofs.push_back(col);
+                sparse.coefficients.push_back(coefficient);
+            }
+        }
+        sparse.row_offsets.push_back(sparse.dofs.size());
+    }
+
+    return sparse;
+}
+
+Vec3 curl_from_jacobian(const VectorJacobian& J) noexcept {
+    return Vec3{J(2u, 1u) - J(1u, 2u),
+                J(0u, 2u) - J(2u, 0u),
+                J(1u, 0u) - J(0u, 1u)};
+}
+
+Real divergence_from_jacobian(const VectorJacobian& J) noexcept {
+    return J(0u, 0u) + J(1u, 1u) + J(2u, 2u);
+}
+
+void write_vector_values_strided(const std::vector<Vec3>& values,
+                                 std::size_t num_dofs,
+                                 std::size_t output_stride,
+                                 std::size_t q,
+                                 Real* SVMP_RESTRICT values_out) {
+    if (values_out == nullptr) {
+        return;
+    }
+    BASIS_CHECK_CONSTRUCTION(values.size() == num_dofs,
+                 "vector value evaluation returned the wrong number of DOFs");
+    for (std::size_t dof = 0; dof < num_dofs; ++dof) {
+        for (std::size_t component = 0; component < 3u; ++component) {
+            values_out[(dof * 3u + component) * output_stride + q] =
+                values[dof][component];
+        }
+    }
+}
+
+void write_vector_jacobians_strided(const std::vector<VectorJacobian>& jacobians,
+                                    std::size_t num_dofs,
+                                    std::size_t output_stride,
+                                    std::size_t q,
+                                    Real* SVMP_RESTRICT jacobians_out) {
+    if (jacobians_out == nullptr) {
+        return;
+    }
+    BASIS_CHECK_CONSTRUCTION(jacobians.size() == num_dofs,
+                 "vector Jacobian evaluation returned the wrong number of DOFs");
+    for (std::size_t dof = 0; dof < num_dofs; ++dof) {
+        const auto& J = jacobians[dof];
+        for (std::size_t component = 0; component < 3u; ++component) {
+            for (std::size_t derivative = 0; derivative < 3u; ++derivative) {
+                jacobians_out[(dof * 9u + component * 3u + derivative) *
+                                  output_stride + q] = J(component, derivative);
+            }
+        }
+    }
+}
+
+void write_vector_curl_strided(const std::vector<Vec3>& curl,
+                               std::size_t num_dofs,
+                               std::size_t output_stride,
+                               std::size_t q,
+                               Real* SVMP_RESTRICT curls_out) {
+    if (curls_out == nullptr) {
+        return;
+    }
+    BASIS_CHECK_CONSTRUCTION(curl.size() == num_dofs,
+                 "vector curl evaluation returned the wrong number of DOFs");
+    for (std::size_t dof = 0; dof < num_dofs; ++dof) {
+        for (std::size_t component = 0; component < 3u; ++component) {
+            curls_out[(dof * 3u + component) * output_stride + q] =
+                curl[dof][component];
+        }
+    }
+}
+
+void write_vector_divergence_strided(const std::vector<Real>& divergence,
+                                     std::size_t num_dofs,
+                                     std::size_t output_stride,
+                                     std::size_t q,
+                                     Real* SVMP_RESTRICT divergence_out) {
+    if (divergence_out == nullptr) {
+        return;
+    }
+    BASIS_CHECK_CONSTRUCTION(divergence.size() == num_dofs,
+                 "vector divergence evaluation returned the wrong number of DOFs");
+    for (std::size_t dof = 0; dof < num_dofs; ++dof) {
+        divergence_out[dof * output_stride + q] = divergence[dof];
+    }
+}
+
+void write_curl_and_divergence_from_jacobians_strided(
+    const std::vector<VectorJacobian>& jacobians,
+    std::size_t num_dofs,
+    std::size_t output_stride,
+    std::size_t q,
+    Real* SVMP_RESTRICT curls_out,
+    Real* SVMP_RESTRICT divergence_out) {
+    BASIS_CHECK_CONSTRUCTION(jacobians.size() == num_dofs,
+                 "vector Jacobian evaluation returned the wrong number of DOFs");
+    for (std::size_t dof = 0; dof < num_dofs; ++dof) {
+        const auto& J = jacobians[dof];
+        if (curls_out != nullptr) {
+            const Vec3 curl = curl_from_jacobian(J);
+            for (std::size_t component = 0; component < 3u; ++component) {
+                curls_out[(dof * 3u + component) * output_stride + q] =
+                    curl[component];
+            }
+        }
+        if (divergence_out != nullptr) {
+            divergence_out[dof * output_stride + q] = divergence_from_jacobian(J);
+        }
+    }
+}
+
+Vec3 lerp(const Vec3& a, const Vec3& b, Real s) {
+    const Real t = (s + Real(1)) * Real(0.5);
+    return a * (Real(1) - t) + b * t;
+}
+
+Vec3 bilinear(const std::array<Vec3, 4>& v, Real u, Real w) {
+    const Real N0 = Real(0.25) * (Real(1) - u) * (Real(1) - w);
+    const Real N1 = Real(0.25) * (Real(1) + u) * (Real(1) - w);
+    const Real N2 = Real(0.25) * (Real(1) + u) * (Real(1) + w);
+    const Real N3 = Real(0.25) * (Real(1) - u) * (Real(1) + w);
+    return v[0] * N0 + v[1] * N1 + v[2] * N2 + v[3] * N3;
+}
+
+Vec3 bilinear_du(const std::array<Vec3, 4>& v, Real u, Real w) {
+    (void)u;
+    const Real dN0 = -Real(0.25) * (Real(1) - w);
+    const Real dN1 =  Real(0.25) * (Real(1) - w);
+    const Real dN2 =  Real(0.25) * (Real(1) + w);
+    const Real dN3 = -Real(0.25) * (Real(1) + w);
+    return v[0] * dN0 + v[1] * dN1 + v[2] * dN2 + v[3] * dN3;
+}
+
+Vec3 bilinear_dw(const std::array<Vec3, 4>& v, Real u, Real w) {
+    (void)w;
+    const Real dN0 = -Real(0.25) * (Real(1) - u);
+    const Real dN1 = -Real(0.25) * (Real(1) + u);
+    const Real dN2 =  Real(0.25) * (Real(1) + u);
+    const Real dN3 =  Real(0.25) * (Real(1) - u);
+    return v[0] * dN0 + v[1] * dN1 + v[2] * dN2 + v[3] * dN3;
+}
+
+Vec3 cross3(const Vec3& a, const Vec3& b) {
+    return Vec3{a[1] * b[2] - a[2] * b[1],
+                a[2] * b[0] - a[0] * b[2],
+                a[0] * b[1] - a[1] * b[0]};
+}
+
+Vec3 normalize3(const Vec3& v) {
+    const Real n = v.norm();
+    BASIS_CHECK_CONSTRUCTION(n > std::numeric_limits<Real>::epsilon(),
+                 "normalize3: zero-length vector");
+    return v / n;
+}
+
+std::array<int, 3> component_monomial_power_limits(
+    const std::vector<std::array<int, 4>>& candidates) {
+    std::array<int, 3> limits{{0, 0, 0}};
+    for (const auto& mono : candidates) {
+        limits[0] = std::max(limits[0], mono[1]);
+        limits[1] = std::max(limits[1], mono[2]);
+        limits[2] = std::max(limits[2], mono[3]);
+    }
+    return limits;
+}
+
+std::size_t triangle_poly_dim(std::size_t k) {
+    return (k + 1u) * (k + 2u) / 2u;
+}
+
+std::size_t tetra_poly_dim(std::size_t k) {
+    return (k + 1u) * (k + 2u) * (k + 3u) / 6u;
+}
+
+std::size_t rt_wedge_size(int order) {
+    const std::size_t k = static_cast<std::size_t>(order);
+    const std::size_t face_dofs =
+        2u * triangle_poly_dim(k) + 3u * (k + 1u) * (k + 1u);
+    const std::size_t interior_dofs =
+        (k >= 1u) ? (3u * k * (k + 1u) * (k + 1u) / 2u) : 0u;
+    return face_dofs + interior_dofs;
+}
+
+std::size_t rt_pyramid_size(int order) {
+    const std::size_t k = static_cast<std::size_t>(order);
+    const std::size_t face_dofs = (k + 1u) * (k + 1u) + 4u * triangle_poly_dim(k);
+    const std::size_t interior_dofs = (k >= 1u) ? (3u * k * k * k) : 0u;
+    return face_dofs + interior_dofs;
+}
+
+std::size_t nd_wedge_size(int order) {
+    const std::size_t k = static_cast<std::size_t>(order);
+    const std::size_t edge_dofs = 9u * (k + 1u);
+    const std::size_t face_dofs = (k >= 1u) ? (8u * k * (k + 1u)) : 0u;
+    const std::size_t interior_dofs =
+        (k >= 2u) ? (3u * k * (k - 1u) * (k + 1u) / 2u) : 0u;
+    return edge_dofs + face_dofs + interior_dofs;
+}
+
+std::size_t nd_pyramid_size(int order) {
+    const std::size_t k = static_cast<std::size_t>(order);
+    const std::size_t edge_dofs = 8u * (k + 1u);
+    const std::size_t face_dofs = (k >= 1u) ? (6u * k * (k + 1u)) : 0u;
+    const std::size_t interior_dofs =
+        (k >= 2u) ? (k * (k - 1u) * (k + 1u) / 2u) : 0u;
+    return edge_dofs + face_dofs + interior_dofs;
+}
+
+void ensure_supported_hybrid_vector_order(ElementType type,
+                                          int order,
+                                          const char* family_name) {
+    (void)type;
+    (void)order;
+    (void)family_name;
+}
+
+std::vector<std::array<int, 4>> make_component_monomial_candidates(
+    int max_total_degree) {
+    BASIS_CHECK_CONSTRUCTION(max_total_degree >= 0,
+                 "make_component_monomial_candidates: negative total degree");
+
+    std::vector<std::array<int, 4>> candidates;
+    for (int component = 0; component < 3; ++component) {
+        for (int total = 0; total <= max_total_degree; ++total) {
+            for (int pz = 0; pz <= total; ++pz) {
+                for (int py = 0; py <= total - pz; ++py) {
+                    const int px = total - py - pz;
+                    candidates.push_back({component, px, py, pz});
+                }
+            }
+        }
+    }
+    return candidates;
+}
+
+std::vector<std::array<int, 4>> make_rt_extra_monomial_candidates(ElementType type,
+                                                                  int order) {
+    if (order >= 3) {
+        return make_component_monomial_candidates(3 * order);
+    }
+
+    std::vector<std::array<int, 4>> candidates;
+    if (!is_pyramid(type) || order != 2) {
+        return candidates;
+    }
+
+    for (int component = 0; component < 3; ++component) {
+        for (int pz = 0; pz <= 2; ++pz) {
+            for (int py = 0; py <= 2 - pz; ++py) {
+                for (int px = 0; px <= 2 - py - pz; ++px) {
+                    candidates.push_back({component, px, py, pz});
+                }
+            }
+        }
+    }
+    return candidates;
+}
+
+Real eval_transformed_rt_monomial_scalar(const std::array<int, 4>& mono,
+                                         const std::vector<Real>& px,
+                                         const std::vector<Real>& py,
+                                         const std::vector<Real>& pz) {
+    return px[static_cast<std::size_t>(mono[1])] *
+           py[static_cast<std::size_t>(mono[2])] *
+           pz[static_cast<std::size_t>(mono[3])];
+}
+
+Real eval_transformed_rt_monomial_divergence(const std::array<int, 4>& mono,
+                                             const std::vector<Real>& px,
+                                             const std::vector<Real>& py,
+                                             const std::vector<Real>& pz) {
+    const int component = mono[0];
+    const int px_pow = mono[1];
+    const int py_pow = mono[2];
+    const int pz_pow = mono[3];
+
+    if (component == 0) {
+        if (px_pow == 0) {
+            return Real(0);
+        }
+        return Real(px_pow) *
+               px[static_cast<std::size_t>(px_pow - 1)] *
+               py[static_cast<std::size_t>(py_pow)] *
+               pz[static_cast<std::size_t>(pz_pow)];
+    }
+    if (component == 1) {
+        if (py_pow == 0) {
+            return Real(0);
+        }
+        return Real(py_pow) *
+               px[static_cast<std::size_t>(px_pow)] *
+               py[static_cast<std::size_t>(py_pow - 1)] *
+               pz[static_cast<std::size_t>(pz_pow)];
+    }
+    if (pz_pow == 0) {
+        return Real(0);
+    }
+    return Real(pz_pow) *
+           px[static_cast<std::size_t>(px_pow)] *
+           py[static_cast<std::size_t>(py_pow)] *
+           pz[static_cast<std::size_t>(pz_pow - 1)];
+}
+
+void add_component_monomial_jacobian(VectorJacobian& J,
+                                     int component,
+                                     int px_pow,
+                                     int py_pow,
+                                     int pz_pow,
+                                     Real coefficient,
+                                     const std::vector<Real>& px,
+                                     const std::vector<Real>& py,
+                                     const std::vector<Real>& pz) {
+    const auto comp = static_cast<std::size_t>(component);
+    if (px_pow > 0) {
+        J(comp, 0) += coefficient * Real(px_pow) *
+                      px[static_cast<std::size_t>(px_pow - 1)] *
+                      py[static_cast<std::size_t>(py_pow)] *
+                      pz[static_cast<std::size_t>(pz_pow)];
+    }
+    if (py_pow > 0) {
+        J(comp, 1) += coefficient * Real(py_pow) *
+                      px[static_cast<std::size_t>(px_pow)] *
+                      py[static_cast<std::size_t>(py_pow - 1)] *
+                      pz[static_cast<std::size_t>(pz_pow)];
+    }
+    if (pz_pow > 0) {
+        J(comp, 2) += coefficient * Real(pz_pow) *
+                      px[static_cast<std::size_t>(px_pow)] *
+                      py[static_cast<std::size_t>(py_pow)] *
+                      pz[static_cast<std::size_t>(pz_pow - 1)];
+    }
+}
+
+VectorJacobian eval_transformed_component_monomial_jacobian(
+    const std::array<int, 4>& mono,
+    const std::vector<Real>& px,
+    const std::vector<Real>& py,
+    const std::vector<Real>& pz) {
+    VectorJacobian J{};
+    add_component_monomial_jacobian(
+        J, mono[0], mono[1], mono[2], mono[3], Real(1), px, py, pz);
+    return J;
+}
+
+void add_component_monomial_curl(Vec3& curl,
+                                 int component,
+                                 int px_pow,
+                                 int py_pow,
+                                 int pz_pow,
+                                 Real coefficient,
+                                 const std::vector<Real>& px,
+                                 const std::vector<Real>& py,
+                                 const std::vector<Real>& pz) {
+    const Real dphidx = (px_pow == 0)
+        ? Real(0)
+        : coefficient * Real(px_pow) *
+              px[static_cast<std::size_t>(px_pow - 1)] *
+              py[static_cast<std::size_t>(py_pow)] *
+              pz[static_cast<std::size_t>(pz_pow)];
+    const Real dphidy = (py_pow == 0)
+        ? Real(0)
+        : coefficient * Real(py_pow) *
+              px[static_cast<std::size_t>(px_pow)] *
+              py[static_cast<std::size_t>(py_pow - 1)] *
+              pz[static_cast<std::size_t>(pz_pow)];
+    const Real dphidz = (pz_pow == 0)
+        ? Real(0)
+        : coefficient * Real(pz_pow) *
+              px[static_cast<std::size_t>(px_pow)] *
+              py[static_cast<std::size_t>(py_pow)] *
+              pz[static_cast<std::size_t>(pz_pow - 1)];
+
+    if (component == 0) {
+        curl[1] += dphidz;
+        curl[2] -= dphidy;
+    } else if (component == 1) {
+        curl[0] -= dphidz;
+        curl[2] += dphidx;
+    } else {
+        curl[0] += dphidy;
+        curl[1] -= dphidx;
+    }
+}
+
+std::vector<std::array<int, 4>> make_nd_extra_monomial_candidates(ElementType,
+                                                                  int order) {
+    if (order >= 3) {
+        return make_component_monomial_candidates(3 * order);
+    }
+
+    std::vector<std::array<int, 4>> candidates;
+    const int max_total_degree = (order == 1) ? 4 : 5;
+    for (int component = 0; component < 3; ++component) {
+        for (int total = 0; total <= max_total_degree; ++total) {
+            for (int pz = 0; pz <= total; ++pz) {
+                for (int py = 0; py <= total - pz; ++py) {
+                    const int px = total - py - pz;
+                    candidates.push_back({component, px, py, pz});
+                }
+            }
+        }
+    }
+    return candidates;
+}
+
+Real eval_transformed_nd_monomial_scalar(const std::array<int, 4>& mono,
+                                         const std::vector<Real>& px,
+                                         const std::vector<Real>& py,
+                                         const std::vector<Real>& pz) {
+    return px[static_cast<std::size_t>(mono[1])] *
+           py[static_cast<std::size_t>(mono[2])] *
+           pz[static_cast<std::size_t>(mono[3])];
+}
+
+Vec3 eval_transformed_nd_monomial_curl(const std::array<int, 4>& mono,
+                                       const std::vector<Real>& px,
+                                       const std::vector<Real>& py,
+                                       const std::vector<Real>& pz) {
+    const int component = mono[0];
+    const int px_pow = mono[1];
+    const int py_pow = mono[2];
+    const int pz_pow = mono[3];
+
+    const Real dphidx = (px_pow == 0)
+        ? Real(0)
+        : Real(px_pow) *
+              px[static_cast<std::size_t>(px_pow - 1)] *
+              py[static_cast<std::size_t>(py_pow)] *
+              pz[static_cast<std::size_t>(pz_pow)];
+    const Real dphidy = (py_pow == 0)
+        ? Real(0)
+        : Real(py_pow) *
+              px[static_cast<std::size_t>(px_pow)] *
+              py[static_cast<std::size_t>(py_pow - 1)] *
+              pz[static_cast<std::size_t>(pz_pow)];
+    const Real dphidz = (pz_pow == 0)
+        ? Real(0)
+        : Real(pz_pow) *
+              px[static_cast<std::size_t>(px_pow)] *
+              py[static_cast<std::size_t>(py_pow)] *
+              pz[static_cast<std::size_t>(pz_pow - 1)];
+
+    if (component == 0) {
+        return Vec3{Real(0), dphidz, -dphidy};
+    }
+    if (component == 1) {
+        return Vec3{-dphidz, Real(0), dphidx};
+    }
+    return Vec3{dphidy, -dphidx, Real(0)};
+}
+
+} // namespace vector_common
+} // namespace detail
+} // namespace basis
+} // namespace FE
+} // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.h b/Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.h
new file mode 100644
index 000000000..e0e6daa10
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.h
@@ -0,0 +1,751 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_BASIS_VECTORBASISEVALUATIONHELPERS_H
+#define SVMP_FE_BASIS_VECTORBASISEVALUATIONHELPERS_H
+
+#include "VectorBasis.h"
+#include "Basis/BasisTraits.h"
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <string>
+#include <vector>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+namespace detail {
+namespace vector_common {
+
+using Vec3 = math::Vector<Real, 3>;
+
+struct VectorBasisScratch {
+    std::vector<Real> px;
+    std::vector<Real> py;
+    std::vector<Real> pz;
+    std::vector<Real> batched_px;
+    std::vector<Real> batched_py;
+    std::vector<Real> batched_pz;
+    std::vector<Real> candidate_values;
+    std::vector<Real> candidate_dx;
+    std::vector<Real> candidate_dy;
+    std::vector<Real> candidate_dz;
+    std::vector<Real> modal_values_batched;
+    std::vector<Real> modal_jacobians_batched;
+    std::vector<Real> modal_curls_batched;
+    std::vector<Real> modal_divergence_batched;
+    std::vector<Vec3> vector_values;
+    std::vector<VectorJacobian> vector_jacobians;
+    std::vector<Real> scalars;
+    std::vector<Vec3> api_values;
+    std::vector<VectorJacobian> api_jacobians;
+    std::vector<Vec3> api_curl;
+    std::vector<Real> api_divergence;
+
+    void prewarm(std::size_t max_size, std::size_t max_qpts) {
+        const std::size_t batched_size = max_size * std::max<std::size_t>(max_qpts, 1u);
+        px.reserve(max_size);
+        py.reserve(max_size);
+        pz.reserve(max_size);
+        batched_px.reserve(batched_size);
+        batched_py.reserve(batched_size);
+        batched_pz.reserve(batched_size);
+        candidate_values.reserve(max_size);
+        candidate_dx.reserve(max_size);
+        candidate_dy.reserve(max_size);
+        candidate_dz.reserve(max_size);
+        modal_values_batched.reserve(batched_size * 3u);
+        modal_jacobians_batched.reserve(batched_size * 9u);
+        modal_curls_batched.reserve(batched_size * 3u);
+        modal_divergence_batched.reserve(batched_size);
+        vector_values.reserve(max_size);
+        vector_jacobians.reserve(max_size);
+        scalars.reserve(max_size);
+        api_values.reserve(max_size);
+        api_jacobians.reserve(max_size);
+        api_curl.reserve(max_size);
+        api_divergence.reserve(max_size);
+    }
+};
+
+VectorBasisScratch& vector_basis_scratch();
+void prewarm_vector_basis_scratch(std::size_t max_size, std::size_t max_qpts = 0);
+
+void fill_powers(Real x, int max_p, std::vector<Real>& out);
+void fill_power_tables(const Vec3& xi,
+                       const std::array<int, 3>& limits,
+                       VectorBasisScratch& scratch);
+void fill_batched_power_tables(const std::vector<Vec3>& points,
+                               const std::array<int, 3>& limits,
+                               VectorBasisScratch& scratch);
+void validate_vector_strided_outputs(std::size_t num_qpts,
+                                     std::size_t output_stride,
+                                     const char* family_name);
+void zero_active_strided_rows(Real* output,
+                              std::size_t rows,
+                              std::size_t output_stride,
+                              std::size_t num_qpts);
+SparseModalCoefficientMatrix build_sparse_modal_coefficients(
+    const std::vector<Real>& dense_coefficients,
+    std::size_t rows,
+    std::size_t cols);
+Vec3 curl_from_jacobian(const VectorJacobian& J) noexcept;
+Real divergence_from_jacobian(const VectorJacobian& J) noexcept;
+
+inline Real batched_power_product(const std::vector<Real>& px,
+                                  const std::vector<Real>& py,
+                                  const std::vector<Real>& pz,
+                                  std::size_t stride,
+                                  int px_pow,
+                                  int py_pow,
+                                  int pz_pow,
+                                  std::size_t q) noexcept {
+    return px[static_cast<std::size_t>(px_pow) * stride + q] *
+           py[static_cast<std::size_t>(py_pow) * stride + q] *
+           pz[static_cast<std::size_t>(pz_pow) * stride + q];
+}
+
+inline Real batched_component_partial(const std::vector<Real>& px,
+                                      const std::vector<Real>& py,
+                                      const std::vector<Real>& pz,
+                                      std::size_t stride,
+                                      int px_pow,
+                                      int py_pow,
+                                      int pz_pow,
+                                      int derivative_axis,
+                                      std::size_t q) noexcept {
+    if (derivative_axis == 0) {
+        if (px_pow == 0) {
+            return Real(0);
+        }
+        return Real(px_pow) *
+               px[static_cast<std::size_t>(px_pow - 1) * stride + q] *
+               py[static_cast<std::size_t>(py_pow) * stride + q] *
+               pz[static_cast<std::size_t>(pz_pow) * stride + q];
+    }
+    if (derivative_axis == 1) {
+        if (py_pow == 0) {
+            return Real(0);
+        }
+        return Real(py_pow) *
+               px[static_cast<std::size_t>(px_pow) * stride + q] *
+               py[static_cast<std::size_t>(py_pow - 1) * stride + q] *
+               pz[static_cast<std::size_t>(pz_pow) * stride + q];
+    }
+    if (pz_pow == 0) {
+        return Real(0);
+    }
+    return Real(pz_pow) *
+           px[static_cast<std::size_t>(px_pow) * stride + q] *
+           py[static_cast<std::size_t>(py_pow) * stride + q] *
+           pz[static_cast<std::size_t>(pz_pow - 1) * stride + q];
+}
+
+inline Vec3 curl_from_component_gradient(int component,
+                                         Real dphidx,
+                                         Real dphidy,
+                                         Real dphidz) noexcept {
+    if (component == 0) {
+        return Vec3{Real(0), dphidz, -dphidy};
+    }
+    if (component == 1) {
+        return Vec3{-dphidz, Real(0), dphidx};
+    }
+    return Vec3{dphidy, -dphidx, Real(0)};
+}
+
+inline void axpy_qpoints(Real* target,
+                         const Real* source,
+                         Real coefficient,
+                         std::size_t num_qpts) noexcept {
+    for (std::size_t q = 0; q < num_qpts; ++q) {
+        target[q] += coefficient * source[q];
+    }
+}
+
+void write_vector_values_strided(const std::vector<Vec3>& values,
+                                 std::size_t num_dofs,
+                                 std::size_t output_stride,
+                                 std::size_t q,
+                                 Real* SVMP_RESTRICT values_out);
+void write_vector_jacobians_strided(const std::vector<VectorJacobian>& jacobians,
+                                    std::size_t num_dofs,
+                                    std::size_t output_stride,
+                                    std::size_t q,
+                                    Real* SVMP_RESTRICT jacobians_out);
+void write_vector_curl_strided(const std::vector<Vec3>& curl,
+                               std::size_t num_dofs,
+                               std::size_t output_stride,
+                               std::size_t q,
+                               Real* SVMP_RESTRICT curls_out);
+void write_vector_divergence_strided(const std::vector<Real>& divergence,
+                                     std::size_t num_dofs,
+                                     std::size_t output_stride,
+                                     std::size_t q,
+                                     Real* SVMP_RESTRICT divergence_out);
+void write_curl_and_divergence_from_jacobians_strided(
+    const std::vector<VectorJacobian>& jacobians,
+    std::size_t num_dofs,
+    std::size_t output_stride,
+    std::size_t q,
+    Real* SVMP_RESTRICT curls_out,
+    Real* SVMP_RESTRICT divergence_out);
+
+template <typename BasisLike>
+void evaluate_vector_public_api_strided(
+    const BasisLike& basis,
+    const std::vector<Vec3>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT jacobians_out,
+    Real* SVMP_RESTRICT curls_out,
+    Real* SVMP_RESTRICT divergence_out,
+    bool use_direct_curl,
+    bool use_direct_divergence,
+    const char* family_name) {
+    const std::size_t num_qpts = points.size();
+    const std::size_t num_dofs = basis.size();
+    validate_vector_strided_outputs(num_qpts, output_stride, family_name);
+
+    auto& scratch = vector_basis_scratch();
+    for (std::size_t q = 0; q < num_qpts; ++q) {
+        if (values_out != nullptr) {
+            basis.evaluate_vector_values(points[q], scratch.api_values);
+            write_vector_values_strided(
+                scratch.api_values, num_dofs, output_stride, q, values_out);
+        }
+
+        const bool needs_jacobians =
+            jacobians_out != nullptr ||
+            (curls_out != nullptr && !use_direct_curl) ||
+            (divergence_out != nullptr && !use_direct_divergence);
+
+        if (needs_jacobians) {
+            basis.evaluate_vector_jacobians(points[q], scratch.api_jacobians);
+            write_vector_jacobians_strided(
+                scratch.api_jacobians, num_dofs, output_stride, q, jacobians_out);
+            write_curl_and_divergence_from_jacobians_strided(
+                scratch.api_jacobians,
+                num_dofs,
+                output_stride,
+                q,
+                curls_out,
+                divergence_out);
+            continue;
+        }
+
+        if (curls_out != nullptr) {
+            basis.evaluate_curl(points[q], scratch.api_curl);
+            write_vector_curl_strided(
+                scratch.api_curl, num_dofs, output_stride, q, curls_out);
+        }
+        if (divergence_out != nullptr) {
+            basis.evaluate_divergence(points[q], scratch.api_divergence);
+            write_vector_divergence_strided(
+                scratch.api_divergence, num_dofs, output_stride, q, divergence_out);
+        }
+    }
+}
+
+Vec3 lerp(const Vec3& a, const Vec3& b, Real s);
+Vec3 bilinear(const std::array<Vec3, 4>& v, Real u, Real w);
+Vec3 bilinear_du(const std::array<Vec3, 4>& v, Real u, Real w);
+Vec3 bilinear_dw(const std::array<Vec3, 4>& v, Real u, Real w);
+Vec3 cross3(const Vec3& a, const Vec3& b);
+Vec3 normalize3(const Vec3& v);
+
+template <typename ModalPolynomials>
+std::array<int, 3> modal_power_limits(const ModalPolynomials& monomials) {
+    std::array<int, 3> limits{{0, 0, 0}};
+    for (const auto& poly : monomials) {
+        for (int t = 0; t < poly.num_terms; ++t) {
+            const auto& m = poly.terms[static_cast<std::size_t>(t)];
+            limits[0] = std::max(limits[0], m.px);
+            limits[1] = std::max(limits[1], m.py);
+            limits[2] = std::max(limits[2], m.pz);
+        }
+    }
+    return limits;
+}
+
+std::array<int, 3> component_monomial_power_limits(
+    const std::vector<std::array<int, 4>>& candidates);
+std::size_t triangle_poly_dim(std::size_t k);
+std::size_t tetra_poly_dim(std::size_t k);
+std::size_t rt_wedge_size(int order);
+std::size_t rt_pyramid_size(int order);
+std::size_t nd_wedge_size(int order);
+std::size_t nd_pyramid_size(int order);
+void ensure_supported_hybrid_vector_order(ElementType type,
+                                          int order,
+                                          const char* family_name);
+std::vector<std::array<int, 4>> make_component_monomial_candidates(int max_total_degree);
+std::vector<std::array<int, 4>> make_rt_extra_monomial_candidates(ElementType type,
+                                                                  int order);
+Real eval_transformed_rt_monomial_scalar(const std::array<int, 4>& mono,
+                                         const std::vector<Real>& px,
+                                         const std::vector<Real>& py,
+                                         const std::vector<Real>& pz);
+Real eval_transformed_rt_monomial_divergence(const std::array<int, 4>& mono,
+                                             const std::vector<Real>& px,
+                                             const std::vector<Real>& py,
+                                             const std::vector<Real>& pz);
+
+void add_component_monomial_jacobian(VectorJacobian& J,
+                                     int component,
+                                     int px_pow,
+                                     int py_pow,
+                                     int pz_pow,
+                                     Real coefficient,
+                                     const std::vector<Real>& px,
+                                     const std::vector<Real>& py,
+                                     const std::vector<Real>& pz);
+VectorJacobian eval_transformed_component_monomial_jacobian(
+    const std::array<int, 4>& mono,
+    const std::vector<Real>& px,
+    const std::vector<Real>& py,
+    const std::vector<Real>& pz);
+void add_component_monomial_curl(Vec3& curl,
+                                 int component,
+                                 int px_pow,
+                                 int py_pow,
+                                 int pz_pow,
+                                 Real coefficient,
+                                 const std::vector<Real>& px,
+                                 const std::vector<Real>& py,
+                                 const std::vector<Real>& pz);
+
+template <typename ModalPolynomials>
+void evaluate_nodal_modal_vector_values_with_limits(const ModalPolynomials& monomials,
+                                                    const SparseModalCoefficientMatrix& sparse_coeffs,
+                                                    std::size_t n,
+                                                    const Vec3& xi,
+                                                    const std::array<int, 3>& power_limits,
+                                                    std::vector<Vec3>& values) {
+    values.assign(n, Vec3{});
+
+    auto& scratch = vector_basis_scratch();
+    fill_power_tables(xi, power_limits, scratch);
+    const auto& px = scratch.px;
+    const auto& py = scratch.py;
+    const auto& pz = scratch.pz;
+
+    auto& modal_vals = scratch.vector_values;
+    modal_vals.assign(n, Vec3{});
+    for (std::size_t p = 0; p < n; ++p) {
+        const auto& poly = monomials[p];
+        auto& v = modal_vals[p];
+        for (int t = 0; t < poly.num_terms; ++t) {
+            const auto& m = poly.terms[static_cast<std::size_t>(t)];
+            const Real mv =
+                px[static_cast<std::size_t>(m.px)] *
+                py[static_cast<std::size_t>(m.py)] *
+                pz[static_cast<std::size_t>(m.pz)];
+            v[static_cast<std::size_t>(m.component)] += m.coefficient * mv;
+        }
+    }
+
+    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.rows == n &&
+                     sparse_coeffs.cols == n &&
+                     sparse_coeffs.row_offsets.size() == n + 1u,
+                 "evaluate_nodal_modal_vector_values: sparse coefficient size mismatch");
+    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.dofs.size() == sparse_coeffs.coefficients.size(),
+                 "evaluate_nodal_modal_vector_values: sparse coefficient entry mismatch");
+    for (std::size_t p = 0; p < n; ++p) {
+        const Vec3& mv = modal_vals[p];
+        const std::size_t row_begin = sparse_coeffs.row_offsets[p];
+        const std::size_t row_end = sparse_coeffs.row_offsets[p + 1u];
+        for (std::size_t entry = row_begin; entry < row_end; ++entry) {
+            const std::size_t dof = sparse_coeffs.dofs[entry];
+            const Real c = sparse_coeffs.coefficients[entry];
+            values[dof][0] += c * mv[0];
+            values[dof][1] += c * mv[1];
+            values[dof][2] += c * mv[2];
+        }
+    }
+}
+
+template <typename ModalPolynomials>
+void evaluate_nodal_modal_vector_jacobians_with_limits(const ModalPolynomials& monomials,
+                                                       const SparseModalCoefficientMatrix& sparse_coeffs,
+                                                       std::size_t n,
+                                                       const Vec3& xi,
+                                                       const std::array<int, 3>& power_limits,
+                                                       std::vector<VectorJacobian>& jacobians) {
+    jacobians.assign(n, VectorJacobian{});
+
+    auto& scratch = vector_basis_scratch();
+    fill_power_tables(xi, power_limits, scratch);
+    const auto& px = scratch.px;
+    const auto& py = scratch.py;
+    const auto& pz = scratch.pz;
+
+    auto& modal_jacs = scratch.vector_jacobians;
+    modal_jacs.assign(n, VectorJacobian{});
+    for (std::size_t p = 0; p < n; ++p) {
+        const auto& poly = monomials[p];
+        auto& J = modal_jacs[p];
+        for (int t = 0; t < poly.num_terms; ++t) {
+            const auto& m = poly.terms[static_cast<std::size_t>(t)];
+            add_component_monomial_jacobian(J, m.component, m.px, m.py, m.pz, m.coefficient, px, py, pz);
+        }
+    }
+
+    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.rows == n &&
+                     sparse_coeffs.cols == n &&
+                     sparse_coeffs.row_offsets.size() == n + 1u,
+                 "evaluate_nodal_modal_vector_jacobians: sparse coefficient size mismatch");
+    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.dofs.size() == sparse_coeffs.coefficients.size(),
+                 "evaluate_nodal_modal_vector_jacobians: sparse coefficient entry mismatch");
+    for (std::size_t p = 0; p < n; ++p) {
+        const auto& Jp = modal_jacs[p];
+        const std::size_t row_begin = sparse_coeffs.row_offsets[p];
+        const std::size_t row_end = sparse_coeffs.row_offsets[p + 1u];
+        for (std::size_t entry = row_begin; entry < row_end; ++entry) {
+            const std::size_t dof = sparse_coeffs.dofs[entry];
+            const Real c = sparse_coeffs.coefficients[entry];
+            for (std::size_t r = 0; r < 3; ++r) {
+                for (std::size_t col = 0; col < 3; ++col) {
+                    jacobians[dof](r, col) += c * Jp(r, col);
+                }
+            }
+        }
+    }
+}
+
+template <typename ModalPolynomials>
+void evaluate_nodal_modal_vector_curl_with_limits(const ModalPolynomials& monomials,
+                                                  const SparseModalCoefficientMatrix& sparse_coeffs,
+                                                  std::size_t n,
+                                                  const Vec3& xi,
+                                                  const std::array<int, 3>& power_limits,
+                                                  std::vector<Vec3>& curl) {
+    curl.assign(n, Vec3{});
+
+    auto& scratch = vector_basis_scratch();
+    fill_power_tables(xi, power_limits, scratch);
+    const auto& px = scratch.px;
+    const auto& py = scratch.py;
+    const auto& pz = scratch.pz;
+
+    auto& modal_curl = scratch.vector_values;
+    modal_curl.assign(n, Vec3{});
+    for (std::size_t p = 0; p < n; ++p) {
+        const auto& poly = monomials[p];
+        auto& c = modal_curl[p];
+        for (int t = 0; t < poly.num_terms; ++t) {
+            const auto& m = poly.terms[static_cast<std::size_t>(t)];
+            add_component_monomial_curl(c, m.component, m.px, m.py, m.pz, m.coefficient, px, py, pz);
+        }
+    }
+
+    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.rows == n &&
+                     sparse_coeffs.cols == n &&
+                     sparse_coeffs.row_offsets.size() == n + 1u,
+                 "evaluate_nodal_modal_vector_curl: sparse coefficient size mismatch");
+    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.dofs.size() == sparse_coeffs.coefficients.size(),
+                 "evaluate_nodal_modal_vector_curl: sparse coefficient entry mismatch");
+    for (std::size_t p = 0; p < n; ++p) {
+        const Vec3& cm = modal_curl[p];
+        const std::size_t row_begin = sparse_coeffs.row_offsets[p];
+        const std::size_t row_end = sparse_coeffs.row_offsets[p + 1u];
+        for (std::size_t entry = row_begin; entry < row_end; ++entry) {
+            const std::size_t dof = sparse_coeffs.dofs[entry];
+            const Real c = sparse_coeffs.coefficients[entry];
+            curl[dof][0] += c * cm[0];
+            curl[dof][1] += c * cm[1];
+            curl[dof][2] += c * cm[2];
+        }
+    }
+}
+
+template <typename ModalPolynomials>
+void evaluate_nodal_modal_divergence_with_limits(const ModalPolynomials& monomials,
+                                                 const SparseModalCoefficientMatrix& sparse_coeffs,
+                                                 std::size_t n,
+                                                 const Vec3& xi,
+                                                 const std::array<int, 3>& power_limits,
+                                                 std::vector<Real>& divergence) {
+    divergence.assign(n, Real(0));
+
+    auto& scratch = vector_basis_scratch();
+    fill_power_tables(xi, power_limits, scratch);
+    const auto& px = scratch.px;
+    const auto& py = scratch.py;
+    const auto& pz = scratch.pz;
+
+    auto& modal_divergence = scratch.scalars;
+    modal_divergence.assign(n, Real(0));
+    for (std::size_t p = 0; p < n; ++p) {
+        const auto& poly = monomials[p];
+        Real div = Real(0);
+        for (int t = 0; t < poly.num_terms; ++t) {
+            const auto& m = poly.terms[static_cast<std::size_t>(t)];
+            if (m.component == 0 && m.px > 0) {
+                div += m.coefficient * Real(m.px) *
+                       px[static_cast<std::size_t>(m.px - 1)] *
+                       py[static_cast<std::size_t>(m.py)] *
+                       pz[static_cast<std::size_t>(m.pz)];
+            } else if (m.component == 1 && m.py > 0) {
+                div += m.coefficient * Real(m.py) *
+                       px[static_cast<std::size_t>(m.px)] *
+                       py[static_cast<std::size_t>(m.py - 1)] *
+                       pz[static_cast<std::size_t>(m.pz)];
+            } else if (m.component == 2 && m.pz > 0) {
+                div += m.coefficient * Real(m.pz) *
+                       px[static_cast<std::size_t>(m.px)] *
+                       py[static_cast<std::size_t>(m.py)] *
+                       pz[static_cast<std::size_t>(m.pz - 1)];
+            }
+        }
+        modal_divergence[p] = div;
+    }
+
+    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.rows == n &&
+                     sparse_coeffs.cols == n &&
+                     sparse_coeffs.row_offsets.size() == n + 1u,
+                 "evaluate_nodal_modal_divergence: sparse coefficient size mismatch");
+    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.dofs.size() == sparse_coeffs.coefficients.size(),
+                 "evaluate_nodal_modal_divergence: sparse coefficient entry mismatch");
+    for (std::size_t p = 0; p < n; ++p) {
+        const Real div = modal_divergence[p];
+        if (div == Real(0)) {
+            continue;
+        }
+        const std::size_t row_begin = sparse_coeffs.row_offsets[p];
+        const std::size_t row_end = sparse_coeffs.row_offsets[p + 1u];
+        for (std::size_t entry = row_begin; entry < row_end; ++entry) {
+            divergence[sparse_coeffs.dofs[entry]] +=
+                sparse_coeffs.coefficients[entry] * div;
+        }
+    }
+}
+
+template <typename ModalPolynomials>
+void evaluate_nodal_modal_vector_strided_with_limits(
+    const ModalPolynomials& monomials,
+    const SparseModalCoefficientMatrix& sparse_coeffs,
+    std::size_t n,
+    const std::vector<Vec3>& points,
+    std::size_t output_stride,
+    const std::array<int, 3>& power_limits,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT jacobians_out,
+    Real* SVMP_RESTRICT curls_out,
+    Real* SVMP_RESTRICT divergence_out,
+    const char* family_name) {
+    const std::size_t num_qpts = points.size();
+    validate_vector_strided_outputs(num_qpts, output_stride, family_name);
+    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.rows == n &&
+                     sparse_coeffs.cols == n &&
+                     sparse_coeffs.row_offsets.size() == n + 1u,
+                 "evaluate_nodal_modal_vector_strided: sparse coefficient size mismatch");
+    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.dofs.size() == sparse_coeffs.coefficients.size(),
+                 "evaluate_nodal_modal_vector_strided: sparse coefficient entry mismatch");
+
+    auto& scratch = vector_basis_scratch();
+    const bool need_values = values_out != nullptr;
+    const bool need_jacobians = jacobians_out != nullptr;
+    const bool need_curls = curls_out != nullptr;
+    const bool need_divergence = divergence_out != nullptr;
+
+    if (need_values) {
+        zero_active_strided_rows(values_out, n * 3u, output_stride, num_qpts);
+    }
+    if (need_jacobians) {
+        zero_active_strided_rows(jacobians_out, n * 9u, output_stride, num_qpts);
+    }
+    if (need_curls) {
+        zero_active_strided_rows(curls_out, n * 3u, output_stride, num_qpts);
+    }
+    if (need_divergence) {
+        zero_active_strided_rows(divergence_out, n, output_stride, num_qpts);
+    }
+    if (num_qpts == 0 || n == 0) {
+        return;
+    }
+
+    fill_batched_power_tables(points, power_limits, scratch);
+    const auto& px = scratch.batched_px;
+    const auto& py = scratch.batched_py;
+    const auto& pz = scratch.batched_pz;
+    const std::size_t power_stride = num_qpts;
+    const bool need_modal_gradient = need_jacobians || need_curls || need_divergence;
+
+    auto& modal_values = scratch.modal_values_batched;
+    auto& modal_jacobians = scratch.modal_jacobians_batched;
+    auto& modal_curls = scratch.modal_curls_batched;
+    auto& modal_divergence = scratch.modal_divergence_batched;
+
+    for (std::size_t p = 0; p < n; ++p) {
+        if (need_values) {
+            modal_values.assign(3u * num_qpts, Real(0));
+        }
+        if (need_jacobians) {
+            modal_jacobians.assign(9u * num_qpts, Real(0));
+        }
+        if (need_curls) {
+            modal_curls.assign(3u * num_qpts, Real(0));
+        }
+        if (need_divergence) {
+            modal_divergence.assign(num_qpts, Real(0));
+        }
+
+        const auto& poly = monomials[p];
+        for (int term_index = 0; term_index < poly.num_terms; ++term_index) {
+            const auto& term = poly.terms[static_cast<std::size_t>(term_index)];
+            const std::size_t component = static_cast<std::size_t>(term.component);
+            Real* modal_value_row = need_values
+                ? modal_values.data() + component * num_qpts
+                : nullptr;
+            Real* modal_jacobian_row = need_jacobians
+                ? modal_jacobians.data() + component * 3u * num_qpts
+                : nullptr;
+            Real* modal_curl_rows = need_curls ? modal_curls.data() : nullptr;
+            Real* modal_divergence_row =
+                need_divergence ? modal_divergence.data() : nullptr;
+
+            if (need_values) {
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    modal_value_row[q] +=
+                        term.coefficient *
+                        batched_power_product(px,
+                                              py,
+                                              pz,
+                                              power_stride,
+                                              term.px,
+                                              term.py,
+                                              term.pz,
+                                              q);
+                }
+            }
+
+            if (need_modal_gradient) {
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    const Real dphidx =
+                        term.coefficient *
+                        batched_component_partial(px,
+                                                  py,
+                                                  pz,
+                                                  power_stride,
+                                                  term.px,
+                                                  term.py,
+                                                  term.pz,
+                                                  0,
+                                                  q);
+                    const Real dphidy =
+                        term.coefficient *
+                        batched_component_partial(px,
+                                                  py,
+                                                  pz,
+                                                  power_stride,
+                                                  term.px,
+                                                  term.py,
+                                                  term.pz,
+                                                  1,
+                                                  q);
+                    const Real dphidz =
+                        term.coefficient *
+                        batched_component_partial(px,
+                                                  py,
+                                                  pz,
+                                                  power_stride,
+                                                  term.px,
+                                                  term.py,
+                                                  term.pz,
+                                                  2,
+                                                  q);
+
+                    if (need_jacobians) {
+                        modal_jacobian_row[q] += dphidx;
+                        modal_jacobian_row[num_qpts + q] += dphidy;
+                        modal_jacobian_row[2u * num_qpts + q] += dphidz;
+                    }
+                    if (need_curls) {
+                        const Vec3 curl =
+                            curl_from_component_gradient(term.component,
+                                                         dphidx,
+                                                         dphidy,
+                                                         dphidz);
+                        modal_curl_rows[q] += curl[0];
+                        modal_curl_rows[num_qpts + q] += curl[1];
+                        modal_curl_rows[2u * num_qpts + q] += curl[2];
+                    }
+                    if (need_divergence) {
+                        const Real div = term.component == 0 ? dphidx
+                                       : term.component == 1 ? dphidy
+                                                            : dphidz;
+                        modal_divergence_row[q] += div;
+                    }
+                }
+            }
+        }
+
+        const std::size_t row_begin = sparse_coeffs.row_offsets[p];
+        const std::size_t row_end = sparse_coeffs.row_offsets[p + 1u];
+        for (std::size_t entry = row_begin; entry < row_end; ++entry) {
+            const std::size_t dof = sparse_coeffs.dofs[entry];
+            const Real c = sparse_coeffs.coefficients[entry];
+            if (need_values) {
+                for (std::size_t component = 0; component < 3u; ++component) {
+                    axpy_qpoints(values_out + (dof * 3u + component) * output_stride,
+                                 modal_values.data() + component * num_qpts,
+                                 c,
+                                 num_qpts);
+                }
+            }
+            if (need_jacobians) {
+                for (std::size_t row = 0; row < 3u; ++row) {
+                    for (std::size_t col = 0; col < 3u; ++col) {
+                        axpy_qpoints(jacobians_out +
+                                         (dof * 9u + row * 3u + col) * output_stride,
+                                     modal_jacobians.data() +
+                                         (row * 3u + col) * num_qpts,
+                                     c,
+                                     num_qpts);
+                    }
+                }
+            }
+            if (need_curls) {
+                for (std::size_t component = 0; component < 3u; ++component) {
+                    axpy_qpoints(curls_out + (dof * 3u + component) * output_stride,
+                                 modal_curls.data() + component * num_qpts,
+                                 c,
+                                 num_qpts);
+                }
+            }
+            if (need_divergence) {
+                axpy_qpoints(divergence_out + dof * output_stride,
+                             modal_divergence.data(),
+                             c,
+                             num_qpts);
+            }
+        }
+    }
+}
+
+std::vector<std::array<int, 4>> make_nd_extra_monomial_candidates(ElementType type,
+                                                                  int order);
+Real eval_transformed_nd_monomial_scalar(const std::array<int, 4>& mono,
+                                         const std::vector<Real>& px,
+                                         const std::vector<Real>& py,
+                                         const std::vector<Real>& pz);
+Vec3 eval_transformed_nd_monomial_curl(const std::array<int, 4>& mono,
+                                       const std::vector<Real>& px,
+                                       const std::vector<Real>& py,
+                                       const std::vector<Real>& pz);
+
+
+} // namespace vector_common
+} // namespace detail
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_VECTORBASISEVALUATIONHELPERS_H
diff --git a/Code/Source/solver/FE/Basis/VectorBasisModalPolynomial.h b/Code/Source/solver/FE/Basis/VectorBasisModalPolynomial.h
new file mode 100644
index 000000000..6e1a7202b
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/VectorBasisModalPolynomial.h
@@ -0,0 +1,77 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_BASIS_VECTORBASISMODALPOLYNOMIAL_H
+#define SVMP_FE_BASIS_VECTORBASISMODALPOLYNOMIAL_H
+
+#include "Types.h"
+
+#include <algorithm>
+#include <array>
+#include <vector>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+struct VectorBasisModalTerm {
+    int component{0}; // 0=x, 1=y, 2=z
+    int px{0};
+    int py{0};
+    int pz{0};
+    Real coefficient{Real(1)};
+};
+
+struct VectorBasisModalPolynomial {
+    std::array<VectorBasisModalTerm, 4> terms{};
+    int num_terms{0};
+};
+
+inline bool modal_terms_equal(const VectorBasisModalTerm& lhs,
+                              const VectorBasisModalTerm& rhs) noexcept {
+    return lhs.component == rhs.component &&
+           lhs.px == rhs.px &&
+           lhs.py == rhs.py &&
+           lhs.pz == rhs.pz &&
+           lhs.coefficient == rhs.coefficient;
+}
+
+inline bool modal_polynomials_equal(const VectorBasisModalPolynomial& lhs,
+                                    const VectorBasisModalPolynomial& rhs) noexcept {
+    if (lhs.num_terms != rhs.num_terms) {
+        return false;
+    }
+    for (int term = 0; term < lhs.num_terms; ++term) {
+        const auto index = static_cast<std::size_t>(term);
+        if (!modal_terms_equal(lhs.terms[index], rhs.terms[index])) {
+            return false;
+        }
+    }
+    return true;
+}
+
+inline bool append_unique_modal_polynomial(
+    std::vector<VectorBasisModalPolynomial>& polynomials,
+    const VectorBasisModalPolynomial& polynomial) {
+    const auto found = std::find_if(
+        polynomials.begin(),
+        polynomials.end(),
+        [&](const VectorBasisModalPolynomial& existing) {
+            return modal_polynomials_equal(existing, polynomial);
+        });
+    if (found != polynomials.end()) {
+        return false;
+    }
+    polynomials.push_back(polynomial);
+    return true;
+}
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_VECTORBASISMODALPOLYNOMIAL_H
diff --git a/Code/Source/solver/FE/Common/Alignment.h b/Code/Source/solver/FE/Common/Alignment.h
new file mode 100644
index 000000000..8d33a7a7a
--- /dev/null
+++ b/Code/Source/solver/FE/Common/Alignment.h
@@ -0,0 +1,23 @@
+#ifndef SVMP_FE_CORE_ALIGNMENT_H
+#define SVMP_FE_CORE_ALIGNMENT_H
+
+/**
+ * @file Alignment.h
+ * @brief Global alignment constants used across FE modules.
+ */
+
+#include <cstddef>
+
+namespace svmp {
+namespace FE {
+
+/// Preferred cache-line/SIMD alignment for performance-critical arrays.
+inline constexpr std::size_t kFEPreferredAlignmentBytes = 64u;
+
+/// Alignment for small fixed-size math objects that are commonly passed by value.
+inline constexpr std::size_t kFEFixedObjectAlignmentBytes = 32u;
+
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_CORE_ALIGNMENT_H
diff --git a/Code/Source/solver/FE/Common/Types.h b/Code/Source/solver/FE/Common/Types.h
new file mode 100644
index 000000000..60312a524
--- /dev/null
+++ b/Code/Source/solver/FE/Common/Types.h
@@ -0,0 +1,532 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See Copyright-SimVascular.txt for additional details.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SVMP_FE_TYPES_H
+#define SVMP_FE_TYPES_H
+
+/**
+ * @file Types.h
+ * @brief Fundamental type definitions for the finite element library
+ *
+ * This header provides core type aliases, enumerations, and strong type
+ * definitions used throughout the FE library. It establishes a consistent
+ * type system that integrates with the Mesh library while maintaining
+ * independence from backend-specific types.
+ */
+
+#if defined(SVMP_FE_WITH_MESH) && SVMP_FE_WITH_MESH
+#  include "Mesh/Core/MeshTypes.h"
+#  define SVMP_FE_HAS_MESH_TYPES 1
+#else
+// Build FE without Mesh types unless explicitly enabled.
+#  define SVMP_FE_HAS_MESH_TYPES 0
+#endif
+
+#if !SVMP_FE_HAS_MESH_TYPES
+namespace svmp {
+// Minimal fallback when the Mesh library is not available.
+// Keeps FE compilation self-contained while preserving the same namespace.
+#ifndef SVMP_CELL_FAMILY_DEFINED
+#define SVMP_CELL_FAMILY_DEFINED 1
+enum class CellFamily {
+    Point,
+    Line,
+    Triangle,
+    Quad,
+    Tetra,
+    Hex,
+    Wedge,
+    Pyramid,
+    Polygon,
+    Polyhedron
+};
+#endif
+} // namespace svmp
+#endif
+#include <cstdint>
+#include <array>
+#include <string>
+#include <type_traits>
+#include <limits>
+
+#if defined(_MSC_VER)
+#  define SVMP_RESTRICT __restrict
+#elif defined(__clang__) || defined(__GNUC__)
+#  define SVMP_RESTRICT __restrict__
+#else
+#  define SVMP_RESTRICT
+#endif
+
+namespace svmp {
+namespace FE {
+
+// ============================================================================
+// Index Types
+// ============================================================================
+
+/**
+ * @brief Local index type for element-level operations
+ *
+ * Used for local node numbering within elements, local DOF indices,
+ * and other element-local indexing. Unsigned for safety.
+ */
+using LocalIndex = std::uint32_t;
+
+/**
+ * @brief Global index type for distributed DOF numbering
+ *
+ * Signed 64-bit for compatibility with PETSc and Trilinos.
+ * Negative values can indicate special conditions or invalid indices.
+ */
+using GlobalIndex = std::int64_t;
+
+/**
+ * @brief DOF-specific index type
+ *
+ * Strong type alias to prevent mixing DOF indices with other indices.
+ * Provides type safety at compile time.
+ */
+struct DofIndex {
+    GlobalIndex value;
+
+    constexpr explicit DofIndex(GlobalIndex v = -1) noexcept : value(v) {}
+    constexpr operator GlobalIndex() const noexcept { return value; }
+    constexpr bool is_valid() const noexcept { return value >= 0; }
+};
+
+/**
+ * @brief Field identifier type
+ *
+ * Used to distinguish between different physical fields in multi-field problems.
+ */
+using FieldId = std::uint16_t;
+
+/**
+ * @brief Block identifier for block-structured systems
+ */
+using BlockId = std::uint16_t;
+
+// Import mesh library scalar/index types when available (optional dependency).
+#if SVMP_FE_HAS_MESH_TYPES
+using MeshIndex = svmp::index_t;
+using MeshOffset = svmp::offset_t;
+using MeshGlobalId = svmp::gid_t;
+using Real = svmp::real_t;  // Use same precision as Mesh library
+#else
+using MeshIndex = std::int32_t;
+using MeshOffset = std::int64_t;
+using MeshGlobalId = std::int64_t;
+using Real = double;
+#endif
+
+// ============================================================================
+// Constants
+// ============================================================================
+
+constexpr LocalIndex INVALID_LOCAL_INDEX = std::numeric_limits<LocalIndex>::max();
+constexpr GlobalIndex INVALID_GLOBAL_INDEX = -1;
+constexpr FieldId INVALID_FIELD_ID = std::numeric_limits<FieldId>::max();
+/// Sentinel FieldId for geometry-only quantities (no DOF dependence).
+/// Uses first registered field's space for quadrature, but logically decoupled
+/// from any specific field's DOFs.
+constexpr FieldId GEOMETRY_FIELD_ID = std::numeric_limits<FieldId>::max() - 1;
+constexpr BlockId INVALID_BLOCK_ID = std::numeric_limits<BlockId>::max();
+
+/**
+ * @brief Sentinel FieldId representing "the current solution state" in tangent forms.
+ *
+ * When differentiating a residual form to obtain the tangent (Jacobian), undifferentiated
+ * TrialFunction occurrences are rewritten to StateField nodes. Those that represent the
+ * block's own primary unknown (rather than a named external field) use this sentinel
+ * FieldId. The assembler maps it to the current solution coefficients at each quadrature
+ * point, regardless of which physics or field variables are involved.
+ *
+ * This is distinct from INVALID_FIELD_ID, which means "uninitialized / no field."
+ * CURRENT_SOLUTION_FIELD_ID uses the same numeric value for backward compatibility
+ * with existing KernelIR encodings, but carries explicit semantic intent.
+ */
+constexpr FieldId CURRENT_SOLUTION_FIELD_ID = std::numeric_limits<FieldId>::max();
+
+// ============================================================================
+// Field Value Entry (for point evaluation of field-dependent expressions)
+// ============================================================================
+
+/// Maximum number of components in a FieldValueEntry (3x3 tensor).
+constexpr int MAX_FIELD_VALUE_COMPONENTS = 9;
+
+/**
+ * @brief Field value at an evaluation point — scalar, vector, or tensor.
+ *
+ * Used by PointEvaluator and the auxiliary assembly path to supply FE
+ * field values at entity locations (e.g., nodal DOF values for
+ * Node-scoped auxiliary models with Lagrange Kronecker delta).
+ */
+struct FieldValueEntry {
+    FieldId field{INVALID_FIELD_ID};
+    int n_components{0};
+    Real components[MAX_FIELD_VALUE_COMPONENTS]{};
+};
+
+// ============================================================================
+// Element Type Enumerations
+// ============================================================================
+
+/**
+ * @brief Reference element types supported by the FE library
+ *
+ * Maps to svmp::CellFamily from the Mesh library but provides
+ * FE-specific categorization including higher-order variants.
+ */
+enum class ElementType : std::uint8_t {
+    // Linear elements
+    Line2      = 0,   // 2-node line
+    Triangle3  = 1,   // 3-node triangle
+    Quad4      = 2,   // 4-node quadrilateral
+    Tetra4     = 3,   // 4-node tetrahedron
+    Hex8       = 4,   // 8-node hexahedron
+    Wedge6     = 5,   // 6-node wedge/prism
+    Pyramid5   = 6,   // 5-node pyramid
+
+    // Quadratic elements
+    Line3      = 10,  // 3-node line
+    Triangle6  = 11,  // 6-node triangle
+    Quad9      = 12,  // 9-node quadrilateral (bi-quadratic)
+    Quad8      = 13,  // 8-node quadrilateral (serendipity)
+    Tetra10    = 14,  // 10-node tetrahedron
+    Hex27      = 15,  // 27-node hexahedron (tri-quadratic)
+    Hex20      = 16,  // 20-node hexahedron (serendipity)
+    Wedge15    = 17,  // 15-node wedge
+    Wedge18    = 18,  // 18-node wedge (complete quadratic)
+    Pyramid13  = 19,  // 13-node pyramid
+    Pyramid14  = 20,  // 14-node pyramid
+
+    // Special elements
+    Point1     = 30,  // 1-node point element
+
+    Unknown    = 255
+};
+
+/**
+ * @brief Quadrature rule types
+ */
+enum class QuadratureType : std::uint8_t {
+    GaussLegendre,     // Standard Gaussian quadrature
+    GaussLobatto,      // Includes endpoints (for spectral elements)
+    Newton,            // Newton-Cotes rules
+    Reduced,           // Order-based reduced integration for locking
+    PositionBased,     // Position-based reduced integration (legacy compatible)
+    Composite,         // Composite rules for adaptivity
+    Custom             // User-defined quadrature points
+};
+
+/**
+ * @brief Basis function families
+ */
+enum class BasisType : std::uint8_t {
+    Lagrange,          // Standard nodal Lagrange basis
+    Hierarchical,      // Hierarchical/modal basis
+    Bernstein,         // Bernstein polynomials
+    NURBS,             // Non-uniform rational B-splines
+    BSpline,           // Non-rational B-spline basis
+    Spectral,          // Spectral element basis
+    Serendipity,       // Serendipity elements
+    Hermite,           // Hermite C1 continuity basis
+    RaviartThomas,     // H(div) Raviart-Thomas family
+    Nedelec,           // H(curl) Nedelec edge elements
+    BDM,               // H(div) Brezzi-Douglas-Marini family
+    Bubble,            // Interior bubble functions for enrichment
+    Custom             // User-defined basis
+};
+
+/**
+ * @brief Field types for function spaces
+ */
+enum class FieldType : std::uint8_t {
+    Scalar,            // Scalar field (temperature, pressure)
+    Vector,            // Vector field (velocity, displacement)
+    Tensor,            // Tensor field (stress, strain)
+    SymmetricTensor,   // Symmetric tensor field
+    Mixed              // Mixed/composite field
+};
+
+/**
+ * @brief Continuity requirements for function spaces
+ */
+enum class Continuity : std::uint8_t {
+    C0,                // Continuous (standard FEM)
+    C1,                // C1 continuous (for plates/shells)
+    L2,                // L2 (discontinuous)
+    H_div,             // H(div) conforming
+    H_curl,            // H(curl) conforming
+    Custom
+};
+
+/**
+ * @brief Assembly strategies
+ */
+enum class AssemblyStrategy : std::uint8_t {
+    ElementByElement,  // Traditional element loop
+    Vectorized,        // SIMD vectorized assembly
+    MatrixFree,        // Matrix-free operators
+    Hybrid             // Mixed strategy
+};
+
+/**
+ * @brief Status codes for FE operations
+ */
+enum class FEStatus : std::uint8_t {
+    Success           = 0,
+    InvalidArgument   = 1,
+    InvalidElement    = 2,
+    SingularMapping   = 3,
+    QuadratureError   = 4,
+    AssemblyError     = 5,
+    BackendError      = 6,
+    NotImplemented    = 7,
+    ConvergenceError  = 8,
+    AllocationError   = 9,
+    MPIError          = 10,
+    IOError           = 11,
+    Unknown           = 255
+};
+
+// ============================================================================
+// Geometric Types
+// ============================================================================
+
+/**
+ * @brief Point in reference element coordinates
+ */
+template<int Dim>
+using ReferencePoint = std::array<Real, static_cast<std::size_t>(Dim)>;
+
+/**
+ * @brief Point in physical coordinates
+ */
+using PhysicalPoint = std::array<Real, 3>;
+
+/**
+ * @brief Jacobian matrix type
+ */
+template<int SpatialDim, int ReferenceDim = SpatialDim>
+using Jacobian = std::array<std::array<Real, static_cast<std::size_t>(ReferenceDim)>, static_cast<std::size_t>(SpatialDim)>;
+
+// ============================================================================
+// Strong Type Wrappers (C++17 idiom for type safety)
+// ============================================================================
+
+/**
+ * @brief Strong type wrapper template for type-safe programming
+ *
+ * Prevents accidental mixing of conceptually different types that have
+ * the same underlying representation.
+ */
+template<typename T, typename Tag>
+class StrongType {
+public:
+    using ValueType = T;
+
+    constexpr StrongType() noexcept(std::is_nothrow_default_constructible_v<T>)
+        : value_{} {}
+
+    constexpr explicit StrongType(T value) noexcept(std::is_nothrow_move_constructible_v<T>)
+        : value_(std::move(value)) {}
+
+    constexpr T& get() noexcept { return value_; }
+    constexpr const T& get() const noexcept { return value_; }
+
+    // Explicit conversion
+    constexpr explicit operator T() const noexcept { return value_; }
+
+    // Comparison operators
+    constexpr bool operator==(const StrongType& other) const noexcept {
+        return value_ == other.value_;
+    }
+    constexpr bool operator!=(const StrongType& other) const noexcept {
+        return value_ != other.value_;
+    }
+    constexpr bool operator<(const StrongType& other) const noexcept {
+        return value_ < other.value_;
+    }
+
+private:
+    T value_;
+};
+
+// Specific strong types for common use cases
+struct QuadraturePointTag {};
+struct QuadratureWeightTag {};
+struct BasisValueTag {};
+struct BasisGradientTag {};
+
+using QuadraturePointIndex = StrongType<LocalIndex, QuadraturePointTag>;
+using QuadratureWeight = StrongType<Real, QuadratureWeightTag>;
+
+// ============================================================================
+// Type Traits
+// ============================================================================
+
+/**
+ * @brief Check if a type is a valid index type
+ */
+template<typename T>
+struct is_index_type : std::false_type {};
+
+template<>
+struct is_index_type<LocalIndex> : std::true_type {};
+
+template<>
+struct is_index_type<GlobalIndex> : std::true_type {};
+
+template<>
+struct is_index_type<DofIndex> : std::true_type {};
+
+template<typename T>
+inline constexpr bool is_index_type_v = is_index_type<T>::value;
+
+/**
+ * @brief Check if a type represents a field type
+ */
+template<typename T>
+struct is_field_type : std::false_type {};
+
+template<>
+struct is_field_type<FieldType> : std::true_type {};
+
+template<typename T>
+inline constexpr bool is_field_type_v = is_field_type<T>::value;
+
+// ============================================================================
+// Utility Functions
+// ============================================================================
+
+/**
+ * @brief Convert FE ElementType to Mesh CellFamily
+ */
+constexpr svmp::CellFamily to_mesh_family(ElementType elem) noexcept {
+    switch(elem) {
+        case ElementType::Line2:
+        case ElementType::Line3:
+            return svmp::CellFamily::Line;
+
+        case ElementType::Triangle3:
+        case ElementType::Triangle6:
+            return svmp::CellFamily::Triangle;
+
+        case ElementType::Quad4:
+        case ElementType::Quad8:
+        case ElementType::Quad9:
+            return svmp::CellFamily::Quad;
+
+        case ElementType::Tetra4:
+        case ElementType::Tetra10:
+            return svmp::CellFamily::Tetra;
+
+        case ElementType::Hex8:
+        case ElementType::Hex20:
+        case ElementType::Hex27:
+            return svmp::CellFamily::Hex;
+
+        case ElementType::Wedge6:
+        case ElementType::Wedge15:
+        case ElementType::Wedge18:
+            return svmp::CellFamily::Wedge;
+
+        case ElementType::Pyramid5:
+        case ElementType::Pyramid13:
+        case ElementType::Pyramid14:
+            return svmp::CellFamily::Pyramid;
+
+        case ElementType::Point1:
+            return svmp::CellFamily::Point;
+
+        default:
+            return svmp::CellFamily::Point;  // Fallback
+    }
+}
+
+/**
+ * @brief Get spatial dimension of element type
+ */
+constexpr int element_dimension(ElementType elem) noexcept {
+    switch(elem) {
+        case ElementType::Point1:
+            return 0;
+        case ElementType::Line2:
+        case ElementType::Line3:
+            return 1;
+        case ElementType::Triangle3:
+        case ElementType::Triangle6:
+        case ElementType::Quad4:
+        case ElementType::Quad8:
+        case ElementType::Quad9:
+            return 2;
+        case ElementType::Tetra4:
+        case ElementType::Tetra10:
+        case ElementType::Hex8:
+        case ElementType::Hex20:
+        case ElementType::Hex27:
+        case ElementType::Wedge6:
+        case ElementType::Wedge15:
+        case ElementType::Wedge18:
+        case ElementType::Pyramid5:
+        case ElementType::Pyramid13:
+        case ElementType::Pyramid14:
+            return 3;
+        default:
+            return -1;
+    }
+}
+
+/**
+ * @brief Convert status code to string for error reporting
+ */
+inline const char* status_to_string(FEStatus status) noexcept {
+    switch(status) {
+        case FEStatus::Success:          return "Success";
+        case FEStatus::InvalidArgument:  return "Invalid argument";
+        case FEStatus::InvalidElement:   return "Invalid element";
+        case FEStatus::SingularMapping:  return "Singular mapping";
+        case FEStatus::QuadratureError:  return "Quadrature error";
+        case FEStatus::AssemblyError:    return "Assembly error";
+        case FEStatus::BackendError:     return "Backend error";
+        case FEStatus::NotImplemented:   return "Not implemented";
+        case FEStatus::ConvergenceError: return "Convergence error";
+        case FEStatus::AllocationError:  return "Allocation error";
+        case FEStatus::MPIError:         return "MPI error";
+        case FEStatus::IOError:          return "I/O error";
+        default:                         return "Unknown error";
+    }
+}
+
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_TYPES_H
diff --git a/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp b/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
new file mode 100644
index 000000000..7d909fa0c
--- /dev/null
+++ b/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
@@ -0,0 +1,480 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#include "DenseLinearAlgebra.h"
+
+#include "FEException.h"
+
+#if defined(FE_HAS_EIGEN) && FE_HAS_EIGEN
+#include <Eigen/Dense>
+#endif
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <string>
+#include <utility>
+
+#define DENSE_LINALG_CHECK(condition, message) \
+    ::svmp::FE::throw_if<::svmp::FE::FEException>(!(condition), SVMP_HERE, (message))
+
+namespace svmp {
+namespace FE {
+namespace math {
+
+namespace {
+
+constexpr std::size_t kDenseSolveRhsBlock = 32u;
+
+void materialize_inverse_from_solver(const DenseLUSolver& solver,
+                                     std::vector<Real>& inverse) {
+    const std::size_t n = solver.n;
+    inverse.assign(n * n, Real(0));
+    for (std::size_t diag = 0; diag < n; ++diag) {
+        inverse[diag * n + diag] = Real(1);
+    }
+    solver.solve_in_place(std::span<Real>(inverse.data(), inverse.size()), n);
+}
+
+} // namespace
+
+Real dense_matrix_max_abs(std::span<const Real> matrix) noexcept {
+    Real max_abs = Real(0);
+    for (const Real value : matrix) {
+        max_abs = std::max(max_abs, std::abs(value));
+    }
+    return max_abs;
+}
+
+Real dense_matrix_pivot_tolerance(std::size_t rows,
+                                  std::size_t cols,
+                                  Real max_abs,
+                                  Real multiplier) noexcept {
+    const Real size_scale = static_cast<Real>(std::max<std::size_t>(rows, cols));
+    const Real value_scale = std::max(Real(1), max_abs);
+    return multiplier * std::numeric_limits<Real>::epsilon() *
+           std::max(Real(1), size_scale) * value_scale;
+}
+
+Real dense_matrix_singular_value_tolerance(std::size_t rows,
+                                           std::size_t cols,
+                                           Real largest_singular_value,
+                                           Real multiplier) noexcept {
+    const Real size_scale = static_cast<Real>(std::max<std::size_t>(rows, cols));
+    return multiplier * std::numeric_limits<Real>::epsilon() *
+           std::max(Real(1), size_scale) *
+           std::max(Real(1), largest_singular_value);
+}
+
+Real dense_matrix_condition_fallback_threshold() noexcept {
+    return Real(1.0e12);
+}
+
+Real dense_matrix_condition_error_threshold() noexcept {
+    return Real(1.0e14);
+}
+
+void DenseLUSolver::solve_in_place(std::span<Real> rhs) const {
+    solve_in_place(rhs, 1u);
+}
+
+void DenseLUSolver::solve_in_place(std::span<Real> rhs,
+                                   std::size_t rhs_count) const {
+    DENSE_LINALG_CHECK(rhs_count > 0,
+                             label + ": dense solve requires at least one right-hand side");
+    DENSE_LINALG_CHECK(rhs.size() == n * rhs_count,
+                             label + ": dense multi-RHS solve size mismatch");
+    DENSE_LINALG_CHECK(lu.size() == n * n && pivots.size() == n,
+                             label + ": dense solver is not factorized");
+
+    for (std::size_t k = 0; k < n; ++k) {
+        if (pivots[k] != k) {
+            for (std::size_t block = 0; block < rhs_count; block += kDenseSolveRhsBlock) {
+                const std::size_t end =
+                    std::min(rhs_count, block + kDenseSolveRhsBlock);
+                for (std::size_t r = block; r < end; ++r) {
+                    std::swap(rhs[k * rhs_count + r],
+                              rhs[pivots[k] * rhs_count + r]);
+                }
+            }
+        }
+    }
+
+    for (std::size_t row = 0; row < n; ++row) {
+        for (std::size_t col = 0; col < row; ++col) {
+            const Real factor = lu[row * n + col];
+            for (std::size_t block = 0; block < rhs_count; block += kDenseSolveRhsBlock) {
+                const std::size_t end =
+                    std::min(rhs_count, block + kDenseSolveRhsBlock);
+                for (std::size_t r = block; r < end; ++r) {
+                    rhs[row * rhs_count + r] -= factor * rhs[col * rhs_count + r];
+                }
+            }
+        }
+    }
+
+    for (std::size_t rev = 0; rev < n; ++rev) {
+        const std::size_t row = n - 1u - rev;
+        for (std::size_t col = row + 1u; col < n; ++col) {
+            const Real factor = lu[row * n + col];
+            for (std::size_t block = 0; block < rhs_count; block += kDenseSolveRhsBlock) {
+                const std::size_t end =
+                    std::min(rhs_count, block + kDenseSolveRhsBlock);
+                for (std::size_t r = block; r < end; ++r) {
+                    rhs[row * rhs_count + r] -= factor * rhs[col * rhs_count + r];
+                }
+            }
+        }
+        const Real pivot = lu[row * n + row];
+        DENSE_LINALG_CHECK(
+            std::abs(pivot) > pivot_tolerance,
+            label + ": zero pivot during dense solve");
+        for (std::size_t block = 0; block < rhs_count; block += kDenseSolveRhsBlock) {
+            const std::size_t end =
+                std::min(rhs_count, block + kDenseSolveRhsBlock);
+            for (std::size_t r = block; r < end; ++r) {
+                rhs[row * rhs_count + r] /= pivot;
+            }
+        }
+    }
+}
+
+std::vector<Real> DenseLUSolver::solve(std::span<const Real> rhs) const {
+    std::vector<Real> x(rhs.begin(), rhs.end());
+    solve_in_place(std::span<Real>(x.data(), x.size()));
+    return x;
+}
+
+DenseMatrixDiagnostics dense_matrix_diagnostics(
+    std::span<const Real> matrix,
+    std::size_t rows,
+    std::size_t cols,
+    std::string_view label) {
+    DENSE_LINALG_CHECK(matrix.size() == rows * cols,
+                             std::string(label) + ": diagnostic size mismatch");
+    DENSE_LINALG_CHECK(rows > 0 && cols > 0,
+                             std::string(label) + ": diagnostics require a nonempty matrix");
+
+#if defined(FE_HAS_EIGEN) && FE_HAS_EIGEN
+    using RowMajorMatrix = Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+    using Matrix = Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic>;
+    const Eigen::Map<const RowMajorMatrix> A(matrix.data(),
+                                             static_cast<Eigen::Index>(rows),
+                                             static_cast<Eigen::Index>(cols));
+    const Matrix dense = A;
+    Eigen::JacobiSVD<Matrix> svd(dense);
+
+    DenseMatrixDiagnostics diagnostics;
+    const auto& singular_values = svd.singularValues();
+    diagnostics.largest_singular_value =
+        (singular_values.size() > 0) ? singular_values[0] : Real(0);
+    diagnostics.tolerance =
+        dense_matrix_singular_value_tolerance(rows, cols,
+                                              diagnostics.largest_singular_value);
+
+    for (Eigen::Index i = 0; i < singular_values.size(); ++i) {
+        const Real sigma = singular_values[i];
+        if (sigma <= diagnostics.tolerance) {
+            continue;
+        }
+        ++diagnostics.rank;
+        diagnostics.smallest_retained_singular_value = sigma;
+    }
+
+    const std::size_t full_rank = std::min(rows, cols);
+    if (diagnostics.rank == full_rank &&
+        diagnostics.smallest_retained_singular_value > Real(0)) {
+        diagnostics.condition_estimate =
+            diagnostics.largest_singular_value /
+            diagnostics.smallest_retained_singular_value;
+    }
+    return diagnostics;
+#else
+    DenseMatrixDiagnostics diagnostics;
+    diagnostics.largest_singular_value = dense_matrix_max_abs(matrix);
+    diagnostics.tolerance =
+        dense_matrix_pivot_tolerance(rows, cols, diagnostics.largest_singular_value);
+    diagnostics.rank =
+        dense_matrix_rank(std::vector<Real>(matrix.begin(), matrix.end()), rows, cols);
+    const std::size_t full_rank = std::min(rows, cols);
+    if (diagnostics.rank == full_rank) {
+        diagnostics.smallest_retained_singular_value = diagnostics.tolerance;
+    }
+    // Exact condition estimates require SVD diagnostics. In Eigen-disabled
+    // builds this stays explicit instead of relying on a misleading estimate.
+    diagnostics.condition_estimate = std::numeric_limits<Real>::infinity();
+    return diagnostics;
+#endif
+}
+
+DenseLUSolver factor_dense_matrix(std::vector<Real> matrix,
+                                  std::size_t n,
+                                  std::string_view label) {
+    DENSE_LINALG_CHECK(matrix.size() == n * n,
+                             std::string(label) + ": dense factorization size mismatch");
+
+    DenseLUSolver solver;
+    solver.n = n;
+    solver.lu = std::move(matrix);
+    solver.pivots.resize(n);
+    const Real max_abs = dense_matrix_max_abs(solver.lu);
+    solver.pivot_tolerance =
+        dense_matrix_pivot_tolerance(n, n, max_abs);
+    solver.label = std::string(label);
+
+    Real max_pivot_abs = Real(0);
+    Real min_pivot_abs = std::numeric_limits<Real>::infinity();
+    for (std::size_t col = 0; col < n; ++col) {
+        std::size_t pivot_row = col;
+        Real pivot_abs = std::abs(solver.lu[col * n + col]);
+        for (std::size_t row = col + 1; row < n; ++row) {
+            const Real candidate = std::abs(solver.lu[row * n + col]);
+            if (candidate > pivot_abs) {
+                pivot_abs = candidate;
+                pivot_row = row;
+            }
+        }
+
+        DENSE_LINALG_CHECK(
+            pivot_abs > solver.pivot_tolerance,
+            solver.label + ": rank-deficient matrix (rank " +
+                std::to_string(col) + " of " + std::to_string(n) +
+                ", pivot below scale-aware tolerance " +
+                std::to_string(solver.pivot_tolerance) + ")");
+
+        solver.pivots[col] = pivot_row;
+        if (pivot_row != col) {
+            for (std::size_t j = 0; j < n; ++j) {
+                std::swap(solver.lu[col * n + j], solver.lu[pivot_row * n + j]);
+            }
+        }
+
+        const Real pivot = solver.lu[col * n + col];
+        DENSE_LINALG_CHECK(
+            std::abs(pivot) > solver.pivot_tolerance,
+            solver.label + ": zero pivot after row exchange");
+        const Real pivot_magnitude = std::abs(pivot);
+        max_pivot_abs = std::max(max_pivot_abs, pivot_magnitude);
+        min_pivot_abs = std::min(min_pivot_abs, pivot_magnitude);
+
+        for (std::size_t row = col + 1; row < n; ++row) {
+            const Real factor = solver.lu[row * n + col] / pivot;
+            solver.lu[row * n + col] = factor;
+            for (std::size_t j = col + 1; j < n; ++j) {
+                solver.lu[row * n + j] -= factor * solver.lu[col * n + j];
+            }
+        }
+    }
+
+    solver.diagnostics.rank = n;
+    solver.diagnostics.tolerance = solver.pivot_tolerance;
+    solver.diagnostics.largest_singular_value = max_abs;
+    solver.diagnostics.smallest_retained_singular_value =
+        std::isfinite(min_pivot_abs) ? min_pivot_abs : Real(0);
+    if (solver.diagnostics.smallest_retained_singular_value > Real(0)) {
+        solver.diagnostics.condition_estimate =
+            max_pivot_abs / solver.diagnostics.smallest_retained_singular_value;
+    }
+    return solver;
+}
+
+DenseInverseResult invert_dense_matrix_with_diagnostics(
+    std::vector<Real> matrix,
+    std::size_t n,
+    std::string_view label) {
+    DENSE_LINALG_CHECK(matrix.size() == n * n,
+                             std::string(label) + ": dense inverse size mismatch");
+    std::vector<Real> matrix_for_lu = matrix;
+    const DenseLUSolver solver =
+        factor_dense_matrix(std::move(matrix_for_lu), n, label);
+
+    DenseInverseResult result;
+    result.diagnostics =
+        dense_matrix_diagnostics(std::span<const Real>(matrix.data(), matrix.size()),
+                                 n, n, label);
+
+#if defined(FE_HAS_EIGEN) && FE_HAS_EIGEN
+    if (std::isfinite(solver.diagnostics.condition_estimate) &&
+        std::isfinite(result.diagnostics.condition_estimate) &&
+        result.diagnostics.condition_estimate > dense_matrix_condition_fallback_threshold()) {
+        using RowMajorMatrix = Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+        using Matrix = Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic>;
+        const Eigen::Map<const RowMajorMatrix> A(matrix.data(),
+                                                 static_cast<Eigen::Index>(n),
+                                                 static_cast<Eigen::Index>(n));
+        const Matrix dense = A;
+        Eigen::JacobiSVD<Matrix> svd(dense,
+                                     Eigen::ComputeFullU | Eigen::ComputeFullV);
+        Matrix sigma_inverse = Matrix::Zero(static_cast<Eigen::Index>(n),
+                                            static_cast<Eigen::Index>(n));
+        const auto& singular_values = svd.singularValues();
+        for (Eigen::Index i = 0; i < singular_values.size(); ++i) {
+            DENSE_LINALG_CHECK(
+                singular_values[i] > solver.diagnostics.tolerance,
+                std::string(label) + ": high-condition SVD fallback encountered a dropped singular value");
+            sigma_inverse(i, i) = Real(1) / singular_values[i];
+        }
+        const Matrix inverse = svd.matrixV() * sigma_inverse * svd.matrixU().transpose();
+        result.inverse.assign(n * n, Real(0));
+        for (std::size_t row = 0; row < n; ++row) {
+            for (std::size_t col = 0; col < n; ++col) {
+                result.inverse[row * n + col] =
+                    inverse(static_cast<Eigen::Index>(row), static_cast<Eigen::Index>(col));
+            }
+        }
+        result.used_svd_fallback = true;
+        return result;
+    }
+#endif
+
+    materialize_inverse_from_solver(solver, result.inverse);
+    return result;
+}
+
+void validate_dense_inverse_diagnostics(
+    const DenseInverseResult& result,
+    std::size_t expected_rank,
+    std::string_view label,
+    Real max_condition) {
+    DENSE_LINALG_CHECK(
+        result.diagnostics.rank == expected_rank,
+        std::string(label) + ": rank-deficient matrix (rank " +
+            std::to_string(result.diagnostics.rank) + " of " +
+            std::to_string(expected_rank) + ")");
+
+    if (!std::isfinite(result.diagnostics.condition_estimate)) {
+        return;
+    }
+
+    DENSE_LINALG_CHECK(
+        result.diagnostics.condition_estimate <= max_condition,
+        std::string(label) + ": condition estimate " +
+            std::to_string(result.diagnostics.condition_estimate) +
+            " exceeds supported threshold " + std::to_string(max_condition));
+}
+
+std::vector<Real> invert_dense_matrix(std::vector<Real> matrix,
+                                      std::size_t n,
+                                      std::string_view label) {
+    const DenseLUSolver solver = factor_dense_matrix(std::move(matrix), n, label);
+    std::vector<Real> inverse;
+    materialize_inverse_from_solver(solver, inverse);
+    return inverse;
+}
+
+std::size_t dense_matrix_rank(std::vector<Real> matrix,
+                              std::size_t rows,
+                              std::size_t cols) {
+    DENSE_LINALG_CHECK(matrix.size() == rows * cols,
+                             "dense_matrix_rank: size mismatch");
+    const Real tolerance =
+        dense_matrix_pivot_tolerance(rows, cols, dense_matrix_max_abs(matrix));
+
+    std::size_t rank = 0;
+    std::size_t pivot_row = 0;
+    for (std::size_t col = 0; col < cols && pivot_row < rows; ++col) {
+        std::size_t best_row = pivot_row;
+        Real best_abs = std::abs(matrix[pivot_row * cols + col]);
+        for (std::size_t row = pivot_row + 1; row < rows; ++row) {
+            const Real candidate = std::abs(matrix[row * cols + col]);
+            if (candidate > best_abs) {
+                best_abs = candidate;
+                best_row = row;
+            }
+        }
+        if (best_abs <= tolerance) {
+            continue;
+        }
+
+        if (best_row != pivot_row) {
+            for (std::size_t c = col; c < cols; ++c) {
+                std::swap(matrix[pivot_row * cols + c], matrix[best_row * cols + c]);
+            }
+        }
+
+        const Real pivot = matrix[pivot_row * cols + col];
+        for (std::size_t row = pivot_row + 1; row < rows; ++row) {
+            const Real factor = matrix[row * cols + col] / pivot;
+            if (std::abs(factor) <= tolerance) {
+                matrix[row * cols + col] = Real(0);
+                continue;
+            }
+            matrix[row * cols + col] = Real(0);
+            for (std::size_t c = col + 1; c < cols; ++c) {
+                matrix[row * cols + c] -= factor * matrix[pivot_row * cols + c];
+            }
+        }
+
+        ++rank;
+        ++pivot_row;
+    }
+    return rank;
+}
+
+DensePseudoInverseResult rank_revealing_pseudo_inverse(
+    std::span<const Real> matrix,
+    std::size_t rows,
+    std::size_t cols,
+    std::string_view label) {
+    DENSE_LINALG_CHECK(matrix.size() == rows * cols,
+                             std::string(label) + ": pseudo-inverse size mismatch");
+    DENSE_LINALG_CHECK(rows > 0 && cols > 0,
+                             std::string(label) + ": pseudo-inverse requires a nonempty matrix");
+
+#if defined(FE_HAS_EIGEN) && FE_HAS_EIGEN
+    using RowMajorMatrix = Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+    using Matrix = Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic>;
+    const Eigen::Map<const RowMajorMatrix> A(matrix.data(),
+                                             static_cast<Eigen::Index>(rows),
+                                             static_cast<Eigen::Index>(cols));
+    const Matrix dense = A;
+    Eigen::JacobiSVD<Matrix> svd(dense, Eigen::ComputeFullU | Eigen::ComputeFullV);
+
+    DensePseudoInverseResult result;
+    result.inverse.assign(cols * rows, Real(0));
+
+    const auto& singular_values = svd.singularValues();
+    result.largest_singular_value =
+        (singular_values.size() > 0) ? singular_values[0] : Real(0);
+    result.tolerance =
+        dense_matrix_singular_value_tolerance(rows, cols, result.largest_singular_value);
+
+    Matrix sigma_inverse = Matrix::Zero(static_cast<Eigen::Index>(cols),
+                                        static_cast<Eigen::Index>(rows));
+    for (Eigen::Index i = 0; i < singular_values.size(); ++i) {
+        const Real sigma = singular_values[i];
+        if (sigma <= result.tolerance) {
+            continue;
+        }
+        sigma_inverse(i, i) = Real(1) / sigma;
+        ++result.rank;
+        result.smallest_retained_singular_value = sigma;
+    }
+
+    const Matrix pseudo_inverse =
+        svd.matrixV() * sigma_inverse * svd.matrixU().transpose();
+    for (std::size_t r = 0; r < cols; ++r) {
+        for (std::size_t c = 0; c < rows; ++c) {
+            result.inverse[r * rows + c] =
+                pseudo_inverse(static_cast<Eigen::Index>(r), static_cast<Eigen::Index>(c));
+        }
+    }
+    return result;
+#else
+    DENSE_LINALG_CHECK(
+        false,
+        std::string(label) +
+            ": rank-revealing pseudo-inverse requires FE_ENABLE_EIGEN");
+    return {};
+#endif
+}
+
+} // namespace math
+} // namespace FE
+} // namespace svmp
+
+#undef DENSE_LINALG_CHECK
diff --git a/Code/Source/solver/FE/Math/DenseLinearAlgebra.h b/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
new file mode 100644
index 000000000..7684439b5
--- /dev/null
+++ b/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
@@ -0,0 +1,119 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_MATH_DENSELINEARALGEBRA_H
+#define SVMP_FE_MATH_DENSELINEARALGEBRA_H
+
+#include "Types.h"
+
+#include <cstddef>
+#include <limits>
+#include <span>
+#include <string>
+#include <string_view>
+#include <vector>
+
+namespace svmp {
+namespace FE {
+namespace math {
+
+// Dense solve, inverse, rank, and pseudo-inverse support for FE construction
+// utilities. Matrices are row-major: matrix[row * cols + col].
+[[nodiscard]] Real dense_matrix_max_abs(std::span<const Real> matrix) noexcept;
+
+[[nodiscard]] Real dense_matrix_pivot_tolerance(std::size_t rows,
+                                                std::size_t cols,
+                                                Real max_abs,
+                                                Real multiplier = Real(64)) noexcept;
+
+[[nodiscard]] Real dense_matrix_singular_value_tolerance(std::size_t rows,
+                                                         std::size_t cols,
+                                                         Real largest_singular_value,
+                                                         Real multiplier = Real(64)) noexcept;
+
+struct DensePseudoInverseResult {
+    std::vector<Real> inverse;
+    std::size_t rank{0};
+    Real tolerance{0};
+    Real largest_singular_value{0};
+    Real smallest_retained_singular_value{0};
+};
+
+struct DenseMatrixDiagnostics {
+    std::size_t rank{0};
+    Real tolerance{0};
+    Real largest_singular_value{0};
+    Real smallest_retained_singular_value{0};
+    Real condition_estimate{std::numeric_limits<Real>::infinity()};
+};
+
+struct DenseInverseResult {
+    std::vector<Real> inverse;
+    DenseMatrixDiagnostics diagnostics;
+    bool used_svd_fallback{false};
+};
+
+[[nodiscard]] Real dense_matrix_condition_fallback_threshold() noexcept;
+[[nodiscard]] Real dense_matrix_condition_error_threshold() noexcept;
+
+struct DenseLUSolver {
+    std::size_t n{0};
+    std::vector<Real> lu;
+    std::vector<std::size_t> pivots;
+    DenseMatrixDiagnostics diagnostics;
+    Real pivot_tolerance{0};
+    std::string label;
+
+    [[nodiscard]] bool empty() const noexcept { return n == 0; }
+
+    void solve_in_place(std::span<Real> rhs) const;
+    void solve_in_place(std::span<Real> rhs, std::size_t rhs_count) const;
+    [[nodiscard]] std::vector<Real> solve(std::span<const Real> rhs) const;
+};
+
+// Inverses and pseudo-inverses keep the same row-major convention for their
+// returned dimensions.
+[[nodiscard]] DenseMatrixDiagnostics dense_matrix_diagnostics(
+    std::span<const Real> matrix,
+    std::size_t rows,
+    std::size_t cols,
+    std::string_view label = "dense matrix");
+
+[[nodiscard]] DenseLUSolver factor_dense_matrix(std::vector<Real> matrix,
+                                                std::size_t n,
+                                                std::string_view label = "dense matrix");
+
+[[nodiscard]] std::vector<Real> invert_dense_matrix(std::vector<Real> matrix,
+                                                    std::size_t n,
+                                                    std::string_view label = "dense matrix");
+
+[[nodiscard]] DenseInverseResult invert_dense_matrix_with_diagnostics(
+    std::vector<Real> matrix,
+    std::size_t n,
+    std::string_view label = "dense matrix");
+
+void validate_dense_inverse_diagnostics(
+    const DenseInverseResult& result,
+    std::size_t expected_rank,
+    std::string_view label = "dense matrix",
+    Real max_condition = dense_matrix_condition_error_threshold());
+
+[[nodiscard]] std::size_t dense_matrix_rank(std::vector<Real> matrix,
+                                            std::size_t rows,
+                                            std::size_t cols);
+
+[[nodiscard]] DensePseudoInverseResult rank_revealing_pseudo_inverse(
+    std::span<const Real> matrix,
+    std::size_t rows,
+    std::size_t cols,
+    std::string_view label = "dense matrix");
+
+} // namespace math
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_MATH_DENSELINEARALGEBRA_H
diff --git a/Code/Source/solver/FE/Math/DenseTransformKernels.h b/Code/Source/solver/FE/Math/DenseTransformKernels.h
new file mode 100644
index 000000000..8bf83ec0b
--- /dev/null
+++ b/Code/Source/solver/FE/Math/DenseTransformKernels.h
@@ -0,0 +1,78 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_MATH_DENSETRANSFORMKERNELS_H
+#define SVMP_FE_MATH_DENSETRANSFORMKERNELS_H
+
+#include "Types.h"
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+
+namespace svmp {
+namespace FE {
+namespace math {
+
+constexpr std::size_t dense_transform_blocked_min_rows() noexcept { return 32u; }
+constexpr std::size_t dense_transform_blocked_min_rhs() noexcept { return 4u; }
+
+inline void dense_transform_batched_row_major(
+    const Real* SVMP_RESTRICT matrix,
+    std::size_t rows,
+    std::size_t cols,
+    const Real* SVMP_RESTRICT input,
+    std::size_t input_row_stride,
+    Real* SVMP_RESTRICT output,
+    std::size_t output_row_stride,
+    std::size_t rhs_count) {
+    if (rows == 0u || cols == 0u || rhs_count == 0u) {
+        return;
+    }
+
+    if (rows < dense_transform_blocked_min_rows() ||
+        rhs_count < dense_transform_blocked_min_rhs()) {
+        for (std::size_t row = 0; row < rows; ++row) {
+            const Real* matrix_row = matrix + row * cols;
+            Real* output_row = output + row * output_row_stride;
+            for (std::size_t rhs = 0; rhs < rhs_count; ++rhs) {
+                Real value = Real(0);
+                for (std::size_t col = 0; col < cols; ++col) {
+                    value += matrix_row[col] * input[col * input_row_stride + rhs];
+                }
+                output_row[rhs] = value;
+            }
+        }
+        return;
+    }
+
+    constexpr std::size_t kRhsBlock = 32u;
+    for (std::size_t row = 0; row < rows; ++row) {
+        const Real* matrix_row = matrix + row * cols;
+        Real* output_row = output + row * output_row_stride;
+        for (std::size_t rhs_base = 0; rhs_base < rhs_count; rhs_base += kRhsBlock) {
+            const std::size_t block_size = std::min(kRhsBlock, rhs_count - rhs_base);
+            std::array<Real, kRhsBlock> accum{};
+            for (std::size_t col = 0; col < cols; ++col) {
+                const Real coeff = matrix_row[col];
+                const Real* input_row = input + col * input_row_stride + rhs_base;
+                for (std::size_t rhs = 0; rhs < block_size; ++rhs) {
+                    accum[rhs] += coeff * input_row[rhs];
+                }
+            }
+            for (std::size_t rhs = 0; rhs < block_size; ++rhs) {
+                output_row[rhs_base + rhs] = accum[rhs];
+            }
+        }
+    }
+}
+
+} // namespace math
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_MATH_DENSETRANSFORMKERNELS_H
diff --git a/Code/Source/solver/FE/Math/ExpressionOps.h b/Code/Source/solver/FE/Math/ExpressionOps.h
new file mode 100644
index 000000000..96cea1037
--- /dev/null
+++ b/Code/Source/solver/FE/Math/ExpressionOps.h
@@ -0,0 +1,99 @@
+#ifndef SVMP_FE_MATH_EXPRESSION_OPS_H
+#define SVMP_FE_MATH_EXPRESSION_OPS_H
+
+/**
+ * @file ExpressionOps.h
+ * @brief Common expression template operators for vector and matrix expressions
+ *
+ * This header provides shared operator functors used by both VectorExpr.h and
+ * MatrixExpr.h to avoid code duplication and namespace conflicts. All operators
+ * are defined in the detail::ops namespace for internal use by expression templates.
+ */
+
+#include <cmath>
+
+namespace svmp {
+namespace FE {
+namespace math {
+namespace detail {
+namespace ops {
+
+/**
+ * @brief Addition operator functor
+ */
+struct Add {
+    template<typename T1, typename T2>
+    constexpr auto operator()(const T1& a, const T2& b) const {
+        return a + b;
+    }
+};
+
+/**
+ * @brief Subtraction operator functor
+ */
+struct Sub {
+    template<typename T1, typename T2>
+    constexpr auto operator()(const T1& a, const T2& b) const {
+        return a - b;
+    }
+};
+
+/**
+ * @brief Multiplication operator functor
+ */
+struct Mul {
+    template<typename T1, typename T2>
+    constexpr auto operator()(const T1& a, const T2& b) const {
+        return a * b;
+    }
+};
+
+/**
+ * @brief Division operator functor
+ */
+struct Div {
+    template<typename T1, typename T2>
+    constexpr auto operator()(const T1& a, const T2& b) const {
+        return a / b;
+    }
+};
+
+/**
+ * @brief Negation operator functor
+ */
+struct Negate {
+    template<typename T>
+    constexpr auto operator()(const T& a) const {
+        return -a;
+    }
+};
+
+/**
+ * @brief Absolute value operator functor
+ */
+struct Abs {
+    template<typename T>
+    constexpr auto operator()(const T& a) const {
+        using std::abs;
+        return abs(a);
+    }
+};
+
+/**
+ * @brief Square root operator functor
+ */
+struct Sqrt {
+    template<typename T>
+    constexpr auto operator()(const T& a) const {
+        using std::sqrt;
+        return sqrt(a);
+    }
+};
+
+} // namespace ops
+} // namespace detail
+} // namespace math
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_MATH_EXPRESSION_OPS_H
diff --git a/Code/Source/solver/FE/Math/IntegerMath.h b/Code/Source/solver/FE/Math/IntegerMath.h
new file mode 100644
index 000000000..52a50117f
--- /dev/null
+++ b/Code/Source/solver/FE/Math/IntegerMath.h
@@ -0,0 +1,98 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_MATH_INTEGERMATH_H
+#define SVMP_FE_MATH_INTEGERMATH_H
+
+#include "Types.h"
+
+#include <cstddef>
+#include <limits>
+#include <numeric>
+#include <stdexcept>
+
+namespace svmp {
+namespace FE {
+namespace math {
+
+[[nodiscard]] constexpr Real pow_int_nonnegative(Real base, int exponent) noexcept {
+    Real result = Real(1);
+    Real factor = base;
+    int power = exponent;
+    while (power > 0) {
+        if ((power & 1) != 0) {
+            result *= factor;
+        }
+        power >>= 1;
+        if (power > 0) {
+            factor *= factor;
+        }
+    }
+    return result;
+}
+
+[[nodiscard]] constexpr Real pow_int(Real base, int exponent) noexcept {
+    if (exponent < 0) {
+        return Real(1) / pow_int_nonnegative(base, -exponent);
+    }
+    return pow_int_nonnegative(base, exponent);
+}
+
+[[nodiscard]] constexpr std::size_t binomial_size(int n, int k) {
+    if (n < 0 || k < 0 || k > n) {
+        return 0u;
+    }
+    if (k > n - k) {
+        k = n - k;
+    }
+
+    std::size_t result = 1u;
+    for (int i = 1; i <= k; ++i) {
+        auto numerator = static_cast<std::size_t>(n - (k - i));
+        auto denominator = static_cast<std::size_t>(i);
+
+        const auto numerator_gcd = std::gcd(numerator, denominator);
+        numerator /= numerator_gcd;
+        denominator /= numerator_gcd;
+
+        const auto result_gcd = std::gcd(result, denominator);
+        result /= result_gcd;
+        denominator /= result_gcd;
+        if (denominator != 1u) {
+            throw std::overflow_error(
+                "binomial_size: failed to reduce exact binomial factor");
+        }
+        if (numerator != 0u &&
+            result > std::numeric_limits<std::size_t>::max() / numerator) {
+            throw std::overflow_error("binomial_size: result does not fit in size_t");
+        }
+        result *= numerator;
+    }
+    return result;
+}
+
+[[nodiscard]] constexpr Real binomial_real(int n, int k) noexcept {
+    if (k < 0 || k > n) {
+        return Real(0);
+    }
+    if (k > n - k) {
+        k = n - k;
+    }
+
+    Real result = Real(1);
+    for (int i = 1; i <= k; ++i) {
+        result *= static_cast<Real>(n - (k - i));
+        result /= static_cast<Real>(i);
+    }
+    return result;
+}
+
+} // namespace math
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_MATH_INTEGERMATH_H
diff --git a/Code/Source/solver/FE/Math/MathConstants.h b/Code/Source/solver/FE/Math/MathConstants.h
new file mode 100644
index 000000000..145520ab2
--- /dev/null
+++ b/Code/Source/solver/FE/Math/MathConstants.h
@@ -0,0 +1,388 @@
+#ifndef SVMP_FE_MATH_CONSTANTS_H
+#define SVMP_FE_MATH_CONSTANTS_H
+
+/**
+ * @file MathConstants.h
+ * @brief Mathematical constants and numerical tolerances for FE computations
+ *
+ * This header provides mathematical constants (π, e, √2, etc.) and numerical
+ * tolerances used throughout the FE library. All constants are templated
+ * to support different precision types.
+ */
+
+#include <cmath>
+#include <limits>
+#include <type_traits>
+#include <algorithm>
+
+namespace svmp {
+namespace FE {
+namespace math {
+
+/**
+ * @brief Mathematical constants templated by type
+ * @tparam T The numeric type (float, double, long double)
+ */
+template<typename T>
+struct Constants {
+    static_assert(std::is_floating_point_v<T>,
+                  "Constants only defined for floating-point types");
+
+    // Mathematical constants
+    static constexpr T pi           = T(3.14159265358979323846264338327950288419716939937510L);
+    static constexpr T two_pi       = T(6.28318530717958647692528676655900576839433879875021L);
+    static constexpr T half_pi      = T(1.57079632679489661923132169163975144209858469968755L);
+    static constexpr T quarter_pi   = T(0.78539816339744830961566084581987572104929234984378L);
+    static constexpr T inv_pi       = T(0.31830988618379067153776752674502872406891929148091L);
+    static constexpr T inv_two_pi   = T(0.15915494309189533576888376337251436203445964574046L);
+
+    static constexpr T e            = T(2.71828182845904523536028747135266249775724709369995L);
+    static constexpr T log2e        = T(1.44269504088896340735992468100189213742664595415299L);
+    static constexpr T log10e       = T(0.43429448190325182765112891891660508229439700580367L);
+    static constexpr T ln2          = T(0.69314718055994530941723212145817656807550013436026L);
+    static constexpr T ln10         = T(2.30258509299404568401799145468436420760110148862877L);
+
+    static constexpr T sqrt2        = T(1.41421356237309504880168872420969807856967187537694L);
+    static constexpr T sqrt3        = T(1.73205080756887729352744634150587236694280525381038L);
+    static constexpr T inv_sqrt2    = T(0.70710678118654752440084436210484903928483593768847L);
+    static constexpr T inv_sqrt3    = T(0.57735026918962576450914878050195745564760175127013L);
+
+    // Golden ratio
+    static constexpr T phi          = T(1.61803398874989484820458683436563811772030917980576L);
+
+    // Degrees to radians conversion
+    static constexpr T deg_to_rad   = pi / T(180);
+    static constexpr T rad_to_deg   = T(180) / pi;
+};
+
+/**
+ * @brief Numerical tolerances and machine epsilon
+ * @tparam T The numeric type
+ */
+template<typename T>
+struct Tolerances {
+    static_assert(std::is_floating_point_v<T>,
+                  "Tolerances only defined for floating-point types");
+
+    // Machine epsilon
+    static constexpr T epsilon      = std::numeric_limits<T>::epsilon();
+
+    // Default tolerance (1000 * machine epsilon)
+    static constexpr T tolerance    = T(1000) * epsilon;
+
+    // Strict tolerance (10 * machine epsilon)
+    static constexpr T strict       = T(10) * epsilon;
+
+    // Loose tolerance (10000 * machine epsilon)
+    static constexpr T loose        = T(10000) * epsilon;
+
+    // Square root of epsilon (useful for finite differences)
+    static inline const T sqrt_epsilon = std::sqrt(epsilon);
+
+    // Cube root of epsilon (useful for numerical derivatives)
+    static inline const T cbrt_epsilon = std::cbrt(epsilon);
+
+    // Smallest positive normalized value
+    static constexpr T min_positive = std::numeric_limits<T>::min();
+
+    // Largest representable value
+    static constexpr T max_value    = std::numeric_limits<T>::max();
+
+    // Infinity
+    static constexpr T infinity     = std::numeric_limits<T>::infinity();
+
+    // Not-a-Number
+    static constexpr T nan          = std::numeric_limits<T>::quiet_NaN();
+};
+
+/**
+ * @brief Convenient aliases for common types
+ */
+template<typename T> inline constexpr T pi           = Constants<T>::pi;
+template<typename T> inline constexpr T two_pi       = Constants<T>::two_pi;
+template<typename T> inline constexpr T half_pi      = Constants<T>::half_pi;
+template<typename T> inline constexpr T quarter_pi   = Constants<T>::quarter_pi;
+template<typename T> inline constexpr T inv_pi       = Constants<T>::inv_pi;
+template<typename T> inline constexpr T inv_two_pi   = Constants<T>::inv_two_pi;
+
+template<typename T> inline constexpr T e            = Constants<T>::e;
+template<typename T> inline constexpr T log2e        = Constants<T>::log2e;
+template<typename T> inline constexpr T log10e       = Constants<T>::log10e;
+template<typename T> inline constexpr T ln2          = Constants<T>::ln2;
+template<typename T> inline constexpr T ln10         = Constants<T>::ln10;
+
+template<typename T> inline constexpr T sqrt2        = Constants<T>::sqrt2;
+template<typename T> inline constexpr T sqrt3        = Constants<T>::sqrt3;
+template<typename T> inline constexpr T inv_sqrt2    = Constants<T>::inv_sqrt2;
+template<typename T> inline constexpr T inv_sqrt3    = Constants<T>::inv_sqrt3;
+
+template<typename T> inline constexpr T phi          = Constants<T>::phi;
+
+template<typename T> inline constexpr T deg_to_rad   = Constants<T>::deg_to_rad;
+template<typename T> inline constexpr T rad_to_deg   = Constants<T>::rad_to_deg;
+
+template<typename T> inline constexpr T epsilon      = Tolerances<T>::epsilon;
+template<typename T> inline constexpr T tolerance    = Tolerances<T>::tolerance;
+template<typename T> inline constexpr T strict_tol   = Tolerances<T>::strict;
+template<typename T> inline constexpr T loose_tol    = Tolerances<T>::loose;
+template<typename T> inline const T sqrt_epsilon = Tolerances<T>::sqrt_epsilon;
+template<typename T> inline const T cbrt_epsilon = Tolerances<T>::cbrt_epsilon;
+template<typename T> inline constexpr T min_positive = Tolerances<T>::min_positive;
+template<typename T> inline constexpr T max_value    = Tolerances<T>::max_value;
+template<typename T> inline constexpr T infinity     = Tolerances<T>::infinity;
+
+/**
+ * @brief Comparison functions with tolerance
+ */
+
+/**
+ * @brief Check if two values are approximately equal
+ * @param a First value
+ * @param b Second value
+ * @param tol Tolerance (default: 1000 * epsilon)
+ * @return true if |a - b| <= tol * max(|a|, |b|, 1)
+ */
+template<typename T>
+inline constexpr bool approx_equal(T a, T b, T tol = tolerance<T>) {
+    static_assert(std::is_floating_point_v<T>,
+                  "approx_equal only defined for floating-point types");
+    const T scale = std::max({std::abs(a), std::abs(b), T(1)});
+    return std::abs(a - b) <= tol * scale;
+}
+
+/**
+ * @brief Check if a value is approximately zero
+ * @param a Value to check
+ * @param tol Tolerance (default: 1000 * epsilon)
+ * @return true if |a| <= tol
+ */
+template<typename T>
+inline constexpr bool approx_zero(T a, T tol = tolerance<T>) {
+    static_assert(std::is_floating_point_v<T>,
+                  "approx_zero only defined for floating-point types");
+    return std::abs(a) <= tol;
+}
+
+/**
+ * @brief Check if a value is positive (greater than tolerance)
+ * @param a Value to check
+ * @param tol Tolerance (default: 1000 * epsilon)
+ * @return true if a > tol
+ */
+template<typename T>
+inline constexpr bool is_positive(T a, T tol = tolerance<T>) {
+    static_assert(std::is_floating_point_v<T>,
+                  "is_positive only defined for floating-point types");
+    return a > tol;
+}
+
+/**
+ * @brief Check if a value is negative (less than -tolerance)
+ * @param a Value to check
+ * @param tol Tolerance (default: 1000 * epsilon)
+ * @return true if a < -tol
+ */
+template<typename T>
+inline constexpr bool is_negative(T a, T tol = tolerance<T>) {
+    static_assert(std::is_floating_point_v<T>,
+                  "is_negative only defined for floating-point types");
+    return a < -tol;
+}
+
+/**
+ * @brief Check if a value is finite (not infinite or NaN)
+ * @param a Value to check
+ * @return true if value is finite
+ */
+template<typename T>
+inline constexpr bool is_finite(T a) {
+    static_assert(std::is_floating_point_v<T>,
+                  "is_finite only defined for floating-point types");
+    return std::isfinite(a);
+}
+
+/**
+ * @brief Degrees to radians conversion
+ * @param degrees Angle in degrees
+ * @return Angle in radians
+ */
+template<typename T>
+inline constexpr T to_radians(T degrees) {
+    static_assert(std::is_floating_point_v<T>,
+                  "to_radians only defined for floating-point types");
+    return degrees * deg_to_rad<T>;
+}
+
+/**
+ * @brief Radians to degrees conversion
+ * @param radians Angle in radians
+ * @return Angle in degrees
+ */
+template<typename T>
+inline constexpr T to_degrees(T radians) {
+    static_assert(std::is_floating_point_v<T>,
+                  "to_degrees only defined for floating-point types");
+    return radians * rad_to_deg<T>;
+}
+
+// =============================================================================
+// Constants namespace for compatibility with test expectations
+// =============================================================================
+namespace constants {
+
+// Mathematical constants (double precision defaults)
+inline constexpr double PI         = Constants<double>::pi;
+inline constexpr double PI_2       = Constants<double>::half_pi;
+inline constexpr double PI_4       = Constants<double>::quarter_pi;
+inline constexpr double TWO_PI     = Constants<double>::two_pi;
+inline constexpr double INV_PI     = Constants<double>::inv_pi;
+
+inline constexpr double E          = Constants<double>::e;
+inline constexpr double LN_2       = Constants<double>::ln2;
+inline constexpr double LN_10      = Constants<double>::ln10;
+inline constexpr double LOG10_E    = Constants<double>::log10e;
+inline constexpr double LOG2_E     = Constants<double>::log2e;
+
+inline constexpr double SQRT_2     = Constants<double>::sqrt2;
+inline constexpr double SQRT_3     = Constants<double>::sqrt3;
+inline constexpr double SQRT_5     = 2.2360679774997896964091736687312L;
+inline constexpr double INV_SQRT_2  = Constants<double>::inv_sqrt2;
+inline constexpr double INV_SQRT_3  = Constants<double>::inv_sqrt3;
+
+inline constexpr double PHI        = Constants<double>::phi;
+
+// Angle conversion functions
+template<typename T>
+inline constexpr T deg_to_rad(T degrees) {
+    return degrees * Constants<T>::deg_to_rad;
+}
+
+template<typename T>
+inline constexpr T rad_to_deg(T radians) {
+    return radians * Constants<T>::rad_to_deg;
+}
+
+// Templated tolerances
+template<typename T>
+inline constexpr T tolerance() {
+    return Tolerances<T>::tolerance;
+}
+
+template<typename T>
+inline constexpr T machine_epsilon() {
+    return Tolerances<T>::epsilon;
+}
+
+// Additional constants and utility functions for tests
+inline constexpr double DEFAULT_TOLERANCE = Tolerances<double>::tolerance;
+inline constexpr double DEFAULT_REL_TOLERANCE = 1e-12;
+inline constexpr double GEOMETRY_TOLERANCE = 1e-10;
+inline constexpr double SOLVER_TOLERANCE = Tolerances<double>::strict;
+inline constexpr double EPSILON = Tolerances<double>::epsilon;
+inline constexpr double INF_VALUE = Tolerances<double>::infinity;  // Renamed from INFINITY
+inline constexpr double NOT_A_NUMBER = Tolerances<double>::nan;  // Renamed from NAN
+inline constexpr double MAX_DOUBLE = Tolerances<double>::max_value;
+inline constexpr double MIN_DOUBLE = Tolerances<double>::min_positive;
+inline constexpr double LOWEST_DOUBLE = -Tolerances<double>::max_value;
+
+// Physical constants
+inline constexpr double SPEED_OF_LIGHT = 299792458.0;         // m/s
+inline constexpr double GRAVITATIONAL_CONSTANT = 6.67430e-11;  // m³/(kg·s²)
+inline constexpr double PLANCK_CONSTANT = 6.62607015e-34;      // J·s
+inline constexpr double AVOGADRO_NUMBER = 6.02214076e23;       // mol⁻¹
+inline constexpr double BOLTZMANN_CONSTANT = 1.380649e-23;     // J/K
+inline constexpr double STANDARD_GRAVITY = 9.80665;            // m/s²
+
+// Float and long double versions
+inline constexpr float PI_F = static_cast<float>(PI);
+inline constexpr float E_F = static_cast<float>(E);
+inline constexpr float SQRT_2_F = static_cast<float>(SQRT_2);
+inline constexpr float EPSILON_F = Tolerances<float>::epsilon;
+
+inline constexpr long double PI_L = static_cast<long double>(PI);
+inline constexpr long double E_L = static_cast<long double>(E);
+inline constexpr long double SQRT_2_L = static_cast<long double>(SQRT_2);
+inline constexpr long double EPSILON_L = Tolerances<long double>::epsilon;
+
+// Additional mathematical constants
+inline constexpr double SQRT_PI = 1.7724538509055160272981674833411L;
+
+// Utility functions
+template<typename T>
+inline constexpr int sign(T value) {
+    return (T(0) < value) - (value < T(0));
+}
+
+template<typename T>
+inline constexpr bool is_zero(T value, T tol = DEFAULT_TOLERANCE) {
+    return std::abs(value) <= tol;
+}
+
+template<typename T>
+inline bool near(T a, T b, T tol = DEFAULT_TOLERANCE) {
+    return std::abs(a - b) <= tol;
+}
+
+template<typename T>
+inline bool near_relative(T a, T b, T rel_tol = DEFAULT_REL_TOLERANCE) {
+    T scale = std::max(std::abs(a), std::abs(b));
+    return std::abs(a - b) <= rel_tol * scale;
+}
+
+template<typename T>
+inline constexpr T clamp(T value, T min_val, T max_val) {
+    return value < min_val ? min_val : (value > max_val ? max_val : value);
+}
+
+template<typename T>
+inline constexpr T lerp(T a, T b, T t) {
+    return a + t * (b - a);
+}
+
+template<typename T>
+inline T safe_divide(T numerator, T denominator, T default_val = T(0)) {
+    return is_zero(denominator) ? default_val : numerator / denominator;
+}
+
+template<typename T>
+inline bool isinf(T value) {
+    return std::isinf(value);
+}
+
+template<typename T>
+inline bool isnan(T value) {
+    return std::isnan(value);
+}
+
+} // namespace constants
+
+// Physical constants for FE analysis
+namespace physical_constants {
+
+// Material properties (SI units)
+inline constexpr double water_density = 1000.0;         // kg/m³
+inline constexpr double steel_density = 7850.0;         // kg/m³
+inline constexpr double aluminum_density = 2700.0;      // kg/m³
+
+inline constexpr double water_viscosity = 0.001;        // Pa·s at 20°C
+inline constexpr double air_viscosity = 1.81e-5;        // Pa·s at 20°C
+
+inline constexpr double steel_youngs_modulus = 200e9;   // Pa
+inline constexpr double aluminum_youngs_modulus = 70e9; // Pa
+
+inline constexpr double steel_poisson_ratio = 0.3;      // dimensionless
+inline constexpr double aluminum_poisson_ratio = 0.33;  // dimensionless
+
+// Physical constants
+inline constexpr double gravity = 9.80665;              // m/s²
+inline constexpr double gas_constant = 8.314462618;     // J/(mol·K)
+inline constexpr double boltzmann = 1.380649e-23;       // J/K
+inline constexpr double avogadro = 6.02214076e23;       // mol⁻¹
+
+} // namespace physical_constants
+
+} // namespace math
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_MATH_CONSTANTS_H
diff --git a/Code/Source/solver/FE/Math/Matrix.h b/Code/Source/solver/FE/Math/Matrix.h
new file mode 100644
index 000000000..0b80091f9
--- /dev/null
+++ b/Code/Source/solver/FE/Math/Matrix.h
@@ -0,0 +1,1487 @@
+#ifndef SVMP_FE_MATH_MATRIX_H
+#define SVMP_FE_MATH_MATRIX_H
+
+/**
+ * @file Matrix.h
+ * @brief Fixed-size matrices with expression templates and specializations for FE computations
+ *
+ * This header provides optimized fixed-size matrix operations for element-level
+ * computations. Includes specialized analytical formulas for 2x2 and 3x3 matrices
+ * (determinant, inverse using Cramer's rule) and Gauss elimination for larger matrices.
+ * All operations use expression templates to eliminate temporaries.
+ */
+
+#include "MatrixExpr.h"
+#include "Vector.h"
+#include "MathConstants.h"
+#include "../Common/Alignment.h"
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <initializer_list>
+#include <ostream>
+#include <stdexcept>
+#include <type_traits>
+
+namespace svmp {
+namespace FE {
+namespace math {
+
+/**
+ * @brief Fixed-size matrix for element-level computations
+ * @tparam T Scalar type (float, double)
+ * @tparam M Number of rows
+ * @tparam N Number of columns
+ *
+ * Storage is row-major for cache efficiency. Memory is aligned for SIMD operations.
+ * Specializations exist for 2x2, 3x3, 4x4 matrices with analytical algorithms.
+ */
+template<typename T, std::size_t M, std::size_t N>
+class Matrix : public MatrixExpr<Matrix<T, M, N>> {
+    static_assert(std::is_arithmetic_v<T>, "T must be an arithmetic type");
+    static_assert(M > 0 && N > 0, "Matrix dimensions must be positive");
+
+private:
+    alignas(kFEFixedObjectAlignmentBytes) T data_[M * N];  // Row-major, SIMD-friendly storage
+
+    // Helper to compute linear index from (i,j)
+    static constexpr std::size_t index(std::size_t i, std::size_t j) {
+        return i * N + j;
+    }
+
+public:
+    // Type definitions
+    using value_type = T;
+    using size_type = std::size_t;
+    using reference = T&;
+    using const_reference = const T&;
+    using pointer = T*;
+    using const_pointer = const T*;
+
+    /**
+     * @brief Default constructor - zero initializes all elements
+     */
+    constexpr Matrix() : data_{} {}
+
+    /**
+     * @brief Fill constructor - initializes all elements with same value
+     * @param value Value to fill matrix with
+     */
+    constexpr explicit Matrix(T value) {
+        for (size_type i = 0; i < M * N; ++i) {
+            data_[i] = value;
+        }
+    }
+
+    /**
+     * @brief Initializer list constructor for row-wise initialization
+     * @param init Nested initializer lists {{row0}, {row1}, ...}
+     */
+    constexpr Matrix(std::initializer_list<std::initializer_list<T>> init) : data_{} {
+        size_type row = 0;
+        for (auto row_init : init) {
+            if (row >= M) break;
+            size_type col = 0;
+            for (auto val : row_init) {
+                if (col >= N) break;
+                (*this)(row, col) = val;
+                ++col;
+            }
+            ++row;
+        }
+    }
+
+    /**
+     * @brief Constructor from expression template
+     * @tparam Expr Expression type
+     * @param expr Matrix expression to evaluate
+     */
+    template<typename Expr>
+    Matrix(const MatrixExpr<Expr>& expr) {
+        const auto& e = expr.derived();
+        for (size_type i = 0; i < M; ++i) {
+            for (size_type j = 0; j < N; ++j) {
+                (*this)(i, j) = e(i, j);
+            }
+        }
+    }
+
+    /**
+     * @brief Copy constructor
+     */
+    constexpr Matrix(const Matrix&) = default;
+
+    /**
+     * @brief Move constructor
+     */
+    constexpr Matrix(Matrix&&) noexcept = default;
+
+    /**
+     * @brief Copy assignment
+     */
+    Matrix& operator=(const Matrix&) = default;
+
+    /**
+     * @brief Move assignment
+     */
+    Matrix& operator=(Matrix&&) noexcept = default;
+
+    /**
+     * @brief Assignment from expression template
+     * @tparam Expr Expression type
+     * @param expr Matrix expression to evaluate
+     * @return Reference to this
+     */
+    template<typename Expr>
+    Matrix& operator=(const MatrixExpr<Expr>& expr) {
+        const auto& e = expr.derived();
+        for (size_type i = 0; i < M; ++i) {
+            for (size_type j = 0; j < N; ++j) {
+                (*this)(i, j) = e(i, j);
+            }
+        }
+        return *this;
+    }
+
+    /**
+     * @brief Get number of rows (compile-time constant)
+     * @return Number of rows
+     */
+    static constexpr size_type rows() { return M; }
+
+    /**
+     * @brief Get number of columns (compile-time constant)
+     * @return Number of columns
+     */
+    static constexpr size_type cols() { return N; }
+
+    /**
+     * @brief Get total number of elements
+     * @return M * N
+     */
+    static constexpr size_type size() { return M * N; }
+
+    /**
+     * @brief Element access (no bounds checking)
+     * @param i Row index
+     * @param j Column index
+     * @return Reference to element
+     */
+    constexpr T& operator()(size_type i, size_type j) {
+        return data_[index(i, j)];
+    }
+
+    /**
+     * @brief Element access (no bounds checking) - const version
+     * @param i Row index
+     * @param j Column index
+     * @return Const reference to element
+     */
+    constexpr const T& operator()(size_type i, size_type j) const {
+        return data_[index(i, j)];
+    }
+
+    /**
+     * @brief Element access with bounds checking
+     * @param i Row index
+     * @param j Column index
+     * @return Reference to element
+     * @throws std::out_of_range if indices are out of bounds
+     */
+    T& at(size_type i, size_type j) {
+        if (i >= M || j >= N) {
+            throw std::out_of_range("Matrix::at: index out of range");
+        }
+        return (*this)(i, j);
+    }
+
+    /**
+     * @brief Element access with bounds checking - const version
+     * @param i Row index
+     * @param j Column index
+     * @return Const reference to element
+     * @throws std::out_of_range if indices are out of bounds
+     */
+    const T& at(size_type i, size_type j) const {
+        if (i >= M || j >= N) {
+            throw std::out_of_range("Matrix::at: index out of range");
+        }
+        return (*this)(i, j);
+    }
+
+    /**
+     * @brief Get row as vector
+     * @param i Row index
+     * @return Vector containing row elements
+     */
+    Vector<T, N> row(size_type i) const {
+        Vector<T, N> result;
+        for (size_type j = 0; j < N; ++j) {
+            result[j] = (*this)(i, j);
+        }
+        return result;
+    }
+
+    /**
+     * @brief Get column as vector
+     * @param j Column index
+     * @return Vector containing column elements
+     */
+    Vector<T, M> column(size_type j) const {
+        Vector<T, M> result;
+        for (size_type i = 0; i < M; ++i) {
+            result[i] = (*this)(i, j);
+        }
+        return result;
+    }
+
+    /**
+     * @brief Get column as vector (alias for column)
+     * @param j Column index
+     * @return Vector containing column elements
+     */
+    Vector<T, M> col(size_type j) const {
+        return column(j);
+    }
+
+    /**
+     * @brief Set row from vector
+     * @param i Row index
+     * @param v Vector of values
+     */
+    void set_row(size_type i, const Vector<T, N>& v) {
+        for (size_type j = 0; j < N; ++j) {
+            (*this)(i, j) = v[j];
+        }
+    }
+
+    /**
+     * @brief Set column from vector
+     * @param j Column index
+     * @param v Vector of values
+     */
+    void set_column(size_type j, const Vector<T, M>& v) {
+        for (size_type i = 0; i < M; ++i) {
+            (*this)(i, j) = v[i];
+        }
+    }
+
+    /**
+     * @brief Set column from vector (alias for set_column)
+     * @param j Column index
+     * @param v Vector of values
+     */
+    void set_col(size_type j, const Vector<T, M>& v) {
+        set_column(j, v);
+    }
+
+    /**
+     * @brief Get pointer to underlying data
+     * @return Pointer to first element
+     */
+    T* data() { return data_; }
+    const T* data() const { return data_; }
+
+    /**
+     * @brief Fill matrix with value
+     * @param value Value to fill with
+     */
+    void fill(T value) {
+        for (size_type i = 0; i < M * N; ++i) {
+            data_[i] = value;
+        }
+    }
+
+    /**
+     * @brief Set all elements to zero
+     */
+    void set_zero() {
+        fill(T{0});
+    }
+
+    // Arithmetic operators
+
+    /**
+     * @brief In-place addition
+     * @param other Matrix to add
+     * @return Reference to this
+     */
+    Matrix& operator+=(const Matrix& other) {
+        for (size_type i = 0; i < M * N; ++i) {
+            data_[i] += other.data_[i];
+        }
+        return *this;
+    }
+
+    /**
+     * @brief In-place subtraction
+     * @param other Matrix to subtract
+     * @return Reference to this
+     */
+    Matrix& operator-=(const Matrix& other) {
+        for (size_type i = 0; i < M * N; ++i) {
+            data_[i] -= other.data_[i];
+        }
+        return *this;
+    }
+
+    /**
+     * @brief In-place scalar multiplication
+     * @param scalar Scalar to multiply by
+     * @return Reference to this
+     */
+    Matrix& operator*=(T scalar) {
+        for (size_type i = 0; i < M * N; ++i) {
+            data_[i] *= scalar;
+        }
+        return *this;
+    }
+
+    /**
+     * @brief In-place scalar division
+     * @param scalar Scalar to divide by
+     * @return Reference to this
+     */
+    Matrix& operator/=(T scalar) {
+        const T inv = T(1) / scalar;
+        return (*this) *= inv;
+    }
+
+    // Matrix operations
+
+    /**
+     * @brief Compute transpose
+     * @return Transposed matrix
+     */
+    Matrix<T, N, M> transpose() const {
+        Matrix<T, N, M> result;
+        for (size_type i = 0; i < M; ++i) {
+            for (size_type j = 0; j < N; ++j) {
+                result(j, i) = (*this)(i, j);
+            }
+        }
+        return result;
+    }
+
+    /**
+     * @brief Compute trace (sum of diagonal elements)
+     * @return Trace (only valid for square matrices)
+     */
+    template<std::size_t M2 = M, std::size_t N2 = N>
+    std::enable_if_t<M2 == N2, T> trace() const {
+        T result = T(0);
+        for (size_type i = 0; i < M; ++i) {
+            result += (*this)(i, i);
+        }
+        return result;
+    }
+
+    /**
+     * @brief Compute Frobenius norm squared
+     * @return Sum of squares of all elements
+     */
+    T frobenius_norm_squared() const {
+        T result = T(0);
+        for (size_type i = 0; i < M * N; ++i) {
+            result += data_[i] * data_[i];
+        }
+        return result;
+    }
+
+    /**
+     * @brief Compute Frobenius norm
+     * @return Square root of sum of squares
+     */
+    T frobenius_norm() const {
+        using std::sqrt;
+        return sqrt(frobenius_norm_squared());
+    }
+
+    /**
+     * @brief Compute infinity norm (maximum absolute row sum)
+     * @return Infinity norm
+     */
+    T infinity_norm() const {
+        T max_row_sum = T(0);
+        for (size_type i = 0; i < M; ++i) {
+            T row_sum = T(0);
+            for (size_type j = 0; j < N; ++j) {
+                using std::abs;
+                row_sum += abs((*this)(i, j));
+            }
+            max_row_sum = std::max(max_row_sum, row_sum);
+        }
+        return max_row_sum;
+    }
+
+    /**
+     * @brief Compute one norm (maximum absolute column sum)
+     * @return One norm
+     */
+    T one_norm() const {
+        T max_col_sum = T(0);
+        for (size_type j = 0; j < N; ++j) {
+            T col_sum = T(0);
+            for (size_type i = 0; i < M; ++i) {
+                using std::abs;
+                col_sum += abs((*this)(i, j));
+            }
+            max_col_sum = std::max(max_col_sum, col_sum);
+        }
+        return max_col_sum;
+    }
+
+    /**
+     * @brief Get minimum element
+     * @return Minimum value
+     */
+    T min() const {
+        return *std::min_element(data_, data_ + M * N);
+    }
+
+    /**
+     * @brief Get maximum element
+     * @return Maximum value
+     */
+    T max() const {
+        return *std::max_element(data_, data_ + M * N);
+    }
+
+    /**
+     * @brief Get sum of all elements
+     * @return Sum of elements
+     */
+    T sum() const {
+        T result = T(0);
+        for (size_type i = 0; i < M * N; ++i) {
+            result += data_[i];
+        }
+        return result;
+    }
+
+    // Static factory functions
+
+    /**
+     * @brief Create zero matrix
+     * @return Matrix with all elements zero
+     */
+    static constexpr Matrix zeros() {
+        return Matrix();
+    }
+
+    /**
+     * @brief Create matrix with all elements one
+     * @return Matrix with all elements one
+     */
+    static constexpr Matrix ones() {
+        return Matrix(T(1));
+    }
+
+    /**
+     * @brief Create identity matrix (only for square matrices)
+     * @return Identity matrix
+     */
+    template<std::size_t M2 = M, std::size_t N2 = N>
+    static std::enable_if_t<M2 == N2, Matrix> identity() {
+        Matrix result;
+        for (size_type i = 0; i < M; ++i) {
+            result(i, i) = T(1);
+        }
+        return result;
+    }
+
+    /**
+     * @brief Create diagonal matrix from vector (only for square matrices)
+     * @param diag Vector of diagonal elements
+     * @return Diagonal matrix
+     */
+    template<std::size_t M2 = M, std::size_t N2 = N>
+    static std::enable_if_t<M2 == N2, Matrix> diagonal(const Vector<T, M>& diag) {
+        Matrix result;
+        for (size_type i = 0; i < M; ++i) {
+            result(i, i) = diag[i];
+        }
+        return result;
+    }
+
+    /**
+     * @brief Create zero matrix (static factory)
+     * @return Zero matrix
+     */
+    static Matrix zero() {
+        return zeros();
+    }
+
+    // Property checking methods
+
+    /**
+     * @brief Check if matrix is symmetric (only for square matrices)
+     * @param tol Tolerance for comparison
+     * @return true if symmetric
+     */
+    template<std::size_t M2 = M, std::size_t N2 = N>
+    std::enable_if_t<M2 == N2, bool> is_symmetric(T tol = tolerance<T>) const {
+        for (size_type i = 0; i < M; ++i) {
+            for (size_type j = i + 1; j < N; ++j) {
+                using std::abs;
+                if (abs((*this)(i, j) - (*this)(j, i)) > tol) {
+                    return false;
+                }
+            }
+        }
+        return true;
+    }
+
+    /**
+     * @brief Check if matrix is skew-symmetric (only for square matrices)
+     * @param tol Tolerance for comparison
+     * @return true if skew-symmetric
+     */
+    template<std::size_t M2 = M, std::size_t N2 = N>
+    std::enable_if_t<M2 == N2, bool> is_skew_symmetric(T tol = tolerance<T>) const {
+        for (size_type i = 0; i < M; ++i) {
+            // Diagonal must be zero
+            using std::abs;
+            if (abs((*this)(i, i)) > tol) {
+                return false;
+            }
+            for (size_type j = i + 1; j < N; ++j) {
+                if (abs((*this)(i, j) + (*this)(j, i)) > tol) {
+                    return false;
+                }
+            }
+        }
+        return true;
+    }
+
+    /**
+     * @brief Check if matrix is diagonal (only for square matrices)
+     * @param tol Tolerance for comparison
+     * @return true if diagonal
+     */
+    template<std::size_t M2 = M, std::size_t N2 = N>
+    std::enable_if_t<M2 == N2, bool> is_diagonal(T tol = tolerance<T>) const {
+        for (size_type i = 0; i < M; ++i) {
+            for (size_type j = 0; j < N; ++j) {
+                if (i != j) {
+                    using std::abs;
+                    if (abs((*this)(i, j)) > tol) {
+                        return false;
+                    }
+                }
+            }
+        }
+        return true;
+    }
+
+    // Determinant (general template, specialized for 2x2, 3x3)
+    /**
+     * @brief Compute determinant (only for square matrices)
+     * @return Determinant value
+     */
+    template<std::size_t M2 = M, std::size_t N2 = N>
+    std::enable_if_t<M2 == N2 && M2 != 2 && M2 != 3, T> determinant() const {
+        // For 4x4 and larger, use LU decomposition
+        return determinant_lu();
+    }
+
+    // Inverse (general template, specialized for 2x2, 3x3)
+    /**
+     * @brief Compute matrix inverse (only for square matrices)
+     * @return Inverse matrix
+     */
+    template<std::size_t M2 = M, std::size_t N2 = N>
+    std::enable_if_t<M2 == N2 && M2 != 2 && M2 != 3, Matrix> inverse() const {
+        // For 4x4 and larger, use Gauss-Jordan elimination
+        return inverse_gauss_jordan();
+    }
+
+private:
+    // LU decomposition for determinant (4x4 and larger)
+    T determinant_lu() const {
+        Matrix<T, M, M> lu = *this;
+        T det = T(1);
+
+        for (size_type k = 0; k < M - 1; ++k) {
+            // Find pivot
+            size_type pivot = k;
+            T max_val = std::abs(lu(k, k));
+            for (size_type i = k + 1; i < M; ++i) {
+                T val = std::abs(lu(i, k));
+                if (val > max_val) {
+                    max_val = val;
+                    pivot = i;
+                }
+            }
+
+            // Swap rows if needed
+            if (pivot != k) {
+                for (size_type j = 0; j < M; ++j) {
+                    std::swap(lu(k, j), lu(pivot, j));
+                }
+                det = -det;  // Row swap changes sign
+            }
+
+            // Check for singularity
+            if (approx_zero(lu(k, k))) {
+                return T(0);
+            }
+
+            // Eliminate column
+            for (size_type i = k + 1; i < M; ++i) {
+                T factor = lu(i, k) / lu(k, k);
+                for (size_type j = k + 1; j < M; ++j) {
+                    lu(i, j) -= factor * lu(k, j);
+                }
+            }
+
+            det *= lu(k, k);
+        }
+        det *= lu(M - 1, M - 1);
+
+        return det;
+    }
+
+    // Gauss-Jordan elimination for inverse (4x4 and larger)
+    Matrix inverse_gauss_jordan() const {
+        Matrix<T, M, M> aug;  // Augmented matrix [A | I]
+        Matrix<T, M, M> result = Matrix::identity();
+
+        // Copy this matrix to augmented matrix
+        for (size_type i = 0; i < M; ++i) {
+            for (size_type j = 0; j < M; ++j) {
+                aug(i, j) = (*this)(i, j);
+            }
+        }
+
+        // Forward elimination with partial pivoting
+        for (size_type k = 0; k < M; ++k) {
+            // Find pivot
+            size_type pivot = k;
+            T max_val = std::abs(aug(k, k));
+            for (size_type i = k + 1; i < M; ++i) {
+                T val = std::abs(aug(i, k));
+                if (val > max_val) {
+                    max_val = val;
+                    pivot = i;
+                }
+            }
+
+            // Swap rows
+            if (pivot != k) {
+                for (size_type j = 0; j < M; ++j) {
+                    std::swap(aug(k, j), aug(pivot, j));
+                    std::swap(result(k, j), result(pivot, j));
+                }
+            }
+
+            // Check for singularity
+            if (approx_zero(aug(k, k))) {
+                throw std::runtime_error("Matrix is singular");
+            }
+
+            // Scale pivot row
+            T pivot_val = aug(k, k);
+            for (size_type j = 0; j < M; ++j) {
+                aug(k, j) /= pivot_val;
+                result(k, j) /= pivot_val;
+            }
+
+            // Eliminate column
+            for (size_type i = 0; i < M; ++i) {
+                if (i != k) {
+                    T factor = aug(i, k);
+                    for (size_type j = 0; j < M; ++j) {
+                        aug(i, j) -= factor * aug(k, j);
+                        result(i, j) -= factor * result(k, j);
+                    }
+                }
+            }
+        }
+
+        return result;
+    }
+
+    // Iterators
+public:
+    T* begin() { return data_; }
+    T* end() { return data_ + M * N; }
+    const T* begin() const { return data_; }
+    const T* end() const { return data_ + M * N; }
+    const T* cbegin() const { return data_; }
+    const T* cend() const { return data_ + M * N; }
+};
+
+// Specialization for 2x2 determinant (analytical formula)
+template<typename T>
+inline T determinant_2x2(const Matrix<T, 2, 2>& m) {
+    return m(0, 0) * m(1, 1) - m(0, 1) * m(1, 0);
+}
+
+// Specialization for 2x2 inverse (Cramer's rule)
+template<typename T>
+inline Matrix<T, 2, 2> inverse_2x2(const Matrix<T, 2, 2>& m) {
+    T det = determinant_2x2(m);
+    if (approx_zero(det)) {
+        throw std::runtime_error("Matrix is singular");
+    }
+
+    T inv_det = T(1) / det;
+    return Matrix<T, 2, 2>{
+        { m(1, 1) * inv_det, -m(0, 1) * inv_det},
+        {-m(1, 0) * inv_det,  m(0, 0) * inv_det}
+    };
+}
+
+// Specialization for 3x3 determinant (Sarrus rule)
+template<typename T>
+inline T determinant_3x3(const Matrix<T, 3, 3>& m) {
+    return m(0, 0) * (m(1, 1) * m(2, 2) - m(1, 2) * m(2, 1))
+         - m(0, 1) * (m(1, 0) * m(2, 2) - m(1, 2) * m(2, 0))
+         + m(0, 2) * (m(1, 0) * m(2, 1) - m(1, 1) * m(2, 0));
+}
+
+// Specialization for 3x3 inverse (Cramer's rule / adjugate method)
+template<typename T>
+inline Matrix<T, 3, 3> inverse_3x3(const Matrix<T, 3, 3>& m) {
+    T det = determinant_3x3(m);
+    if (approx_zero(det)) {
+        throw std::runtime_error("Matrix is singular");
+    }
+
+    T inv_det = T(1) / det;
+
+    // Compute adjugate matrix (transpose of cofactor matrix)
+    Matrix<T, 3, 3> adj;
+    adj(0, 0) =  (m(1, 1) * m(2, 2) - m(1, 2) * m(2, 1));
+    adj(0, 1) = -(m(0, 1) * m(2, 2) - m(0, 2) * m(2, 1));
+    adj(0, 2) =  (m(0, 1) * m(1, 2) - m(0, 2) * m(1, 1));
+
+    adj(1, 0) = -(m(1, 0) * m(2, 2) - m(1, 2) * m(2, 0));
+    adj(1, 1) =  (m(0, 0) * m(2, 2) - m(0, 2) * m(2, 0));
+    adj(1, 2) = -(m(0, 0) * m(1, 2) - m(0, 2) * m(1, 0));
+
+    adj(2, 0) =  (m(1, 0) * m(2, 1) - m(1, 1) * m(2, 0));
+    adj(2, 1) = -(m(0, 0) * m(2, 1) - m(0, 1) * m(2, 0));
+    adj(2, 2) =  (m(0, 0) * m(1, 1) - m(0, 1) * m(1, 0));
+
+    return adj * inv_det;
+}
+
+// Template specializations for 2x2 Matrix determinant and inverse
+template<typename T>
+class Matrix<T, 2, 2> : public MatrixExpr<Matrix<T, 2, 2>> {
+    static constexpr std::size_t M = 2;
+    static constexpr std::size_t N = 2;
+
+private:
+    alignas(kFEFixedObjectAlignmentBytes) T data_[4];
+
+    static constexpr std::size_t index(std::size_t i, std::size_t j) {
+        return i * 2 + j;
+    }
+
+public:
+    using value_type = T;
+    using size_type = std::size_t;
+
+    // Include all the same constructors and methods as the general template
+    constexpr Matrix() : data_{} {}
+    constexpr explicit Matrix(T value) {
+        for (size_type i = 0; i < 4; ++i) {
+            data_[i] = value;
+        }
+    }
+    constexpr Matrix(std::initializer_list<std::initializer_list<T>> init) : data_{} {
+        size_type row = 0;
+        for (auto row_init : init) {
+            if (row >= 2) break;
+            size_type col = 0;
+            for (auto val : row_init) {
+                if (col >= 2) break;
+                (*this)(row, col) = val;
+                ++col;
+            }
+            ++row;
+        }
+    }
+
+    template<typename Expr>
+    Matrix(const MatrixExpr<Expr>& expr) {
+        const auto& e = expr.derived();
+        for (size_type i = 0; i < 2; ++i) {
+            for (size_type j = 0; j < 2; ++j) {
+                (*this)(i, j) = e(i, j);
+            }
+        }
+    }
+
+    constexpr Matrix(const Matrix&) = default;
+    constexpr Matrix(Matrix&&) noexcept = default;
+    Matrix& operator=(const Matrix&) = default;
+    Matrix& operator=(Matrix&&) noexcept = default;
+
+    template<typename Expr>
+    Matrix& operator=(const MatrixExpr<Expr>& expr) {
+        const auto& e = expr.derived();
+        for (size_type i = 0; i < 2; ++i) {
+            for (size_type j = 0; j < 2; ++j) {
+                (*this)(i, j) = e(i, j);
+            }
+        }
+        return *this;
+    }
+
+    static constexpr size_type rows() { return 2; }
+    static constexpr size_type cols() { return 2; }
+    static constexpr size_type size() { return 4; }
+
+    constexpr T& operator()(size_type i, size_type j) {
+        return data_[index(i, j)];
+    }
+    constexpr const T& operator()(size_type i, size_type j) const {
+        return data_[index(i, j)];
+    }
+
+    T* data() { return data_; }
+    const T* data() const { return data_; }
+
+    void fill(T value) {
+        for (size_type i = 0; i < 4; ++i) {
+            data_[i] = value;
+        }
+    }
+
+    void set_zero() { fill(T{0}); }
+
+    void set_row(size_type i, const Vector<T, 2>& v) {
+        for (size_type j = 0; j < 2; ++j) {
+            (*this)(i, j) = v[j];
+        }
+    }
+
+    void set_column(size_type j, const Vector<T, 2>& v) {
+        for (size_type i = 0; i < 2; ++i) {
+            (*this)(i, j) = v[i];
+        }
+    }
+
+    void set_col(size_type j, const Vector<T, 2>& v) {
+        set_column(j, v);
+    }
+
+    Vector<T, 2> col(size_type j) const {
+        return column(j);
+    }
+
+    static Matrix zero() {
+        return zeros();
+    }
+
+    static Matrix diagonal(const Vector<T, 2>& diag) {
+        Matrix result;
+        result(0, 0) = diag[0];
+        result(1, 1) = diag[1];
+        return result;
+    }
+
+    bool is_symmetric(T tol = tolerance<T>) const {
+        using std::abs;
+        return abs((*this)(0, 1) - (*this)(1, 0)) <= tol;
+    }
+
+    bool is_skew_symmetric(T tol = tolerance<T>) const {
+        using std::abs;
+        // Diagonal must be zero
+        if (abs((*this)(0, 0)) > tol || abs((*this)(1, 1)) > tol) {
+            return false;
+        }
+        // Off-diagonal must be opposite
+        return abs((*this)(0, 1) + (*this)(1, 0)) <= tol;
+    }
+
+    bool is_diagonal(T tol = tolerance<T>) const {
+        using std::abs;
+        return abs((*this)(0, 1)) <= tol && abs((*this)(1, 0)) <= tol;
+    }
+
+    T frobenius_norm() const {
+        using std::sqrt;
+        T sum = T(0);
+        for (size_type i = 0; i < 4; ++i) {
+            sum += data_[i] * data_[i];
+        }
+        return sqrt(sum);
+    }
+
+    T infinity_norm() const {
+        using std::abs;
+        T row0 = abs((*this)(0, 0)) + abs((*this)(0, 1));
+        T row1 = abs((*this)(1, 0)) + abs((*this)(1, 1));
+        return std::max(row0, row1);
+    }
+
+    T one_norm() const {
+        using std::abs;
+        T col0 = abs((*this)(0, 0)) + abs((*this)(1, 0));
+        T col1 = abs((*this)(0, 1)) + abs((*this)(1, 1));
+        return std::max(col0, col1);
+    }
+
+    Matrix& operator+=(const Matrix& other) {
+        for (size_type i = 0; i < 4; ++i) {
+            data_[i] += other.data_[i];
+        }
+        return *this;
+    }
+
+    Matrix& operator-=(const Matrix& other) {
+        for (size_type i = 0; i < 4; ++i) {
+            data_[i] -= other.data_[i];
+        }
+        return *this;
+    }
+
+    Matrix& operator*=(T scalar) {
+        for (size_type i = 0; i < 4; ++i) {
+            data_[i] *= scalar;
+        }
+        return *this;
+    }
+
+    Matrix& operator/=(T scalar) {
+        const T inv = T(1) / scalar;
+        return (*this) *= inv;
+    }
+
+    Matrix<T, 2, 2> transpose() const {
+        return Matrix<T, 2, 2>{
+            {(*this)(0, 0), (*this)(1, 0)},
+            {(*this)(0, 1), (*this)(1, 1)}
+        };
+    }
+
+    T trace() const {
+        return (*this)(0, 0) + (*this)(1, 1);
+    }
+
+    static Matrix identity() {
+        Matrix result;
+        result(0, 0) = T(1);
+        result(1, 1) = T(1);
+        return result;
+    }
+
+    static Matrix zeros() {
+        return Matrix();
+    }
+
+    static Matrix ones() {
+        return Matrix(T(1));
+    }
+
+    // Specialized 2x2 determinant
+    T determinant() const {
+        return determinant_2x2(*this);
+    }
+
+    // Specialized 2x2 inverse
+    Matrix inverse() const {
+        return inverse_2x2(*this);
+    }
+
+    Vector<T, 2> row(size_type i) const {
+        return Vector<T, 2>{(*this)(i, 0), (*this)(i, 1)};
+    }
+
+    Vector<T, 2> column(size_type j) const {
+        return Vector<T, 2>{(*this)(0, j), (*this)(1, j)};
+    }
+
+    T* begin() { return data_; }
+    T* end() { return data_ + 4; }
+    const T* begin() const { return data_; }
+    const T* end() const { return data_ + 4; }
+};
+
+// Template specialization for 3x3 Matrix
+template<typename T>
+class Matrix<T, 3, 3> : public MatrixExpr<Matrix<T, 3, 3>> {
+    static constexpr std::size_t M = 3;
+    static constexpr std::size_t N = 3;
+
+private:
+    alignas(kFEFixedObjectAlignmentBytes) T data_[9];
+
+    static constexpr std::size_t index(std::size_t i, std::size_t j) {
+        return i * 3 + j;
+    }
+
+public:
+    using value_type = T;
+    using size_type = std::size_t;
+
+    constexpr Matrix() : data_{} {}
+    constexpr explicit Matrix(T value) {
+        for (size_type i = 0; i < 9; ++i) {
+            data_[i] = value;
+        }
+    }
+    constexpr Matrix(std::initializer_list<std::initializer_list<T>> init) : data_{} {
+        size_type row = 0;
+        for (auto row_init : init) {
+            if (row >= 3) break;
+            size_type col = 0;
+            for (auto val : row_init) {
+                if (col >= 3) break;
+                (*this)(row, col) = val;
+                ++col;
+            }
+            ++row;
+        }
+    }
+
+    template<typename Expr>
+    Matrix(const MatrixExpr<Expr>& expr) {
+        const auto& e = expr.derived();
+        for (size_type i = 0; i < 3; ++i) {
+            for (size_type j = 0; j < 3; ++j) {
+                (*this)(i, j) = e(i, j);
+            }
+        }
+    }
+
+    constexpr Matrix(const Matrix&) = default;
+    constexpr Matrix(Matrix&&) noexcept = default;
+    Matrix& operator=(const Matrix&) = default;
+    Matrix& operator=(Matrix&&) noexcept = default;
+
+    template<typename Expr>
+    Matrix& operator=(const MatrixExpr<Expr>& expr) {
+        const auto& e = expr.derived();
+        for (size_type i = 0; i < 3; ++i) {
+            for (size_type j = 0; j < 3; ++j) {
+                (*this)(i, j) = e(i, j);
+            }
+        }
+        return *this;
+    }
+
+    static constexpr size_type rows() { return 3; }
+    static constexpr size_type cols() { return 3; }
+    static constexpr size_type size() { return 9; }
+
+    constexpr T& operator()(size_type i, size_type j) {
+        return data_[index(i, j)];
+    }
+    constexpr const T& operator()(size_type i, size_type j) const {
+        return data_[index(i, j)];
+    }
+
+    T* data() { return data_; }
+    const T* data() const { return data_; }
+
+    void fill(T value) {
+        for (size_type i = 0; i < 9; ++i) {
+            data_[i] = value;
+        }
+    }
+
+    void set_zero() { fill(T{0}); }
+
+    void set_row(size_type i, const Vector<T, 3>& v) {
+        for (size_type j = 0; j < 3; ++j) {
+            (*this)(i, j) = v[j];
+        }
+    }
+
+    void set_column(size_type j, const Vector<T, 3>& v) {
+        for (size_type i = 0; i < 3; ++i) {
+            (*this)(i, j) = v[i];
+        }
+    }
+
+    void set_col(size_type j, const Vector<T, 3>& v) {
+        set_column(j, v);
+    }
+
+    Vector<T, 3> col(size_type j) const {
+        return column(j);
+    }
+
+    static Matrix zero() {
+        return zeros();
+    }
+
+    static Matrix diagonal(const Vector<T, 3>& diag) {
+        Matrix result;
+        result(0, 0) = diag[0];
+        result(1, 1) = diag[1];
+        result(2, 2) = diag[2];
+        return result;
+    }
+
+    bool is_symmetric(T tol = tolerance<T>) const {
+        using std::abs;
+        for (size_type i = 0; i < 3; ++i) {
+            for (size_type j = i + 1; j < 3; ++j) {
+                if (abs((*this)(i, j) - (*this)(j, i)) > tol) {
+                    return false;
+                }
+            }
+        }
+        return true;
+    }
+
+    bool is_skew_symmetric(T tol = tolerance<T>) const {
+        using std::abs;
+        // Diagonal must be zero
+        for (size_type i = 0; i < 3; ++i) {
+            if (abs((*this)(i, i)) > tol) {
+                return false;
+            }
+        }
+        // Off-diagonal must be opposite
+        for (size_type i = 0; i < 3; ++i) {
+            for (size_type j = i + 1; j < 3; ++j) {
+                if (abs((*this)(i, j) + (*this)(j, i)) > tol) {
+                    return false;
+                }
+            }
+        }
+        return true;
+    }
+
+    bool is_diagonal(T tol = tolerance<T>) const {
+        using std::abs;
+        for (size_type i = 0; i < 3; ++i) {
+            for (size_type j = 0; j < 3; ++j) {
+                if (i != j && abs((*this)(i, j)) > tol) {
+                    return false;
+                }
+            }
+        }
+        return true;
+    }
+
+    T frobenius_norm() const {
+        using std::sqrt;
+        T sum = T(0);
+        for (size_type i = 0; i < 9; ++i) {
+            sum += data_[i] * data_[i];
+        }
+        return sqrt(sum);
+    }
+
+    T infinity_norm() const {
+        using std::abs;
+        T max_row_sum = T(0);
+        for (size_type i = 0; i < 3; ++i) {
+            T row_sum = T(0);
+            for (size_type j = 0; j < 3; ++j) {
+                row_sum += abs((*this)(i, j));
+            }
+            max_row_sum = std::max(max_row_sum, row_sum);
+        }
+        return max_row_sum;
+    }
+
+    T one_norm() const {
+        using std::abs;
+        T max_col_sum = T(0);
+        for (size_type j = 0; j < 3; ++j) {
+            T col_sum = T(0);
+            for (size_type i = 0; i < 3; ++i) {
+                col_sum += abs((*this)(i, j));
+            }
+            max_col_sum = std::max(max_col_sum, col_sum);
+        }
+        return max_col_sum;
+    }
+
+    Matrix& operator+=(const Matrix& other) {
+        for (size_type i = 0; i < 9; ++i) {
+            data_[i] += other.data_[i];
+        }
+        return *this;
+    }
+
+    Matrix& operator-=(const Matrix& other) {
+        for (size_type i = 0; i < 9; ++i) {
+            data_[i] -= other.data_[i];
+        }
+        return *this;
+    }
+
+    Matrix& operator*=(T scalar) {
+        for (size_type i = 0; i < 9; ++i) {
+            data_[i] *= scalar;
+        }
+        return *this;
+    }
+
+    Matrix& operator/=(T scalar) {
+        const T inv = T(1) / scalar;
+        return (*this) *= inv;
+    }
+
+    Matrix<T, 3, 3> transpose() const {
+        Matrix<T, 3, 3> result;
+        for (size_type i = 0; i < 3; ++i) {
+            for (size_type j = 0; j < 3; ++j) {
+                result(j, i) = (*this)(i, j);
+            }
+        }
+        return result;
+    }
+
+    T trace() const {
+        return (*this)(0, 0) + (*this)(1, 1) + (*this)(2, 2);
+    }
+
+    static Matrix identity() {
+        Matrix result;
+        result(0, 0) = T(1);
+        result(1, 1) = T(1);
+        result(2, 2) = T(1);
+        return result;
+    }
+
+    static Matrix zeros() {
+        return Matrix();
+    }
+
+    static Matrix ones() {
+        return Matrix(T(1));
+    }
+
+    // Specialized 3x3 determinant
+    T determinant() const {
+        return determinant_3x3(*this);
+    }
+
+    // Specialized 3x3 inverse
+    Matrix inverse() const {
+        return inverse_3x3(*this);
+    }
+
+    Vector<T, 3> row(size_type i) const {
+        return Vector<T, 3>{(*this)(i, 0), (*this)(i, 1), (*this)(i, 2)};
+    }
+
+    Vector<T, 3> column(size_type j) const {
+        return Vector<T, 3>{(*this)(0, j), (*this)(1, j), (*this)(2, j)};
+    }
+
+    T* begin() { return data_; }
+    T* end() { return data_ + 9; }
+    const T* begin() const { return data_; }
+    const T* end() const { return data_ + 9; }
+};
+
+// Type aliases for common matrix types
+template<typename T> using Matrix2x2 = Matrix<T, 2, 2>;
+template<typename T> using Matrix3x3 = Matrix<T, 3, 3>;
+template<typename T> using Matrix4x4 = Matrix<T, 4, 4>;
+template<typename T> using Matrix2x3 = Matrix<T, 2, 3>;
+template<typename T> using Matrix3x2 = Matrix<T, 3, 2>;
+template<typename T> using Matrix3x4 = Matrix<T, 3, 4>;
+template<typename T> using Matrix4x3 = Matrix<T, 4, 3>;
+
+// Double precision aliases
+using Matrix2x2d = Matrix2x2<double>;
+using Matrix3x3d = Matrix3x3<double>;
+using Matrix4x4d = Matrix4x4<double>;
+
+// Single precision aliases
+using Matrix2x2f = Matrix2x2<float>;
+using Matrix3x3f = Matrix3x3<float>;
+using Matrix4x4f = Matrix4x4<float>;
+
+// Matrix-vector multiplication
+template<typename T, std::size_t M, std::size_t N>
+inline Vector<T, M> operator*(const Matrix<T, M, N>& A, const Vector<T, N>& x) {
+    Vector<T, M> result;
+    for (std::size_t i = 0; i < M; ++i) {
+        T sum = T(0);
+        for (std::size_t j = 0; j < N; ++j) {
+            sum += A(i, j) * x[j];
+        }
+        result[i] = sum;
+    }
+    return result;
+}
+
+// Vector-matrix multiplication (row vector * matrix)
+template<typename T, std::size_t M, std::size_t N>
+inline Vector<T, N> operator*(const Vector<T, M>& x, const Matrix<T, M, N>& A) {
+    Vector<T, N> result;
+    for (std::size_t j = 0; j < N; ++j) {
+        T sum = T(0);
+        for (std::size_t i = 0; i < M; ++i) {
+            sum += x[i] * A(i, j);
+        }
+        result[j] = sum;
+    }
+    return result;
+}
+
+// Matrix-matrix multiplication
+template<typename T, std::size_t M, std::size_t N, std::size_t P>
+inline Matrix<T, M, P> operator*(const Matrix<T, M, N>& A, const Matrix<T, N, P>& B) {
+    Matrix<T, M, P> result;
+    for (std::size_t i = 0; i < M; ++i) {
+        for (std::size_t k = 0; k < N; ++k) {
+            T a_ik = A(i, k);
+            for (std::size_t j = 0; j < P; ++j) {
+                result(i, j) += a_ik * B(k, j);
+            }
+        }
+    }
+    return result;
+}
+
+// Free functions
+
+/**
+ * @brief Compute matrix transpose
+ */
+template<typename T, std::size_t M, std::size_t N>
+inline Matrix<T, N, M> transpose(const Matrix<T, M, N>& m) {
+    return m.transpose();
+}
+
+/**
+ * @brief Compute matrix trace
+ */
+template<typename T, std::size_t N>
+inline T trace(const Matrix<T, N, N>& m) {
+    return m.trace();
+}
+
+/**
+ * @brief Compute matrix determinant
+ */
+template<typename T, std::size_t N>
+inline T determinant(const Matrix<T, N, N>& m) {
+    return m.determinant();
+}
+
+/**
+ * @brief Compute matrix inverse
+ */
+template<typename T, std::size_t N>
+inline Matrix<T, N, N> inverse(const Matrix<T, N, N>& m) {
+    return m.inverse();
+}
+
+/**
+ * @brief Compute Frobenius norm
+ */
+template<typename T, std::size_t M, std::size_t N>
+inline T frobenius_norm(const Matrix<T, M, N>& m) {
+    return m.frobenius_norm();
+}
+
+/**
+ * @brief Component-wise absolute value
+ */
+template<typename T, std::size_t M, std::size_t N>
+inline Matrix<T, M, N> abs(const Matrix<T, M, N>& m) {
+    Matrix<T, M, N> result;
+    for (std::size_t i = 0; i < M; ++i) {
+        for (std::size_t j = 0; j < N; ++j) {
+            using std::abs;
+            result(i, j) = abs(m(i, j));
+        }
+    }
+    return result;
+}
+
+/**
+ * @brief Component-wise minimum
+ */
+template<typename T, std::size_t M, std::size_t N>
+inline Matrix<T, M, N> min(const Matrix<T, M, N>& a, const Matrix<T, M, N>& b) {
+    Matrix<T, M, N> result;
+    for (std::size_t i = 0; i < M; ++i) {
+        for (std::size_t j = 0; j < N; ++j) {
+            result(i, j) = std::min(a(i, j), b(i, j));
+        }
+    }
+    return result;
+}
+
+/**
+ * @brief Component-wise maximum
+ */
+template<typename T, std::size_t M, std::size_t N>
+inline Matrix<T, M, N> max(const Matrix<T, M, N>& a, const Matrix<T, M, N>& b) {
+    Matrix<T, M, N> result;
+    for (std::size_t i = 0; i < M; ++i) {
+        for (std::size_t j = 0; j < N; ++j) {
+            result(i, j) = std::max(a(i, j), b(i, j));
+        }
+    }
+    return result;
+}
+
+/**
+ * @brief Outer product of two vectors
+ */
+template<typename T, std::size_t M, std::size_t N>
+inline Matrix<T, M, N> outer_product(const Vector<T, M>& u, const Vector<T, N>& v) {
+    Matrix<T, M, N> result;
+    for (std::size_t i = 0; i < M; ++i) {
+        for (std::size_t j = 0; j < N; ++j) {
+            result(i, j) = u[i] * v[j];
+        }
+    }
+    return result;
+}
+
+/**
+ * @brief Check if two matrices are approximately equal
+ */
+template<typename T, std::size_t M, std::size_t N>
+inline bool approx_equal(const Matrix<T, M, N>& a, const Matrix<T, M, N>& b, T tol = tolerance<T>) {
+    for (std::size_t i = 0; i < M; ++i) {
+        for (std::size_t j = 0; j < N; ++j) {
+            if (!approx_equal(a(i, j), b(i, j), tol)) {
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+/**
+ * @brief Stream output operator for matrices
+ * @tparam T Scalar type
+ * @tparam M Number of rows
+ * @tparam N Number of columns
+ * @param os Output stream
+ * @param m Matrix to output
+ * @return Reference to output stream
+ */
+template<typename T, std::size_t M, std::size_t N>
+inline std::ostream& operator<<(std::ostream& os, const Matrix<T, M, N>& m) {
+    os << "[";
+    for (std::size_t i = 0; i < M; ++i) {
+        if (i > 0) os << "\n ";
+        os << "[";
+        for (std::size_t j = 0; j < N; ++j) {
+            if (j > 0) os << ", ";
+            os << m(i, j);
+        }
+        os << "]";
+    }
+    os << "]";
+    return os;
+}
+
+} // namespace math
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_MATH_MATRIX_H
diff --git a/Code/Source/solver/FE/Math/MatrixExpr.h b/Code/Source/solver/FE/Math/MatrixExpr.h
new file mode 100644
index 000000000..da2f8c8d6
--- /dev/null
+++ b/Code/Source/solver/FE/Math/MatrixExpr.h
@@ -0,0 +1,626 @@
+#ifndef SVMP_FE_MATH_MATRIX_EXPR_H
+#define SVMP_FE_MATH_MATRIX_EXPR_H
+
+/**
+ * @file MatrixExpr.h
+ * @brief Expression template infrastructure for lazy evaluation of matrix operations
+ *
+ * This header provides expression templates that enable compound matrix operations
+ * without creating temporary objects. Operations are evaluated lazily at the point
+ * of assignment, eliminating intermediate allocations and improving performance.
+ */
+
+#include <algorithm>
+#include <cstddef>
+#include <type_traits>
+#include <cmath>
+#include "ExpressionOps.h"
+
+namespace svmp {
+namespace FE {
+namespace math {
+
+/**
+ * @brief Base class for all matrix expressions using CRTP
+ * @tparam Derived The derived expression type
+ *
+ * This uses the Curiously Recurring Template Pattern (CRTP) to provide
+ * static polymorphism for expression templates.
+ */
+template<typename Derived>
+class MatrixExpr {
+public:
+    /**
+     * @brief Get the derived expression
+     * @return Reference to the derived type
+     */
+    const Derived& derived() const {
+        return static_cast<const Derived&>(*this);
+    }
+
+    /**
+     * @brief Get the derived expression (non-const)
+     * @return Reference to the derived type
+     */
+    Derived& derived() {
+        return static_cast<Derived&>(*this);
+    }
+
+    /**
+     * @brief Access element by row and column indices
+     * @param i Row index
+     * @param j Column index
+     * @return Value at (i,j)
+     */
+    auto operator()(std::size_t i, std::size_t j) const {
+        return derived()(i, j);
+    }
+
+    /**
+     * @brief Get number of rows
+     * @return Number of rows
+     */
+    std::size_t rows() const {
+        return derived().rows();
+    }
+
+    /**
+     * @brief Get number of columns
+     * @return Number of columns
+     */
+    std::size_t cols() const {
+        return derived().cols();
+    }
+};
+
+/**
+ * @brief Binary expression for element-wise operations between two matrix expressions
+ * @tparam LHS Left-hand side expression type
+ * @tparam RHS Right-hand side expression type
+ * @tparam Op Binary operation functor
+ */
+template<typename LHS, typename RHS, typename Op>
+class MatrixBinaryExpr : public MatrixExpr<MatrixBinaryExpr<LHS, RHS, Op>> {
+private:
+    const LHS& lhs_;
+    const RHS& rhs_;
+    Op op_;
+
+public:
+    /**
+     * @brief Construct binary expression
+     * @param lhs Left operand
+     * @param rhs Right operand
+     * @param op Operation to apply
+     */
+    constexpr MatrixBinaryExpr(const LHS& lhs, const RHS& rhs, Op op = Op{})
+        : lhs_(lhs), rhs_(rhs), op_(op) {}
+
+    /**
+     * @brief Access element at (i,j)
+     * @param i Row index
+     * @param j Column index
+     * @return Result of operation on elements at (i,j)
+     */
+    constexpr auto operator()(std::size_t i, std::size_t j) const {
+        return op_(lhs_(i, j), rhs_(i, j));
+    }
+
+    /**
+     * @brief Get number of rows
+     * @return Number of rows
+     */
+    constexpr std::size_t rows() const {
+        return lhs_.rows();
+    }
+
+    /**
+     * @brief Get number of columns
+     * @return Number of columns
+     */
+    constexpr std::size_t cols() const {
+        return lhs_.cols();
+    }
+};
+
+/**
+ * @brief Unary expression for element-wise operations on a single matrix expression
+ * @tparam Expr Expression type
+ * @tparam Op Unary operation functor
+ */
+template<typename Expr, typename Op>
+class MatrixUnaryExpr : public MatrixExpr<MatrixUnaryExpr<Expr, Op>> {
+private:
+    const Expr& expr_;
+    Op op_;
+
+public:
+    /**
+     * @brief Construct unary expression
+     * @param expr Operand expression
+     * @param op Operation to apply
+     */
+    constexpr MatrixUnaryExpr(const Expr& expr, Op op = Op{})
+        : expr_(expr), op_(op) {}
+
+    /**
+     * @brief Access element at (i,j)
+     * @param i Row index
+     * @param j Column index
+     * @return Result of operation on element at (i,j)
+     */
+    constexpr auto operator()(std::size_t i, std::size_t j) const {
+        return op_(expr_(i, j));
+    }
+
+    /**
+     * @brief Get number of rows
+     * @return Number of rows
+     */
+    constexpr std::size_t rows() const {
+        return expr_.rows();
+    }
+
+    /**
+     * @brief Get number of columns
+     * @return Number of columns
+     */
+    constexpr std::size_t cols() const {
+        return expr_.cols();
+    }
+};
+
+/**
+ * @brief Scalar multiplication expression
+ * @tparam Expr Matrix expression type
+ * @tparam Scalar Scalar type
+ */
+template<typename Expr, typename Scalar>
+class MatrixScalarExpr : public MatrixExpr<MatrixScalarExpr<Expr, Scalar>> {
+private:
+    const Expr& expr_;
+    Scalar scalar_;
+
+public:
+    /**
+     * @brief Construct scalar multiplication expression
+     * @param expr Matrix expression
+     * @param scalar Scalar value
+     */
+    constexpr MatrixScalarExpr(const Expr& expr, Scalar scalar)
+        : expr_(expr), scalar_(scalar) {}
+
+    /**
+     * @brief Access element at (i,j)
+     * @param i Row index
+     * @param j Column index
+     * @return Element multiplied by scalar
+     */
+    constexpr auto operator()(std::size_t i, std::size_t j) const {
+        return expr_(i, j) * scalar_;
+    }
+
+    /**
+     * @brief Get number of rows
+     * @return Number of rows
+     */
+    constexpr std::size_t rows() const {
+        return expr_.rows();
+    }
+
+    /**
+     * @brief Get number of columns
+     * @return Number of columns
+     */
+    constexpr std::size_t cols() const {
+        return expr_.cols();
+    }
+};
+
+/**
+ * @brief Scalar division expression
+ * @tparam Expr Matrix expression type
+ * @tparam Scalar Scalar type
+ */
+template<typename Expr, typename Scalar>
+class MatrixScalarDivExpr : public MatrixExpr<MatrixScalarDivExpr<Expr, Scalar>> {
+private:
+    const Expr& expr_;
+    Scalar scalar_;
+
+public:
+    /**
+     * @brief Construct scalar division expression
+     * @param expr Matrix expression
+     * @param scalar Scalar divisor
+     */
+    constexpr MatrixScalarDivExpr(const Expr& expr, Scalar scalar)
+        : expr_(expr), scalar_(scalar) {}
+
+    /**
+     * @brief Access element at (i,j)
+     * @param i Row index
+     * @param j Column index
+     * @return Element divided by scalar
+     */
+    constexpr auto operator()(std::size_t i, std::size_t j) const {
+        return expr_(i, j) / scalar_;
+    }
+
+    /**
+     * @brief Get number of rows
+     * @return Number of rows
+     */
+    constexpr std::size_t rows() const {
+        return expr_.rows();
+    }
+
+    /**
+     * @brief Get number of columns
+     * @return Number of columns
+     */
+    constexpr std::size_t cols() const {
+        return expr_.cols();
+    }
+};
+
+/**
+ * @brief Matrix multiplication expression (lazy evaluation)
+ * @tparam LHS Left matrix expression type
+ * @tparam RHS Right matrix expression type
+ *
+ * Computes matrix multiplication A*B lazily
+ */
+template<typename LHS, typename RHS>
+class MatrixMulExpr : public MatrixExpr<MatrixMulExpr<LHS, RHS>> {
+private:
+    const LHS& lhs_;
+    const RHS& rhs_;
+
+public:
+    /**
+     * @brief Construct matrix multiplication expression
+     * @param lhs Left matrix
+     * @param rhs Right matrix
+     */
+    constexpr MatrixMulExpr(const LHS& lhs, const RHS& rhs)
+        : lhs_(lhs), rhs_(rhs) {}
+
+    /**
+     * @brief Compute element at (i,j)
+     * @param i Row index
+     * @param j Column index
+     * @return Dot product of row i of lhs and column j of rhs
+     */
+    constexpr auto operator()(std::size_t i, std::size_t j) const {
+        using result_type = decltype(lhs_(0, 0) * rhs_(0, 0));
+        result_type sum = result_type{0};
+        const auto n = lhs_.cols();
+        for (std::size_t k = 0; k < n; ++k) {
+            sum += lhs_(i, k) * rhs_(k, j);
+        }
+        return sum;
+    }
+
+    /**
+     * @brief Get number of rows (from left matrix)
+     * @return Number of rows
+     */
+    constexpr std::size_t rows() const {
+        return lhs_.rows();
+    }
+
+    /**
+     * @brief Get number of columns (from right matrix)
+     * @return Number of columns
+     */
+    constexpr std::size_t cols() const {
+        return rhs_.cols();
+    }
+};
+
+/**
+ * @brief Transpose expression (lazy evaluation)
+ * @tparam Expr Matrix expression type
+ */
+template<typename Expr>
+class TransposeExpr : public MatrixExpr<TransposeExpr<Expr>> {
+private:
+    const Expr& expr_;
+
+public:
+    /**
+     * @brief Construct transpose expression
+     * @param expr Matrix expression to transpose
+     */
+    constexpr explicit TransposeExpr(const Expr& expr)
+        : expr_(expr) {}
+
+    /**
+     * @brief Access transposed element
+     * @param i Row index (becomes column in original)
+     * @param j Column index (becomes row in original)
+     * @return Element at (j,i) of original matrix
+     */
+    constexpr auto operator()(std::size_t i, std::size_t j) const {
+        return expr_(j, i);
+    }
+
+    /**
+     * @brief Get number of rows (columns of original)
+     * @return Number of rows
+     */
+    constexpr std::size_t rows() const {
+        return expr_.cols();
+    }
+
+    /**
+     * @brief Get number of columns (rows of original)
+     * @return Number of columns
+     */
+    constexpr std::size_t cols() const {
+        return expr_.rows();
+    }
+};
+
+/**
+ * @brief Diagonal matrix expression (creates diagonal matrix from vector)
+ * @tparam VecExpr Vector expression type
+ */
+template<typename VecExpr>
+class DiagonalExpr : public MatrixExpr<DiagonalExpr<VecExpr>> {
+private:
+    const VecExpr& vec_;
+    std::size_t n_;
+
+public:
+    /**
+     * @brief Construct diagonal matrix from vector
+     * @param vec Vector of diagonal elements
+     * @param n Matrix dimension (default: vector size)
+     */
+    constexpr explicit DiagonalExpr(const VecExpr& vec, std::size_t n = 0)
+        : vec_(vec), n_(n > 0 ? n : vec.size()) {}
+
+    /**
+     * @brief Access element
+     * @param i Row index
+     * @param j Column index
+     * @return Diagonal element if i==j, zero otherwise
+     */
+    constexpr auto operator()(std::size_t i, std::size_t j) const {
+        using result_type = decltype(vec_[0]);
+        return (i == j && i < vec_.size()) ? vec_[i] : result_type{0};
+    }
+
+    /**
+     * @brief Get number of rows
+     * @return Number of rows
+     */
+    constexpr std::size_t rows() const {
+        return n_;
+    }
+
+    /**
+     * @brief Get number of columns
+     * @return Number of columns
+     */
+    constexpr std::size_t cols() const {
+        return n_;
+    }
+};
+
+/**
+ * @brief Addition operator for matrix expressions
+ */
+template<typename LHS, typename RHS,
+         typename = std::enable_if_t<
+             std::is_base_of_v<MatrixExpr<LHS>, LHS> &&
+             std::is_base_of_v<MatrixExpr<RHS>, RHS>
+         >>
+constexpr auto operator+(const MatrixExpr<LHS>& lhs, const MatrixExpr<RHS>& rhs) {
+    return MatrixBinaryExpr<LHS, RHS, detail::ops::Add>(
+        lhs.derived(), rhs.derived(), detail::ops::Add{}
+    );
+}
+
+/**
+ * @brief Subtraction operator for matrix expressions
+ */
+template<typename LHS, typename RHS,
+         typename = std::enable_if_t<
+             std::is_base_of_v<MatrixExpr<LHS>, LHS> &&
+             std::is_base_of_v<MatrixExpr<RHS>, RHS>
+         >>
+constexpr auto operator-(const MatrixExpr<LHS>& lhs, const MatrixExpr<RHS>& rhs) {
+    return MatrixBinaryExpr<LHS, RHS, detail::ops::Sub>(
+        lhs.derived(), rhs.derived(), detail::ops::Sub{}
+    );
+}
+
+/**
+ * @brief Matrix multiplication operator
+ */
+template<typename LHS, typename RHS,
+         typename = std::enable_if_t<
+             std::is_base_of_v<MatrixExpr<LHS>, LHS> &&
+             std::is_base_of_v<MatrixExpr<RHS>, RHS>
+         >>
+constexpr auto operator*(const MatrixExpr<LHS>& lhs, const MatrixExpr<RHS>& rhs) {
+    return MatrixMulExpr<LHS, RHS>(lhs.derived(), rhs.derived());
+}
+
+/**
+ * @brief Element-wise multiplication (Hadamard product)
+ */
+template<typename LHS, typename RHS,
+         typename = std::enable_if_t<
+             std::is_base_of_v<MatrixExpr<LHS>, LHS> &&
+             std::is_base_of_v<MatrixExpr<RHS>, RHS>
+         >>
+constexpr auto hadamard(const MatrixExpr<LHS>& lhs, const MatrixExpr<RHS>& rhs) {
+    return MatrixBinaryExpr<LHS, RHS, detail::ops::Mul>(
+        lhs.derived(), rhs.derived(), detail::ops::Mul{}
+    );
+}
+
+/**
+ * @brief Element-wise division
+ */
+template<typename LHS, typename RHS,
+         typename = std::enable_if_t<
+             std::is_base_of_v<MatrixExpr<LHS>, LHS> &&
+             std::is_base_of_v<MatrixExpr<RHS>, RHS>
+         >>
+constexpr auto hadamard_div(const MatrixExpr<LHS>& lhs, const MatrixExpr<RHS>& rhs) {
+    return MatrixBinaryExpr<LHS, RHS, detail::ops::Div>(
+        lhs.derived(), rhs.derived(), detail::ops::Div{}
+    );
+}
+
+/**
+ * @brief Negation operator for matrix expressions
+ */
+template<typename Expr,
+         typename = std::enable_if_t<
+             std::is_base_of_v<MatrixExpr<Expr>, Expr>
+         >>
+constexpr auto operator-(const MatrixExpr<Expr>& expr) {
+    return MatrixUnaryExpr<Expr, detail::ops::Negate>(
+        expr.derived(), detail::ops::Negate{}
+    );
+}
+
+/**
+ * @brief Scalar multiplication operator (matrix * scalar)
+ */
+template<typename Expr, typename Scalar,
+         typename = std::enable_if_t<
+             std::is_base_of_v<MatrixExpr<Expr>, Expr> &&
+             std::is_arithmetic_v<Scalar>
+         >>
+constexpr auto operator*(const MatrixExpr<Expr>& expr, Scalar scalar) {
+    return MatrixScalarExpr<Expr, Scalar>(expr.derived(), scalar);
+}
+
+/**
+ * @brief Scalar multiplication operator (scalar * matrix)
+ */
+template<typename Scalar, typename Expr,
+         typename = std::enable_if_t<
+             std::is_arithmetic_v<Scalar> &&
+             std::is_base_of_v<MatrixExpr<Expr>, Expr>
+         >>
+constexpr auto operator*(Scalar scalar, const MatrixExpr<Expr>& expr) {
+    return MatrixScalarExpr<Expr, Scalar>(expr.derived(), scalar);
+}
+
+/**
+ * @brief Scalar division operator (matrix / scalar)
+ */
+template<typename Expr, typename Scalar,
+         typename = std::enable_if_t<
+             std::is_base_of_v<MatrixExpr<Expr>, Expr> &&
+             std::is_arithmetic_v<Scalar>
+         >>
+constexpr auto operator/(const MatrixExpr<Expr>& expr, Scalar scalar) {
+    return MatrixScalarDivExpr<Expr, Scalar>(expr.derived(), scalar);
+}
+
+/**
+ * @brief Transpose function
+ */
+template<typename Expr,
+         typename = std::enable_if_t<
+             std::is_base_of_v<MatrixExpr<Expr>, Expr>
+         >>
+constexpr auto transpose(const MatrixExpr<Expr>& expr) {
+    return TransposeExpr<Expr>(expr.derived());
+}
+
+/**
+ * @brief Element-wise absolute value
+ */
+template<typename Expr,
+         typename = std::enable_if_t<
+             std::is_base_of_v<MatrixExpr<Expr>, Expr>
+         >>
+constexpr auto abs(const MatrixExpr<Expr>& expr) {
+    return MatrixUnaryExpr<Expr, detail::ops::Abs>(expr.derived(), detail::ops::Abs{});
+}
+
+/**
+ * @brief Element-wise square root
+ */
+template<typename Expr,
+         typename = std::enable_if_t<
+             std::is_base_of_v<MatrixExpr<Expr>, Expr>
+         >>
+constexpr auto sqrt(const MatrixExpr<Expr>& expr) {
+    return MatrixUnaryExpr<Expr, detail::ops::Sqrt>(expr.derived(), detail::ops::Sqrt{});
+}
+
+/**
+ * @brief Compute Frobenius norm squared of matrix expression
+ * @tparam Expr Matrix expression type
+ * @param expr Matrix expression
+ * @return Square of the Frobenius norm
+ */
+template<typename Expr,
+         typename = std::enable_if_t<
+             std::is_base_of_v<MatrixExpr<Expr>, Expr>
+         >>
+constexpr auto frobenius_norm_squared(const MatrixExpr<Expr>& expr) {
+    using result_type = decltype(expr.derived()(0, 0) * expr.derived()(0, 0));
+    result_type sum = result_type{0};
+    const auto m = expr.rows();
+    const auto n = expr.cols();
+    for (std::size_t i = 0; i < m; ++i) {
+        for (std::size_t j = 0; j < n; ++j) {
+            auto val = expr.derived()(i, j);
+            sum += val * val;
+        }
+    }
+    return sum;
+}
+
+/**
+ * @brief Compute Frobenius norm of matrix expression
+ * @tparam Expr Matrix expression type
+ * @param expr Matrix expression
+ * @return Frobenius norm
+ */
+template<typename Expr,
+         typename = std::enable_if_t<
+             std::is_base_of_v<MatrixExpr<Expr>, Expr>
+         >>
+constexpr auto frobenius_norm(const MatrixExpr<Expr>& expr) {
+    using std::sqrt;
+    return sqrt(frobenius_norm_squared(expr));
+}
+
+/**
+ * @brief Compute trace of square matrix expression
+ * @tparam Expr Matrix expression type
+ * @param expr Matrix expression
+ * @return Sum of diagonal elements
+ */
+template<typename Expr,
+         typename = std::enable_if_t<
+             std::is_base_of_v<MatrixExpr<Expr>, Expr>
+         >>
+constexpr auto trace(const MatrixExpr<Expr>& expr) {
+    using result_type = decltype(expr.derived()(0, 0));
+    result_type sum = result_type{0};
+    const auto n = std::min(expr.rows(), expr.cols());
+    for (std::size_t i = 0; i < n; ++i) {
+        sum += expr.derived()(i, i);
+    }
+    return sum;
+}
+
+} // namespace math
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_MATH_MATRIX_EXPR_H
\ No newline at end of file
diff --git a/Code/Source/solver/FE/Math/Vector.h b/Code/Source/solver/FE/Math/Vector.h
new file mode 100644
index 000000000..e272bd6dd
--- /dev/null
+++ b/Code/Source/solver/FE/Math/Vector.h
@@ -0,0 +1,831 @@
+#ifndef SVMP_FE_MATH_VECTOR_H
+#define SVMP_FE_MATH_VECTOR_H
+
+/**
+ * @file Vector.h
+ * @brief Fixed-size vectors with expression templates for FE computations
+ *
+ * This header provides optimized fixed-size vector operations for element-level
+ * computations. All operations use expression templates to eliminate temporaries
+ * and are header-only for maximum inlining. Memory is aligned for SIMD operations.
+ */
+
+#include "VectorExpr.h"
+#include "MathConstants.h"
+#include "../Common/Alignment.h"
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <initializer_list>
+#include <ostream>
+#include <stdexcept>
+#include <type_traits>
+
+namespace svmp {
+namespace FE {
+namespace math {
+
+/**
+ * @brief Fixed-size vector for element-level computations
+ * @tparam T Scalar type (float, double)
+ * @tparam N Vector dimension
+ *
+ * This class provides small vector operations optimized for
+ * compile-time known dimensions. Memory is aligned for SIMD operations.
+ */
+template<typename T, std::size_t N>
+class Vector : public VectorExpr<Vector<T, N>> {
+    static_assert(std::is_arithmetic_v<T>, "T must be an arithmetic type");
+    static_assert(N > 0, "Vector dimension must be positive");
+
+private:
+    alignas(kFEFixedObjectAlignmentBytes) T data_[N];  // SIMD-friendly alignment
+
+public:
+    // Type definitions
+    using value_type = T;
+    using size_type = std::size_t;
+    using reference = T&;
+    using const_reference = const T&;
+    using pointer = T*;
+    using const_pointer = const T*;
+
+    /**
+     * @brief Default constructor - zero initializes all components
+     */
+    constexpr Vector() : data_{} {}
+
+    /**
+     * @brief Fill constructor - initializes all components with same value
+     * @param value Value to fill vector with
+     */
+    constexpr explicit Vector(T value) {
+        for (size_type i = 0; i < N; ++i) {
+            data_[i] = value;
+        }
+    }
+
+    /**
+     * @brief Initializer list constructor
+     * @param init List of values
+     */
+    constexpr Vector(std::initializer_list<T> init) : data_{} {
+        auto it = init.begin();
+        for (size_type i = 0; i < N && it != init.end(); ++i, ++it) {
+            data_[i] = *it;
+        }
+    }
+
+    /**
+     * @brief Constructor from expression template
+     * @tparam Expr Expression type
+     * @param expr Vector expression to evaluate
+     */
+    template<typename Expr>
+    Vector(const VectorExpr<Expr>& expr) {
+        const auto& e = expr.derived();
+        for (size_type i = 0; i < N; ++i) {
+            data_[i] = e[i];
+        }
+    }
+
+    /**
+     * @brief Copy constructor
+     */
+    constexpr Vector(const Vector&) = default;
+
+    /**
+     * @brief Move constructor
+     */
+    constexpr Vector(Vector&&) noexcept = default;
+
+    /**
+     * @brief Copy assignment
+     */
+    Vector& operator=(const Vector&) = default;
+
+    /**
+     * @brief Move assignment
+     */
+    Vector& operator=(Vector&&) noexcept = default;
+
+    /**
+     * @brief Assignment from expression template
+     * @tparam Expr Expression type
+     * @param expr Vector expression to evaluate
+     * @return Reference to this
+     */
+    template<typename Expr>
+    Vector& operator=(const VectorExpr<Expr>& expr) {
+        const auto& e = expr.derived();
+        for (size_type i = 0; i < N; ++i) {
+            data_[i] = e[i];
+        }
+        return *this;
+    }
+
+    /**
+     * @brief Get vector size (compile-time constant)
+     * @return Number of elements
+     */
+    static constexpr size_type size() { return N; }
+
+    /**
+     * @brief Element access (no bounds checking)
+     * @param i Element index
+     * @return Reference to element
+     */
+    constexpr T& operator[](size_type i) {
+        return data_[i];
+    }
+
+    /**
+     * @brief Element access (no bounds checking) - const version
+     * @param i Element index
+     * @return Const reference to element
+     */
+    constexpr const T& operator[](size_type i) const {
+        return data_[i];
+    }
+
+    /**
+     * @brief Element access with bounds checking
+     * @param i Element index
+     * @return Reference to element
+     * @throws std::out_of_range if i >= N
+     */
+    T& at(size_type i) {
+        if (i >= N) {
+            throw std::out_of_range("Vector::at: index out of range");
+        }
+        return data_[i];
+    }
+
+    /**
+     * @brief Element access with bounds checking - const version
+     * @param i Element index
+     * @return Const reference to element
+     * @throws std::out_of_range if i >= N
+     */
+    const T& at(size_type i) const {
+        if (i >= N) {
+            throw std::out_of_range("Vector::at: index out of range");
+        }
+        return data_[i];
+    }
+
+    /**
+     * @brief Access first element
+     * @return Reference to first element
+     */
+    T& front() { return data_[0]; }
+    const T& front() const { return data_[0]; }
+
+    /**
+     * @brief Access last element
+     * @return Reference to last element
+     */
+    T& back() { return data_[N-1]; }
+    const T& back() const { return data_[N-1]; }
+
+    /**
+     * @brief Get pointer to underlying data
+     * @return Pointer to first element
+     */
+    T* data() { return data_; }
+    const T* data() const { return data_; }
+
+    /**
+     * @brief Fill vector with value
+     * @param value Value to fill with
+     */
+    void fill(T value) {
+        for (size_type i = 0; i < N; ++i) {
+            data_[i] = value;
+        }
+    }
+
+    /**
+     * @brief Set all components to zero
+     */
+    void set_zero() {
+        fill(T{0});
+    }
+
+    // Arithmetic operators
+
+    /**
+     * @brief In-place addition
+     * @param other Vector to add
+     * @return Reference to this
+     */
+    Vector& operator+=(const Vector& other) {
+        for (size_type i = 0; i < N; ++i) {
+            data_[i] += other.data_[i];
+        }
+        return *this;
+    }
+
+    /**
+     * @brief In-place subtraction
+     * @param other Vector to subtract
+     * @return Reference to this
+     */
+    Vector& operator-=(const Vector& other) {
+        for (size_type i = 0; i < N; ++i) {
+            data_[i] -= other.data_[i];
+        }
+        return *this;
+    }
+
+    /**
+     * @brief In-place scalar multiplication
+     * @param scalar Scalar to multiply by
+     * @return Reference to this
+     */
+    Vector& operator*=(T scalar) {
+        for (size_type i = 0; i < N; ++i) {
+            data_[i] *= scalar;
+        }
+        return *this;
+    }
+
+    /**
+     * @brief In-place scalar division
+     * @param scalar Scalar to divide by
+     * @return Reference to this
+     */
+    Vector& operator/=(T scalar) {
+        const T inv = T(1) / scalar;
+        return (*this) *= inv;
+    }
+
+    // Vector operations
+
+    /**
+     * @brief Compute dot product
+     * @param other Other vector
+     * @return Dot product
+     */
+    T dot(const Vector& other) const {
+        T result = T(0);
+        for (size_type i = 0; i < N; ++i) {
+            result += data_[i] * other.data_[i];
+        }
+        return result;
+    }
+
+    /**
+     * @brief Compute squared Euclidean norm
+     * @return Squared norm
+     */
+    T norm_squared() const {
+        return dot(*this);
+    }
+
+    /**
+     * @brief Compute Euclidean norm
+     * @return Norm
+     */
+    T norm() const {
+        using std::sqrt;
+        return sqrt(norm_squared());
+    }
+
+    /**
+     * @brief Get normalized vector
+     * @return Unit vector in same direction
+     */
+    Vector normalized() const {
+        const T n = norm();
+        if (approx_zero(n)) {
+            return Vector();  // Return zero vector
+        }
+        return (*this) / n;
+    }
+
+    /**
+     * @brief Normalize this vector in place
+     * @return Reference to this
+     */
+    Vector& normalize() {
+        const T n = norm();
+        if (!approx_zero(n)) {
+            (*this) /= n;
+        }
+        return *this;
+    }
+
+    /**
+     * @brief Compute L1 norm (Manhattan norm)
+     * @return Sum of absolute values
+     */
+    T norm_l1() const {
+        T result = T(0);
+        for (size_type i = 0; i < N; ++i) {
+            using std::abs;
+            result += abs(data_[i]);
+        }
+        return result;
+    }
+
+    /**
+     * @brief Compute L-infinity norm (maximum norm)
+     * @return Maximum absolute value
+     */
+    T norm_inf() const {
+        T result = T(0);
+        for (size_type i = 0; i < N; ++i) {
+            using std::abs;
+            result = std::max(result, abs(data_[i]));
+        }
+        return result;
+    }
+
+    /**
+     * @brief Get minimum component
+     * @return Minimum value
+     */
+    T min() const {
+        T result = data_[0];
+        for (size_type i = 1; i < N; ++i) {
+            result = std::min(result, data_[i]);
+        }
+        return result;
+    }
+
+    /**
+     * @brief Get maximum component
+     * @return Maximum value
+     */
+    T max() const {
+        T result = data_[0];
+        for (size_type i = 1; i < N; ++i) {
+            result = std::max(result, data_[i]);
+        }
+        return result;
+    }
+
+    /**
+     * @brief Get sum of all components
+     * @return Sum of components
+     */
+    T sum() const {
+        T result = T(0);
+        for (size_type i = 0; i < N; ++i) {
+            result += data_[i];
+        }
+        return result;
+    }
+
+    /**
+     * @brief Get product of all components
+     * @return Product of components
+     */
+    T product() const {
+        T result = data_[0];
+        for (size_type i = 1; i < N; ++i) {
+            result *= data_[i];
+        }
+        return result;
+    }
+
+    // Static factory functions
+
+    /**
+     * @brief Create zero vector
+     * @return Vector with all components zero
+     */
+    static constexpr Vector zeros() {
+        return Vector();
+    }
+
+    /**
+     * @brief Create vector with all components one
+     * @return Vector with all components one
+     */
+    static constexpr Vector ones() {
+        return Vector(T(1));
+    }
+
+    /**
+     * @brief Create unit vector along axis
+     * @param axis Axis index (0-based)
+     * @return Unit vector
+     */
+    static Vector unit(size_type axis) {
+        Vector v;
+        if (axis < N) {
+            v[axis] = T(1);
+        }
+        return v;
+    }
+
+    /**
+     * @brief Create basis vector (alias for unit)
+     * @param i Axis index (0-based)
+     * @return Basis vector
+     */
+    static Vector basis(size_type i) {
+        return unit(i);
+    }
+
+    /**
+     * @brief Create zero vector (alias for zeros)
+     * @return Zero vector
+     */
+    static constexpr Vector zero() {
+        return zeros();
+    }
+
+    /**
+     * @brief Get index of minimum element
+     * @return Index of minimum value
+     */
+    size_type min_index() const {
+        size_type idx = 0;
+        T min_val = data_[0];
+        for (size_type i = 1; i < N; ++i) {
+            if (data_[i] < min_val) {
+                min_val = data_[i];
+                idx = i;
+            }
+        }
+        return idx;
+    }
+
+    /**
+     * @brief Get index of maximum element
+     * @return Index of maximum value
+     */
+    size_type max_index() const {
+        size_type idx = 0;
+        T max_val = data_[0];
+        for (size_type i = 1; i < N; ++i) {
+            if (data_[i] > max_val) {
+                max_val = data_[i];
+                idx = i;
+            }
+        }
+        return idx;
+    }
+
+    /**
+     * @brief Compute mean of all components
+     * @return Average value
+     */
+    T mean() const {
+        return sum() / static_cast<T>(N);
+    }
+
+    /**
+     * @brief Cross product for 3D vectors
+     * @param other Other vector
+     * @return Cross product
+     * @note Only available for 3D vectors
+     */
+    template<typename U = T>
+    std::enable_if_t<N == 3, Vector<U, 3>> cross(const Vector<U, 3>& other) const {
+        return Vector<U, 3>{
+            data_[1] * other[2] - data_[2] * other[1],
+            data_[2] * other[0] - data_[0] * other[2],
+            data_[0] * other[1] - data_[1] * other[0]
+        };
+    }
+
+    /**
+     * @brief Check if vectors are approximately equal
+     * @param other Other vector
+     * @param tol Tolerance
+     * @return true if equal within tolerance
+     */
+    bool approx_equal(const Vector& other, T tol = tolerance<T>) const {
+        for (size_type i = 0; i < N; ++i) {
+            using std::abs;
+            if (abs(data_[i] - other.data_[i]) > tol) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * @brief Equality comparison
+     * @param other Other vector
+     * @return true if exactly equal
+     */
+    bool operator==(const Vector& other) const {
+        for (size_type i = 0; i < N; ++i) {
+            if (data_[i] != other.data_[i]) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * @brief Inequality comparison
+     * @param other Other vector
+     * @return true if not equal
+     */
+    bool operator!=(const Vector& other) const {
+        return !(*this == other);
+    }
+
+    // Iterators
+    T* begin() { return data_; }
+    T* end() { return data_ + N; }
+    const T* begin() const { return data_; }
+    const T* end() const { return data_ + N; }
+    const T* cbegin() const { return data_; }
+    const T* cend() const { return data_ + N; }
+};
+
+// Type aliases for common vector types
+template<typename T> using Vector2 = Vector<T, 2>;
+template<typename T> using Vector3 = Vector<T, 3>;
+template<typename T> using Vector4 = Vector<T, 4>;
+
+// Double precision aliases
+using Vector2d = Vector2<double>;
+using Vector3d = Vector3<double>;
+using Vector4d = Vector4<double>;
+
+// Single precision aliases
+using Vector2f = Vector2<float>;
+using Vector3f = Vector3<float>;
+using Vector4f = Vector4<float>;
+
+// Integer aliases
+using Vector2i = Vector2<int>;
+using Vector3i = Vector3<int>;
+using Vector4i = Vector4<int>;
+
+/**
+ * @brief 3D Cross product
+ * @tparam T Scalar type
+ * @param a First vector
+ * @param b Second vector
+ * @return Cross product a × b
+ */
+template<typename T>
+inline Vector3<T> cross(const Vector3<T>& a, const Vector3<T>& b) {
+    return Vector3<T>{
+        a[1] * b[2] - a[2] * b[1],
+        a[2] * b[0] - a[0] * b[2],
+        a[0] * b[1] - a[1] * b[0]
+    };
+}
+
+/**
+ * @brief 2D Cross product (returns scalar - z component of 3D cross)
+ * @tparam T Scalar type
+ * @param a First vector
+ * @param b Second vector
+ * @return Scalar cross product
+ */
+template<typename T>
+inline T cross(const Vector2<T>& a, const Vector2<T>& b) {
+    return a[0] * b[1] - a[1] * b[0];
+}
+
+/**
+ * @brief Triple scalar product (a · (b × c))
+ * @tparam T Scalar type
+ * @param a First vector
+ * @param b Second vector
+ * @param c Third vector
+ * @return Scalar triple product
+ */
+template<typename T>
+inline T triple_product(const Vector3<T>& a, const Vector3<T>& b, const Vector3<T>& c) {
+    return a.dot(cross(b, c));
+}
+
+// Free functions for common operations
+
+/**
+ * @brief Compute dot product
+ */
+template<typename T, std::size_t N>
+inline T dot(const Vector<T, N>& a, const Vector<T, N>& b) {
+    return a.dot(b);
+}
+
+/**
+ * @brief Compute Euclidean norm
+ */
+template<typename T, std::size_t N>
+inline T norm(const Vector<T, N>& v) {
+    return v.norm();
+}
+
+/**
+ * @brief Compute squared Euclidean norm
+ */
+template<typename T, std::size_t N>
+inline T norm_squared(const Vector<T, N>& v) {
+    return v.norm_squared();
+}
+
+/**
+ * @brief Get normalized vector
+ */
+template<typename T, std::size_t N>
+inline Vector<T, N> normalize(const Vector<T, N>& v) {
+    return v.normalized();
+}
+
+/**
+ * @brief Component-wise absolute value
+ */
+template<typename T, std::size_t N>
+inline Vector<T, N> abs(const Vector<T, N>& v) {
+    Vector<T, N> result;
+    for (std::size_t i = 0; i < N; ++i) {
+        using std::abs;
+        result[i] = abs(v[i]);
+    }
+    return result;
+}
+
+/**
+ * @brief Component-wise minimum
+ */
+template<typename T, std::size_t N>
+inline Vector<T, N> min(const Vector<T, N>& a, const Vector<T, N>& b) {
+    Vector<T, N> result;
+    for (std::size_t i = 0; i < N; ++i) {
+        result[i] = std::min(a[i], b[i]);
+    }
+    return result;
+}
+
+/**
+ * @brief Component-wise maximum
+ */
+template<typename T, std::size_t N>
+inline Vector<T, N> max(const Vector<T, N>& a, const Vector<T, N>& b) {
+    Vector<T, N> result;
+    for (std::size_t i = 0; i < N; ++i) {
+        result[i] = std::max(a[i], b[i]);
+    }
+    return result;
+}
+
+/**
+ * @brief Component-wise clamp
+ */
+template<typename T, std::size_t N>
+inline Vector<T, N> clamp(const Vector<T, N>& v, const Vector<T, N>& min_v, const Vector<T, N>& max_v) {
+    Vector<T, N> result;
+    for (std::size_t i = 0; i < N; ++i) {
+        result[i] = std::clamp(v[i], min_v[i], max_v[i]);
+    }
+    return result;
+}
+
+/**
+ * @brief Linear interpolation between vectors
+ * @tparam T Scalar type
+ * @tparam N Vector dimension
+ * @param t Interpolation parameter [0, 1]
+ * @param a Start vector (at t=0)
+ * @param b End vector (at t=1)
+ * @return Interpolated vector
+ */
+template<typename T, std::size_t N>
+inline Vector<T, N> lerp(T t, const Vector<T, N>& a, const Vector<T, N>& b) {
+    return a + t * (b - a);
+}
+
+/**
+ * @brief Spherical linear interpolation (for unit vectors)
+ * @tparam T Scalar type
+ * @param t Interpolation parameter [0, 1]
+ * @param a Start unit vector
+ * @param b End unit vector
+ * @return Interpolated unit vector
+ */
+template<typename T>
+inline Vector3<T> slerp(T t, const Vector3<T>& a, const Vector3<T>& b) {
+    T cos_angle = a.dot(b);
+
+    // Handle numerical issues
+    cos_angle = std::clamp(cos_angle, T(-1), T(1));
+
+    // If vectors are nearly parallel, use linear interpolation
+    if (cos_angle > T(0.9995)) {
+        return normalize(lerp(t, a, b));
+    }
+
+    T angle = std::acos(cos_angle);
+    T sin_angle = std::sin(angle);
+
+    T t0 = std::sin((T(1) - t) * angle) / sin_angle;
+    T t1 = std::sin(t * angle) / sin_angle;
+
+    return t0 * a + t1 * b;
+}
+
+/**
+ * @brief Reflect vector about normal
+ * @tparam T Scalar type
+ * @tparam N Vector dimension
+ * @param v Incident vector
+ * @param n Normal vector (should be unit)
+ * @return Reflected vector
+ */
+template<typename T, std::size_t N>
+inline Vector<T, N> reflect(const Vector<T, N>& v, const Vector<T, N>& n) {
+    return v - T(2) * dot(v, n) * n;
+}
+
+/**
+ * @brief Project vector onto another vector
+ * @tparam T Scalar type
+ * @tparam N Vector dimension
+ * @param v Vector to project
+ * @param onto Vector to project onto
+ * @return Projection of v onto 'onto'
+ */
+template<typename T, std::size_t N>
+inline Vector<T, N> project(const Vector<T, N>& v, const Vector<T, N>& onto) {
+    T denom = onto.norm_squared();
+    if (approx_zero(denom)) {
+        return Vector<T, N>::zeros();
+    }
+    return (dot(v, onto) / denom) * onto;
+}
+
+/**
+ * @brief Get perpendicular component of vector
+ * @tparam T Scalar type
+ * @tparam N Vector dimension
+ * @param v Vector
+ * @param direction Direction to remove
+ * @return Component of v perpendicular to direction
+ */
+template<typename T, std::size_t N>
+inline Vector<T, N> perpendicular(const Vector<T, N>& v, const Vector<T, N>& direction) {
+    return v - project(v, direction);
+}
+
+/**
+ * @brief Compute angle between two vectors
+ * @tparam T Scalar type
+ * @tparam N Vector dimension
+ * @param a First vector
+ * @param b Second vector
+ * @return Angle in radians [0, π]
+ */
+template<typename T, std::size_t N>
+inline T angle(const Vector<T, N>& a, const Vector<T, N>& b) {
+    T cos_angle = dot(a, b) / (norm(a) * norm(b));
+    cos_angle = std::clamp(cos_angle, T(-1), T(1));
+    return std::acos(cos_angle);
+}
+
+/**
+ * @brief Check if two vectors are approximately equal
+ * @tparam T Scalar type
+ * @tparam N Vector dimension
+ * @param a First vector
+ * @param b Second vector
+ * @param tol Tolerance
+ * @return true if vectors are equal within tolerance
+ */
+template<typename T, std::size_t N>
+inline bool approx_equal(const Vector<T, N>& a, const Vector<T, N>& b, T tol = tolerance<T>) {
+    for (std::size_t i = 0; i < N; ++i) {
+        if (!approx_equal(a[i], b[i], tol)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+/**
+ * @brief Stream output operator
+ * @tparam T Scalar type
+ * @tparam N Vector dimension
+ * @param os Output stream
+ * @param v Vector to output
+ * @return Reference to output stream
+ */
+template<typename T, std::size_t N>
+inline std::ostream& operator<<(std::ostream& os, const Vector<T, N>& v) {
+    os << "[";
+    for (std::size_t i = 0; i < N; ++i) {
+        if (i > 0) os << ", ";
+        os << v[i];
+    }
+    os << "]";
+    return os;
+}
+
+} // namespace math
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_MATH_VECTOR_H
diff --git a/Code/Source/solver/FE/Math/VectorExpr.h b/Code/Source/solver/FE/Math/VectorExpr.h
new file mode 100644
index 000000000..8b9c8e382
--- /dev/null
+++ b/Code/Source/solver/FE/Math/VectorExpr.h
@@ -0,0 +1,418 @@
+#ifndef SVMP_FE_MATH_VECTOR_EXPR_H
+#define SVMP_FE_MATH_VECTOR_EXPR_H
+
+/**
+ * @file VectorExpr.h
+ * @brief Expression template infrastructure for lazy evaluation of vector operations
+ *
+ * This header provides expression templates that enable compound vector operations
+ * without creating temporary objects. Operations are evaluated lazily at the point
+ * of assignment, eliminating intermediate allocations and improving performance.
+ */
+
+#include <cstddef>
+#include <type_traits>
+#include <cmath>
+#include "ExpressionOps.h"
+
+namespace svmp {
+namespace FE {
+namespace math {
+
+/**
+ * @brief Base class for all vector expressions using CRTP
+ * @tparam Derived The derived expression type
+ *
+ * This uses the Curiously Recurring Template Pattern (CRTP) to provide
+ * static polymorphism for expression templates.
+ */
+template<typename Derived>
+class VectorExpr {
+public:
+    /**
+     * @brief Get the derived expression
+     * @return Reference to the derived type
+     */
+    const Derived& derived() const {
+        return static_cast<const Derived&>(*this);
+    }
+
+    /**
+     * @brief Get the derived expression (non-const)
+     * @return Reference to the derived type
+     */
+    Derived& derived() {
+        return static_cast<Derived&>(*this);
+    }
+
+    /**
+     * @brief Access element by index
+     * @param i Element index
+     * @return Value at index i
+     */
+    auto operator[](std::size_t i) const {
+        return derived()[i];
+    }
+
+    /**
+     * @brief Get the size of the vector expression
+     * @return Number of elements
+     */
+    std::size_t size() const {
+        return derived().size();
+    }
+};
+
+/**
+ * @brief Binary expression for element-wise operations between two vector expressions
+ * @tparam LHS Left-hand side expression type
+ * @tparam RHS Right-hand side expression type
+ * @tparam Op Binary operation functor
+ */
+template<typename LHS, typename RHS, typename Op>
+class VectorBinaryExpr : public VectorExpr<VectorBinaryExpr<LHS, RHS, Op>> {
+private:
+    const LHS& lhs_;
+    const RHS& rhs_;
+    Op op_;
+
+public:
+    /**
+     * @brief Construct binary expression
+     * @param lhs Left operand
+     * @param rhs Right operand
+     * @param op Operation to apply
+     */
+    constexpr VectorBinaryExpr(const LHS& lhs, const RHS& rhs, Op op = Op{})
+        : lhs_(lhs), rhs_(rhs), op_(op) {}
+
+    /**
+     * @brief Access element at index
+     * @param i Element index
+     * @return Result of operation on elements at index i
+     */
+    constexpr auto operator[](std::size_t i) const {
+        return op_(lhs_[i], rhs_[i]);
+    }
+
+    /**
+     * @brief Get size of expression (from left operand)
+     * @return Number of elements
+     */
+    constexpr std::size_t size() const {
+        return lhs_.size();
+    }
+};
+
+/**
+ * @brief Unary expression for element-wise operations on a single vector expression
+ * @tparam Expr Expression type
+ * @tparam Op Unary operation functor
+ */
+template<typename Expr, typename Op>
+class VectorUnaryExpr : public VectorExpr<VectorUnaryExpr<Expr, Op>> {
+private:
+    const Expr& expr_;
+    Op op_;
+
+public:
+    /**
+     * @brief Construct unary expression
+     * @param expr Operand expression
+     * @param op Operation to apply
+     */
+    constexpr VectorUnaryExpr(const Expr& expr, Op op = Op{})
+        : expr_(expr), op_(op) {}
+
+    /**
+     * @brief Access element at index
+     * @param i Element index
+     * @return Result of operation on element at index i
+     */
+    constexpr auto operator[](std::size_t i) const {
+        return op_(expr_[i]);
+    }
+
+    /**
+     * @brief Get size of expression
+     * @return Number of elements
+     */
+    constexpr std::size_t size() const {
+        return expr_.size();
+    }
+};
+
+/**
+ * @brief Scalar multiplication expression
+ * @tparam Expr Vector expression type
+ * @tparam Scalar Scalar type
+ */
+template<typename Expr, typename Scalar>
+class VectorScalarExpr : public VectorExpr<VectorScalarExpr<Expr, Scalar>> {
+private:
+    const Expr& expr_;
+    Scalar scalar_;
+
+public:
+    /**
+     * @brief Construct scalar multiplication expression
+     * @param expr Vector expression
+     * @param scalar Scalar value
+     */
+    constexpr VectorScalarExpr(const Expr& expr, Scalar scalar)
+        : expr_(expr), scalar_(scalar) {}
+
+    /**
+     * @brief Access element at index
+     * @param i Element index
+     * @return Element multiplied by scalar
+     */
+    constexpr auto operator[](std::size_t i) const {
+        return expr_[i] * scalar_;
+    }
+
+    /**
+     * @brief Get size of expression
+     * @return Number of elements
+     */
+    constexpr std::size_t size() const {
+        return expr_.size();
+    }
+};
+
+/**
+ * @brief Scalar division expression
+ * @tparam Expr Vector expression type
+ * @tparam Scalar Scalar type
+ */
+template<typename Expr, typename Scalar>
+class VectorScalarDivExpr : public VectorExpr<VectorScalarDivExpr<Expr, Scalar>> {
+private:
+    const Expr& expr_;
+    Scalar scalar_;
+
+public:
+    /**
+     * @brief Construct scalar division expression
+     * @param expr Vector expression
+     * @param scalar Scalar divisor
+     */
+    constexpr VectorScalarDivExpr(const Expr& expr, Scalar scalar)
+        : expr_(expr), scalar_(scalar) {}
+
+    /**
+     * @brief Access element at index
+     * @param i Element index
+     * @return Element divided by scalar
+     */
+    constexpr auto operator[](std::size_t i) const {
+        return expr_[i] / scalar_;
+    }
+
+    /**
+     * @brief Get size of expression
+     * @return Number of elements
+     */
+    constexpr std::size_t size() const {
+        return expr_.size();
+    }
+};
+
+/**
+ * @brief Addition operator for vector expressions
+ */
+template<typename LHS, typename RHS,
+         typename = std::enable_if_t<
+             std::is_base_of_v<VectorExpr<LHS>, LHS> &&
+             std::is_base_of_v<VectorExpr<RHS>, RHS>
+         >>
+constexpr auto operator+(const VectorExpr<LHS>& lhs, const VectorExpr<RHS>& rhs) {
+    return VectorBinaryExpr<LHS, RHS, detail::ops::Add>(
+        lhs.derived(), rhs.derived(), detail::ops::Add{}
+    );
+}
+
+/**
+ * @brief Subtraction operator for vector expressions
+ */
+template<typename LHS, typename RHS,
+         typename = std::enable_if_t<
+             std::is_base_of_v<VectorExpr<LHS>, LHS> &&
+             std::is_base_of_v<VectorExpr<RHS>, RHS>
+         >>
+constexpr auto operator-(const VectorExpr<LHS>& lhs, const VectorExpr<RHS>& rhs) {
+    return VectorBinaryExpr<LHS, RHS, detail::ops::Sub>(
+        lhs.derived(), rhs.derived(), detail::ops::Sub{}
+    );
+}
+
+/**
+ * @brief Element-wise multiplication operator for vector expressions
+ */
+template<typename LHS, typename RHS,
+         typename = std::enable_if_t<
+             std::is_base_of_v<VectorExpr<LHS>, LHS> &&
+             std::is_base_of_v<VectorExpr<RHS>, RHS>
+         >>
+constexpr auto hadamard(const VectorExpr<LHS>& lhs, const VectorExpr<RHS>& rhs) {
+    return VectorBinaryExpr<LHS, RHS, detail::ops::Mul>(
+        lhs.derived(), rhs.derived(), detail::ops::Mul{}
+    );
+}
+
+/**
+ * @brief Element-wise division operator for vector expressions
+ */
+template<typename LHS, typename RHS,
+         typename = std::enable_if_t<
+             std::is_base_of_v<VectorExpr<LHS>, LHS> &&
+             std::is_base_of_v<VectorExpr<RHS>, RHS>
+         >>
+constexpr auto hadamard_div(const VectorExpr<LHS>& lhs, const VectorExpr<RHS>& rhs) {
+    return VectorBinaryExpr<LHS, RHS, detail::ops::Div>(
+        lhs.derived(), rhs.derived(), detail::ops::Div{}
+    );
+}
+
+/**
+ * @brief Negation operator for vector expressions
+ */
+template<typename Expr,
+         typename = std::enable_if_t<
+             std::is_base_of_v<VectorExpr<Expr>, Expr>
+         >>
+constexpr auto operator-(const VectorExpr<Expr>& expr) {
+    return VectorUnaryExpr<Expr, detail::ops::Negate>(
+        expr.derived(), detail::ops::Negate{}
+    );
+}
+
+/**
+ * @brief Scalar multiplication operator (vector * scalar)
+ */
+template<typename Expr, typename Scalar,
+         typename = std::enable_if_t<
+             std::is_base_of_v<VectorExpr<Expr>, Expr> &&
+             std::is_arithmetic_v<Scalar>
+         >>
+constexpr auto operator*(const VectorExpr<Expr>& expr, Scalar scalar) {
+    return VectorScalarExpr<Expr, Scalar>(expr.derived(), scalar);
+}
+
+/**
+ * @brief Scalar multiplication operator (scalar * vector)
+ */
+template<typename Scalar, typename Expr,
+         typename = std::enable_if_t<
+             std::is_arithmetic_v<Scalar> &&
+             std::is_base_of_v<VectorExpr<Expr>, Expr>
+         >>
+constexpr auto operator*(Scalar scalar, const VectorExpr<Expr>& expr) {
+    return VectorScalarExpr<Expr, Scalar>(expr.derived(), scalar);
+}
+
+/**
+ * @brief Scalar division operator (vector / scalar)
+ */
+template<typename Expr, typename Scalar,
+         typename = std::enable_if_t<
+             std::is_base_of_v<VectorExpr<Expr>, Expr> &&
+             std::is_arithmetic_v<Scalar>
+         >>
+constexpr auto operator/(const VectorExpr<Expr>& expr, Scalar scalar) {
+    return VectorScalarDivExpr<Expr, Scalar>(expr.derived(), scalar);
+}
+
+/**
+ * @brief Element-wise absolute value
+ */
+template<typename Expr,
+         typename = std::enable_if_t<
+             std::is_base_of_v<VectorExpr<Expr>, Expr>
+         >>
+constexpr auto abs(const VectorExpr<Expr>& expr) {
+    return VectorUnaryExpr<Expr, detail::ops::Abs>(expr.derived(), detail::ops::Abs{});
+}
+
+/**
+ * @brief Element-wise square root
+ */
+template<typename Expr,
+         typename = std::enable_if_t<
+             std::is_base_of_v<VectorExpr<Expr>, Expr>
+         >>
+constexpr auto sqrt(const VectorExpr<Expr>& expr) {
+    return VectorUnaryExpr<Expr, detail::ops::Sqrt>(expr.derived(), detail::ops::Sqrt{});
+}
+
+/**
+ * @brief Dot product for vector expressions
+ * @tparam LHS Left vector expression type
+ * @tparam RHS Right vector expression type
+ * @param lhs Left operand
+ * @param rhs Right operand
+ * @return Dot product result
+ */
+template<typename LHS, typename RHS,
+         typename = std::enable_if_t<
+             std::is_base_of_v<VectorExpr<LHS>, LHS> &&
+             std::is_base_of_v<VectorExpr<RHS>, RHS>
+         >>
+constexpr auto dot(const VectorExpr<LHS>& lhs, const VectorExpr<RHS>& rhs) {
+    using result_type = decltype(lhs.derived()[0] * rhs.derived()[0]);
+    result_type sum = result_type{0};
+    const auto n = lhs.size();
+    for (std::size_t i = 0; i < n; ++i) {
+        sum += lhs.derived()[i] * rhs.derived()[i];
+    }
+    return sum;
+}
+
+/**
+ * @brief Compute norm squared of vector expression
+ * @tparam Expr Vector expression type
+ * @param expr Vector expression
+ * @return Square of the Euclidean norm
+ */
+template<typename Expr,
+         typename = std::enable_if_t<
+             std::is_base_of_v<VectorExpr<Expr>, Expr>
+         >>
+constexpr auto norm_squared(const VectorExpr<Expr>& expr) {
+    return dot(expr, expr);
+}
+
+/**
+ * @brief Compute norm of vector expression
+ * @tparam Expr Vector expression type
+ * @param expr Vector expression
+ * @return Euclidean norm
+ */
+template<typename Expr,
+         typename = std::enable_if_t<
+             std::is_base_of_v<VectorExpr<Expr>, Expr>
+         >>
+constexpr auto norm(const VectorExpr<Expr>& expr) {
+    using std::sqrt;
+    return sqrt(norm_squared(expr));
+}
+
+/**
+ * @brief Normalize vector expression
+ * @tparam Expr Vector expression type
+ * @param expr Vector expression
+ * @return Normalized vector expression
+ */
+template<typename Expr,
+         typename = std::enable_if_t<
+             std::is_base_of_v<VectorExpr<Expr>, Expr>
+         >>
+constexpr auto normalize(const VectorExpr<Expr>& expr) {
+    return expr / norm(expr);
+}
+
+} // namespace math
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_MATH_VECTOR_EXPR_H
\ No newline at end of file
diff --git a/Code/Source/solver/FE/Quadrature/QuadratureRule.h b/Code/Source/solver/FE/Quadrature/QuadratureRule.h
new file mode 100644
index 000000000..f7d186891
--- /dev/null
+++ b/Code/Source/solver/FE/Quadrature/QuadratureRule.h
@@ -0,0 +1,237 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_QUADRATURE_RULE_H
+#define SVMP_FE_QUADRATURE_RULE_H
+
+/**
+ * @file QuadratureRule.h
+ * @brief Abstracted quadrature rule representation for FE integration
+ *
+ * This header defines the base class for all quadrature rules used by the
+ * finite element infrastructure. Rules are expressed in reference element
+ * space only; mapping to physical space is handled by the Geometry module.
+ *
+ * The interface is intentionally lightweight and header-only to avoid coupling
+ * Quadrature to other modules while remaining compatible with the Mesh library
+ * through shared type aliases provided by FE/Common/Types.h.
+ */
+
+#include "Types.h"
+#include "FEException.h"
+#include "Math/Vector.h"
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <iomanip>
+#include <limits>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace svmp {
+namespace FE {
+namespace quadrature {
+
+/// Convenience alias for quadrature point representation in reference space
+using QuadPoint = math::Vector<Real, 3>;
+
+struct QuadraturePointFingerprint {
+    int dimension{0};
+    std::size_t num_points{0};
+    std::uint64_t points_hash_a{0};
+    std::uint64_t points_hash_b{0};
+};
+
+/**
+ * @brief Base class for quadrature rules over reference elements
+ *
+ * Derived classes populate the point/weight data via the protected setters.
+ * The class performs lightweight consistency checks (size agreement, basic
+ * reference-measure validation) but leaves element-specific checks to callers.
+ */
+class QuadratureRule {
+public:
+    virtual ~QuadratureRule() = default;
+
+    /// Number of quadrature points
+    std::size_t num_points() const noexcept { return points_.size(); }
+
+    /// Polynomial exactness degree reported by the rule
+    int order() const noexcept { return order_; }
+
+    /// Spatial dimension of the reference domain
+    int dimension() const noexcept { return dimension_; }
+
+    /// Cell family the rule integrates over (line, tri, quad, ...)
+    svmp::CellFamily cell_family() const noexcept { return cell_family_; }
+
+    /// Access a single quadrature point (no bounds checking)
+    QuadPoint point(std::size_t i) const noexcept { return points_[i]; }
+
+    /// Access a single quadrature weight (no bounds checking)
+    Real weight(std::size_t i) const noexcept { return weights_[i]; }
+
+    /// Bulk accessors
+    const std::vector<QuadPoint>& points() const noexcept { return points_; }
+    const std::vector<Real>& weights() const noexcept { return weights_; }
+
+    /// Cached coordinate-only fingerprint for consumers whose values depend on
+    /// reference points but not quadrature weights.
+    QuadraturePointFingerprint point_fingerprint() const noexcept { return point_fingerprint_; }
+
+    /// Stable semantic identity used by BasisCache
+    virtual std::string cache_identity() const;
+
+    /**
+     * @brief Validate rule data for basic consistency
+     * @param tol Relative tolerance for weight sum check
+     * @return True if rule passes size and weight checks
+     */
+    virtual bool is_valid(Real tol = 1e-12) const;
+
+    /**
+     * @brief Reference-domain measure for the element family
+     *
+     * Length/area/volume of the canonical reference element:
+     * - Line   [-1,1]            -> 2
+     * - Quad   [-1,1]^2          -> 4
+     * - Hex    [-1,1]^3          -> 8
+     * - Tri    (0,0)-(1,0)-(0,1) -> 0.5
+     * - Tet    simplex at origin -> 1/6
+     * - Wedge  (triangle x line) -> 1
+     * - Pyramid (x,y in [-1,1], z in [0,1]) -> 4/3
+     */
+    Real reference_measure() const noexcept;
+
+protected:
+    QuadratureRule(svmp::CellFamily family, int dimension, int order = 0)
+        : cell_family_(family), dimension_(dimension), order_(order) {}
+
+    /// Assign point and weight storage (sizes must match)
+    void set_data(std::vector<QuadPoint> pts, std::vector<Real> wts);
+
+    /// Override computed order in derived classes
+    void set_order(int ord) noexcept { order_ = ord; }
+
+private:
+    std::string build_cache_identity() const;
+    QuadraturePointFingerprint build_point_fingerprint() const noexcept;
+
+    svmp::CellFamily cell_family_;
+    int dimension_;
+    int order_;
+    std::vector<QuadPoint> points_;
+    std::vector<Real> weights_;
+    std::string cache_identity_;
+    QuadraturePointFingerprint point_fingerprint_;
+};
+
+// --------------------------------------------------------------------------------
+// Inline implementations
+// --------------------------------------------------------------------------------
+
+inline void QuadratureRule::set_data(std::vector<QuadPoint> pts, std::vector<Real> wts) {
+    if (pts.size() != wts.size()) {
+        throw FEException("QuadratureRule: points/weights size mismatch",
+                          StatusCode::InvalidArgument,
+                          __FILE__, __LINE__, __func__);
+    }
+    points_ = std::move(pts);
+    weights_ = std::move(wts);
+    point_fingerprint_ = build_point_fingerprint();
+    cache_identity_ = build_cache_identity();
+}
+
+inline bool QuadratureRule::is_valid(Real tol) const {
+    if (points_.empty() || points_.size() != weights_.size()) {
+        return false;
+    }
+    Real sum_w = Real(0);
+    for (Real w : weights_) {
+        if (!std::isfinite(w)) {
+            return false;
+        }
+        sum_w += w;
+    }
+    const Real ref = reference_measure();
+    const Real denom = std::max(Real(1), std::abs(ref));
+    return std::abs(sum_w - ref) <= tol * denom;
+}
+
+inline std::string QuadratureRule::cache_identity() const {
+    if (!cache_identity_.empty()) {
+        return cache_identity_;
+    }
+    return build_cache_identity();
+}
+
+inline std::string QuadratureRule::build_cache_identity() const {
+    std::ostringstream oss;
+    oss << "dim=" << dimension_
+        << "|npts=" << points_.size();
+
+    oss << std::setprecision(std::numeric_limits<Real>::max_digits10);
+    for (const auto& pt : points_) {
+        oss << "|pt=" << pt[0] << ',' << pt[1] << ',' << pt[2];
+    }
+    return oss.str();
+}
+
+inline QuadraturePointFingerprint QuadratureRule::build_point_fingerprint() const noexcept {
+    auto real_bits = [](Real value) noexcept {
+        static_assert(sizeof(Real) <= sizeof(std::uint64_t),
+                      "Quadrature point fingerprints assume Real fits in 64 bits");
+        std::uint64_t bits = 0;
+        std::memcpy(&bits, &value, sizeof(Real));
+        return bits;
+    };
+    auto mix_hash = [](std::uint64_t& seed, std::uint64_t value) noexcept {
+        seed ^= value + 0x9e3779b97f4a7c15ULL + (seed << 6u) + (seed >> 2u);
+    };
+
+    QuadraturePointFingerprint fingerprint;
+    fingerprint.dimension = dimension_;
+    fingerprint.num_points = points_.size();
+    fingerprint.points_hash_a = 1469598103934665603ULL;
+    fingerprint.points_hash_b = 1099511628211ULL;
+
+    mix_hash(fingerprint.points_hash_a, static_cast<std::uint64_t>(fingerprint.dimension));
+    mix_hash(fingerprint.points_hash_a, static_cast<std::uint64_t>(fingerprint.num_points));
+    mix_hash(fingerprint.points_hash_b, static_cast<std::uint64_t>(fingerprint.num_points));
+    mix_hash(fingerprint.points_hash_b, static_cast<std::uint64_t>(fingerprint.dimension));
+    for (const auto& point : points_) {
+        for (std::size_t component = 0; component < 3u; ++component) {
+            const std::uint64_t bits = real_bits(point[component]);
+            mix_hash(fingerprint.points_hash_a, bits);
+            mix_hash(fingerprint.points_hash_b, bits ^ (0xbf58476d1ce4e5b9ULL + component));
+        }
+    }
+    return fingerprint;
+}
+
+inline Real QuadratureRule::reference_measure() const noexcept {
+    switch (cell_family_) {
+        case svmp::CellFamily::Line:      return Real(2);
+        case svmp::CellFamily::Quad:      return Real(4);
+        case svmp::CellFamily::Hex:       return Real(8);
+        case svmp::CellFamily::Triangle:  return Real(0.5);
+        case svmp::CellFamily::Tetra:     return Real(1.0 / 6.0);
+        case svmp::CellFamily::Wedge:     return Real(1.0);     // 0.5 area * length 2
+        case svmp::CellFamily::Pyramid:   return Real(4.0 / 3.0);
+        case svmp::CellFamily::Point:     return Real(1.0);
+        default:                          return Real(1.0);
+    }
+}
+
+} // namespace quadrature
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_QUADRATURE_RULE_H
diff --git a/Code/Source/solver/fs.cpp b/Code/Source/solver/fs.cpp
index d592a8b96..abe1992df 100644
--- a/Code/Source/solver/fs.cpp
+++ b/Code/Source/solver/fs.cpp
@@ -5,10 +5,66 @@
 
 #include "fs.h"
 #include "consts.h"
+#include "FE/Common/FEException.h"
 #include "nn.h"
 
+#include <algorithm>
+#include <string>
+
 namespace fs {
 
+namespace {
+
+namespace fe = svmp::FE;
+
+std::string element_name(consts::ElementType eType)
+{
+  const auto iter = consts::element_type_to_string.find(eType);
+  if (iter != consts::element_type_to_string.end()) {
+    return iter->second;
+  }
+
+  return "unknown (" + std::to_string(static_cast<int>(eType)) + ")";
+}
+
+bool supports_reference_hessians(consts::ElementType eType)
+{
+  using namespace consts;
+
+  switch (eType) {
+    case ElementType::LIN1:
+    case ElementType::LIN2:
+    case ElementType::TRI3:
+    case ElementType::TRI6:
+    case ElementType::QUD4:
+    case ElementType::QUD8:
+    case ElementType::QUD9:
+    case ElementType::TET4:
+    case ElementType::TET10:
+    case ElementType::HEX8:
+    case ElementType::HEX20:
+    case ElementType::HEX27:
+    case ElementType::WDG:
+      return true;
+    default:
+      return false;
+  }
+}
+
+void populate_reference_hessians_if_supported(fsType& fs, const int insd)
+{
+  if (fs.Nxx.size() == 0 || !supports_reference_hessians(fs.eType)) {
+    return;
+  }
+
+  const int ind2 = std::max(3 * (insd - 1), 1);
+  for (int g = 0; g < fs.nG; ++g) {
+    nn::get_gn_nxx(insd, ind2, fs.eType, fs.eNoN, g, fs.xi, fs.Nxx);
+  }
+}
+
+} // namespace
+
 
 /// @brief Allocates arrays within the function space type. Assumes that 
 /// fs%eNoN and fs%nG are already defined
@@ -103,6 +159,7 @@ void get_thood_fs(ComMod& com_mod, std::array<fsType,2>& fs, const mshType& lM,
         nn::get_gnn(nsd, fs[1].eType, fs[1].eNoN, g, fs[1].xi, fs[1].N, fs[1].Nx);
       }
       nn::get_nn_bnds(nsd, fs[1].eType, fs[1].eNoN, fs[1].xib, fs[1].Nb);
+      populate_reference_hessians_if_supported(fs[1], nsd);
 
     } else if (iOpt == 2) {
       fs[1].nG    = lM.fs[1].nG;
@@ -133,6 +190,7 @@ void get_thood_fs(ComMod& com_mod, std::array<fsType,2>& fs, const mshType& lM,
         nn::get_gnn(nsd, fs[0].eType, fs[0].eNoN, g, fs[0].xi, fs[0].N, fs[0].Nx);
       }
       nn::get_nn_bnds(nsd, fs[0].eType, fs[0].eNoN, fs[0].xib, fs[0].Nb);
+      populate_reference_hessians_if_supported(fs[0], nsd);
     }
   }
 }
@@ -275,14 +333,7 @@ void init_fs_msh(const ComMod& com_mod, mshType& lM)
     lM.fs[0].Nb  = lM.Nb;
     lM.fs[0].Nx  = lM.Nx;
   }
-  // Second order derivatives for vector function space
-  //
-  if (!lM.fs[0].lShpF) {
-    int ind2 = std::max(3*(insd-1), 1);
-    for (int g = 0; g < lM.fs[0].nG; g++) {
-      nn::get_gn_nxx(insd, ind2, lM.fs[0].eType, lM.fs[0].eNoN, g, lM.fs[0].xi, lM.fs[0].Nxx);
-    }
-  }
+  populate_reference_hessians_if_supported(lM.fs[0], insd);
 
   // Sets Taylor-Hood basis [fluid, stokes, ustruct, FSI)
   if (lM.nFs == 2) {
@@ -291,6 +342,7 @@ void init_fs_msh(const ComMod& com_mod, mshType& lM)
 
     // Initialize the function space
     init_fs(lM.fs[1], nsd, insd);
+    populate_reference_hessians_if_supported(lM.fs[1], insd);
   }
 }
 
@@ -343,7 +395,8 @@ void set_thood_fs(fsType& fs, consts::ElementType eType)
     break;
 
     default:
-      throw std::runtime_error("Cannot choose Taylor-Hood basis");
+      throw fe::InvalidElementException("Cannot choose Taylor-Hood basis",
+          element_name(eType), __FILE__, __LINE__, __func__);
     break;
   }
 }
diff --git a/Code/Source/solver/nn.cpp b/Code/Source/solver/nn.cpp
index 9f12d64e4..51c126708 100644
--- a/Code/Source/solver/nn.cpp
+++ b/Code/Source/solver/nn.cpp
@@ -15,15 +15,28 @@
 #include "Array.h"
 #include "Vector.h"
 
+#include "FE/Basis/BasisExceptions.h"
+#include "FE/Basis/BasisFactory.h"
+#include "FE/Common/FEException.h"
+
 #include "consts.h"
 #include "mat_fun.h"
 #include "utils.h"
 
 #include "lapack_defs.h"
 
+#include <algorithm>
+#include <array>
+#include <cstdlib>
+#include <cctype>
+#include <exception>
 #include <functional>
 #include <iostream> 
 #include <math.h> 
+#include <memory>
+#include <optional>
+#include <string>
+#include <vector>
 
 namespace nn {
 
@@ -46,13 +59,510 @@ using namespace consts;
 // Define a map type used to set the bounds of element shape functions.
 #include "nn_elem_nn_bnds.h"
 
+namespace {
+
+namespace fe = svmp::FE;
+namespace febasis = svmp::FE::basis;
+
+struct BasisSelection {
+  fe::ElementType element;
+  fe::BasisType basis;
+  int order;
+};
+
+enum class BasisMode {
+  Auto,
+  Legacy,
+  Fe
+};
+
+std::string normalize_basis_mode_name(std::string value)
+{
+  std::transform(value.begin(), value.end(), value.begin(),
+      [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+  return value;
+}
+
+BasisMode parse_basis_mode()
+{
+  const char* mode_env = std::getenv("SVMP_BASIS_MODE");
+  if (mode_env == nullptr || *mode_env == '\0') {
+    return BasisMode::Auto;
+  }
+
+  const std::string mode = normalize_basis_mode_name(mode_env);
+  if (mode == "auto") {
+    return BasisMode::Auto;
+  }
+  if (mode == "legacy") {
+    return BasisMode::Legacy;
+  }
+  if (mode == "fe") {
+    return BasisMode::Fe;
+  }
+
+  throw febasis::BasisConfigurationException(
+      "Invalid SVMP_BASIS_MODE='" + std::string(mode_env) +
+          "'. Expected one of: auto, legacy, fe",
+      __FILE__, __LINE__, __func__);
+}
+
+BasisMode active_basis_mode()
+{
+  static const BasisMode mode = parse_basis_mode();
+  return mode;
+}
+
+const char* basis_mode_name(BasisMode mode)
+{
+  switch (mode) {
+    case BasisMode::Auto:
+      return "auto";
+    case BasisMode::Legacy:
+      return "legacy";
+    case BasisMode::Fe:
+      return "fe";
+  }
+  return "unknown";
+}
+
+void log_basis_mode_once()
+{
+  static const bool logged = []() {
+    std::cout << "[svMultiPhysics] SVMP_BASIS_MODE="
+              << basis_mode_name(active_basis_mode()) << std::endl;
+    return true;
+  }();
+  (void)logged;
+}
+
+bool basis_mode_allows_fe_adapter()
+{
+  return active_basis_mode() != BasisMode::Legacy;
+}
+
+std::string solver_element_name(consts::ElementType eType)
+{
+  auto it = consts::element_type_to_string.find(eType);
+  if (it != consts::element_type_to_string.end()) {
+    return it->second + " (" + std::to_string(static_cast<int>(eType)) + ")";
+  }
+  return "unknown (" + std::to_string(static_cast<int>(eType)) + ")";
+}
+
+std::optional<BasisSelection> to_basis_selection(consts::ElementType eType)
+{
+  switch (eType) {
+    case consts::ElementType::LIN1:
+      return BasisSelection{fe::ElementType::Line2, fe::BasisType::Lagrange, 1};
+    case consts::ElementType::LIN2:
+      return BasisSelection{fe::ElementType::Line3, fe::BasisType::Lagrange, 2};
+    case consts::ElementType::TRI3:
+      return BasisSelection{fe::ElementType::Triangle3, fe::BasisType::Lagrange, 1};
+    case consts::ElementType::TRI6:
+      return BasisSelection{fe::ElementType::Triangle6, fe::BasisType::Lagrange, 2};
+    case consts::ElementType::QUD4:
+      return BasisSelection{fe::ElementType::Quad4, fe::BasisType::Lagrange, 1};
+    case consts::ElementType::QUD8:
+      return BasisSelection{fe::ElementType::Quad8, fe::BasisType::Serendipity, 2};
+    case consts::ElementType::QUD9:
+      return BasisSelection{fe::ElementType::Quad9, fe::BasisType::Lagrange, 2};
+    case consts::ElementType::TET4:
+      return BasisSelection{fe::ElementType::Tetra4, fe::BasisType::Lagrange, 1};
+    case consts::ElementType::TET10:
+      return BasisSelection{fe::ElementType::Tetra10, fe::BasisType::Lagrange, 2};
+    case consts::ElementType::HEX8:
+      return BasisSelection{fe::ElementType::Hex8, fe::BasisType::Lagrange, 1};
+    case consts::ElementType::HEX20:
+      return BasisSelection{fe::ElementType::Hex20, fe::BasisType::Serendipity, 2};
+    case consts::ElementType::HEX27:
+      return BasisSelection{fe::ElementType::Hex27, fe::BasisType::Lagrange, 2};
+    case consts::ElementType::WDG:
+      return BasisSelection{fe::ElementType::Wedge6, fe::BasisType::Lagrange, 1};
+    default:
+      return std::nullopt;
+  }
+}
+
+bool use_basis_adapter_for(consts::ElementType eType)
+{
+  return basis_mode_allows_fe_adapter() && to_basis_selection(eType).has_value();
+}
+
+bool supports_basis_hessian_adapter_for(consts::ElementType eType)
+{
+  return basis_mode_allows_fe_adapter() && to_basis_selection(eType).has_value();
+}
+
+bool supports_face_basis_adapter_for(consts::ElementType eType)
+{
+  if (!basis_mode_allows_fe_adapter()) {
+    return false;
+  }
+
+  switch (eType) {
+    case consts::ElementType::LIN1:
+    case consts::ElementType::LIN2:
+    case consts::ElementType::TRI3:
+    case consts::ElementType::TRI6:
+    case consts::ElementType::QUD4:
+    case consts::ElementType::QUD8:
+    case consts::ElementType::QUD9:
+      return to_basis_selection(eType).has_value();
+    default:
+      return false;
+  }
+}
+
+std::shared_ptr<febasis::BasisFunction> make_basis_for_solver_element(consts::ElementType eType)
+{
+  auto selection = to_basis_selection(eType);
+  if (!selection) {
+    throw febasis::BasisElementCompatibilityException(
+        "No FE Basis selection for solver element " + solver_element_name(eType),
+        __FILE__, __LINE__, __func__);
+  }
+
+  febasis::BasisRequest request;
+  request.element_type = selection->element;
+  request.basis_type = selection->basis;
+  request.order = selection->order;
+  return febasis::basis_factory::create(request);
+}
+
+template <std::size_t NumNodes>
+std::size_t mapped_basis_index(const std::array<std::size_t, NumNodes>& map,
+                               consts::ElementType eType,
+                               const int solver_node)
+{
+  if (solver_node < 0 || static_cast<std::size_t>(solver_node) >= map.size()) {
+    throw febasis::BasisNodeOrderingException(
+        "Solver node " + std::to_string(solver_node) +
+            " is outside node map for " + solver_element_name(eType),
+        __FILE__, __LINE__, __func__);
+  }
+
+  return map[static_cast<std::size_t>(solver_node)];
+}
+
+std::size_t basis_index_for_solver_node(consts::ElementType eType, const int solver_node)
+{
+  if (solver_node < 0) {
+    throw febasis::BasisNodeOrderingException(
+        "Solver node " + std::to_string(solver_node) +
+            " is outside node map for " + solver_element_name(eType),
+        __FILE__, __LINE__, __func__);
+  }
+
+  const auto node = static_cast<std::size_t>(solver_node);
+
+  switch (eType) {
+    case consts::ElementType::TRI3: {
+      static constexpr std::array<std::size_t, 3> map{1, 2, 0};
+      return mapped_basis_index(map, eType, solver_node);
+    }
+    case consts::ElementType::TRI6: {
+      static constexpr std::array<std::size_t, 6> map{1, 2, 0, 4, 5, 3};
+      return mapped_basis_index(map, eType, solver_node);
+    }
+    case consts::ElementType::TET4: {
+      static constexpr std::array<std::size_t, 4> map{1, 2, 3, 0};
+      return mapped_basis_index(map, eType, solver_node);
+    }
+    case consts::ElementType::TET10: {
+      static constexpr std::array<std::size_t, 10> map{1, 2, 3, 0, 5, 9, 8, 4, 6, 7};
+      return mapped_basis_index(map, eType, solver_node);
+    }
+    case consts::ElementType::WDG: {
+      static constexpr std::array<std::size_t, 6> map{1, 2, 0, 4, 5, 3};
+      return mapped_basis_index(map, eType, solver_node);
+    }
+    case consts::ElementType::HEX27: {
+      static constexpr std::array<std::size_t, 27> map{
+        0, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19,
+        25, 23, 22, 24, 20, 21, 26
+      };
+      return mapped_basis_index(map, eType, solver_node);
+    }
+    default:
+      return node;
+  }
+}
+
+fe::math::Vector<fe::Real, 3> make_basis_point(const febasis::BasisFunction& basis,
+                                               const int g,
+                                               const Array<double>& xi)
+{
+  if (xi.nrows() < basis.dimension()) {
+    throw febasis::BasisConfigurationException(
+        "xi has " + std::to_string(xi.nrows()) +
+            " rows but FE Basis element requires " + std::to_string(basis.dimension()) +
+            " reference coordinates",
+        __FILE__, __LINE__, __func__);
+  }
+
+  fe::math::Vector<fe::Real, 3> point{};
+  for (int d = 0; d < basis.dimension(); ++d) {
+    point[static_cast<std::size_t>(d)] = xi(d, g);
+  }
+  return point;
+}
+
+void copy_basis_values_to_solver_arrays(consts::ElementType eType,
+                                        const int eNoN,
+                                        const int g,
+                                        const std::vector<fe::Real>& values,
+                                        const std::vector<febasis::Gradient>& gradients,
+                                        Array<double>& N,
+                                        Array3<double>& Nx)
+{
+  if (values.size() != static_cast<std::size_t>(eNoN)) {
+    throw febasis::BasisEvaluationException(
+        "FE Basis value count " + std::to_string(values.size()) +
+            " does not match solver eNoN " + std::to_string(eNoN),
+        __FILE__, __LINE__, __func__);
+  }
+  if (gradients.size() != static_cast<std::size_t>(eNoN)) {
+    throw febasis::BasisEvaluationException(
+        "FE Basis gradient count " + std::to_string(gradients.size()) +
+            " does not match solver eNoN " + std::to_string(eNoN),
+        __FILE__, __LINE__, __func__);
+  }
+
+  for (int a = 0; a < eNoN; ++a) {
+    const auto basis_index = basis_index_for_solver_node(eType, a);
+    if (basis_index >= values.size() || basis_index >= gradients.size()) {
+      throw febasis::BasisNodeOrderingException(
+          "Solver node " + std::to_string(a) + " maps to FE Basis node " +
+              std::to_string(basis_index) + " outside basis output for " +
+              solver_element_name(eType),
+          __FILE__, __LINE__, __func__);
+    }
+
+    N(a, g) = values[basis_index];
+
+    for (int d = 0; d < Nx.nrows(); ++d) {
+      Nx(d, a, g) = 0.0;
+    }
+    const int ndim = std::min<int>(Nx.nrows(), 3);
+    for (int d = 0; d < ndim; ++d) {
+      Nx(d, a, g) = gradients[basis_index][static_cast<std::size_t>(d)];
+    }
+  }
+}
+
+void evaluate_basis_values_and_gradients(const int insd,
+                                         consts::ElementType eType,
+                                         const int eNoN,
+                                         const int g,
+                                         Array<double>& xi,
+                                         Array<double>& N,
+                                         Array3<double>& Nx)
+{
+  auto basis = make_basis_for_solver_element(eType);
+  if (insd < basis->dimension()) {
+    throw febasis::BasisConfigurationException(
+        "solver insd " + std::to_string(insd) +
+            " is smaller than FE Basis reference dimension " + std::to_string(basis->dimension()),
+        __FILE__, __LINE__, __func__);
+  }
+
+  const auto point = make_basis_point(*basis, g, xi);
+  std::vector<fe::Real> values;
+  std::vector<febasis::Gradient> gradients;
+  basis->evaluate_values(point, values);
+  basis->evaluate_gradients(point, gradients);
+
+  // FE Basis owns the formulas; fsType and mshType remain the solver-facing storage contract.
+  copy_basis_values_to_solver_arrays(eType, eNoN, g, values, gradients, N, Nx);
+}
+
+void evaluate_face_basis_values_and_gradients(const int gaus_pt, faceType& face)
+{
+  evaluate_basis_values_and_gradients(
+      face.xi.nrows(),
+      face.eType,
+      face.eNoN,
+      gaus_pt,
+      face.xi,
+      face.N,
+      face.Nx);
+}
+
+int required_nxx_components_for_dimension(const int dimension)
+{
+  switch (dimension) {
+    case 1:
+      return 1;
+    case 2:
+      return 3;
+    case 3:
+      return 6;
+    default:
+      throw febasis::BasisConfigurationException(
+          "Unsupported FE Basis reference dimension " + std::to_string(dimension),
+          __FILE__, __LINE__, __func__);
+  }
+}
+
+void copy_basis_hessians_to_solver_nxx(consts::ElementType eType,
+                                       const int eNoN,
+                                       const int g,
+                                       const int dimension,
+                                       const std::vector<febasis::Hessian>& hessians,
+                                       Array3<double>& Nxx)
+{
+  if (hessians.size() != static_cast<std::size_t>(eNoN)) {
+    throw febasis::BasisEvaluationException(
+        "FE Basis Hessian count " + std::to_string(hessians.size()) +
+            " does not match solver eNoN " + std::to_string(eNoN),
+        __FILE__, __LINE__, __func__);
+  }
+
+  const int required_components = required_nxx_components_for_dimension(dimension);
+  if (Nxx.nrows() < required_components) {
+    throw febasis::BasisConfigurationException(
+        "solver Nxx has " + std::to_string(Nxx.nrows()) +
+            " rows but FE Basis Hessian packing requires " + std::to_string(required_components),
+        __FILE__, __LINE__, __func__);
+  }
+
+  for (int a = 0; a < eNoN; ++a) {
+    for (int i = 0; i < Nxx.nrows(); ++i) {
+      Nxx(i, a, g) = 0.0;
+    }
+
+    const auto basis_index = basis_index_for_solver_node(eType, a);
+    if (basis_index >= hessians.size()) {
+      throw febasis::BasisNodeOrderingException(
+          "Solver node " + std::to_string(a) + " maps to FE Basis Hessian node " +
+              std::to_string(basis_index) + " outside basis output for " +
+              solver_element_name(eType),
+          __FILE__, __LINE__, __func__);
+    }
+
+    const auto& hessian = hessians[basis_index];
+    Nxx(0, a, g) = hessian(0, 0);
+    if (dimension >= 2) {
+      Nxx(1, a, g) = hessian(1, 1);
+      Nxx(2, a, g) = hessian(0, 1);
+    }
+    if (dimension >= 3) {
+      Nxx(2, a, g) = hessian(2, 2);
+      Nxx(3, a, g) = hessian(0, 1);
+      Nxx(4, a, g) = hessian(1, 2);
+      Nxx(5, a, g) = hessian(0, 2);
+    }
+  }
+}
+
+void evaluate_basis_hessians(const int insd,
+                             const int ind2,
+                             consts::ElementType eType,
+                             const int eNoN,
+                             const int gaus_pt,
+                             const Array<double>& xi,
+                             Array3<double>& Nxx)
+{
+  auto basis = make_basis_for_solver_element(eType);
+  if (insd < basis->dimension()) {
+    throw febasis::BasisConfigurationException(
+        "solver insd " + std::to_string(insd) +
+            " is smaller than FE Basis reference dimension " + std::to_string(basis->dimension()),
+        __FILE__, __LINE__, __func__);
+  }
+
+  const int required_components = required_nxx_components_for_dimension(basis->dimension());
+  if (ind2 < required_components) {
+    throw febasis::BasisConfigurationException(
+        "solver ind2 " + std::to_string(ind2) +
+            " is smaller than packed Hessian component count " + std::to_string(required_components),
+        __FILE__, __LINE__, __func__);
+  }
+
+  const auto point = make_basis_point(*basis, gaus_pt, xi);
+  std::vector<febasis::Hessian> hessians;
+  basis->evaluate_hessians(point, hessians);
+
+  // Solver Nxx packing is dxx, dyy, dxy in 2D and dxx, dyy, dzz, dxy, dyz, dxz in 3D.
+  copy_basis_hessians_to_solver_nxx(eType, eNoN, gaus_pt, basis->dimension(), hessians, Nxx);
+}
+
+void call_legacy_get_gnn(const int insd,
+                         consts::ElementType eType,
+                         const int eNoN,
+                         const int g,
+                         Array<double>& xi,
+                         Array<double>& N,
+                         Array3<double>& Nx,
+                         const std::string& basis_failure = "")
+{
+  try {
+    get_element_shape_data[eType](insd, eNoN, g, xi, N, Nx);
+  } catch (const std::bad_function_call&) {
+    std::string message = "[get_gnn] No FE Basis or legacy shape support for element " +
+        solver_element_name(eType) + "; legacy fallback was attempted";
+    if (!basis_failure.empty()) {
+      message += " after FE Basis failure: " + basis_failure;
+    }
+    throw fe::InvalidElementException(message, solver_element_name(eType),
+        __FILE__, __LINE__, __func__);
+  }
+}
+
+void call_legacy_get_gn_nxx(const int insd,
+                            const int ind2,
+                            consts::ElementType eType,
+                            const int eNoN,
+                            const int gaus_pt,
+                            const Array<double>& xi,
+                            Array3<double>& Nxx,
+                            const std::string& basis_failure = "",
+                            const bool allow_missing_legacy_table = false)
+{
+  try {
+    get_element_2nd_derivs[eType](insd, ind2, eNoN, gaus_pt, xi, Nxx);
+  } catch (const std::bad_function_call&) {
+    if (allow_missing_legacy_table) {
+      return;
+    }
+
+    std::string message = "[get_gn_nxx] No FE Basis or legacy second-derivative support for element " +
+        solver_element_name(eType) + "; legacy fallback was attempted";
+    if (!basis_failure.empty()) {
+      message += " after FE Basis failure: " + basis_failure;
+    }
+    throw fe::InvalidElementException(message, solver_element_name(eType),
+        __FILE__, __LINE__, __func__);
+  }
+}
+
+void call_legacy_face_shape_data(const int gaus_pt, faceType& face)
+{
+  auto legacy_shape = set_face_shape_data.find(face.eType);
+  if (legacy_shape == set_face_shape_data.end()) {
+    throw fe::InvalidElementException(
+        "[get_gnn(face)] No FE Basis or legacy face shape support",
+        solver_element_name(face.eType), __FILE__, __LINE__, __func__);
+  }
+
+  legacy_shape->second(gaus_pt, face);
+}
+
+} // namespace
+
 void get_gip(const int insd, consts::ElementType eType, const int nG, Vector<double>& w, Array<double>& xi) 
 {
+  log_basis_mode_once();
+
   try {
     get_element_gauss_int_data[eType](insd, nG, w, xi);
   } catch (const std::bad_function_call& exception) {
-    throw std::runtime_error("No support for element etype " + std::to_string(static_cast<int>(eType)) + 
-        " in 'get_element_gauss_int_data'.");
+    throw fe::InvalidElementException(
+        "No support in 'get_element_gauss_int_data'",
+        solver_element_name(eType), __FILE__, __LINE__, __func__);
   }
 }
 
@@ -62,19 +572,27 @@ void get_gip(const int insd, consts::ElementType eType, const int nG, Vector<dou
 //
 void get_gip(mshType& mesh)
 {
+  log_basis_mode_once();
+
   try {
     set_element_gauss_int_data[mesh.eType](mesh);
   } catch (const std::bad_function_call& exception) {
-    throw std::runtime_error("No support for mesh etype " + std::to_string(static_cast<int>(mesh.eType)) + " in 'set_element_gauss_int_data'.");
+    throw fe::InvalidElementException(
+        "No support in 'set_element_gauss_int_data'",
+        solver_element_name(mesh.eType), __FILE__, __LINE__, __func__);
   }
 }
 
 void get_gip(Simulation* simulation, faceType& face)
 {
+  log_basis_mode_once();
+
   try {
     set_face_gauss_int_data[face.eType](face);
   } catch (const std::bad_function_call& exception) {
-    throw std::runtime_error("No support for face type " + std::to_string(static_cast<int>(face.eType)) + " in 'set_face_gauss_int_data'.");
+    throw fe::InvalidElementException(
+        "No support in 'set_face_gauss_int_data'",
+        solver_element_name(face.eType), __FILE__, __LINE__, __func__);
   }
 }
 
@@ -83,11 +601,26 @@ void get_gip(Simulation* simulation, faceType& face)
 void get_gnn(const int insd, consts::ElementType eType, const int eNoN, const int g, Array<double>& xi, 
     Array<double>& N, Array3<double>& Nx)
 {
-  try {
-    get_element_shape_data[eType](insd, eNoN, g, xi, N, Nx);
-  } catch (const std::bad_function_call& exception) {
-    throw std::runtime_error("[get_gnn] No support for element type " + std::to_string(static_cast<int>(eType)) + " in 'get_element_shape_data'.");
+  log_basis_mode_once();
+
+  if (use_basis_adapter_for(eType)) {
+    try {
+      evaluate_basis_values_and_gradients(insd, eType, eNoN, g, xi, N, Nx);
+      return;
+    } catch (const fe::NotImplementedException& exception) {
+      call_legacy_get_gnn(insd, eType, eNoN, g, xi, N, Nx, exception.what());
+      return;
+    } catch (const std::exception& exception) {
+      throw febasis::BasisEvaluationException(
+          "[get_gnn] FE Basis adapter failed for element " +
+              solver_element_name(eType) +
+              "; legacy fallback was not attempted for this approved element: " +
+              exception.what(),
+          __FILE__, __LINE__, __func__);
+    }
   }
+
+  call_legacy_get_gnn(insd, eType, eNoN, g, xi, N, Nx);
 }
 
 /// @brief A big fat hack because the Fortran GETNN() operates on primitive types but
@@ -111,20 +644,48 @@ void get_gnn(const int nsd, consts::ElementType eType, const int eNoN, Vector<do
 
 void get_gnn(int gaus_pt, mshType& mesh)
 {
-  try {
-    set_element_shape_data[mesh.eType](gaus_pt, mesh);
-  } catch (const std::bad_function_call& exception) {
-    throw std::runtime_error("[get_gnn] No support for element type " + std::to_string(static_cast<int>(mesh.eType)) + " in 'set_element_shape_data'.");
-  }
+  nn::get_gnn(mesh.xi.nrows(), mesh.eType, mesh.eNoN, gaus_pt, mesh.xi, mesh.N, mesh.Nx);
 }
 
 void get_gnn(Simulation* simulation, int gaus_pt, faceType& face)
 {
-  try {
-    set_face_shape_data[face.eType](gaus_pt, face);
-  } catch (const std::bad_function_call& exception) {
-    throw std::runtime_error("No support for face type " + std::to_string(static_cast<int>(face.eType)) + " in 'set_face_shape_data'.");
+  using consts::ElementType;
+
+  log_basis_mode_once();
+
+  if (active_basis_mode() == BasisMode::Legacy) {
+    call_legacy_face_shape_data(gaus_pt, face);
+    return;
+  }
+
+  if (face.eType == ElementType::NRB) {
+    throw fe::NotImplementedException(
+        "[get_gnn(face)] NRB face shape functions remain unsupported by FE Basis and the legacy face table",
+        __FILE__, __LINE__, __func__);
+  }
+
+  if (supports_face_basis_adapter_for(face.eType)) {
+    try {
+      // FE Basis owns mapped face N/Nx formulas; faceType remains the solver-facing storage contract.
+      evaluate_face_basis_values_and_gradients(gaus_pt, face);
+      return;
+    } catch (const std::exception& exception) {
+      throw febasis::BasisEvaluationException(
+          "[get_gnn(face)] FE Basis face adapter failed for mapped face element " +
+              solver_element_name(face.eType) + "; legacy fallback was not attempted: " +
+              exception.what(),
+          __FILE__, __LINE__, __func__);
+    }
   }
+
+  if (face.eType == ElementType::PNT) {
+    // Point faces have no mapped FE Basis representation in this pass; keep the legacy scalar value path.
+    call_legacy_face_shape_data(gaus_pt, face);
+    return;
+  }
+
+  // The legacy face table is retained only for explicitly unsupported paths and future cleanup.
+  call_legacy_face_shape_data(gaus_pt, face);
 }
 
 /// @brief Returns second order derivatives at given natural coords
@@ -136,19 +697,40 @@ void get_gn_nxx(const int insd, const int ind2, consts::ElementType eType, const
 {
   using namespace consts;
 
-  // Element types that don't have 2nd derivatives computed for them.
-  static std::set<ElementType> no_derivs{ElementType::NRB, ElementType::QUD4, ElementType::HEX8, 
-                                         ElementType::HEX20, ElementType::HEX27};
+  log_basis_mode_once();
 
-  if (no_derivs.count(eType) != 0) {
+  // NRB/PNT and face-only Hessian paths remain intentionally unsupported here.
+  if (eType == ElementType::NRB || eType == ElementType::PNT) {
     return;
   }
 
-  try {
-    get_element_2nd_derivs[eType](insd, ind2, eNoN, gaus_pt, xi, Nxx);
-  } catch (const std::bad_function_call& exception) {
-    throw std::runtime_error("[get_gn_nxx] No support for element type " + std::to_string(static_cast<int>(eType)) + " in 'get_element_2nd_derivs'.");
+  if (active_basis_mode() == BasisMode::Legacy) {
+    call_legacy_get_gn_nxx(
+        insd, ind2, eType, eNoN, gaus_pt, xi, Nxx, "", true);
+    return;
   }
+
+  if (supports_basis_hessian_adapter_for(eType)) {
+    try {
+      evaluate_basis_hessians(insd, ind2, eType, eNoN, gaus_pt, xi, Nxx);
+      return;
+    } catch (const fe::NotImplementedException& exception) {
+      throw fe::NotImplementedException(
+          "[get_gn_nxx] FE Basis Hessian support is required for mapped volume element " +
+              solver_element_name(eType) + " but is not implemented: " + exception.what(),
+          __FILE__, __LINE__, __func__);
+    } catch (const std::exception& exception) {
+      throw febasis::BasisEvaluationException(
+          "[get_gn_nxx] FE Basis Hessian adapter failed for element " +
+              solver_element_name(eType) +
+              "; legacy fallback was not attempted for this approved element: " +
+              exception.what(),
+          __FILE__, __LINE__, __func__);
+    }
+  }
+
+  // Legacy Hessian tables are reserved for intentionally unsupported families.
+  call_legacy_get_gn_nxx(insd, ind2, eType, eNoN, gaus_pt, xi, Nxx);
 }
 
 /// @brief Sets bounds on Gauss integration points in parametric space and
@@ -333,7 +915,9 @@ void get_nnx(const int nsd, const consts::ElementType eType, const int eNoN, con
   l1 = (l1 && l2 && l3 && l4);
 
   if (!l1) {
-    throw std::runtime_error("Error in computing shape functions");
+    throw fe::InvalidArgumentException(
+        "Error in computing shape functions",
+        __FILE__, __LINE__, __func__);
   }
 }
 
@@ -582,8 +1166,11 @@ void gnnb(const ComMod& com_mod, const faceType& lFa, const int e, const int g,
     }
 
     if (!found_node) {
-      throw std::runtime_error("[svMultiPhysics::gnnb] ERROR: The '" + lFa.name + "' face node " + std::to_string(Ac) + 
-          " could not be matched to a node in the '" + msh.name + "' volume mesh.");
+      throw fe::InvalidArgumentException(
+          "[svMultiPhysics::gnnb] ERROR: The '" + lFa.name + "' face node " +
+              std::to_string(Ac) + " could not be matched to a node in the '" +
+              msh.name + "' volume mesh.",
+          __FILE__, __LINE__, __func__);
     }
 
     ptr(a) = b;
@@ -632,7 +1219,9 @@ void gnnb(const ComMod& com_mod, const faceType& lFa, const int e, const int g,
           }
           break;
         default:
-          throw std::runtime_error("gnnb: invalid MechanicalConfigurationType provided");
+          throw fe::InvalidArgumentException(
+              "gnnb: invalid MechanicalConfigurationType provided",
+              __FILE__, __LINE__, __func__);
       }
     }
   }
@@ -821,7 +1410,8 @@ void gn_nxx(const int l, const int eNoN, const int nsd, const int insd, Array<do
     dgesv_(&l, &eNoN, K.data(), &l, IPIV.data(), B.data(), &l, &INFO);
 
     if (INFO != 0) {
-      throw std::runtime_error("[gn_nxx] Error in Lapack");
+      throw fe::BackendException("[gn_nxx] Error in Lapack", "LAPACK dgesv", INFO,
+          __FILE__, __LINE__, __func__);
     }
 
     Nxx = B;
@@ -892,7 +1482,8 @@ void gn_nxx(const int l, const int eNoN, const int nsd, const int insd, Array<do
     dgesv_(&l, &eNoN, K.data(), &l, IPIV.data(), B.data(), &l, &INFO);
 
     if (INFO != 0) {
-      throw std::runtime_error("[gn_nxx] Error in Lapack");
+      throw fe::BackendException("[gn_nxx] Error in Lapack", "LAPACK dgesv", INFO,
+          __FILE__, __LINE__, __func__);
     }
 
     Nxx = B;
@@ -940,8 +1531,10 @@ void select_ele(const ComMod& com_mod, mshType& mesh)
       set_1d_element_props[mesh.eNoN](insd, mesh);
     }
   } catch (const std::bad_function_call& exception) {
-      throw std::runtime_error("[select_ele] No support for " + std::to_string(mesh.eNoN) + " noded " + 
-          std::to_string(insd) + "D elements.");
+      throw fe::InvalidElementException(
+          "[select_ele] No support for " + std::to_string(mesh.eNoN) +
+              " noded " + std::to_string(insd) + "D elements.",
+          solver_element_name(mesh.eType), __FILE__, __LINE__, __func__);
   }
 
   // Set mesh 'w' and 'xi' arrays used for Gauss integration.
@@ -997,8 +1590,10 @@ void select_eleb(Simulation* simulation, mshType& mesh, faceType& face)
   try {
     set_face_element_props[face.eNoN](insd, face);
   } catch (const std::bad_function_call& exception) {
-    throw std::runtime_error("No support for " + std::to_string(face.eNoN) + " noded " +
-      std::to_string(insd) + "D elements in 'set_face_element_props'.");
+    throw fe::InvalidElementException(
+        "No support for " + std::to_string(face.eNoN) + " noded " +
+            std::to_string(insd) + "D elements in 'set_face_element_props'.",
+        solver_element_name(face.eType), __FILE__, __LINE__, __func__);
   }
 
   // Set face 'w' and 'xi' arrays used for Gauss integration.
@@ -1015,4 +1610,3 @@ void select_eleb(Simulation* simulation, mshType& mesh, faceType& face)
 }
 
 };
-
diff --git a/tests/unitTests/FE/Basis/test_BasisCacheFactory.cpp b/tests/unitTests/FE/Basis/test_BasisCacheFactory.cpp
new file mode 100644
index 000000000..216fd0401
--- /dev/null
+++ b/tests/unitTests/FE/Basis/test_BasisCacheFactory.cpp
@@ -0,0 +1,256 @@
+/**
+ * @file test_BasisCacheFactory.cpp
+ * @brief Tests for the migrated Basis cache and factory subset.
+ */
+
+#include <gtest/gtest.h>
+
+#include "FE/Basis/BasisCache.h"
+#include "FE/Basis/BasisFactory.h"
+#include "FE/Basis/LagrangeBasis.h"
+#include "FE/Basis/SerendipityBasis.h"
+#include "FE/Quadrature/QuadratureRule.h"
+
+#include <memory>
+#include <vector>
+
+using namespace svmp::FE;
+using namespace svmp::FE::basis;
+using namespace svmp::FE::quadrature;
+
+namespace {
+
+class CustomQuadratureRule final : public QuadratureRule {
+public:
+    CustomQuadratureRule(svmp::CellFamily family,
+                         int dimension,
+                         int order,
+                         std::vector<QuadPoint> points,
+                         std::vector<Real> weights)
+        : QuadratureRule(family, dimension, order)
+    {
+        set_data(std::move(points), std::move(weights));
+    }
+};
+
+CustomQuadratureRule line_rule() {
+    return CustomQuadratureRule(
+        svmp::CellFamily::Line, 1, 3,
+        {
+            QuadPoint{Real(-0.5), Real(0), Real(0)},
+            QuadPoint{Real(0.5), Real(0), Real(0)}
+        },
+        {Real(1), Real(1)});
+}
+
+CustomQuadratureRule quad_rule(Real first_weight = Real(1)) {
+    return CustomQuadratureRule(
+        svmp::CellFamily::Quad, 2, 3,
+        {
+            QuadPoint{Real(-0.5), Real(-0.5), Real(0)},
+            QuadPoint{Real(0.5), Real(-0.25), Real(0)},
+            QuadPoint{Real(0.0), Real(0.5), Real(0)}
+        },
+        {first_weight, Real(1), Real(2)});
+}
+
+class TestCustomScalarBasis final : public BasisFunction {
+public:
+    explicit TestCustomScalarBasis(int tag)
+        : tag_(tag)
+    {
+    }
+
+    BasisType basis_type() const noexcept override { return BasisType::Custom; }
+    ElementType element_type() const noexcept override { return ElementType::Line2; }
+    int dimension() const noexcept override { return 1; }
+    int order() const noexcept override { return 1; }
+    std::size_t size() const noexcept override { return 2u; }
+
+    std::string cache_identity() const override {
+        return BasisFunction::cache_identity() + "|tag=" + std::to_string(tag_);
+    }
+
+    void evaluate_values(const math::Vector<Real, 3>& xi,
+                         std::vector<Real>& values) const override
+    {
+        values.resize(2u);
+        const Real shift = Real(tag_) * Real(0.125);
+        values[0] = Real(0.5) * (Real(1) - xi[0]) + shift;
+        values[1] = Real(0.5) * (Real(1) + xi[0]) - shift;
+    }
+
+    void evaluate_gradients(const math::Vector<Real, 3>&,
+                            std::vector<Gradient>& gradients) const override
+    {
+        gradients.assign(2u, Gradient{});
+        gradients[0][0] = Real(-0.5);
+        gradients[1][0] = Real(0.5);
+    }
+
+private:
+    int tag_{0};
+};
+
+class StructuredIdentityScalarBasis final : public BasisFunction {
+public:
+    explicit StructuredIdentityScalarBasis(int tag)
+        : tag_(tag)
+    {
+    }
+
+    BasisType basis_type() const noexcept override { return BasisType::Custom; }
+    ElementType element_type() const noexcept override { return ElementType::Line2; }
+    int dimension() const noexcept override { return 1; }
+    int order() const noexcept override { return 1; }
+    std::size_t size() const noexcept override { return 2u; }
+
+    bool cache_identity_words(std::vector<std::uint64_t>& words) const override {
+        words.push_back(0x7374727563746964ULL);
+        words.push_back(static_cast<std::uint64_t>(tag_));
+        return true;
+    }
+
+    std::string cache_identity() const override {
+        ++string_identity_calls;
+        return BasisFunction::cache_identity() + "|structured-tag=" + std::to_string(tag_);
+    }
+
+    void evaluate_values(const math::Vector<Real, 3>& xi,
+                         std::vector<Real>& values) const override
+    {
+        values.resize(2u);
+        values[0] = Real(1) - xi[0] + Real(tag_);
+        values[1] = xi[0] - Real(tag_);
+    }
+
+    mutable std::size_t string_identity_calls{0};
+
+private:
+    int tag_{0};
+};
+
+} // namespace
+
+TEST(BasisFactory, CreatesLagrangeAndSerendipityBases) {
+    auto lagrange = basis_factory::create(
+        BasisRequest{ElementType::Line2, BasisType::Lagrange, 2});
+    ASSERT_NE(lagrange, nullptr);
+    EXPECT_EQ(lagrange->basis_type(), BasisType::Lagrange);
+    EXPECT_EQ(lagrange->element_type(), ElementType::Line2);
+    EXPECT_EQ(lagrange->order(), 2);
+
+    auto serendipity = basis_factory::create(
+        BasisRequest{ElementType::Quad8, BasisType::Serendipity, 2});
+    ASSERT_NE(serendipity, nullptr);
+    EXPECT_EQ(serendipity->basis_type(), BasisType::Serendipity);
+    EXPECT_EQ(serendipity->element_type(), ElementType::Quad8);
+    EXPECT_EQ(serendipity->size(), 8u);
+}
+
+TEST(BasisFactory, RejectsOutOfScopeAndInvalidRequests) {
+    EXPECT_THROW(
+        (void)basis_factory::create(BasisRequest{ElementType::Line2, BasisType::Lagrange}),
+        BasisConfigurationException);
+    EXPECT_THROW(
+        (void)basis_factory::create(
+            BasisRequest{ElementType::Line2, BasisType::Lagrange, -1}),
+        BasisConfigurationException);
+    EXPECT_THROW(
+        (void)basis_factory::create(
+            BasisRequest{ElementType::Line2, BasisType::Bernstein, 1}),
+        BasisConfigurationException);
+    EXPECT_THROW(
+        (void)basis_factory::create(
+            BasisRequest{ElementType::Line2,
+                         BasisType::Lagrange,
+                         1,
+                         Continuity::H_div,
+                         FieldType::Vector}),
+        BasisConfigurationException);
+}
+
+TEST(BasisFactory, SupportsCustomFactoryRegistration) {
+    basis_factory::clear_custom_registry_for_tests();
+    basis_factory::register_custom(
+        "test-custom",
+        [](const BasisRequest& req) {
+            const int tag = req.order.value_or(0);
+            return std::make_shared<TestCustomScalarBasis>(tag);
+        });
+
+    BasisRequest req{ElementType::Line2, BasisType::Custom, 7};
+    req.custom_id = "test-custom";
+    auto custom = basis_factory::create(req);
+    ASSERT_NE(custom, nullptr);
+    EXPECT_EQ(custom->basis_type(), BasisType::Custom);
+    EXPECT_EQ(custom->size(), 2u);
+
+    basis_factory::unregister_custom("test-custom");
+    EXPECT_THROW((void)basis_factory::create(req), BasisConfigurationException);
+    basis_factory::clear_custom_registry_for_tests();
+}
+
+TEST(BasisCache, ReusesEntriesForSameBasisAndQuadratureCoordinates) {
+    LagrangeBasis basis(ElementType::Line2, 2);
+    const auto quad = line_rule();
+
+    auto& cache = BasisCache::instance();
+    cache.clear();
+    const auto& entry1 = cache.get_or_compute(basis, quad, true, true);
+    const auto& entry2 = cache.get_or_compute(basis, quad, true, true);
+
+    EXPECT_EQ(&entry1, &entry2);
+    EXPECT_EQ(entry1.num_qpts, quad.num_points());
+    EXPECT_EQ(entry1.num_dofs, basis.size());
+    ASSERT_EQ(entry1.scalar_values.size(), basis.size() * quad.num_points());
+    ASSERT_EQ(entry1.gradients.size(), basis.size() * 3u * quad.num_points());
+    ASSERT_EQ(entry1.hessians.size(), basis.size() * 9u * quad.num_points());
+    EXPECT_EQ(cache.size(), 1u);
+}
+
+TEST(BasisCache, ReusesCoordinateIdenticalQuadratureRulesIgnoringWeights) {
+    SerendipityBasis basis(ElementType::Quad8, 2);
+    const auto quad_a = quad_rule(Real(1));
+    const auto quad_b = quad_rule(Real(0.25));
+
+    auto& cache = BasisCache::instance();
+    cache.clear();
+    const auto& entry_a = cache.get_or_compute(basis, quad_a, true, false);
+    const auto& entry_b = cache.get_or_compute(basis, quad_b, true, false);
+
+    EXPECT_EQ(&entry_a, &entry_b);
+    EXPECT_EQ(cache.size(), 1u);
+}
+
+TEST(BasisCache, SeparatesStringIdentityCustomBases) {
+    TestCustomScalarBasis custom_a(1);
+    TestCustomScalarBasis custom_b(2);
+    const auto quad = line_rule();
+
+    auto& cache = BasisCache::instance();
+    cache.clear();
+    const auto& entry_a = cache.get_or_compute(custom_a, quad, false, false);
+    const auto& entry_b = cache.get_or_compute(custom_b, quad, false, false);
+
+    EXPECT_NE(&entry_a, &entry_b);
+    EXPECT_NE(entry_a.scalar_values, entry_b.scalar_values);
+    EXPECT_EQ(cache.size(), 2u);
+}
+
+TEST(BasisCache, StructuredIdentityAvoidsStringFallbackAndSeparatesBases) {
+    StructuredIdentityScalarBasis custom_a(1);
+    StructuredIdentityScalarBasis custom_b(2);
+    const auto quad = line_rule();
+
+    auto& cache = BasisCache::instance();
+    cache.clear();
+    const auto& entry_a = cache.get_or_compute(custom_a, quad, false, false);
+    const auto& entry_b = cache.get_or_compute(custom_b, quad, false, false);
+
+    EXPECT_NE(&entry_a, &entry_b);
+    EXPECT_EQ(custom_a.string_identity_calls, 0u);
+    EXPECT_EQ(custom_b.string_identity_calls, 0u);
+    EXPECT_EQ(cache.size(), 2u);
+}
+
diff --git a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
new file mode 100644
index 000000000..967f078aa
--- /dev/null
+++ b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
@@ -0,0 +1,203 @@
+/**
+ * @file test_BasisErrorPaths.cpp
+ * @brief Error-path coverage for the migrated Lagrange-focused Basis subset.
+ */
+
+#include <gtest/gtest.h>
+
+#include "FE/Basis/BasisExceptions.h"
+#include "FE/Basis/BasisFactory.h"
+#include "FE/Basis/BasisFunction.h"
+#include "FE/Basis/LagrangeBasis.h"
+#include "FE/Basis/NodeOrderingConventions.h"
+#include "FE/Basis/SerendipityBasis.h"
+
+#include <vector>
+
+using namespace svmp::FE;
+using namespace svmp::FE::basis;
+
+namespace {
+
+class MinimalScalarBasis : public BasisFunction {
+public:
+    BasisType basis_type() const noexcept override { return BasisType::Custom; }
+    ElementType element_type() const noexcept override { return ElementType::Line2; }
+    int dimension() const noexcept override { return 1; }
+    int order() const noexcept override { return 1; }
+    std::size_t size() const noexcept override { return 2u; }
+
+    void evaluate_values(const math::Vector<Real, 3>&,
+                         std::vector<Real>& values) const override
+    {
+        values.assign(size(), Real(0));
+    }
+};
+
+class CompleteFallbackBasis : public BasisFunction {
+public:
+    BasisType basis_type() const noexcept override { return BasisType::Custom; }
+    ElementType element_type() const noexcept override { return ElementType::Triangle3; }
+    int dimension() const noexcept override { return 2; }
+    int order() const noexcept override { return 1; }
+    std::size_t size() const noexcept override { return 2u; }
+
+    void evaluate_values(const math::Vector<Real, 3>& xi,
+                         std::vector<Real>& values) const override
+    {
+        values.resize(size());
+        values[0] = Real(1) + xi[0];
+        values[1] = Real(2) + xi[1];
+    }
+
+    void evaluate_gradients(const math::Vector<Real, 3>&,
+                            std::vector<Gradient>& gradients) const override
+    {
+        gradients.assign(size(), Gradient{});
+        gradients[0][0] = Real(1);
+        gradients[1][1] = Real(1);
+    }
+
+    void evaluate_hessians(const math::Vector<Real, 3>& xi,
+                           std::vector<Hessian>& hessians) const override
+    {
+        hessians.assign(size(), Hessian{});
+        for (std::size_t d = 0; d < hessians.size(); ++d) {
+            for (std::size_t r = 0; r < 3u; ++r) {
+                for (std::size_t c = 0; c < 3u; ++c) {
+                    hessians[d](r, c) = Real(100) * static_cast<Real>(d + 1u) +
+                                        Real(10) * static_cast<Real>(r) +
+                                        static_cast<Real>(c) + xi[2];
+                }
+            }
+        }
+    }
+};
+
+} // namespace
+
+TEST(BasisErrorPaths, LagrangeInvalidRequestsThrowBasisExceptions) {
+    EXPECT_THROW(LagrangeBasis(ElementType::Unknown, 1),
+                 BasisElementCompatibilityException);
+    EXPECT_THROW(LagrangeBasis(ElementType::Line2, -1),
+                 BasisConfigurationException);
+    EXPECT_THROW(LagrangeBasis(ElementType::Quad8, 2),
+                 BasisElementCompatibilityException);
+}
+
+TEST(BasisErrorPaths, SerendipityInvalidRequestsThrowBasisExceptions) {
+    EXPECT_THROW(SerendipityBasis(ElementType::Unknown, 2),
+                 BasisElementCompatibilityException);
+    EXPECT_THROW(SerendipityBasis(ElementType::Quad8, 3),
+                 BasisConfigurationException);
+    EXPECT_THROW(SerendipityBasis(ElementType::Pyramid14, 2),
+                 BasisElementCompatibilityException);
+}
+
+TEST(BasisErrorPaths, BasisFactoryInvalidRequestsThrowBasisExceptions) {
+    EXPECT_THROW((void)basis_factory::create(
+                     BasisRequest{ElementType::Line2, BasisType::Lagrange}),
+                 BasisConfigurationException);
+    EXPECT_THROW((void)basis_factory::create(
+                     BasisRequest{ElementType::Line2, BasisType::Lagrange, -1}),
+                 BasisConfigurationException);
+    EXPECT_THROW((void)basis_factory::create(
+                     BasisRequest{ElementType::Line2, BasisType::Bernstein, 1}),
+                 BasisConfigurationException);
+
+    auto serendipity = basis_factory::create(
+        BasisRequest{ElementType::Quad8, BasisType::Serendipity, 2});
+    ASSERT_NE(serendipity, nullptr);
+    EXPECT_EQ(serendipity->basis_type(), BasisType::Serendipity);
+}
+
+TEST(BasisErrorPaths, BasisExceptionsUseCommonStatusCodes) {
+    try {
+        throw BasisConfigurationException("invalid config", __FILE__, __LINE__, __func__);
+    } catch (const FEException& e) {
+        EXPECT_EQ(e.status(), svmp::StatusCode::InvalidArgument);
+    }
+
+    try {
+        throw BasisConstructionException("construction failure", __FILE__, __LINE__, __func__);
+    } catch (const FEException& e) {
+        EXPECT_EQ(e.status(), svmp::StatusCode::InternalError);
+    }
+}
+
+TEST(BasisErrorPaths, NodeOrderingInvalidNodeThrows) {
+    EXPECT_THROW((void)ReferenceNodeLayout::get_node_coords(ElementType::Quad8, 99u),
+                 BasisNodeOrderingException);
+    EXPECT_THROW((void)ReferenceNodeLayout::get_lagrange_node_coords(ElementType::Quad8, 2),
+                 BasisNodeOrderingException);
+}
+
+TEST(BasisErrorPaths, BasisFunctionDefaultsThrowForMissingDerivatives) {
+    MinimalScalarBasis basis;
+    const math::Vector<Real, 3> xi{Real(0), Real(0), Real(0)};
+    std::vector<Gradient> gradients;
+    std::vector<Hessian> hessians;
+
+    EXPECT_THROW(basis.evaluate_gradients(xi, gradients), BasisEvaluationException);
+    EXPECT_THROW(basis.evaluate_hessians(xi, hessians), BasisEvaluationException);
+}
+
+TEST(BasisErrorPaths, BasisFunctionFallbackWritesFlatAndStridedLayouts) {
+    CompleteFallbackBasis basis;
+    const std::vector<math::Vector<Real, 3>> points = {
+        {Real(0.25), Real(0.5), Real(-0.25)},
+        {Real(-0.5), Real(0.75), Real(0.125)}
+    };
+    prewarm_basis_function_scratch(basis.size(), points.size());
+
+    std::vector<Real> flat_values(basis.size());
+    std::vector<Real> flat_gradients(basis.size() * 3u);
+    std::vector<Real> flat_hessians(basis.size() * 9u);
+    basis.evaluate_values_to(points.front(), flat_values.data());
+    basis.evaluate_gradients_to(points.front(), flat_gradients.data());
+    basis.evaluate_hessians_to(points.front(), flat_hessians.data());
+
+    std::vector<Real> expected_values;
+    std::vector<Gradient> expected_gradients;
+    std::vector<Hessian> expected_hessians;
+    basis.evaluate_all(points.front(), expected_values, expected_gradients, expected_hessians);
+    for (std::size_t d = 0; d < basis.size(); ++d) {
+        EXPECT_EQ(flat_values[d], expected_values[d]);
+        for (std::size_t c = 0; c < 3u; ++c) {
+            EXPECT_EQ(flat_gradients[d * 3u + c], expected_gradients[d][c]);
+        }
+        for (std::size_t r = 0; r < 3u; ++r) {
+            for (std::size_t c = 0; c < 3u; ++c) {
+                EXPECT_EQ(flat_hessians[d * 9u + r * 3u + c], expected_hessians[d](r, c));
+            }
+        }
+    }
+
+    constexpr std::size_t output_stride = 3u;
+    std::vector<Real> values(basis.size() * output_stride, Real(-99));
+    std::vector<Real> gradients(basis.size() * 3u * output_stride, Real(-99));
+    std::vector<Real> hessians(basis.size() * 9u * output_stride, Real(-99));
+    basis.evaluate_at_quadrature_points_strided(
+        points, output_stride, values.data(), gradients.data(), hessians.data());
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        basis.evaluate_all(points[q], expected_values, expected_gradients, expected_hessians);
+        for (std::size_t d = 0; d < basis.size(); ++d) {
+            EXPECT_EQ(values[d * output_stride + q], expected_values[d]);
+            for (std::size_t c = 0; c < 3u; ++c) {
+                EXPECT_EQ(gradients[(d * 3u + c) * output_stride + q],
+                          expected_gradients[d][c]);
+            }
+            for (std::size_t r = 0; r < 3u; ++r) {
+                for (std::size_t c = 0; c < 3u; ++c) {
+                    EXPECT_EQ(hessians[(d * 9u + r * 3u + c) * output_stride + q],
+                              expected_hessians[d](r, c));
+                }
+            }
+        }
+    }
+
+    for (std::size_t d = 0; d < basis.size(); ++d) {
+        EXPECT_EQ(values[d * output_stride + 2u], Real(-99));
+    }
+}
diff --git a/tests/unitTests/FE/Basis/test_BasisHessians.cpp b/tests/unitTests/FE/Basis/test_BasisHessians.cpp
new file mode 100644
index 000000000..0899ce358
--- /dev/null
+++ b/tests/unitTests/FE/Basis/test_BasisHessians.cpp
@@ -0,0 +1,314 @@
+/**
+ * @file test_BasisHessians.cpp
+ * @brief Analytical Hessian coverage for the migrated Lagrange basis.
+ */
+
+#include <gtest/gtest.h>
+
+#include "FE/Basis/BasisFactory.h"
+#include "FE/Basis/LagrangeBasis.h"
+#include "FE/Basis/SerendipityBasis.h"
+
+#include <array>
+#include <limits>
+#include <vector>
+
+using namespace svmp::FE;
+using namespace svmp::FE::basis;
+
+namespace {
+
+void numerical_hessian_helper(const BasisFunction& basis,
+                              const math::Vector<Real, 3>& xi,
+                              std::vector<Hessian>& hessians,
+                              Real eps = Real(1e-5))
+{
+    hessians.assign(basis.size(), Hessian{});
+    const int dim = basis.dimension();
+
+    for (int i = 0; i < dim; ++i) {
+        for (int j = 0; j < dim; ++j) {
+            math::Vector<Real, 3> xi_p = xi;
+            math::Vector<Real, 3> xi_m = xi;
+            const std::size_t sj = static_cast<std::size_t>(j);
+            xi_p[sj] += eps;
+            xi_m[sj] -= eps;
+
+            std::vector<Gradient> g_p;
+            std::vector<Gradient> g_m;
+            basis.evaluate_gradients(xi_p, g_p);
+            basis.evaluate_gradients(xi_m, g_m);
+
+            for (std::size_t n = 0; n < basis.size(); ++n) {
+                const std::size_t si = static_cast<std::size_t>(i);
+                hessians[n](si, sj) = (g_p[n][si] - g_m[n][si]) / (Real(2) * eps);
+            }
+        }
+    }
+}
+
+std::vector<math::Vector<Real, 3>> sample_points_for(ElementType type) {
+    switch (type) {
+        case ElementType::Line2:
+            return {{Real(-0.35), Real(0), Real(0)}, {Real(0.2), Real(0), Real(0)}};
+        case ElementType::Triangle3:
+            return {{Real(0.15), Real(0.2), Real(0)}, {Real(0.25), Real(0.1), Real(0)}};
+        case ElementType::Quad4:
+            return {{Real(0.2), Real(-0.3), Real(0)}, {Real(-0.45), Real(0.25), Real(0)}};
+        case ElementType::Tetra4:
+            return {{Real(0.12), Real(0.18), Real(0.16)}, {Real(0.2), Real(0.1), Real(0.18)}};
+        case ElementType::Hex8:
+            return {{Real(0.1), Real(-0.2), Real(0.3)}, {Real(-0.35), Real(0.25), Real(-0.15)}};
+        case ElementType::Wedge6:
+            return {{Real(0.18), Real(0.22), Real(-0.2)}, {Real(0.12), Real(0.16), Real(0.1)}};
+        case ElementType::Pyramid5:
+            return {{Real(0.0), Real(0.0), Real(0.2)}, {Real(0.12), Real(-0.08), Real(0.24)}};
+        default:
+            return {{Real(0), Real(0), Real(0)}};
+    }
+}
+
+void expect_hessians_match_numerical(const LagrangeBasis& basis,
+                                     const std::vector<math::Vector<Real, 3>>& points,
+                                     Real tol,
+                                     Real eps = Real(1e-5))
+{
+    for (const auto& xi : points) {
+        std::vector<Hessian> analytical;
+        std::vector<Hessian> numerical;
+        basis.evaluate_hessians(xi, analytical);
+        numerical_hessian_helper(basis, xi, numerical, eps);
+
+        ASSERT_EQ(analytical.size(), numerical.size());
+        for (std::size_t n = 0; n < analytical.size(); ++n) {
+            for (int i = 0; i < basis.dimension(); ++i) {
+                for (int j = 0; j < basis.dimension(); ++j) {
+                    const std::size_t si = static_cast<std::size_t>(i);
+                    const std::size_t sj = static_cast<std::size_t>(j);
+                    EXPECT_NEAR(analytical[n](si, sj), numerical[n](si, sj), tol)
+                        << "basis " << n << ", component (" << i << "," << j
+                        << "), element " << static_cast<int>(basis.element_type())
+                        << ", order " << basis.order();
+                }
+            }
+        }
+    }
+}
+
+void expect_partition_hessian_sum_zero(const LagrangeBasis& basis,
+                                       const math::Vector<Real, 3>& xi,
+                                       Real tol)
+{
+    std::vector<Hessian> hessians;
+    basis.evaluate_hessians(xi, hessians);
+
+    Hessian sum{};
+    for (const auto& hessian : hessians) {
+        for (std::size_t r = 0; r < 3u; ++r) {
+            for (std::size_t c = 0; c < 3u; ++c) {
+                sum(r, c) += hessian(r, c);
+            }
+        }
+    }
+
+    for (int r = 0; r < basis.dimension(); ++r) {
+        for (int c = 0; c < basis.dimension(); ++c) {
+            EXPECT_NEAR(sum(static_cast<std::size_t>(r), static_cast<std::size_t>(c)),
+                        Real(0),
+                        tol)
+                << "element " << static_cast<int>(basis.element_type())
+                << ", order " << basis.order();
+        }
+    }
+}
+
+void expect_hessians_symmetric(const LagrangeBasis& basis,
+                               const math::Vector<Real, 3>& xi,
+                               Real tol)
+{
+    std::vector<Hessian> hessians;
+    basis.evaluate_hessians(xi, hessians);
+
+    for (const auto& hessian : hessians) {
+        for (int r = 0; r < basis.dimension(); ++r) {
+            for (int c = r + 1; c < basis.dimension(); ++c) {
+                const std::size_t sr = static_cast<std::size_t>(r);
+                const std::size_t sc = static_cast<std::size_t>(c);
+                EXPECT_NEAR(hessian(sr, sc), hessian(sc, sr), tol);
+            }
+        }
+    }
+}
+
+void expect_partition_hessian_sum_zero(const BasisFunction& basis,
+                                       const math::Vector<Real, 3>& xi,
+                                       Real tol)
+{
+    std::vector<Hessian> hessians;
+    basis.evaluate_hessians(xi, hessians);
+
+    Hessian sum{};
+    for (const auto& hessian : hessians) {
+        for (std::size_t r = 0; r < 3u; ++r) {
+            for (std::size_t c = 0; c < 3u; ++c) {
+                sum(r, c) += hessian(r, c);
+            }
+        }
+    }
+
+    for (int r = 0; r < basis.dimension(); ++r) {
+        for (int c = 0; c < basis.dimension(); ++c) {
+            EXPECT_NEAR(sum(static_cast<std::size_t>(r), static_cast<std::size_t>(c)),
+                        Real(0),
+                        tol)
+                << "element " << static_cast<int>(basis.element_type())
+                << ", order " << basis.order();
+        }
+    }
+}
+
+void expect_hessians_symmetric(const BasisFunction& basis,
+                               const math::Vector<Real, 3>& xi,
+                               Real tol)
+{
+    std::vector<Hessian> hessians;
+    basis.evaluate_hessians(xi, hessians);
+
+    for (const auto& hessian : hessians) {
+        for (int r = 0; r < basis.dimension(); ++r) {
+            for (int c = r + 1; c < basis.dimension(); ++c) {
+                const std::size_t sr = static_cast<std::size_t>(r);
+                const std::size_t sc = static_cast<std::size_t>(c);
+                EXPECT_NEAR(hessian(sr, sc), hessian(sc, sr), tol);
+            }
+        }
+    }
+}
+
+} // namespace
+
+TEST(BasisHessians, LagrangeCanonicalTopologiesMatchNumericalHessians) {
+    const struct Case {
+        ElementType type;
+        int order;
+        Real tol;
+        Real eps;
+    } cases[] = {
+        {ElementType::Line2, 3, Real(1e-7), Real(1e-5)},
+        {ElementType::Triangle3, 3, Real(2e-6), Real(1e-5)},
+        {ElementType::Quad4, 3, Real(1e-6), Real(1e-5)},
+        {ElementType::Tetra4, 2, Real(1e-6), Real(1e-5)},
+        {ElementType::Hex8, 2, Real(1e-6), Real(1e-5)},
+        {ElementType::Wedge6, 2, Real(1e-5), Real(1e-5)},
+        {ElementType::Pyramid5, 1, Real(2e-6), Real(1e-5)},
+        {ElementType::Pyramid5, 3, Real(4e-4), Real(2e-5)},
+    };
+
+    for (const auto& c : cases) {
+        LagrangeBasis basis(c.type, c.order);
+        expect_hessians_match_numerical(basis, sample_points_for(c.type), c.tol, c.eps);
+    }
+}
+
+TEST(BasisHessians, LagrangeHessiansSumToZeroAndAreSymmetric) {
+    const struct Case {
+        ElementType type;
+        int order;
+        math::Vector<Real, 3> xi;
+        Real tol;
+    } cases[] = {
+        {ElementType::Line2, 3, {Real(0.15), Real(0), Real(0)}, Real(1e-12)},
+        {ElementType::Triangle3, 3, {Real(0.2), Real(0.25), Real(0)}, Real(1e-10)},
+        {ElementType::Quad4, 3, {Real(0.3), Real(-0.2), Real(0)}, Real(1e-12)},
+        {ElementType::Tetra4, 2, {Real(0.15), Real(0.2), Real(0.1)}, Real(1e-10)},
+        {ElementType::Hex8, 2, {Real(0.1), Real(-0.2), Real(0.3)}, Real(1e-12)},
+        {ElementType::Wedge6, 2, {Real(0.2), Real(0.15), Real(-0.3)}, Real(1e-10)},
+        {ElementType::Pyramid5, 1, {Real(0.1), Real(-0.2), Real(0.3)}, Real(1e-8)},
+    };
+
+    for (const auto& c : cases) {
+        LagrangeBasis basis(c.type, c.order);
+        expect_partition_hessian_sum_zero(basis, c.xi, Real(10) * c.tol);
+        expect_hessians_symmetric(basis, c.xi, c.tol);
+    }
+}
+
+TEST(BasisHessians, LagrangePyramidExactApexHessianThrows) {
+    const struct Case {
+        ElementType type;
+        int order;
+    } cases[] = {
+        {ElementType::Pyramid5, 1},
+        {ElementType::Pyramid14, 2},
+        {ElementType::Pyramid5, 4},
+    };
+
+    const math::Vector<Real, 3> apex{Real(0), Real(0), Real(1)};
+    for (const auto& c : cases) {
+        LagrangeBasis basis(c.type, c.order);
+        std::vector<Hessian> hessians;
+        EXPECT_THROW(basis.evaluate_hessians(apex, hessians), BasisEvaluationException)
+            << "order " << c.order;
+    }
+}
+
+TEST(BasisHessians, SerendipityHessiansSumToZeroAndAreSymmetric) {
+    const struct Case {
+        ElementType type;
+        int order;
+        math::Vector<Real, 3> xi;
+        Real tol;
+    } cases[] = {
+        {ElementType::Quad8, 2, {Real(0.17), Real(-0.31), Real(0)}, Real(1e-10)},
+        {ElementType::Hex20, 2, {Real(0.2), Real(-0.1), Real(0.3)}, Real(1e-10)},
+        {ElementType::Wedge15, 2, {Real(0.2), Real(0.3), Real(0.1)}, Real(1e-10)},
+        {ElementType::Pyramid13, 2, {Real(0.1), Real(-0.2), Real(0.4)}, Real(1e-8)},
+    };
+
+    for (const auto& c : cases) {
+        SerendipityBasis basis(c.type, c.order);
+        expect_partition_hessian_sum_zero(basis, c.xi, c.tol);
+        expect_hessians_symmetric(basis, c.xi, c.tol);
+    }
+}
+
+TEST(BasisHessians, SerendipityPyramidExactApexHessianThrows) {
+    SerendipityBasis basis(ElementType::Pyramid13, 2);
+    std::vector<Hessian> hessians;
+    EXPECT_THROW(basis.evaluate_hessians({Real(0), Real(0), Real(1)}, hessians),
+                 BasisEvaluationException);
+}
+
+TEST(BasisHessians, SolverMappedVolumeSelectionsSatisfyInvariants) {
+    const struct Case {
+        ElementType type;
+        BasisType basis_type;
+        int order;
+        math::Vector<Real, 3> xi;
+        Real tol;
+    } cases[] = {
+        {ElementType::Line2, BasisType::Lagrange, 1, {Real(0.15), Real(0), Real(0)}, Real(1e-12)},
+        {ElementType::Line3, BasisType::Lagrange, 2, {Real(-0.25), Real(0), Real(0)}, Real(1e-12)},
+        {ElementType::Triangle3, BasisType::Lagrange, 1, {Real(0.2), Real(0.25), Real(0)}, Real(1e-12)},
+        {ElementType::Triangle6, BasisType::Lagrange, 2, {Real(0.2), Real(0.25), Real(0)}, Real(1e-12)},
+        {ElementType::Quad4, BasisType::Lagrange, 1, {Real(0.3), Real(-0.2), Real(0)}, Real(1e-12)},
+        {ElementType::Quad8, BasisType::Serendipity, 2, {Real(0.17), Real(-0.31), Real(0)}, Real(1e-10)},
+        {ElementType::Quad9, BasisType::Lagrange, 2, {Real(0.3), Real(-0.2), Real(0)}, Real(1e-12)},
+        {ElementType::Tetra4, BasisType::Lagrange, 1, {Real(0.15), Real(0.2), Real(0.1)}, Real(1e-12)},
+        {ElementType::Tetra10, BasisType::Lagrange, 2, {Real(0.15), Real(0.2), Real(0.1)}, Real(1e-10)},
+        {ElementType::Hex8, BasisType::Lagrange, 1, {Real(0.1), Real(-0.2), Real(0.3)}, Real(1e-12)},
+        {ElementType::Hex20, BasisType::Serendipity, 2, {Real(0.2), Real(-0.1), Real(0.3)}, Real(1e-10)},
+        {ElementType::Hex27, BasisType::Lagrange, 2, {Real(0.1), Real(-0.2), Real(0.3)}, Real(1e-12)},
+        {ElementType::Wedge6, BasisType::Lagrange, 1, {Real(0.2), Real(0.15), Real(-0.3)}, Real(1e-12)},
+    };
+
+    int covered = 0;
+    for (const auto& c : cases) {
+        auto basis = basis_factory::create(BasisRequest{c.type, c.basis_type, c.order});
+        expect_partition_hessian_sum_zero(*basis, c.xi, c.tol);
+        expect_hessians_symmetric(*basis, c.xi, c.tol);
+        ++covered;
+    }
+
+    EXPECT_EQ(covered, 13);
+}
diff --git a/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp b/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
new file mode 100644
index 000000000..a1031fa76
--- /dev/null
+++ b/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
@@ -0,0 +1,226 @@
+/**
+ * @file test_ConstexprBasis.cpp
+ * @brief Compile-time and lightweight runtime checks for migrated Basis helpers.
+ */
+
+#include "FE/Basis/BasisTolerance.h"
+#include "FE/Basis/BasisTraits.h"
+#include "FE/Basis/LagrangeBasis.h"
+#include "FE/Basis/LagrangeBasisFast.h"
+#include "FE/Basis/NodeOrderingConventions.h"
+
+#include <gtest/gtest.h>
+
+#include <array>
+#include <limits>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+namespace {
+
+static_assert(is_line(ElementType::Line2));
+static_assert(is_line(ElementType::Line3));
+static_assert(is_triangle(ElementType::Triangle6));
+static_assert(is_quadrilateral(ElementType::Quad8));
+static_assert(is_tetrahedron(ElementType::Tetra10));
+static_assert(is_hexahedron(ElementType::Hex20));
+static_assert(is_wedge(ElementType::Wedge18));
+static_assert(is_pyramid(ElementType::Pyramid14));
+static_assert(is_simplex(ElementType::Triangle3));
+static_assert(is_simplex(ElementType::Tetra4));
+static_assert(!is_simplex(ElementType::Wedge6));
+static_assert(is_tensor_product(ElementType::Line2));
+static_assert(is_tensor_product(ElementType::Quad9));
+static_assert(is_tensor_product(ElementType::Hex27));
+static_assert(!is_tensor_product(ElementType::Pyramid5));
+static_assert(reference_dimension(ElementType::Pyramid14) == 3);
+static_assert(canonical_lagrange_type(ElementType::Hex27) == ElementType::Hex8);
+static_assert(canonical_lagrange_type(ElementType::Pyramid13) == ElementType::Pyramid13);
+static_assert(complete_lagrange_alias_order(ElementType::Wedge18) == 2);
+static_assert(complete_lagrange_alias_order(ElementType::Hex20) == -1);
+static_assert(line_lagrange_size(2) == 3u);
+static_assert(triangle_lagrange_size(2) == 6u);
+static_assert(quad_lagrange_size(2) == 9u);
+static_assert(tetra_lagrange_size(2) == 10u);
+static_assert(hex_lagrange_size(2) == 27u);
+static_assert(wedge_lagrange_size(2) == 18u);
+static_assert(pyramid_lagrange_size(2) == 14u);
+static_assert(detail::basis_abs(Real(-2)) == Real(2));
+static_assert(detail::basis_max(Real(2), Real(3)) == Real(3));
+static_assert(detail::basis_near_zero(std::numeric_limits<Real>::epsilon() * Real(32)));
+static_assert(detail::basis_nearly_equal(
+    Real(1),
+    Real(1) + std::numeric_limits<Real>::epsilon() * Real(32)));
+
+constexpr auto kLineFastValues = [] {
+    math::Vector<Real, 3> xi{Real(0), Real(0), Real(0)};
+    std::array<Real, LagrangeLineFast<1>::n_dofs> values{};
+    LagrangeLineFast<1>::evaluate(xi, values);
+    return values;
+}();
+static_assert(kLineFastValues[0] == Real(0.5));
+static_assert(kLineFastValues[1] == Real(0.5));
+
+constexpr auto kLineP2FastHessians = [] {
+    math::Vector<Real, 3> xi{Real(0), Real(0), Real(0)};
+    std::array<Hessian, LagrangeLineFast<2>::n_dofs> hessians{};
+    LagrangeLineFast<2>::evaluate_hessians(xi, hessians);
+    return hessians;
+}();
+static_assert(kLineP2FastHessians[0](0, 0) == Real(1));
+static_assert(kLineP2FastHessians[1](0, 0) == Real(1));
+static_assert(kLineP2FastHessians[2](0, 0) == Real(-2));
+
+constexpr auto kTriP2FastValues = [] {
+    math::Vector<Real, 3> xi{Real(0.25), Real(0.25), Real(0)};
+    std::array<Real, LagrangeTriFast<2>::n_dofs> values{};
+    LagrangeTriFast<2>::evaluate(xi, values);
+    return values;
+}();
+static_assert(kTriP2FastValues[0] == Real(0));
+static_assert(kTriP2FastValues[3] == Real(0.5));
+static_assert(kTriP2FastValues[4] == Real(0.25));
+
+template<typename Basis>
+constexpr bool overrides_scalar_strided_v =
+    !std::is_same_v<decltype(&Basis::evaluate_at_quadrature_points_strided),
+                    decltype(&BasisFunction::evaluate_at_quadrature_points_strided)>;
+
+template<typename FastBasis>
+void expect_fast_matches_lagrange(ElementType type,
+                                  int order,
+                                  const std::vector<math::Vector<Real, 3>>& points)
+{
+    LagrangeBasis basis(type, order);
+    for (const auto& xi : points) {
+        std::vector<Real> expected_values;
+        std::vector<Gradient> expected_gradients;
+        std::vector<Hessian> expected_hessians;
+        basis.evaluate_all(xi, expected_values, expected_gradients, expected_hessians);
+
+        std::array<Real, FastBasis::n_dofs> values{};
+        std::array<Gradient, FastBasis::n_dofs> gradients{};
+        std::array<Hessian, FastBasis::n_dofs> hessians{};
+        FastBasis::evaluate(xi, values);
+        FastBasis::evaluate_gradients(xi, gradients);
+        FastBasis::evaluate_hessians(xi, hessians);
+
+        ASSERT_EQ(expected_values.size(), values.size());
+        for (std::size_t i = 0; i < values.size(); ++i) {
+            EXPECT_NEAR(values[i], expected_values[i], Real(1e-14));
+            for (std::size_t d = 0; d < 3u; ++d) {
+                EXPECT_NEAR(gradients[i][d], expected_gradients[i][d], Real(1e-14));
+                for (std::size_t e = 0; e < 3u; ++e) {
+                    EXPECT_NEAR(hessians[i](d, e), expected_hessians[i](d, e), Real(1e-14));
+                }
+            }
+        }
+    }
+}
+
+TEST(ConstexprBasis, FixedNodeTableSizes) {
+    const std::vector<std::pair<ElementType, std::size_t>> expected = {
+        {ElementType::Line2, 2u},
+        {ElementType::Line3, 3u},
+        {ElementType::Triangle3, 3u},
+        {ElementType::Triangle6, 6u},
+        {ElementType::Quad4, 4u},
+        {ElementType::Quad8, 8u},
+        {ElementType::Quad9, 9u},
+        {ElementType::Tetra4, 4u},
+        {ElementType::Tetra10, 10u},
+        {ElementType::Hex8, 8u},
+        {ElementType::Hex20, 20u},
+        {ElementType::Hex27, 27u},
+        {ElementType::Wedge6, 6u},
+        {ElementType::Wedge15, 15u},
+        {ElementType::Wedge18, 18u},
+        {ElementType::Pyramid5, 5u},
+        {ElementType::Pyramid13, 13u},
+        {ElementType::Pyramid14, 14u},
+    };
+
+    for (const auto& [type, size] : expected) {
+        EXPECT_EQ(ReferenceNodeLayout::num_nodes(type), size);
+    }
+}
+
+TEST(ConstexprBasis, BasisToleranceScalesWithRealPrecision) {
+    const Real eps = std::numeric_limits<Real>::epsilon();
+    EXPECT_GT(detail::basis_scaled_tolerance(), eps);
+    EXPECT_TRUE(detail::basis_near_zero(eps * Real(32)));
+    EXPECT_FALSE(detail::basis_near_zero(eps * Real(128)));
+    EXPECT_TRUE(detail::basis_nearly_equal(Real(1), Real(1) + eps * Real(32)));
+    EXPECT_FALSE(detail::basis_nearly_equal(Real(1), Real(1) + eps * Real(128)));
+}
+
+TEST(ConstexprBasis, LagrangeOverridesStridedEvaluation) {
+    EXPECT_TRUE(overrides_scalar_strided_v<LagrangeBasis>);
+}
+
+TEST(ConstexprBasis, FastSidecarsMatchRuntimeLagrangeBasis) {
+    expect_fast_matches_lagrange<LagrangeLineFast<1>>(
+        ElementType::Line2, 1,
+        {{Real(-0.2), Real(0), Real(0)}, {Real(0.35), Real(0), Real(0)}});
+    expect_fast_matches_lagrange<LagrangeLineFast<2>>(
+        ElementType::Line2, 2,
+        {{Real(-0.2), Real(0), Real(0)}, {Real(0.35), Real(0), Real(0)}});
+    expect_fast_matches_lagrange<LagrangeQuadFast<1>>(
+        ElementType::Quad4, 1,
+        {{Real(-0.2), Real(0.3), Real(0)}, {Real(0.35), Real(-0.45), Real(0)}});
+    expect_fast_matches_lagrange<LagrangeHexFast<1>>(
+        ElementType::Hex8, 1,
+        {{Real(-0.2), Real(0.3), Real(0.1)}, {Real(0.35), Real(-0.45), Real(0.25)}});
+    expect_fast_matches_lagrange<LagrangeTriFast<1>>(
+        ElementType::Triangle3, 1,
+        {{Real(0.2), Real(0.3), Real(0)}, {Real(0.1), Real(0.6), Real(0)}});
+    expect_fast_matches_lagrange<LagrangeTriFast<2>>(
+        ElementType::Triangle3, 2,
+        {{Real(0.2), Real(0.3), Real(0)}, {Real(0.1), Real(0.6), Real(0)}});
+    expect_fast_matches_lagrange<LagrangeTetFast<1>>(
+        ElementType::Tetra4, 1,
+        {{Real(0.2), Real(0.3), Real(0.1)}, {Real(0.1), Real(0.2), Real(0.4)}});
+    expect_fast_matches_lagrange<LagrangeTetFast<2>>(
+        ElementType::Tetra4, 2,
+        {{Real(0.2), Real(0.3), Real(0.1)}, {Real(0.1), Real(0.2), Real(0.4)}});
+}
+
+TEST(ConstexprBasis, CompleteAliasTablesMatchGeneratedLagrangeNodes) {
+    const std::vector<std::tuple<ElementType, ElementType, int>> aliases = {
+        {ElementType::Line2, ElementType::Line2, 1},
+        {ElementType::Line3, ElementType::Line2, 2},
+        {ElementType::Triangle3, ElementType::Triangle3, 1},
+        {ElementType::Triangle6, ElementType::Triangle3, 2},
+        {ElementType::Quad4, ElementType::Quad4, 1},
+        {ElementType::Quad9, ElementType::Quad4, 2},
+        {ElementType::Tetra4, ElementType::Tetra4, 1},
+        {ElementType::Tetra10, ElementType::Tetra4, 2},
+        {ElementType::Hex8, ElementType::Hex8, 1},
+        {ElementType::Hex27, ElementType::Hex8, 2},
+        {ElementType::Wedge6, ElementType::Wedge6, 1},
+        {ElementType::Wedge18, ElementType::Wedge6, 2},
+        {ElementType::Pyramid5, ElementType::Pyramid5, 1},
+        {ElementType::Pyramid14, ElementType::Pyramid5, 2},
+    };
+
+    for (const auto& [alias, canonical_type, order] : aliases) {
+        const auto nodes = ReferenceNodeLayout::get_lagrange_node_coords(canonical_type, order);
+        ASSERT_EQ(nodes.size(), ReferenceNodeLayout::num_nodes(alias));
+        for (std::size_t i = 0; i < nodes.size(); ++i) {
+            const auto direct = ReferenceNodeLayout::get_node_coords(alias, i);
+            EXPECT_EQ(nodes[i][0], direct[0]);
+            EXPECT_EQ(nodes[i][1], direct[1]);
+            EXPECT_EQ(nodes[i][2], direct[2]);
+        }
+    }
+}
+
+} // namespace
+} // namespace basis
+} // namespace FE
+} // namespace svmp
diff --git a/tests/unitTests/FE/Basis/test_HigherOrderWedgePyramid.cpp b/tests/unitTests/FE/Basis/test_HigherOrderWedgePyramid.cpp
new file mode 100644
index 000000000..26efc4070
--- /dev/null
+++ b/tests/unitTests/FE/Basis/test_HigherOrderWedgePyramid.cpp
@@ -0,0 +1,173 @@
+/**
+ * @file test_HigherOrderWedgePyramid.cpp
+ * @brief Focused higher-order wedge and pyramid checks for LagrangeBasis.
+ */
+
+#include <gtest/gtest.h>
+
+#include "FE/Basis/LagrangeBasis.h"
+#include "FE/Basis/NodeOrderingConventions.h"
+
+#include <cmath>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace svmp::FE;
+using namespace svmp::FE::basis;
+
+namespace {
+
+void expect_nodes_close(const std::vector<math::Vector<Real, 3>>& lhs,
+                        const std::vector<math::Vector<Real, 3>>& rhs,
+                        Real tol)
+{
+    ASSERT_EQ(lhs.size(), rhs.size());
+    for (std::size_t i = 0; i < lhs.size(); ++i) {
+        EXPECT_NEAR(lhs[i][0], rhs[i][0], tol) << "node " << i;
+        EXPECT_NEAR(lhs[i][1], rhs[i][1], tol) << "node " << i;
+        EXPECT_NEAR(lhs[i][2], rhs[i][2], tol) << "node " << i;
+    }
+}
+
+void expect_kronecker_at_nodes(const LagrangeBasis& basis, Real tol)
+{
+    const auto& nodes = basis.nodes();
+    ASSERT_EQ(nodes.size(), basis.size());
+
+    std::vector<Real> values;
+    for (std::size_t node = 0; node < nodes.size(); ++node) {
+        basis.evaluate_values(nodes[node], values);
+        ASSERT_EQ(values.size(), basis.size());
+        for (std::size_t i = 0; i < values.size(); ++i) {
+            const Real expected = (i == node) ? Real(1) : Real(0);
+            EXPECT_NEAR(values[i], expected, tol)
+                << "node " << node << ", basis " << i;
+        }
+    }
+}
+
+void expect_partition_gradient_hessian_sums(const LagrangeBasis& basis,
+                                            const std::vector<math::Vector<Real, 3>>& points,
+                                            Real value_tol,
+                                            Real derivative_tol)
+{
+    for (const auto& xi : points) {
+        std::vector<Real> values;
+        std::vector<Gradient> gradients;
+        std::vector<Hessian> hessians;
+        basis.evaluate_all(xi, values, gradients, hessians);
+
+        Real value_sum = Real(0);
+        Gradient gradient_sum{};
+        Hessian hessian_sum{};
+        for (std::size_t i = 0; i < values.size(); ++i) {
+            value_sum += values[i];
+            for (std::size_t d = 0; d < 3u; ++d) {
+                gradient_sum[d] += gradients[i][d];
+                for (std::size_t e = 0; e < 3u; ++e) {
+                    hessian_sum(d, e) += hessians[i](d, e);
+                }
+            }
+        }
+
+        EXPECT_NEAR(value_sum, Real(1), value_tol);
+        for (int d = 0; d < basis.dimension(); ++d) {
+            EXPECT_NEAR(gradient_sum[static_cast<std::size_t>(d)], Real(0), derivative_tol);
+            for (int e = 0; e < basis.dimension(); ++e) {
+                EXPECT_NEAR(hessian_sum(static_cast<std::size_t>(d),
+                                        static_cast<std::size_t>(e)),
+                            Real(0),
+                            derivative_tol);
+            }
+        }
+    }
+}
+
+void expect_all_entries_finite(const LagrangeBasis& basis,
+                               const math::Vector<Real, 3>& xi)
+{
+    std::vector<Real> values;
+    std::vector<Gradient> gradients;
+    std::vector<Hessian> hessians;
+    basis.evaluate_all(xi, values, gradients, hessians);
+
+    for (std::size_t i = 0; i < values.size(); ++i) {
+        EXPECT_TRUE(std::isfinite(static_cast<double>(values[i]))) << "value " << i;
+        for (std::size_t d = 0; d < 3u; ++d) {
+            EXPECT_TRUE(std::isfinite(static_cast<double>(gradients[i][d])))
+                << "gradient " << i << ", " << d;
+            for (std::size_t e = 0; e < 3u; ++e) {
+                EXPECT_TRUE(std::isfinite(static_cast<double>(hessians[i](d, e))))
+                    << "hessian " << i << ", " << d << ", " << e;
+            }
+        }
+    }
+}
+
+} // namespace
+
+TEST(HigherOrderWedgePyramid, CompleteAliasesMatchGeneratedNodeLayouts) {
+    const std::vector<std::tuple<ElementType, ElementType, int>> cases = {
+        {ElementType::Wedge18, ElementType::Wedge6, 2},
+        {ElementType::Pyramid14, ElementType::Pyramid5, 2},
+    };
+
+    for (const auto& [alias, canonical, order] : cases) {
+        LagrangeBasis alias_basis(alias, order);
+        const auto generated = ReferenceNodeLayout::get_lagrange_node_coords(canonical, order);
+        ASSERT_EQ(generated.size(), ReferenceNodeLayout::num_nodes(alias));
+        expect_nodes_close(alias_basis.nodes(), generated, Real(1e-14));
+
+        for (std::size_t i = 0; i < generated.size(); ++i) {
+            const auto public_node = ReferenceNodeLayout::get_node_coords(alias, i);
+            EXPECT_NEAR(public_node[0], generated[i][0], Real(1e-14)) << "node " << i;
+            EXPECT_NEAR(public_node[1], generated[i][1], Real(1e-14)) << "node " << i;
+            EXPECT_NEAR(public_node[2], generated[i][2], Real(1e-14)) << "node " << i;
+        }
+    }
+}
+
+TEST(HigherOrderWedgePyramid, WedgeOrderThreeIsNodalAndPartitionsUnity) {
+    LagrangeBasis wedge(ElementType::Wedge6, 3);
+
+    expect_kronecker_at_nodes(wedge, Real(2e-10));
+    expect_partition_gradient_hessian_sums(
+        wedge,
+        {
+            {Real(0.18), Real(0.22), Real(-0.2)},
+            {Real(0.12), Real(0.16), Real(0.1)},
+            {Real(0.25), Real(0.15), Real(0.45)},
+        },
+        Real(1e-12),
+        Real(1e-9));
+}
+
+TEST(HigherOrderWedgePyramid, PyramidOrderThreeIsNodalAndPartitionsUnity) {
+    LagrangeBasis pyramid(ElementType::Pyramid5, 3);
+
+    expect_kronecker_at_nodes(pyramid, Real(5e-8));
+    expect_partition_gradient_hessian_sums(
+        pyramid,
+        {
+            {Real(0), Real(0), Real(0.2)},
+            {Real(0.12), Real(-0.08), Real(0.24)},
+            {Real(-0.08), Real(0.1), Real(0.55)},
+        },
+        Real(1e-11),
+        Real(5e-7));
+}
+
+TEST(HigherOrderWedgePyramid, PyramidNearApexDerivativeQueriesRemainFinite) {
+    const std::vector<std::pair<ElementType, int>> cases = {
+        {ElementType::Pyramid5, 1},
+        {ElementType::Pyramid14, 2},
+        {ElementType::Pyramid5, 4},
+    };
+
+    for (const auto& [type, order] : cases) {
+        LagrangeBasis basis(type, order);
+        expect_all_entries_finite(basis, {Real(0.01), Real(-0.005), Real(0.92)});
+        expect_all_entries_finite(basis, {Real(-0.004), Real(0.007), Real(0.98)});
+    }
+}
diff --git a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
new file mode 100644
index 000000000..a88d860e9
--- /dev/null
+++ b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
@@ -0,0 +1,3028 @@
+/**
+ * @file test_LagrangeBasis.cpp
+ * @brief Unit tests for Lagrange basis functions
+ */
+
+#include <gtest/gtest.h>
+#include "FE/Basis/BasisFactory.h"
+#include "FE/Basis/LagrangeBasis.h"
+#include "FE/Basis/NodeOrderingConventions.h"
+#include "FE/Basis/SerendipityBasis.h"
+#include "fs.h"
+#include "nn.h"
+#include <array>
+#include <cmath>
+#include <functional>
+#include <limits>
+#include <map>
+#include <math.h>
+#include <numeric>
+#include <string>
+#include <vector>
+
+namespace legacy_solver_nn {
+using namespace consts;
+#include "nn_elem_gip.h"
+#include "nn_elem_gnn.h"
+#include "nn_elem_gnnxx.h"
+} // namespace legacy_solver_nn
+
+using svmp::FE::basis::LagrangeBasis;
+using svmp::FE::ElementType;
+using svmp::FE::Real;
+using svmp::FE::basis::Gradient;
+using svmp::FE::basis::Hessian;
+using svmp::FE::basis::ReferenceNodeLayout;
+
+namespace {
+
+using Point = svmp::FE::math::Vector<Real, 3>;
+
+struct SolverBasisAdapterCase {
+    consts::ElementType type;
+    consts::ElementType quadrature_type;
+    int insd;
+    int eNoN;
+    int nG;
+};
+
+std::vector<SolverBasisAdapterCase> solver_basis_adapter_cases() {
+    using consts::ElementType;
+    return {
+        {ElementType::LIN1, ElementType::LIN1, 1, 2, 2},
+        {ElementType::LIN2, ElementType::LIN2, 1, 3, 3},
+        {ElementType::TRI3, ElementType::TRI3, 2, 3, 3},
+        {ElementType::TRI6, ElementType::TRI6, 2, 6, 7},
+        {ElementType::QUD4, ElementType::QUD4, 2, 4, 4},
+        {ElementType::QUD8, ElementType::QUD9, 2, 8, 9},
+        {ElementType::QUD9, ElementType::QUD9, 2, 9, 9},
+        {ElementType::TET4, ElementType::TET4, 3, 4, 4},
+        {ElementType::TET10, ElementType::TET10, 3, 10, 15},
+        {ElementType::HEX8, ElementType::HEX8, 3, 8, 8},
+        {ElementType::HEX20, ElementType::HEX20, 3, 20, 27},
+        {ElementType::HEX27, ElementType::HEX27, 3, 27, 27},
+        {ElementType::WDG, ElementType::WDG, 3, 6, 6},
+    };
+}
+
+std::vector<SolverBasisAdapterCase> solver_face_basis_adapter_cases() {
+    using consts::ElementType;
+    return {
+        {ElementType::LIN1, ElementType::LIN1, 1, 2, 2},
+        {ElementType::LIN2, ElementType::LIN2, 1, 3, 3},
+        {ElementType::TRI3, ElementType::TRI3, 2, 3, 3},
+        {ElementType::TRI6, ElementType::TRI6, 2, 6, 7},
+        {ElementType::QUD4, ElementType::QUD4, 2, 4, 4},
+        {ElementType::QUD8, ElementType::QUD8, 2, 8, 9},
+        {ElementType::QUD9, ElementType::QUD9, 2, 9, 9},
+    };
+}
+
+std::vector<SolverBasisAdapterCase> solver_hessian_adapter_cases() {
+    return solver_basis_adapter_cases();
+}
+
+std::vector<SolverBasisAdapterCase> solver_legacy_hessian_parity_cases() {
+    using consts::ElementType;
+    return {
+        {ElementType::TRI6, ElementType::TRI6, 2, 6, 7},
+        {ElementType::QUD9, ElementType::QUD9, 2, 9, 9},
+        {ElementType::TET10, ElementType::TET10, 3, 10, 15},
+    };
+}
+
+int packed_hessian_components(int insd) {
+    if (insd == 1) {
+        return 1;
+    }
+    if (insd == 2) {
+        return 3;
+    }
+    return 6;
+}
+
+void fill_legacy_quadrature(const SolverBasisAdapterCase& c,
+                            Vector<double>& w,
+                            Array<double>& xi) {
+    mshType mesh;
+    mesh.eType = c.quadrature_type;
+    mesh.eNoN = c.eNoN;
+    mesh.nG = c.nG;
+    mesh.w.resize(c.nG);
+    mesh.xi.resize(c.insd, c.nG);
+    legacy_solver_nn::set_element_gauss_int_data.at(c.quadrature_type)(mesh);
+    w = mesh.w;
+    xi = mesh.xi;
+}
+
+faceType initialized_face_for_case(const SolverBasisAdapterCase& c) {
+    faceType face;
+    face.eType = c.type;
+    face.eNoN = c.eNoN;
+    face.nG = c.nG;
+    face.w.resize(c.nG);
+    face.xi.resize(c.insd, c.nG);
+    legacy_solver_nn::set_face_gauss_int_data.at(c.quadrature_type)(face);
+    face.N.resize(c.eNoN, c.nG);
+    face.Nx.resize(c.insd, c.eNoN, c.nG);
+    return face;
+}
+
+void expect_arrays_near(const Array<double>& actual,
+                        const Array<double>& expected,
+                        double tol) {
+    ASSERT_EQ(actual.nrows(), expected.nrows());
+    ASSERT_EQ(actual.ncols(), expected.ncols());
+    for (int col = 0; col < actual.ncols(); ++col) {
+        for (int row = 0; row < actual.nrows(); ++row) {
+            EXPECT_NEAR(actual(row, col), expected(row, col), tol)
+                << "row=" << row << ", col=" << col;
+        }
+    }
+}
+
+void expect_vectors_near(const Vector<double>& actual,
+                         const Vector<double>& expected,
+                         double tol) {
+    ASSERT_EQ(actual.size(), expected.size());
+    for (int i = 0; i < actual.size(); ++i) {
+        EXPECT_NEAR(actual(i), expected(i), tol) << "index=" << i;
+    }
+}
+
+void expect_array3_near(const Array3<double>& actual,
+                        const Array3<double>& expected,
+                        double tol) {
+    ASSERT_EQ(actual.nrows(), expected.nrows());
+    ASSERT_EQ(actual.ncols(), expected.ncols());
+    ASSERT_EQ(actual.nslices(), expected.nslices());
+    for (int slice = 0; slice < actual.nslices(); ++slice) {
+        for (int col = 0; col < actual.ncols(); ++col) {
+            for (int row = 0; row < actual.nrows(); ++row) {
+                EXPECT_NEAR(actual(row, col, slice), expected(row, col, slice), tol)
+                    << "row=" << row << ", col=" << col << ", slice=" << slice;
+            }
+        }
+    }
+}
+
+void fill_array3(Array3<double>& values, double value) {
+    for (int slice = 0; slice < values.nslices(); ++slice) {
+        for (int col = 0; col < values.ncols(); ++col) {
+            for (int row = 0; row < values.nrows(); ++row) {
+                values(row, col, slice) = value;
+            }
+        }
+    }
+}
+
+void expect_face_partition_identities(const SolverBasisAdapterCase& c,
+                                      const faceType& face,
+                                      int g,
+                                      double tol) {
+    double partition = 0.0;
+    std::array<double, 3> gradient_sum{0.0, 0.0, 0.0};
+
+    for (int a = 0; a < c.eNoN; ++a) {
+        EXPECT_TRUE(std::isfinite(face.N(a, g)))
+            << "element=" << static_cast<int>(c.type)
+            << ", node=" << a
+            << ", g=" << g;
+        partition += face.N(a, g);
+
+        for (int d = 0; d < c.insd; ++d) {
+            EXPECT_TRUE(std::isfinite(face.Nx(d, a, g)))
+                << "element=" << static_cast<int>(c.type)
+                << ", d=" << d
+                << ", node=" << a
+                << ", g=" << g;
+            gradient_sum[static_cast<std::size_t>(d)] += face.Nx(d, a, g);
+        }
+    }
+
+    EXPECT_NEAR(partition, 1.0, tol)
+        << "element=" << static_cast<int>(c.type) << ", g=" << g;
+    for (int d = 0; d < c.insd; ++d) {
+        EXPECT_NEAR(gradient_sum[static_cast<std::size_t>(d)], 0.0, tol)
+            << "element=" << static_cast<int>(c.type) << ", d=" << d << ", g=" << g;
+    }
+}
+
+bool array3_has_nonzero_component(const Array3<double>& values,
+                                  int row,
+                                  double tol) {
+    for (int slice = 0; slice < values.nslices(); ++slice) {
+        for (int col = 0; col < values.ncols(); ++col) {
+            if (std::abs(values(row, col, slice)) > tol) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+Array<double> single_point_xi(const SolverBasisAdapterCase& c,
+                              const Array<double>& xi,
+                              int g) {
+    Array<double> point(c.insd, 1);
+    for (int d = 0; d < c.insd; ++d) {
+        point(d, 0) = xi(d, g);
+    }
+    return point;
+}
+
+std::vector<double> finite_difference_solver_second_derivative(
+    const SolverBasisAdapterCase& c,
+    const Array<double>& point,
+    int gradient_component,
+    int coordinate_component,
+    double eps) {
+    Array<double> xi_plus = point;
+    Array<double> xi_minus = point;
+    xi_plus(coordinate_component, 0) += eps;
+    xi_minus(coordinate_component, 0) -= eps;
+
+    Array<double> N_plus(c.eNoN, 1);
+    Array<double> N_minus(c.eNoN, 1);
+    Array3<double> Nx_plus(c.insd, c.eNoN, 1);
+    Array3<double> Nx_minus(c.insd, c.eNoN, 1);
+
+    nn::get_gnn(c.insd, c.type, c.eNoN, 0, xi_plus, N_plus, Nx_plus);
+    nn::get_gnn(c.insd, c.type, c.eNoN, 0, xi_minus, N_minus, Nx_minus);
+
+    std::vector<double> values(static_cast<std::size_t>(c.eNoN));
+    for (int a = 0; a < c.eNoN; ++a) {
+        values[static_cast<std::size_t>(a)] =
+            (Nx_plus(gradient_component, a, 0) - Nx_minus(gradient_component, a, 0)) /
+            (2.0 * eps);
+    }
+    return values;
+}
+
+void expect_packed_hessian_component_matches_finite_difference(
+    const SolverBasisAdapterCase& c,
+    const Array<double>& point,
+    const Array3<double>& Nxx,
+    int g,
+    int packed_row,
+    int first_derivative_component,
+    int second_derivative_component,
+    double tol) {
+    const double eps = 2e-6;
+    const auto numerical = finite_difference_solver_second_derivative(
+        c, point, first_derivative_component, second_derivative_component, eps);
+    for (int a = 0; a < c.eNoN; ++a) {
+        EXPECT_NEAR(Nxx(packed_row, a, g), numerical[static_cast<std::size_t>(a)], tol)
+            << "element=" << static_cast<int>(c.type)
+            << ", packed_row=" << packed_row
+            << ", node=" << a
+            << ", g=" << g;
+    }
+
+    if (first_derivative_component != second_derivative_component) {
+        const auto symmetric_numerical = finite_difference_solver_second_derivative(
+            c, point, second_derivative_component, first_derivative_component, eps);
+        for (int a = 0; a < c.eNoN; ++a) {
+            EXPECT_NEAR(Nxx(packed_row, a, g),
+                        symmetric_numerical[static_cast<std::size_t>(a)],
+                        tol)
+                << "element=" << static_cast<int>(c.type)
+                << ", symmetry packed_row=" << packed_row
+                << ", node=" << a
+                << ", g=" << g;
+        }
+    }
+}
+
+void expect_solver_hessian_matches_gradient_finite_difference(
+    const SolverBasisAdapterCase& c,
+    const Array<double>& xi,
+    int g,
+    const Array3<double>& Nxx,
+    double tol) {
+    const Array<double> point = single_point_xi(c, xi, g);
+
+    expect_packed_hessian_component_matches_finite_difference(c, point, Nxx, g, 0, 0, 0, tol);
+    if (c.insd >= 2) {
+        expect_packed_hessian_component_matches_finite_difference(c, point, Nxx, g, 1, 1, 1, tol);
+    }
+    if (c.insd == 2) {
+        expect_packed_hessian_component_matches_finite_difference(c, point, Nxx, g, 2, 0, 1, tol);
+    } else if (c.insd >= 3) {
+        expect_packed_hessian_component_matches_finite_difference(c, point, Nxx, g, 2, 2, 2, tol);
+        expect_packed_hessian_component_matches_finite_difference(c, point, Nxx, g, 3, 0, 1, tol);
+        expect_packed_hessian_component_matches_finite_difference(c, point, Nxx, g, 4, 1, 2, tol);
+        expect_packed_hessian_component_matches_finite_difference(c, point, Nxx, g, 5, 0, 2, tol);
+    }
+}
+
+void expect_partition_hessian_identity(const SolverBasisAdapterCase& c,
+                                       const Array3<double>& Nxx,
+                                       int g,
+                                       double tol) {
+    for (int row = 0; row < Nxx.nrows(); ++row) {
+        double sum = 0.0;
+        for (int a = 0; a < c.eNoN; ++a) {
+            sum += Nxx(row, a, g);
+        }
+        EXPECT_NEAR(sum, 0.0, tol)
+            << "element=" << static_cast<int>(c.type)
+            << ", packed_row=" << row
+            << ", g=" << g;
+    }
+}
+
+void expect_all_hessians_zero(const SolverBasisAdapterCase& c,
+                              const Array3<double>& Nxx,
+                              int g,
+                              double tol) {
+    for (int row = 0; row < Nxx.nrows(); ++row) {
+        for (int a = 0; a < c.eNoN; ++a) {
+            EXPECT_NEAR(Nxx(row, a, g), 0.0, tol)
+                << "element=" << static_cast<int>(c.type)
+                << ", packed_row=" << row
+                << ", node=" << a
+                << ", g=" << g;
+        }
+    }
+}
+
+mshType initialized_mesh_for_case(const SolverBasisAdapterCase& c, bool force_lShpF) {
+    mshType mesh;
+    mesh.nFs = 1;
+    mesh.eType = c.type;
+    mesh.eNoN = c.eNoN;
+    mesh.nG = c.nG;
+    mesh.lShpF = force_lShpF;
+    mesh.w.resize(c.nG);
+    mesh.xi.resize(c.insd, c.nG);
+    mesh.N.resize(c.eNoN, c.nG);
+    mesh.Nx.resize(c.insd, c.eNoN, c.nG);
+    mesh.xib.resize(2, c.insd);
+    mesh.Nb.resize(2, c.eNoN);
+
+    nn::get_gip(c.insd, c.quadrature_type, c.nG, mesh.w, mesh.xi);
+    for (int g = 0; g < c.nG; ++g) {
+        nn::get_gnn(c.insd, c.type, c.eNoN, g, mesh.xi, mesh.N, mesh.Nx);
+    }
+    nn::get_nn_bnds(c.insd, c.type, c.eNoN, mesh.xib, mesh.Nb);
+    return mesh;
+}
+
+enum class PyramidFace {
+    Base,
+    South,
+    East,
+    North,
+    West
+};
+
+enum class PyramidEdge {
+    BaseSouth,
+    BaseEast,
+    BaseNorth,
+    BaseWest,
+    VerticalSW,
+    VerticalSE,
+    VerticalNE,
+    VerticalNW
+};
+
+struct LagrangeAccuracyCase {
+    ElementType type;
+    int order;
+    std::vector<Point> points;
+};
+
+std::size_t expected_lagrange_size(ElementType type, int order) {
+    switch (type) {
+        case ElementType::Point1:
+            return 1u;
+        case ElementType::Line2:
+        case ElementType::Line3:
+            return static_cast<std::size_t>(order + 1);
+        case ElementType::Triangle3:
+        case ElementType::Triangle6:
+            return static_cast<std::size_t>(order + 1) * static_cast<std::size_t>(order + 2) / 2;
+        case ElementType::Quad4:
+        case ElementType::Quad9:
+            return static_cast<std::size_t>(order + 1) * static_cast<std::size_t>(order + 1);
+        case ElementType::Tetra4:
+        case ElementType::Tetra10:
+            return static_cast<std::size_t>(order + 1) *
+                   static_cast<std::size_t>(order + 2) *
+                   static_cast<std::size_t>(order + 3) / 6;
+        case ElementType::Hex8:
+        case ElementType::Hex27:
+            return static_cast<std::size_t>(order + 1) *
+                   static_cast<std::size_t>(order + 1) *
+                   static_cast<std::size_t>(order + 1);
+        case ElementType::Wedge6:
+        case ElementType::Wedge18:
+            return static_cast<std::size_t>(order + 1) *
+                   static_cast<std::size_t>(order + 1) *
+                   static_cast<std::size_t>(order + 2) / 2;
+        case ElementType::Pyramid5:
+        case ElementType::Pyramid14:
+            return static_cast<std::size_t>(order + 1) *
+                   static_cast<std::size_t>(order + 2) *
+                   static_cast<std::size_t>(2 * order + 3) / 6;
+        default:
+            return 0u;
+    }
+}
+
+int expected_dimension(ElementType type) {
+    switch (type) {
+        case ElementType::Point1:
+            return 0;
+        case ElementType::Line2:
+        case ElementType::Line3:
+            return 1;
+        case ElementType::Triangle3:
+        case ElementType::Triangle6:
+        case ElementType::Quad4:
+        case ElementType::Quad9:
+            return 2;
+        default:
+            return 3;
+    }
+}
+
+bool points_close(const Point& a,
+                  const Point& b,
+                  Real tol = Real(1e-12)) {
+    return std::abs(a[0] - b[0]) <= tol &&
+           std::abs(a[1] - b[1]) <= tol &&
+           std::abs(a[2] - b[2]) <= tol;
+}
+
+std::vector<Point> reference_node_coords(ElementType type) {
+    switch (type) {
+        case ElementType::Line2:
+            return {
+                Point{Real(-1), Real(0), Real(0)},
+                Point{Real(1), Real(0), Real(0)},
+            };
+        case ElementType::Line3:
+            return {
+                Point{Real(-1), Real(0), Real(0)},
+                Point{Real(1), Real(0), Real(0)},
+                Point{Real(0), Real(0), Real(0)},
+            };
+        case ElementType::Triangle3:
+            return {
+                Point{Real(0), Real(0), Real(0)},
+                Point{Real(1), Real(0), Real(0)},
+                Point{Real(0), Real(1), Real(0)},
+            };
+        case ElementType::Triangle6:
+            return {
+                Point{Real(0), Real(0), Real(0)},
+                Point{Real(1), Real(0), Real(0)},
+                Point{Real(0), Real(1), Real(0)},
+                Point{Real(0.5), Real(0), Real(0)},
+                Point{Real(0.5), Real(0.5), Real(0)},
+                Point{Real(0), Real(0.5), Real(0)},
+            };
+        case ElementType::Quad4:
+            return {
+                Point{Real(-1), Real(-1), Real(0)},
+                Point{Real(1), Real(-1), Real(0)},
+                Point{Real(1), Real(1), Real(0)},
+                Point{Real(-1), Real(1), Real(0)},
+            };
+        case ElementType::Quad8:
+            return {
+                Point{Real(-1), Real(-1), Real(0)},
+                Point{Real(1), Real(-1), Real(0)},
+                Point{Real(1), Real(1), Real(0)},
+                Point{Real(-1), Real(1), Real(0)},
+                Point{Real(0), Real(-1), Real(0)},
+                Point{Real(1), Real(0), Real(0)},
+                Point{Real(0), Real(1), Real(0)},
+                Point{Real(-1), Real(0), Real(0)},
+            };
+        case ElementType::Quad9:
+            return {
+                Point{Real(-1), Real(-1), Real(0)},
+                Point{Real(1), Real(-1), Real(0)},
+                Point{Real(1), Real(1), Real(0)},
+                Point{Real(-1), Real(1), Real(0)},
+                Point{Real(0), Real(-1), Real(0)},
+                Point{Real(1), Real(0), Real(0)},
+                Point{Real(0), Real(1), Real(0)},
+                Point{Real(-1), Real(0), Real(0)},
+                Point{Real(0), Real(0), Real(0)},
+            };
+        case ElementType::Tetra4:
+            return {
+                Point{Real(0), Real(0), Real(0)},
+                Point{Real(1), Real(0), Real(0)},
+                Point{Real(0), Real(1), Real(0)},
+                Point{Real(0), Real(0), Real(1)},
+            };
+        case ElementType::Tetra10:
+            return {
+                Point{Real(0), Real(0), Real(0)},
+                Point{Real(1), Real(0), Real(0)},
+                Point{Real(0), Real(1), Real(0)},
+                Point{Real(0), Real(0), Real(1)},
+                Point{Real(0.5), Real(0), Real(0)},
+                Point{Real(0.5), Real(0.5), Real(0)},
+                Point{Real(0), Real(0.5), Real(0)},
+                Point{Real(0), Real(0), Real(0.5)},
+                Point{Real(0.5), Real(0), Real(0.5)},
+                Point{Real(0), Real(0.5), Real(0.5)},
+            };
+        case ElementType::Hex8:
+            return {
+                Point{Real(-1), Real(-1), Real(-1)},
+                Point{Real(1), Real(-1), Real(-1)},
+                Point{Real(1), Real(1), Real(-1)},
+                Point{Real(-1), Real(1), Real(-1)},
+                Point{Real(-1), Real(-1), Real(1)},
+                Point{Real(1), Real(-1), Real(1)},
+                Point{Real(1), Real(1), Real(1)},
+                Point{Real(-1), Real(1), Real(1)},
+            };
+        case ElementType::Hex20:
+            return {
+                Point{Real(-1), Real(-1), Real(-1)},
+                Point{Real(1), Real(-1), Real(-1)},
+                Point{Real(1), Real(1), Real(-1)},
+                Point{Real(-1), Real(1), Real(-1)},
+                Point{Real(-1), Real(-1), Real(1)},
+                Point{Real(1), Real(-1), Real(1)},
+                Point{Real(1), Real(1), Real(1)},
+                Point{Real(-1), Real(1), Real(1)},
+                Point{Real(0), Real(-1), Real(-1)},
+                Point{Real(1), Real(0), Real(-1)},
+                Point{Real(0), Real(1), Real(-1)},
+                Point{Real(-1), Real(0), Real(-1)},
+                Point{Real(0), Real(-1), Real(1)},
+                Point{Real(1), Real(0), Real(1)},
+                Point{Real(0), Real(1), Real(1)},
+                Point{Real(-1), Real(0), Real(1)},
+                Point{Real(-1), Real(-1), Real(0)},
+                Point{Real(1), Real(-1), Real(0)},
+                Point{Real(1), Real(1), Real(0)},
+                Point{Real(-1), Real(1), Real(0)},
+            };
+        case ElementType::Hex27:
+            return {
+                Point{Real(-1), Real(-1), Real(-1)},
+                Point{Real(1), Real(-1), Real(-1)},
+                Point{Real(1), Real(1), Real(-1)},
+                Point{Real(-1), Real(1), Real(-1)},
+                Point{Real(-1), Real(-1), Real(1)},
+                Point{Real(1), Real(-1), Real(1)},
+                Point{Real(1), Real(1), Real(1)},
+                Point{Real(-1), Real(1), Real(1)},
+                Point{Real(0), Real(-1), Real(-1)},
+                Point{Real(1), Real(0), Real(-1)},
+                Point{Real(0), Real(1), Real(-1)},
+                Point{Real(-1), Real(0), Real(-1)},
+                Point{Real(0), Real(-1), Real(1)},
+                Point{Real(1), Real(0), Real(1)},
+                Point{Real(0), Real(1), Real(1)},
+                Point{Real(-1), Real(0), Real(1)},
+                Point{Real(-1), Real(-1), Real(0)},
+                Point{Real(1), Real(-1), Real(0)},
+                Point{Real(1), Real(1), Real(0)},
+                Point{Real(-1), Real(1), Real(0)},
+                Point{Real(0), Real(0), Real(-1)},
+                Point{Real(0), Real(0), Real(1)},
+                Point{Real(0), Real(-1), Real(0)},
+                Point{Real(1), Real(0), Real(0)},
+                Point{Real(0), Real(1), Real(0)},
+                Point{Real(-1), Real(0), Real(0)},
+                Point{Real(0), Real(0), Real(0)},
+            };
+        case ElementType::Wedge6:
+            return {
+                Point{Real(0), Real(0), Real(-1)},
+                Point{Real(1), Real(0), Real(-1)},
+                Point{Real(0), Real(1), Real(-1)},
+                Point{Real(0), Real(0), Real(1)},
+                Point{Real(1), Real(0), Real(1)},
+                Point{Real(0), Real(1), Real(1)},
+            };
+        case ElementType::Wedge15:
+            return {
+                Point{Real(0), Real(0), Real(-1)},
+                Point{Real(1), Real(0), Real(-1)},
+                Point{Real(0), Real(1), Real(-1)},
+                Point{Real(0), Real(0), Real(1)},
+                Point{Real(1), Real(0), Real(1)},
+                Point{Real(0), Real(1), Real(1)},
+                Point{Real(0.5), Real(0), Real(-1)},
+                Point{Real(0.5), Real(0.5), Real(-1)},
+                Point{Real(0), Real(0.5), Real(-1)},
+                Point{Real(0.5), Real(0), Real(1)},
+                Point{Real(0.5), Real(0.5), Real(1)},
+                Point{Real(0), Real(0.5), Real(1)},
+                Point{Real(0), Real(0), Real(0)},
+                Point{Real(1), Real(0), Real(0)},
+                Point{Real(0), Real(1), Real(0)},
+            };
+        case ElementType::Wedge18:
+            return {
+                Point{Real(0), Real(0), Real(-1)},
+                Point{Real(1), Real(0), Real(-1)},
+                Point{Real(0), Real(1), Real(-1)},
+                Point{Real(0), Real(0), Real(1)},
+                Point{Real(1), Real(0), Real(1)},
+                Point{Real(0), Real(1), Real(1)},
+                Point{Real(0.5), Real(0), Real(-1)},
+                Point{Real(0.5), Real(0.5), Real(-1)},
+                Point{Real(0), Real(0.5), Real(-1)},
+                Point{Real(0.5), Real(0), Real(1)},
+                Point{Real(0.5), Real(0.5), Real(1)},
+                Point{Real(0), Real(0.5), Real(1)},
+                Point{Real(0), Real(0), Real(0)},
+                Point{Real(1), Real(0), Real(0)},
+                Point{Real(0), Real(1), Real(0)},
+                Point{Real(0.5), Real(0), Real(0)},
+                Point{Real(0.5), Real(0.5), Real(0)},
+                Point{Real(0), Real(0.5), Real(0)},
+            };
+        case ElementType::Pyramid5:
+            return {
+                Point{Real(-1), Real(-1), Real(0)},
+                Point{Real(1), Real(-1), Real(0)},
+                Point{Real(1), Real(1), Real(0)},
+                Point{Real(-1), Real(1), Real(0)},
+                Point{Real(0), Real(0), Real(1)},
+            };
+        case ElementType::Pyramid13:
+            return {
+                Point{Real(-1), Real(-1), Real(0)},
+                Point{Real(1), Real(-1), Real(0)},
+                Point{Real(1), Real(1), Real(0)},
+                Point{Real(-1), Real(1), Real(0)},
+                Point{Real(0), Real(0), Real(1)},
+                Point{Real(0), Real(-1), Real(0)},
+                Point{Real(1), Real(0), Real(0)},
+                Point{Real(0), Real(1), Real(0)},
+                Point{Real(-1), Real(0), Real(0)},
+                Point{Real(-0.5), Real(-0.5), Real(0.5)},
+                Point{Real(0.5), Real(-0.5), Real(0.5)},
+                Point{Real(0.5), Real(0.5), Real(0.5)},
+                Point{Real(-0.5), Real(0.5), Real(0.5)},
+            };
+        case ElementType::Pyramid14:
+            return {
+                Point{Real(-1), Real(-1), Real(0)},
+                Point{Real(1), Real(-1), Real(0)},
+                Point{Real(1), Real(1), Real(0)},
+                Point{Real(-1), Real(1), Real(0)},
+                Point{Real(0), Real(0), Real(1)},
+                Point{Real(0), Real(-1), Real(0)},
+                Point{Real(1), Real(0), Real(0)},
+                Point{Real(0), Real(1), Real(0)},
+                Point{Real(-1), Real(0), Real(0)},
+                Point{Real(-0.5), Real(-0.5), Real(0.5)},
+                Point{Real(0.5), Real(-0.5), Real(0.5)},
+                Point{Real(0.5), Real(0.5), Real(0.5)},
+                Point{Real(-0.5), Real(0.5), Real(0.5)},
+                Point{Real(0), Real(0), Real(0)},
+            };
+        default:
+            return {};
+    }
+}
+
+void expect_nodes_match_node_ordering(ElementType canonical_type,
+                                      int order,
+                                      ElementType node_ordering_type) {
+    LagrangeBasis basis(canonical_type, order);
+    const auto& nodes = basis.nodes();
+
+    ASSERT_EQ(nodes.size(), ReferenceNodeLayout::num_nodes(node_ordering_type));
+    ASSERT_EQ(nodes.size(), basis.size());
+
+    for (std::size_t i = 0; i < nodes.size(); ++i) {
+        const auto expected = ReferenceNodeLayout::get_node_coords(node_ordering_type, i);
+        EXPECT_NEAR(nodes[i][0], expected[0], 1e-14);
+        EXPECT_NEAR(nodes[i][1], expected[1], 1e-14);
+        EXPECT_NEAR(nodes[i][2], expected[2], 1e-14);
+
+        std::vector<Real> vals;
+        basis.evaluate_values(expected, vals);
+        ASSERT_EQ(vals.size(), nodes.size());
+        for (std::size_t j = 0; j < vals.size(); ++j) {
+            const double expected_delta = (i == j) ? 1.0 : 0.0;
+            EXPECT_NEAR(vals[j], expected_delta, 1e-12);
+        }
+    }
+}
+
+void expect_alias_matches_canonical(ElementType alias_type,
+                                    ElementType canonical_type,
+                                    int canonical_order,
+                                    const std::vector<Point>& points,
+                                    Real tol = Real(1e-12)) {
+    LagrangeBasis alias(alias_type, canonical_order);
+    LagrangeBasis canonical(canonical_type, canonical_order);
+
+    ASSERT_EQ(alias.element_type(), canonical.element_type());
+    ASSERT_EQ(alias.order(), canonical.order());
+    ASSERT_EQ(alias.size(), canonical.size());
+    ASSERT_EQ(alias.nodes().size(), canonical.nodes().size());
+
+    for (std::size_t i = 0; i < alias.nodes().size(); ++i) {
+        EXPECT_NEAR(alias.nodes()[i][0], canonical.nodes()[i][0], tol);
+        EXPECT_NEAR(alias.nodes()[i][1], canonical.nodes()[i][1], tol);
+        EXPECT_NEAR(alias.nodes()[i][2], canonical.nodes()[i][2], tol);
+    }
+
+    for (const auto& xi : points) {
+        std::vector<Real> alias_values;
+        std::vector<Real> canonical_values;
+        std::vector<Gradient> alias_gradients;
+        std::vector<Gradient> canonical_gradients;
+        std::vector<Hessian> alias_hessians;
+        std::vector<Hessian> canonical_hessians;
+
+        alias.evaluate_values(xi, alias_values);
+        canonical.evaluate_values(xi, canonical_values);
+        alias.evaluate_gradients(xi, alias_gradients);
+        canonical.evaluate_gradients(xi, canonical_gradients);
+        alias.evaluate_hessians(xi, alias_hessians);
+        canonical.evaluate_hessians(xi, canonical_hessians);
+
+        ASSERT_EQ(alias_values.size(), canonical_values.size());
+        ASSERT_EQ(alias_gradients.size(), canonical_gradients.size());
+        ASSERT_EQ(alias_hessians.size(), canonical_hessians.size());
+
+        for (std::size_t i = 0; i < alias_values.size(); ++i) {
+            EXPECT_NEAR(alias_values[i], canonical_values[i], tol);
+            for (int d = 0; d < canonical.dimension(); ++d) {
+                const std::size_t sd = static_cast<std::size_t>(d);
+                EXPECT_NEAR(alias_gradients[i][sd], canonical_gradients[i][sd], tol);
+                for (int e = 0; e < canonical.dimension(); ++e) {
+                    const std::size_t se = static_cast<std::size_t>(e);
+                    EXPECT_NEAR(alias_hessians[i](sd, se), canonical_hessians[i](sd, se), Real(5) * tol);
+                }
+            }
+        }
+    }
+}
+
+std::vector<Point> sample_points_for(ElementType type) {
+    switch (type) {
+        case ElementType::Line2:
+        case ElementType::Line3:
+            return {
+                Point{Real(-0.7), Real(0), Real(0)},
+                Point{Real(0.1), Real(0), Real(0)},
+                Point{Real(0.65), Real(0), Real(0)}
+            };
+        case ElementType::Triangle3:
+        case ElementType::Triangle6:
+            return {
+                Point{Real(0.15), Real(0.2), Real(0)},
+                Point{Real(0.25), Real(0.1), Real(0)},
+                Point{Real(0.2), Real(0.3), Real(0)}
+            };
+        case ElementType::Quad4:
+        case ElementType::Quad9:
+            return {
+                Point{Real(0.2), Real(-0.35), Real(0)},
+                Point{Real(-0.4), Real(0.25), Real(0)},
+                Point{Real(0.55), Real(0.1), Real(0)}
+            };
+        case ElementType::Tetra4:
+        case ElementType::Tetra10:
+            return {
+                Point{Real(0.1), Real(0.2), Real(0.15)},
+                Point{Real(0.2), Real(0.1), Real(0.25)},
+                Point{Real(0.15), Real(0.15), Real(0.2)}
+            };
+        case ElementType::Hex8:
+        case ElementType::Hex27:
+            return {
+                Point{Real(0.2), Real(-0.3), Real(0.25)},
+                Point{Real(-0.5), Real(0.4), Real(-0.2)},
+                Point{Real(0.1), Real(0.15), Real(0.6)}
+            };
+        case ElementType::Wedge6:
+        case ElementType::Wedge18:
+            return {
+                Point{Real(0.2), Real(0.25), Real(0.0)},
+                Point{Real(0.1), Real(0.2), Real(-0.45)},
+                Point{Real(0.3), Real(0.15), Real(0.5)}
+            };
+        case ElementType::Pyramid5:
+        case ElementType::Pyramid14:
+            return {
+                Point{Real(0.0), Real(0.0), Real(0.25)},
+                Point{Real(0.15), Real(-0.1), Real(0.3)},
+                Point{Real(-0.1), Real(0.2), Real(0.4)}
+            };
+        default:
+            return {Point{Real(0), Real(0), Real(0)}};
+    }
+}
+
+std::vector<Point> boundary_stress_points_for(ElementType type);
+
+std::vector<Point> dense_sample_points_for(ElementType type) {
+    const auto interior = sample_points_for(type);
+    const auto boundary = boundary_stress_points_for(type);
+
+    std::vector<Point> points;
+    points.reserve(interior.size() + boundary.size());
+    points.insert(points.end(), interior.begin(), interior.end());
+    points.insert(points.end(), boundary.begin(), boundary.end());
+
+    if (type == ElementType::Pyramid5 || type == ElementType::Pyramid14) {
+        points.push_back(Point{Real(0.0), Real(0.0), Real(0.85)});
+        points.push_back(Point{Real(0.02), Real(-0.015), Real(0.95)});
+    }
+    return points;
+}
+
+std::vector<Point> boundary_stress_points_for(ElementType type) {
+    switch (type) {
+        case ElementType::Line2:
+        case ElementType::Line3:
+            return {
+                Point{Real(-0.999), Real(0), Real(0)},
+                Point{Real(-0.75), Real(0), Real(0)},
+                Point{Real(0.0), Real(0), Real(0)},
+                Point{Real(0.8), Real(0), Real(0)},
+                Point{Real(0.999), Real(0), Real(0)}
+            };
+        case ElementType::Triangle3:
+        case ElementType::Triangle6:
+            return {
+                Point{Real(1e-6), Real(1e-6), Real(0)},
+                Point{Real(0.98), Real(0.01), Real(0)},
+                Point{Real(0.01), Real(0.98), Real(0)},
+                Point{Real(0.25), Real(1e-4), Real(0)},
+                Point{Real(0.49), Real(0.49), Real(0)}
+            };
+        case ElementType::Quad4:
+        case ElementType::Quad9:
+            return {
+                Point{Real(-0.99), Real(-0.99), Real(0)},
+                Point{Real(0.99), Real(-0.99), Real(0)},
+                Point{Real(0.99), Real(0.99), Real(0)},
+                Point{Real(-0.99), Real(0.99), Real(0)},
+                Point{Real(0.0), Real(0.95), Real(0)}
+            };
+        case ElementType::Tetra4:
+        case ElementType::Tetra10:
+            return {
+                Point{Real(1e-6), Real(1e-6), Real(1e-6)},
+                Point{Real(0.97), Real(0.01), Real(0.01)},
+                Point{Real(0.01), Real(0.97), Real(0.01)},
+                Point{Real(0.01), Real(0.01), Real(0.97)},
+                Point{Real(0.32), Real(0.33), Real(0.01)}
+            };
+        case ElementType::Hex8:
+        case ElementType::Hex27:
+            return {
+                Point{Real(-0.99), Real(-0.99), Real(-0.99)},
+                Point{Real(0.99), Real(-0.99), Real(0.99)},
+                Point{Real(0.99), Real(0.99), Real(-0.99)},
+                Point{Real(-0.99), Real(0.99), Real(0.99)},
+                Point{Real(0.0), Real(0.0), Real(0.95)}
+            };
+        case ElementType::Wedge6:
+        case ElementType::Wedge18:
+            return {
+                Point{Real(1e-6), Real(1e-6), Real(-0.99)},
+                Point{Real(0.98), Real(0.01), Real(-0.99)},
+                Point{Real(0.01), Real(0.98), Real(0.99)},
+                Point{Real(0.49), Real(0.49), Real(0.0)},
+                Point{Real(0.25), Real(1e-4), Real(0.95)}
+            };
+        case ElementType::Pyramid5:
+        case ElementType::Pyramid14:
+            return {
+                Point{Real(0.0), Real(0.0), Real(0.95)},
+                Point{Real(0.01), Real(-0.01), Real(0.98)},
+                Point{Real(0.6), Real(-0.6), Real(0.2)},
+                Point{Real(0.79), Real(0.0), Real(0.2)},
+                Point{Real(0.0), Real(0.79), Real(0.2)}
+            };
+        default:
+            return {Point{Real(0), Real(0), Real(0)}};
+    }
+}
+
+Real monomial_value(const Point& xi, int px, int py, int pz) {
+    return std::pow(xi[0], px) * std::pow(xi[1], py) * std::pow(xi[2], pz);
+}
+
+void expect_gradients_match_finite_difference(const LagrangeAccuracyCase& c,
+                                              Real eps,
+                                              Real tol) {
+    LagrangeBasis basis(c.type, c.order);
+
+    for (const auto& xi : c.points) {
+        std::vector<Gradient> gradients;
+        basis.evaluate_gradients(xi, gradients);
+        ASSERT_EQ(gradients.size(), basis.size());
+
+        for (int d = 0; d < basis.dimension(); ++d) {
+            Point xp = xi;
+            Point xm = xi;
+            xp[d] += eps;
+            xm[d] -= eps;
+
+            std::vector<Real> values_p;
+            std::vector<Real> values_m;
+            basis.evaluate_values(xp, values_p);
+            basis.evaluate_values(xm, values_m);
+
+            ASSERT_EQ(values_p.size(), basis.size());
+            ASSERT_EQ(values_m.size(), basis.size());
+            for (std::size_t i = 0; i < basis.size(); ++i) {
+                const Real fd = (values_p[i] - values_m[i]) / (Real(2) * eps);
+                EXPECT_NEAR(gradients[i][d], fd, tol)
+                    << "type=" << static_cast<int>(c.type)
+                    << ", order=" << c.order
+                    << ", dim=" << d
+                    << ", basis_i=" << i
+                    << ", xi=(" << xi[0] << "," << xi[1] << "," << xi[2] << ")";
+            }
+        }
+    }
+}
+
+void expect_polynomial_reproduction(const LagrangeAccuracyCase& c,
+                                    const std::vector<std::array<int, 3>>& exponents,
+                                    Real tol) {
+    LagrangeBasis basis(c.type, c.order);
+    const auto& nodes = basis.nodes();
+    ASSERT_EQ(nodes.size(), basis.size());
+
+    for (const auto& exp : exponents) {
+        std::vector<Real> coeffs(basis.size(), Real(0));
+        for (std::size_t i = 0; i < basis.size(); ++i) {
+            coeffs[i] = monomial_value(nodes[i], exp[0], exp[1], exp[2]);
+        }
+
+        for (const auto& xi : c.points) {
+            std::vector<Real> values;
+            basis.evaluate_values(xi, values);
+            ASSERT_EQ(values.size(), basis.size());
+
+            Real interpolated = Real(0);
+            for (std::size_t i = 0; i < basis.size(); ++i) {
+                interpolated += coeffs[i] * values[i];
+            }
+
+            const Real exact = monomial_value(xi, exp[0], exp[1], exp[2]);
+            EXPECT_NEAR(interpolated, exact, tol)
+                << "type=" << static_cast<int>(c.type)
+                << ", order=" << c.order
+                << ", monomial=(" << exp[0] << "," << exp[1] << "," << exp[2] << ")"
+                << ", xi=(" << xi[0] << "," << xi[1] << "," << xi[2] << ")";
+        }
+    }
+}
+
+template<typename Container>
+void expect_all_finite(const Container& values) {
+    for (const auto& value : values) {
+        for (std::size_t d = 0; d < 3; ++d) {
+            EXPECT_TRUE(std::isfinite(value[d]));
+        }
+    }
+}
+
+void expect_hessians_finite(const std::vector<Hessian>& hessians,
+                            int dimension) {
+    for (const auto& H : hessians) {
+        for (int i = 0; i < dimension; ++i) {
+            for (int j = 0; j < dimension; ++j) {
+                EXPECT_TRUE(std::isfinite(H(static_cast<std::size_t>(i),
+                                            static_cast<std::size_t>(j))));
+            }
+        }
+    }
+}
+
+void expect_partition_gradient_hessian_sums(const LagrangeBasis& basis,
+                                            const std::vector<Point>& points,
+                                            Real value_tol,
+                                            Real derivative_tol) {
+    for (const auto& xi : points) {
+        std::vector<Real> values;
+        std::vector<Gradient> gradients;
+        std::vector<Hessian> hessians;
+        basis.evaluate_values(xi, values);
+        basis.evaluate_gradients(xi, gradients);
+        basis.evaluate_hessians(xi, hessians);
+
+        ASSERT_EQ(values.size(), basis.size());
+        ASSERT_EQ(gradients.size(), basis.size());
+        ASSERT_EQ(hessians.size(), basis.size());
+
+        Real value_sum = Real(0);
+        Gradient gradient_sum{};
+        Hessian hessian_sum{};
+        for (std::size_t i = 0; i < basis.size(); ++i) {
+            value_sum += values[i];
+            for (int d = 0; d < basis.dimension(); ++d) {
+                const std::size_t sd = static_cast<std::size_t>(d);
+                gradient_sum[sd] += gradients[i][sd];
+                for (int e = 0; e < basis.dimension(); ++e) {
+                    const std::size_t se = static_cast<std::size_t>(e);
+                    hessian_sum(sd, se) += hessians[i](sd, se);
+                }
+            }
+        }
+
+        EXPECT_NEAR(value_sum, Real(1), value_tol)
+            << "Element type " << static_cast<int>(basis.element_type())
+            << ", order " << basis.order()
+            << ", xi=(" << xi[0] << "," << xi[1] << "," << xi[2] << ")";
+
+        for (int d = 0; d < basis.dimension(); ++d) {
+            const std::size_t sd = static_cast<std::size_t>(d);
+            EXPECT_NEAR(gradient_sum[sd], Real(0), derivative_tol)
+                << "Gradient sum mismatch for element type " << static_cast<int>(basis.element_type())
+                << ", order " << basis.order()
+                << ", dim " << d;
+            for (int e = 0; e < basis.dimension(); ++e) {
+                const std::size_t se = static_cast<std::size_t>(e);
+                EXPECT_NEAR(hessian_sum(sd, se), Real(0), derivative_tol)
+                    << "Hessian sum mismatch for element type " << static_cast<int>(basis.element_type())
+                    << ", order " << basis.order()
+                    << ", component (" << d << "," << e << ")";
+            }
+        }
+    }
+}
+
+bool is_on_pyramid_face(const Point& point,
+                        PyramidFace face,
+                        Real tol = Real(1e-12)) {
+    const Real scale = Real(1) - point[2];
+    switch (face) {
+        case PyramidFace::Base:
+            return std::abs(point[2]) <= tol;
+        case PyramidFace::South:
+            return std::abs(point[1] + scale) <= tol;
+        case PyramidFace::East:
+            return std::abs(point[0] - scale) <= tol;
+        case PyramidFace::North:
+            return std::abs(point[1] - scale) <= tol;
+        case PyramidFace::West:
+            return std::abs(point[0] + scale) <= tol;
+    }
+    return false;
+}
+
+Point map_pyramid_face_to_reference(PyramidFace face,
+                                    const Point& point) {
+    const Real scale = Real(1) - point[2];
+    switch (face) {
+        case PyramidFace::Base:
+            return Point{point[0], point[1], Real(0)};
+        case PyramidFace::South:
+            return Point{(scale - point[0]) / Real(2), point[2], Real(0)};
+        case PyramidFace::East:
+            return Point{(scale + point[1]) / Real(2), point[2], Real(0)};
+        case PyramidFace::North:
+            return Point{(scale + point[0]) / Real(2), point[2], Real(0)};
+        case PyramidFace::West:
+            return Point{(scale - point[1]) / Real(2), point[2], Real(0)};
+    }
+    return Point{};
+}
+
+std::vector<Point> sample_points_for_pyramid_face(PyramidFace face) {
+    switch (face) {
+        case PyramidFace::Base:
+            return {
+                Point{Real(0.15), Real(-0.2), Real(0)},
+                Point{Real(-0.55), Real(0.35), Real(0)}
+            };
+        case PyramidFace::South:
+            return {
+                Point{Real(-0.2), Real(-0.8), Real(0.2)},
+                Point{Real(0.05), Real(-0.35), Real(0.65)}
+            };
+        case PyramidFace::East:
+            return {
+                Point{Real(0.8), Real(-0.25), Real(0.2)},
+                Point{Real(0.3), Real(0.08), Real(0.7)}
+            };
+        case PyramidFace::North:
+            return {
+                Point{Real(0.25), Real(0.8), Real(0.2)},
+                Point{Real(-0.08), Real(0.35), Real(0.65)}
+            };
+        case PyramidFace::West:
+            return {
+                Point{Real(-0.8), Real(0.2), Real(0.2)},
+                Point{Real(-0.3), Real(-0.05), Real(0.7)}
+            };
+    }
+    return {};
+}
+
+bool is_on_pyramid_edge(const Point& point,
+                        PyramidEdge edge,
+                        Real tol = Real(1e-12)) {
+    const Real scale = Real(1) - point[2];
+    switch (edge) {
+        case PyramidEdge::BaseSouth:
+            return std::abs(point[2]) <= tol && std::abs(point[1] + Real(1)) <= tol;
+        case PyramidEdge::BaseEast:
+            return std::abs(point[2]) <= tol && std::abs(point[0] - Real(1)) <= tol;
+        case PyramidEdge::BaseNorth:
+            return std::abs(point[2]) <= tol && std::abs(point[1] - Real(1)) <= tol;
+        case PyramidEdge::BaseWest:
+            return std::abs(point[2]) <= tol && std::abs(point[0] + Real(1)) <= tol;
+        case PyramidEdge::VerticalSW:
+            return std::abs(point[0] + scale) <= tol && std::abs(point[1] + scale) <= tol;
+        case PyramidEdge::VerticalSE:
+            return std::abs(point[0] - scale) <= tol && std::abs(point[1] + scale) <= tol;
+        case PyramidEdge::VerticalNE:
+            return std::abs(point[0] - scale) <= tol && std::abs(point[1] - scale) <= tol;
+        case PyramidEdge::VerticalNW:
+            return std::abs(point[0] + scale) <= tol && std::abs(point[1] - scale) <= tol;
+    }
+    return false;
+}
+
+Point map_pyramid_edge_to_reference(PyramidEdge edge,
+                                    const Point& point) {
+    switch (edge) {
+        case PyramidEdge::BaseSouth:
+        case PyramidEdge::BaseNorth:
+            return Point{point[0], Real(0), Real(0)};
+        case PyramidEdge::BaseEast:
+        case PyramidEdge::BaseWest:
+            return Point{point[1], Real(0), Real(0)};
+        case PyramidEdge::VerticalSW:
+        case PyramidEdge::VerticalSE:
+        case PyramidEdge::VerticalNE:
+        case PyramidEdge::VerticalNW:
+            return Point{Real(2) * point[2] - Real(1), Real(0), Real(0)};
+    }
+    return Point{};
+}
+
+std::vector<Point> sample_points_for_pyramid_edge(PyramidEdge edge) {
+    switch (edge) {
+        case PyramidEdge::BaseSouth:
+            return {Point{Real(-0.65), Real(-1), Real(0)}, Point{Real(0.35), Real(-1), Real(0)}};
+        case PyramidEdge::BaseEast:
+            return {Point{Real(1), Real(-0.45), Real(0)}, Point{Real(1), Real(0.55), Real(0)}};
+        case PyramidEdge::BaseNorth:
+            return {Point{Real(-0.55), Real(1), Real(0)}, Point{Real(0.45), Real(1), Real(0)}};
+        case PyramidEdge::BaseWest:
+            return {Point{Real(-1), Real(-0.55), Real(0)}, Point{Real(-1), Real(0.45), Real(0)}};
+        case PyramidEdge::VerticalSW:
+            return {Point{Real(-0.75), Real(-0.75), Real(0.25)}, Point{Real(-0.3), Real(-0.3), Real(0.7)}};
+        case PyramidEdge::VerticalSE:
+            return {Point{Real(0.75), Real(-0.75), Real(0.25)}, Point{Real(0.3), Real(-0.3), Real(0.7)}};
+        case PyramidEdge::VerticalNE:
+            return {Point{Real(0.75), Real(0.75), Real(0.25)}, Point{Real(0.3), Real(0.3), Real(0.7)}};
+        case PyramidEdge::VerticalNW:
+            return {Point{Real(-0.75), Real(0.75), Real(0.25)}, Point{Real(-0.3), Real(0.3), Real(0.7)}};
+    }
+    return {};
+}
+
+std::vector<int> map_pyramid_nodes_to_lower_basis_nodes(
+    const std::vector<Point>& pyramid_nodes,
+    const std::vector<Point>& lower_basis_nodes,
+    const std::function<bool(const Point&)>& selector,
+    const std::function<Point(const Point&)>& mapper) {
+    std::vector<int> mapping(pyramid_nodes.size(), -1);
+    std::size_t face_count = 0;
+    for (std::size_t i = 0; i < pyramid_nodes.size(); ++i) {
+        if (!selector(pyramid_nodes[i])) {
+            continue;
+        }
+
+        ++face_count;
+        const Point mapped = mapper(pyramid_nodes[i]);
+        bool found = false;
+        for (std::size_t j = 0; j < lower_basis_nodes.size(); ++j) {
+            if (points_close(mapped, lower_basis_nodes[j])) {
+                mapping[i] = static_cast<int>(j);
+                found = true;
+                break;
+            }
+        }
+        EXPECT_TRUE(found)
+            << "Failed to match pyramid trace node at (" << pyramid_nodes[i][0] << ","
+            << pyramid_nodes[i][1] << "," << pyramid_nodes[i][2] << ")";
+    }
+
+    EXPECT_EQ(face_count, lower_basis_nodes.size());
+    return mapping;
+}
+
+void expect_pyramid_face_trace_matches_lower_basis(int order,
+                                                   PyramidFace face,
+                                                   Real tol = Real(2e-10)) {
+    LagrangeBasis pyramid(ElementType::Pyramid5, order);
+    const bool base_face = face == PyramidFace::Base;
+    LagrangeBasis lower(base_face ? ElementType::Quad4 : ElementType::Triangle3, order);
+
+    const auto mapping = map_pyramid_nodes_to_lower_basis_nodes(
+        pyramid.nodes(),
+        lower.nodes(),
+        [&](const Point& point) { return is_on_pyramid_face(point, face); },
+        [&](const Point& point) { return map_pyramid_face_to_reference(face, point); });
+
+    for (const auto& face_point : sample_points_for_pyramid_face(face)) {
+        std::vector<Real> pyramid_values;
+        std::vector<Real> lower_values;
+        pyramid.evaluate_values(face_point, pyramid_values);
+        lower.evaluate_values(map_pyramid_face_to_reference(face, face_point), lower_values);
+
+        ASSERT_EQ(pyramid_values.size(), pyramid.size());
+        ASSERT_EQ(lower_values.size(), lower.size());
+
+        for (std::size_t i = 0; i < pyramid.size(); ++i) {
+            if (mapping[i] >= 0) {
+                EXPECT_NEAR(pyramid_values[i], lower_values[static_cast<std::size_t>(mapping[i])], tol)
+                    << "Face trace mismatch for order " << order
+                    << ", face " << static_cast<int>(face)
+                    << ", basis " << i;
+            } else {
+                EXPECT_NEAR(pyramid_values[i], Real(0), tol)
+                    << "Off-face pyramid basis should vanish on face for order " << order
+                    << ", face " << static_cast<int>(face)
+                    << ", basis " << i;
+            }
+        }
+    }
+}
+
+void expect_pyramid_edge_trace_matches_line_basis(int order,
+                                                  PyramidEdge edge,
+                                                  Real tol = Real(2e-10)) {
+    LagrangeBasis pyramid(ElementType::Pyramid5, order);
+    LagrangeBasis line(ElementType::Line2, order);
+
+    const auto mapping = map_pyramid_nodes_to_lower_basis_nodes(
+        pyramid.nodes(),
+        line.nodes(),
+        [&](const Point& point) { return is_on_pyramid_edge(point, edge); },
+        [&](const Point& point) { return map_pyramid_edge_to_reference(edge, point); });
+
+    for (const auto& edge_point : sample_points_for_pyramid_edge(edge)) {
+        std::vector<Real> pyramid_values;
+        std::vector<Real> line_values;
+        pyramid.evaluate_values(edge_point, pyramid_values);
+        line.evaluate_values(map_pyramid_edge_to_reference(edge, edge_point), line_values);
+
+        ASSERT_EQ(pyramid_values.size(), pyramid.size());
+        ASSERT_EQ(line_values.size(), line.size());
+
+        for (std::size_t i = 0; i < pyramid.size(); ++i) {
+            if (mapping[i] >= 0) {
+                EXPECT_NEAR(pyramid_values[i], line_values[static_cast<std::size_t>(mapping[i])], tol)
+                    << "Edge trace mismatch for order " << order
+                    << ", edge " << static_cast<int>(edge)
+                    << ", basis " << i;
+            } else {
+                EXPECT_NEAR(pyramid_values[i], Real(0), tol)
+                    << "Off-edge pyramid basis should vanish on edge for order " << order
+                    << ", edge " << static_cast<int>(edge)
+                    << ", basis " << i;
+            }
+        }
+    }
+}
+
+struct StridedOutputRequest {
+    bool values;
+    bool gradients;
+    bool hessians;
+};
+
+void expect_strided_matches_pointwise(ElementType type,
+                                      int order,
+                                      const StridedOutputRequest& request) {
+    LagrangeBasis basis(type, order);
+    const auto points = dense_sample_points_for(type);
+    const std::size_t stride = points.size() + 3u;
+    constexpr Real sentinel = Real(-12345.25);
+
+    std::vector<Real> values(request.values ? basis.size() * stride : 0u, sentinel);
+    std::vector<Real> gradients(request.gradients ? basis.size() * 3u * stride : 0u, sentinel);
+    std::vector<Real> hessians(request.hessians ? basis.size() * 9u * stride : 0u, sentinel);
+
+    basis.evaluate_at_quadrature_points_strided(
+        points,
+        stride,
+        request.values ? values.data() : nullptr,
+        request.gradients ? gradients.data() : nullptr,
+        request.hessians ? hessians.data() : nullptr);
+
+    const Real tol = (type == ElementType::Pyramid5 || type == ElementType::Pyramid14)
+        ? Real(5e-10)
+        : Real(1e-12);
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        if (request.values) {
+            std::vector<Real> expected;
+            basis.evaluate_values(points[q], expected);
+            ASSERT_EQ(expected.size(), basis.size());
+            for (std::size_t d = 0; d < basis.size(); ++d) {
+                EXPECT_NEAR(values[d * stride + q], expected[d], tol)
+                    << "type=" << static_cast<int>(type)
+                    << ", order=" << order
+                    << ", dof=" << d
+                    << ", q=" << q;
+            }
+        }
+
+        if (request.gradients) {
+            std::vector<Gradient> expected;
+            basis.evaluate_gradients(points[q], expected);
+            ASSERT_EQ(expected.size(), basis.size());
+            for (std::size_t d = 0; d < basis.size(); ++d) {
+                for (std::size_t c = 0; c < 3u; ++c) {
+                    EXPECT_NEAR(gradients[(d * 3u + c) * stride + q], expected[d][c], tol)
+                        << "type=" << static_cast<int>(type)
+                        << ", order=" << order
+                        << ", dof=" << d
+                        << ", component=" << c
+                        << ", q=" << q;
+                }
+            }
+        }
+
+        if (request.hessians) {
+            std::vector<Hessian> expected;
+            basis.evaluate_hessians(points[q], expected);
+            ASSERT_EQ(expected.size(), basis.size());
+            for (std::size_t d = 0; d < basis.size(); ++d) {
+                for (std::size_t r = 0; r < 3u; ++r) {
+                    for (std::size_t c = 0; c < 3u; ++c) {
+                        EXPECT_NEAR(hessians[(d * 9u + r * 3u + c) * stride + q],
+                                    expected[d](r, c),
+                                    Real(4) * tol)
+                            << "type=" << static_cast<int>(type)
+                            << ", order=" << order
+                            << ", dof=" << d
+                            << ", hessian=(" << r << "," << c << ")"
+                            << ", q=" << q;
+                    }
+                }
+            }
+        }
+    }
+
+    const auto expect_padding_untouched = [&](const std::vector<Real>& buffer,
+                                              std::size_t rows) {
+        for (std::size_t row = 0; row < rows; ++row) {
+            for (std::size_t q = points.size(); q < stride; ++q) {
+                EXPECT_EQ(buffer[row * stride + q], sentinel)
+                    << "type=" << static_cast<int>(type)
+                    << ", order=" << order
+                    << ", row=" << row
+                    << ", padding q=" << q;
+            }
+        }
+    };
+
+    if (request.values) {
+        expect_padding_untouched(values, basis.size());
+    }
+    if (request.gradients) {
+        expect_padding_untouched(gradients, basis.size() * 3u);
+    }
+    if (request.hessians) {
+        expect_padding_untouched(hessians, basis.size() * 9u);
+    }
+}
+
+void expect_raw_to_matches_vector_evaluation(ElementType type, int order) {
+    LagrangeBasis basis(type, order);
+    const Real tol = (type == ElementType::Pyramid5 || type == ElementType::Pyramid14)
+        ? Real(5e-10)
+        : Real(1e-12);
+
+    for (const auto& point : sample_points_for(type)) {
+        std::vector<Real> values;
+        std::vector<Gradient> gradients;
+        std::vector<Hessian> hessians;
+        basis.evaluate_all(point, values, gradients, hessians);
+
+        std::vector<Real> raw_values(basis.size());
+        std::vector<Real> raw_gradients(basis.size() * 3u);
+        std::vector<Real> raw_hessians(basis.size() * 9u);
+        basis.evaluate_values_to(point, raw_values.data());
+        basis.evaluate_gradients_to(point, raw_gradients.data());
+        basis.evaluate_hessians_to(point, raw_hessians.data());
+
+        for (std::size_t i = 0; i < basis.size(); ++i) {
+            EXPECT_NEAR(raw_values[i], values[i], tol)
+                << "type=" << static_cast<int>(type) << ", order=" << order << ", dof=" << i;
+            for (std::size_t c = 0; c < 3u; ++c) {
+                EXPECT_NEAR(raw_gradients[i * 3u + c], gradients[i][c], tol)
+                    << "type=" << static_cast<int>(type)
+                    << ", order=" << order
+                    << ", dof=" << i
+                    << ", gradient component=" << c;
+            }
+            for (std::size_t r = 0; r < 3u; ++r) {
+                for (std::size_t c = 0; c < 3u; ++c) {
+                    EXPECT_NEAR(raw_hessians[i * 9u + r * 3u + c], hessians[i](r, c), Real(4) * tol)
+                        << "type=" << static_cast<int>(type)
+                        << ", order=" << order
+                        << ", dof=" << i
+                        << ", hessian=(" << r << "," << c << ")";
+                }
+            }
+        }
+    }
+}
+
+} // namespace
+
+TEST(SolverBasisAdapter, ShapeValuesGradientsAndMeshOverloadMatchLegacy) {
+    constexpr double tol = 2e-12;
+
+    for (const auto& c : solver_basis_adapter_cases()) {
+        SCOPED_TRACE("element=" + std::to_string(static_cast<int>(c.type)));
+        Vector<double> weights;
+        Array<double> xi;
+        fill_legacy_quadrature(c, weights, xi);
+
+        Array<double> legacy_N(c.eNoN, c.nG);
+        Array<double> adapter_N(c.eNoN, c.nG);
+        Array3<double> legacy_Nx(c.insd, c.eNoN, c.nG);
+        Array3<double> adapter_Nx(c.insd, c.eNoN, c.nG);
+        auto legacy_shape = legacy_solver_nn::get_element_shape_data.find(c.type);
+
+        faceType legacy_face;
+        if (legacy_shape == legacy_solver_nn::get_element_shape_data.end()) {
+            ASSERT_EQ(c.type, consts::ElementType::QUD8);
+            legacy_face.eType = c.type;
+            legacy_face.eNoN = c.eNoN;
+            legacy_face.nG = c.nG;
+            legacy_face.xi = xi;
+            legacy_face.N.resize(c.eNoN, c.nG);
+            legacy_face.Nx.resize(c.insd, c.eNoN, c.nG);
+        }
+
+        for (int g = 0; g < c.nG; ++g) {
+            if (legacy_shape != legacy_solver_nn::get_element_shape_data.end()) {
+                legacy_shape->second(c.insd, c.eNoN, g, xi, legacy_N, legacy_Nx);
+            } else {
+                legacy_solver_nn::set_face_shape_data.at(c.type)(g, legacy_face);
+            }
+            nn::get_gnn(c.insd, c.type, c.eNoN, g, xi, adapter_N, adapter_Nx);
+
+            double partition = 0.0;
+            std::array<double, 3> gradient_sum{0.0, 0.0, 0.0};
+            for (int a = 0; a < c.eNoN; ++a) {
+                partition += adapter_N(a, g);
+                for (int d = 0; d < c.insd; ++d) {
+                    gradient_sum[static_cast<std::size_t>(d)] += adapter_Nx(d, a, g);
+                }
+            }
+
+            EXPECT_NEAR(partition, 1.0, tol)
+                << "element=" << static_cast<int>(c.type) << ", g=" << g;
+            for (int d = 0; d < c.insd; ++d) {
+                EXPECT_NEAR(gradient_sum[static_cast<std::size_t>(d)], 0.0, tol)
+                    << "element=" << static_cast<int>(c.type) << ", d=" << d << ", g=" << g;
+            }
+        }
+
+        if (legacy_shape == legacy_solver_nn::get_element_shape_data.end()) {
+            legacy_N = legacy_face.N;
+            legacy_Nx = legacy_face.Nx;
+        }
+
+        expect_arrays_near(adapter_N, legacy_N, tol);
+        expect_array3_near(adapter_Nx, legacy_Nx, tol);
+
+        mshType mesh;
+        mesh.eType = c.type;
+        mesh.eNoN = c.eNoN;
+        mesh.nG = c.nG;
+        mesh.xi = xi;
+        mesh.N.resize(c.eNoN, c.nG);
+        mesh.Nx.resize(c.insd, c.eNoN, c.nG);
+        for (int g = 0; g < c.nG; ++g) {
+            nn::get_gnn(g, mesh);
+        }
+
+        expect_arrays_near(mesh.N, legacy_N, tol);
+        expect_array3_near(mesh.Nx, legacy_Nx, tol);
+    }
+}
+
+TEST(SolverFaceBasisAdapter, ShapeValuesGradientsAndDispatchMatchLegacyFaceTable) {
+    constexpr double tol = 2e-12;
+
+    int covered = 0;
+    for (const auto& c : solver_face_basis_adapter_cases()) {
+        SCOPED_TRACE("face element=" + std::to_string(static_cast<int>(c.type)));
+
+        faceType legacy_face = initialized_face_for_case(c);
+        faceType basis_face = initialized_face_for_case(c);
+
+        for (int g = 0; g < c.nG; ++g) {
+            legacy_solver_nn::set_face_shape_data.at(c.type)(g, legacy_face);
+            nn::get_gnn(nullptr, g, basis_face);
+            expect_face_partition_identities(c, basis_face, g, tol);
+        }
+
+        expect_arrays_near(basis_face.N, legacy_face.N, tol);
+        expect_array3_near(basis_face.Nx, legacy_face.Nx, tol);
+        ++covered;
+    }
+
+    EXPECT_EQ(covered, 7);
+}
+
+TEST(SolverFaceBasisAdapter, MappedFacesFailClosedWithoutLegacyFallback) {
+    using consts::ElementType;
+
+    SolverBasisAdapterCase c{ElementType::LIN1, ElementType::LIN1, 1, 3, 2};
+    faceType face = initialized_face_for_case(c);
+
+    try {
+        nn::get_gnn(nullptr, 0, face);
+        FAIL() << "Expected mapped face dispatch to reject mismatched eNoN";
+    } catch (const svmp::FE::basis::BasisEvaluationException& exception) {
+        const std::string message = exception.what();
+        EXPECT_NE(message.find("legacy fallback was not attempted"), std::string::npos)
+            << message;
+    }
+}
+
+TEST(SolverFaceBasisAdapter, PointFaceRemainsLegacyValuePath) {
+    faceType face;
+    face.eType = consts::ElementType::PNT;
+    face.eNoN = 1;
+    face.nG = 1;
+    face.N.resize(1, 1);
+    face.Nx.resize(1, 1, 1);
+    face.N(0, 0) = -7.0;
+    face.Nx(0, 0, 0) = 42.0;
+
+    nn::get_gnn(nullptr, 0, face);
+
+    EXPECT_DOUBLE_EQ(face.N(0, 0), 1.0);
+    EXPECT_DOUBLE_EQ(face.Nx(0, 0, 0), 42.0);
+}
+
+TEST(SolverFaceBasisAdapter, UnsupportedFacesThrowClearErrors) {
+    faceType nrb_face;
+    nrb_face.eType = consts::ElementType::NRB;
+    nrb_face.eNoN = 1;
+    nrb_face.nG = 1;
+    nrb_face.N.resize(1, 1);
+    nrb_face.Nx.resize(1, 1, 1);
+    EXPECT_THROW(nn::get_gnn(nullptr, 0, nrb_face), svmp::FE::NotImplementedException);
+
+    faceType unknown_face;
+    unknown_face.eType = consts::ElementType::NA;
+    unknown_face.eNoN = 1;
+    unknown_face.nG = 1;
+    unknown_face.N.resize(1, 1);
+    unknown_face.Nx.resize(1, 1, 1);
+    EXPECT_THROW(nn::get_gnn(nullptr, 0, unknown_face), svmp::FE::InvalidElementException);
+}
+
+TEST(SolverBasisAdapter, QuadraturePathsRemainLegacyCompatible) {
+    constexpr double tol = 0.0;
+
+    for (const auto& c : solver_basis_adapter_cases()) {
+        auto mesh_it = legacy_solver_nn::set_element_gauss_int_data.find(c.type);
+        if (mesh_it != legacy_solver_nn::set_element_gauss_int_data.end()) {
+            mshType legacy_mesh;
+            legacy_mesh.eType = c.type;
+            legacy_mesh.eNoN = c.eNoN;
+            legacy_mesh.nG = c.nG;
+            legacy_mesh.w.resize(c.nG);
+            legacy_mesh.xi.resize(c.insd, c.nG);
+            mesh_it->second(legacy_mesh);
+
+            mshType adapter_mesh;
+            adapter_mesh.eType = c.type;
+            adapter_mesh.eNoN = c.eNoN;
+            adapter_mesh.nG = c.nG;
+            adapter_mesh.w.resize(c.nG);
+            adapter_mesh.xi.resize(c.insd, c.nG);
+            nn::get_gip(adapter_mesh);
+
+            expect_vectors_near(adapter_mesh.w, legacy_mesh.w, tol);
+            expect_arrays_near(adapter_mesh.xi, legacy_mesh.xi, tol);
+        }
+
+        auto scalar_it = legacy_solver_nn::get_element_gauss_int_data.find(c.type);
+        if (scalar_it != legacy_solver_nn::get_element_gauss_int_data.end()) {
+            Vector<double> legacy_w(c.nG);
+            Vector<double> adapter_w(c.nG);
+            Array<double> legacy_xi(c.insd, c.nG);
+            Array<double> adapter_xi(c.insd, c.nG);
+
+            scalar_it->second(c.insd, c.nG, legacy_w, legacy_xi);
+            nn::get_gip(c.insd, c.type, c.nG, adapter_w, adapter_xi);
+
+            expect_vectors_near(adapter_w, legacy_w, tol);
+            expect_arrays_near(adapter_xi, legacy_xi, tol);
+        }
+    }
+
+    mshType legacy_tet;
+    legacy_tet.eType = consts::ElementType::TET4;
+    legacy_tet.eNoN = 4;
+    legacy_tet.nG = 4;
+    legacy_tet.qmTET4 = 0.25;
+    legacy_tet.w.resize(4);
+    legacy_tet.xi.resize(3, 4);
+    legacy_solver_nn::set_element_gauss_int_data.at(consts::ElementType::TET4)(legacy_tet);
+
+    mshType adapter_tet;
+    adapter_tet.eType = consts::ElementType::TET4;
+    adapter_tet.eNoN = 4;
+    adapter_tet.nG = 4;
+    adapter_tet.qmTET4 = 0.25;
+    adapter_tet.w.resize(4);
+    adapter_tet.xi.resize(3, 4);
+    nn::get_gip(adapter_tet);
+
+    expect_vectors_near(adapter_tet.w, legacy_tet.w, tol);
+    expect_arrays_near(adapter_tet.xi, legacy_tet.xi, tol);
+}
+
+TEST(SolverBasisAdapter, HessiansCoverEveryMappedScalarVolumeElement) {
+    constexpr double partition_tol = 2e-10;
+    constexpr double finite_difference_tol = 2e-5;
+    constexpr double zero_tol = 2e-12;
+
+    int covered = 0;
+    for (const auto& c : solver_hessian_adapter_cases()) {
+        SCOPED_TRACE("element=" + std::to_string(static_cast<int>(c.type)));
+        Vector<double> weights;
+        Array<double> xi;
+        fill_legacy_quadrature(c, weights, xi);
+
+        const int ind2 = packed_hessian_components(c.insd);
+        Array3<double> adapter_Nxx(ind2, c.eNoN, c.nG);
+        fill_array3(adapter_Nxx, std::numeric_limits<double>::quiet_NaN());
+
+        for (int g = 0; g < c.nG; ++g) {
+            nn::get_gn_nxx(c.insd, ind2, c.type, c.eNoN, g, xi, adapter_Nxx);
+            expect_partition_hessian_identity(c, adapter_Nxx, g, partition_tol);
+            expect_solver_hessian_matches_gradient_finite_difference(
+                c, xi, g, adapter_Nxx, finite_difference_tol);
+
+            if (c.type == consts::ElementType::LIN1 ||
+                c.type == consts::ElementType::TRI3 ||
+                c.type == consts::ElementType::TET4) {
+                expect_all_hessians_zero(c, adapter_Nxx, g, zero_tol);
+            }
+        }
+
+        if (c.type == consts::ElementType::QUD4) {
+            EXPECT_TRUE(array3_has_nonzero_component(adapter_Nxx, 2, zero_tol));
+        } else if (c.type == consts::ElementType::HEX8) {
+            EXPECT_TRUE(array3_has_nonzero_component(adapter_Nxx, 3, zero_tol));
+            EXPECT_TRUE(array3_has_nonzero_component(adapter_Nxx, 4, zero_tol));
+            EXPECT_TRUE(array3_has_nonzero_component(adapter_Nxx, 5, zero_tol));
+        } else if (c.type == consts::ElementType::WDG) {
+            EXPECT_TRUE(array3_has_nonzero_component(adapter_Nxx, 5, zero_tol));
+        }
+        ++covered;
+    }
+
+    EXPECT_EQ(covered, 13);
+}
+
+TEST(SolverBasisAdapter, HessianPackingMatchesLegacyWhereLegacyIsApproved) {
+    constexpr double tol = 2e-12;
+
+    for (const auto& c : solver_legacy_hessian_parity_cases()) {
+        Vector<double> weights;
+        Array<double> xi;
+        fill_legacy_quadrature(c, weights, xi);
+
+        const int ind2 = packed_hessian_components(c.insd);
+        Array3<double> legacy_Nxx(ind2, c.eNoN, c.nG);
+        Array3<double> adapter_Nxx(ind2, c.eNoN, c.nG);
+
+        for (int g = 0; g < c.nG; ++g) {
+            legacy_solver_nn::get_element_2nd_derivs.at(c.type)(
+                c.insd, ind2, c.eNoN, g, xi, legacy_Nxx);
+            nn::get_gn_nxx(c.insd, ind2, c.type, c.eNoN, g, xi, adapter_Nxx);
+        }
+
+        expect_array3_near(adapter_Nxx, legacy_Nxx, tol);
+    }
+}
+
+TEST(SolverBasisAdapter, Qud8HessiansDoNotUseLegacyFallback) {
+    using consts::ElementType;
+    SolverBasisAdapterCase c{ElementType::QUD8, ElementType::QUD9, 2, 8, 9};
+
+    Vector<double> weights;
+    Array<double> xi;
+    fill_legacy_quadrature(c, weights, xi);
+
+    const int ind2 = packed_hessian_components(c.insd);
+    Array3<double> legacy_Nxx(ind2, c.eNoN, c.nG);
+    Array3<double> adapter_Nxx(ind2, c.eNoN, c.nG);
+    fill_array3(legacy_Nxx, 0.0);
+    fill_array3(adapter_Nxx, 0.0);
+
+    for (int g = 0; g < c.nG; ++g) {
+        legacy_solver_nn::get_element_2nd_derivs.at(c.type)(
+            c.insd, ind2, c.eNoN, g, xi, legacy_Nxx);
+        nn::get_gn_nxx(c.insd, ind2, c.type, c.eNoN, g, xi, adapter_Nxx);
+    }
+
+    double max_abs_difference = 0.0;
+    for (int g = 0; g < c.nG; ++g) {
+        for (int a = 0; a < c.eNoN; ++a) {
+            for (int row = 0; row < ind2; ++row) {
+                max_abs_difference = std::max(
+                    max_abs_difference,
+                    std::abs(adapter_Nxx(row, a, g) - legacy_Nxx(row, a, g)));
+            }
+        }
+    }
+
+    EXPECT_GT(max_abs_difference, 1e-8);
+}
+
+TEST(SolverBasisAdapter, UnsupportedHessianFamiliesRemainNoOp) {
+    Array<double> xi(1, 1);
+    xi(0, 0) = 0.0;
+    Array3<double> Nxx(1, 1, 1);
+
+    for (const auto unsupported : {consts::ElementType::NRB, consts::ElementType::PNT}) {
+        fill_array3(Nxx, 42.0);
+        nn::get_gn_nxx(1, 1, unsupported, 1, 0, xi, Nxx);
+        EXPECT_DOUBLE_EQ(Nxx(0, 0, 0), 42.0)
+            << "element=" << static_cast<int>(unsupported);
+    }
+}
+
+TEST(SolverBasisAdapter, InitFsMshPopulatesMappedHessiansWithoutLShpFGate) {
+    using consts::ElementType;
+    const SolverBasisAdapterCase cases[] = {
+        {ElementType::QUD4, ElementType::QUD4, 2, 4, 4},
+        {ElementType::HEX8, ElementType::HEX8, 3, 8, 8},
+        {ElementType::HEX20, ElementType::HEX20, 3, 20, 27},
+        {ElementType::HEX27, ElementType::HEX27, 3, 27, 27},
+        {ElementType::WDG, ElementType::WDG, 3, 6, 6},
+    };
+
+    for (const auto& c : cases) {
+        SCOPED_TRACE("element=" + std::to_string(static_cast<int>(c.type)));
+        ComMod com_mod;
+        com_mod.nsd = c.insd;
+        mshType mesh = initialized_mesh_for_case(c, true);
+
+        fs::init_fs_msh(com_mod, mesh);
+
+        ASSERT_EQ(mesh.fs.size(), 1u);
+        ASSERT_EQ(mesh.fs[0].Nxx.nrows(), packed_hessian_components(c.insd));
+        if (c.type == ElementType::QUD4) {
+            EXPECT_TRUE(array3_has_nonzero_component(mesh.fs[0].Nxx, 2, 2e-12));
+        } else if (c.type == ElementType::HEX8) {
+            EXPECT_TRUE(array3_has_nonzero_component(mesh.fs[0].Nxx, 3, 2e-12));
+        } else if (c.type == ElementType::WDG) {
+            EXPECT_TRUE(array3_has_nonzero_component(mesh.fs[0].Nxx, 5, 2e-12));
+        } else {
+            bool has_nonzero = false;
+            for (int row = 0; row < mesh.fs[0].Nxx.nrows(); ++row) {
+                has_nonzero = has_nonzero ||
+                    array3_has_nonzero_component(mesh.fs[0].Nxx, row, 2e-12);
+            }
+            EXPECT_TRUE(has_nonzero);
+        }
+    }
+}
+
+TEST(LagrangeBasis, QuadPartitionOfUnity) {
+    LagrangeBasis basis(ElementType::Quad4, 1);
+    svmp::FE::math::Vector<Real, 3> xi{0.2, -0.3, 0.0};
+
+    std::vector<Real> values;
+    basis.evaluate_values(xi, values);
+
+    double sum = std::accumulate(values.begin(), values.end(), 0.0);
+    EXPECT_NEAR(sum, 1.0, 1e-12);
+}
+
+TEST(LagrangeBasis, LineGradientLinear) {
+    LagrangeBasis basis(ElementType::Line2, 1);
+    svmp::FE::math::Vector<Real, 3> xi{0.0, 0.0, 0.0};
+    std::vector<Gradient> grad;
+    basis.evaluate_gradients(xi, grad);
+
+    ASSERT_EQ(grad.size(), 2u);
+    EXPECT_NEAR(grad[0][0], -0.5, 1e-12);
+    EXPECT_NEAR(grad[1][0], 0.5, 1e-12);
+}
+
+TEST(LagrangeBasis, TrianglePartitionOfUnity) {
+    LagrangeBasis basis(ElementType::Triangle3, 1);
+    svmp::FE::math::Vector<Real, 3> xi{0.2, 0.3, 0.0};
+    std::vector<Real> values;
+    basis.evaluate_values(xi, values);
+
+    double sum = std::accumulate(values.begin(), values.end(), 0.0);
+    EXPECT_NEAR(sum, 1.0, 1e-12);
+}
+
+TEST(LagrangeBasis, SizeFormulasPerElement) {
+    for (int order = 0; order <= 3; ++order) {
+        {
+            LagrangeBasis line(ElementType::Line2, order);
+            EXPECT_EQ(line.size(), static_cast<std::size_t>(order + 1));
+        }
+        {
+            LagrangeBasis quad(ElementType::Quad4, order);
+            const std::size_t n1d = static_cast<std::size_t>(order + 1);
+            EXPECT_EQ(quad.size(), n1d * n1d);
+        }
+        {
+            LagrangeBasis hex(ElementType::Hex8, order);
+            const std::size_t n1d = static_cast<std::size_t>(order + 1);
+            EXPECT_EQ(hex.size(), n1d * n1d * n1d);
+        }
+        {
+            LagrangeBasis tri(ElementType::Triangle3, order);
+            const std::size_t expected =
+                static_cast<std::size_t>(order + 1) *
+                static_cast<std::size_t>(order + 2) / 2;
+            EXPECT_EQ(tri.size(), expected);
+        }
+        {
+            LagrangeBasis tet(ElementType::Tetra4, order);
+            const std::size_t expected =
+                static_cast<std::size_t>(order + 1) *
+                static_cast<std::size_t>(order + 2) *
+                static_cast<std::size_t>(order + 3) / 6;
+            EXPECT_EQ(tet.size(), expected);
+        }
+    }
+}
+
+TEST(LagrangeBasis, KroneckerDeltaAtNodes) {
+    const std::vector<std::pair<ElementType, int>> cases = {
+        {ElementType::Line2, 1},
+        {ElementType::Quad4, 1},
+        {ElementType::Triangle3, 1},
+        {ElementType::Tetra4, 1},
+        {ElementType::Hex8, 1},
+        {ElementType::Triangle3, 2},
+        {ElementType::Tetra4, 2},
+        {ElementType::Quad4, 2},
+        {ElementType::Hex8, 2},
+        {ElementType::Wedge6, 2}
+    };
+
+    for (const auto& c : cases) {
+        LagrangeBasis basis(c.first, c.second);
+        const auto& nodes = basis.nodes();
+        ASSERT_EQ(nodes.size(), basis.size());
+
+        for (std::size_t i = 0; i < nodes.size(); ++i) {
+            std::vector<Real> vals;
+            basis.evaluate_values(nodes[i], vals);
+            ASSERT_EQ(vals.size(), nodes.size());
+            for (std::size_t j = 0; j < nodes.size(); ++j) {
+                if (i == j) {
+                    EXPECT_NEAR(vals[j], 1.0, 1e-12);
+                } else {
+                    EXPECT_NEAR(vals[j], 0.0, 1e-12);
+                }
+            }
+        }
+    }
+}
+
+TEST(LagrangeBasis, MatchesNodeOrderingConventionsForLinearAndQuadratic) {
+    // Tensor-product elements
+    expect_nodes_match_node_ordering(ElementType::Line2, 1, ElementType::Line2);
+    expect_nodes_match_node_ordering(ElementType::Line2, 2, ElementType::Line3);
+    expect_nodes_match_node_ordering(ElementType::Quad4, 1, ElementType::Quad4);
+    expect_nodes_match_node_ordering(ElementType::Quad4, 2, ElementType::Quad9);
+    expect_nodes_match_node_ordering(ElementType::Hex8, 1, ElementType::Hex8);
+    expect_nodes_match_node_ordering(ElementType::Hex8, 2, ElementType::Hex27);
+
+    // Simplex elements
+    expect_nodes_match_node_ordering(ElementType::Triangle3, 1, ElementType::Triangle3);
+    expect_nodes_match_node_ordering(ElementType::Triangle3, 2, ElementType::Triangle6);
+    expect_nodes_match_node_ordering(ElementType::Tetra4, 1, ElementType::Tetra4);
+    expect_nodes_match_node_ordering(ElementType::Tetra4, 2, ElementType::Tetra10);
+
+    // Mixed topology
+    expect_nodes_match_node_ordering(ElementType::Wedge6, 1, ElementType::Wedge6);
+    expect_nodes_match_node_ordering(ElementType::Wedge6, 2, ElementType::Wedge18);
+
+    // Pyramid
+    expect_nodes_match_node_ordering(ElementType::Pyramid5, 1, ElementType::Pyramid5);
+    expect_nodes_match_node_ordering(ElementType::Pyramid14, 2, ElementType::Pyramid14);
+}
+
+TEST(LagrangeBasis, WedgeAndPyramidPartitionOfUnity) {
+    {
+        LagrangeBasis wedge(ElementType::Wedge6, 1);
+        svmp::FE::math::Vector<Real, 3> xi{Real(0.2), Real(0.1), Real(0.3)};
+        std::vector<Real> vals;
+        wedge.evaluate_values(xi, vals);
+        const double sum = std::accumulate(vals.begin(), vals.end(), 0.0);
+        EXPECT_NEAR(sum, 1.0, 1e-12);
+    }
+
+    {
+        LagrangeBasis wedge_q(ElementType::Wedge18, 2);
+        svmp::FE::math::Vector<Real, 3> xi{Real(0.2), Real(0.1), Real(-0.25)};
+        std::vector<Real> vals;
+        wedge_q.evaluate_values(xi, vals);
+        const double sum = std::accumulate(vals.begin(), vals.end(), 0.0);
+        EXPECT_NEAR(sum, 1.0, 1e-12);
+
+        // Wedge18 should report 18 nodes in ReferenceNodeLayout
+        EXPECT_EQ(ReferenceNodeLayout::num_nodes(ElementType::Wedge18), 18u);
+        // Corner nodes should match Wedge6 vertices
+        auto v0 = ReferenceNodeLayout::get_node_coords(ElementType::Wedge18, 0);
+        auto v1 = ReferenceNodeLayout::get_node_coords(ElementType::Wedge18, 1);
+        auto v2 = ReferenceNodeLayout::get_node_coords(ElementType::Wedge18, 2);
+        EXPECT_NEAR(v0[0], Real(0), 1e-14);
+        EXPECT_NEAR(v0[1], Real(0), 1e-14);
+        EXPECT_NEAR(v0[2], Real(-1), 1e-14);
+        EXPECT_NEAR(v1[0], Real(1), 1e-14);
+        EXPECT_NEAR(v1[1], Real(0), 1e-14);
+        EXPECT_NEAR(v1[2], Real(-1), 1e-14);
+        EXPECT_NEAR(v2[0], Real(0), 1e-14);
+        EXPECT_NEAR(v2[1], Real(1), 1e-14);
+        EXPECT_NEAR(v2[2], Real(-1), 1e-14);
+    }
+
+    {
+        LagrangeBasis pyr(ElementType::Pyramid5, 1);
+        svmp::FE::math::Vector<Real, 3> xi{Real(0.1), Real(-0.2), Real(0.4)};
+        std::vector<Real> vals;
+        pyr.evaluate_values(xi, vals);
+        const double sum = std::accumulate(vals.begin(), vals.end(), 0.0);
+        EXPECT_NEAR(sum, 1.0, 1e-12);
+    }
+}
+
+TEST(LagrangeBasis, NonTensorStridedEvaluationMatchesPointwise) {
+    const std::vector<std::pair<ElementType, int>> cases = {
+        {ElementType::Triangle3, 3},
+        {ElementType::Tetra4, 3},
+        {ElementType::Wedge6, 3},
+        {ElementType::Pyramid5, 3},
+    };
+    const std::vector<StridedOutputRequest> requests = {
+        {true, false, false},
+        {false, true, false},
+        {false, false, true},
+        {true, true, false},
+        {true, false, true},
+        {false, true, true},
+        {true, true, true},
+    };
+
+    for (const auto& [type, order] : cases) {
+        for (const auto& request : requests) {
+            SCOPED_TRACE(static_cast<int>(type));
+            SCOPED_TRACE(order);
+            SCOPED_TRACE(request.values ? "values" : "no values");
+            SCOPED_TRACE(request.gradients ? "gradients" : "no gradients");
+            SCOPED_TRACE(request.hessians ? "hessians" : "no hessians");
+            expect_strided_matches_pointwise(type, order, request);
+        }
+    }
+}
+
+TEST(LagrangeBasis, RawOutputSinksMatchVectorEvaluationAcrossTopologies) {
+    const std::vector<std::pair<ElementType, int>> cases = {
+        {ElementType::Line2, 4},
+        {ElementType::Quad4, 3},
+        {ElementType::Hex8, 3},
+        {ElementType::Triangle3, 4},
+        {ElementType::Tetra4, 3},
+        {ElementType::Wedge6, 3},
+        {ElementType::Pyramid5, 3},
+    };
+
+    for (const auto& [type, order] : cases) {
+        SCOPED_TRACE(static_cast<int>(type));
+        SCOPED_TRACE(order);
+        expect_raw_to_matches_vector_evaluation(type, order);
+    }
+}
+
+TEST(LagrangeBasis, CanonicalConstructorsSupportArbitraryOrders) {
+    const struct Case {
+        ElementType type;
+        int max_order;
+    } cases[] = {
+        {ElementType::Line2, 8},
+        {ElementType::Triangle3, 6},
+        {ElementType::Quad4, 6},
+        {ElementType::Tetra4, 5},
+        {ElementType::Hex8, 5},
+        {ElementType::Wedge6, 5},
+        {ElementType::Pyramid5, 5},
+    };
+
+    for (const auto& c : cases) {
+        for (int order = 0; order <= c.max_order; ++order) {
+            LagrangeBasis basis(c.type, order);
+            EXPECT_EQ(basis.element_type(), c.type);
+            EXPECT_EQ(basis.order(), order);
+            EXPECT_EQ(basis.dimension(), expected_dimension(c.type));
+            EXPECT_EQ(basis.size(), expected_lagrange_size(c.type, order));
+            EXPECT_EQ(basis.nodes().size(), basis.size());
+        }
+    }
+}
+
+TEST(LagrangeBasis, AliasVariantsNormalizeToCanonicalPaths) {
+    expect_alias_matches_canonical(
+        ElementType::Line3, ElementType::Line2, 2, sample_points_for(ElementType::Line2));
+    expect_alias_matches_canonical(
+        ElementType::Triangle6, ElementType::Triangle3, 2, sample_points_for(ElementType::Triangle3));
+    expect_alias_matches_canonical(
+        ElementType::Quad9, ElementType::Quad4, 2, sample_points_for(ElementType::Quad4));
+    expect_alias_matches_canonical(
+        ElementType::Tetra10, ElementType::Tetra4, 2, sample_points_for(ElementType::Tetra4));
+    expect_alias_matches_canonical(
+        ElementType::Hex27, ElementType::Hex8, 2, sample_points_for(ElementType::Hex8));
+    expect_alias_matches_canonical(
+        ElementType::Wedge18, ElementType::Wedge6, 2, sample_points_for(ElementType::Wedge6));
+    expect_alias_matches_canonical(
+        ElementType::Pyramid14, ElementType::Pyramid5, 2, sample_points_for(ElementType::Pyramid5),
+        Real(2e-10));
+}
+
+TEST(LagrangeBasis, SerendipityVariantsRemainRejected) {
+    EXPECT_THROW((void)LagrangeBasis(ElementType::Quad8, 2), svmp::FE::FEException);
+    EXPECT_THROW((void)LagrangeBasis(ElementType::Hex20, 2), svmp::FE::FEException);
+    EXPECT_THROW((void)LagrangeBasis(ElementType::Wedge15, 2), svmp::FE::FEException);
+    EXPECT_THROW((void)LagrangeBasis(ElementType::Pyramid13, 2), svmp::FE::FEException);
+}
+
+TEST(LagrangeBasis, GeneratedNodeOrderingIsDeterministicAcrossOrders) {
+    const struct Case {
+        ElementType type;
+        int max_order;
+    } cases[] = {
+        {ElementType::Line2, 8},
+        {ElementType::Triangle3, 6},
+        {ElementType::Quad4, 6},
+        {ElementType::Tetra4, 5},
+        {ElementType::Hex8, 5},
+        {ElementType::Wedge6, 5},
+        {ElementType::Pyramid5, 5},
+    };
+
+    for (const auto& c : cases) {
+        for (int order = 0; order <= c.max_order; ++order) {
+            const auto generated_a = ReferenceNodeLayout::get_lagrange_node_coords(c.type, order);
+            const auto generated_b = ReferenceNodeLayout::get_lagrange_node_coords(c.type, order);
+            ASSERT_EQ(generated_a.size(), expected_lagrange_size(c.type, order));
+            ASSERT_EQ(generated_a.size(), generated_b.size());
+            for (std::size_t i = 0; i < generated_a.size(); ++i) {
+                EXPECT_TRUE(points_close(generated_a[i], generated_b[i]));
+            }
+        }
+    }
+}
+
+TEST(LagrangeBasis, NodeOrderingMatchesReferenceCoordinateOracles) {
+    const std::array<ElementType, 18> cases = {
+        ElementType::Line2, ElementType::Line3,
+        ElementType::Triangle3, ElementType::Triangle6,
+        ElementType::Quad4, ElementType::Quad8, ElementType::Quad9,
+        ElementType::Tetra4, ElementType::Tetra10,
+        ElementType::Hex8, ElementType::Hex20, ElementType::Hex27,
+        ElementType::Wedge6, ElementType::Wedge15, ElementType::Wedge18,
+        ElementType::Pyramid5, ElementType::Pyramid13, ElementType::Pyramid14,
+    };
+
+    for (ElementType type : cases) {
+        const auto expected = reference_node_coords(type);
+        ASSERT_FALSE(expected.empty());
+        ASSERT_EQ(ReferenceNodeLayout::num_nodes(type), expected.size());
+        for (std::size_t i = 0; i < expected.size(); ++i) {
+            const auto actual = ReferenceNodeLayout::get_node_coords(type, i);
+            EXPECT_TRUE(points_close(actual, expected[i]))
+                << "Element type " << static_cast<int>(type)
+                << ", node " << i;
+        }
+    }
+}
+
+TEST(LagrangeBasis, GeneratedLowOrderOrderingMatchesPublicAliasPaths) {
+    const struct Case {
+        ElementType type;
+        int order;
+        ElementType public_alias;
+    } cases[] = {
+        {ElementType::Line2, 1, ElementType::Line2},
+        {ElementType::Line2, 2, ElementType::Line3},
+        {ElementType::Triangle3, 1, ElementType::Triangle3},
+        {ElementType::Triangle3, 2, ElementType::Triangle6},
+        {ElementType::Quad4, 1, ElementType::Quad4},
+        {ElementType::Quad4, 2, ElementType::Quad9},
+        {ElementType::Tetra4, 1, ElementType::Tetra4},
+        {ElementType::Tetra4, 2, ElementType::Tetra10},
+        {ElementType::Hex8, 1, ElementType::Hex8},
+        {ElementType::Hex8, 2, ElementType::Hex27},
+        {ElementType::Wedge6, 1, ElementType::Wedge6},
+        {ElementType::Wedge6, 2, ElementType::Wedge18},
+        {ElementType::Pyramid5, 1, ElementType::Pyramid5},
+        {ElementType::Pyramid5, 2, ElementType::Pyramid14},
+    };
+
+    for (const auto& c : cases) {
+        const auto generated = ReferenceNodeLayout::get_lagrange_node_coords(c.type, c.order);
+        ASSERT_EQ(generated.size(), ReferenceNodeLayout::num_nodes(c.public_alias));
+        for (std::size_t i = 0; i < generated.size(); ++i) {
+            const auto public_alias = ReferenceNodeLayout::get_node_coords(c.public_alias, i);
+            EXPECT_TRUE(points_close(generated[i], public_alias));
+        }
+    }
+}
+
+TEST(LagrangeBasis, KroneckerDeltaAcrossCanonicalTopologiesAndOrders) {
+    const struct Case {
+        ElementType type;
+        int max_order;
+    } cases[] = {
+        {ElementType::Line2, 8},
+        {ElementType::Triangle3, 6},
+        {ElementType::Quad4, 6},
+        {ElementType::Tetra4, 5},
+        {ElementType::Hex8, 5},
+        {ElementType::Wedge6, 5},
+        {ElementType::Pyramid5, 5},
+    };
+
+    for (const auto& c : cases) {
+        for (int order = 0; order <= c.max_order; ++order) {
+            LagrangeBasis basis(c.type, order);
+            ASSERT_EQ(basis.size(), expected_lagrange_size(c.type, order));
+
+            std::vector<Real> values;
+            for (std::size_t node_i = 0; node_i < basis.size(); ++node_i) {
+                basis.evaluate_values(basis.nodes()[node_i], values);
+                ASSERT_EQ(values.size(), basis.size());
+                for (std::size_t basis_i = 0; basis_i < basis.size(); ++basis_i) {
+                    EXPECT_NEAR(values[basis_i], basis_i == node_i ? Real(1) : Real(0), Real(2e-10))
+                        << "Element type " << static_cast<int>(c.type)
+                        << ", order " << order
+                        << ", node " << node_i
+                        << ", basis " << basis_i;
+                }
+            }
+        }
+    }
+}
+
+TEST(LagrangeBasis, PartitionGradientAndHessianSumsAcrossCanonicalTopologiesAndOrders) {
+    const struct Case {
+        ElementType type;
+        int max_order;
+        Real tol;
+    } cases[] = {
+        {ElementType::Line2, 8, Real(1e-11)},
+        {ElementType::Triangle3, 6, Real(1e-10)},
+        {ElementType::Quad4, 6, Real(1e-10)},
+        {ElementType::Tetra4, 5, Real(2e-10)},
+        {ElementType::Hex8, 5, Real(2e-10)},
+        {ElementType::Wedge6, 5, Real(5e-10)},
+        {ElementType::Pyramid5, 5, Real(5e-7)},
+    };
+
+    for (const auto& c : cases) {
+        for (int order = 0; order <= c.max_order; ++order) {
+            LagrangeBasis basis(c.type, order);
+            expect_partition_gradient_hessian_sums(basis, dense_sample_points_for(c.type), c.tol, c.tol);
+        }
+    }
+}
+
+TEST(LagrangeBasis, SimplexAxisScratchDynamicFallbackForHighOrder) {
+    const struct Case {
+        ElementType type;
+        int order;
+        Point point;
+        Real tolerance;
+    } cases[] = {
+        {ElementType::Triangle3, 13, Point{Real(0.19), Real(0.31), Real(0)}, Real(1e-8)},
+        {ElementType::Tetra4, 13, Point{Real(0.13), Real(0.17), Real(0.19)}, Real(1e-7)},
+    };
+
+    for (const auto& c : cases) {
+        LagrangeBasis basis(c.type, c.order);
+        std::vector<Real> values;
+        std::vector<Gradient> gradients;
+        std::vector<Hessian> hessians;
+        basis.evaluate_all(c.point, values, gradients, hessians);
+
+        ASSERT_EQ(values.size(), basis.size());
+        ASSERT_EQ(gradients.size(), basis.size());
+        ASSERT_EQ(hessians.size(), basis.size());
+
+        Real value_sum = Real(0);
+        Gradient gradient_sum{};
+        Hessian hessian_sum{};
+        for (std::size_t i = 0; i < basis.size(); ++i) {
+            value_sum += values[i];
+            for (std::size_t d = 0; d < 3u; ++d) {
+                gradient_sum[d] += gradients[i][d];
+                for (std::size_t e = 0; e < 3u; ++e) {
+                    hessian_sum(d, e) += hessians[i](d, e);
+                }
+            }
+        }
+
+        EXPECT_NEAR(value_sum, Real(1), c.tolerance);
+        for (std::size_t d = 0; d < 3u; ++d) {
+            EXPECT_NEAR(gradient_sum[d], Real(0), c.tolerance);
+            for (std::size_t e = 0; e < 3u; ++e) {
+                EXPECT_NEAR(hessian_sum(d, e), Real(0), Real(10) * c.tolerance);
+            }
+        }
+    }
+}
+
+TEST(LagrangeBasis, HighOrderAxisNearNodeMaintainsPartitionAndDerivativeSums) {
+    const int order = 16;
+    const LagrangeBasis basis(ElementType::Line2, order);
+    const Real node = Real(-1) + Real(2 * 5) / static_cast<Real>(order);
+    const Point point{node + Real(1e-7), Real(0), Real(0)};
+
+    std::vector<Real> values;
+    std::vector<Gradient> gradients;
+    std::vector<Hessian> hessians;
+    basis.evaluate_all(point, values, gradients, hessians);
+    ASSERT_EQ(values.size(), basis.size());
+
+    Real value_sum = Real(0);
+    Real gradient_sum = Real(0);
+    Real hessian_sum = Real(0);
+    for (std::size_t i = 0; i < basis.size(); ++i) {
+        value_sum += values[i];
+        gradient_sum += gradients[i][0];
+        hessian_sum += hessians[i](0, 0);
+    }
+
+    EXPECT_NEAR(value_sum, Real(1), Real(1e-12));
+    EXPECT_NEAR(gradient_sum, Real(0), Real(1e-8));
+    EXPECT_NEAR(hessian_sum, Real(0), Real(1e-5));
+}
+
+TEST(LagrangeBasis, PyramidFaceTracesMatchLowerDimensionalLagrangeBases) {
+    const PyramidFace faces[] = {
+        PyramidFace::Base,
+        PyramidFace::South,
+        PyramidFace::East,
+        PyramidFace::North,
+        PyramidFace::West,
+    };
+
+    for (int order = 1; order <= 5; ++order) {
+        for (const auto face : faces) {
+            expect_pyramid_face_trace_matches_lower_basis(
+                order, face, face == PyramidFace::Base ? Real(2e-10) : Real(5e-10));
+        }
+    }
+}
+
+TEST(LagrangeBasis, PyramidEdgeTracesMatchLineLagrangeBasis) {
+    const PyramidEdge edges[] = {
+        PyramidEdge::BaseSouth,
+        PyramidEdge::BaseEast,
+        PyramidEdge::BaseNorth,
+        PyramidEdge::BaseWest,
+        PyramidEdge::VerticalSW,
+        PyramidEdge::VerticalSE,
+        PyramidEdge::VerticalNE,
+        PyramidEdge::VerticalNW,
+    };
+
+    for (int order = 1; order <= 5; ++order) {
+        for (const auto edge : edges) {
+            expect_pyramid_edge_trace_matches_line_basis(order, edge, Real(5e-10));
+        }
+    }
+}
+
+TEST(LagrangeBasis, Pyramid14RationalNodalAndPartition) {
+    using svmp::FE::basis::ReferenceNodeLayout;
+
+    LagrangeBasis basis(ElementType::Pyramid14, 2);
+    EXPECT_EQ(basis.dimension(), 3);
+    EXPECT_EQ(basis.size(), 14u);
+
+    // Kronecker nodal property at all Pyramid14 nodes
+    for (std::size_t i = 0; i < basis.size(); ++i) {
+        auto xi = ReferenceNodeLayout::get_node_coords(ElementType::Pyramid14, i);
+        std::vector<Real> vals;
+        basis.evaluate_values(xi, vals);
+        ASSERT_EQ(vals.size(), basis.size());
+        for (std::size_t j = 0; j < basis.size(); ++j) {
+            const double expected = (i == j) ? 1.0 : 0.0;
+            EXPECT_NEAR(vals[j], expected, 1e-12);
+        }
+    }
+
+    // Partition of unity at an interior point
+    svmp::FE::math::Vector<Real, 3> xi{Real(0.1), Real(-0.2), Real(0.3)};
+    std::vector<Real> vals;
+    basis.evaluate_values(xi, vals);
+    const double sum = std::accumulate(vals.begin(), vals.end(), 0.0);
+    EXPECT_NEAR(sum, 1.0, 1e-12);
+}
+
+TEST(LagrangeBasis, Pyramid14GradientSumZero) {
+    LagrangeBasis basis(ElementType::Pyramid14, 2);
+    svmp::FE::math::Vector<Real, 3> xi{Real(0.15), Real(-0.1), Real(0.3)};
+
+    std::vector<Gradient> grads;
+    basis.evaluate_gradients(xi, grads);
+    ASSERT_EQ(grads.size(), basis.size());
+
+    Gradient sum{};
+    for (const auto& g : grads) {
+        sum[0] += g[0];
+        sum[1] += g[1];
+        sum[2] += g[2];
+    }
+    EXPECT_NEAR(sum[0], 0.0, 1e-8);
+    EXPECT_NEAR(sum[1], 0.0, 1e-8);
+    EXPECT_NEAR(sum[2], 0.0, 1e-8);
+}
+
+TEST(LagrangeBasis, HigherOrderP4KroneckerAndPartition) {
+    struct Case {
+        ElementType type;
+        int order;
+        svmp::FE::math::Vector<Real, 3> xi;
+    };
+
+    const std::vector<Case> cases = {
+        {ElementType::Line2, 4, {Real(0.11), Real(0), Real(0)}},
+        {ElementType::Quad4, 4, {Real(0.2), Real(-0.3), Real(0)}},
+        {ElementType::Triangle3, 4, {Real(0.2), Real(0.3), Real(0)}},
+        {ElementType::Hex8, 4, {Real(0.2), Real(-0.3), Real(0.4)}},
+    };
+
+    for (const auto& c : cases) {
+        LagrangeBasis basis(c.type, c.order);
+
+        // Partition of unity at an interior point
+        std::vector<Real> values;
+        basis.evaluate_values(c.xi, values);
+        const double sum = std::accumulate(values.begin(), values.end(), 0.0);
+        EXPECT_NEAR(sum, 1.0, 1e-12);
+
+        // Kronecker delta property at all nodes
+        const auto& nodes = basis.nodes();
+        ASSERT_EQ(nodes.size(), basis.size());
+        for (std::size_t i = 0; i < nodes.size(); ++i) {
+            basis.evaluate_values(nodes[i], values);
+            ASSERT_EQ(values.size(), nodes.size());
+            for (std::size_t j = 0; j < nodes.size(); ++j) {
+                const double expected = (i == j) ? 1.0 : 0.0;
+                EXPECT_NEAR(values[j], expected, 1e-12);
+            }
+        }
+    }
+}
+
+TEST(LagrangeBasis, Pyramid14InterpolatesQuadraticPolynomials) {
+    using svmp::FE::basis::ReferenceNodeLayout;
+
+    LagrangeBasis basis(ElementType::Pyramid14, 2);
+    const std::size_t n = basis.size();
+
+    // Precompute nodal coordinates
+    std::vector<svmp::FE::math::Vector<Real,3>> nodes;
+    nodes.reserve(n);
+    for (std::size_t i = 0; i < n; ++i) {
+        nodes.push_back(ReferenceNodeLayout::get_node_coords(ElementType::Pyramid14, i));
+    }
+
+    auto interpolate_and_check = [&](auto f, Real tol) {
+        // Nodal coefficients
+        std::vector<Real> coeffs(n);
+        for (std::size_t i = 0; i < n; ++i) {
+            const auto& x = nodes[i];
+            coeffs[i] = f(x[0], x[1], x[2]);
+        }
+
+        // Test at a few interior points
+        const svmp::FE::math::Vector<Real,3> test_pts[] = {
+            {Real(0.1), Real(-0.2), Real(0.2)},
+            {Real(-0.2), Real(0.15), Real(0.4)},
+            {Real(0.05), Real(0.05), Real(0.3)}
+        };
+
+        for (const auto& xi : test_pts) {
+            std::vector<Real> vals;
+            basis.evaluate_values(xi, vals);
+            ASSERT_EQ(vals.size(), n);
+
+            Real u_interp = Real(0);
+            for (std::size_t i = 0; i < n; ++i) {
+                u_interp += coeffs[i] * vals[i];
+            }
+
+            const Real u_exact = f(xi[0], xi[1], xi[2]);
+            EXPECT_NEAR(u_interp, u_exact, tol);
+        }
+    };
+
+    // Constant, linear and quadratic monomials
+    interpolate_and_check([](Real, Real, Real) { return Real(1); }, Real(1e-12));
+    interpolate_and_check([](Real x, Real, Real) { return x; }, Real(1e-11));
+    interpolate_and_check([](Real, Real y, Real) { return y; }, Real(1e-11));
+    interpolate_and_check([](Real, Real, Real z) { return z; }, Real(1e-11));
+    interpolate_and_check([](Real x, Real y, Real) { return x * y; }, Real(1e-10));
+    interpolate_and_check([](Real x, Real, Real z) { return x * z; }, Real(1e-10));
+    interpolate_and_check([](Real, Real y, Real z) { return y * z; }, Real(1e-10));
+    interpolate_and_check([](Real x, Real, Real) { return x * x; }, Real(1e-10));
+    interpolate_and_check([](Real, Real y, Real) { return y * y; }, Real(1e-10));
+    interpolate_and_check([](Real, Real, Real z) { return z * z; }, Real(1e-10));
+}
+
+TEST(LagrangeBasis, Pyramid14GradientMatchesLinearFunctionGradient) {
+    using svmp::FE::basis::ReferenceNodeLayout;
+
+    LagrangeBasis basis(ElementType::Pyramid14, 2);
+    const std::size_t n = basis.size();
+
+    // Nodal coordinates and coefficients for f(x,y,z) = ax + by + cz
+    const Real a = Real(1.2);
+    const Real b = Real(-0.7);
+    const Real c = Real(0.5);
+
+    std::vector<Real> coeffs(n);
+    for (std::size_t i = 0; i < n; ++i) {
+        const auto x = ReferenceNodeLayout::get_node_coords(ElementType::Pyramid14, i);
+        coeffs[i] = a * x[0] + b * x[1] + c * x[2];
+    }
+
+    const svmp::FE::math::Vector<Real,3> xi{Real(0.1), Real(-0.15), Real(0.35)};
+
+    std::vector<Gradient> grads;
+    basis.evaluate_gradients(xi, grads);
+    ASSERT_EQ(grads.size(), n);
+
+    Gradient g_interp{};
+    for (std::size_t i = 0; i < n; ++i) {
+        g_interp[0] += coeffs[i] * grads[i][0];
+        g_interp[1] += coeffs[i] * grads[i][1];
+        g_interp[2] += coeffs[i] * grads[i][2];
+    }
+
+    EXPECT_NEAR(g_interp[0], a, 1e-6);
+    EXPECT_NEAR(g_interp[1], b, 1e-6);
+    EXPECT_NEAR(g_interp[2], c, 1e-6);
+}
+
+TEST(LagrangeBasis, PyramidApexValuesRemainExactAcrossRepresentativeOrders) {
+    const struct Case {
+        ElementType type;
+        int order;
+    } cases[] = {
+        {ElementType::Pyramid5, 1},
+        {ElementType::Pyramid14, 2},
+        {ElementType::Pyramid5, 4},
+    };
+
+    const svmp::FE::math::Vector<Real, 3> apex{Real(0), Real(0), Real(1)};
+    for (const auto& c : cases) {
+        LagrangeBasis basis(c.type, c.order);
+        std::vector<Real> values;
+        basis.evaluate_values(apex, values);
+        ASSERT_EQ(values.size(), basis.size());
+
+        const auto& nodes = basis.nodes();
+        auto apex_it = std::find_if(
+            nodes.begin(), nodes.end(),
+            [](const auto& node) {
+                return std::abs(node[0]) <= Real(1e-14) &&
+                       std::abs(node[1]) <= Real(1e-14) &&
+                       std::abs(node[2] - Real(1)) <= Real(1e-14);
+            });
+        ASSERT_NE(apex_it, nodes.end());
+        const std::size_t apex_index = static_cast<std::size_t>(
+            std::distance(nodes.begin(), apex_it));
+
+        Real sum = Real(0);
+        for (std::size_t i = 0; i < values.size(); ++i) {
+            EXPECT_TRUE(std::isfinite(static_cast<double>(values[i])));
+            sum += values[i];
+            const Real expected = (i == apex_index) ? Real(1) : Real(0);
+            EXPECT_NEAR(values[i], expected, 1e-12)
+                << "order " << c.order << ", basis " << i;
+        }
+        EXPECT_NEAR(sum, Real(1), 1e-12);
+    }
+}
+
+TEST(LagrangeBasis, PyramidGradientAtExactApexThrowsWhenLimitIsNotUnique) {
+    const struct Case {
+        ElementType type;
+        int order;
+    } cases[] = {
+        {ElementType::Pyramid5, 1},
+        {ElementType::Pyramid14, 2},
+        {ElementType::Pyramid5, 4},
+    };
+
+    const svmp::FE::math::Vector<Real, 3> apex{Real(0), Real(0), Real(1)};
+    for (const auto& c : cases) {
+        LagrangeBasis basis(c.type, c.order);
+        std::vector<Gradient> gradients;
+        EXPECT_THROW(basis.evaluate_gradients(apex, gradients), svmp::FE::basis::BasisEvaluationException)
+            << "order " << c.order;
+    }
+}
+
+TEST(LagrangeBasis, PyramidApexValuesMatchDirectionalNearApexLimits) {
+    const struct Case {
+        ElementType type;
+        int order;
+        Real tol;
+    } cases[] = {
+        {ElementType::Pyramid5, 1, Real(3e-6)},
+        {ElementType::Pyramid14, 2, Real(4e-6)},
+        {ElementType::Pyramid5, 4, Real(1e-5)},
+    };
+
+    const std::array<std::array<Real, 2>, 4> directions = {{
+        {Real(0), Real(0)},
+        {Real(0.35), Real(-0.25)},
+        {Real(-0.50), Real(0.45)},
+        {Real(0.20), Real(0.60)},
+    }};
+    const Real t = Real(1e-6);
+    const svmp::FE::math::Vector<Real, 3> apex{Real(0), Real(0), Real(1)};
+
+    for (const auto& c : cases) {
+        LagrangeBasis basis(c.type, c.order);
+        std::vector<Real> apex_values;
+        basis.evaluate_values(apex, apex_values);
+
+        for (const auto& direction : directions) {
+            const svmp::FE::math::Vector<Real, 3> xi{
+                t * direction[0],
+                t * direction[1],
+                Real(1) - t
+            };
+
+            std::vector<Real> values;
+            basis.evaluate_values(xi, values);
+            ASSERT_EQ(values.size(), apex_values.size());
+
+            for (std::size_t i = 0; i < values.size(); ++i) {
+                EXPECT_NEAR(values[i], apex_values[i], c.tol)
+                    << "order " << c.order
+                    << ", basis " << i
+                    << ", direction (" << direction[0] << ", " << direction[1] << ")";
+            }
+        }
+    }
+}
+
+TEST(LagrangeBasis, PyramidNearApexGradientShowsDirectionalSpread) {
+    const struct Case {
+        ElementType type;
+        int order;
+        Real min_spread;
+    } cases[] = {
+        {ElementType::Pyramid5, 1, Real(5e-2)},
+        {ElementType::Pyramid14, 2, Real(5e-2)},
+    };
+
+    const std::array<std::array<Real, 2>, 4> directions = {{
+        {Real(0), Real(0)},
+        {Real(0.45), Real(-0.30)},
+        {Real(-0.35), Real(0.40)},
+        {Real(0.25), Real(0.55)},
+    }};
+    const Real t = Real(1e-6);
+
+    for (const auto& c : cases) {
+        LagrangeBasis basis(c.type, c.order);
+        double max_spread = 0.0;
+
+        std::vector<std::vector<Gradient>> directional_gradients;
+        directional_gradients.reserve(directions.size());
+        for (const auto& direction : directions) {
+            const svmp::FE::math::Vector<Real, 3> xi{
+                t * direction[0],
+                t * direction[1],
+                Real(1) - t
+            };
+
+            std::vector<Gradient> gradients;
+            basis.evaluate_gradients(xi, gradients);
+            directional_gradients.push_back(std::move(gradients));
+        }
+
+        for (std::size_t i = 0; i < basis.size(); ++i) {
+            for (int d = 0; d < 3; ++d) {
+                double min_value = std::numeric_limits<double>::infinity();
+                double max_value = -std::numeric_limits<double>::infinity();
+                for (const auto& gradients : directional_gradients) {
+                    const double value = static_cast<double>(gradients[i][static_cast<std::size_t>(d)]);
+                    min_value = std::min(min_value, value);
+                    max_value = std::max(max_value, value);
+                }
+                max_spread = std::max(max_spread, max_value - min_value);
+            }
+        }
+
+        EXPECT_GT(max_spread, static_cast<double>(c.min_spread))
+            << "order " << c.order;
+    }
+}
+
+TEST(LagrangeBasis, GradientSumZeroQuadAndTet) {
+    const std::vector<std::pair<ElementType, svmp::FE::math::Vector<Real, 3>>> cases = {
+        {ElementType::Quad4, svmp::FE::math::Vector<Real, 3>{Real(0.2), Real(-0.1), Real(0)}},
+        {ElementType::Tetra4, svmp::FE::math::Vector<Real, 3>{Real(0.1), Real(0.2), Real(0.1)}}
+    };
+
+    for (const auto& c : cases) {
+        LagrangeBasis basis(c.first, 1);
+        std::vector<Gradient> grads;
+        basis.evaluate_gradients(c.second, grads);
+
+        ASSERT_EQ(grads.size(), basis.size());
+        Gradient sum{};
+        for (const auto& g : grads) {
+            sum[0] += g[0];
+            sum[1] += g[1];
+            sum[2] += g[2];
+        }
+        EXPECT_NEAR(sum[0], 0.0, 1e-12);
+        EXPECT_NEAR(sum[1], 0.0, 1e-12);
+        EXPECT_NEAR(sum[2], 0.0, 1e-12);
+    }
+}
+
+TEST(LagrangeBasis, HexPartitionAndGradientSumZeroOrderThree) {
+    LagrangeBasis basis(ElementType::Hex8, 3);
+    svmp::FE::math::Vector<Real, 3> xi{Real(0.1), Real(-0.2), Real(0.25)};
+
+    std::vector<Real> values;
+    basis.evaluate_values(xi, values);
+    const double sum = std::accumulate(values.begin(), values.end(), 0.0);
+    EXPECT_NEAR(sum, 1.0, 1e-12);
+
+    std::vector<Gradient> grads;
+    basis.evaluate_gradients(xi, grads);
+    Gradient gsum{};
+    for (const auto& g : grads) {
+        gsum[0] += g[0];
+        gsum[1] += g[1];
+        gsum[2] += g[2];
+    }
+    EXPECT_NEAR(gsum[0], 0.0, 1e-10);
+    EXPECT_NEAR(gsum[1], 0.0, 1e-10);
+    EXPECT_NEAR(gsum[2], 0.0, 1e-10);
+}
+
+TEST(LagrangeBasis, OracleLine3ValuesGradientsAndHessians) {
+    LagrangeBasis basis(ElementType::Line3, 2);
+    const Point xi{Real(0.2), Real(0), Real(0)};
+
+    std::vector<Real> values;
+    std::vector<Gradient> gradients;
+    std::vector<Hessian> hessians;
+    basis.evaluate_values(xi, values);
+    basis.evaluate_gradients(xi, gradients);
+    basis.evaluate_hessians(xi, hessians);
+
+    ASSERT_EQ(values.size(), 3u);
+    ASSERT_EQ(gradients.size(), 3u);
+    ASSERT_EQ(hessians.size(), 3u);
+
+    const Real expected_values[] = {Real(-2) / Real(25), Real(3) / Real(25), Real(24) / Real(25)};
+    const Real expected_gradients[] = {Real(-3) / Real(10), Real(7) / Real(10), Real(-2) / Real(5)};
+    const Real expected_hessians[] = {Real(1), Real(1), Real(-2)};
+
+    for (std::size_t i = 0; i < 3; ++i) {
+        EXPECT_NEAR(values[i], expected_values[i], 1e-14);
+        EXPECT_NEAR(gradients[i][0], expected_gradients[i], 1e-14);
+        EXPECT_NEAR(hessians[i](0, 0), expected_hessians[i], 1e-14);
+    }
+}
+
+TEST(LagrangeBasis, OracleTriangle3ValuesGradientsAndHessians) {
+    LagrangeBasis basis(ElementType::Triangle3, 1);
+    const Point xi{Real(0.2), Real(0.3), Real(0)};
+
+    std::vector<Real> values;
+    std::vector<Gradient> gradients;
+    std::vector<Hessian> hessians;
+    basis.evaluate_values(xi, values);
+    basis.evaluate_gradients(xi, gradients);
+    basis.evaluate_hessians(xi, hessians);
+
+    ASSERT_EQ(values.size(), 3u);
+    const Point expected_gradients[] = {
+        Point{Real(-1), Real(-1), Real(0)},
+        Point{Real(1), Real(0), Real(0)},
+        Point{Real(0), Real(1), Real(0)}
+    };
+    const Real expected_values[] = {Real(0.5), Real(0.2), Real(0.3)};
+
+    for (std::size_t i = 0; i < 3; ++i) {
+        EXPECT_NEAR(values[i], expected_values[i], 1e-14);
+        EXPECT_NEAR(gradients[i][0], expected_gradients[i][0], 1e-14);
+        EXPECT_NEAR(gradients[i][1], expected_gradients[i][1], 1e-14);
+        for (int a = 0; a < 2; ++a) {
+            for (int b = 0; b < 2; ++b) {
+                EXPECT_NEAR(hessians[i](static_cast<std::size_t>(a), static_cast<std::size_t>(b)),
+                            Real(0), 1e-14);
+            }
+        }
+    }
+}
+
+TEST(LagrangeBasis, OracleQuad4ValuesGradientsAndHessians) {
+    LagrangeBasis basis(ElementType::Quad4, 1);
+    const Point xi{Real(0.2), Real(-0.4), Real(0)};
+
+    std::vector<Real> values;
+    std::vector<Gradient> gradients;
+    std::vector<Hessian> hessians;
+    basis.evaluate_values(xi, values);
+    basis.evaluate_gradients(xi, gradients);
+    basis.evaluate_hessians(xi, hessians);
+
+    ASSERT_EQ(values.size(), 4u);
+    const Real expected_values[] = {Real(7) / Real(25), Real(21) / Real(50),
+                                    Real(9) / Real(50), Real(3) / Real(25)};
+    const Point expected_gradients[] = {
+        Point{Real(-7) / Real(20), Real(-1) / Real(5), Real(0)},
+        Point{Real(7) / Real(20), Real(-3) / Real(10), Real(0)},
+        Point{Real(3) / Real(20), Real(3) / Real(10), Real(0)},
+        Point{Real(-3) / Real(20), Real(1) / Real(5), Real(0)}
+    };
+    const Real expected_hxy[] = {Real(1) / Real(4), Real(-1) / Real(4),
+                                 Real(1) / Real(4), Real(-1) / Real(4)};
+
+    for (std::size_t i = 0; i < 4; ++i) {
+        EXPECT_NEAR(values[i], expected_values[i], 1e-14);
+        EXPECT_NEAR(gradients[i][0], expected_gradients[i][0], 1e-14);
+        EXPECT_NEAR(gradients[i][1], expected_gradients[i][1], 1e-14);
+        EXPECT_NEAR(hessians[i](0, 0), Real(0), 1e-14);
+        EXPECT_NEAR(hessians[i](1, 1), Real(0), 1e-14);
+        EXPECT_NEAR(hessians[i](0, 1), expected_hxy[i], 1e-14);
+        EXPECT_NEAR(hessians[i](1, 0), expected_hxy[i], 1e-14);
+    }
+}
+
+TEST(LagrangeBasis, OracleWedge6ValuesGradientsAndHessians) {
+    LagrangeBasis basis(ElementType::Wedge6, 1);
+    const Point xi{Real(0.2), Real(0.25), Real(-0.3)};
+
+    std::vector<Real> values;
+    std::vector<Gradient> gradients;
+    std::vector<Hessian> hessians;
+    basis.evaluate_values(xi, values);
+    basis.evaluate_gradients(xi, gradients);
+    basis.evaluate_hessians(xi, hessians);
+
+    ASSERT_EQ(values.size(), 6u);
+    const Real expected_values[] = {
+        Real(143) / Real(400), Real(13) / Real(100), Real(13) / Real(80),
+        Real(77) / Real(400), Real(7) / Real(100), Real(7) / Real(80)
+    };
+    const Point expected_gradients[] = {
+        Point{Real(-13) / Real(20), Real(-13) / Real(20), Real(-11) / Real(40)},
+        Point{Real(13) / Real(20), Real(0), Real(-1) / Real(10)},
+        Point{Real(0), Real(13) / Real(20), Real(-1) / Real(8)},
+        Point{Real(-7) / Real(20), Real(-7) / Real(20), Real(11) / Real(40)},
+        Point{Real(7) / Real(20), Real(0), Real(1) / Real(10)},
+        Point{Real(0), Real(7) / Real(20), Real(1) / Real(8)}
+    };
+    const Point expected_hxz[] = {
+        Point{Real(1) / Real(2), Real(1) / Real(2), Real(0)},
+        Point{Real(-1) / Real(2), Real(0), Real(0)},
+        Point{Real(0), Real(-1) / Real(2), Real(0)},
+        Point{Real(-1) / Real(2), Real(-1) / Real(2), Real(0)},
+        Point{Real(1) / Real(2), Real(0), Real(0)},
+        Point{Real(0), Real(1) / Real(2), Real(0)}
+    };
+
+    for (std::size_t i = 0; i < 6; ++i) {
+        EXPECT_NEAR(values[i], expected_values[i], 1e-14);
+        EXPECT_NEAR(gradients[i][0], expected_gradients[i][0], 1e-14);
+        EXPECT_NEAR(gradients[i][1], expected_gradients[i][1], 1e-14);
+        EXPECT_NEAR(gradients[i][2], expected_gradients[i][2], 1e-14);
+        EXPECT_NEAR(hessians[i](0, 0), Real(0), 1e-14);
+        EXPECT_NEAR(hessians[i](1, 1), Real(0), 1e-14);
+        EXPECT_NEAR(hessians[i](2, 2), Real(0), 1e-14);
+        EXPECT_NEAR(hessians[i](0, 1), Real(0), 1e-14);
+        EXPECT_NEAR(hessians[i](1, 0), Real(0), 1e-14);
+        EXPECT_NEAR(hessians[i](0, 2), expected_hxz[i][0], 1e-14);
+        EXPECT_NEAR(hessians[i](2, 0), expected_hxz[i][0], 1e-14);
+        EXPECT_NEAR(hessians[i](1, 2), expected_hxz[i][1], 1e-14);
+        EXPECT_NEAR(hessians[i](2, 1), expected_hxz[i][1], 1e-14);
+    }
+}
+
+TEST(LagrangeBasis, DeterministicBoundarySweepMaintainsPartitionAndFiniteDerivatives) {
+    const std::vector<std::pair<ElementType, int>> cases = {
+        {ElementType::Line2, 1},
+        {ElementType::Line3, 2},
+        {ElementType::Triangle3, 1},
+        {ElementType::Triangle6, 2},
+        {ElementType::Quad4, 1},
+        {ElementType::Quad9, 2},
+        {ElementType::Tetra4, 1},
+        {ElementType::Tetra10, 2},
+        {ElementType::Hex8, 1},
+        {ElementType::Hex27, 2},
+        {ElementType::Wedge6, 1},
+        {ElementType::Wedge18, 2},
+        {ElementType::Pyramid5, 1},
+        {ElementType::Pyramid14, 2},
+    };
+
+    for (const auto& [type, order] : cases) {
+        LagrangeBasis basis(type, order);
+        for (const auto& xi : boundary_stress_points_for(type)) {
+            std::vector<Real> values;
+            std::vector<Gradient> gradients;
+            std::vector<Hessian> hessians;
+            basis.evaluate_values(xi, values);
+            basis.evaluate_gradients(xi, gradients);
+            basis.evaluate_hessians(xi, hessians);
+
+            ASSERT_EQ(values.size(), basis.size());
+            ASSERT_EQ(gradients.size(), basis.size());
+            ASSERT_EQ(hessians.size(), basis.size());
+
+            Real sum = Real(0);
+            for (Real value : values) {
+                EXPECT_TRUE(std::isfinite(value));
+                sum += value;
+            }
+            expect_all_finite(gradients);
+            expect_hessians_finite(hessians, basis.dimension());
+            EXPECT_NEAR(sum, Real(1), type == ElementType::Pyramid5 || type == ElementType::Pyramid14
+                                       ? Real(1e-8)
+                                       : Real(1e-12))
+                << "type=" << static_cast<int>(type)
+                << ", order=" << order
+                << ", xi=(" << xi[0] << "," << xi[1] << "," << xi[2] << ")";
+        }
+    }
+}
+
+TEST(LagrangeBasis, FiniteDifferenceGradientsAcrossSupportedLinearShapes) {
+    const std::vector<LagrangeAccuracyCase> cases = {
+        {ElementType::Line2, 1, sample_points_for(ElementType::Line2)},
+        {ElementType::Triangle3, 1, sample_points_for(ElementType::Triangle3)},
+        {ElementType::Quad4, 1, sample_points_for(ElementType::Quad4)},
+        {ElementType::Tetra4, 1, sample_points_for(ElementType::Tetra4)},
+        {ElementType::Hex8, 1, sample_points_for(ElementType::Hex8)},
+        {ElementType::Wedge6, 1, sample_points_for(ElementType::Wedge6)},
+        {ElementType::Pyramid5, 1, sample_points_for(ElementType::Pyramid5)},
+    };
+
+    for (const auto& c : cases) {
+        expect_gradients_match_finite_difference(c, Real(1e-6), Real(1e-6));
+    }
+}
+
+TEST(LagrangeBasis, FiniteDifferenceGradientsAcrossSupportedQuadraticShapes) {
+    const std::vector<LagrangeAccuracyCase> cases = {
+        {ElementType::Line3, 2, sample_points_for(ElementType::Line3)},
+        {ElementType::Triangle6, 2, sample_points_for(ElementType::Triangle6)},
+        {ElementType::Quad9, 2, sample_points_for(ElementType::Quad9)},
+        {ElementType::Tetra10, 2, sample_points_for(ElementType::Tetra10)},
+        {ElementType::Hex27, 2, sample_points_for(ElementType::Hex27)},
+        {ElementType::Wedge18, 2, sample_points_for(ElementType::Wedge18)},
+        {ElementType::Pyramid14, 2, sample_points_for(ElementType::Pyramid14)},
+    };
+
+    for (const auto& c : cases) {
+        expect_gradients_match_finite_difference(c, Real(1e-6), Real(2e-6));
+    }
+}
+
+TEST(LagrangeBasis, LinearPolynomialReproductionAcrossSupportedLinearShapes) {
+    const std::vector<LagrangeAccuracyCase> cases = {
+        {ElementType::Line2, 1, sample_points_for(ElementType::Line2)},
+        {ElementType::Triangle3, 1, sample_points_for(ElementType::Triangle3)},
+        {ElementType::Quad4, 1, sample_points_for(ElementType::Quad4)},
+        {ElementType::Tetra4, 1, sample_points_for(ElementType::Tetra4)},
+        {ElementType::Hex8, 1, sample_points_for(ElementType::Hex8)},
+        {ElementType::Wedge6, 1, sample_points_for(ElementType::Wedge6)},
+        {ElementType::Pyramid5, 1, sample_points_for(ElementType::Pyramid5)},
+    };
+
+    const std::vector<std::array<int, 3>> exponents = {
+        {0, 0, 0},
+        {1, 0, 0},
+        {0, 1, 0},
+        {0, 0, 1},
+    };
+
+    for (const auto& c : cases) {
+        const std::vector<std::array<int, 3>> relevant(
+            exponents.begin(),
+            exponents.begin() + static_cast<std::ptrdiff_t>(c.type == ElementType::Line2 ? 2 :
+                                                            (c.type == ElementType::Triangle3 ||
+                                                             c.type == ElementType::Quad4) ? 3 : 4));
+        expect_polynomial_reproduction(c, relevant, Real(1e-12));
+    }
+}
+
+TEST(LagrangeBasis, QuadraticPolynomialReproductionAcrossSupportedQuadraticShapes) {
+    const std::vector<LagrangeAccuracyCase> cases = {
+        {ElementType::Line3, 2, sample_points_for(ElementType::Line3)},
+        {ElementType::Triangle6, 2, sample_points_for(ElementType::Triangle6)},
+        {ElementType::Quad9, 2, sample_points_for(ElementType::Quad9)},
+        {ElementType::Tetra10, 2, sample_points_for(ElementType::Tetra10)},
+        {ElementType::Hex27, 2, sample_points_for(ElementType::Hex27)},
+        {ElementType::Wedge18, 2, sample_points_for(ElementType::Wedge18)},
+        {ElementType::Pyramid14, 2, sample_points_for(ElementType::Pyramid14)},
+    };
+
+    const std::vector<std::array<int, 3>> line_exponents = {
+        {0, 0, 0}, {1, 0, 0}, {2, 0, 0}
+    };
+    const std::vector<std::array<int, 3>> surface_exponents = {
+        {0, 0, 0}, {1, 0, 0}, {0, 1, 0},
+        {2, 0, 0}, {1, 1, 0}, {0, 2, 0}
+    };
+    const std::vector<std::array<int, 3>> volume_exponents = {
+        {0, 0, 0}, {1, 0, 0}, {0, 1, 0}, {0, 0, 1},
+        {2, 0, 0}, {1, 1, 0}, {0, 2, 0},
+        {1, 0, 1}, {0, 1, 1}, {0, 0, 2}
+    };
+
+    for (const auto& c : cases) {
+        if (c.type == ElementType::Line3) {
+            expect_polynomial_reproduction(c, line_exponents, Real(1e-12));
+        } else if (c.type == ElementType::Triangle6 || c.type == ElementType::Quad9) {
+            expect_polynomial_reproduction(c, surface_exponents, Real(1e-11));
+        } else {
+            expect_polynomial_reproduction(c, volume_exponents, Real(2e-10));
+        }
+    }
+}
+
+TEST(LagrangeBasis, HighOrderTensorLagrangeMaintainsPartitionAndDerivativeSums) {
+    const std::vector<LagrangeAccuracyCase> cases = {
+        {ElementType::Line2, 8, {Point{-0.875, 0, 0}, Point{0.125, 0, 0}, Point{1, 0, 0}}},
+        {ElementType::Quad4, 7, {Point{0.2, -0.35, 0}, Point{-1, 0.5, 0}, Point{0.5, 1, 0}}},
+        {ElementType::Hex8, 6, {Point{0.1, -0.2, 0.3}, Point{-1, 0.5, 1}, Point{0.75, -1, -0.5}}},
+    };
+
+    for (const auto& c : cases) {
+        LagrangeBasis basis(c.type, c.order);
+        expect_partition_gradient_hessian_sums(basis, c.points, Real(2e-12), Real(2e-8));
+    }
+}
+
+TEST(LagrangeBasis, HighOrderTensorLagrangeReproducesTensorPolynomials) {
+    const LagrangeAccuracyCase line{ElementType::Line2,
+                                    8,
+                                    {Point{-0.73, 0, 0}, Point{-0.1, 0, 0}, Point{0.64, 0, 0}}};
+    expect_polynomial_reproduction(line,
+                                   {{0, 0, 0}, {1, 0, 0}, {4, 0, 0}, {8, 0, 0}},
+                                   Real(1e-11));
+
+    const LagrangeAccuracyCase quad{ElementType::Quad4,
+                                    7,
+                                    {Point{-0.6, -0.2, 0}, Point{0.15, 0.45, 0}, Point{0.8, -0.55, 0}}};
+    expect_polynomial_reproduction(quad,
+                                   {{0, 0, 0}, {7, 0, 0}, {0, 7, 0}, {4, 3, 0}},
+                                   Real(5e-10));
+
+    const LagrangeAccuracyCase hex{ElementType::Hex8,
+                                   6,
+                                   {Point{-0.4, 0.2, -0.3}, Point{0.35, -0.55, 0.25}, Point{0.75, 0.4, -0.65}}};
+    expect_polynomial_reproduction(hex,
+                                   {{0, 0, 0}, {6, 0, 0}, {0, 6, 0}, {0, 0, 6}, {3, 2, 4}},
+                                   Real(2e-9));
+}
diff --git a/tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp b/tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp
new file mode 100644
index 000000000..9f2bf8be5
--- /dev/null
+++ b/tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp
@@ -0,0 +1,116 @@
+/**
+ * @file test_SerendipityTensorModal.cpp
+ * @brief Tests for the migrated Serendipity basis subset.
+ */
+
+#include <gtest/gtest.h>
+
+#include "FE/Basis/NodeOrderingConventions.h"
+#include "FE/Basis/SerendipityBasis.h"
+
+#include <vector>
+
+using namespace svmp::FE;
+using namespace svmp::FE::basis;
+
+namespace {
+
+void expect_partition_of_unity(const SerendipityBasis& basis,
+                               const math::Vector<Real, 3>& xi,
+                               Real tolerance = Real(1e-10))
+{
+    std::vector<Real> values;
+    std::vector<Gradient> gradients;
+    basis.evaluate_values(xi, values);
+    basis.evaluate_gradients(xi, gradients);
+
+    Real value_sum = Real(0);
+    Gradient gradient_sum{};
+    for (std::size_t i = 0; i < values.size(); ++i) {
+        value_sum += values[i];
+        for (std::size_t component = 0; component < 3u; ++component) {
+            gradient_sum[component] += gradients[i][component];
+        }
+    }
+
+    EXPECT_NEAR(value_sum, Real(1), tolerance);
+    for (int component = 0; component < basis.dimension(); ++component) {
+        EXPECT_NEAR(gradient_sum[static_cast<std::size_t>(component)],
+                    Real(0),
+                    tolerance);
+    }
+}
+
+void expect_nodal_delta(const SerendipityBasis& basis,
+                        const std::vector<math::Vector<Real, 3>>& nodes,
+                        Real tolerance)
+{
+    ASSERT_EQ(nodes.size(), basis.size());
+    for (std::size_t node = 0; node < nodes.size(); ++node) {
+        std::vector<Real> values;
+        basis.evaluate_values(nodes[node], values);
+        ASSERT_EQ(values.size(), basis.size());
+        for (std::size_t dof = 0; dof < values.size(); ++dof) {
+            EXPECT_NEAR(values[dof], dof == node ? Real(1) : Real(0), tolerance)
+                << "node=" << node << " dof=" << dof;
+        }
+    }
+}
+
+std::vector<math::Vector<Real, 3>> reference_nodes(ElementType type,
+                                                   std::size_t count)
+{
+    std::vector<math::Vector<Real, 3>> nodes;
+    nodes.reserve(count);
+    for (std::size_t i = 0; i < count; ++i) {
+        nodes.push_back(ReferenceNodeLayout::get_node_coords(type, i));
+    }
+    return nodes;
+}
+
+} // namespace
+
+TEST(SerendipityBasis, Quad8IsNodalAndPartitionsUnity) {
+    SerendipityBasis basis(ElementType::Quad8, 2);
+
+    EXPECT_EQ(basis.size(), 8u);
+    expect_nodal_delta(basis, basis.nodes(), Real(1e-10));
+    expect_partition_of_unity(basis, {Real(0.17), Real(-0.31), Real(0)});
+}
+
+TEST(SerendipityBasis, Hex20IsNodalAndPartitionsUnity) {
+    SerendipityBasis basis(ElementType::Hex20, 2);
+
+    EXPECT_EQ(basis.size(), 20u);
+    expect_nodal_delta(basis,
+                       reference_nodes(ElementType::Hex20, basis.size()),
+                       Real(1e-10));
+    expect_partition_of_unity(basis, {Real(0.2), Real(-0.1), Real(0.3)});
+}
+
+TEST(SerendipityBasis, Wedge15IsNodalAndPartitionsUnity) {
+    SerendipityBasis basis(ElementType::Wedge15, 2);
+
+    EXPECT_EQ(basis.size(), 15u);
+    expect_nodal_delta(basis,
+                       reference_nodes(ElementType::Wedge15, basis.size()),
+                       Real(1e-9));
+    expect_partition_of_unity(basis, {Real(0.2), Real(0.3), Real(0.1)});
+}
+
+TEST(SerendipityBasis, Pyramid13IsNodalAndPartitionsUnity) {
+    SerendipityBasis basis(ElementType::Pyramid13, 2);
+
+    EXPECT_EQ(basis.size(), 13u);
+    expect_nodal_delta(basis,
+                       reference_nodes(ElementType::Pyramid13, basis.size()),
+                       Real(1e-8));
+    expect_partition_of_unity(basis, {Real(0.1), Real(-0.2), Real(0.4)});
+}
+
+TEST(SerendipityBasis, RejectsUnsupportedSerendipityAliases) {
+    EXPECT_THROW(SerendipityBasis(ElementType::Quad9, 2), FEException);
+    EXPECT_THROW(SerendipityBasis(ElementType::Pyramid14, 2), FEException);
+    EXPECT_THROW(SerendipityBasis(ElementType::Quad8, 3), FEException);
+}
+
diff --git a/tests/unitTests/FE/Math/test_DenseLinearAlgebra.cpp b/tests/unitTests/FE/Math/test_DenseLinearAlgebra.cpp
new file mode 100644
index 000000000..2b44ad2bf
--- /dev/null
+++ b/tests/unitTests/FE/Math/test_DenseLinearAlgebra.cpp
@@ -0,0 +1,265 @@
+/**
+ * @file test_DenseLinearAlgebra.cpp
+ * @brief Tests for shared dense linear algebra utilities.
+ */
+
+#include <gtest/gtest.h>
+
+#include "FE/Common/FEException.h"
+#include "FE/Math/DenseLinearAlgebra.h"
+
+#include <cmath>
+#include <span>
+#include <vector>
+
+using namespace svmp::FE;
+using namespace svmp::FE::math;
+
+namespace {
+
+Real multiply_entry(const std::vector<Real>& A,
+                    const std::vector<Real>& B,
+                    std::size_t n,
+                    std::size_t row,
+                    std::size_t col) {
+    Real sum = Real(0);
+    for (std::size_t k = 0; k < n; ++k) {
+        sum += A[row * n + k] * B[k * n + col];
+    }
+    return sum;
+}
+
+} // namespace
+
+TEST(DenseLinearAlgebra, InvertsScaledMatrix) {
+    const std::vector<Real> A{
+        Real(1.0e9), Real(2.0e6),
+        Real(3.0e3), Real(4.0)
+    };
+
+    const auto inv = invert_dense_matrix(A, 2u, "scaled 2x2");
+    for (std::size_t row = 0; row < 2u; ++row) {
+        for (std::size_t col = 0; col < 2u; ++col) {
+            const Real expected = (row == col) ? Real(1) : Real(0);
+            EXPECT_NEAR(multiply_entry(A, inv, 2u, row, col), expected, Real(1.0e-10));
+        }
+    }
+}
+
+TEST(DenseLinearAlgebra, FactorizationSolvesMultipleRightHandSides) {
+    const std::vector<Real> A{
+        Real(4), Real(2), Real(0),
+        Real(2), Real(5), Real(1),
+        Real(0), Real(1), Real(3)
+    };
+
+    const auto solver = factor_dense_matrix(A, 3u, "symmetric 3x3");
+    EXPECT_EQ(solver.diagnostics.rank, 3u);
+
+    const std::vector<Real> rhs{Real(2), Real(4), Real(6)};
+    const auto x = solver.solve(std::span<const Real>(rhs.data(), rhs.size()));
+    ASSERT_EQ(x.size(), 3u);
+
+    for (std::size_t row = 0; row < 3u; ++row) {
+        Real ax = Real(0);
+        for (std::size_t col = 0; col < 3u; ++col) {
+            ax += A[row * 3u + col] * x[col];
+        }
+        EXPECT_NEAR(ax, rhs[row], Real(1.0e-12));
+    }
+
+    std::vector<Real> second_rhs{Real(1), Real(-2), Real(0.5)};
+    const auto original_second_rhs = second_rhs;
+    solver.solve_in_place(std::span<Real>(second_rhs.data(), second_rhs.size()));
+    for (std::size_t row = 0; row < 3u; ++row) {
+        Real ax = Real(0);
+        for (std::size_t col = 0; col < 3u; ++col) {
+            ax += A[row * 3u + col] * second_rhs[col];
+        }
+        EXPECT_NEAR(ax, original_second_rhs[row], Real(1.0e-12));
+    }
+}
+
+TEST(DenseLinearAlgebra, FactorizationSolvesDenseRightHandSideBlock) {
+    const std::vector<Real> A{
+        Real(4), Real(2), Real(0),
+        Real(2), Real(5), Real(1),
+        Real(0), Real(1), Real(3)
+    };
+
+    const auto solver = factor_dense_matrix(A, 3u, "symmetric 3x3 block");
+
+    std::vector<Real> rhs{
+        Real(2), Real(1),
+        Real(4), Real(-2),
+        Real(6), Real(0.5)
+    };
+    const auto original_rhs = rhs;
+    solver.solve_in_place(std::span<Real>(rhs.data(), rhs.size()), 2u);
+
+    for (std::size_t rhs_col = 0; rhs_col < 2u; ++rhs_col) {
+        for (std::size_t row = 0; row < 3u; ++row) {
+            Real ax = Real(0);
+            for (std::size_t col = 0; col < 3u; ++col) {
+                ax += A[row * 3u + col] * rhs[col * 2u + rhs_col];
+            }
+            EXPECT_NEAR(ax, original_rhs[row * 2u + rhs_col], Real(1.0e-12));
+        }
+    }
+}
+
+TEST(DenseLinearAlgebra, HighConditionInverseUsesSvdFallback) {
+    const std::vector<Real> high_condition{
+        Real(1), Real(0),
+        Real(0), Real(1.0e-13)
+    };
+
+    const auto result =
+        invert_dense_matrix_with_diagnostics(high_condition, 2u, "high-condition diagonal");
+    EXPECT_EQ(result.diagnostics.rank, 2u);
+#if defined(FE_HAS_EIGEN) && FE_HAS_EIGEN
+    EXPECT_GT(result.diagnostics.condition_estimate,
+              dense_matrix_condition_fallback_threshold());
+    EXPECT_TRUE(result.used_svd_fallback);
+#else
+    EXPECT_FALSE(result.used_svd_fallback);
+#endif
+
+    for (std::size_t row = 0; row < 2u; ++row) {
+        for (std::size_t col = 0; col < 2u; ++col) {
+            const Real expected = (row == col) ? Real(1) : Real(0);
+            EXPECT_NEAR(multiply_entry(high_condition, result.inverse, 2u, row, col),
+                        expected,
+                        Real(1.0e-12));
+        }
+    }
+}
+
+TEST(DenseLinearAlgebra, DiagnosticValidationRejectsUnsupportedCondition) {
+#if !(defined(FE_HAS_EIGEN) && FE_HAS_EIGEN)
+    GTEST_SKIP() << "condition rejection requires FE_ENABLE_EIGEN diagnostics";
+#endif
+    DenseInverseResult result;
+    result.diagnostics.rank = 2u;
+    result.diagnostics.condition_estimate =
+        dense_matrix_condition_error_threshold() * Real(10);
+
+    EXPECT_GT(result.diagnostics.condition_estimate,
+              dense_matrix_condition_error_threshold());
+    EXPECT_THROW(validate_dense_inverse_diagnostics(
+                     result, 2u, "excessive-condition diagonal"),
+                 FEException);
+}
+
+TEST(DenseLinearAlgebra, ThrowsForScaleAwareSingularPivot) {
+    const std::vector<Real> singular{
+        Real(1.0e12), Real(2.0e12),
+        Real(0.5e12), Real(1.0e12)
+    };
+
+    EXPECT_THROW((void)invert_dense_matrix(singular, 2u, "singular 2x2"),
+                 FEException);
+}
+
+TEST(DenseLinearAlgebra, FactorizationThrowsForRankDeficientMatrix) {
+    const std::vector<Real> singular{
+        Real(1), Real(2),
+        Real(2), Real(4)
+    };
+
+    EXPECT_THROW((void)factor_dense_matrix(singular, 2u, "rank-one 2x2"),
+                 FEException);
+}
+
+TEST(DenseLinearAlgebra, RankUsesScaleAwareTolerance) {
+    const std::vector<Real> rank_one{
+        Real(1.0e8), Real(2.0e8),
+        Real(3.0e8), Real(6.0e8)
+    };
+    EXPECT_EQ(dense_matrix_rank(rank_one, 2u, 2u), 1u);
+
+    const std::vector<Real> full_rank{
+        Real(1.0e8), Real(2.0e8),
+        Real(3.0e8), Real(6.1e8)
+    };
+    EXPECT_EQ(dense_matrix_rank(full_rank, 2u, 2u), 2u);
+}
+
+TEST(DenseLinearAlgebra, DiagnosticsReportRankAndConditionEstimate) {
+    const std::vector<Real> diagonal{
+        Real(4), Real(0),
+        Real(0), Real(0.5)
+    };
+    const auto full =
+        dense_matrix_diagnostics(diagonal, 2u, 2u, "diagonal 2x2");
+    EXPECT_EQ(full.rank, 2u);
+#if defined(FE_HAS_EIGEN) && FE_HAS_EIGEN
+    EXPECT_NEAR(full.largest_singular_value, Real(4), Real(1.0e-14));
+    EXPECT_NEAR(full.smallest_retained_singular_value, Real(0.5), Real(1.0e-14));
+    EXPECT_NEAR(full.condition_estimate, Real(8), Real(1.0e-14));
+#else
+    EXPECT_TRUE(std::isinf(full.condition_estimate));
+#endif
+
+    const std::vector<Real> rank_one{
+        Real(1), Real(2),
+        Real(2), Real(4)
+    };
+    const auto deficient =
+        dense_matrix_diagnostics(rank_one, 2u, 2u, "rank-one 2x2");
+    EXPECT_EQ(deficient.rank, 1u);
+    EXPECT_TRUE(std::isinf(deficient.condition_estimate));
+}
+
+TEST(DenseLinearAlgebra, PseudoInverseHandlesSingularMatrixWithoutNormalEquations) {
+#if !(defined(FE_HAS_EIGEN) && FE_HAS_EIGEN)
+    GTEST_SKIP() << "rank-revealing pseudo-inverse requires FE_ENABLE_EIGEN";
+#endif
+    const std::vector<Real> rank_one{
+        Real(1), Real(2),
+        Real(2), Real(4)
+    };
+
+    const auto pinv =
+        rank_revealing_pseudo_inverse(rank_one, 2u, 2u, "rank-one 2x2");
+    EXPECT_EQ(pinv.rank, 1u);
+    EXPECT_NEAR(pinv.inverse[0], Real(0.04), Real(1.0e-13));
+    EXPECT_NEAR(pinv.inverse[1], Real(0.08), Real(1.0e-13));
+    EXPECT_NEAR(pinv.inverse[2], Real(0.08), Real(1.0e-13));
+    EXPECT_NEAR(pinv.inverse[3], Real(0.16), Real(1.0e-13));
+
+    std::vector<Real> projection(4u, Real(0));
+    for (std::size_t row = 0; row < 2u; ++row) {
+        for (std::size_t col = 0; col < 2u; ++col) {
+            for (std::size_t a = 0; a < 2u; ++a) {
+                for (std::size_t b = 0; b < 2u; ++b) {
+                    projection[row * 2u + col] +=
+                        rank_one[row * 2u + a] * pinv.inverse[a * 2u + b] *
+                        rank_one[b * 2u + col];
+                }
+            }
+            EXPECT_NEAR(projection[row * 2u + col],
+                        rank_one[row * 2u + col],
+                        Real(1.0e-12));
+        }
+    }
+}
+
+TEST(DenseLinearAlgebra, PseudoInverseDropsNearZeroSingularValues) {
+#if !(defined(FE_HAS_EIGEN) && FE_HAS_EIGEN)
+    GTEST_SKIP() << "rank-revealing pseudo-inverse requires FE_ENABLE_EIGEN";
+#endif
+    const std::vector<Real> near_singular{
+        Real(1), Real(0),
+        Real(0), Real(1.0e-18)
+    };
+
+    const auto pinv =
+        rank_revealing_pseudo_inverse(near_singular, 2u, 2u, "near-singular 2x2");
+    EXPECT_EQ(pinv.rank, 1u);
+    EXPECT_GT(pinv.tolerance, Real(1.0e-18));
+    EXPECT_NEAR(pinv.inverse[0], Real(1), Real(1.0e-14));
+    EXPECT_NEAR(pinv.inverse[1], Real(0), Real(1.0e-14));
+    EXPECT_NEAR(pinv.inverse[2], Real(0), Real(1.0e-14));
+    EXPECT_NEAR(pinv.inverse[3], Real(0), Real(1.0e-14));
+}
diff --git a/tests/unitTests/FE/Math/test_ExpressionOps.cpp b/tests/unitTests/FE/Math/test_ExpressionOps.cpp
new file mode 100644
index 000000000..307b308a1
--- /dev/null
+++ b/tests/unitTests/FE/Math/test_ExpressionOps.cpp
@@ -0,0 +1,509 @@
+/**
+ * @file test_ExpressionOps.cpp
+ * @brief Unit tests for ExpressionOps.h - expression template operators
+ */
+
+#include <gtest/gtest.h>
+#include "FE/Math/ExpressionOps.h"
+#include "FE/Math/Vector.h"
+#include "FE/Math/Matrix.h"
+#include "FE/Math/MathConstants.h"
+#include <limits>
+#include <cmath>
+#include <complex>
+#include <type_traits>
+
+using namespace svmp::FE::math;
+using namespace svmp::FE::math::detail::ops;
+
+// Test fixture for ExpressionOps tests
+class ExpressionOpsTest : public ::testing::Test {
+protected:
+    static constexpr double tolerance = 1e-14;
+
+    void SetUp() override {}
+    void TearDown() override {}
+
+    template<typename T>
+    bool approx_equal(T a, T b, T tol = tolerance) {
+        return std::abs(a - b) <= tol;
+    }
+};
+
+// =============================================================================
+// Binary Operation Tests
+// =============================================================================
+
+TEST_F(ExpressionOpsTest, AddOperator) {
+    Add op;
+
+    // Integer addition
+    EXPECT_EQ(op(5, 3), 8);
+    EXPECT_EQ(op(-5, 3), -2);
+    EXPECT_EQ(op(-5, -3), -8);
+
+    // Floating point addition
+    EXPECT_DOUBLE_EQ(op(3.14, 2.86), 6.0);
+    EXPECT_DOUBLE_EQ(op(-1.5, 2.5), 1.0);
+
+    // Mixed types
+    auto result = op(3, 2.5);
+    EXPECT_TRUE((std::is_same_v<decltype(result), double>));
+    EXPECT_DOUBLE_EQ(result, 5.5);
+}
+
+TEST_F(ExpressionOpsTest, SubOperator) {
+    Sub op;
+
+    // Integer subtraction
+    EXPECT_EQ(op(5, 3), 2);
+    EXPECT_EQ(op(3, 5), -2);
+    EXPECT_EQ(op(-5, -3), -2);
+
+    // Floating point subtraction
+    EXPECT_DOUBLE_EQ(op(5.5, 2.5), 3.0);
+    EXPECT_DOUBLE_EQ(op(2.5, 5.5), -3.0);
+
+    // Mixed types
+    auto result = op(5.5, 2);
+    EXPECT_TRUE((std::is_same_v<decltype(result), double>));
+    EXPECT_DOUBLE_EQ(result, 3.5);
+}
+
+TEST_F(ExpressionOpsTest, MulOperator) {
+    Mul op;
+
+    // Integer multiplication
+    EXPECT_EQ(op(5, 3), 15);
+    EXPECT_EQ(op(-5, 3), -15);
+    EXPECT_EQ(op(-5, -3), 15);
+
+    // Floating point multiplication
+    EXPECT_DOUBLE_EQ(op(2.5, 4.0), 10.0);
+    EXPECT_DOUBLE_EQ(op(-2.5, 4.0), -10.0);
+
+    // Zero multiplication
+    EXPECT_EQ(op(0, 100), 0);
+    EXPECT_DOUBLE_EQ(op(0.0, 3.14), 0.0);
+
+    // Mixed types
+    auto result = op(3, 2.5);
+    EXPECT_TRUE((std::is_same_v<decltype(result), double>));
+    EXPECT_DOUBLE_EQ(result, 7.5);
+}
+
+TEST_F(ExpressionOpsTest, DivOperator) {
+    Div op;
+
+    // Integer division
+    EXPECT_EQ(op(10, 2), 5);
+    EXPECT_EQ(op(10, 3), 3);  // Integer division truncates
+    EXPECT_EQ(op(-10, 2), -5);
+
+    // Floating point division
+    EXPECT_DOUBLE_EQ(op(10.0, 2.0), 5.0);
+    EXPECT_DOUBLE_EQ(op(10.0, 3.0), 10.0/3.0);
+    EXPECT_DOUBLE_EQ(op(-10.0, 2.0), -5.0);
+
+    // Mixed types
+    auto result = op(10.0, 3);
+    EXPECT_TRUE((std::is_same_v<decltype(result), double>));
+    EXPECT_DOUBLE_EQ(result, 10.0/3.0);
+}
+
+// =============================================================================
+// Unary Operation Tests
+// =============================================================================
+
+TEST_F(ExpressionOpsTest, NegateOperator) {
+    Negate op;
+
+    // Integer negation
+    EXPECT_EQ(op(5), -5);
+    EXPECT_EQ(op(-5), 5);
+    EXPECT_EQ(op(0), 0);
+
+    // Floating point negation
+    EXPECT_DOUBLE_EQ(op(3.14), -3.14);
+    EXPECT_DOUBLE_EQ(op(-2.71), 2.71);
+    EXPECT_DOUBLE_EQ(op(0.0), 0.0);
+
+    // Type preservation
+    auto int_result = op(5);
+    EXPECT_TRUE((std::is_same_v<decltype(int_result), int>));
+
+    auto double_result = op(5.0);
+    EXPECT_TRUE((std::is_same_v<decltype(double_result), double>));
+}
+
+TEST_F(ExpressionOpsTest, AbsOperator) {
+    Abs op;
+
+    // Integer absolute value
+    EXPECT_EQ(op(5), 5);
+    EXPECT_EQ(op(-5), 5);
+    EXPECT_EQ(op(0), 0);
+
+    // Floating point absolute value
+    EXPECT_DOUBLE_EQ(op(3.14), 3.14);
+    EXPECT_DOUBLE_EQ(op(-3.14), 3.14);
+    EXPECT_DOUBLE_EQ(op(0.0), 0.0);
+
+    // Special cases
+    EXPECT_DOUBLE_EQ(op(-0.0), 0.0);
+
+    // Type preservation
+    auto int_result = op(-5);
+    EXPECT_TRUE((std::is_same_v<decltype(int_result), int>));
+
+    auto double_result = op(-5.0);
+    EXPECT_TRUE((std::is_same_v<decltype(double_result), double>));
+}
+
+TEST_F(ExpressionOpsTest, SqrtOperator) {
+    Sqrt op;
+
+    // Perfect squares
+    EXPECT_DOUBLE_EQ(op(4.0), 2.0);
+    EXPECT_DOUBLE_EQ(op(9.0), 3.0);
+    EXPECT_DOUBLE_EQ(op(16.0), 4.0);
+    EXPECT_DOUBLE_EQ(op(25.0), 5.0);
+
+    // Non-perfect squares
+    EXPECT_DOUBLE_EQ(op(2.0), std::sqrt(2.0));
+    EXPECT_DOUBLE_EQ(op(3.0), std::sqrt(3.0));
+
+    // Special cases
+    EXPECT_DOUBLE_EQ(op(0.0), 0.0);
+    EXPECT_DOUBLE_EQ(op(1.0), 1.0);
+
+    // Type conversion
+    auto result = op(4);  // Integer input
+    EXPECT_DOUBLE_EQ(result, 2.0);
+}
+
+// =============================================================================
+// Constexpr Tests
+// =============================================================================
+
+TEST_F(ExpressionOpsTest, ConstexprOperators) {
+    // Test that operators can be used in constexpr contexts
+    constexpr Add add_op;
+    constexpr Sub sub_op;
+    constexpr Mul mul_op;
+    constexpr Div div_op;
+    constexpr Negate neg_op;
+
+    // Compile-time evaluation
+    constexpr auto sum = add_op(3, 4);
+    constexpr auto diff = sub_op(7, 3);
+    constexpr auto prod = mul_op(3, 4);
+    constexpr auto quot = div_op(12, 3);
+    constexpr auto neg = neg_op(5);
+
+    EXPECT_EQ(sum, 7);
+    EXPECT_EQ(diff, 4);
+    EXPECT_EQ(prod, 12);
+    EXPECT_EQ(quot, 4);
+    EXPECT_EQ(neg, -5);
+
+    // Static assertions to verify compile-time evaluation
+    static_assert(add_op(2, 3) == 5);
+    static_assert(sub_op(5, 2) == 3);
+    static_assert(mul_op(3, 4) == 12);
+    static_assert(div_op(10, 2) == 5);
+    static_assert(neg_op(3) == -3);
+}
+
+// =============================================================================
+// Type Deduction Tests
+// =============================================================================
+
+TEST_F(ExpressionOpsTest, TypeDeduction) {
+    Add add_op;
+    Sub sub_op;
+    Mul mul_op;
+    Div div_op;
+
+    // int + int -> int
+    auto int_result = add_op(3, 4);
+    EXPECT_TRUE((std::is_same_v<decltype(int_result), int>));
+
+    // double + double -> double
+    auto double_result = add_op(3.0, 4.0);
+    EXPECT_TRUE((std::is_same_v<decltype(double_result), double>));
+
+    // int + double -> double
+    auto mixed_result1 = add_op(3, 4.0);
+    EXPECT_TRUE((std::is_same_v<decltype(mixed_result1), double>));
+
+    // double + int -> double
+    auto mixed_result2 = add_op(3.0, 4);
+    EXPECT_TRUE((std::is_same_v<decltype(mixed_result2), double>));
+
+    // float + double -> double
+    auto float_double_result = add_op(3.0f, 4.0);
+    EXPECT_TRUE((std::is_same_v<decltype(float_double_result), double>));
+}
+
+// =============================================================================
+// Complex Expression Tests
+// =============================================================================
+
+TEST_F(ExpressionOpsTest, ChainedOperations) {
+    Add add_op;
+    Sub sub_op;
+    Mul mul_op;
+    Div div_op;
+    Negate neg_op;
+
+    // Simulate complex expression: -(a + b) * c / d
+    double a = 2.0, b = 3.0, c = 4.0, d = 2.0;
+
+    auto sum = add_op(a, b);       // 5.0
+    auto negated = neg_op(sum);    // -5.0
+    auto product = mul_op(negated, c);  // -20.0
+    auto result = div_op(product, d);   // -10.0
+
+    EXPECT_DOUBLE_EQ(result, -10.0);
+}
+
+TEST_F(ExpressionOpsTest, MixedPrecisionChain) {
+    Add add_op;
+    Mul mul_op;
+
+    // Mixed precision chain
+    int a = 2;
+    float b = 3.5f;
+    double c = 1.5;
+
+    auto step1 = add_op(a, b);    // int + float -> float (5.5f)
+    auto step2 = mul_op(step1, c); // float + double -> double (8.25)
+
+    EXPECT_TRUE((std::is_same_v<decltype(step2), double>));
+    EXPECT_DOUBLE_EQ(step2, 8.25);
+}
+
+// =============================================================================
+// Operator Integration with Vector/Matrix Tests
+// =============================================================================
+
+TEST_F(ExpressionOpsTest, VectorIntegration) {
+    Vector<double, 3> v1{1.0, 2.0, 3.0};
+    Vector<double, 3> v2{4.0, 5.0, 6.0};
+
+    // Test that operators work correctly in vector expressions
+    Vector<double, 3> sum = v1 + v2;
+    Vector<double, 3> diff = v1 - v2;
+    Vector<double, 3> neg = -v1;
+    Vector<double, 3> scaled = v1 * 2.0;
+
+    EXPECT_DOUBLE_EQ(sum[0], 5.0);
+    EXPECT_DOUBLE_EQ(sum[1], 7.0);
+    EXPECT_DOUBLE_EQ(sum[2], 9.0);
+
+    EXPECT_DOUBLE_EQ(diff[0], -3.0);
+    EXPECT_DOUBLE_EQ(diff[1], -3.0);
+    EXPECT_DOUBLE_EQ(diff[2], -3.0);
+
+    EXPECT_DOUBLE_EQ(neg[0], -1.0);
+    EXPECT_DOUBLE_EQ(neg[1], -2.0);
+    EXPECT_DOUBLE_EQ(neg[2], -3.0);
+
+    EXPECT_DOUBLE_EQ(scaled[0], 2.0);
+    EXPECT_DOUBLE_EQ(scaled[1], 4.0);
+    EXPECT_DOUBLE_EQ(scaled[2], 6.0);
+}
+
+TEST_F(ExpressionOpsTest, MatrixIntegration) {
+    Matrix<double, 2, 2> m1{{1.0, 2.0}, {3.0, 4.0}};
+    Matrix<double, 2, 2> m2{{5.0, 6.0}, {7.0, 8.0}};
+
+    // Test that operators work correctly in matrix expressions
+    Matrix<double, 2, 2> sum = m1 + m2;
+    Matrix<double, 2, 2> diff = m1 - m2;
+    Matrix<double, 2, 2> neg = -m1;
+    Matrix<double, 2, 2> scaled = m1 * 2.0;
+
+    EXPECT_DOUBLE_EQ(sum(0, 0), 6.0);
+    EXPECT_DOUBLE_EQ(sum(0, 1), 8.0);
+    EXPECT_DOUBLE_EQ(sum(1, 0), 10.0);
+    EXPECT_DOUBLE_EQ(sum(1, 1), 12.0);
+
+    EXPECT_DOUBLE_EQ(diff(0, 0), -4.0);
+    EXPECT_DOUBLE_EQ(diff(0, 1), -4.0);
+    EXPECT_DOUBLE_EQ(diff(1, 0), -4.0);
+    EXPECT_DOUBLE_EQ(diff(1, 1), -4.0);
+
+    EXPECT_DOUBLE_EQ(neg(0, 0), -1.0);
+    EXPECT_DOUBLE_EQ(neg(0, 1), -2.0);
+    EXPECT_DOUBLE_EQ(neg(1, 0), -3.0);
+    EXPECT_DOUBLE_EQ(neg(1, 1), -4.0);
+
+    EXPECT_DOUBLE_EQ(scaled(0, 0), 2.0);
+    EXPECT_DOUBLE_EQ(scaled(0, 1), 4.0);
+    EXPECT_DOUBLE_EQ(scaled(1, 0), 6.0);
+    EXPECT_DOUBLE_EQ(scaled(1, 1), 8.0);
+}
+
+// =============================================================================
+// Edge Cases and Special Values Tests
+// =============================================================================
+
+TEST_F(ExpressionOpsTest, SpecialFloatingPointValues) {
+    Add add_op;
+    Sub sub_op;
+    Mul mul_op;
+    Div div_op;
+    Abs abs_op;
+    Negate neg_op;
+
+    // Infinity handling
+    double inf = std::numeric_limits<double>::infinity();
+    EXPECT_DOUBLE_EQ(add_op(inf, 1.0), inf);
+    EXPECT_DOUBLE_EQ(sub_op(inf, 1.0), inf);
+    EXPECT_DOUBLE_EQ(mul_op(inf, 2.0), inf);
+    EXPECT_DOUBLE_EQ(div_op(inf, 2.0), inf);
+    EXPECT_DOUBLE_EQ(abs_op(inf), inf);
+    EXPECT_DOUBLE_EQ(neg_op(inf), -inf);
+
+    // NaN handling
+    double nan = std::numeric_limits<double>::quiet_NaN();
+    EXPECT_TRUE(std::isnan(add_op(nan, 1.0)));
+    EXPECT_TRUE(std::isnan(sub_op(nan, 1.0)));
+    EXPECT_TRUE(std::isnan(mul_op(nan, 2.0)));
+    EXPECT_TRUE(std::isnan(div_op(nan, 2.0)));
+    EXPECT_TRUE(std::isnan(abs_op(nan)));
+    EXPECT_TRUE(std::isnan(neg_op(nan)));
+
+    // Division by zero
+    EXPECT_DOUBLE_EQ(div_op(1.0, 0.0), inf);
+    EXPECT_DOUBLE_EQ(div_op(-1.0, 0.0), -inf);
+    EXPECT_TRUE(std::isnan(div_op(0.0, 0.0)));
+}
+
+TEST_F(ExpressionOpsTest, LargeAndSmallValues) {
+    Add add_op;
+    Mul mul_op;
+
+    // Large values
+    double large = 1e308;
+    double result = add_op(large, large);
+    EXPECT_TRUE(std::isinf(result));  // Overflow to infinity
+
+    // Small values
+    double tiny = std::numeric_limits<double>::min();
+    double tiny_result = mul_op(tiny, 0.5);
+    EXPECT_GT(tiny_result, 0.0);  // Should still be positive
+    EXPECT_LT(tiny_result, tiny);  // But smaller
+
+    // Denormalized numbers
+    double denorm = std::numeric_limits<double>::denorm_min();
+    double denorm_result = add_op(denorm, denorm);
+    EXPECT_EQ(denorm_result, 2.0 * denorm);
+}
+
+// =============================================================================
+// SFINAE and Compile-time Constraint Tests
+// =============================================================================
+
+TEST_F(ExpressionOpsTest, SFINAECompatibility) {
+    // Test that operators work with any arithmetic types
+    Add add_op;
+
+    // Various integer types
+    EXPECT_EQ(add_op(int8_t(3), int8_t(4)), 7);
+    EXPECT_EQ(add_op(int16_t(100), int16_t(200)), 300);
+    EXPECT_EQ(add_op(int32_t(1000), int32_t(2000)), 3000);
+    EXPECT_EQ(add_op(int64_t(10000), int64_t(20000)), 30000);
+
+    // Unsigned types
+    EXPECT_EQ(add_op(uint8_t(3), uint8_t(4)), 7u);
+    EXPECT_EQ(add_op(uint16_t(100), uint16_t(200)), 300u);
+    EXPECT_EQ(add_op(uint32_t(1000), uint32_t(2000)), 3000u);
+
+    // Floating point types
+    EXPECT_FLOAT_EQ(add_op(3.0f, 4.0f), 7.0f);
+    EXPECT_DOUBLE_EQ(add_op(3.0, 4.0), 7.0);
+
+    // Long double
+    long double ld1 = 3.0L;
+    long double ld2 = 4.0L;
+    EXPECT_DOUBLE_EQ(add_op(ld1, ld2), 7.0L);
+}
+
+// =============================================================================
+// Template Instantiation Tests
+// =============================================================================
+
+TEST_F(ExpressionOpsTest, TemplateInstantiations) {
+    // Test that operators can be instantiated with various types
+    Add add_op;
+    Sub sub_op;
+    Mul mul_op;
+    Div div_op;
+    Abs abs_op;
+    Sqrt sqrt_op;
+    Negate neg_op;
+
+    // Custom types that support arithmetic operations
+    struct CustomNumber {
+        double value;
+        CustomNumber(double v) : value(v) {}
+        CustomNumber operator+(const CustomNumber& other) const { return CustomNumber(value + other.value); }
+        CustomNumber operator-(const CustomNumber& other) const { return CustomNumber(value - other.value); }
+        CustomNumber operator*(const CustomNumber& other) const { return CustomNumber(value * other.value); }
+        CustomNumber operator/(const CustomNumber& other) const { return CustomNumber(value / other.value); }
+        CustomNumber operator-() const { return CustomNumber(-value); }
+        bool operator==(const CustomNumber& other) const { return value == other.value; }
+    };
+
+    CustomNumber cn1(3.0);
+    CustomNumber cn2(4.0);
+
+    auto cn_sum = add_op(cn1, cn2);
+    EXPECT_EQ(cn_sum.value, 7.0);
+
+    auto cn_diff = sub_op(cn1, cn2);
+    EXPECT_EQ(cn_diff.value, -1.0);
+
+    auto cn_prod = mul_op(cn1, cn2);
+    EXPECT_EQ(cn_prod.value, 12.0);
+
+    auto cn_quot = div_op(cn1, cn2);
+    EXPECT_EQ(cn_quot.value, 0.75);
+
+    auto cn_neg = neg_op(cn1);
+    EXPECT_EQ(cn_neg.value, -3.0);
+}
+
+// =============================================================================
+// Complex Number Support Tests
+// =============================================================================
+
+TEST_F(ExpressionOpsTest, ComplexNumberSupport) {
+    Add add_op;
+    Sub sub_op;
+    Mul mul_op;
+    Div div_op;
+    Negate neg_op;
+
+    std::complex<double> c1(3.0, 4.0);
+    std::complex<double> c2(1.0, 2.0);
+
+    auto c_sum = add_op(c1, c2);
+    EXPECT_DOUBLE_EQ(c_sum.real(), 4.0);
+    EXPECT_DOUBLE_EQ(c_sum.imag(), 6.0);
+
+    auto c_diff = sub_op(c1, c2);
+    EXPECT_DOUBLE_EQ(c_diff.real(), 2.0);
+    EXPECT_DOUBLE_EQ(c_diff.imag(), 2.0);
+
+    auto c_prod = mul_op(c1, c2);
+    EXPECT_DOUBLE_EQ(c_prod.real(), -5.0);  // (3+4i)(1+2i) = 3+6i+4i+8i² = 3+10i-8 = -5+10i
+    EXPECT_DOUBLE_EQ(c_prod.imag(), 10.0);
+
+    auto c_neg = neg_op(c1);
+    EXPECT_DOUBLE_EQ(c_neg.real(), -3.0);
+    EXPECT_DOUBLE_EQ(c_neg.imag(), -4.0);
+}
diff --git a/tests/unitTests/FE/Math/test_MathConstants.cpp b/tests/unitTests/FE/Math/test_MathConstants.cpp
new file mode 100644
index 000000000..5619690ed
--- /dev/null
+++ b/tests/unitTests/FE/Math/test_MathConstants.cpp
@@ -0,0 +1,341 @@
+/**
+ * @file test_MathConstants.cpp
+ * @brief Unit tests for MathConstants.h - mathematical constants and tolerances
+ */
+
+#include <gtest/gtest.h>
+#include "FE/Math/MathConstants.h"
+#include <cmath>
+#include <limits>
+#include <type_traits>
+
+using namespace svmp::FE::math;
+
+// Test fixture for MathConstants tests
+class MathConstantsTest : public ::testing::Test {
+protected:
+    void SetUp() override {}
+    void TearDown() override {}
+};
+
+// =============================================================================
+// Mathematical Constants Tests
+// =============================================================================
+
+TEST_F(MathConstantsTest, PiConstants) {
+    // Test PI value
+    EXPECT_NEAR(constants::PI, 3.14159265358979323846, 1e-15);
+
+    // Test PI/2
+    EXPECT_NEAR(constants::PI_2, constants::PI / 2.0, 1e-15);
+
+    // Test PI/4
+    EXPECT_NEAR(constants::PI_4, constants::PI / 4.0, 1e-15);
+
+    // Test 2*PI
+    EXPECT_NEAR(constants::TWO_PI, 2.0 * constants::PI, 1e-15);
+
+    // Test 1/PI
+    EXPECT_NEAR(constants::INV_PI, 1.0 / constants::PI, 1e-15);
+
+    // Test sqrt(PI)
+    EXPECT_NEAR(constants::SQRT_PI, std::sqrt(constants::PI), 1e-15);
+}
+
+TEST_F(MathConstantsTest, EulerConstant) {
+    // Test e (Euler's number)
+    EXPECT_NEAR(constants::E, std::exp(1.0), 1e-15);
+
+    // Test ln(2)
+    EXPECT_NEAR(constants::LN_2, std::log(2.0), 1e-15);
+
+    // Test ln(10)
+    EXPECT_NEAR(constants::LN_10, std::log(10.0), 1e-15);
+
+    // Test log10(e)
+    EXPECT_NEAR(constants::LOG10_E, std::log10(constants::E), 1e-15);
+
+    // Test log2(e)
+    EXPECT_NEAR(constants::LOG2_E, std::log2(constants::E), 1e-15);
+}
+
+TEST_F(MathConstantsTest, SquareRootConstants) {
+    // Test sqrt(2)
+    EXPECT_NEAR(constants::SQRT_2, std::sqrt(2.0), 1e-15);
+
+    // Test sqrt(3)
+    EXPECT_NEAR(constants::SQRT_3, std::sqrt(3.0), 1e-15);
+
+    // Test sqrt(5)
+    EXPECT_NEAR(constants::SQRT_5, std::sqrt(5.0), 1e-15);
+
+    // Test 1/sqrt(2)
+    EXPECT_NEAR(constants::INV_SQRT_2, 1.0 / std::sqrt(2.0), 1e-15);
+
+    // Test 1/sqrt(3)
+    EXPECT_NEAR(constants::INV_SQRT_3, 1.0 / std::sqrt(3.0), 1e-15);
+}
+
+TEST_F(MathConstantsTest, GoldenRatio) {
+    // Test golden ratio φ = (1 + sqrt(5))/2
+    EXPECT_NEAR(constants::PHI, (1.0 + std::sqrt(5.0)) / 2.0, 1e-15);
+
+    // Property: φ² = φ + 1
+    EXPECT_NEAR(constants::PHI * constants::PHI, constants::PHI + 1.0, 1e-14);
+
+    // Property: 1/φ = φ - 1
+    EXPECT_NEAR(1.0 / constants::PHI, constants::PHI - 1.0, 1e-14);
+}
+
+// =============================================================================
+// Angle Conversion Tests
+// =============================================================================
+
+TEST_F(MathConstantsTest, DegreesToRadians) {
+    // Test common conversions
+    EXPECT_NEAR(constants::deg_to_rad(0.0), 0.0, 1e-15);
+    EXPECT_NEAR(constants::deg_to_rad(90.0), constants::PI_2, 1e-15);
+    EXPECT_NEAR(constants::deg_to_rad(180.0), constants::PI, 1e-15);
+    EXPECT_NEAR(constants::deg_to_rad(270.0), 3.0 * constants::PI_2, 1e-15);
+    EXPECT_NEAR(constants::deg_to_rad(360.0), constants::TWO_PI, 1e-15);
+
+    // Test negative angles
+    EXPECT_NEAR(constants::deg_to_rad(-90.0), -constants::PI_2, 1e-15);
+    EXPECT_NEAR(constants::deg_to_rad(-180.0), -constants::PI, 1e-15);
+
+    // Test arbitrary angle
+    EXPECT_NEAR(constants::deg_to_rad(45.0), constants::PI_4, 1e-15);
+    EXPECT_NEAR(constants::deg_to_rad(30.0), constants::PI / 6.0, 1e-15);
+    EXPECT_NEAR(constants::deg_to_rad(60.0), constants::PI / 3.0, 1e-15);
+}
+
+TEST_F(MathConstantsTest, RadiansToDegrees) {
+    // Test common conversions
+    EXPECT_NEAR(constants::rad_to_deg(0.0), 0.0, 1e-13);
+    EXPECT_NEAR(constants::rad_to_deg(constants::PI_2), 90.0, 1e-13);
+    EXPECT_NEAR(constants::rad_to_deg(constants::PI), 180.0, 1e-13);
+    EXPECT_NEAR(constants::rad_to_deg(constants::TWO_PI), 360.0, 1e-13);
+
+    // Test negative angles
+    EXPECT_NEAR(constants::rad_to_deg(-constants::PI), -180.0, 1e-13);
+
+    // Test round-trip conversion
+    double angle_deg = 123.456;
+    double angle_rad = constants::deg_to_rad(angle_deg);
+    double back_to_deg = constants::rad_to_deg(angle_rad);
+    EXPECT_NEAR(back_to_deg, angle_deg, 1e-13);
+}
+
+// =============================================================================
+// Machine Precision Tests
+// =============================================================================
+
+TEST_F(MathConstantsTest, MachineEpsilon) {
+    // Test double precision epsilon
+    EXPECT_EQ(constants::EPSILON, std::numeric_limits<double>::epsilon());
+
+    // Test float precision epsilon
+    EXPECT_EQ(constants::EPSILON_F, std::numeric_limits<float>::epsilon());
+
+    // Verify epsilon is the smallest value such that 1.0 + epsilon != 1.0
+    double one_plus_eps = 1.0 + constants::EPSILON;
+    double one_plus_half_eps = 1.0 + constants::EPSILON / 2.0;
+
+    EXPECT_NE(one_plus_eps, 1.0);
+    EXPECT_EQ(one_plus_half_eps, 1.0);
+}
+
+TEST_F(MathConstantsTest, NumericalLimits) {
+    // Test infinity
+    EXPECT_TRUE(std::isinf(constants::INF_VALUE));
+    EXPECT_GT(constants::INF_VALUE, std::numeric_limits<double>::max());
+
+    // Test NaN
+    EXPECT_TRUE(std::isnan(constants::NOT_A_NUMBER));
+    EXPECT_NE(constants::NOT_A_NUMBER, constants::NOT_A_NUMBER);  // NaN != NaN
+
+    // Test max/min values
+    EXPECT_EQ(constants::MAX_DOUBLE, std::numeric_limits<double>::max());
+    EXPECT_EQ(constants::MIN_DOUBLE, std::numeric_limits<double>::min());
+    EXPECT_EQ(constants::LOWEST_DOUBLE, std::numeric_limits<double>::lowest());
+}
+
+// =============================================================================
+// Tolerance Tests
+// =============================================================================
+
+TEST_F(MathConstantsTest, DefaultTolerances) {
+    // Test default absolute tolerance
+    EXPECT_GT(constants::DEFAULT_TOLERANCE, 0.0);
+    EXPECT_LT(constants::DEFAULT_TOLERANCE, 1e-10);
+
+    // Test default relative tolerance
+    EXPECT_GT(constants::DEFAULT_REL_TOLERANCE, 0.0);
+    EXPECT_LT(constants::DEFAULT_REL_TOLERANCE, 1e-10);
+
+    // Test solver tolerance
+    EXPECT_GT(constants::SOLVER_TOLERANCE, 0.0);
+    EXPECT_LE(constants::SOLVER_TOLERANCE, constants::DEFAULT_TOLERANCE);
+
+    // Test geometry tolerance (typically larger)
+    EXPECT_GT(constants::GEOMETRY_TOLERANCE, 0.0);
+    EXPECT_GE(constants::GEOMETRY_TOLERANCE, constants::DEFAULT_TOLERANCE);
+}
+
+TEST_F(MathConstantsTest, ToleranceComparison) {
+    double a = 1.0;
+    double b = 1.0 + constants::DEFAULT_TOLERANCE / 2.0;
+    double c = 1.0 + constants::DEFAULT_TOLERANCE * 2.0;
+
+    // Values within tolerance should be considered equal
+    EXPECT_TRUE(constants::near(a, b, constants::DEFAULT_TOLERANCE));
+
+    // Values outside tolerance should not be equal
+    EXPECT_FALSE(constants::near(a, c, constants::DEFAULT_TOLERANCE));
+
+    // Test relative tolerance
+    double large_a = 1e10;
+    double large_b = large_a * (1.0 + constants::DEFAULT_REL_TOLERANCE / 2.0);
+    double large_c = large_a * (1.0 + constants::DEFAULT_REL_TOLERANCE * 2.0);
+
+    EXPECT_TRUE(constants::near_relative(large_a, large_b, constants::DEFAULT_REL_TOLERANCE));
+    EXPECT_FALSE(constants::near_relative(large_a, large_c, constants::DEFAULT_REL_TOLERANCE));
+}
+
+TEST_F(MathConstantsTest, ZeroComparison) {
+    // Test near zero detection
+    EXPECT_TRUE(constants::is_zero(0.0));
+    EXPECT_TRUE(constants::is_zero(constants::DEFAULT_TOLERANCE / 2.0));
+    EXPECT_FALSE(constants::is_zero(constants::DEFAULT_TOLERANCE * 2.0));
+
+    // Test with negative values
+    EXPECT_TRUE(constants::is_zero(-constants::DEFAULT_TOLERANCE / 2.0));
+    EXPECT_FALSE(constants::is_zero(-constants::DEFAULT_TOLERANCE * 2.0));
+}
+
+// =============================================================================
+// Physical Constants Tests
+// =============================================================================
+
+TEST_F(MathConstantsTest, PhysicalConstants) {
+    // Test speed of light (m/s)
+    EXPECT_NEAR(constants::SPEED_OF_LIGHT, 299792458.0, 1.0);
+
+    // Test gravitational constant (m³/kg/s²)
+    EXPECT_NEAR(constants::GRAVITATIONAL_CONSTANT, 6.67430e-11, 1e-16);
+
+    // Test standard gravity (m/s²)
+    EXPECT_NEAR(constants::STANDARD_GRAVITY, 9.80665, 1e-10);
+
+    // Test Planck constant (J⋅s)
+    EXPECT_NEAR(constants::PLANCK_CONSTANT, 6.62607015e-34, 1e-42);
+
+    // Test Boltzmann constant (J/K)
+    EXPECT_NEAR(constants::BOLTZMANN_CONSTANT, 1.380649e-23, 1e-29);
+
+    // Test Avogadro's number (1/mol)
+    EXPECT_NEAR(constants::AVOGADRO_NUMBER, 6.02214076e23, 1e15);
+}
+
+// =============================================================================
+// Compile-Time Constants Tests
+// =============================================================================
+
+TEST_F(MathConstantsTest, CompileTimeConstants) {
+    // Test that constants are constexpr (compile-time)
+    constexpr double pi = constants::PI;
+    constexpr double e = constants::E;
+    constexpr double sqrt2 = constants::SQRT_2;
+
+    EXPECT_EQ(pi, constants::PI);
+    EXPECT_EQ(e, constants::E);
+    EXPECT_EQ(sqrt2, constants::SQRT_2);
+
+    // Test compile-time functions
+    constexpr double angle_rad = constants::deg_to_rad(90.0);
+    EXPECT_NEAR(angle_rad, constants::PI_2, 1e-15);
+
+    constexpr double angle_deg = constants::rad_to_deg(constants::PI);
+    EXPECT_NEAR(angle_deg, 180.0, 1e-13);
+}
+
+// =============================================================================
+// Type Traits Tests
+// =============================================================================
+
+TEST_F(MathConstantsTest, TypedConstants) {
+    // Test float versions
+    EXPECT_NEAR(constants::PI_F, static_cast<float>(constants::PI), 1e-7f);
+    EXPECT_NEAR(constants::E_F, static_cast<float>(constants::E), 1e-7f);
+    EXPECT_NEAR(constants::SQRT_2_F, static_cast<float>(constants::SQRT_2), 1e-7f);
+
+    // Test long double versions
+    EXPECT_NEAR(constants::PI_L, static_cast<long double>(constants::PI), 1e-18L);
+    EXPECT_NEAR(constants::E_L, static_cast<long double>(constants::E), 1e-18L);
+}
+
+// =============================================================================
+// Special Functions Tests
+// =============================================================================
+
+TEST_F(MathConstantsTest, SignFunction) {
+    // Test sign function
+    EXPECT_EQ(constants::sign(5.0), 1);
+    EXPECT_EQ(constants::sign(-5.0), -1);
+    EXPECT_EQ(constants::sign(0.0), 0);
+
+    // Test with very small values
+    EXPECT_EQ(constants::sign(constants::EPSILON), 1);
+    EXPECT_EQ(constants::sign(-constants::EPSILON), -1);
+
+    // Test with infinity
+    EXPECT_EQ(constants::sign(constants::INF_VALUE), 1);
+    EXPECT_EQ(constants::sign(-constants::INF_VALUE), -1);
+}
+
+TEST_F(MathConstantsTest, SafeDivision) {
+    // Test safe division
+    EXPECT_NEAR(constants::safe_divide(10.0, 2.0), 5.0, 1e-15);
+    EXPECT_NEAR(constants::safe_divide(1.0, 3.0), 1.0/3.0, 1e-15);
+
+    // Test division by zero returns default
+    EXPECT_EQ(constants::safe_divide(1.0, 0.0, 999.0), 999.0);
+    EXPECT_EQ(constants::safe_divide(1.0, constants::EPSILON/2.0, -1.0), -1.0);
+
+    // Test division by near-zero
+    double tiny = constants::DEFAULT_TOLERANCE / 10.0;
+    EXPECT_EQ(constants::safe_divide(1.0, tiny, 0.0), 0.0);
+}
+
+// =============================================================================
+// Utility Functions Tests
+// =============================================================================
+
+TEST_F(MathConstantsTest, ClampFunction) {
+    // Test clamping
+    EXPECT_EQ(constants::clamp(5.0, 0.0, 10.0), 5.0);
+    EXPECT_EQ(constants::clamp(-5.0, 0.0, 10.0), 0.0);
+    EXPECT_EQ(constants::clamp(15.0, 0.0, 10.0), 10.0);
+
+    // Test with same min/max
+    EXPECT_EQ(constants::clamp(5.0, 3.0, 3.0), 3.0);
+
+    // Test with infinity
+    EXPECT_EQ(constants::clamp(constants::INF_VALUE, 0.0, 10.0), 10.0);
+    EXPECT_EQ(constants::clamp(-constants::INF_VALUE, 0.0, 10.0), 0.0);
+}
+
+TEST_F(MathConstantsTest, LerpFunction) {
+    // Test linear interpolation
+    EXPECT_NEAR(constants::lerp(0.0, 10.0, 0.0), 0.0, 1e-15);
+    EXPECT_NEAR(constants::lerp(0.0, 10.0, 1.0), 10.0, 1e-15);
+    EXPECT_NEAR(constants::lerp(0.0, 10.0, 0.5), 5.0, 1e-15);
+    EXPECT_NEAR(constants::lerp(0.0, 10.0, 0.25), 2.5, 1e-15);
+
+    // Test extrapolation
+    EXPECT_NEAR(constants::lerp(0.0, 10.0, -0.5), -5.0, 1e-15);
+    EXPECT_NEAR(constants::lerp(0.0, 10.0, 1.5), 15.0, 1e-15);
+
+    // Test with negative range
+    EXPECT_NEAR(constants::lerp(-10.0, -5.0, 0.5), -7.5, 1e-15);
+}
diff --git a/tests/unitTests/FE/Math/test_Matrix.cpp b/tests/unitTests/FE/Math/test_Matrix.cpp
new file mode 100644
index 000000000..c186c26ee
--- /dev/null
+++ b/tests/unitTests/FE/Math/test_Matrix.cpp
@@ -0,0 +1,594 @@
+/**
+ * @file test_Matrix.cpp
+ * @brief Unit tests for Matrix.h - fixed-size matrices with expression templates
+ */
+
+#include <gtest/gtest.h>
+#include "FE/Math/Matrix.h"
+#include "FE/Math/Vector.h"
+#include "FE/Math/MatrixExpr.h"
+#include "FE/Math/MathConstants.h"
+#include <limits>
+#include <cmath>
+#include <thread>
+#include <vector>
+
+using namespace svmp::FE::math;
+
+// Test fixture for Matrix tests
+class MatrixTest : public ::testing::Test {
+protected:
+    static constexpr double tolerance = 1e-14;
+
+    void SetUp() override {}
+    void TearDown() override {}
+
+    // Helper function to check if two values are approximately equal
+    template<typename T>
+    bool approx_equal(T a, T b, T tol = tolerance) {
+        return std::abs(a - b) <= tol;
+    }
+};
+
+// =============================================================================
+// Construction and Initialization Tests
+// =============================================================================
+
+TEST_F(MatrixTest, DefaultConstruction) {
+    Matrix<double, 3, 3> m;
+    for (size_t i = 0; i < 3; ++i) {
+        for (size_t j = 0; j < 3; ++j) {
+            EXPECT_EQ(m(i, j), 0.0);
+        }
+    }
+}
+
+TEST_F(MatrixTest, FillConstruction) {
+    Matrix<double, 2, 3> m(5.0);
+    for (size_t i = 0; i < 2; ++i) {
+        for (size_t j = 0; j < 3; ++j) {
+            EXPECT_EQ(m(i, j), 5.0);
+        }
+    }
+}
+
+TEST_F(MatrixTest, InitializerListConstruction) {
+    Matrix<double, 2, 3> m{{1.0, 2.0, 3.0},
+                           {4.0, 5.0, 6.0}};
+
+    EXPECT_EQ(m(0, 0), 1.0);
+    EXPECT_EQ(m(0, 1), 2.0);
+    EXPECT_EQ(m(0, 2), 3.0);
+    EXPECT_EQ(m(1, 0), 4.0);
+    EXPECT_EQ(m(1, 1), 5.0);
+    EXPECT_EQ(m(1, 2), 6.0);
+}
+
+TEST_F(MatrixTest, CopyConstruction) {
+    Matrix<double, 2, 2> m1{{1.0, 2.0},
+                            {3.0, 4.0}};
+    Matrix<double, 2, 2> m2(m1);
+
+    EXPECT_EQ(m2(0, 0), 1.0);
+    EXPECT_EQ(m2(0, 1), 2.0);
+    EXPECT_EQ(m2(1, 0), 3.0);
+    EXPECT_EQ(m2(1, 1), 4.0);
+
+    // Ensure deep copy
+    m2(0, 0) = 10.0;
+    EXPECT_EQ(m1(0, 0), 1.0);
+    EXPECT_EQ(m2(0, 0), 10.0);
+}
+
+TEST_F(MatrixTest, MoveConstruction) {
+    Matrix<double, 2, 2> m1{{1.0, 2.0},
+                            {3.0, 4.0}};
+    Matrix<double, 2, 2> m2(std::move(m1));
+
+    EXPECT_EQ(m2(0, 0), 1.0);
+    EXPECT_EQ(m2(0, 1), 2.0);
+    EXPECT_EQ(m2(1, 0), 3.0);
+    EXPECT_EQ(m2(1, 1), 4.0);
+}
+
+// =============================================================================
+// Element Access Tests
+// =============================================================================
+
+TEST_F(MatrixTest, ElementAccess) {
+    Matrix<double, 2, 3> m{{1.0, 2.0, 3.0},
+                           {4.0, 5.0, 6.0}};
+
+    // Non-const access using operator()
+    EXPECT_EQ(m(0, 0), 1.0);
+    EXPECT_EQ(m(0, 2), 3.0);
+    EXPECT_EQ(m(1, 1), 5.0);
+
+    // Modification
+    m(1, 2) = 7.0;
+    EXPECT_EQ(m(1, 2), 7.0);
+
+    // Const access
+    const Matrix<double, 2, 3> cm{{1.0, 2.0, 3.0},
+                                  {4.0, 5.0, 6.0}};
+    EXPECT_EQ(cm(0, 1), 2.0);
+    EXPECT_EQ(cm(1, 0), 4.0);
+}
+
+TEST_F(MatrixTest, ElementAccessBounds) {
+    Matrix<double, 2, 3> m{{1.0, 2.0, 3.0},
+                           {4.0, 5.0, 6.0}};
+
+    // at() with bounds checking
+    EXPECT_EQ(m.at(0, 0), 1.0);
+    EXPECT_EQ(m.at(1, 2), 6.0);
+
+    // Test out of bounds throws
+    EXPECT_THROW(m.at(2, 0), std::out_of_range);
+    EXPECT_THROW(m.at(0, 3), std::out_of_range);
+    EXPECT_THROW(m.at(10, 10), std::out_of_range);
+}
+
+TEST_F(MatrixTest, RowColumnAccess) {
+    Matrix<double, 3, 3> m{{1.0, 2.0, 3.0},
+                           {4.0, 5.0, 6.0},
+                           {7.0, 8.0, 9.0}};
+
+    // Get row
+    auto row1 = m.row(1);
+    EXPECT_EQ(row1[0], 4.0);
+    EXPECT_EQ(row1[1], 5.0);
+    EXPECT_EQ(row1[2], 6.0);
+
+    // Get column
+    auto col2 = m.col(2);
+    EXPECT_EQ(col2[0], 3.0);
+    EXPECT_EQ(col2[1], 6.0);
+    EXPECT_EQ(col2[2], 9.0);
+
+    // Set row
+    Vector<double, 3> new_row{10.0, 11.0, 12.0};
+    m.set_row(0, new_row);
+    EXPECT_EQ(m(0, 0), 10.0);
+    EXPECT_EQ(m(0, 1), 11.0);
+    EXPECT_EQ(m(0, 2), 12.0);
+
+    // Set column
+    Vector<double, 3> new_col{20.0, 21.0, 22.0};
+    m.set_col(1, new_col);
+    EXPECT_EQ(m(0, 1), 20.0);
+    EXPECT_EQ(m(1, 1), 21.0);
+    EXPECT_EQ(m(2, 1), 22.0);
+}
+
+// =============================================================================
+// Arithmetic Operations Tests
+// =============================================================================
+
+TEST_F(MatrixTest, Addition) {
+    Matrix<double, 2, 3> a{{1.0, 2.0, 3.0},
+                           {4.0, 5.0, 6.0}};
+    Matrix<double, 2, 3> b{{7.0, 8.0, 9.0},
+                           {10.0, 11.0, 12.0}};
+
+    Matrix<double, 2, 3> c = a + b;
+    EXPECT_EQ(c(0, 0), 8.0);
+    EXPECT_EQ(c(0, 1), 10.0);
+    EXPECT_EQ(c(0, 2), 12.0);
+    EXPECT_EQ(c(1, 0), 14.0);
+    EXPECT_EQ(c(1, 1), 16.0);
+    EXPECT_EQ(c(1, 2), 18.0);
+}
+
+TEST_F(MatrixTest, Subtraction) {
+    Matrix<double, 2, 3> a{{8.0, 10.0, 12.0},
+                           {14.0, 16.0, 18.0}};
+    Matrix<double, 2, 3> b{{7.0, 8.0, 9.0},
+                           {10.0, 11.0, 12.0}};
+
+    Matrix<double, 2, 3> c = a - b;
+    EXPECT_EQ(c(0, 0), 1.0);
+    EXPECT_EQ(c(0, 1), 2.0);
+    EXPECT_EQ(c(0, 2), 3.0);
+    EXPECT_EQ(c(1, 0), 4.0);
+    EXPECT_EQ(c(1, 1), 5.0);
+    EXPECT_EQ(c(1, 2), 6.0);
+}
+
+TEST_F(MatrixTest, ScalarMultiplication) {
+    Matrix<double, 2, 2> a{{1.0, 2.0},
+                           {3.0, 4.0}};
+
+    Matrix<double, 2, 2> b = 2.0 * a;
+    EXPECT_EQ(b(0, 0), 2.0);
+    EXPECT_EQ(b(0, 1), 4.0);
+    EXPECT_EQ(b(1, 0), 6.0);
+    EXPECT_EQ(b(1, 1), 8.0);
+
+    Matrix<double, 2, 2> c = a * 3.0;
+    EXPECT_EQ(c(0, 0), 3.0);
+    EXPECT_EQ(c(0, 1), 6.0);
+    EXPECT_EQ(c(1, 0), 9.0);
+    EXPECT_EQ(c(1, 1), 12.0);
+}
+
+TEST_F(MatrixTest, ScalarDivision) {
+    Matrix<double, 2, 2> a{{2.0, 4.0},
+                           {6.0, 8.0}};
+
+    Matrix<double, 2, 2> b = a / 2.0;
+    EXPECT_EQ(b(0, 0), 1.0);
+    EXPECT_EQ(b(0, 1), 2.0);
+    EXPECT_EQ(b(1, 0), 3.0);
+    EXPECT_EQ(b(1, 1), 4.0);
+}
+
+TEST_F(MatrixTest, MatrixMultiplication) {
+    Matrix<double, 2, 3> a{{1.0, 2.0, 3.0},
+                           {4.0, 5.0, 6.0}};
+    Matrix<double, 3, 2> b{{7.0, 8.0},
+                           {9.0, 10.0},
+                           {11.0, 12.0}};
+
+    Matrix<double, 2, 2> c = a * b;
+    EXPECT_EQ(c(0, 0), 58.0);   // 1*7 + 2*9 + 3*11
+    EXPECT_EQ(c(0, 1), 64.0);   // 1*8 + 2*10 + 3*12
+    EXPECT_EQ(c(1, 0), 139.0);  // 4*7 + 5*9 + 6*11
+    EXPECT_EQ(c(1, 1), 154.0);  // 4*8 + 5*10 + 6*12
+}
+
+TEST_F(MatrixTest, MatrixVectorMultiplication) {
+    Matrix<double, 3, 3> m{{1.0, 2.0, 3.0},
+                           {4.0, 5.0, 6.0},
+                           {7.0, 8.0, 9.0}};
+    Vector<double, 3> v{1.0, 2.0, 3.0};
+
+    Vector<double, 3> result = m * v;
+    EXPECT_EQ(result[0], 14.0);  // 1*1 + 2*2 + 3*3
+    EXPECT_EQ(result[1], 32.0);  // 4*1 + 5*2 + 6*3
+    EXPECT_EQ(result[2], 50.0);  // 7*1 + 8*2 + 9*3
+}
+
+// =============================================================================
+// Special Matrix Operations Tests
+// =============================================================================
+
+TEST_F(MatrixTest, Transpose) {
+    Matrix<double, 2, 3> m{{1.0, 2.0, 3.0},
+                           {4.0, 5.0, 6.0}};
+
+    Matrix<double, 3, 2> mt = m.transpose();
+    EXPECT_EQ(mt(0, 0), 1.0);
+    EXPECT_EQ(mt(0, 1), 4.0);
+    EXPECT_EQ(mt(1, 0), 2.0);
+    EXPECT_EQ(mt(1, 1), 5.0);
+    EXPECT_EQ(mt(2, 0), 3.0);
+    EXPECT_EQ(mt(2, 1), 6.0);
+}
+
+TEST_F(MatrixTest, Determinant2x2) {
+    Matrix<double, 2, 2> m{{1.0, 2.0},
+                           {3.0, 4.0}};
+
+    double det = m.determinant();
+    EXPECT_EQ(det, -2.0);  // 1*4 - 2*3 = 4 - 6 = -2
+}
+
+TEST_F(MatrixTest, Determinant3x3) {
+    Matrix<double, 3, 3> m{{1.0, 2.0, 3.0},
+                           {0.0, 1.0, 4.0},
+                           {5.0, 6.0, 0.0}};
+
+    double det = m.determinant();
+    EXPECT_EQ(det, 1.0);  // Using Sarrus rule
+}
+
+TEST_F(MatrixTest, Determinant4x4) {
+    Matrix<double, 4, 4> m{{1, 0, 0, 0},
+                           {0, 2, 0, 0},
+                           {0, 0, 3, 0},
+                           {0, 0, 0, 4}};
+
+    double det = m.determinant();
+    EXPECT_EQ(det, 24.0);  // 1*2*3*4 = 24 (diagonal matrix)
+}
+
+TEST_F(MatrixTest, Inverse2x2) {
+    Matrix<double, 2, 2> m{{1.0, 2.0},
+                           {3.0, 4.0}};
+
+    Matrix<double, 2, 2> inv = m.inverse();
+
+    // Check inverse properties
+    EXPECT_NEAR(inv(0, 0), -2.0, tolerance);
+    EXPECT_NEAR(inv(0, 1), 1.0, tolerance);
+    EXPECT_NEAR(inv(1, 0), 1.5, tolerance);
+    EXPECT_NEAR(inv(1, 1), -0.5, tolerance);
+
+    // Verify M * M^-1 = I
+    Matrix<double, 2, 2> identity = m * inv;
+    EXPECT_NEAR(identity(0, 0), 1.0, tolerance);
+    EXPECT_NEAR(identity(0, 1), 0.0, tolerance);
+    EXPECT_NEAR(identity(1, 0), 0.0, tolerance);
+    EXPECT_NEAR(identity(1, 1), 1.0, tolerance);
+}
+
+TEST_F(MatrixTest, Inverse3x3) {
+    Matrix<double, 3, 3> m{{1.0, 2.0, 3.0},
+                           {0.0, 1.0, 4.0},
+                           {5.0, 6.0, 0.0}};
+
+    Matrix<double, 3, 3> inv = m.inverse();
+
+    // Verify M * M^-1 = I
+    Matrix<double, 3, 3> identity = m * inv;
+    for (size_t i = 0; i < 3; ++i) {
+        for (size_t j = 0; j < 3; ++j) {
+            double expected = (i == j) ? 1.0 : 0.0;
+            EXPECT_NEAR(identity(i, j), expected, tolerance);
+        }
+    }
+}
+
+TEST_F(MatrixTest, Trace) {
+    Matrix<double, 3, 3> m{{1.0, 2.0, 3.0},
+                           {4.0, 5.0, 6.0},
+                           {7.0, 8.0, 9.0}};
+
+    double trace = m.trace();
+    EXPECT_EQ(trace, 15.0);  // 1 + 5 + 9 = 15
+}
+
+// =============================================================================
+// Special Matrix Types Tests
+// =============================================================================
+
+TEST_F(MatrixTest, IdentityMatrix) {
+    Matrix<double, 3, 3> I = Matrix<double, 3, 3>::identity();
+
+    for (size_t i = 0; i < 3; ++i) {
+        for (size_t j = 0; j < 3; ++j) {
+            double expected = (i == j) ? 1.0 : 0.0;
+            EXPECT_EQ(I(i, j), expected);
+        }
+    }
+
+    // Test identity property
+    Matrix<double, 3, 3> m{{1.0, 2.0, 3.0},
+                           {4.0, 5.0, 6.0},
+                           {7.0, 8.0, 9.0}};
+    Matrix<double, 3, 3> result = m * I;
+
+    for (size_t i = 0; i < 3; ++i) {
+        for (size_t j = 0; j < 3; ++j) {
+            EXPECT_EQ(result(i, j), m(i, j));
+        }
+    }
+}
+
+TEST_F(MatrixTest, ZeroMatrix) {
+    Matrix<double, 2, 3> Z = Matrix<double, 2, 3>::zero();
+
+    for (size_t i = 0; i < 2; ++i) {
+        for (size_t j = 0; j < 3; ++j) {
+            EXPECT_EQ(Z(i, j), 0.0);
+        }
+    }
+}
+
+TEST_F(MatrixTest, DiagonalMatrix) {
+    Vector<double, 3> diag{1.0, 2.0, 3.0};
+    Matrix<double, 3, 3> D = Matrix<double, 3, 3>::diagonal(diag);
+
+    EXPECT_EQ(D(0, 0), 1.0);
+    EXPECT_EQ(D(1, 1), 2.0);
+    EXPECT_EQ(D(2, 2), 3.0);
+
+    // Off-diagonal elements should be zero
+    EXPECT_EQ(D(0, 1), 0.0);
+    EXPECT_EQ(D(0, 2), 0.0);
+    EXPECT_EQ(D(1, 0), 0.0);
+    EXPECT_EQ(D(1, 2), 0.0);
+    EXPECT_EQ(D(2, 0), 0.0);
+    EXPECT_EQ(D(2, 1), 0.0);
+}
+
+// =============================================================================
+// Expression Template Tests
+// =============================================================================
+
+TEST_F(MatrixTest, ExpressionTemplatesNoTemporaries) {
+    Matrix<double, 2, 2> a{{1, 2}, {3, 4}};
+    Matrix<double, 2, 2> b{{5, 6}, {7, 8}};
+    Matrix<double, 2, 2> c{{9, 10}, {11, 12}};
+
+    // Complex expression should create no temporaries
+    Matrix<double, 2, 2> result = a + b - c;
+
+    EXPECT_EQ(result(0, 0), -3.0);   // 1 + 5 - 9
+    EXPECT_EQ(result(0, 1), -2.0);   // 2 + 6 - 10
+    EXPECT_EQ(result(1, 0), -1.0);   // 3 + 7 - 11
+    EXPECT_EQ(result(1, 1), 0.0);    // 4 + 8 - 12
+}
+
+TEST_F(MatrixTest, LazyEvaluation) {
+    Matrix<double, 2, 2> a{{1, 2}, {3, 4}};
+    Matrix<double, 2, 2> b{{5, 6}, {7, 8}};
+
+    // Expression should not be evaluated until assignment
+    auto expr = a + b;  // No computation yet
+
+    Matrix<double, 2, 2> result = expr;  // Evaluation happens here
+    EXPECT_EQ(result(0, 0), 6.0);
+    EXPECT_EQ(result(0, 1), 8.0);
+}
+
+// =============================================================================
+// Edge Cases and Error Handling Tests
+// =============================================================================
+
+TEST_F(MatrixTest, SingularMatrixInverse) {
+    Matrix<double, 2, 2> singular{{1.0, 2.0},
+                                  {2.0, 4.0}};  // det = 0
+
+    EXPECT_THROW(singular.inverse(), std::runtime_error);
+}
+
+TEST_F(MatrixTest, DivisionByZero) {
+    Matrix<double, 2, 2> m{{1.0, 2.0},
+                           {3.0, 4.0}};
+
+    Matrix<double, 2, 2> result = m / 0.0;
+    EXPECT_TRUE(std::isinf(result(0, 0)));
+    EXPECT_TRUE(std::isinf(result(0, 1)));
+}
+
+TEST_F(MatrixTest, ExtremeLargeValues) {
+    double large = 1e308;
+    Matrix<double, 2, 2> m{{large, 0}, {0, large}};
+
+    Matrix<double, 2, 2> half = m / 2.0;
+    EXPECT_FALSE(std::isinf(half(0, 0)));
+    EXPECT_EQ(half(0, 0), large / 2.0);
+}
+
+// =============================================================================
+// Numerical Precision Tests
+// =============================================================================
+
+TEST_F(MatrixTest, NumericalStability) {
+    // Test near-singular matrix
+    double eps = 1e-15;
+    Matrix<double, 2, 2> m{{1.0, 1.0},
+                           {1.0, 1.0 + eps}};
+
+    double det = m.determinant();
+    // Relax tolerance due to floating-point arithmetic in determinant calculation
+    EXPECT_NEAR(det, eps, 1e-14);
+}
+
+TEST_F(MatrixTest, OrthogonalMatrixProperties) {
+    // Create rotation matrix (orthogonal)
+    double angle = M_PI / 4;
+    Matrix<double, 2, 2> R{{cos(angle), -sin(angle)},
+                           {sin(angle), cos(angle)}};
+
+    // Check orthogonality: R * R^T = I
+    Matrix<double, 2, 2> RRt = R * R.transpose();
+    EXPECT_NEAR(RRt(0, 0), 1.0, tolerance);
+    EXPECT_NEAR(RRt(0, 1), 0.0, tolerance);
+    EXPECT_NEAR(RRt(1, 0), 0.0, tolerance);
+    EXPECT_NEAR(RRt(1, 1), 1.0, tolerance);
+
+    // Check determinant = ±1
+    EXPECT_NEAR(std::abs(R.determinant()), 1.0, tolerance);
+}
+
+// =============================================================================
+// Matrix Properties Tests
+// =============================================================================
+
+TEST_F(MatrixTest, IsSymmetric) {
+    Matrix<double, 3, 3> sym{{1, 2, 3},
+                             {2, 4, 5},
+                             {3, 5, 6}};
+    EXPECT_TRUE(sym.is_symmetric(tolerance));
+
+    Matrix<double, 3, 3> nonsym{{1, 2, 3},
+                                {4, 5, 6},
+                                {7, 8, 9}};
+    EXPECT_FALSE(nonsym.is_symmetric(tolerance));
+}
+
+TEST_F(MatrixTest, IsSkewSymmetric) {
+    Matrix<double, 3, 3> skew{{0, -1, 2},
+                              {1, 0, -3},
+                              {-2, 3, 0}};
+    EXPECT_TRUE(skew.is_skew_symmetric(tolerance));
+
+    Matrix<double, 3, 3> nonskew{{1, 2, 3},
+                                 {4, 5, 6},
+                                 {7, 8, 9}};
+    EXPECT_FALSE(nonskew.is_skew_symmetric(tolerance));
+}
+
+TEST_F(MatrixTest, IsDiagonal) {
+    Matrix<double, 3, 3> diag{{1, 0, 0},
+                              {0, 2, 0},
+                              {0, 0, 3}};
+    EXPECT_TRUE(diag.is_diagonal(tolerance));
+
+    Matrix<double, 3, 3> nondiag{{1, 0.1, 0},
+                                 {0, 2, 0},
+                                 {0, 0, 3}};
+    EXPECT_FALSE(nondiag.is_diagonal(tolerance));
+}
+
+// =============================================================================
+// Thread Safety Tests
+// =============================================================================
+
+TEST_F(MatrixTest, ThreadSafetyReadOnly) {
+    Matrix<double, 3, 3> m{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
+
+    std::vector<std::thread> threads;
+    std::vector<double> results(10);
+
+    for (int i = 0; i < 10; ++i) {
+        threads.emplace_back([&m, &results, i]() {
+            results[static_cast<std::size_t>(i)] = m.trace();
+        });
+    }
+
+    for (auto& t : threads) {
+        t.join();
+    }
+
+    for (double r : results) {
+        EXPECT_EQ(r, 15.0);
+    }
+}
+
+// =============================================================================
+// Memory Alignment Tests
+// =============================================================================
+
+TEST_F(MatrixTest, MemoryAlignment) {
+    Matrix<double, 3, 3> m;
+
+    std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(m.data());
+    EXPECT_EQ(addr % 32, 0) << "Matrix data should be 32-byte aligned for AVX";
+}
+
+// =============================================================================
+// Utility Function Tests
+// =============================================================================
+
+TEST_F(MatrixTest, Norms) {
+    Matrix<double, 2, 2> m{{1, 2}, {3, 4}};
+
+    // Frobenius norm: sqrt(1^2 + 2^2 + 3^2 + 4^2) = sqrt(30)
+    EXPECT_NEAR(m.frobenius_norm(), std::sqrt(30.0), tolerance);
+
+    // Infinity norm (max row sum)
+    EXPECT_EQ(m.infinity_norm(), 7.0);  // max(|1|+|2|, |3|+|4|) = max(3, 7)
+
+    // One norm (max column sum)
+    EXPECT_EQ(m.one_norm(), 6.0);  // max(|1|+|3|, |2|+|4|) = max(4, 6)
+}
+
+TEST_F(MatrixTest, MinMaxElements) {
+    Matrix<double, 2, 3> m{{3, -1, 4}, {1, -2, 5}};
+
+    EXPECT_EQ(m.min(), -2.0);
+    EXPECT_EQ(m.max(), 5.0);
+}
+
+TEST_F(MatrixTest, ToString) {
+    Matrix<double, 2, 2> m{{1, 2}, {3, 4}};
+    std::stringstream ss;
+    ss << m;
+
+    std::string expected = "[[1, 2]\n [3, 4]]";
+    EXPECT_EQ(ss.str(), expected);
+}
diff --git a/tests/unitTests/FE/Math/test_MatrixExpr.cpp b/tests/unitTests/FE/Math/test_MatrixExpr.cpp
new file mode 100644
index 000000000..9486f409c
--- /dev/null
+++ b/tests/unitTests/FE/Math/test_MatrixExpr.cpp
@@ -0,0 +1,528 @@
+/**
+ * @file test_MatrixExpr.cpp
+ * @brief Unit tests for MatrixExpr.h - matrix expression templates
+ */
+
+#include <gtest/gtest.h>
+#include "FE/Math/Matrix.h"
+#include "FE/Math/MatrixExpr.h"
+#include "FE/Math/Vector.h"
+#include "FE/Math/MathConstants.h"
+#include <limits>
+#include <cmath>
+#include <memory>
+#include <atomic>
+#include <type_traits>
+
+using namespace svmp::FE::math;
+
+// Test fixture for MatrixExpr tests
+class MatrixExprTest : public ::testing::Test {
+protected:
+    static constexpr double tolerance = 1e-14;
+
+    // Custom allocator to track memory allocations
+    template<typename T>
+    class TrackingAllocator {
+    public:
+        using value_type = T;
+
+        static std::atomic<size_t> allocations;
+        static std::atomic<size_t> deallocations;
+        static std::atomic<size_t> bytes_allocated;
+
+        TrackingAllocator() = default;
+
+        template<typename U>
+        TrackingAllocator(const TrackingAllocator<U>&) {}
+
+        T* allocate(size_t n) {
+            allocations.fetch_add(1);
+            bytes_allocated.fetch_add(n * sizeof(T));
+            return static_cast<T*>(::operator new(n * sizeof(T)));
+        }
+
+        void deallocate(T* p, size_t n) {
+            deallocations.fetch_add(1);
+            ::operator delete(p);
+        }
+
+        static void reset() {
+            allocations = 0;
+            deallocations = 0;
+            bytes_allocated = 0;
+        }
+    };
+
+    void SetUp() override {
+        TrackingAllocator<double>::reset();
+    }
+
+    void TearDown() override {}
+
+    template<typename T>
+    bool approx_equal(T a, T b, T tol = tolerance) {
+        return std::abs(a - b) <= tol;
+    }
+};
+
+template<typename T>
+std::atomic<size_t> MatrixExprTest::TrackingAllocator<T>::allocations{0};
+template<typename T>
+std::atomic<size_t> MatrixExprTest::TrackingAllocator<T>::deallocations{0};
+template<typename T>
+std::atomic<size_t> MatrixExprTest::TrackingAllocator<T>::bytes_allocated{0};
+
+// =============================================================================
+// Lazy Evaluation Tests
+// =============================================================================
+
+TEST_F(MatrixExprTest, LazyEvaluationNoTemporaries) {
+    // Expression templates should not create temporary matrices
+    Matrix<double, 2, 2> A{{1.0, 2.0}, {3.0, 4.0}};
+    Matrix<double, 2, 2> B{{5.0, 6.0}, {7.0, 8.0}};
+    Matrix<double, 2, 2> C{{9.0, 10.0}, {11.0, 12.0}};
+
+    // Build expression without evaluation
+    auto expr = A + B - C;
+
+    // Expression type should not be Matrix, but an expression type
+    using ExprType = decltype(expr);
+    EXPECT_FALSE((std::is_same_v<ExprType, Matrix<double, 2, 2>>));
+
+    // Now evaluate
+    Matrix<double, 2, 2> result = expr;
+    EXPECT_DOUBLE_EQ(result(0, 0), -3.0);
+    EXPECT_DOUBLE_EQ(result(0, 1), -2.0);
+    EXPECT_DOUBLE_EQ(result(1, 0), -1.0);
+    EXPECT_DOUBLE_EQ(result(1, 1), 0.0);
+}
+
+TEST_F(MatrixExprTest, LazyEvaluationAccessPattern) {
+    Matrix<double, 3, 3> A;
+    Matrix<double, 3, 3> B;
+    for (int i = 0; i < 3; ++i) {
+        for (int j = 0; j < 3; ++j) {
+            A(i, j) = i * 3 + j + 1;
+            B(i, j) = (i * 3 + j + 1) * 2;
+        }
+    }
+
+    auto expr = A + B;
+
+    // Access individual elements without full evaluation
+    EXPECT_DOUBLE_EQ(expr(0, 0), 3.0);
+    EXPECT_DOUBLE_EQ(expr(1, 1), 15.0);
+    EXPECT_DOUBLE_EQ(expr(2, 2), 27.0);
+
+    // Size should be accessible
+    EXPECT_EQ(expr.rows(), 3u);
+    EXPECT_EQ(expr.cols(), 3u);
+}
+
+// =============================================================================
+// Matrix Multiplication Tests
+// =============================================================================
+
+TEST_F(MatrixExprTest, MatrixMultiplicationExpression) {
+    Matrix<double, 2, 3> A{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}};
+    Matrix<double, 3, 2> B{{7.0, 8.0}, {9.0, 10.0}, {11.0, 12.0}};
+
+    // Matrix multiplication should produce 2x2 result
+    Matrix<double, 2, 2> C = A * B;
+
+    // Verify results
+    EXPECT_DOUBLE_EQ(C(0, 0), 58.0);   // 1*7 + 2*9 + 3*11
+    EXPECT_DOUBLE_EQ(C(0, 1), 64.0);   // 1*8 + 2*10 + 3*12
+    EXPECT_DOUBLE_EQ(C(1, 0), 139.0);  // 4*7 + 5*9 + 6*11
+    EXPECT_DOUBLE_EQ(C(1, 1), 154.0);  // 4*8 + 5*10 + 6*12
+}
+
+TEST_F(MatrixExprTest, ChainedMatrixMultiplication) {
+    Matrix<double, 2, 2> A{{1.0, 2.0}, {3.0, 4.0}};
+    Matrix<double, 2, 2> B{{5.0, 6.0}, {7.0, 8.0}};
+    Matrix<double, 2, 2> C{{9.0, 10.0}, {11.0, 12.0}};
+
+    // Chain matrix multiplications: (A * B) * C
+    Matrix<double, 2, 2> result = A * B * C;
+
+    // First compute A * B
+    Matrix<double, 2, 2> AB = A * B;
+    EXPECT_DOUBLE_EQ(AB(0, 0), 19.0);  // 1*5 + 2*7
+    EXPECT_DOUBLE_EQ(AB(0, 1), 22.0);  // 1*6 + 2*8
+    EXPECT_DOUBLE_EQ(AB(1, 0), 43.0);  // 3*5 + 4*7
+    EXPECT_DOUBLE_EQ(AB(1, 1), 50.0);  // 3*6 + 4*8
+
+    // Then (A * B) * C
+    EXPECT_DOUBLE_EQ(result(0, 0), 413.0);  // 19*9 + 22*11
+    EXPECT_DOUBLE_EQ(result(0, 1), 454.0);  // 19*10 + 22*12
+    EXPECT_DOUBLE_EQ(result(1, 0), 937.0);  // 43*9 + 50*11
+    EXPECT_DOUBLE_EQ(result(1, 1), 1030.0); // 43*10 + 50*12
+}
+
+// =============================================================================
+// Mixed Operations Tests
+// =============================================================================
+
+TEST_F(MatrixExprTest, MixedMatrixOperations) {
+    Matrix<double, 3, 3> A, B, C, D;
+
+    // Initialize matrices
+    for (int i = 0; i < 3; ++i) {
+        for (int j = 0; j < 3; ++j) {
+            A(i, j) = i + j + 1;
+            B(i, j) = (i + 1) * (j + 1);
+            C(i, j) = i * j + 1;
+            D(i, j) = 1.0;
+        }
+    }
+
+    // Complex expression: A * B + C * D
+    Matrix<double, 3, 3> result = A * B + C * D;
+
+    // Verify a few key elements
+    Matrix<double, 3, 3> AB = A * B;
+    Matrix<double, 3, 3> CD = C * D;
+
+    for (int i = 0; i < 3; ++i) {
+        for (int j = 0; j < 3; ++j) {
+            EXPECT_DOUBLE_EQ(result(i, j), AB(i, j) + CD(i, j));
+        }
+    }
+}
+
+TEST_F(MatrixExprTest, ScalarMultiplicationInExpression) {
+    Matrix<double, 2, 2> A{{1.0, 2.0}, {3.0, 4.0}};
+    Matrix<double, 2, 2> B{{5.0, 6.0}, {7.0, 8.0}};
+
+    Matrix<double, 2, 2> result = 2.0 * (A + B) / 3.0;
+
+    EXPECT_TRUE(approx_equal(result(0, 0), 4.0));
+    EXPECT_TRUE(approx_equal(result(0, 1), 16.0/3.0));
+    EXPECT_TRUE(approx_equal(result(1, 0), 20.0/3.0));
+    EXPECT_TRUE(approx_equal(result(1, 1), 8.0));
+}
+
+// =============================================================================
+// Transpose Tests
+// =============================================================================
+
+TEST_F(MatrixExprTest, TransposeExpression) {
+    Matrix<double, 2, 3> A{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}};
+
+    auto AT = transpose(A);
+
+    // Check dimensions
+    EXPECT_EQ(AT.rows(), 3u);
+    EXPECT_EQ(AT.cols(), 2u);
+
+    // Check values
+    EXPECT_DOUBLE_EQ(AT(0, 0), 1.0);
+    EXPECT_DOUBLE_EQ(AT(0, 1), 4.0);
+    EXPECT_DOUBLE_EQ(AT(1, 0), 2.0);
+    EXPECT_DOUBLE_EQ(AT(1, 1), 5.0);
+    EXPECT_DOUBLE_EQ(AT(2, 0), 3.0);
+    EXPECT_DOUBLE_EQ(AT(2, 1), 6.0);
+}
+
+TEST_F(MatrixExprTest, TransposeInExpression) {
+    Matrix<double, 3, 2> A{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}};
+    Matrix<double, 3, 2> B{{7.0, 8.0}, {9.0, 10.0}, {11.0, 12.0}};
+
+    // Compute A^T * B (should be 2x2)
+    Matrix<double, 2, 2> result = transpose(A) * B;
+
+    EXPECT_DOUBLE_EQ(result(0, 0), 89.0);   // 1*7 + 3*9 + 5*11
+    EXPECT_DOUBLE_EQ(result(0, 1), 98.0);   // 1*8 + 3*10 + 5*12
+    EXPECT_DOUBLE_EQ(result(1, 0), 116.0);  // 2*7 + 4*9 + 6*11
+    EXPECT_DOUBLE_EQ(result(1, 1), 128.0);  // 2*8 + 4*10 + 6*12
+}
+
+// =============================================================================
+// Unary Operations Tests
+// =============================================================================
+
+TEST_F(MatrixExprTest, NegationInExpression) {
+    Matrix<double, 2, 2> A{{1.0, -2.0}, {3.0, -4.0}};
+    Matrix<double, 2, 2> B{{5.0, 6.0}, {-7.0, 8.0}};
+
+    Matrix<double, 2, 2> result = -A + (-B);
+
+    EXPECT_DOUBLE_EQ(result(0, 0), -6.0);
+    EXPECT_DOUBLE_EQ(result(0, 1), -4.0);
+    EXPECT_DOUBLE_EQ(result(1, 0), 4.0);
+    EXPECT_DOUBLE_EQ(result(1, 1), -4.0);
+}
+
+TEST_F(MatrixExprTest, AbsoluteValueExpression) {
+    Matrix<double, 2, 3> M{{-1.5, 2.3, -4.7}, {0.0, -3.2, 5.1}};
+
+    Matrix<double, 2, 3> result = abs(M);
+
+    EXPECT_DOUBLE_EQ(result(0, 0), 1.5);
+    EXPECT_DOUBLE_EQ(result(0, 1), 2.3);
+    EXPECT_DOUBLE_EQ(result(0, 2), 4.7);
+    EXPECT_DOUBLE_EQ(result(1, 0), 0.0);
+    EXPECT_DOUBLE_EQ(result(1, 1), 3.2);
+    EXPECT_DOUBLE_EQ(result(1, 2), 5.1);
+}
+
+TEST_F(MatrixExprTest, SqrtExpression) {
+    Matrix<double, 2, 2> M{{4.0, 9.0}, {16.0, 25.0}};
+
+    Matrix<double, 2, 2> result = sqrt(M);
+
+    EXPECT_DOUBLE_EQ(result(0, 0), 2.0);
+    EXPECT_DOUBLE_EQ(result(0, 1), 3.0);
+    EXPECT_DOUBLE_EQ(result(1, 0), 4.0);
+    EXPECT_DOUBLE_EQ(result(1, 1), 5.0);
+}
+
+// =============================================================================
+// Element-wise Operations Tests
+// =============================================================================
+
+TEST_F(MatrixExprTest, HadamardProductExpression) {
+    Matrix<double, 2, 3> A{{2.0, 3.0, 4.0}, {5.0, 6.0, 7.0}};
+    Matrix<double, 2, 3> B{{8.0, 9.0, 10.0}, {11.0, 12.0, 13.0}};
+
+    Matrix<double, 2, 3> result = hadamard(A, B);
+
+    EXPECT_DOUBLE_EQ(result(0, 0), 16.0);
+    EXPECT_DOUBLE_EQ(result(0, 1), 27.0);
+    EXPECT_DOUBLE_EQ(result(0, 2), 40.0);
+    EXPECT_DOUBLE_EQ(result(1, 0), 55.0);
+    EXPECT_DOUBLE_EQ(result(1, 1), 72.0);
+    EXPECT_DOUBLE_EQ(result(1, 2), 91.0);
+}
+
+TEST_F(MatrixExprTest, HadamardDivisionExpression) {
+    Matrix<double, 2, 2> A{{10.0, 18.0}, {28.0, 36.0}};
+    Matrix<double, 2, 2> B{{2.0, 3.0}, {4.0, 6.0}};
+
+    Matrix<double, 2, 2> result = hadamard_div(A, B);
+
+    EXPECT_DOUBLE_EQ(result(0, 0), 5.0);
+    EXPECT_DOUBLE_EQ(result(0, 1), 6.0);
+    EXPECT_DOUBLE_EQ(result(1, 0), 7.0);
+    EXPECT_DOUBLE_EQ(result(1, 1), 6.0);
+}
+
+// =============================================================================
+// Norm and Trace Tests
+// =============================================================================
+
+TEST_F(MatrixExprTest, FrobeniusNormOfExpression) {
+    Matrix<double, 2, 2> A{{1.0, 2.0}, {3.0, 4.0}};
+    Matrix<double, 2, 2> B{{2.0, 2.0}, {2.0, 2.0}};
+
+    double norm_sq = frobenius_norm_squared(A - B);
+    double norm = frobenius_norm(A - B);
+
+    // (A - B) = [[-1, 0], [1, 2]]
+    // norm_squared = 1 + 0 + 1 + 4 = 6
+    EXPECT_DOUBLE_EQ(norm_sq, 6.0);
+    EXPECT_DOUBLE_EQ(norm, std::sqrt(6.0));
+}
+
+TEST_F(MatrixExprTest, TraceOfExpression) {
+    Matrix<double, 3, 3> A;
+    Matrix<double, 3, 3> B;
+
+    // Initialize as diagonal matrices
+    for (int i = 0; i < 3; ++i) {
+        for (int j = 0; j < 3; ++j) {
+            A(i, j) = (i == j) ? (i + 1) : 0.0;  // diag(1, 2, 3)
+            B(i, j) = (i == j) ? (i + 4) : 0.0;  // diag(4, 5, 6)
+        }
+    }
+
+    double tr = trace(A + B);
+
+    // trace(A + B) = trace(diag(5, 7, 9)) = 21
+    EXPECT_DOUBLE_EQ(tr, 21.0);
+}
+
+// =============================================================================
+// Type Deduction Tests
+// =============================================================================
+
+TEST_F(MatrixExprTest, TypeDeductionCorrectness) {
+    Matrix<float, 2, 2> Mf{{1.0f, 2.0f}, {3.0f, 4.0f}};
+    Matrix<double, 2, 2> Md{{5.0, 6.0}, {7.0, 8.0}};
+
+    // Float expression
+    auto expr = Mf + Mf;
+    using ExprType = decltype(expr(0, 0));
+    EXPECT_TRUE((std::is_same_v<ExprType, float>));
+
+    // Test that expression evaluates correctly
+    Matrix<float, 2, 2> result = expr;
+    EXPECT_FLOAT_EQ(result(0, 0), 2.0f);
+    EXPECT_FLOAT_EQ(result(1, 1), 8.0f);
+}
+
+// =============================================================================
+// SFINAE and Compile-time Tests
+// =============================================================================
+
+TEST_F(MatrixExprTest, SFINAEConstraints) {
+    // Test that MatrixExpr operators only work with MatrixExpr types
+    Matrix<double, 2, 2> M1{{1.0, 2.0}, {3.0, 4.0}};
+    Matrix<double, 2, 2> M2{{5.0, 6.0}, {7.0, 8.0}};
+
+    // This should compile
+    auto expr = M1 + M2;
+    Matrix<double, 2, 2> result = expr;
+
+    // Verify the constraint checking
+    EXPECT_TRUE((std::is_base_of_v<MatrixExpr<Matrix<double, 2, 2>>, Matrix<double, 2, 2>>));
+}
+
+// =============================================================================
+// Aliasing and Self-Assignment Tests
+// =============================================================================
+
+TEST_F(MatrixExprTest, SelfAssignmentWithExpression) {
+    Matrix<double, 2, 2> A{{1.0, 2.0}, {3.0, 4.0}};
+    Matrix<double, 2, 2> B{{5.0, 6.0}, {7.0, 8.0}};
+
+    // Self-assignment through expression
+    A = A + B;
+
+    EXPECT_DOUBLE_EQ(A(0, 0), 6.0);
+    EXPECT_DOUBLE_EQ(A(0, 1), 8.0);
+    EXPECT_DOUBLE_EQ(A(1, 0), 10.0);
+    EXPECT_DOUBLE_EQ(A(1, 1), 12.0);
+}
+
+TEST_F(MatrixExprTest, AliasingInExpression) {
+    Matrix<double, 2, 2> A{{2.0, 3.0}, {4.0, 5.0}};
+    Matrix<double, 2, 2> B{{1.0, 1.0}, {1.0, 1.0}};
+
+    // A appears on both sides
+    A = B + A;
+
+    EXPECT_DOUBLE_EQ(A(0, 0), 3.0);
+    EXPECT_DOUBLE_EQ(A(0, 1), 4.0);
+    EXPECT_DOUBLE_EQ(A(1, 0), 5.0);
+    EXPECT_DOUBLE_EQ(A(1, 1), 6.0);
+}
+
+// =============================================================================
+// Edge Cases Tests
+// =============================================================================
+
+TEST_F(MatrixExprTest, SingleElementMatrix) {
+    Matrix<double, 1, 1> A{{5.0}};
+    Matrix<double, 1, 1> B{{3.0}};
+
+    Matrix<double, 1, 1> result = A + B - A * 0.5;
+
+    EXPECT_DOUBLE_EQ(result(0, 0), 5.5);
+}
+
+TEST_F(MatrixExprTest, NonSquareMatrixOperations) {
+    Matrix<double, 2, 4> A;
+    Matrix<double, 2, 4> B;
+
+    for (int i = 0; i < 2; ++i) {
+        for (int j = 0; j < 4; ++j) {
+            A(i, j) = i * 4 + j + 1;
+            B(i, j) = (i * 4 + j + 1) * 2;
+        }
+    }
+
+    Matrix<double, 2, 4> result = A + B - A * 0.5;
+
+    for (int i = 0; i < 2; ++i) {
+        for (int j = 0; j < 4; ++j) {
+            double expected = A(i, j) + B(i, j) - A(i, j) * 0.5;
+            EXPECT_DOUBLE_EQ(result(i, j), expected);
+        }
+    }
+}
+
+// =============================================================================
+// Diagonal Matrix Tests
+// =============================================================================
+
+TEST_F(MatrixExprTest, DiagonalMatrixExpression) {
+    Vector<double, 3> v{1.0, 2.0, 3.0};
+
+    auto diag = DiagonalExpr<Vector<double, 3>>(v);
+
+    // Check dimensions
+    EXPECT_EQ(diag.rows(), 3u);
+    EXPECT_EQ(diag.cols(), 3u);
+
+    // Check values
+    EXPECT_DOUBLE_EQ(diag(0, 0), 1.0);
+    EXPECT_DOUBLE_EQ(diag(1, 1), 2.0);
+    EXPECT_DOUBLE_EQ(diag(2, 2), 3.0);
+
+    // Off-diagonal should be zero
+    EXPECT_DOUBLE_EQ(diag(0, 1), 0.0);
+    EXPECT_DOUBLE_EQ(diag(1, 0), 0.0);
+}
+
+TEST_F(MatrixExprTest, DiagonalMatrixInExpression) {
+    Vector<double, 2> v{2.0, 3.0};
+    Matrix<double, 2, 2> A{{1.0, 1.0}, {1.0, 1.0}};
+
+    auto diag = DiagonalExpr<Vector<double, 2>>(v);
+    Matrix<double, 2, 2> result = A + diag;
+
+    EXPECT_DOUBLE_EQ(result(0, 0), 3.0);
+    EXPECT_DOUBLE_EQ(result(0, 1), 1.0);
+    EXPECT_DOUBLE_EQ(result(1, 0), 1.0);
+    EXPECT_DOUBLE_EQ(result(1, 1), 4.0);
+}
+
+// =============================================================================
+// Complex Expression Pattern Tests
+// =============================================================================
+
+TEST_F(MatrixExprTest, ComplexNestedExpression) {
+    Matrix<double, 2, 2> A{{1.0, 2.0}, {3.0, 4.0}};
+    Matrix<double, 2, 2> B{{5.0, 6.0}, {7.0, 8.0}};
+    Matrix<double, 2, 2> C{{9.0, 10.0}, {11.0, 12.0}};
+
+    // Complex expression with multiple operation types
+    Matrix<double, 2, 2> result = 2.0 * abs(A - B) + sqrt(hadamard(C, C)) / 3.0;
+
+    // |A - B| = |[-4, -4], [-4, -4]| = [4, 4], [4, 4]
+    // 2 * [4, 4], [4, 4] = [8, 8], [8, 8]
+    // C * C (element-wise) = [81, 100], [121, 144]
+    // sqrt(C * C) = [9, 10], [11, 12]
+    // sqrt(C * C) / 3 = [3, 10/3], [11/3, 4]
+    // result = [11, 34/3], [35/3, 12]
+
+    EXPECT_DOUBLE_EQ(result(0, 0), 11.0);
+    EXPECT_TRUE(approx_equal(result(0, 1), 34.0/3.0));
+    EXPECT_TRUE(approx_equal(result(1, 0), 35.0/3.0));
+    EXPECT_DOUBLE_EQ(result(1, 1), 12.0);
+}
+
+TEST_F(MatrixExprTest, MatrixVectorMixedExpression) {
+    Matrix<double, 3, 3> A;
+    Vector<double, 3> v{1.0, 2.0, 3.0};
+
+    // Create identity matrix
+    for (int i = 0; i < 3; ++i) {
+        for (int j = 0; j < 3; ++j) {
+            A(i, j) = (i == j) ? 1.0 : 0.0;
+        }
+    }
+
+    // Create diagonal from vector and add to identity
+    auto diag = DiagonalExpr<Vector<double, 3>>(v);
+    Matrix<double, 3, 3> result = A + diag;
+
+    // Result should be diag(2, 3, 4)
+    EXPECT_DOUBLE_EQ(result(0, 0), 2.0);
+    EXPECT_DOUBLE_EQ(result(1, 1), 3.0);
+    EXPECT_DOUBLE_EQ(result(2, 2), 4.0);
+    EXPECT_DOUBLE_EQ(result(0, 1), 0.0);
+    EXPECT_DOUBLE_EQ(result(1, 0), 0.0);
+}
diff --git a/tests/unitTests/FE/Math/test_Vector.cpp b/tests/unitTests/FE/Math/test_Vector.cpp
new file mode 100644
index 000000000..a38a71727
--- /dev/null
+++ b/tests/unitTests/FE/Math/test_Vector.cpp
@@ -0,0 +1,589 @@
+/**
+ * @file test_Vector.cpp
+ * @brief Unit tests for Vector.h - fixed-size vectors with expression templates
+ */
+
+#include <gtest/gtest.h>
+#include "FE/Math/Vector.h"
+#include "FE/Math/VectorExpr.h"
+#include "FE/Math/MathConstants.h"
+#include <limits>
+#include <cmath>
+#include <sstream>
+#include <thread>
+#include <vector>
+
+using namespace svmp::FE::math;
+
+// Test fixture for Vector tests
+class VectorTest : public ::testing::Test {
+protected:
+    static constexpr double tolerance = 1e-14;
+
+    void SetUp() override {}
+    void TearDown() override {}
+
+    // Helper function to check if two values are approximately equal
+    template<typename T>
+    bool approx_equal(T a, T b, T tol = tolerance) {
+        return std::abs(a - b) <= tol;
+    }
+};
+
+// =============================================================================
+// Construction and Initialization Tests
+// =============================================================================
+
+TEST_F(VectorTest, DefaultConstruction) {
+    Vector<double, 3> v;
+    EXPECT_EQ(v[0], 0.0);
+    EXPECT_EQ(v[1], 0.0);
+    EXPECT_EQ(v[2], 0.0);
+
+    Vector<float, 4> vf;
+    for (size_t i = 0; i < 4; ++i) {
+        EXPECT_EQ(vf[i], 0.0f);
+    }
+}
+
+TEST_F(VectorTest, FillConstruction) {
+    Vector<double, 3> v(5.0);
+    EXPECT_EQ(v[0], 5.0);
+    EXPECT_EQ(v[1], 5.0);
+    EXPECT_EQ(v[2], 5.0);
+
+    Vector<int, 10> vi(-3);
+    for (size_t i = 0; i < 10; ++i) {
+        EXPECT_EQ(vi[i], -3);
+    }
+}
+
+TEST_F(VectorTest, InitializerListConstruction) {
+    Vector<double, 3> v{1.0, 2.0, 3.0};
+    EXPECT_EQ(v[0], 1.0);
+    EXPECT_EQ(v[1], 2.0);
+    EXPECT_EQ(v[2], 3.0);
+
+    // Partial initialization
+    Vector<double, 5> v2{1.0, 2.0};
+    EXPECT_EQ(v2[0], 1.0);
+    EXPECT_EQ(v2[1], 2.0);
+    EXPECT_EQ(v2[2], 0.0);
+    EXPECT_EQ(v2[3], 0.0);
+    EXPECT_EQ(v2[4], 0.0);
+}
+
+TEST_F(VectorTest, CopyConstruction) {
+    Vector<double, 3> v1{1.0, 2.0, 3.0};
+    Vector<double, 3> v2(v1);
+
+    EXPECT_EQ(v2[0], 1.0);
+    EXPECT_EQ(v2[1], 2.0);
+    EXPECT_EQ(v2[2], 3.0);
+
+    // Ensure deep copy
+    v2[0] = 10.0;
+    EXPECT_EQ(v1[0], 1.0);
+    EXPECT_EQ(v2[0], 10.0);
+}
+
+TEST_F(VectorTest, MoveConstruction) {
+    Vector<double, 3> v1{1.0, 2.0, 3.0};
+    Vector<double, 3> v2(std::move(v1));
+
+    EXPECT_EQ(v2[0], 1.0);
+    EXPECT_EQ(v2[1], 2.0);
+    EXPECT_EQ(v2[2], 3.0);
+}
+
+// =============================================================================
+// Element Access Tests
+// =============================================================================
+
+TEST_F(VectorTest, ElementAccess) {
+    Vector<double, 3> v{1.0, 2.0, 3.0};
+
+    // Non-const access
+    EXPECT_EQ(v[0], 1.0);
+    EXPECT_EQ(v[1], 2.0);
+    EXPECT_EQ(v[2], 3.0);
+
+    // Modification
+    v[1] = 5.0;
+    EXPECT_EQ(v[1], 5.0);
+
+    // Const access
+    const Vector<double, 3> cv{4.0, 5.0, 6.0};
+    EXPECT_EQ(cv[0], 4.0);
+    EXPECT_EQ(cv[1], 5.0);
+    EXPECT_EQ(cv[2], 6.0);
+}
+
+TEST_F(VectorTest, ElementAccessBounds) {
+    Vector<double, 3> v{1.0, 2.0, 3.0};
+
+    // at() with bounds checking
+    EXPECT_EQ(v.at(0), 1.0);
+    EXPECT_EQ(v.at(1), 2.0);
+    EXPECT_EQ(v.at(2), 3.0);
+
+    // Test out of bounds throws
+    EXPECT_THROW(v.at(3), std::out_of_range);
+    EXPECT_THROW(v.at(100), std::out_of_range);
+}
+
+TEST_F(VectorTest, DataPointerAccess) {
+    Vector<double, 3> v{1.0, 2.0, 3.0};
+
+    double* data = v.data();
+    EXPECT_EQ(data[0], 1.0);
+    EXPECT_EQ(data[1], 2.0);
+    EXPECT_EQ(data[2], 3.0);
+
+    // Const data access
+    const Vector<double, 3> cv{4.0, 5.0, 6.0};
+    const double* cdata = cv.data();
+    EXPECT_EQ(cdata[0], 4.0);
+    EXPECT_EQ(cdata[1], 5.0);
+    EXPECT_EQ(cdata[2], 6.0);
+}
+
+// =============================================================================
+// Arithmetic Operations Tests
+// =============================================================================
+
+TEST_F(VectorTest, Addition) {
+    Vector<double, 3> a{1.0, 2.0, 3.0};
+    Vector<double, 3> b{4.0, 5.0, 6.0};
+
+    Vector<double, 3> c = a + b;
+    EXPECT_EQ(c[0], 5.0);
+    EXPECT_EQ(c[1], 7.0);
+    EXPECT_EQ(c[2], 9.0);
+}
+
+TEST_F(VectorTest, Subtraction) {
+    Vector<double, 3> a{5.0, 7.0, 9.0};
+    Vector<double, 3> b{4.0, 5.0, 6.0};
+
+    Vector<double, 3> c = a - b;
+    EXPECT_EQ(c[0], 1.0);
+    EXPECT_EQ(c[1], 2.0);
+    EXPECT_EQ(c[2], 3.0);
+}
+
+TEST_F(VectorTest, ScalarMultiplication) {
+    Vector<double, 3> a{1.0, 2.0, 3.0};
+
+    // Scalar * Vector
+    Vector<double, 3> b = 2.0 * a;
+    EXPECT_EQ(b[0], 2.0);
+    EXPECT_EQ(b[1], 4.0);
+    EXPECT_EQ(b[2], 6.0);
+
+    // Vector * Scalar
+    Vector<double, 3> c = a * 3.0;
+    EXPECT_EQ(c[0], 3.0);
+    EXPECT_EQ(c[1], 6.0);
+    EXPECT_EQ(c[2], 9.0);
+}
+
+TEST_F(VectorTest, ScalarDivision) {
+    Vector<double, 3> a{2.0, 4.0, 6.0};
+
+    Vector<double, 3> b = a / 2.0;
+    EXPECT_EQ(b[0], 1.0);
+    EXPECT_EQ(b[1], 2.0);
+    EXPECT_EQ(b[2], 3.0);
+}
+
+TEST_F(VectorTest, UnaryNegation) {
+    Vector<double, 3> a{1.0, -2.0, 3.0};
+
+    Vector<double, 3> b = -a;
+    EXPECT_EQ(b[0], -1.0);
+    EXPECT_EQ(b[1], 2.0);
+    EXPECT_EQ(b[2], -3.0);
+}
+
+TEST_F(VectorTest, CompoundAssignment) {
+    Vector<double, 3> a{1.0, 2.0, 3.0};
+    Vector<double, 3> b{4.0, 5.0, 6.0};
+
+    // +=
+    a += b;
+    EXPECT_EQ(a[0], 5.0);
+    EXPECT_EQ(a[1], 7.0);
+    EXPECT_EQ(a[2], 9.0);
+
+    // -=
+    a -= b;
+    EXPECT_EQ(a[0], 1.0);
+    EXPECT_EQ(a[1], 2.0);
+    EXPECT_EQ(a[2], 3.0);
+
+    // *=
+    a *= 2.0;
+    EXPECT_EQ(a[0], 2.0);
+    EXPECT_EQ(a[1], 4.0);
+    EXPECT_EQ(a[2], 6.0);
+
+    // /=
+    a /= 2.0;
+    EXPECT_EQ(a[0], 1.0);
+    EXPECT_EQ(a[1], 2.0);
+    EXPECT_EQ(a[2], 3.0);
+}
+
+// =============================================================================
+// Vector Operations Tests
+// =============================================================================
+
+TEST_F(VectorTest, DotProduct) {
+    Vector<double, 3> a{1.0, 2.0, 3.0};
+    Vector<double, 3> b{4.0, 5.0, 6.0};
+
+    double dot = a.dot(b);
+    EXPECT_EQ(dot, 32.0);  // 1*4 + 2*5 + 3*6 = 4 + 10 + 18 = 32
+
+    // Test commutativity
+    EXPECT_EQ(b.dot(a), dot);
+
+    // Test orthogonal vectors
+    Vector<double, 3> x{1.0, 0.0, 0.0};
+    Vector<double, 3> y{0.0, 1.0, 0.0};
+    EXPECT_EQ(x.dot(y), 0.0);
+}
+
+TEST_F(VectorTest, CrossProduct3D) {
+    Vector<double, 3> x{1.0, 0.0, 0.0};
+    Vector<double, 3> y{0.0, 1.0, 0.0};
+    Vector<double, 3> z{0.0, 0.0, 1.0};
+
+    // Test basis vector cross products
+    Vector<double, 3> xy = x.cross(y);
+    EXPECT_EQ(xy[0], 0.0);
+    EXPECT_EQ(xy[1], 0.0);
+    EXPECT_EQ(xy[2], 1.0);
+
+    Vector<double, 3> yx = y.cross(x);
+    EXPECT_EQ(yx[0], 0.0);
+    EXPECT_EQ(yx[1], 0.0);
+    EXPECT_EQ(yx[2], -1.0);
+
+    // General cross product
+    Vector<double, 3> a{1.0, 2.0, 3.0};
+    Vector<double, 3> b{4.0, 5.0, 6.0};
+    Vector<double, 3> c = a.cross(b);
+
+    EXPECT_EQ(c[0], -3.0);  // 2*6 - 3*5 = 12 - 15 = -3
+    EXPECT_EQ(c[1], 6.0);   // 3*4 - 1*6 = 12 - 6 = 6
+    EXPECT_EQ(c[2], -3.0);  // 1*5 - 2*4 = 5 - 8 = -3
+}
+
+TEST_F(VectorTest, Norm) {
+    Vector<double, 3> v{3.0, 4.0, 0.0};
+    EXPECT_EQ(v.norm(), 5.0);
+
+    Vector<double, 3> unit{1.0, 0.0, 0.0};
+    EXPECT_EQ(unit.norm(), 1.0);
+
+    Vector<double, 3> zero{0.0, 0.0, 0.0};
+    EXPECT_EQ(zero.norm(), 0.0);
+}
+
+TEST_F(VectorTest, NormSquared) {
+    Vector<double, 3> v{3.0, 4.0, 0.0};
+    EXPECT_EQ(v.norm_squared(), 25.0);
+
+    Vector<double, 3> a{1.0, 2.0, 3.0};
+    EXPECT_EQ(a.norm_squared(), 14.0);  // 1 + 4 + 9 = 14
+}
+
+TEST_F(VectorTest, Normalize) {
+    Vector<double, 3> v{3.0, 4.0, 0.0};
+    Vector<double, 3> n = v.normalized();
+
+    EXPECT_NEAR(n[0], 0.6, tolerance);
+    EXPECT_NEAR(n[1], 0.8, tolerance);
+    EXPECT_NEAR(n[2], 0.0, tolerance);
+    EXPECT_NEAR(n.norm(), 1.0, tolerance);
+
+    // In-place normalization
+    v.normalize();
+    EXPECT_NEAR(v[0], 0.6, tolerance);
+    EXPECT_NEAR(v[1], 0.8, tolerance);
+    EXPECT_NEAR(v.norm(), 1.0, tolerance);
+}
+
+// =============================================================================
+// Expression Template Tests
+// =============================================================================
+
+TEST_F(VectorTest, ExpressionTemplatesNoTemporaries) {
+    Vector<double, 3> a{1.0, 2.0, 3.0};
+    Vector<double, 3> b{4.0, 5.0, 6.0};
+    Vector<double, 3> c{7.0, 8.0, 9.0};
+    Vector<double, 3> d{10.0, 11.0, 12.0};
+
+    // Complex expression should create no temporaries
+    Vector<double, 3> result = a + b - c + d;
+
+    EXPECT_EQ(result[0], 8.0);   // 1 + 4 - 7 + 10
+    EXPECT_EQ(result[1], 10.0);  // 2 + 5 - 8 + 11
+    EXPECT_EQ(result[2], 12.0);  // 3 + 6 - 9 + 12
+}
+
+TEST_F(VectorTest, LazyEvaluation) {
+    Vector<double, 3> a{1.0, 2.0, 3.0};
+    Vector<double, 3> b{4.0, 5.0, 6.0};
+
+    // Expression should not be evaluated until assignment
+    auto expr = a + b;  // No computation yet
+
+    Vector<double, 3> result = expr;  // Evaluation happens here
+    EXPECT_EQ(result[0], 5.0);
+    EXPECT_EQ(result[1], 7.0);
+    EXPECT_EQ(result[2], 9.0);
+}
+
+TEST_F(VectorTest, MixedExpressions) {
+    Vector<double, 3> a{1.0, 2.0, 3.0};
+    Vector<double, 3> b{4.0, 5.0, 6.0};
+    double scalar = 2.0;
+
+    // Complex mixed expression
+    Vector<double, 3> result = scalar * (a + b) - a / scalar;
+
+    EXPECT_NEAR(result[0], 9.5, tolerance);   // 2*(1+4) - 1/2
+    EXPECT_NEAR(result[1], 13.0, tolerance);  // 2*(2+5) - 2/2
+    EXPECT_NEAR(result[2], 16.5, tolerance);  // 2*(3+6) - 3/2
+}
+
+// =============================================================================
+// Special Values Tests
+// =============================================================================
+
+TEST_F(VectorTest, ZeroVector) {
+    Vector<double, 3> zero = Vector<double, 3>::zero();
+    EXPECT_EQ(zero[0], 0.0);
+    EXPECT_EQ(zero[1], 0.0);
+    EXPECT_EQ(zero[2], 0.0);
+    EXPECT_EQ(zero.norm(), 0.0);
+}
+
+TEST_F(VectorTest, OnesVector) {
+    Vector<double, 3> ones = Vector<double, 3>::ones();
+    EXPECT_EQ(ones[0], 1.0);
+    EXPECT_EQ(ones[1], 1.0);
+    EXPECT_EQ(ones[2], 1.0);
+}
+
+TEST_F(VectorTest, BasisVectors) {
+    auto e0 = Vector<double, 3>::basis(0);
+    EXPECT_EQ(e0[0], 1.0);
+    EXPECT_EQ(e0[1], 0.0);
+    EXPECT_EQ(e0[2], 0.0);
+
+    auto e1 = Vector<double, 3>::basis(1);
+    EXPECT_EQ(e1[0], 0.0);
+    EXPECT_EQ(e1[1], 1.0);
+    EXPECT_EQ(e1[2], 0.0);
+
+    auto e2 = Vector<double, 3>::basis(2);
+    EXPECT_EQ(e2[0], 0.0);
+    EXPECT_EQ(e2[1], 0.0);
+    EXPECT_EQ(e2[2], 1.0);
+}
+
+// =============================================================================
+// Edge Cases and Error Handling Tests
+// =============================================================================
+
+TEST_F(VectorTest, DivisionByZero) {
+    Vector<double, 3> v{1.0, 2.0, 3.0};
+
+    // Division by zero should produce inf
+    Vector<double, 3> result = v / 0.0;
+    EXPECT_TRUE(std::isinf(result[0]));
+    EXPECT_TRUE(std::isinf(result[1]));
+    EXPECT_TRUE(std::isinf(result[2]));
+}
+
+TEST_F(VectorTest, NormalizeZeroVector) {
+    Vector<double, 3> zero{0.0, 0.0, 0.0};
+
+    // Normalizing zero vector should handle gracefully
+    Vector<double, 3> n = zero.normalized();
+    EXPECT_TRUE(std::isnan(n[0]) || n[0] == 0.0);
+}
+
+TEST_F(VectorTest, ExtremeLargeValues) {
+    double large = 1e308;  // Near double max
+    Vector<double, 3> v{large, large, large};
+
+    // Operations should not overflow
+    Vector<double, 3> half = v / 2.0;
+    EXPECT_FALSE(std::isinf(half[0]));
+    EXPECT_EQ(half[0], large / 2.0);
+}
+
+TEST_F(VectorTest, ExtremeSmallValues) {
+    double tiny = 1e-308;  // Near double min
+    Vector<double, 3> v{tiny, tiny, tiny};
+
+    // Operations should maintain precision
+    Vector<double, 3> doubled = v * 2.0;
+    EXPECT_EQ(doubled[0], tiny * 2.0);
+}
+
+// =============================================================================
+// Numerical Precision Tests
+// =============================================================================
+
+TEST_F(VectorTest, NumericalStability) {
+    // Test Kahan summation for better precision
+    Vector<double, 4> v{1e16, 1.0, -1e16, 1.0};
+    // Computed for future validation - demonstrates numerical precision issues
+    [[maybe_unused]] double sum = v[0] + v[1] + v[2] + v[3];
+
+    // Direct summation might lose precision
+    // But vector operations should maintain it
+    Vector<double, 4> a{1e16, 0.0, -1e16, 0.0};
+    Vector<double, 4> b{0.0, 1.0, 0.0, 1.0};
+    Vector<double, 4> c = a + b;
+
+    EXPECT_EQ(c[0], 1e16);
+    EXPECT_EQ(c[1], 1.0);
+    EXPECT_EQ(c[2], -1e16);
+    EXPECT_EQ(c[3], 1.0);
+}
+
+TEST_F(VectorTest, OrthogonalityPreservation) {
+    // Create nearly orthogonal vectors
+    Vector<double, 3> a{1.0, 1e-15, 0.0};
+    Vector<double, 3> b{0.0, 1.0, 0.0};
+
+    double dot = a.dot(b);
+    EXPECT_NEAR(dot, 1e-15, 1e-16);
+}
+
+// =============================================================================
+// Comparison Operations Tests
+// =============================================================================
+
+TEST_F(VectorTest, Equality) {
+    Vector<double, 3> a{1.0, 2.0, 3.0};
+    Vector<double, 3> b{1.0, 2.0, 3.0};
+    Vector<double, 3> c{1.0, 2.0, 3.1};
+
+    EXPECT_TRUE(a == b);
+    EXPECT_FALSE(a == c);
+    EXPECT_FALSE(a != b);
+    EXPECT_TRUE(a != c);
+}
+
+TEST_F(VectorTest, ApproximateEquality) {
+    Vector<double, 3> a{1.0, 2.0, 3.0};
+    Vector<double, 3> b{1.0 + 1e-15, 2.0 - 1e-15, 3.0 + 1e-15};
+
+    EXPECT_TRUE(a.approx_equal(b, 1e-14));
+    EXPECT_FALSE(a.approx_equal(b, 1e-16));
+}
+
+// =============================================================================
+// Thread Safety Tests
+// =============================================================================
+
+TEST_F(VectorTest, ThreadSafetyReadOnly) {
+    Vector<double, 3> v{1.0, 2.0, 3.0};
+
+    // Multiple threads reading should be safe
+    std::vector<std::thread> threads;
+    std::vector<double> results(10);
+
+    for (int i = 0; i < 10; ++i) {
+        threads.emplace_back([&v, &results, i]() {
+            results[static_cast<std::size_t>(i)] = v.norm();
+        });
+    }
+
+    for (auto& t : threads) {
+        t.join();
+    }
+
+    // All threads should get same result
+    double expected = v.norm();
+    for (double r : results) {
+        EXPECT_EQ(r, expected);
+    }
+}
+
+TEST_F(VectorTest, ThreadSafetyIsolated) {
+    // Each thread works on its own vector
+    std::vector<std::thread> threads;
+    std::vector<Vector<double, 3>> results(10);
+
+    for (int i = 0; i < 10; ++i) {
+        threads.emplace_back([&results, i]() {
+            Vector<double, 3> local{static_cast<double>(i), 0.0, 0.0};
+            results[static_cast<std::size_t>(i)] = local * 2.0;
+        });
+    }
+
+    for (auto& t : threads) {
+        t.join();
+    }
+
+    // Check each thread computed correctly
+    for (int i = 0; i < 10; ++i) {
+        EXPECT_EQ(results[static_cast<std::size_t>(i)][0], 2.0 * i);
+    }
+}
+
+// =============================================================================
+// Memory Alignment Tests
+// =============================================================================
+
+TEST_F(VectorTest, MemoryAlignment) {
+    Vector<double, 3> v;
+
+    // Check that data is properly aligned for SIMD
+    std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(v.data());
+    EXPECT_EQ(addr % 32, 0) << "Vector data should be 32-byte aligned for AVX";
+}
+
+// =============================================================================
+// Utility Function Tests
+// =============================================================================
+
+TEST_F(VectorTest, MinMaxElements) {
+    Vector<double, 5> v{3.0, -1.0, 4.0, 1.0, -2.0};
+
+    EXPECT_EQ(v.min(), -2.0);
+    EXPECT_EQ(v.max(), 4.0);
+    EXPECT_EQ(v.min_index(), 4);
+    EXPECT_EQ(v.max_index(), 2);
+}
+
+TEST_F(VectorTest, Sum) {
+    Vector<double, 4> v{1.0, 2.0, 3.0, 4.0};
+    EXPECT_EQ(v.sum(), 10.0);
+
+    Vector<double, 3> zero{0.0, 0.0, 0.0};
+    EXPECT_EQ(zero.sum(), 0.0);
+}
+
+TEST_F(VectorTest, Mean) {
+    Vector<double, 4> v{1.0, 2.0, 3.0, 4.0};
+    EXPECT_EQ(v.mean(), 2.5);
+}
+
+TEST_F(VectorTest, ToString) {
+    Vector<double, 3> v{1.0, 2.0, 3.0};
+    std::stringstream ss;
+    ss << v;
+
+    std::string expected = "[1, 2, 3]";
+    EXPECT_EQ(ss.str(), expected);
+}
diff --git a/tests/unitTests/FE/Math/test_VectorExpr.cpp b/tests/unitTests/FE/Math/test_VectorExpr.cpp
new file mode 100644
index 000000000..bd6d85d51
--- /dev/null
+++ b/tests/unitTests/FE/Math/test_VectorExpr.cpp
@@ -0,0 +1,409 @@
+/**
+ * @file test_VectorExpr.cpp
+ * @brief Unit tests for VectorExpr.h - vector expression templates
+ */
+
+#include <gtest/gtest.h>
+#include "FE/Math/Vector.h"
+#include "FE/Math/VectorExpr.h"
+#include "FE/Math/MathConstants.h"
+#include <limits>
+#include <cmath>
+#include <memory>
+#include <atomic>
+#include <type_traits>
+
+using namespace svmp::FE::math;
+
+// Test fixture for VectorExpr tests
+class VectorExprTest : public ::testing::Test {
+protected:
+    static constexpr double tolerance = 1e-14;
+
+    // Custom allocator to track memory allocations
+    template<typename T>
+    class TrackingAllocator {
+    public:
+        using value_type = T;
+
+        static std::atomic<size_t> allocations;
+        static std::atomic<size_t> deallocations;
+        static std::atomic<size_t> bytes_allocated;
+
+        TrackingAllocator() = default;
+
+        template<typename U>
+        TrackingAllocator(const TrackingAllocator<U>&) {}
+
+        T* allocate(size_t n) {
+            allocations.fetch_add(1);
+            bytes_allocated.fetch_add(n * sizeof(T));
+            return static_cast<T*>(::operator new(n * sizeof(T)));
+        }
+
+        void deallocate(T* p, size_t n) {
+            deallocations.fetch_add(1);
+            ::operator delete(p);
+        }
+
+        static void reset() {
+            allocations = 0;
+            deallocations = 0;
+            bytes_allocated = 0;
+        }
+    };
+
+    void SetUp() override {
+        TrackingAllocator<double>::reset();
+    }
+
+    void TearDown() override {}
+
+    template<typename T>
+    bool approx_equal(T a, T b, T tol = tolerance) {
+        return std::abs(a - b) <= tol;
+    }
+};
+
+template<typename T>
+std::atomic<size_t> VectorExprTest::TrackingAllocator<T>::allocations{0};
+template<typename T>
+std::atomic<size_t> VectorExprTest::TrackingAllocator<T>::deallocations{0};
+template<typename T>
+std::atomic<size_t> VectorExprTest::TrackingAllocator<T>::bytes_allocated{0};
+
+// =============================================================================
+// Lazy Evaluation Tests
+// =============================================================================
+
+TEST_F(VectorExprTest, LazyEvaluationNoTemporaries) {
+    // Expression templates should not create temporary vectors
+    Vector<double, 3> a{1.0, 2.0, 3.0};
+    Vector<double, 3> b{4.0, 5.0, 6.0};
+    Vector<double, 3> c{7.0, 8.0, 9.0};
+
+    // Build expression without evaluation
+    auto expr = a + b - c;
+
+    // Expression type should not be Vector, but an expression type
+    using ExprType = decltype(expr);
+    EXPECT_FALSE((std::is_same_v<ExprType, Vector<double, 3>>));
+
+    // Now evaluate
+    Vector<double, 3> result = expr;
+    EXPECT_DOUBLE_EQ(result[0], -2.0);
+    EXPECT_DOUBLE_EQ(result[1], -1.0);
+    EXPECT_DOUBLE_EQ(result[2], 0.0);
+}
+
+TEST_F(VectorExprTest, LazyEvaluationAccessPattern) {
+    Vector<double, 4> a{1.0, 2.0, 3.0, 4.0};
+    Vector<double, 4> b{5.0, 6.0, 7.0, 8.0};
+
+    auto expr = a + b;
+
+    // Access individual elements without full evaluation
+    EXPECT_DOUBLE_EQ(expr[0], 6.0);
+    EXPECT_DOUBLE_EQ(expr[2], 10.0);
+
+    // Size should be accessible
+    EXPECT_EQ(expr.size(), 4u);
+}
+
+// =============================================================================
+// Expression Chaining Tests
+// =============================================================================
+
+TEST_F(VectorExprTest, ChainedAdditionSubtraction) {
+    Vector<double, 3> a{1.0, 2.0, 3.0};
+    Vector<double, 3> b{4.0, 5.0, 6.0};
+    Vector<double, 3> c{2.0, 3.0, 4.0};
+    Vector<double, 3> d{1.0, 1.0, 1.0};
+
+    // Chain multiple operations
+    Vector<double, 3> result = a + b - c + d;
+
+    EXPECT_DOUBLE_EQ(result[0], 4.0);
+    EXPECT_DOUBLE_EQ(result[1], 5.0);
+    EXPECT_DOUBLE_EQ(result[2], 6.0);
+}
+
+TEST_F(VectorExprTest, DeepExpressionNesting) {
+    Vector<double, 2> v1{1.0, 2.0};
+    Vector<double, 2> v2{3.0, 4.0};
+    Vector<double, 2> v3{5.0, 6.0};
+    Vector<double, 2> v4{7.0, 8.0};
+    Vector<double, 2> v5{9.0, 10.0};
+
+    // Deep nesting
+    Vector<double, 2> result = ((v1 + v2) - (v3 - v4)) + v5;
+
+    EXPECT_DOUBLE_EQ(result[0], 15.0);
+    EXPECT_DOUBLE_EQ(result[1], 18.0);
+}
+
+// =============================================================================
+// Mixed Operations Tests
+// =============================================================================
+
+TEST_F(VectorExprTest, ScalarMultiplicationInExpression) {
+    Vector<double, 3> a{1.0, 2.0, 3.0};
+    Vector<double, 3> b{4.0, 5.0, 6.0};
+
+    Vector<double, 3> result = 2.0 * (a + b) / 3.0;
+
+    EXPECT_TRUE(approx_equal(result[0], 10.0/3.0));
+    EXPECT_TRUE(approx_equal(result[1], 14.0/3.0));
+    EXPECT_TRUE(approx_equal(result[2], 6.0));
+}
+
+TEST_F(VectorExprTest, MixedScalarVectorOperations) {
+    Vector<double, 4> v{2.0, 4.0, 6.0, 8.0};
+
+    // Complex mixed expression
+    Vector<double, 4> result = 3.0 * v / 2.0 + v * 0.5 - 1.0 * v;
+
+    EXPECT_DOUBLE_EQ(result[0], 2.0);
+    EXPECT_DOUBLE_EQ(result[1], 4.0);
+    EXPECT_DOUBLE_EQ(result[2], 6.0);
+    EXPECT_DOUBLE_EQ(result[3], 8.0);
+}
+
+// =============================================================================
+// Unary Operations Tests
+// =============================================================================
+
+TEST_F(VectorExprTest, NegationInExpression) {
+    Vector<double, 3> a{1.0, -2.0, 3.0};
+    Vector<double, 3> b{4.0, 5.0, -6.0};
+
+    Vector<double, 3> result = -a + (-b);
+
+    EXPECT_DOUBLE_EQ(result[0], -5.0);
+    EXPECT_DOUBLE_EQ(result[1], -3.0);
+    EXPECT_DOUBLE_EQ(result[2], 3.0);
+}
+
+TEST_F(VectorExprTest, AbsoluteValueExpression) {
+    Vector<double, 4> v{-1.5, 2.3, -4.7, 0.0};
+
+    Vector<double, 4> result = abs(v);
+
+    EXPECT_DOUBLE_EQ(result[0], 1.5);
+    EXPECT_DOUBLE_EQ(result[1], 2.3);
+    EXPECT_DOUBLE_EQ(result[2], 4.7);
+    EXPECT_DOUBLE_EQ(result[3], 0.0);
+}
+
+TEST_F(VectorExprTest, SqrtExpression) {
+    Vector<double, 3> v{4.0, 9.0, 16.0};
+
+    Vector<double, 3> result = sqrt(v);
+
+    EXPECT_DOUBLE_EQ(result[0], 2.0);
+    EXPECT_DOUBLE_EQ(result[1], 3.0);
+    EXPECT_DOUBLE_EQ(result[2], 4.0);
+}
+
+// =============================================================================
+// Element-wise Operations Tests
+// =============================================================================
+
+TEST_F(VectorExprTest, HadamardProductExpression) {
+    Vector<double, 3> a{2.0, 3.0, 4.0};
+    Vector<double, 3> b{5.0, 6.0, 7.0};
+
+    Vector<double, 3> result = hadamard(a, b);
+
+    EXPECT_DOUBLE_EQ(result[0], 10.0);
+    EXPECT_DOUBLE_EQ(result[1], 18.0);
+    EXPECT_DOUBLE_EQ(result[2], 28.0);
+}
+
+TEST_F(VectorExprTest, HadamardDivisionExpression) {
+    Vector<double, 3> a{10.0, 18.0, 28.0};
+    Vector<double, 3> b{2.0, 3.0, 4.0};
+
+    Vector<double, 3> result = hadamard_div(a, b);
+
+    EXPECT_DOUBLE_EQ(result[0], 5.0);
+    EXPECT_DOUBLE_EQ(result[1], 6.0);
+    EXPECT_DOUBLE_EQ(result[2], 7.0);
+}
+
+// =============================================================================
+// Dot Product and Norm Tests
+// =============================================================================
+
+TEST_F(VectorExprTest, DotProductOfExpressions) {
+    Vector<double, 3> a{1.0, 2.0, 3.0};
+    Vector<double, 3> b{4.0, 5.0, 6.0};
+    Vector<double, 3> c{2.0, 2.0, 2.0};
+
+    // Dot product of expressions
+    double result = dot(a + b, c);
+
+    EXPECT_DOUBLE_EQ(result, 42.0);
+}
+
+TEST_F(VectorExprTest, NormOfExpression) {
+    Vector<double, 2> a{3.0, 0.0};
+    Vector<double, 2> b{0.0, 4.0};
+
+    double result = norm(a + b);
+
+    EXPECT_DOUBLE_EQ(result, 5.0);  // norm of (3,4) = 5
+}
+
+TEST_F(VectorExprTest, NormalizeExpression) {
+    Vector<double, 3> v{3.0, 0.0, 4.0};
+
+    Vector<double, 3> result = normalize(v);
+
+    EXPECT_DOUBLE_EQ(result[0], 0.6);
+    EXPECT_DOUBLE_EQ(result[1], 0.0);
+    EXPECT_DOUBLE_EQ(result[2], 0.8);
+}
+
+// =============================================================================
+// Type Deduction Tests
+// =============================================================================
+
+TEST_F(VectorExprTest, TypeDeductionCorrectness) {
+    Vector<float, 3> vf{1.0f, 2.0f, 3.0f};
+    Vector<double, 3> vd{4.0, 5.0, 6.0};
+
+    // Mixed type operations should promote to higher precision
+    auto expr = vf + vf;  // float expression
+    using ExprType = decltype(expr[0]);
+    EXPECT_TRUE((std::is_same_v<ExprType, float>));
+
+    // Test that expression evaluates correctly
+    Vector<float, 3> result = expr;
+    EXPECT_FLOAT_EQ(result[0], 2.0f);
+    EXPECT_FLOAT_EQ(result[1], 4.0f);
+    EXPECT_FLOAT_EQ(result[2], 6.0f);
+}
+
+// =============================================================================
+// SFINAE and Compile-time Tests
+// =============================================================================
+
+TEST_F(VectorExprTest, SFINAEConstraints) {
+    // Test that VectorExpr operators only work with VectorExpr types
+    Vector<double, 3> v1{1.0, 2.0, 3.0};
+    Vector<double, 3> v2{4.0, 5.0, 6.0};
+
+    // This should compile
+    auto expr = v1 + v2;
+    Vector<double, 3> result = expr;
+
+    // Verify the constraint checking
+    EXPECT_TRUE((std::is_base_of_v<VectorExpr<Vector<double, 3>>, Vector<double, 3>>));
+}
+
+// =============================================================================
+// Aliasing and Self-Assignment Tests
+// =============================================================================
+
+TEST_F(VectorExprTest, SelfAssignmentWithExpression) {
+    Vector<double, 3> a{1.0, 2.0, 3.0};
+    Vector<double, 3> b{4.0, 5.0, 6.0};
+
+    // Self-assignment through expression
+    a = a + b;
+
+    EXPECT_DOUBLE_EQ(a[0], 5.0);
+    EXPECT_DOUBLE_EQ(a[1], 7.0);
+    EXPECT_DOUBLE_EQ(a[2], 9.0);
+}
+
+TEST_F(VectorExprTest, AliasingInExpression) {
+    Vector<double, 3> a{2.0, 3.0, 4.0};
+    Vector<double, 3> b{1.0, 1.0, 1.0};
+
+    // a appears on both sides
+    a = b + a;
+
+    EXPECT_DOUBLE_EQ(a[0], 3.0);
+    EXPECT_DOUBLE_EQ(a[1], 4.0);
+    EXPECT_DOUBLE_EQ(a[2], 5.0);
+}
+
+// =============================================================================
+// Edge Cases Tests
+// =============================================================================
+
+TEST_F(VectorExprTest, SingleElementVector) {
+    Vector<double, 1> a{5.0};
+    Vector<double, 1> b{3.0};
+
+    Vector<double, 1> result = a + b - a * 0.5;
+
+    EXPECT_DOUBLE_EQ(result[0], 5.5);
+}
+
+TEST_F(VectorExprTest, EmptyExpression) {
+    Vector<double, 3> v{1.0, 2.0, 3.0};
+
+    // Expression that evaluates to identity
+    Vector<double, 3> result = v + v * 0.0;
+
+    EXPECT_DOUBLE_EQ(result[0], 1.0);
+    EXPECT_DOUBLE_EQ(result[1], 2.0);
+    EXPECT_DOUBLE_EQ(result[2], 3.0);
+}
+
+TEST_F(VectorExprTest, LargeVectorExpression) {
+    const size_t N = 100;
+    Vector<double, N> a, b, c;
+
+    for (size_t i = 0; i < N; ++i) {
+        a[i] = static_cast<double>(i);
+        b[i] = static_cast<double>(i * 2);
+        c[i] = static_cast<double>(i * 3);
+    }
+
+    Vector<double, N> result = a + b - c / 2.0;
+
+    for (size_t i = 0; i < N; ++i) {
+        EXPECT_DOUBLE_EQ(result[i], i + 2.0 * i - 1.5 * i);
+    }
+}
+
+// =============================================================================
+// Complex Expression Pattern Tests
+// =============================================================================
+
+TEST_F(VectorExprTest, ComplexNestedExpression) {
+    Vector<double, 3> a{1.0, 2.0, 3.0};
+    Vector<double, 3> b{4.0, 5.0, 6.0};
+    Vector<double, 3> c{7.0, 8.0, 9.0};
+
+    // Complex expression with multiple operation types
+    Vector<double, 3> result = 2.0 * abs(a - b) + sqrt(hadamard(c, c)) / 3.0;
+
+    // Verify each component
+    // |a - b| = |(-3, -3, -3)| = (3, 3, 3)
+    // 2 * (3, 3, 3) = (6, 6, 6)
+    // c * c = (49, 64, 81)
+    // sqrt(c * c) = (7, 8, 9)
+    // sqrt(c * c) / 3 = (7/3, 8/3, 3)
+    // result = (6 + 7/3, 6 + 8/3, 6 + 3) = (25/3, 26/3, 9)
+
+    EXPECT_TRUE(approx_equal(result[0], 25.0/3.0));
+    EXPECT_TRUE(approx_equal(result[1], 26.0/3.0));
+    EXPECT_DOUBLE_EQ(result[2], 9.0);
+}
+
+TEST_F(VectorExprTest, ChainedUnaryOperations) {
+    Vector<double, 4> v{-4.0, -9.0, -16.0, -25.0};
+
+    // Chain of unary operations
+    Vector<double, 4> result = sqrt(abs(-v));
+
+    EXPECT_DOUBLE_EQ(result[0], 2.0);
+    EXPECT_DOUBLE_EQ(result[1], 3.0);
+    EXPECT_DOUBLE_EQ(result[2], 4.0);
+    EXPECT_DOUBLE_EQ(result[3], 5.0);
+}

From dfdeead1edd2813a4a24bdba45fc13b883c6a919 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Fri, 5 Jun 2026 12:44:00 -0700
Subject: [PATCH 02/91] Update FSI HEX8 FE Basis reference results

Regenerate affected FSI and FSI-ustruct HEX8 result_005.vtu references for the FE Basis path with nonzero HEX8 Hessian contributions.

Update the pipe_3d PETSc and Trilinos references to match the base pipe_3d reference, preserving the existing shared-reference pattern across linear algebra variants.
---
 tests/cases/fsi/pipe_3d/result_005.vtu             | 4 ++--
 tests/cases/fsi/pipe_3d_petsc/result_005.vtu       | 4 ++--
 tests/cases/fsi/pipe_3d_trilinos_bj/result_005.vtu | 4 ++--
 tests/cases/fsi/pipe_3d_trilinos_ml/result_005.vtu | 4 ++--
 tests/cases/fsi/pipe_RCR_3d/result_005.vtu         | 4 ++--
 tests/cases/fsi_ustruct/pipe_3d/result_005.vtu     | 4 ++--
 tests/cases/fsi_ustruct/pipe_RCR_3d/result_005.vtu | 4 ++--
 7 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/tests/cases/fsi/pipe_3d/result_005.vtu b/tests/cases/fsi/pipe_3d/result_005.vtu
index b78ea6500..a7ca69daf 100644
--- a/tests/cases/fsi/pipe_3d/result_005.vtu
+++ b/tests/cases/fsi/pipe_3d/result_005.vtu
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:54ac116931be9b2a7d5024de8359f9ea09cae964e9bd34ba949f4bfb9312c8af
-size 210065
+oid sha256:b13d09a343a3fd8d033b0e3ecaf2cd94ce68e2ee8665144f7a53cca201db4266
+size 227356
diff --git a/tests/cases/fsi/pipe_3d_petsc/result_005.vtu b/tests/cases/fsi/pipe_3d_petsc/result_005.vtu
index b78ea6500..a7ca69daf 100644
--- a/tests/cases/fsi/pipe_3d_petsc/result_005.vtu
+++ b/tests/cases/fsi/pipe_3d_petsc/result_005.vtu
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:54ac116931be9b2a7d5024de8359f9ea09cae964e9bd34ba949f4bfb9312c8af
-size 210065
+oid sha256:b13d09a343a3fd8d033b0e3ecaf2cd94ce68e2ee8665144f7a53cca201db4266
+size 227356
diff --git a/tests/cases/fsi/pipe_3d_trilinos_bj/result_005.vtu b/tests/cases/fsi/pipe_3d_trilinos_bj/result_005.vtu
index b78ea6500..a7ca69daf 100644
--- a/tests/cases/fsi/pipe_3d_trilinos_bj/result_005.vtu
+++ b/tests/cases/fsi/pipe_3d_trilinos_bj/result_005.vtu
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:54ac116931be9b2a7d5024de8359f9ea09cae964e9bd34ba949f4bfb9312c8af
-size 210065
+oid sha256:b13d09a343a3fd8d033b0e3ecaf2cd94ce68e2ee8665144f7a53cca201db4266
+size 227356
diff --git a/tests/cases/fsi/pipe_3d_trilinos_ml/result_005.vtu b/tests/cases/fsi/pipe_3d_trilinos_ml/result_005.vtu
index b78ea6500..a7ca69daf 100644
--- a/tests/cases/fsi/pipe_3d_trilinos_ml/result_005.vtu
+++ b/tests/cases/fsi/pipe_3d_trilinos_ml/result_005.vtu
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:54ac116931be9b2a7d5024de8359f9ea09cae964e9bd34ba949f4bfb9312c8af
-size 210065
+oid sha256:b13d09a343a3fd8d033b0e3ecaf2cd94ce68e2ee8665144f7a53cca201db4266
+size 227356
diff --git a/tests/cases/fsi/pipe_RCR_3d/result_005.vtu b/tests/cases/fsi/pipe_RCR_3d/result_005.vtu
index 79eaced8c..6945fd005 100644
--- a/tests/cases/fsi/pipe_RCR_3d/result_005.vtu
+++ b/tests/cases/fsi/pipe_RCR_3d/result_005.vtu
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f194a3c364de0bf1a6cc79ba542306469e151de36275a06564022730c3f2c84c
-size 209865
+oid sha256:25a08e99ae0163800e73ea54720557d742548fe75a0eb6b68461d8bdb366972f
+size 227320
diff --git a/tests/cases/fsi_ustruct/pipe_3d/result_005.vtu b/tests/cases/fsi_ustruct/pipe_3d/result_005.vtu
index c838c9c3f..8b5f73c2a 100644
--- a/tests/cases/fsi_ustruct/pipe_3d/result_005.vtu
+++ b/tests/cases/fsi_ustruct/pipe_3d/result_005.vtu
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:262ffb4d7b644280f15bb2e32c8e5fc5ddade7fa5cabd845c31fe3803e9ef0a0
-size 207864
+oid sha256:16f0f2b2ea6a133f54db03954e76ea7586b0fb56d36e2e350ccd21ebadaf4bfb
+size 228764
diff --git a/tests/cases/fsi_ustruct/pipe_RCR_3d/result_005.vtu b/tests/cases/fsi_ustruct/pipe_RCR_3d/result_005.vtu
index e9e051d73..7d6c64d9b 100644
--- a/tests/cases/fsi_ustruct/pipe_RCR_3d/result_005.vtu
+++ b/tests/cases/fsi_ustruct/pipe_RCR_3d/result_005.vtu
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7dec176a56b610ed6b754f66e532a15ac1563b72c25198f49a0bc53adc6e4552
-size 207628
+oid sha256:5c00d715542a495f37a6ea1cd514cc654d3215360170a06c3af1440b71f7d093
+size 228708

From 8b47802fbcaf83ec07a4636de7ac6e6084db1364 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Sun, 7 Jun 2026 23:21:16 -0700
Subject: [PATCH 03/91] fixing temporary A + B expression in matrix and vector
 objects

---
 Code/Source/solver/FE/Math/MatrixExpr.h | 20 ++++++++++----------
 Code/Source/solver/FE/Math/VectorExpr.h | 12 ++++++------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/Code/Source/solver/FE/Math/MatrixExpr.h b/Code/Source/solver/FE/Math/MatrixExpr.h
index da2f8c8d6..097f35361 100644
--- a/Code/Source/solver/FE/Math/MatrixExpr.h
+++ b/Code/Source/solver/FE/Math/MatrixExpr.h
@@ -82,8 +82,8 @@ class MatrixExpr {
 template<typename LHS, typename RHS, typename Op>
 class MatrixBinaryExpr : public MatrixExpr<MatrixBinaryExpr<LHS, RHS, Op>> {
 private:
-    const LHS& lhs_;
-    const RHS& rhs_;
+    LHS lhs_;
+    RHS rhs_;
     Op op_;
 
 public:
@@ -131,7 +131,7 @@ class MatrixBinaryExpr : public MatrixExpr<MatrixBinaryExpr<LHS, RHS, Op>> {
 template<typename Expr, typename Op>
 class MatrixUnaryExpr : public MatrixExpr<MatrixUnaryExpr<Expr, Op>> {
 private:
-    const Expr& expr_;
+    Expr expr_;
     Op op_;
 
 public:
@@ -178,7 +178,7 @@ class MatrixUnaryExpr : public MatrixExpr<MatrixUnaryExpr<Expr, Op>> {
 template<typename Expr, typename Scalar>
 class MatrixScalarExpr : public MatrixExpr<MatrixScalarExpr<Expr, Scalar>> {
 private:
-    const Expr& expr_;
+    Expr expr_;
     Scalar scalar_;
 
 public:
@@ -225,7 +225,7 @@ class MatrixScalarExpr : public MatrixExpr<MatrixScalarExpr<Expr, Scalar>> {
 template<typename Expr, typename Scalar>
 class MatrixScalarDivExpr : public MatrixExpr<MatrixScalarDivExpr<Expr, Scalar>> {
 private:
-    const Expr& expr_;
+    Expr expr_;
     Scalar scalar_;
 
 public:
@@ -274,8 +274,8 @@ class MatrixScalarDivExpr : public MatrixExpr<MatrixScalarDivExpr<Expr, Scalar>>
 template<typename LHS, typename RHS>
 class MatrixMulExpr : public MatrixExpr<MatrixMulExpr<LHS, RHS>> {
 private:
-    const LHS& lhs_;
-    const RHS& rhs_;
+    LHS lhs_;
+    RHS rhs_;
 
 public:
     /**
@@ -326,7 +326,7 @@ class MatrixMulExpr : public MatrixExpr<MatrixMulExpr<LHS, RHS>> {
 template<typename Expr>
 class TransposeExpr : public MatrixExpr<TransposeExpr<Expr>> {
 private:
-    const Expr& expr_;
+    Expr expr_;
 
 public:
     /**
@@ -370,7 +370,7 @@ class TransposeExpr : public MatrixExpr<TransposeExpr<Expr>> {
 template<typename VecExpr>
 class DiagonalExpr : public MatrixExpr<DiagonalExpr<VecExpr>> {
 private:
-    const VecExpr& vec_;
+    VecExpr vec_;
     std::size_t n_;
 
 public:
@@ -623,4 +623,4 @@ constexpr auto trace(const MatrixExpr<Expr>& expr) {
 } // namespace FE
 } // namespace svmp
 
-#endif // SVMP_FE_MATH_MATRIX_EXPR_H
\ No newline at end of file
+#endif // SVMP_FE_MATH_MATRIX_EXPR_H
diff --git a/Code/Source/solver/FE/Math/VectorExpr.h b/Code/Source/solver/FE/Math/VectorExpr.h
index 8b9c8e382..627d2fd88 100644
--- a/Code/Source/solver/FE/Math/VectorExpr.h
+++ b/Code/Source/solver/FE/Math/VectorExpr.h
@@ -72,8 +72,8 @@ class VectorExpr {
 template<typename LHS, typename RHS, typename Op>
 class VectorBinaryExpr : public VectorExpr<VectorBinaryExpr<LHS, RHS, Op>> {
 private:
-    const LHS& lhs_;
-    const RHS& rhs_;
+    LHS lhs_;
+    RHS rhs_;
     Op op_;
 
 public:
@@ -112,7 +112,7 @@ class VectorBinaryExpr : public VectorExpr<VectorBinaryExpr<LHS, RHS, Op>> {
 template<typename Expr, typename Op>
 class VectorUnaryExpr : public VectorExpr<VectorUnaryExpr<Expr, Op>> {
 private:
-    const Expr& expr_;
+    Expr expr_;
     Op op_;
 
 public:
@@ -150,7 +150,7 @@ class VectorUnaryExpr : public VectorExpr<VectorUnaryExpr<Expr, Op>> {
 template<typename Expr, typename Scalar>
 class VectorScalarExpr : public VectorExpr<VectorScalarExpr<Expr, Scalar>> {
 private:
-    const Expr& expr_;
+    Expr expr_;
     Scalar scalar_;
 
 public:
@@ -188,7 +188,7 @@ class VectorScalarExpr : public VectorExpr<VectorScalarExpr<Expr, Scalar>> {
 template<typename Expr, typename Scalar>
 class VectorScalarDivExpr : public VectorExpr<VectorScalarDivExpr<Expr, Scalar>> {
 private:
-    const Expr& expr_;
+    Expr expr_;
     Scalar scalar_;
 
 public:
@@ -415,4 +415,4 @@ constexpr auto normalize(const VectorExpr<Expr>& expr) {
 } // namespace FE
 } // namespace svmp
 
-#endif // SVMP_FE_MATH_VECTOR_EXPR_H
\ No newline at end of file
+#endif // SVMP_FE_MATH_VECTOR_EXPR_H

From 4d6baaa57f809a9ee4d6261069a84aec6efc9806 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Mon, 8 Jun 2026 00:41:06 -0700
Subject: [PATCH 04/91] fixing fetch content for google tests

---
 Code/Source/solver/CMakeLists.txt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Code/Source/solver/CMakeLists.txt b/Code/Source/solver/CMakeLists.txt
index e42391862..4f317cf79 100644
--- a/Code/Source/solver/CMakeLists.txt
+++ b/Code/Source/solver/CMakeLists.txt
@@ -348,11 +348,11 @@ if(ENABLE_UNIT_TEST)
 
   # install Google Test
   #if(NOT TARGET gtest_main AND NOT TARGET gtest)
-  include(FetchContent)
   FetchContent_Declare(
-    googletest
-    URL https://github.com/google/googletest/archive/refs/heads/main.zip
-    DOWNLOAD_EXTRACT_TIMESTAMP TRUE
+          googletest
+          GIT_REPOSITORY https://github.com/google/googletest.git
+          GIT_TAG v1.17.0
+          DOWNLOAD_EXTRACT_TIMESTAMP TRUE
   )
   FetchContent_MakeAvailable(googletest)
   #endif()

From 81cad5461e14d754b42cd44b89d2abba10993d71 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Mon, 8 Jun 2026 00:51:47 -0700
Subject: [PATCH 05/91] adding fetch content to include for enabled unit tests

---
 Code/Source/solver/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Code/Source/solver/CMakeLists.txt b/Code/Source/solver/CMakeLists.txt
index 4f317cf79..1adc6be78 100644
--- a/Code/Source/solver/CMakeLists.txt
+++ b/Code/Source/solver/CMakeLists.txt
@@ -345,7 +345,7 @@ if(ENABLE_UNIT_TEST)
 
   # link pthread on ubuntu20
   find_package(Threads REQUIRED)
-
+  include(FetchContent)
   # install Google Test
   #if(NOT TARGET gtest_main AND NOT TARGET gtest)
   FetchContent_Declare(

From 004e678ca0e830d5a54f93b2a468fe347bf0456c Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Mon, 8 Jun 2026 12:20:53 -0700
Subject: [PATCH 06/91] removing basis optimizations, caching, pyramid support,
 manual/static reference tables and related unit tests

---
 Code/Source/solver/CMakeLists.txt             |    8 +
 Code/Source/solver/FE/Basis/BasisCache.cpp    |  309 -
 Code/Source/solver/FE/Basis/BasisCache.h      |  456 -
 Code/Source/solver/FE/Basis/BasisFactory.cpp  |   81 +-
 Code/Source/solver/FE/Basis/BasisFactory.h    |    6 -
 Code/Source/solver/FE/Basis/BasisFunction.cpp |  262 +-
 Code/Source/solver/FE/Basis/BasisFunction.h   |  367 +-
 Code/Source/solver/FE/Basis/BasisTolerance.h  |   52 -
 Code/Source/solver/FE/Basis/BasisTraits.h     |   55 +-
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp | 8661 +----------------
 Code/Source/solver/FE/Basis/LagrangeBasis.h   |  123 +-
 .../solver/FE/Basis/LagrangeBasisFast.h       | 1378 ---
 .../solver/FE/Basis/LagrangeBasisPyramid.cpp  | 2069 ----
 .../solver/FE/Basis/LagrangeBasisPyramid.h    |   67 -
 .../solver/FE/Basis/LagrangeBasisSimplex.cpp  | 2457 -----
 .../solver/FE/Basis/LagrangeBasisSimplex.h    |   78 -
 .../solver/FE/Basis/LagrangeBasisUtility.h    |   25 -
 .../FE/Basis/NodeOrderingConventions.cpp      |  580 +-
 .../solver/FE/Basis/NodeOrderingConventions.h |  508 +-
 .../solver/FE/Basis/PyramidModalBasis.h       |  265 -
 .../solver/FE/Basis/SerendipityBasis.cpp      |   74 +-
 .../Source/solver/FE/Basis/SerendipityBasis.h |    7 -
 Code/Source/solver/FE/Basis/VectorBasis.h     |  255 -
 .../FE/Basis/VectorBasisEvaluationHelpers.cpp |  593 --
 .../FE/Basis/VectorBasisEvaluationHelpers.h   |  751 --
 .../FE/Basis/VectorBasisModalPolynomial.h     |   77 -
 Code/Source/solver/FE/Common/Alignment.h      |   23 -
 Code/Source/solver/FE/Common/Types.h          |    9 +-
 Code/Source/solver/FE/Math/Matrix.h           |    2 +-
 Code/Source/solver/FE/Math/Vector.h           |    2 +-
 .../solver/FE/Quadrature/QuadratureRule.h     |  237 -
 Code/Source/solver/Timer.h                    |   21 +-
 Code/Source/solver/load_msh.cpp               |    2 -
 Code/Source/solver/utils.cpp                  |   14 +-
 .../eigen3/unsupported/Eigen/CXX11/Tensor     |    2 +
 .../FE/Basis/test_BasisCacheFactory.cpp       |  256 -
 .../FE/Basis/test_BasisErrorPaths.cpp         |   60 +-
 .../unitTests/FE/Basis/test_BasisHessians.cpp |   32 -
 .../FE/Basis/test_ConstexprBasis.cpp          |  135 +-
 ...ePyramid.cpp => test_HigherOrderWedge.cpp} |   66 +-
 .../unitTests/FE/Basis/test_LagrangeBasis.cpp | 3198 +-----
 .../FE/Basis/test_SerendipityTensorModal.cpp  |   12 +-
 tests/unitTests/test_common.h                 |    3 +-
 43 files changed, 1060 insertions(+), 22578 deletions(-)
 delete mode 100644 Code/Source/solver/FE/Basis/BasisCache.cpp
 delete mode 100644 Code/Source/solver/FE/Basis/BasisCache.h
 delete mode 100644 Code/Source/solver/FE/Basis/BasisTolerance.h
 delete mode 100644 Code/Source/solver/FE/Basis/LagrangeBasisFast.h
 delete mode 100644 Code/Source/solver/FE/Basis/LagrangeBasisPyramid.cpp
 delete mode 100644 Code/Source/solver/FE/Basis/LagrangeBasisPyramid.h
 delete mode 100644 Code/Source/solver/FE/Basis/LagrangeBasisSimplex.cpp
 delete mode 100644 Code/Source/solver/FE/Basis/LagrangeBasisSimplex.h
 delete mode 100644 Code/Source/solver/FE/Basis/LagrangeBasisUtility.h
 delete mode 100644 Code/Source/solver/FE/Basis/PyramidModalBasis.h
 delete mode 100644 Code/Source/solver/FE/Basis/VectorBasis.h
 delete mode 100644 Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.cpp
 delete mode 100644 Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.h
 delete mode 100644 Code/Source/solver/FE/Basis/VectorBasisModalPolynomial.h
 delete mode 100644 Code/Source/solver/FE/Common/Alignment.h
 delete mode 100644 Code/Source/solver/FE/Quadrature/QuadratureRule.h
 delete mode 100644 tests/unitTests/FE/Basis/test_BasisCacheFactory.cpp
 rename tests/unitTests/FE/Basis/{test_HigherOrderWedgePyramid.cpp => test_HigherOrderWedge.cpp} (64%)

diff --git a/Code/Source/solver/CMakeLists.txt b/Code/Source/solver/CMakeLists.txt
index 1adc6be78..bdebc4a52 100644
--- a/Code/Source/solver/CMakeLists.txt
+++ b/Code/Source/solver/CMakeLists.txt
@@ -355,6 +355,14 @@ if(ENABLE_UNIT_TEST)
           DOWNLOAD_EXTRACT_TIMESTAMP TRUE
   )
   FetchContent_MakeAvailable(googletest)
+
+  if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_STANDARD GREATER_EQUAL 20)
+    foreach(GTEST_TARGET gtest gtest_main gmock gmock_main)
+      if(TARGET ${GTEST_TARGET})
+        target_compile_options(${GTEST_TARGET} PRIVATE -std=gnu++17)
+      endif()
+    endforeach()
+  endif()
   #endif()
 
   enable_testing()
diff --git a/Code/Source/solver/FE/Basis/BasisCache.cpp b/Code/Source/solver/FE/Basis/BasisCache.cpp
deleted file mode 100644
index 6d8a4ede3..000000000
--- a/Code/Source/solver/FE/Basis/BasisCache.cpp
+++ /dev/null
@@ -1,309 +0,0 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
-
-#include "BasisCache.h"
-#include <utility>
-
-namespace svmp {
-namespace FE {
-namespace basis {
-
-namespace {
-
-QuadratureCacheKey make_quadrature_cache_key(const quadrature::QuadratureRule& quad) noexcept {
-    const auto fingerprint = quad.point_fingerprint();
-    return QuadratureCacheKey{fingerprint.dimension,
-                              fingerprint.num_points,
-                              fingerprint.points_hash_a,
-                              fingerprint.points_hash_b};
-}
-
-void mix_hash_word(std::uint64_t word,
-                   std::uint64_t& hash_a,
-                   std::uint64_t& hash_b) noexcept {
-    hash_a ^= word + 0x9e3779b97f4a7c15ULL + (hash_a << 6u) + (hash_a >> 2u);
-    hash_b ^= (word + 0xbf58476d1ce4e5b9ULL) + (hash_b << 7u) + (hash_b >> 3u);
-}
-
-std::pair<std::uint64_t, std::uint64_t>
-identity_fingerprint(const std::string& identity) noexcept {
-    std::uint64_t hash_a = 0xa4093822299f31d0ULL;
-    std::uint64_t hash_b = 0x082efa98ec4e6c89ULL;
-    mix_hash_word(static_cast<std::uint64_t>(identity.size()), hash_a, hash_b);
-    for (const char c : identity) {
-        mix_hash_word(static_cast<std::uint64_t>(static_cast<unsigned char>(c)), hash_a, hash_b);
-    }
-    return {hash_a, hash_b};
-}
-
-BasisCacheKey make_basis_cache_key(const BasisFunction& basis,
-                                   const quadrature::QuadratureRule& quad,
-                                   bool gradients,
-                                   bool hessians) {
-    StructuralBasisKey structural_key{
-        basis.basis_type(),
-        basis.element_type(),
-        basis.dimension(),
-        basis.order(),
-        basis.size(),
-        basis.is_vector_valued(),
-        make_quadrature_cache_key(quad),
-        gradients,
-        hessians
-    };
-
-    BasisCacheKey key;
-    const bool uses_basis_identity = !basis.cache_identity_is_structural();
-    if (!uses_basis_identity) {
-        key.value = structural_key;
-        return key;
-    }
-
-    std::vector<std::uint64_t> basis_identity_words;
-    const bool uses_structured_identity = basis.cache_identity_words(basis_identity_words);
-    if (!uses_structured_identity) {
-        basis_identity_words.clear();
-    }
-    const std::string basis_identity =
-        uses_structured_identity ? std::string{} : basis.cache_identity();
-    BasisIdentityFingerprint cached_identity_hash{};
-    const bool has_cached_identity_hash =
-        uses_structured_identity &&
-        basis.cache_identity_fingerprint(cached_identity_hash.hash_a,
-                                         cached_identity_hash.hash_b);
-    const auto identity_hash = uses_structured_identity
-        ? has_cached_identity_hash
-              ? std::pair<std::uint64_t, std::uint64_t>{
-                    cached_identity_hash.hash_a,
-                    cached_identity_hash.hash_b}
-              : [&basis_identity_words] {
-                    const auto fingerprint =
-                        compute_basis_identity_fingerprint(basis_identity_words);
-                    return std::pair<std::uint64_t, std::uint64_t>{
-                        fingerprint.hash_a,
-                        fingerprint.hash_b};
-                }()
-        : identity_fingerprint(basis_identity);
-    key.value = ParameterizedBasisKey{
-        structural_key,
-        uses_structured_identity,
-        identity_hash.first,
-        identity_hash.second,
-        std::move(basis_identity_words),
-        basis_identity
-    };
-    return key;
-}
-
-} // namespace
-
-BasisCache& BasisCache::instance() {
-    static BasisCache cache;
-    return cache;
-}
-
-const BasisCacheEntry& BasisCache::get_or_compute(
-    const BasisFunction& basis,
-    const quadrature::QuadratureRule& quad,
-    bool gradients,
-    bool hessians) {
-    return *get_or_compute_shared(basis, quad, gradients, hessians);
-}
-
-std::shared_ptr<const BasisCacheEntry> BasisCache::get_or_compute_shared(
-    const BasisFunction& basis,
-    const quadrature::QuadratureRule& quad,
-    bool gradients,
-    bool hessians) {
-    const BasisCacheKey key = make_basis_cache_key(basis, quad, gradients, hessians);
-
-    // Warm path: shared (reader) lock allows concurrent cache hits.
-    {
-        std::shared_lock<std::shared_mutex> read_lock(mutex_);
-        auto it = slots_.find(key);
-        if (it != slots_.end() && it->second.entry) {
-            return it->second.entry;
-        }
-    }
-
-    std::shared_ptr<InFlightComputation> in_flight;
-    bool owner = false;
-    {
-        std::unique_lock<std::shared_mutex> write_lock(mutex_);
-        auto& slot = slots_[key];
-        if (slot.entry) {
-            return slot.entry;
-        }
-
-        if (!slot.pending) {
-            in_flight = std::make_shared<InFlightComputation>();
-            slot.pending = in_flight;
-            owner = true;
-        } else {
-            in_flight = slot.pending;
-        }
-    }
-
-    if (!owner) {
-        std::unique_lock<std::mutex> wait_lock(in_flight->mutex);
-        in_flight->ready_cv.wait(wait_lock, [&in_flight] { return in_flight->ready; });
-        if (in_flight->exception) {
-            std::rethrow_exception(in_flight->exception);
-        }
-        return in_flight->entry;
-    }
-
-    try {
-        auto entry = std::make_shared<BasisCacheEntry>(compute(basis, quad, gradients, hessians));
-        {
-            std::unique_lock<std::shared_mutex> write_lock(mutex_);
-            auto slot_it = slots_.find(key);
-            if (slot_it == slots_.end()) {
-                slot_it = slots_.emplace(key, CacheSlot{}).first;
-            }
-            auto& slot = slot_it->second;
-            if (slot.entry) {
-                entry = slot.entry;
-            } else {
-                slot.entry = entry;
-            }
-            if (slot.pending == in_flight) {
-                slot.pending.reset();
-            }
-        }
-        {
-            std::lock_guard<std::mutex> ready_lock(in_flight->mutex);
-            in_flight->entry = entry;
-            in_flight->ready = true;
-        }
-        in_flight->ready_cv.notify_all();
-        return entry;
-    } catch (...) {
-        {
-            std::lock_guard<std::mutex> ready_lock(in_flight->mutex);
-            in_flight->exception = std::current_exception();
-            in_flight->ready = true;
-        }
-        {
-            std::unique_lock<std::shared_mutex> write_lock(mutex_);
-            auto slot_it = slots_.find(key);
-            if (slot_it != slots_.end() && slot_it->second.pending == in_flight) {
-                slot_it->second.pending.reset();
-                if (!slot_it->second.entry) {
-                    slots_.erase(slot_it);
-                }
-            }
-        }
-        in_flight->ready_cv.notify_all();
-        throw;
-    }
-}
-
-const BasisCacheEntry& BasisCache::prewarm(
-    const BasisFunction& basis,
-    const quadrature::QuadratureRule& quad,
-    bool gradients,
-    bool hessians) {
-    return get_or_compute(basis, quad, gradients, hessians);
-}
-
-BasisCacheHandle BasisCache::prewarm_handle(
-    const BasisFunction& basis,
-    const quadrature::QuadratureRule& quad,
-    bool gradients,
-    bool hessians) {
-    return BasisCacheHandle(get_or_compute_shared(basis, quad, gradients, hessians));
-}
-
-BasisCacheEntry BasisCache::compute_uncached(
-    const BasisFunction& basis,
-    const quadrature::QuadratureRule& quad,
-    bool gradients,
-    bool hessians) const {
-    return compute(basis, quad, gradients, hessians);
-}
-
-void BasisCache::clear() {
-    std::unique_lock<std::shared_mutex> lock(mutex_);
-    for (auto it = slots_.begin(); it != slots_.end();) {
-        if (it->second.pending) {
-            it->second.entry.reset();
-            ++it;
-        } else {
-            it = slots_.erase(it);
-        }
-    }
-}
-
-std::size_t BasisCache::size() const {
-    std::shared_lock<std::shared_mutex> lock(mutex_);
-    std::size_t completed = 0;
-    for (const auto& [key, slot] : slots_) {
-        (void)key;
-        if (slot.entry) {
-            ++completed;
-        }
-    }
-    return completed;
-}
-
-BasisCacheEntry BasisCache::compute(const BasisFunction& basis,
-                                    const quadrature::QuadratureRule& quad,
-                                    bool gradients,
-                                    bool hessians) const {
-    BasisCacheEntry entry;
-    const auto& points = quad.points();
-    entry.num_qpts = points.size();
-    entry.num_dofs = basis.size();
-
-    const bool vector_basis = basis.is_vector_valued();
-    if (!vector_basis) {
-        entry.scalar_values.assign(entry.num_dofs * entry.num_qpts, Real(0));
-        if (gradients) {
-            entry.gradients.assign(entry.num_dofs * 3u * entry.num_qpts, Real(0));
-        }
-        if (hessians) {
-            entry.hessians.assign(entry.num_dofs * 9u * entry.num_qpts, Real(0));
-        }
-    } else {
-        entry.vector_values_xyz.assign(entry.num_dofs * 3u * entry.num_qpts, Real(0));
-        if (gradients && basis.supports_vector_jacobians()) {
-            entry.vector_jacobians.assign(entry.num_dofs * 9u * entry.num_qpts, Real(0));
-        }
-        if (gradients && basis.supports_curl()) {
-            entry.vector_curls_xyz.assign(entry.num_dofs * 3u * entry.num_qpts, Real(0));
-        }
-        if (gradients && basis.supports_divergence()) {
-            entry.vector_divergence.assign(entry.num_dofs * entry.num_qpts, Real(0));
-        }
-    }
-
-    if (vector_basis) {
-        if (entry.num_dofs > 0 && entry.num_qpts > 0) {
-            basis.evaluate_vector_at_quadrature_points(
-                points,
-                entry.vector_values_xyz.data(),
-                entry.vector_jacobians.empty() ? nullptr : entry.vector_jacobians.data(),
-                entry.vector_curls_xyz.empty() ? nullptr : entry.vector_curls_xyz.data(),
-                entry.vector_divergence.empty() ? nullptr : entry.vector_divergence.data());
-        }
-        return entry;
-    }
-
-    if (entry.num_dofs > 0 && entry.num_qpts > 0) {
-        basis.fill_scalar_cache_entry(points,
-                                      entry.num_qpts,
-                                      entry.scalar_values.data(),
-                                      gradients ? entry.gradients.data() : nullptr,
-                                      hessians ? entry.hessians.data() : nullptr);
-    }
-
-    return entry;
-}
-} // namespace basis
-} // namespace FE
-} // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/BasisCache.h b/Code/Source/solver/FE/Basis/BasisCache.h
deleted file mode 100644
index a84c0e87a..000000000
--- a/Code/Source/solver/FE/Basis/BasisCache.h
+++ /dev/null
@@ -1,456 +0,0 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
-
-#ifndef SVMP_FE_BASIS_BASISCACHE_H
-#define SVMP_FE_BASIS_BASISCACHE_H
-
-/**
- * @file BasisCache.h
- * @brief Cache for basis evaluations at quadrature points
- */
-
-#include "BasisFunction.h"
-#include "Quadrature/QuadratureRule.h"
-#include <cstddef>
-#include <condition_variable>
-#include <exception>
-#include <functional>
-#include <memory>
-#include <mutex>
-#include <cstdint>
-#include <shared_mutex>
-#include <span>
-#include <string>
-#include <type_traits>
-#include <unordered_map>
-#include <utility>
-#include <variant>
-#include <vector>
-
-namespace svmp {
-namespace FE {
-namespace basis {
-
-struct QuadratureCacheKey {
-    int dimension{0};
-    std::size_t num_points{0};
-    // Quadrature coordinates are intentionally fingerprinted from their exact
-    // Real bit patterns. Values such as -0.0 and +0.0 therefore produce
-    // distinct cache keys unless a future API explicitly normalizes them. The
-    // key intentionally ignores weights and rule class because basis values only
-    // depend on reference coordinates; bit-identical point sets share entries.
-    std::uint64_t points_hash_a{0};
-    std::uint64_t points_hash_b{0};
-
-    bool operator==(const QuadratureCacheKey& other) const noexcept {
-        return dimension == other.dimension &&
-               num_points == other.num_points &&
-               points_hash_a == other.points_hash_a &&
-               points_hash_b == other.points_hash_b;
-    }
-};
-
-struct StructuralBasisKey {
-    BasisType basis_type{BasisType::Custom};
-    ElementType element_type{ElementType::Unknown};
-    int dimension{0};
-    int order{0};
-    std::size_t num_dofs{0};
-    bool vector_valued{false};
-    QuadratureCacheKey quadrature;
-    bool with_gradients{false};
-    bool with_hessians{false};
-
-    bool operator==(const StructuralBasisKey& other) const noexcept {
-        return basis_type == other.basis_type &&
-               element_type == other.element_type &&
-               dimension == other.dimension &&
-               order == other.order &&
-               num_dofs == other.num_dofs &&
-               vector_valued == other.vector_valued &&
-               quadrature == other.quadrature &&
-               with_gradients == other.with_gradients &&
-               with_hessians == other.with_hessians;
-    }
-};
-
-struct ParameterizedBasisKey {
-    StructuralBasisKey structural;
-    bool uses_structured_identity{false};
-    std::uint64_t identity_hash_a{0};
-    std::uint64_t identity_hash_b{0};
-    std::vector<std::uint64_t> basis_identity_words;
-    std::string basis_identity;
-
-    bool operator==(const ParameterizedBasisKey& other) const noexcept {
-        return structural == other.structural &&
-               uses_structured_identity == other.uses_structured_identity &&
-               identity_hash_a == other.identity_hash_a &&
-               identity_hash_b == other.identity_hash_b &&
-               basis_identity_words == other.basis_identity_words &&
-               basis_identity == other.basis_identity;
-    }
-};
-
-struct BasisCacheKey {
-    std::variant<StructuralBasisKey, ParameterizedBasisKey> value;
-
-    bool operator==(const BasisCacheKey& other) const noexcept {
-        return value == other.value;
-    }
-};
-
-struct BasisCacheKeyHash {
-    std::size_t operator()(const BasisCacheKey& key) const noexcept {
-        std::size_t seed = 0;
-        auto combine = [&seed](std::size_t value) noexcept {
-            seed ^= value + 0x9e3779b97f4a7c15ULL + (seed << 6u) + (seed >> 2u);
-        };
-
-        auto hash_structural = [&](const StructuralBasisKey& structural) noexcept {
-            combine(std::hash<int>()(structural.quadrature.dimension));
-            combine(std::hash<std::size_t>()(structural.quadrature.num_points));
-            combine(std::hash<std::uint64_t>()(structural.quadrature.points_hash_a));
-            combine(std::hash<std::uint64_t>()(structural.quadrature.points_hash_b));
-            combine(std::hash<int>()(static_cast<int>(structural.basis_type)));
-            combine(std::hash<int>()(static_cast<int>(structural.element_type)));
-            combine(std::hash<int>()(structural.dimension));
-            combine(std::hash<int>()(structural.order));
-            combine(std::hash<std::size_t>()(structural.num_dofs));
-            unsigned flags = 0u;
-            flags |= structural.vector_valued ? 1u : 0u;
-            flags |= structural.with_gradients ? 2u : 0u;
-            flags |= structural.with_hessians ? 4u : 0u;
-            combine(std::hash<unsigned>()(flags));
-        };
-
-        std::visit([&](const auto& active_key) {
-            using ActiveKey = std::decay_t<decltype(active_key)>;
-            if constexpr (std::is_same_v<ActiveKey, StructuralBasisKey>) {
-                combine(0x5354525543544b45ULL);
-                hash_structural(active_key);
-            } else {
-                combine(0x504152414d4b4559ULL);
-                hash_structural(active_key.structural);
-                combine(active_key.uses_structured_identity ? 1u : 0u);
-                combine(std::hash<std::uint64_t>()(active_key.identity_hash_a));
-                combine(std::hash<std::uint64_t>()(active_key.identity_hash_b));
-            }
-        }, key.value);
-        return seed;
-    }
-};
-
-struct BasisCacheEntry {
-    std::size_t num_qpts{0};
-    std::size_t num_dofs{0};
-    // Scalar basis values in dof-major SoA layout: [dof * num_qpts + qp].
-    std::vector<Real> scalar_values;
-    // Scalar reference gradients in dof/component/qpt SoA layout:
-    // [(dof * 3 + component) * num_qpts + qp].
-    std::vector<Real> gradients;
-    // Scalar reference Hessians in dof/component/qpt SoA layout:
-    // [(dof * 9 + row * 3 + col) * num_qpts + qp].
-    std::vector<Real> hessians;
-
-    // Vector basis values in dof/component/qpt SoA layout:
-    // [(dof * 3 + component) * num_qpts + qp].
-    std::vector<Real> vector_values_xyz;
-    // Vector basis reference Jacobians in dof/component/derivative/qpt layout:
-    // [(dof * 9 + component * 3 + derivative) * num_qpts + qp].
-    std::vector<Real> vector_jacobians;
-    // Vector basis curls in dof/component/qpt SoA layout.
-    std::vector<Real> vector_curls_xyz;
-    // Vector basis divergences in dof/qpt SoA layout.
-    std::vector<Real> vector_divergence;
-
-    // The object-returning accessors below are convenience helpers for tests,
-    // diagnostics, and occasional scalar use. Hot loops should prefer the SoA
-    // span accessors so they do not reconstruct Gradient, Hessian, or matrix
-    // objects per DOF and quadrature point.
-
-    [[nodiscard]] Real scalarValue(std::size_t dof, std::size_t qp) const noexcept {
-        return scalar_values[dof * num_qpts + qp];
-    }
-
-    [[nodiscard]] std::span<const Real> scalarValuesForDof(std::size_t dof) const noexcept {
-        if (num_qpts == 0) return {};
-        return std::span<const Real>(scalar_values.data() + dof * num_qpts, num_qpts);
-    }
-
-    [[nodiscard]] Real gradientValue(std::size_t dof,
-                                     std::size_t component,
-                                     std::size_t qp) const noexcept {
-        return gradients[(dof * 3u + component) * num_qpts + qp];
-    }
-
-    [[nodiscard]] Gradient gradientVector(std::size_t dof, std::size_t qp) const noexcept {
-        Gradient out{};
-        for (std::size_t component = 0; component < 3u; ++component) {
-            out[component] = gradientValue(dof, component, qp);
-        }
-        return out;
-    }
-
-    [[nodiscard]] std::span<const Real> gradientsForDofComponent(std::size_t dof,
-                                                                  std::size_t component) const noexcept {
-        if (num_qpts == 0) return {};
-        return std::span<const Real>(gradients.data() + (dof * 3u + component) * num_qpts, num_qpts);
-    }
-
-    [[nodiscard]] std::span<const Real> gradientsForDof(std::size_t dof) const noexcept {
-        if (num_qpts == 0) return {};
-        return std::span<const Real>(gradients.data() + dof * 3u * num_qpts, 3u * num_qpts);
-    }
-
-    [[nodiscard]] Real hessianValue(std::size_t dof,
-                                    std::size_t row,
-                                    std::size_t col,
-                                    std::size_t qp) const noexcept {
-        return hessians[(dof * 9u + row * 3u + col) * num_qpts + qp];
-    }
-
-    [[nodiscard]] Hessian hessianMatrix(std::size_t dof, std::size_t qp) const noexcept {
-        Hessian out{};
-        for (std::size_t row = 0; row < 3u; ++row) {
-            for (std::size_t col = 0; col < 3u; ++col) {
-                out(row, col) = hessianValue(dof, row, col, qp);
-            }
-        }
-        return out;
-    }
-
-    [[nodiscard]] std::span<const Real> hessiansForDofComponent(std::size_t dof,
-                                                                 std::size_t row,
-                                                                 std::size_t col) const noexcept {
-        if (num_qpts == 0) return {};
-        return std::span<const Real>(hessians.data() + (dof * 9u + row * 3u + col) * num_qpts, num_qpts);
-    }
-
-    [[nodiscard]] std::span<const Real> hessiansForDof(std::size_t dof) const noexcept {
-        if (num_qpts == 0) return {};
-        return std::span<const Real>(hessians.data() + dof * 9u * num_qpts, 9u * num_qpts);
-    }
-
-    [[nodiscard]] Real vectorValue(std::size_t dof,
-                                   std::size_t component,
-                                   std::size_t qp) const noexcept {
-        return vector_values_xyz[(dof * 3u + component) * num_qpts + qp];
-    }
-
-    [[nodiscard]] math::Vector<Real, 3> vectorValue(std::size_t dof,
-                                                     std::size_t qp) const noexcept {
-        math::Vector<Real, 3> out{};
-        for (std::size_t component = 0; component < 3u; ++component) {
-            out[component] = vectorValue(dof, component, qp);
-        }
-        return out;
-    }
-
-    [[nodiscard]] std::span<const Real> vectorValuesForDofComponent(std::size_t dof,
-                                                                     std::size_t component) const noexcept {
-        if (num_qpts == 0) return {};
-        return std::span<const Real>(vector_values_xyz.data() + (dof * 3u + component) * num_qpts, num_qpts);
-    }
-
-    [[nodiscard]] std::span<const Real> vectorValuesForDof(std::size_t dof) const noexcept {
-        if (num_qpts == 0 || vector_values_xyz.empty()) return {};
-        return std::span<const Real>(vector_values_xyz.data() + dof * 3u * num_qpts, 3u * num_qpts);
-    }
-
-    [[nodiscard]] Real vectorJacobianValue(std::size_t dof,
-                                           std::size_t component,
-                                           std::size_t derivative,
-                                           std::size_t qp) const noexcept {
-        return vector_jacobians[(dof * 9u + component * 3u + derivative) * num_qpts + qp];
-    }
-
-    [[nodiscard]] VectorJacobian vectorJacobianMatrix(std::size_t dof,
-                                                       std::size_t qp) const noexcept {
-        VectorJacobian out{};
-        for (std::size_t component = 0; component < 3u; ++component) {
-            for (std::size_t derivative = 0; derivative < 3u; ++derivative) {
-                out(component, derivative) =
-                    vectorJacobianValue(dof, component, derivative, qp);
-            }
-        }
-        return out;
-    }
-
-    [[nodiscard]] std::span<const Real> vectorJacobiansForDofComponentDerivative(
-        std::size_t dof,
-        std::size_t component,
-        std::size_t derivative) const noexcept {
-        if (num_qpts == 0 || vector_jacobians.empty()) return {};
-        return std::span<const Real>(
-            vector_jacobians.data() + (dof * 9u + component * 3u + derivative) * num_qpts,
-            num_qpts);
-    }
-
-    [[nodiscard]] std::span<const Real> vectorJacobiansForDof(std::size_t dof) const noexcept {
-        if (num_qpts == 0 || vector_jacobians.empty()) return {};
-        return std::span<const Real>(vector_jacobians.data() + dof * 9u * num_qpts, 9u * num_qpts);
-    }
-
-    [[nodiscard]] Real vectorCurlValue(std::size_t dof,
-                                       std::size_t component,
-                                       std::size_t qp) const noexcept {
-        return vector_curls_xyz[(dof * 3u + component) * num_qpts + qp];
-    }
-
-    [[nodiscard]] math::Vector<Real, 3> vectorCurl(std::size_t dof,
-                                                    std::size_t qp) const noexcept {
-        math::Vector<Real, 3> out{};
-        for (std::size_t component = 0; component < 3u; ++component) {
-            out[component] = vectorCurlValue(dof, component, qp);
-        }
-        return out;
-    }
-
-    [[nodiscard]] std::span<const Real> vectorCurlsForDofComponent(std::size_t dof,
-                                                                    std::size_t component) const noexcept {
-        if (num_qpts == 0 || vector_curls_xyz.empty()) return {};
-        return std::span<const Real>(vector_curls_xyz.data() + (dof * 3u + component) * num_qpts, num_qpts);
-    }
-
-    [[nodiscard]] std::span<const Real> vectorCurlsForDof(std::size_t dof) const noexcept {
-        if (num_qpts == 0 || vector_curls_xyz.empty()) return {};
-        return std::span<const Real>(vector_curls_xyz.data() + dof * 3u * num_qpts, 3u * num_qpts);
-    }
-
-    [[nodiscard]] Real vectorDivergenceValue(std::size_t dof,
-                                             std::size_t qp) const noexcept {
-        return vector_divergence[dof * num_qpts + qp];
-    }
-
-    [[nodiscard]] std::span<const Real> vectorDivergenceForDof(std::size_t dof) const noexcept {
-        if (num_qpts == 0 || vector_divergence.empty()) return {};
-        return std::span<const Real>(vector_divergence.data() + dof * num_qpts, num_qpts);
-    }
-};
-
-class BasisCacheHandle {
-public:
-    BasisCacheHandle() = default;
-
-    [[nodiscard]] const BasisCacheEntry& entry() const {
-        BASIS_CHECK_CONFIG(entry_ != nullptr,
-                           "BasisCacheHandle: attempted to access an empty handle");
-        return *entry_;
-    }
-
-    [[nodiscard]] bool valid() const noexcept { return entry_ != nullptr; }
-    explicit operator bool() const noexcept { return valid(); }
-
-private:
-    friend class BasisCache;
-
-    explicit BasisCacheHandle(std::shared_ptr<const BasisCacheEntry> entry)
-        : entry_(std::move(entry)) {}
-
-    std::shared_ptr<const BasisCacheEntry> entry_;
-};
-
-class BasisCache {
-public:
-    static BasisCache& instance();
-
-    const BasisCacheEntry& get_or_compute(
-        const BasisFunction& basis,
-        const quadrature::QuadratureRule& quad,
-        bool gradients = true,
-        bool hessians = false);
-
-    /**
-     * @brief Compute an entry without consulting, publishing to, or waiting on
-     * the shared cache.
-     */
-    BasisCacheEntry compute_uncached(
-        const BasisFunction& basis,
-        const quadrature::QuadratureRule& quad,
-        bool gradients = true,
-        bool hessians = false) const;
-
-    /**
-     * @brief Eagerly populate the cache for the given (basis, quadrature) key
-     *
-     * Pays the compute cost up front so that subsequent get_or_compute calls
-     * for the same key hit the warm-cache path immediately. Equivalent to
-     * calling get_or_compute and discarding the return value.
-     *
-     * Returns the inserted (or pre-existing) entry for convenience.
-     */
-    const BasisCacheEntry& prewarm(
-        const BasisFunction& basis,
-        const quadrature::QuadratureRule& quad,
-        bool gradients = true,
-        bool hessians = false);
-
-    /**
-     * @brief Eagerly populate the cache and return a hot-loop handle.
-     *
-     * The returned handle owns a shared reference to the completed entry. Access
-     * through BasisCacheHandle::entry() performs no key construction, hashing,
-     * map lookup, or cache mutex acquisition. Calling clear() removes the entry
-     * from the global lookup map but does not invalidate existing handles.
-     */
-    BasisCacheHandle prewarm_handle(
-        const BasisFunction& basis,
-        const quadrature::QuadratureRule& quad,
-        bool gradients = true,
-        bool hessians = false);
-
-    /**
-     * @brief Remove completed cache entries.
-     *
-     * This is a soft clear: computations that were already in flight before
-     * clear() was called are allowed to publish their completed entry afterward.
-     * This preserves the returned-reference lifetime contract for concurrent
-     * get_or_compute() callers while still dropping all entries that had already
-     * completed at the time of the call.
-     */
-    void clear();
-    std::size_t size() const;
-
-private:
-    struct InFlightComputation {
-        std::mutex mutex;
-        std::condition_variable ready_cv;
-        bool ready{false};
-        std::shared_ptr<BasisCacheEntry> entry;
-        std::exception_ptr exception;
-    };
-
-    struct CacheSlot {
-        std::shared_ptr<BasisCacheEntry> entry;
-        std::shared_ptr<InFlightComputation> pending;
-    };
-
-    BasisCache() = default;
-    BasisCache(const BasisCache&) = delete;
-    BasisCache& operator=(const BasisCache&) = delete;
-
-    BasisCacheEntry compute(const BasisFunction& basis,
-                            const quadrature::QuadratureRule& quad,
-                            bool gradients,
-                            bool hessians) const;
-
-    std::shared_ptr<const BasisCacheEntry> get_or_compute_shared(
-        const BasisFunction& basis,
-        const quadrature::QuadratureRule& quad,
-        bool gradients,
-        bool hessians);
-
-    mutable std::shared_mutex mutex_;
-    std::unordered_map<BasisCacheKey, CacheSlot, BasisCacheKeyHash> slots_;
-};
-
-} // namespace basis
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_BASIS_BASISCACHE_H
diff --git a/Code/Source/solver/FE/Basis/BasisFactory.cpp b/Code/Source/solver/FE/Basis/BasisFactory.cpp
index dddbd4c5c..9f0867959 100644
--- a/Code/Source/solver/FE/Basis/BasisFactory.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFactory.cpp
@@ -10,29 +10,12 @@
 #include "LagrangeBasis.h"
 #include "SerendipityBasis.h"
 
-#include <mutex>
-#include <unordered_map>
-#include <utility>
-
 namespace svmp {
 namespace FE {
 namespace basis {
 
 namespace {
 
-using CustomRegistryMap =
-    std::unordered_map<std::string, basis_factory::CustomFactory>;
-
-CustomRegistryMap& custom_registry() {
-    static CustomRegistryMap registry;
-    return registry;
-}
-
-std::mutex& custom_registry_mutex() {
-    static std::mutex mutex;
-    return mutex;
-}
-
 int require_basis_order(const BasisRequest& req,
                         const char* missing_message,
                         const char* negative_message) {
@@ -50,12 +33,12 @@ int require_basis_order(const BasisRequest& req,
 void require_scalar_c0_request(const BasisRequest& req) {
     if (req.field_type != FieldType::Scalar) {
         throw BasisConfigurationException(
-            "BasisFactory: Lagrange/Serendipity bases currently support scalar fields only",
+            "BasisFactory: Lagrange/Serendipity bases support scalar fields only",
             __FILE__, __LINE__, __func__);
     }
     if (req.continuity != Continuity::C0) {
         throw BasisConfigurationException(
-            "BasisFactory: migrated Lagrange/Serendipity scope supports C0 continuity only",
+            "BasisFactory: Lagrange/Serendipity bases support C0 continuity only",
             __FILE__, __LINE__, __func__);
     }
 }
@@ -78,36 +61,6 @@ std::shared_ptr<BasisFunction> create_serendipity(const BasisRequest& req) {
     return std::make_shared<SerendipityBasis>(req.element_type, order);
 }
 
-std::shared_ptr<BasisFunction> create_custom(const BasisRequest& req) {
-    if (req.custom_id.empty()) {
-        throw BasisConfigurationException(
-            "BasisFactory: custom basis requests require custom_id",
-            __FILE__, __LINE__, __func__);
-    }
-
-    basis_factory::CustomFactory factory;
-    {
-        std::lock_guard<std::mutex> lock(custom_registry_mutex());
-        const auto it = custom_registry().find(req.custom_id);
-        if (it == custom_registry().end()) {
-            throw BasisConfigurationException(
-                "BasisFactory: no custom basis factory registered for id '" +
-                    req.custom_id + "'",
-                __FILE__, __LINE__, __func__);
-        }
-        factory = it->second;
-    }
-
-    auto basis = factory(req);
-    if (!basis) {
-        throw BasisConstructionException(
-            "BasisFactory: custom factory returned null basis for id '" +
-                req.custom_id + "'",
-            __FILE__, __LINE__, __func__);
-    }
-    return basis;
-}
-
 } // namespace
 
 namespace basis_factory {
@@ -118,41 +71,13 @@ std::shared_ptr<BasisFunction> create(const BasisRequest& req) {
             return create_lagrange(req);
         case BasisType::Serendipity:
             return create_serendipity(req);
-        case BasisType::Custom:
-            return create_custom(req);
         default:
             throw BasisConfigurationException(
-                "BasisFactory: requested basis family is outside the migrated Lagrange/Serendipity scope",
+                "BasisFactory: requested basis family is outside the scalar Lagrange/Serendipity scope",
                 __FILE__, __LINE__, __func__);
     }
 }
 
-void register_custom(std::string custom_id, CustomFactory factory) {
-    if (custom_id.empty()) {
-        throw BasisConfigurationException(
-            "BasisFactory: custom factory id must not be empty",
-            __FILE__, __LINE__, __func__);
-    }
-    if (!factory) {
-        throw BasisConfigurationException(
-            "BasisFactory: custom factory must be callable",
-            __FILE__, __LINE__, __func__);
-    }
-
-    std::lock_guard<std::mutex> lock(custom_registry_mutex());
-    custom_registry()[std::move(custom_id)] = std::move(factory);
-}
-
-void unregister_custom(const std::string& custom_id) {
-    std::lock_guard<std::mutex> lock(custom_registry_mutex());
-    custom_registry().erase(custom_id);
-}
-
-void clear_custom_registry_for_tests() {
-    std::lock_guard<std::mutex> lock(custom_registry_mutex());
-    custom_registry().clear();
-}
-
 } // namespace basis_factory
 
 } // namespace basis
diff --git a/Code/Source/solver/FE/Basis/BasisFactory.h b/Code/Source/solver/FE/Basis/BasisFactory.h
index cedf1ba6d..c937dd4a0 100644
--- a/Code/Source/solver/FE/Basis/BasisFactory.h
+++ b/Code/Source/solver/FE/Basis/BasisFactory.h
@@ -14,7 +14,6 @@
  */
 
 #include "BasisFunction.h"
-#include <functional>
 #include <memory>
 #include <optional>
 #include <string>
@@ -41,12 +40,7 @@ struct BasisRequest {
 
 namespace basis_factory {
 
-using CustomFactory = std::function<std::shared_ptr<BasisFunction>(const BasisRequest&)>;
-
 [[nodiscard]] std::shared_ptr<BasisFunction> create(const BasisRequest& req);
-void register_custom(std::string custom_id, CustomFactory factory);
-void unregister_custom(const std::string& custom_id);
-void clear_custom_registry_for_tests();
 
 } // namespace basis_factory
 
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.cpp b/Code/Source/solver/FE/Basis/BasisFunction.cpp
index 49c8d8763..2a1d4f6b0 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFunction.cpp
@@ -6,11 +6,8 @@
  */
 
 #include "BasisFunction.h"
-#include "VectorBasisEvaluationHelpers.h"
+
 #include <algorithm>
-#include <iomanip>
-#include <limits>
-#include <sstream>
 
 namespace svmp {
 namespace FE {
@@ -19,81 +16,26 @@ namespace basis {
 namespace {
 
 struct BasisFunctionScratch {
-    std::vector<Real> scalar_values;
-    std::vector<Gradient> scalar_gradients;
-    std::vector<Hessian> scalar_hessians;
-    std::vector<math::Vector<Real, 3>> vector_values;
-    std::vector<VectorJacobian> vector_jacobians;
-    std::vector<math::Vector<Real, 3>> vector_curls;
-    std::vector<Real> vector_divergences;
+    std::vector<Real> values;
+    std::vector<Gradient> gradients;
+    std::vector<Hessian> hessians;
 
     void prewarm(std::size_t max_size) {
-        scalar_values.reserve(max_size);
-        scalar_gradients.reserve(max_size);
-        scalar_hessians.reserve(max_size);
-        vector_values.reserve(max_size);
-        vector_jacobians.reserve(max_size);
-        vector_curls.reserve(max_size);
-        vector_divergences.reserve(max_size);
+        values.reserve(max_size);
+        gradients.reserve(max_size);
+        hessians.reserve(max_size);
     }
 };
 
-BasisFunctionScratch& basis_function_scratch() {
-    // Scratch is intentionally thread-local: production assembly uses a
-    // persistent worker-thread team, so buffers stay warm on each worker.
-    static thread_local BasisFunctionScratch scratch;
-    return scratch;
-}
-
-void mix_identity_hash_word(std::uint64_t word,
-                            std::uint64_t& hash_a,
-                            std::uint64_t& hash_b) noexcept {
-    hash_a ^= word + 0x9e3779b97f4a7c15ULL + (hash_a << 6u) + (hash_a >> 2u);
-    hash_b ^= (word + 0xbf58476d1ce4e5b9ULL) + (hash_b << 7u) + (hash_b >> 3u);
+BasisFunctionScratch& scratch() {
+    static thread_local BasisFunctionScratch data;
+    return data;
 }
 
 } // namespace
 
-BasisIdentityFingerprint
-compute_basis_identity_fingerprint(std::span<const std::uint64_t> words) noexcept {
-    BasisIdentityFingerprint fingerprint{0x243f6a8885a308d3ULL,
-                                         0x13198a2e03707344ULL};
-    mix_identity_hash_word(static_cast<std::uint64_t>(words.size()),
-                           fingerprint.hash_a,
-                           fingerprint.hash_b);
-    for (const auto word : words) {
-        mix_identity_hash_word(word, fingerprint.hash_a, fingerprint.hash_b);
-    }
-    return fingerprint;
-}
-
-std::string BasisFunction::cache_identity() const {
-    std::ostringstream oss;
-    oss << "basis=" << static_cast<int>(basis_type())
-        << "|elem=" << static_cast<int>(element_type())
-        << "|dim=" << dimension()
-        << "|order=" << order()
-        << "|size=" << size()
-        << "|vector=" << is_vector_valued();
-    return oss.str();
-}
-
-bool BasisFunction::cache_identity_words(std::vector<std::uint64_t>& words) const {
-    (void)words;
-    return false;
-}
-
-bool BasisFunction::cache_identity_fingerprint(std::uint64_t& hash_a,
-                                               std::uint64_t& hash_b) const {
-    (void)hash_a;
-    (void)hash_b;
-    return false;
-}
-
-void prewarm_basis_function_scratch(std::size_t max_size,
-                                    std::size_t max_qpts) {
-    (void)max_qpts;
-    basis_function_scratch().prewarm(max_size);
+void prewarm_basis_function_scratch(std::size_t max_size) {
+    scratch().prewarm(max_size);
 }
 
 void BasisFunction::evaluate_gradients(const math::Vector<Real, 3>& xi,
@@ -123,7 +65,7 @@ void BasisFunction::evaluate_all(const math::Vector<Real, 3>& xi,
 
 void BasisFunction::evaluate_values_to(const math::Vector<Real, 3>& xi,
                                        Real* SVMP_RESTRICT values_out) const {
-    auto& tmp = basis_function_scratch().scalar_values;
+    auto& tmp = scratch().values;
     tmp.resize(size());
     evaluate_values(xi, tmp);
     std::copy_n(tmp.data(), tmp.size(), values_out);
@@ -131,7 +73,7 @@ void BasisFunction::evaluate_values_to(const math::Vector<Real, 3>& xi,
 
 void BasisFunction::evaluate_gradients_to(const math::Vector<Real, 3>& xi,
                                           Real* SVMP_RESTRICT gradients_out) const {
-    auto& tmp = basis_function_scratch().scalar_gradients;
+    auto& tmp = scratch().gradients;
     tmp.resize(size());
     evaluate_gradients(xi, tmp);
     for (std::size_t i = 0; i < tmp.size(); ++i) {
@@ -143,7 +85,7 @@ void BasisFunction::evaluate_gradients_to(const math::Vector<Real, 3>& xi,
 
 void BasisFunction::evaluate_hessians_to(const math::Vector<Real, 3>& xi,
                                          Real* SVMP_RESTRICT hessians_out) const {
-    auto& tmp = basis_function_scratch().scalar_hessians;
+    auto& tmp = scratch().hessians;
     tmp.resize(size());
     evaluate_hessians(xi, tmp);
     for (std::size_t i = 0; i < tmp.size(); ++i) {
@@ -151,165 +93,6 @@ void BasisFunction::evaluate_hessians_to(const math::Vector<Real, 3>& xi,
     }
 }
 
-void BasisFunction::evaluate_at_quadrature_points(
-    const std::vector<math::Vector<Real, 3>>& points,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) const {
-    evaluate_at_quadrature_points_strided(
-        points, points.size(), values_out, gradients_out, hessians_out);
-}
-
-void BasisFunction::evaluate_at_quadrature_points_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) const {
-    const std::size_t num_qpts = points.size();
-    const std::size_t num_dofs = size();
-    if (output_stride < num_qpts) {
-        throw BasisConfigurationException(
-            "BasisFunction strided evaluation requires output_stride >= points.size()",
-            __FILE__, __LINE__, __func__);
-    }
-
-    auto& scratch = basis_function_scratch();
-    auto& v_tmp = scratch.scalar_values;
-    auto& g_tmp = scratch.scalar_gradients;
-    auto& h_tmp = scratch.scalar_hessians;
-    if (values_out) v_tmp.resize(num_dofs);
-    if (gradients_out) g_tmp.resize(num_dofs);
-    if (hessians_out) h_tmp.resize(num_dofs);
-
-    for (std::size_t q = 0; q < num_qpts; ++q) {
-        if (values_out && gradients_out && hessians_out) {
-            evaluate_all(points[q], v_tmp, g_tmp, h_tmp);
-        } else {
-            if (values_out) evaluate_values(points[q], v_tmp);
-            if (gradients_out) evaluate_gradients(points[q], g_tmp);
-            if (hessians_out) evaluate_hessians(points[q], h_tmp);
-        }
-
-        if (values_out) {
-            for (std::size_t dof = 0; dof < num_dofs; ++dof) {
-                values_out[dof * output_stride + q] = v_tmp[dof];
-            }
-        }
-        if (gradients_out) {
-            for (std::size_t dof = 0; dof < num_dofs; ++dof) {
-                for (std::size_t component = 0; component < 3u; ++component) {
-                    gradients_out[(dof * 3u + component) * output_stride + q] =
-                        g_tmp[dof][component];
-                }
-            }
-        }
-        if (hessians_out) {
-            for (std::size_t dof = 0; dof < num_dofs; ++dof) {
-                store_hessian_strided(
-                    h_tmp[dof], hessians_out + dof * 9u * output_stride, output_stride, q);
-            }
-        }
-    }
-}
-
-void BasisFunction::fill_scalar_cache_entry(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) const {
-    evaluate_at_quadrature_points_strided(
-        points, output_stride, values_out, gradients_out, hessians_out);
-}
-
-void BasisFunction::evaluate_vector_at_quadrature_points(
-    const std::vector<math::Vector<Real, 3>>& points,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT jacobians_out,
-    Real* SVMP_RESTRICT curls_out,
-    Real* SVMP_RESTRICT divergence_out) const {
-    evaluate_vector_at_quadrature_points_strided(
-        points, points.size(), values_out, jacobians_out, curls_out, divergence_out);
-}
-
-void BasisFunction::evaluate_vector_at_quadrature_points_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT jacobians_out,
-    Real* SVMP_RESTRICT curls_out,
-    Real* SVMP_RESTRICT divergence_out) const {
-    const std::size_t num_qpts = points.size();
-    const std::size_t num_dofs = size();
-    detail::vector_common::validate_vector_strided_outputs(
-        num_qpts, output_stride, "BasisFunction");
-
-    auto& scratch = basis_function_scratch();
-    auto& v_tmp = scratch.vector_values;
-    auto& j_tmp = scratch.vector_jacobians;
-    auto& c_tmp = scratch.vector_curls;
-    auto& d_tmp = scratch.vector_divergences;
-    if (values_out) v_tmp.resize(num_dofs);
-    if (jacobians_out) j_tmp.resize(num_dofs);
-    if (curls_out) c_tmp.resize(num_dofs);
-    if (divergence_out) d_tmp.resize(num_dofs);
-
-    for (std::size_t q = 0; q < num_qpts; ++q) {
-        if (values_out) {
-            evaluate_vector_values(points[q], v_tmp);
-            detail::vector_common::write_vector_values_strided(
-                v_tmp, num_dofs, output_stride, q, values_out);
-        }
-
-        if (jacobians_out) {
-            evaluate_vector_jacobians(points[q], j_tmp);
-            detail::vector_common::write_vector_jacobians_strided(
-                j_tmp, num_dofs, output_stride, q, jacobians_out);
-        }
-
-        if (curls_out) {
-            evaluate_curl(points[q], c_tmp);
-            detail::vector_common::write_vector_curl_strided(
-                c_tmp, num_dofs, output_stride, q, curls_out);
-        }
-
-        if (divergence_out) {
-            evaluate_divergence(points[q], d_tmp);
-            detail::vector_common::write_vector_divergence_strided(
-                d_tmp, num_dofs, output_stride, q, divergence_out);
-        }
-    }
-}
-
-void BasisFunction::evaluate_vector_values(
-    const math::Vector<Real, 3>&,
-    std::vector<math::Vector<Real, 3>>&) const {
-    throw BasisEvaluationException("Vector-valued evaluation requested on scalar basis",
-                                   __FILE__, __LINE__, __func__);
-}
-
-void BasisFunction::evaluate_vector_jacobians(
-    const math::Vector<Real, 3>&,
-    std::vector<VectorJacobian>&) const {
-    throw BasisEvaluationException("Vector-basis Jacobian evaluation requested on scalar basis",
-                                   __FILE__, __LINE__, __func__);
-}
-
-void BasisFunction::evaluate_divergence(
-    const math::Vector<Real, 3>&,
-    std::vector<Real>&) const {
-    throw BasisEvaluationException("Divergence requested on scalar basis",
-                                   __FILE__, __LINE__, __func__);
-}
-
-void BasisFunction::evaluate_curl(
-    const math::Vector<Real, 3>&,
-    std::vector<math::Vector<Real, 3>>&) const {
-    throw BasisEvaluationException("Curl requested on scalar basis",
-                                   __FILE__, __LINE__, __func__);
-}
-
 void BasisFunction::numerical_gradient(const math::Vector<Real, 3>& xi,
                                        std::vector<Gradient>& gradients,
                                        Real eps) const {
@@ -320,11 +103,12 @@ void BasisFunction::numerical_gradient(const math::Vector<Real, 3>& xi,
     for (int d = 0; d < dimension(); ++d) {
         math::Vector<Real, 3> forward = xi;
         math::Vector<Real, 3> backward = xi;
-        const std::size_t idx = static_cast<std::size_t>(d);
+        const auto idx = static_cast<std::size_t>(d);
         forward[idx] += eps;
         backward[idx] -= eps;
 
-        std::vector<Real> fwd, bwd;
+        std::vector<Real> fwd;
+        std::vector<Real> bwd;
         evaluate_values(forward, fwd);
         evaluate_values(backward, bwd);
 
@@ -344,18 +128,20 @@ void BasisFunction::numerical_hessian(const math::Vector<Real, 3>& xi,
     for (int d = 0; d < dimension(); ++d) {
         math::Vector<Real, 3> forward = xi;
         math::Vector<Real, 3> backward = xi;
-        const std::size_t col = static_cast<std::size_t>(d);
+        const auto col = static_cast<std::size_t>(d);
         forward[col] += eps;
         backward[col] -= eps;
 
-        std::vector<Gradient> g_forward, g_backward;
+        std::vector<Gradient> g_forward;
+        std::vector<Gradient> g_backward;
         evaluate_gradients(forward, g_forward);
         evaluate_gradients(backward, g_backward);
 
         for (std::size_t i = 0; i < base_grad.size(); ++i) {
             for (int k = 0; k < dimension(); ++k) {
-                const std::size_t row = static_cast<std::size_t>(k);
-                hessians[i](row, col) = (g_forward[i][row] - g_backward[i][row]) / (Real(2) * eps);
+                const auto row = static_cast<std::size_t>(k);
+                hessians[i](row, col) =
+                    (g_forward[i][row] - g_backward[i][row]) / (Real(2) * eps);
             }
         }
     }
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index ee38a5b19..dbabf7061 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -8,24 +8,12 @@
 #ifndef SVMP_FE_BASIS_BASISFUNCTION_H
 #define SVMP_FE_BASIS_BASISFUNCTION_H
 
-/**
- * @file BasisFunction.h
- * @brief Abstract interface for basis function evaluation on reference elements
- *
- * The Basis module operates purely on reference elements and is independent of
- * mesh-specific data structures. Implementations may leverage Math and
- * Quadrature utilities but must not read mesh connectivity or geometry.
- */
-
-#include "Types.h"
 #include "BasisExceptions.h"
-#include "Math/Vector.h"
 #include "Math/Matrix.h"
-#include <algorithm>
-#include <cstdint>
-#include <functional>
-#include <span>
-#include <string>
+#include "Math/Vector.h"
+#include "Types.h"
+
+#include <cstddef>
 #include <vector>
 
 namespace svmp {
@@ -34,18 +22,8 @@ namespace basis {
 
 using Gradient = math::Vector<Real, 3>;
 using Hessian  = math::Matrix<Real, 3, 3>;
-using VectorJacobian = math::Matrix<Real, 3, 3>;
-
-struct BasisIdentityFingerprint {
-    std::uint64_t hash_a{0};
-    std::uint64_t hash_b{0};
-};
-
-[[nodiscard]] BasisIdentityFingerprint
-compute_basis_identity_fingerprint(std::span<const std::uint64_t> words) noexcept;
 
-void prewarm_basis_function_scratch(std::size_t max_size,
-                                    std::size_t max_qpts = 0);
+void prewarm_basis_function_scratch(std::size_t max_size);
 
 [[nodiscard]] inline Hessian make_symmetric_hessian(Real xx,
                                                     Real yy,
@@ -57,363 +35,80 @@ void prewarm_basis_function_scratch(std::size_t max_size,
     hessian(0, 0) = xx;
     hessian(1, 1) = yy;
     hessian(2, 2) = zz;
-    hessian(0, 1) = xy;
-    hessian(1, 0) = xy;
-    hessian(0, 2) = xz;
-    hessian(2, 0) = xz;
-    hessian(1, 2) = yz;
-    hessian(2, 1) = yz;
+    hessian(0, 1) = hessian(1, 0) = xy;
+    hessian(0, 2) = hessian(2, 0) = xz;
+    hessian(1, 2) = hessian(2, 1) = yz;
     return hessian;
 }
 
-// Raw Hessian buffers use row-major 3x3 blocks:
-// dst[row * 3 + col] = H(row, col).
 inline void store_hessian(const Hessian& hessian, Real* dst) noexcept {
-    dst[0u] = hessian(0u, 0u);
-    dst[1u] = hessian(0u, 1u);
-    dst[2u] = hessian(0u, 2u);
-    dst[3u] = hessian(1u, 0u);
-    dst[4u] = hessian(1u, 1u);
-    dst[5u] = hessian(1u, 2u);
-    dst[6u] = hessian(2u, 0u);
-    dst[7u] = hessian(2u, 1u);
-    dst[8u] = hessian(2u, 2u);
-}
-
-inline void store_hessian_strided(const Hessian& hessian,
-                                  Real* dst,
-                                  std::size_t stride,
-                                  std::size_t offset) noexcept {
-    dst[0u * stride + offset] = hessian(0u, 0u);
-    dst[1u * stride + offset] = hessian(0u, 1u);
-    dst[2u * stride + offset] = hessian(0u, 2u);
-    dst[3u * stride + offset] = hessian(1u, 0u);
-    dst[4u * stride + offset] = hessian(1u, 1u);
-    dst[5u * stride + offset] = hessian(1u, 2u);
-    dst[6u * stride + offset] = hessian(2u, 0u);
-    dst[7u * stride + offset] = hessian(2u, 1u);
-    dst[8u * stride + offset] = hessian(2u, 2u);
-}
-
-inline void scatter_hessian_components_strided(const Real* src,
-                                               Real* dst,
-                                               std::size_t stride,
-                                               std::size_t offset) noexcept {
-    dst[0u * stride + offset] = src[0u];
-    dst[1u * stride + offset] = src[1u];
-    dst[2u * stride + offset] = src[2u];
-    dst[3u * stride + offset] = src[3u];
-    dst[4u * stride + offset] = src[4u];
-    dst[5u * stride + offset] = src[5u];
-    dst[6u * stride + offset] = src[6u];
-    dst[7u * stride + offset] = src[7u];
-    dst[8u * stride + offset] = src[8u];
+    dst[0] = hessian(0, 0);
+    dst[1] = hessian(0, 1);
+    dst[2] = hessian(0, 2);
+    dst[3] = hessian(1, 0);
+    dst[4] = hessian(1, 1);
+    dst[5] = hessian(1, 2);
+    dst[6] = hessian(2, 0);
+    dst[7] = hessian(2, 1);
+    dst[8] = hessian(2, 2);
 }
 
 [[nodiscard]] inline Hessian load_hessian(const Real* src) noexcept {
     Hessian hessian{};
-    hessian(0u, 0u) = src[0u];
-    hessian(0u, 1u) = src[1u];
-    hessian(0u, 2u) = src[2u];
-    hessian(1u, 0u) = src[3u];
-    hessian(1u, 1u) = src[4u];
-    hessian(1u, 2u) = src[5u];
-    hessian(2u, 0u) = src[6u];
-    hessian(2u, 1u) = src[7u];
-    hessian(2u, 2u) = src[8u];
+    hessian(0, 0) = src[0];
+    hessian(0, 1) = src[1];
+    hessian(0, 2) = src[2];
+    hessian(1, 0) = src[3];
+    hessian(1, 1) = src[4];
+    hessian(1, 2) = src[5];
+    hessian(2, 0) = src[6];
+    hessian(2, 1) = src[7];
+    hessian(2, 2) = src[8];
     return hessian;
 }
 
 inline void add_scaled_hessian(Hessian& target,
                                const Hessian& source,
                                Real scale) noexcept {
-    target(0u, 0u) += scale * source(0u, 0u);
-    target(0u, 1u) += scale * source(0u, 1u);
-    target(0u, 2u) += scale * source(0u, 2u);
-    target(1u, 0u) += scale * source(1u, 0u);
-    target(1u, 1u) += scale * source(1u, 1u);
-    target(1u, 2u) += scale * source(1u, 2u);
-    target(2u, 0u) += scale * source(2u, 0u);
-    target(2u, 1u) += scale * source(2u, 1u);
-    target(2u, 2u) += scale * source(2u, 2u);
+    for (std::size_t r = 0; r < 3u; ++r) {
+        for (std::size_t c = 0; c < 3u; ++c) {
+            target(r, c) += scale * source(r, c);
+        }
+    }
 }
 
-/**
- * @brief Base interface for scalar and vector-valued basis families
- *
- * All basis implementations operate in reference space. Physical mappings are
- * handled by the Geometry module. Derivatives are returned with unused
- * components set to zero for lower dimensional elements.
- */
 class BasisFunction {
 public:
     virtual ~BasisFunction() = default;
 
-    /// Basis family identifier
     virtual BasisType basis_type() const noexcept = 0;
-
-    /// Underlying element type on the reference domain
     virtual ElementType element_type() const noexcept = 0;
-
-    /// Reference dimensionality (1, 2, or 3)
     virtual int dimension() const noexcept = 0;
-
-    /// Polynomial order (modal/nodal definition dependent)
     virtual int order() const noexcept = 0;
-
-    /// Number of basis functions (scalar or vector-valued)
     virtual std::size_t size() const noexcept = 0;
 
-    /**
-     * @brief Whether BasisCache can key this basis from common structural fields.
-     *
-     * Return true only when basis_type/element_type/dimension/order/size and
-     * vector-valued status fully determine evaluation behavior. Parameterized
-     * bases such as splines and custom user bases should keep the default false
-     * so BasisCache includes cache_identity() in the key.
-     */
-    virtual bool cache_identity_is_structural() const noexcept { return false; }
-
-    /// Whether the basis is vector-valued (H(div)/H(curl))
-    virtual bool is_vector_valued() const noexcept { return false; }
-
-    /// Whether vector-valued basis Jacobians are available.
-    virtual bool supports_vector_jacobians() const noexcept { return false; }
-
-    /// Whether vector-valued basis curls are available.
-    virtual bool supports_curl() const noexcept { return false; }
-
-    /// Whether vector-valued basis divergences are available.
-    virtual bool supports_divergence() const noexcept { return false; }
-
-    /**
-     * @brief Stable semantic identity used by BasisCache
-     *
-     * Derived classes should override this when evaluation depends on
-     * additional state beyond basis family / element / order metadata.
-     */
-    virtual std::string cache_identity() const;
-
-    /**
-     * @brief Optional exact structured identity payload for BasisCache keys.
-     *
-     * Parameterized bases may append stable integer/bit-pattern words and
-     * return true to let BasisCache avoid using cache_identity() as the exact
-     * key payload. The human-readable cache_identity() remains available for
-     * diagnostics and for custom bases that do not implement this path.
-     */
-    virtual bool cache_identity_words(std::vector<std::uint64_t>& words) const;
-
-    /**
-     * @brief Optional cached fingerprint for structured identity words.
-     *
-     * Implementations that precompute cache_identity_words() may also cache the
-     * corresponding fingerprint. BasisCache still retains exact identity words
-     * for equality after hash matches.
-     */
-    virtual bool cache_identity_fingerprint(std::uint64_t& hash_a,
-                                            std::uint64_t& hash_b) const;
-
-    /**
-     * @brief Evaluate scalar basis values at a reference point
-     * @param xi Reference coordinates (unused entries are ignored)
-     * @param[out] values Output array resized to size()
-     */
     virtual void evaluate_values(const math::Vector<Real, 3>& xi,
                                  std::vector<Real>& values) const = 0;
-
-    /**
-     * @brief Evaluate gradients of scalar basis functions
-     *
-     * Production bases must override this with analytic derivatives.
-     * Use numerical_gradient explicitly in tests or diagnostics when a finite
-     * difference approximation is intended.
-     */
     virtual void evaluate_gradients(const math::Vector<Real, 3>& xi,
                                     std::vector<Gradient>& gradients) const;
-
-    /**
-     * @brief Evaluate Hessians of scalar basis functions
-     *
-     * Production bases must override this with analytic second derivatives.
-     * Use numerical_hessian explicitly in tests or diagnostics when a finite
-     * difference approximation is intended.
-     */
     virtual void evaluate_hessians(const math::Vector<Real, 3>& xi,
                                    std::vector<Hessian>& hessians) const;
-
-    /**
-     * @brief Fused evaluation of values, gradients, and Hessians at one point
-     *
-     * Default implementation calls evaluate_values, evaluate_gradients, and
-     * evaluate_hessians in sequence. Bases that share intermediate
-     * computations (e.g., LagrangeBasis sharing per-axis 1D evaluations)
-     * should override this to avoid redundant work.
-     */
     virtual void evaluate_all(const math::Vector<Real, 3>& xi,
                               std::vector<Real>& values,
                               std::vector<Gradient>& gradients,
                               std::vector<Hessian>& hessians) const;
 
-    /**
-     * @brief Fill SoA buffers with basis evaluations at all quadrature points
-     *
-     * Outputs are written directly to caller-provided strided buffers in
-     * DOF-major SoA layout — no scratch+transpose required by the caller.
-     * Pass `nullptr` for any output that is not needed.
-     *
-     *   values_out:    size num_dofs * num_qpts; element [d * num_qpts + q]
-     *   gradients_out: size num_dofs * 3 * num_qpts; element [(d*3 + c) * num_qpts + q]
-     *   hessians_out:  size num_dofs * 9 * num_qpts; element [(d*9 + r*3 + c) * num_qpts + q]
-     *
-     * Non-null output ranges must not overlap each other. Implementations may
-     * fill requested quantities in any order that is efficient for the basis.
-     *
-     * Default implementation calls evaluate_all (or evaluate_values/gradients/
-     * hessians as appropriate) per QP, materializing into temp buffers then
-     * scatter-writing to the output. Performance-sensitive bases must override
-     * this path so batched assembly does not fall back to Q virtual point
-     * evaluations. Unit coverage keeps an explicit list of hot bases that are
-     * expected to provide a direct strided implementation.
-     */
-    virtual void evaluate_at_quadrature_points(
-        const std::vector<math::Vector<Real, 3>>& points,
-        Real* SVMP_RESTRICT values_out,
-        Real* SVMP_RESTRICT gradients_out,
-        Real* SVMP_RESTRICT hessians_out) const;
-
-    /**
-     * @brief Fill strided SoA buffers with basis evaluations at quadrature points
-     *
-     * Same component layout as evaluate_at_quadrature_points, but each
-     * dof/component row advances by `output_stride` rather than `points.size()`.
-     * This lets padded SIMD cache storage be filled directly. Non-null output
-     * ranges have the same non-overlap requirement.
-     */
-    virtual void evaluate_at_quadrature_points_strided(
-        const std::vector<math::Vector<Real, 3>>& points,
-        std::size_t output_stride,
-        Real* SVMP_RESTRICT values_out,
-        Real* SVMP_RESTRICT gradients_out,
-        Real* SVMP_RESTRICT hessians_out) const;
-
-    /**
-     * @brief Fill zero-initialized scalar cache storage.
-     *
-     * BasisCache allocates and zero-initializes its scalar SoA buffers before
-     * calling this hook. The default implementation overwrites all requested
-     * entries through the public strided evaluator. Sparse-support bases may
-     * override this and write only active entries, relying on the caller's
-     * zero-initialization for inactive DOFs and unused derivative components.
-     */
-    virtual void fill_scalar_cache_entry(
-        const std::vector<math::Vector<Real, 3>>& points,
-        std::size_t output_stride,
-        Real* SVMP_RESTRICT values_out,
-        Real* SVMP_RESTRICT gradients_out,
-        Real* SVMP_RESTRICT hessians_out) const;
-
-    /**
-     * @brief Fill SoA buffers with vector-basis evaluations at all quadrature points
-     *
-     * Outputs are written in DOF-major SoA layout. Pass `nullptr` for any
-     * quantity that is not needed.
-     *
-     *   values_out:     size num_dofs * 3 * num_qpts; element [(d*3 + c) * num_qpts + q]
-     *   jacobians_out:  size num_dofs * 9 * num_qpts; element [(d*9 + c*3 + r) * num_qpts + q]
-     *   curls_out:      size num_dofs * 3 * num_qpts; element [(d*3 + c) * num_qpts + q]
-     *   divergence_out: size num_dofs * num_qpts; element [d * num_qpts + q]
-     *
-     * Non-null output ranges must not overlap each other. Implementations may
-     * fill requested quantities in any order that is efficient for the basis.
-     */
-    virtual void evaluate_vector_at_quadrature_points(
-        const std::vector<math::Vector<Real, 3>>& points,
-        Real* SVMP_RESTRICT values_out,
-        Real* SVMP_RESTRICT jacobians_out,
-        Real* SVMP_RESTRICT curls_out,
-        Real* SVMP_RESTRICT divergence_out) const;
-
-    /**
-     * @brief Fill strided SoA buffers with vector-basis evaluations
-     *
-     * Same component layout as evaluate_vector_at_quadrature_points, but each
-     * dof/component row advances by `output_stride` rather than `points.size()`.
-     * Non-null output ranges have the same non-overlap requirement.
-     *
-     * The base fallback loops over quadrature points through virtual point
-     * evaluation. H(div)/H(curl) bases used in assembly should override this
-     * method directly, and tests track the current hot vector families.
-     */
-    virtual void evaluate_vector_at_quadrature_points_strided(
-        const std::vector<math::Vector<Real, 3>>& points,
-        std::size_t output_stride,
-        Real* SVMP_RESTRICT values_out,
-        Real* SVMP_RESTRICT jacobians_out,
-        Real* SVMP_RESTRICT curls_out,
-        Real* SVMP_RESTRICT divergence_out) const;
-
-    /**
-     * @brief Evaluate scalar basis values into a caller-provided raw buffer
-     *
-     * Caller is responsible for providing a buffer of at least size() Real
-     * entries. This avoids the per-call std::vector::resize() cost of the
-     * vector-output overload. Default implementation forwards through a temp
-     * vector; bases should override for direct write.
-     */
     virtual void evaluate_values_to(const math::Vector<Real, 3>& xi,
                                     Real* SVMP_RESTRICT values_out) const;
-
-    /**
-     * @brief Evaluate gradients into a flat caller-provided buffer
-     *
-     * Layout: gradients_out[i * 3 + c] = component c of gradient of basis i.
-     * Caller provides a buffer of size() * 3 Real entries.
-     */
     virtual void evaluate_gradients_to(const math::Vector<Real, 3>& xi,
                                        Real* SVMP_RESTRICT gradients_out) const;
-
-    /**
-     * @brief Evaluate Hessians into a flat caller-provided buffer
-     *
-     * Layout: hessians_out[i * 9 + r * 3 + c] = H_i(r, c).
-     */
     virtual void evaluate_hessians_to(const math::Vector<Real, 3>& xi,
                                       Real* SVMP_RESTRICT hessians_out) const;
 
-    /**
-     * @brief Evaluate vector-valued basis functions (H(div)/H(curl))
-     *
-     * Default implementation throws; vector bases must override.
-     */
-    virtual void evaluate_vector_values(const math::Vector<Real, 3>& xi,
-                                        std::vector<math::Vector<Real, 3>>& values) const;
-
-    /**
-     * @brief Evaluate reference-space Jacobians of vector-valued basis functions
-     *
-     * The returned matrix for basis function `i` has entries
-     * `jacobians[i](component, derivative_direction) = d phi_i_component / d xi_direction`.
-     * Unused rows/columns are zero-filled for lower-dimensional elements.
-     */
-    virtual void evaluate_vector_jacobians(const math::Vector<Real, 3>& xi,
-                                           std::vector<VectorJacobian>& jacobians) const;
-
-    /// Evaluate divergence of vector-valued basis functions (if applicable)
-    virtual void evaluate_divergence(const math::Vector<Real, 3>& xi,
-                                     std::vector<Real>& divergence) const;
-
-    /// Evaluate curl of vector-valued basis functions (if applicable)
-    virtual void evaluate_curl(const math::Vector<Real, 3>& xi,
-                               std::vector<math::Vector<Real, 3>>& curl) const;
-
 protected:
-    /// Finite-difference helper for gradients of scalar bases
     void numerical_gradient(const math::Vector<Real, 3>& xi,
                             std::vector<Gradient>& gradients,
                             Real eps = Real(1e-6)) const;
-
-    /// Finite-difference helper for Hessians of scalar bases
     void numerical_hessian(const math::Vector<Real, 3>& xi,
                            std::vector<Hessian>& hessians,
                            Real eps = Real(1e-5)) const;
diff --git a/Code/Source/solver/FE/Basis/BasisTolerance.h b/Code/Source/solver/FE/Basis/BasisTolerance.h
deleted file mode 100644
index 423551f09..000000000
--- a/Code/Source/solver/FE/Basis/BasisTolerance.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
-
-#ifndef SVMP_FE_BASIS_BASISTOLERANCE_H
-#define SVMP_FE_BASIS_BASISTOLERANCE_H
-
-#include "Types.h"
-
-#include <limits>
-
-namespace svmp {
-namespace FE {
-namespace basis {
-namespace detail {
-
-[[nodiscard]] constexpr Real basis_abs(Real value) noexcept {
-    return value < Real(0) ? -value : value;
-}
-
-[[nodiscard]] constexpr Real basis_max(Real lhs, Real rhs) noexcept {
-    return lhs < rhs ? rhs : lhs;
-}
-
-[[nodiscard]] constexpr Real basis_scaled_tolerance(Real scale = Real(1),
-                                                    Real multiplier = Real(64)) noexcept {
-    return multiplier * std::numeric_limits<Real>::epsilon() *
-           basis_max(Real(1), basis_abs(scale));
-}
-
-[[nodiscard]] constexpr bool basis_near_zero(Real value,
-                                             Real scale = Real(1),
-                                             Real multiplier = Real(64)) noexcept {
-    return basis_abs(value) <= basis_scaled_tolerance(scale, multiplier);
-}
-
-[[nodiscard]] constexpr bool basis_nearly_equal(Real a,
-                                                Real b,
-                                                Real multiplier = Real(64)) noexcept {
-    const Real scale = basis_max(Real(1), basis_max(basis_abs(a), basis_abs(b)));
-    return basis_abs(a - b) <= basis_scaled_tolerance(scale, multiplier);
-}
-
-} // namespace detail
-} // namespace basis
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_BASIS_BASISTOLERANCE_H
diff --git a/Code/Source/solver/FE/Basis/BasisTraits.h b/Code/Source/solver/FE/Basis/BasisTraits.h
index 835dfe705..d97b59f1f 100644
--- a/Code/Source/solver/FE/Basis/BasisTraits.h
+++ b/Code/Source/solver/FE/Basis/BasisTraits.h
@@ -11,6 +11,7 @@
 #include "Types.h"
 
 #include <cstddef>
+#include <limits>
 
 namespace svmp {
 namespace FE {
@@ -25,9 +26,39 @@ enum class BasisTopology {
     Tetrahedron,
     Hexahedron,
     Wedge,
-    Pyramid,
 };
 
+namespace detail {
+
+[[nodiscard]] constexpr Real basis_abs(Real value) noexcept {
+    return value < Real(0) ? -value : value;
+}
+
+[[nodiscard]] constexpr Real basis_max(Real lhs, Real rhs) noexcept {
+    return lhs < rhs ? rhs : lhs;
+}
+
+[[nodiscard]] constexpr Real basis_scaled_tolerance(Real scale = Real(1),
+                                                    Real multiplier = Real(64)) noexcept {
+    return multiplier * std::numeric_limits<Real>::epsilon() *
+           basis_max(Real(1), basis_abs(scale));
+}
+
+[[nodiscard]] constexpr bool basis_near_zero(Real value,
+                                             Real scale = Real(1),
+                                             Real multiplier = Real(64)) noexcept {
+    return basis_abs(value) <= basis_scaled_tolerance(scale, multiplier);
+}
+
+[[nodiscard]] constexpr bool basis_nearly_equal(Real a,
+                                                Real b,
+                                                Real multiplier = Real(64)) noexcept {
+    const Real scale = basis_max(Real(1), basis_max(basis_abs(a), basis_abs(b)));
+    return basis_abs(a - b) <= basis_scaled_tolerance(scale, multiplier);
+}
+
+} // namespace detail
+
 [[nodiscard]] constexpr bool is_point(ElementType type) noexcept {
     return type == ElementType::Point1;
 }
@@ -60,8 +91,8 @@ enum class BasisTopology {
 }
 
 [[nodiscard]] constexpr bool is_pyramid(ElementType type) noexcept {
-    return type == ElementType::Pyramid5 || type == ElementType::Pyramid13 ||
-           type == ElementType::Pyramid14;
+    (void)type;
+    return false;
 }
 
 [[nodiscard]] constexpr bool is_simplex(ElementType type) noexcept {
@@ -98,9 +129,6 @@ enum class BasisTopology {
     if (is_wedge(type)) {
         return BasisTopology::Wedge;
     }
-    if (is_pyramid(type)) {
-        return BasisTopology::Pyramid;
-    }
     return BasisTopology::Unknown;
 }
 
@@ -124,9 +152,6 @@ enum class BasisTopology {
         case ElementType::Wedge6:
         case ElementType::Wedge18:
             return ElementType::Wedge6;
-        case ElementType::Pyramid5:
-        case ElementType::Pyramid14:
-            return ElementType::Pyramid5;
         default:
             return type;
     }
@@ -140,7 +165,6 @@ enum class BasisTopology {
         case ElementType::Tetra4:
         case ElementType::Hex8:
         case ElementType::Wedge6:
-        case ElementType::Pyramid5:
             return 1;
         case ElementType::Line3:
         case ElementType::Triangle6:
@@ -148,7 +172,6 @@ enum class BasisTopology {
         case ElementType::Tetra10:
         case ElementType::Hex27:
         case ElementType::Wedge18:
-        case ElementType::Pyramid14:
             return 2;
         default:
             return -1;
@@ -179,14 +202,6 @@ enum class BasisTopology {
     return triangle_lagrange_size(order) * line_lagrange_size(order);
 }
 
-[[nodiscard]] constexpr std::size_t pyramid_lagrange_size(int order) noexcept {
-    if (order < 0) {
-        return 0u;
-    }
-    const std::size_t p = static_cast<std::size_t>(order);
-    return (p + 1u) * (p + 2u) * (2u * p + 3u) / 6u;
-}
-
 [[nodiscard]] constexpr std::size_t complete_lagrange_alias_size(ElementType type) noexcept {
     const int order = complete_lagrange_alias_order(type);
     switch (canonical_lagrange_type(type)) {
@@ -204,8 +219,6 @@ enum class BasisTopology {
             return hex_lagrange_size(order);
         case ElementType::Wedge6:
             return wedge_lagrange_size(order);
-        case ElementType::Pyramid5:
-            return pyramid_lagrange_size(order);
         default:
             return 0u;
     }
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index 63b947516..7516d514a 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -6,16 +6,11 @@
  */
 
 #include "LagrangeBasis.h"
-#include "BasisTraits.h"
-#include "BasisTolerance.h"
-#include "LagrangeBasisFast.h"
 #include "NodeOrderingConventions.h"
-#include "LagrangeBasisPyramid.h"
-#include "LagrangeBasisSimplex.h"
-#include "LagrangeBasisUtility.h"
+
 #include <algorithm>
+#include <array>
 #include <cmath>
-#include <unordered_map>
 
 namespace svmp {
 namespace FE {
@@ -23,8299 +18,597 @@ namespace basis {
 
 namespace {
 
-using LagrangeTopology = BasisTopology;
-
-#if defined(_MSC_VER)
-#define SVMP_LAGRANGE_NOINLINE __declspec(noinline)
-#define SVMP_LAGRANGE_ALIGN64
-#elif defined(__GNUC__) || defined(__clang__)
-#define SVMP_LAGRANGE_NOINLINE __attribute__((noinline))
-#define SVMP_LAGRANGE_ALIGN64 __attribute__((aligned(64)))
-#else
-#define SVMP_LAGRANGE_NOINLINE
-#define SVMP_LAGRANGE_ALIGN64
-#endif
+using Vec3 = math::Vector<Real, 3>;
 
-#ifndef FE_ALWAYS_INLINE
-#if defined(_MSC_VER)
-#define FE_ALWAYS_INLINE __forceinline
-#elif defined(__GNUC__) || defined(__clang__)
-#define FE_ALWAYS_INLINE __attribute__((always_inline)) inline
-#else
-#define FE_ALWAYS_INLINE inline
-#endif
-#endif
-
-SVMP_LAGRANGE_NOINLINE void evaluate_triangle_order1_gradients_strided(
-    std::size_t num_qpts,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out);
+inline constexpr Real equispaced_pm_one_coord(int i, int order) {
+    if (order <= 0) {
+        return Real(0);
+    }
+    return Real(-1) + Real(2) * static_cast<Real>(i) / static_cast<Real>(order);
+}
 
-struct LagrangeTopologyTraits {
-    LagrangeTopology topology;
-    int dimension;
+struct AxisEval {
+    std::vector<Real> value;
+    std::vector<Real> first;
+    std::vector<Real> second;
 };
 
-struct SimplexExponentHash {
-    std::size_t operator()(const std::array<int, 4>& exponents) const noexcept {
-        std::size_t seed = 0x9e3779b97f4a7c15ull;
-        for (const int exponent : exponents) {
-            const auto value = static_cast<std::size_t>(exponent);
-            seed ^= value + 0x9e3779b97f4a7c15ull + (seed << 6u) + (seed >> 2u);
-        }
-        return seed;
-    }
+struct SimplexEval {
+    std::vector<Real> value;
+    std::vector<Gradient> gradient;
+    std::vector<Hessian> hessian;
 };
 
-template<typename T, std::size_t N>
-void assign_array(std::vector<T>& out, const std::array<T, N>& values) {
-    out.assign(values.begin(), values.end());
-}
-
-bool coordinate_matches_expected(Real coord, Real expected) noexcept {
-    return detail::basis_nearly_equal(coord, expected);
-}
+struct NormalizedLagrangeRequest {
+    ElementType element_type;
+    int order;
+};
 
-template<typename FastBasis>
-void evaluate_fast_outputs(const math::Vector<Real, 3>& xi,
-                           std::vector<Real>* values,
-                           std::vector<Gradient>* gradients,
-                           std::vector<Hessian>* hessians) {
-    if (values != nullptr) {
-        std::array<Real, FastBasis::n_dofs> fast_values{};
-        FastBasis::evaluate(xi, fast_values);
-        assign_array(*values, fast_values);
-    }
-    if (gradients != nullptr) {
-        std::array<Gradient, FastBasis::n_dofs> fast_gradients{};
-        FastBasis::evaluate_gradients(xi, fast_gradients);
-        assign_array(*gradients, fast_gradients);
-    }
-    if (hessians != nullptr) {
-        std::array<Hessian, FastBasis::n_dofs> fast_hessians{};
-        FastBasis::evaluate_hessians(xi, fast_hessians);
-        assign_array(*hessians, fast_hessians);
+BasisTopology supported_lagrange_topology(ElementType type) {
+    const BasisTopology top = topology(type);
+    if (top == BasisTopology::Unknown) {
+        throw BasisElementCompatibilityException("LagrangeBasis: unsupported element type",
+                                                __FILE__, __LINE__, __func__);
     }
+    return top;
 }
 
-template<typename FastBasis>
-void evaluate_fast_outputs_to(const math::Vector<Real, 3>& xi,
-                              Real* SVMP_RESTRICT values_out,
-                              Real* SVMP_RESTRICT gradients_out,
-                              Real* SVMP_RESTRICT hessians_out) {
-    if (values_out != nullptr) {
-        std::array<Real, FastBasis::n_dofs> fast_values{};
-        FastBasis::evaluate(xi, fast_values);
-        for (std::size_t i = 0; i < fast_values.size(); ++i) {
-            values_out[i] = fast_values[i];
-        }
-    }
-    if (gradients_out != nullptr) {
-        std::array<Gradient, FastBasis::n_dofs> fast_gradients{};
-        FastBasis::evaluate_gradients(xi, fast_gradients);
-        for (std::size_t i = 0; i < fast_gradients.size(); ++i) {
-            gradients_out[i * 3u + 0u] = fast_gradients[i][0];
-            gradients_out[i * 3u + 1u] = fast_gradients[i][1];
-            gradients_out[i * 3u + 2u] = fast_gradients[i][2];
-        }
-    }
-    if (hessians_out != nullptr) {
-        std::array<Hessian, FastBasis::n_dofs> fast_hessians{};
-        FastBasis::evaluate_hessians(xi, fast_hessians);
-        for (std::size_t i = 0; i < fast_hessians.size(); ++i) {
-            store_hessian(fast_hessians[i], hessians_out + i * 9u);
-        }
+NormalizedLagrangeRequest normalize_lagrange_request(ElementType element_type, int order) {
+    switch (element_type) {
+        case ElementType::Line3:
+            return {ElementType::Line2, std::max(order, 2)};
+        case ElementType::Triangle6:
+            return {ElementType::Triangle3, std::max(order, 2)};
+        case ElementType::Quad9:
+            return {ElementType::Quad4, std::max(order, 2)};
+        case ElementType::Tetra10:
+            return {ElementType::Tetra4, std::max(order, 2)};
+        case ElementType::Hex27:
+            return {ElementType::Hex8, std::max(order, 2)};
+        case ElementType::Wedge18:
+            return {ElementType::Wedge6, std::max(order, 2)};
+        case ElementType::Quad8:
+            throw BasisElementCompatibilityException(
+                "LagrangeBasis: Quad8 is serendipity; use SerendipityBasis",
+                __FILE__, __LINE__, __func__);
+        case ElementType::Hex20:
+            throw BasisElementCompatibilityException(
+                "LagrangeBasis: Hex20 is serendipity; use SerendipityBasis",
+                __FILE__, __LINE__, __func__);
+        case ElementType::Wedge15:
+            throw BasisElementCompatibilityException(
+                "LagrangeBasis: Wedge15 is serendipity; use SerendipityBasis",
+                __FILE__, __LINE__, __func__);
+        case ElementType::Pyramid5:
+        case ElementType::Pyramid13:
+        case ElementType::Pyramid14:
+            throw BasisElementCompatibilityException(
+                "LagrangeBasis: pyramid support has been removed from the current solver basis scope",
+                __FILE__, __LINE__, __func__);
+        default:
+            return {element_type, order};
     }
 }
 
-template<typename FastBasis>
-void evaluate_fast_outputs_strided(const std::vector<math::Vector<Real, 3>>& points,
-                                   std::size_t output_stride,
-                                   Real* SVMP_RESTRICT values_out,
-                                   Real* SVMP_RESTRICT gradients_out,
-                                   Real* SVMP_RESTRICT hessians_out) {
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        if (values_out != nullptr) {
-            std::array<Real, FastBasis::n_dofs> fast_values{};
-            FastBasis::evaluate(xi, fast_values);
-            for (std::size_t i = 0; i < fast_values.size(); ++i) {
-                values_out[i * output_stride + q] = fast_values[i];
-            }
-        }
-        if (gradients_out != nullptr) {
-            std::array<Gradient, FastBasis::n_dofs> fast_gradients{};
-            FastBasis::evaluate_gradients(xi, fast_gradients);
-            for (std::size_t i = 0; i < fast_gradients.size(); ++i) {
-                Real* g = gradients_out + i * 3u * output_stride;
-                g[0u * output_stride + q] = fast_gradients[i][0];
-                g[1u * output_stride + q] = fast_gradients[i][1];
-                g[2u * output_stride + q] = fast_gradients[i][2];
-            }
-        }
-        if (hessians_out != nullptr) {
-            std::array<Hessian, FastBasis::n_dofs> fast_hessians{};
-            FastBasis::evaluate_hessians(xi, fast_hessians);
-            for (std::size_t i = 0; i < fast_hessians.size(); ++i) {
-                const Hessian& hessian = fast_hessians[i];
-                Real* H = hessians_out + i * 9u * output_stride;
-                H[0u * output_stride + q] = hessian(0, 0);
-                H[1u * output_stride + q] = hessian(0, 1);
-                H[2u * output_stride + q] = hessian(0, 2);
-                H[3u * output_stride + q] = hessian(1, 0);
-                H[4u * output_stride + q] = hessian(1, 1);
-                H[5u * output_stride + q] = hessian(1, 2);
-                H[6u * output_stride + q] = hessian(2, 0);
-                H[7u * output_stride + q] = hessian(2, 1);
-                H[8u * output_stride + q] = hessian(2, 2);
-            }
-        }
+std::size_t axis_index_pm_one(Real coord, int order) {
+    if (order <= 0) {
+        return 0u;
     }
+    const Real scaled = (coord + Real(1)) * Real(order) / Real(2);
+    return static_cast<std::size_t>(std::llround(scaled));
 }
 
-template<int Order>
-bool evaluate_fixed_lagrange_fast_order(LagrangeTopology topology,
-                                        const math::Vector<Real, 3>& xi,
-                                        std::vector<Real>* values,
-                                        std::vector<Gradient>* gradients,
-                                        std::vector<Hessian>* hessians) {
-    switch (topology) {
-        case LagrangeTopology::Line:
-            evaluate_fast_outputs<LagrangeLineFast<Order>>(xi, values, gradients, hessians);
-            return true;
-        case LagrangeTopology::Quadrilateral:
-            evaluate_fast_outputs<LagrangeQuadFast<Order>>(xi, values, gradients, hessians);
-            return true;
-        case LagrangeTopology::Hexahedron:
-            evaluate_fast_outputs<LagrangeHexFast<Order>>(xi, values, gradients, hessians);
-            return true;
-        case LagrangeTopology::Triangle:
-            evaluate_fast_outputs<LagrangeTriFast<Order>>(xi, values, gradients, hessians);
-            return true;
-        case LagrangeTopology::Tetrahedron:
-            evaluate_fast_outputs<LagrangeTetFast<Order>>(xi, values, gradients, hessians);
-            return true;
-        default:
-            return false;
+int simplex_lattice_index(Real value, int order) {
+    if (order <= 0) {
+        return 0;
     }
+    return static_cast<int>(std::llround(value * Real(order)));
 }
 
-template<int Order>
-bool evaluate_fixed_lagrange_fast_to_order(LagrangeTopology topology,
-                                           const math::Vector<Real, 3>& xi,
-                                           Real* SVMP_RESTRICT values_out,
-                                           Real* SVMP_RESTRICT gradients_out,
-                                           Real* SVMP_RESTRICT hessians_out) {
-    switch (topology) {
-        case LagrangeTopology::Line:
-            evaluate_fast_outputs_to<LagrangeLineFast<Order>>(xi, values_out, gradients_out, hessians_out);
-            return true;
-        case LagrangeTopology::Quadrilateral:
-            evaluate_fast_outputs_to<LagrangeQuadFast<Order>>(xi, values_out, gradients_out, hessians_out);
-            return true;
-        case LagrangeTopology::Hexahedron:
-            evaluate_fast_outputs_to<LagrangeHexFast<Order>>(xi, values_out, gradients_out, hessians_out);
-            return true;
-        case LagrangeTopology::Triangle:
-            evaluate_fast_outputs_to<LagrangeTriFast<Order>>(xi, values_out, gradients_out, hessians_out);
-            return true;
-        case LagrangeTopology::Tetrahedron:
-            evaluate_fast_outputs_to<LagrangeTetFast<Order>>(xi, values_out, gradients_out, hessians_out);
-            return true;
-        default:
-            return false;
+LagrangeBasis::SimplexExponent simplex_exponent_from_point(const Vec3& p,
+                                                           BasisTopology top,
+                                                           int order) {
+    LagrangeBasis::SimplexExponent e{0, 0, 0, 0};
+    if (order <= 0) {
+        return e;
     }
-}
-
-template<int Order>
-bool evaluate_fixed_lagrange_fast_strided_order(
-    LagrangeTopology topology,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    switch (topology) {
-        case LagrangeTopology::Line:
-            evaluate_fast_outputs_strided<LagrangeLineFast<Order>>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        case LagrangeTopology::Quadrilateral:
-            evaluate_fast_outputs_strided<LagrangeQuadFast<Order>>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        case LagrangeTopology::Hexahedron:
-            evaluate_fast_outputs_strided<LagrangeHexFast<Order>>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        case LagrangeTopology::Triangle:
-            evaluate_fast_outputs_strided<LagrangeTriFast<Order>>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        case LagrangeTopology::Tetrahedron:
-            evaluate_fast_outputs_strided<LagrangeTetFast<Order>>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        default:
-            return false;
+    if (top == BasisTopology::Triangle) {
+        e[1] = simplex_lattice_index(p[0], order);
+        e[2] = simplex_lattice_index(p[1], order);
+        e[0] = order - e[1] - e[2];
+    } else {
+        e[1] = simplex_lattice_index(p[0], order);
+        e[2] = simplex_lattice_index(p[1], order);
+        e[3] = simplex_lattice_index(p[2], order);
+        e[0] = order - e[1] - e[2] - e[3];
     }
+    return e;
 }
 
-void evaluate_triangle_order3_values_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    Real* row2 = values_out + 2u * output_stride;
-    Real* row3 = values_out + 3u * output_stride;
-    Real* row4 = values_out + 4u * output_stride;
-    Real* row5 = values_out + 5u * output_stride;
-    Real* row6 = values_out + 6u * output_stride;
-    Real* row7 = values_out + 7u * output_stride;
-    Real* row8 = values_out + 8u * output_stride;
-    Real* row9 = values_out + 9u * output_stride;
-
-    if (points.size() == 4u && output_stride == 4u) {
-        Real p10[4];
-        Real p11[4];
-        Real p12[4];
-        Real p20[4];
-        Real p21[4];
-        Real p22[4];
-        Real p30[4];
-        Real p31[4];
-        Real p32[4];
-
-        for (std::size_t q = 0; q < 4u; ++q) {
-            const auto& xi = points[q];
-            const Real l1 = xi[0];
-            const Real l2 = xi[1];
-            const Real l0 = Real(1) - l1 - l2;
-
-            p10[q] = Real(3) * l0;
-            p11[q] = Real(3) * l1;
-            p12[q] = Real(3) * l2;
-            p20[q] = Real(0.5) * p10[q] * (p10[q] - Real(1));
-            p21[q] = Real(0.5) * p11[q] * (p11[q] - Real(1));
-            p22[q] = Real(0.5) * p12[q] * (p12[q] - Real(1));
-            p30[q] = (p10[q] * (p10[q] - Real(1)) * (p10[q] - Real(2))) / Real(6);
-            p31[q] = (p11[q] * (p11[q] - Real(1)) * (p11[q] - Real(2))) / Real(6);
-            p32[q] = (p12[q] * (p12[q] - Real(1)) * (p12[q] - Real(2))) / Real(6);
-        }
+void evaluate_1d_lagrange(Real x, const std::vector<Real>& nodes, AxisEval& out) {
+    const std::size_t n = nodes.size();
+    out.value.assign(n, Real(0));
+    out.first.assign(n, Real(0));
+    out.second.assign(n, Real(0));
 
-        row0[0] = p30[0]; row0[1] = p30[1]; row0[2] = p30[2]; row0[3] = p30[3];
-        row1[0] = p31[0]; row1[1] = p31[1]; row1[2] = p31[2]; row1[3] = p31[3];
-        row2[0] = p32[0]; row2[1] = p32[1]; row2[2] = p32[2]; row2[3] = p32[3];
-        row3[0] = p20[0] * p11[0];
-        row3[1] = p20[1] * p11[1];
-        row3[2] = p20[2] * p11[2];
-        row3[3] = p20[3] * p11[3];
-        row4[0] = p10[0] * p21[0];
-        row4[1] = p10[1] * p21[1];
-        row4[2] = p10[2] * p21[2];
-        row4[3] = p10[3] * p21[3];
-        row5[0] = p21[0] * p12[0];
-        row5[1] = p21[1] * p12[1];
-        row5[2] = p21[2] * p12[2];
-        row5[3] = p21[3] * p12[3];
-        row6[0] = p11[0] * p22[0];
-        row6[1] = p11[1] * p22[1];
-        row6[2] = p11[2] * p22[2];
-        row6[3] = p11[3] * p22[3];
-        row7[0] = p10[0] * p22[0];
-        row7[1] = p10[1] * p22[1];
-        row7[2] = p10[2] * p22[2];
-        row7[3] = p10[3] * p22[3];
-        row8[0] = p20[0] * p12[0];
-        row8[1] = p20[1] * p12[1];
-        row8[2] = p20[2] * p12[2];
-        row8[3] = p20[3] * p12[3];
-        row9[0] = p10[0] * p11[0] * p12[0];
-        row9[1] = p10[1] * p11[1] * p12[1];
-        row9[2] = p10[2] * p11[2] * p12[2];
-        row9[3] = p10[3] * p11[3] * p12[3];
+    if (n == 1u) {
+        out.value[0] = Real(1);
         return;
     }
 
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-
-        const Real p10 = Real(3) * l0;
-        const Real p11 = Real(3) * l1;
-        const Real p12 = Real(3) * l2;
-        const Real p20 = Real(0.5) * p10 * (p10 - Real(1));
-        const Real p21 = Real(0.5) * p11 * (p11 - Real(1));
-        const Real p22 = Real(0.5) * p12 * (p12 - Real(1));
-        const Real p30 = (p10 * (p10 - Real(1)) * (p10 - Real(2))) / Real(6);
-        const Real p31 = (p11 * (p11 - Real(1)) * (p11 - Real(2))) / Real(6);
-        const Real p32 = (p12 * (p12 - Real(1)) * (p12 - Real(2))) / Real(6);
-
-        row0[q] = p30;
-        row1[q] = p31;
-        row2[q] = p32;
-        row3[q] = p20 * p11;
-        row4[q] = p10 * p21;
-        row5[q] = p21 * p12;
-        row6[q] = p11 * p22;
-        row7[q] = p10 * p22;
-        row8[q] = p20 * p12;
-        row9[q] = p10 * p11 * p12;
-    }
-}
+    for (std::size_t i = 0; i < n; ++i) {
+        Real denom = Real(1);
+        for (std::size_t j = 0; j < n; ++j) {
+            if (j != i) {
+                denom *= nodes[i] - nodes[j];
+            }
+        }
 
-void evaluate_triangle_order2_values_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    Real* row2 = values_out + 2u * output_stride;
-    Real* row3 = values_out + 3u * output_stride;
-    Real* row4 = values_out + 4u * output_stride;
-    Real* row5 = values_out + 5u * output_stride;
+        Real value = Real(1);
+        for (std::size_t j = 0; j < n; ++j) {
+            if (j != i) {
+                value *= x - nodes[j];
+            }
+        }
+        out.value[i] = value / denom;
 
-    if (points.size() == 4u && output_stride == 4u) {
-        Real l0[4];
-        Real l1[4];
-        Real l2[4];
-        for (std::size_t q = 0; q < 4u; ++q) {
-            const auto& xi = points[q];
-            l1[q] = xi[0];
-            l2[q] = xi[1];
-            l0[q] = Real(1) - l1[q] - l2[q];
+        Real first = Real(0);
+        for (std::size_t m = 0; m < n; ++m) {
+            if (m == i) {
+                continue;
+            }
+            Real product = Real(1);
+            for (std::size_t j = 0; j < n; ++j) {
+                if (j != i && j != m) {
+                    product *= x - nodes[j];
+                }
+            }
+            first += product;
         }
+        out.first[i] = first / denom;
 
-        row0[0] = l0[0] * (Real(2) * l0[0] - Real(1));
-        row0[1] = l0[1] * (Real(2) * l0[1] - Real(1));
-        row0[2] = l0[2] * (Real(2) * l0[2] - Real(1));
-        row0[3] = l0[3] * (Real(2) * l0[3] - Real(1));
-        row1[0] = l1[0] * (Real(2) * l1[0] - Real(1));
-        row1[1] = l1[1] * (Real(2) * l1[1] - Real(1));
-        row1[2] = l1[2] * (Real(2) * l1[2] - Real(1));
-        row1[3] = l1[3] * (Real(2) * l1[3] - Real(1));
-        row2[0] = l2[0] * (Real(2) * l2[0] - Real(1));
-        row2[1] = l2[1] * (Real(2) * l2[1] - Real(1));
-        row2[2] = l2[2] * (Real(2) * l2[2] - Real(1));
-        row2[3] = l2[3] * (Real(2) * l2[3] - Real(1));
-        row3[0] = Real(4) * l0[0] * l1[0];
-        row3[1] = Real(4) * l0[1] * l1[1];
-        row3[2] = Real(4) * l0[2] * l1[2];
-        row3[3] = Real(4) * l0[3] * l1[3];
-        row4[0] = Real(4) * l1[0] * l2[0];
-        row4[1] = Real(4) * l1[1] * l2[1];
-        row4[2] = Real(4) * l1[2] * l2[2];
-        row4[3] = Real(4) * l1[3] * l2[3];
-        row5[0] = Real(4) * l0[0] * l2[0];
-        row5[1] = Real(4) * l0[1] * l2[1];
-        row5[2] = Real(4) * l0[2] * l2[2];
-        row5[3] = Real(4) * l0[3] * l2[3];
-        return;
+        Real second = Real(0);
+        for (std::size_t m = 0; m < n; ++m) {
+            if (m == i) {
+                continue;
+            }
+            for (std::size_t l = 0; l < n; ++l) {
+                if (l == i || l == m) {
+                    continue;
+                }
+                Real product = Real(1);
+                for (std::size_t j = 0; j < n; ++j) {
+                    if (j != i && j != m && j != l) {
+                        product *= x - nodes[j];
+                    }
+                }
+                second += product;
+            }
+        }
+        out.second[i] = second / denom;
     }
+}
 
-    auto write_q = [&](std::size_t q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-        row0[q] = l0 * (Real(2) * l0 - Real(1));
-        row1[q] = l1 * (Real(2) * l1 - Real(1));
-        row2[q] = l2 * (Real(2) * l2 - Real(1));
-        row3[q] = Real(4) * l0 * l1;
-        row4[q] = Real(4) * l1 * l2;
-        row5[q] = Real(4) * l0 * l2;
-    };
+std::array<Real, 3> simplex_factor(int alpha, Real lambda, int order) {
+    Real value = Real(1);
+    Real first = Real(0);
+    Real second = Real(0);
 
-    if (points.size() == 4u) {
-        write_q(0u);
-        write_q(1u);
-        write_q(2u);
-        write_q(3u);
-        return;
+    for (int m = 0; m < alpha; ++m) {
+        const Real factor = Real(order) * lambda - Real(m);
+        const Real inv = Real(1) / Real(m + 1);
+        const Real old_value = value;
+        const Real old_first = first;
+        const Real old_second = second;
+        value = old_value * factor * inv;
+        first = (old_first * factor + old_value * Real(order)) * inv;
+        second = (old_second * factor + Real(2) * old_first * Real(order)) * inv;
     }
 
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        write_q(q);
-    }
+    return {value, first, second};
 }
 
-void evaluate_triangle_order1_values_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    Real* row2 = values_out + 2u * output_stride;
+void evaluate_simplex(const Vec3& xi,
+                      BasisTopology top,
+                      int order,
+                      const std::vector<LagrangeBasis::SimplexExponent>& exponents,
+                      SimplexEval& out) {
+    const std::size_t n = exponents.size();
+    out.value.assign(n, Real(0));
+    out.gradient.assign(n, Gradient{});
+    out.hessian.assign(n, Hessian{});
 
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        row0[q] = Real(1) - xi[0] - xi[1];
-        row1[q] = xi[0];
-        row2[q] = xi[1];
+    if (n == 1u && order == 0) {
+        out.value[0] = Real(1);
+        return;
     }
-}
 
-void evaluate_triangle_order2_gradients_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    Real* row0 = gradients_out + 0u * 3u * output_stride;
-    Real* row1 = gradients_out + 1u * 3u * output_stride;
-    Real* row2 = gradients_out + 2u * 3u * output_stride;
-    Real* row3 = gradients_out + 3u * 3u * output_stride;
-    Real* row4 = gradients_out + 4u * 3u * output_stride;
-    Real* row5 = gradients_out + 5u * 3u * output_stride;
+    const int bary_count = top == BasisTopology::Triangle ? 3 : 4;
+    std::array<Real, 4> lambda{Real(0), Real(0), Real(0), Real(0)};
+    std::array<Gradient, 4> lambda_grad{};
 
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-        const Real g0 = Real(1) - Real(4) * l0;
-        row0[0u * output_stride + q] = g0;
-        row0[1u * output_stride + q] = g0;
-        row0[2u * output_stride + q] = Real(0);
-        row1[0u * output_stride + q] = Real(4) * l1 - Real(1);
-        row1[1u * output_stride + q] = Real(0);
-        row1[2u * output_stride + q] = Real(0);
-        row2[0u * output_stride + q] = Real(0);
-        row2[1u * output_stride + q] = Real(4) * l2 - Real(1);
-        row2[2u * output_stride + q] = Real(0);
-        row3[0u * output_stride + q] = Real(4) * (l0 - l1);
-        row3[1u * output_stride + q] = Real(-4) * l1;
-        row3[2u * output_stride + q] = Real(0);
-        row4[0u * output_stride + q] = Real(4) * l2;
-        row4[1u * output_stride + q] = Real(4) * l1;
-        row4[2u * output_stride + q] = Real(0);
-        row5[0u * output_stride + q] = Real(-4) * l2;
-        row5[1u * output_stride + q] = Real(4) * (l0 - l2);
-        row5[2u * output_stride + q] = Real(0);
+    lambda[1] = xi[0];
+    lambda[2] = xi[1];
+    lambda_grad[1][0] = Real(1);
+    lambda_grad[2][1] = Real(1);
+    if (top == BasisTopology::Triangle) {
+        lambda[0] = Real(1) - xi[0] - xi[1];
+        lambda_grad[0][0] = Real(-1);
+        lambda_grad[0][1] = Real(-1);
+    } else {
+        lambda[3] = xi[2];
+        lambda[0] = Real(1) - xi[0] - xi[1] - xi[2];
+        lambda_grad[0][0] = Real(-1);
+        lambda_grad[0][1] = Real(-1);
+        lambda_grad[0][2] = Real(-1);
+        lambda_grad[3][2] = Real(1);
     }
-}
 
-inline void write_constant_hessian_q4(Real* SVMP_RESTRICT row,
-                                      std::size_t output_stride,
-                                      Real h00,
-                                      Real h01,
-                                      Real h02,
-                                      Real h10,
-                                      Real h11,
-                                      Real h12,
-                                      Real h20,
-                                      Real h21,
-                                      Real h22) {
-    Real* c0 = row + 0u * output_stride;
-    Real* c1 = row + 1u * output_stride;
-    Real* c2 = row + 2u * output_stride;
-    Real* c3 = row + 3u * output_stride;
-    Real* c4 = row + 4u * output_stride;
-    Real* c5 = row + 5u * output_stride;
-    Real* c6 = row + 6u * output_stride;
-    Real* c7 = row + 7u * output_stride;
-    Real* c8 = row + 8u * output_stride;
+    for (std::size_t i = 0; i < n; ++i) {
+        std::array<std::array<Real, 3>, 4> f{};
+        for (int a = 0; a < bary_count; ++a) {
+            f[static_cast<std::size_t>(a)] =
+                simplex_factor(exponents[i][static_cast<std::size_t>(a)],
+                               lambda[static_cast<std::size_t>(a)],
+                               order);
+        }
 
-    c0[0] = h00; c0[1] = h00; c0[2] = h00; c0[3] = h00;
-    c1[0] = h01; c1[1] = h01; c1[2] = h01; c1[3] = h01;
-    c2[0] = h02; c2[1] = h02; c2[2] = h02; c2[3] = h02;
-    c3[0] = h10; c3[1] = h10; c3[2] = h10; c3[3] = h10;
-    c4[0] = h11; c4[1] = h11; c4[2] = h11; c4[3] = h11;
-    c5[0] = h12; c5[1] = h12; c5[2] = h12; c5[3] = h12;
-    c6[0] = h20; c6[1] = h20; c6[2] = h20; c6[3] = h20;
-    c7[0] = h21; c7[1] = h21; c7[2] = h21; c7[3] = h21;
-    c8[0] = h22; c8[1] = h22; c8[2] = h22; c8[3] = h22;
-}
+        Real value = Real(1);
+        for (int a = 0; a < bary_count; ++a) {
+            value *= f[static_cast<std::size_t>(a)][0];
+        }
+        out.value[i] = value;
 
-SVMP_LAGRANGE_NOINLINE void evaluate_triangle_order2_hessians_q4(
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT hessians_out) {
-    write_constant_hessian_q4(hessians_out + 0u * 9u * output_stride,
-                              output_stride,
-                              Real(4), Real(4), Real(0),
-                              Real(4), Real(4), Real(0),
-                              Real(0), Real(0), Real(0));
-    write_constant_hessian_q4(hessians_out + 1u * 9u * output_stride,
-                              output_stride,
-                              Real(4), Real(0), Real(0),
-                              Real(0), Real(0), Real(0),
-                              Real(0), Real(0), Real(0));
-    write_constant_hessian_q4(hessians_out + 2u * 9u * output_stride,
-                              output_stride,
-                              Real(0), Real(0), Real(0),
-                              Real(0), Real(4), Real(0),
-                              Real(0), Real(0), Real(0));
-    write_constant_hessian_q4(hessians_out + 3u * 9u * output_stride,
-                              output_stride,
-                              Real(-8), Real(-4), Real(0),
-                              Real(-4), Real(0), Real(0),
-                              Real(0), Real(0), Real(0));
-    write_constant_hessian_q4(hessians_out + 4u * 9u * output_stride,
-                              output_stride,
-                              Real(0), Real(4), Real(0),
-                              Real(4), Real(0), Real(0),
-                              Real(0), Real(0), Real(0));
-    write_constant_hessian_q4(hessians_out + 5u * 9u * output_stride,
-                              output_stride,
-                              Real(0), Real(-4), Real(0),
-                              Real(-4), Real(-8), Real(0),
-                              Real(0), Real(0), Real(0));
-}
+        for (int a = 0; a < bary_count; ++a) {
+            Real product = f[static_cast<std::size_t>(a)][1];
+            for (int b = 0; b < bary_count; ++b) {
+                if (b != a) {
+                    product *= f[static_cast<std::size_t>(b)][0];
+                }
+            }
+            for (std::size_t c = 0; c < 3u; ++c) {
+                out.gradient[i][c] += product * lambda_grad[static_cast<std::size_t>(a)][c];
+            }
+        }
 
-SVMP_LAGRANGE_NOINLINE void evaluate_tet_order2_hessians_q4(
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT hessians_out) {
-    write_constant_hessian_q4(hessians_out + 0u * 9u * output_stride,
-                              output_stride,
-                              Real(4), Real(4), Real(4),
-                              Real(4), Real(4), Real(4),
-                              Real(4), Real(4), Real(4));
-    write_constant_hessian_q4(hessians_out + 1u * 9u * output_stride,
-                              output_stride,
-                              Real(4), Real(0), Real(0),
-                              Real(0), Real(0), Real(0),
-                              Real(0), Real(0), Real(0));
-    write_constant_hessian_q4(hessians_out + 2u * 9u * output_stride,
-                              output_stride,
-                              Real(0), Real(0), Real(0),
-                              Real(0), Real(4), Real(0),
-                              Real(0), Real(0), Real(0));
-    write_constant_hessian_q4(hessians_out + 3u * 9u * output_stride,
-                              output_stride,
-                              Real(0), Real(0), Real(0),
-                              Real(0), Real(0), Real(0),
-                              Real(0), Real(0), Real(4));
-    write_constant_hessian_q4(hessians_out + 4u * 9u * output_stride,
-                              output_stride,
-                              Real(-8), Real(-4), Real(-4),
-                              Real(-4), Real(0), Real(0),
-                              Real(-4), Real(0), Real(0));
-    write_constant_hessian_q4(hessians_out + 5u * 9u * output_stride,
-                              output_stride,
-                              Real(0), Real(4), Real(0),
-                              Real(4), Real(0), Real(0),
-                              Real(0), Real(0), Real(0));
-    write_constant_hessian_q4(hessians_out + 6u * 9u * output_stride,
-                              output_stride,
-                              Real(0), Real(-4), Real(0),
-                              Real(-4), Real(-8), Real(-4),
-                              Real(0), Real(-4), Real(0));
-    write_constant_hessian_q4(hessians_out + 7u * 9u * output_stride,
-                              output_stride,
-                              Real(0), Real(0), Real(-4),
-                              Real(0), Real(0), Real(-4),
-                              Real(-4), Real(-4), Real(-8));
-    write_constant_hessian_q4(hessians_out + 8u * 9u * output_stride,
-                              output_stride,
-                              Real(0), Real(0), Real(4),
-                              Real(0), Real(0), Real(0),
-                              Real(4), Real(0), Real(0));
-    write_constant_hessian_q4(hessians_out + 9u * 9u * output_stride,
-                              output_stride,
-                              Real(0), Real(0), Real(0),
-                              Real(0), Real(0), Real(4),
-                              Real(0), Real(4), Real(0));
+        for (int a = 0; a < bary_count; ++a) {
+            for (int b = 0; b < bary_count; ++b) {
+                Real product = (a == b)
+                    ? f[static_cast<std::size_t>(a)][2]
+                    : f[static_cast<std::size_t>(a)][1] *
+                      f[static_cast<std::size_t>(b)][1];
+                for (int c = 0; c < bary_count; ++c) {
+                    if (c != a && c != b) {
+                        product *= f[static_cast<std::size_t>(c)][0];
+                    }
+                }
+                for (std::size_t r = 0; r < 3u; ++r) {
+                    for (std::size_t c = 0; c < 3u; ++c) {
+                        out.hessian[i](r, c) +=
+                            product *
+                            lambda_grad[static_cast<std::size_t>(a)][r] *
+                            lambda_grad[static_cast<std::size_t>(b)][c];
+                    }
+                }
+            }
+        }
+    }
 }
 
-void evaluate_tet_order1_values_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    Real* row2 = values_out + 2u * output_stride;
-    Real* row3 = values_out + 3u * output_stride;
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        row0[q] = Real(1) - xi[0] - xi[1] - xi[2];
-        row1[q] = xi[0];
-        row2[q] = xi[1];
-        row3[q] = xi[2];
-    }
+void store_gradient(const Gradient& gradient, Real* dst) {
+    dst[0] = gradient[0];
+    dst[1] = gradient[1];
+    dst[2] = gradient[2];
 }
 
-void evaluate_tet_order1_gradients_strided(
-    std::size_t num_qpts,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    Real* row0 = gradients_out + 0u * 3u * output_stride;
-    Real* row1 = gradients_out + 1u * 3u * output_stride;
-    Real* row2 = gradients_out + 2u * 3u * output_stride;
-    Real* row3 = gradients_out + 3u * 3u * output_stride;
+} // namespace
 
-    for (std::size_t q = 0; q < num_qpts; ++q) {
-        row0[0u * output_stride + q] = Real(-1);
-        row0[1u * output_stride + q] = Real(-1);
-        row0[2u * output_stride + q] = Real(-1);
-        row1[0u * output_stride + q] = Real(1);
-        row1[1u * output_stride + q] = Real(0);
-        row1[2u * output_stride + q] = Real(0);
-        row2[0u * output_stride + q] = Real(0);
-        row2[1u * output_stride + q] = Real(1);
-        row2[2u * output_stride + q] = Real(0);
-        row3[0u * output_stride + q] = Real(0);
-        row3[1u * output_stride + q] = Real(0);
-        row3[2u * output_stride + q] = Real(1);
-    }
+void prewarm_lagrange_basis_scratch(int max_order, std::size_t max_qpts) {
+    const auto n = static_cast<std::size_t>(std::max(0, max_order) + 1);
+    prewarm_basis_function_scratch(std::max(n * n * n, max_qpts));
 }
 
-void evaluate_zero_hessians_strided(
-    std::size_t num_nodes,
-    std::size_t num_qpts,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT hessians_out) {
-    if (num_qpts == 4u) {
-        for (std::size_t node = 0; node < num_nodes; ++node) {
-            write_constant_hessian_q4(hessians_out + node * 9u * output_stride,
-                                      output_stride,
-                                      Real(0), Real(0), Real(0),
-                                      Real(0), Real(0), Real(0),
-                                      Real(0), Real(0), Real(0));
-        }
-        return;
+LagrangeBasis::LagrangeBasis(ElementType type, int order)
+    : element_type_(type), order_(order) {
+    const auto normalized = normalize_lagrange_request(element_type_, order_);
+    element_type_ = normalized.element_type;
+    order_ = normalized.order;
+    if (order_ < 0) {
+        throw BasisConfigurationException("LagrangeBasis requires non-negative polynomial order",
+                                          __FILE__, __LINE__, __func__);
     }
 
-    for (std::size_t node = 0; node < num_nodes; ++node) {
-        Real* row = hessians_out + node * 9u * output_stride;
-        for (std::size_t q = 0; q < num_qpts; ++q) {
-            row[0u * output_stride + q] = Real(0);
-            row[1u * output_stride + q] = Real(0);
-            row[2u * output_stride + q] = Real(0);
-            row[3u * output_stride + q] = Real(0);
-            row[4u * output_stride + q] = Real(0);
-            row[5u * output_stride + q] = Real(0);
-            row[6u * output_stride + q] = Real(0);
-            row[7u * output_stride + q] = Real(0);
-            row[8u * output_stride + q] = Real(0);
-        }
-    }
+    topology_ = supported_lagrange_topology(element_type_);
+    dimension_ = reference_dimension(element_type_);
+    init_nodes();
 }
 
-void evaluate_tet_order2_values_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    Real* row2 = values_out + 2u * output_stride;
-    Real* row3 = values_out + 3u * output_stride;
-    Real* row4 = values_out + 4u * output_stride;
-    Real* row5 = values_out + 5u * output_stride;
-    Real* row6 = values_out + 6u * output_stride;
-    Real* row7 = values_out + 7u * output_stride;
-    Real* row8 = values_out + 8u * output_stride;
-    Real* row9 = values_out + 9u * output_stride;
-
-    if (points.size() == 4u && output_stride == 4u) {
-        Real l0[4];
-        Real l1[4];
-        Real l2[4];
-        Real l3[4];
-        for (std::size_t q = 0; q < 4u; ++q) {
-            const auto& xi = points[q];
-            l1[q] = xi[0];
-            l2[q] = xi[1];
-            l3[q] = xi[2];
-            l0[q] = Real(1) - l1[q] - l2[q] - l3[q];
-        }
-
-        row0[0] = l0[0] * (Real(2) * l0[0] - Real(1));
-        row0[1] = l0[1] * (Real(2) * l0[1] - Real(1));
-        row0[2] = l0[2] * (Real(2) * l0[2] - Real(1));
-        row0[3] = l0[3] * (Real(2) * l0[3] - Real(1));
-        row1[0] = l1[0] * (Real(2) * l1[0] - Real(1));
-        row1[1] = l1[1] * (Real(2) * l1[1] - Real(1));
-        row1[2] = l1[2] * (Real(2) * l1[2] - Real(1));
-        row1[3] = l1[3] * (Real(2) * l1[3] - Real(1));
-        row2[0] = l2[0] * (Real(2) * l2[0] - Real(1));
-        row2[1] = l2[1] * (Real(2) * l2[1] - Real(1));
-        row2[2] = l2[2] * (Real(2) * l2[2] - Real(1));
-        row2[3] = l2[3] * (Real(2) * l2[3] - Real(1));
-        row3[0] = l3[0] * (Real(2) * l3[0] - Real(1));
-        row3[1] = l3[1] * (Real(2) * l3[1] - Real(1));
-        row3[2] = l3[2] * (Real(2) * l3[2] - Real(1));
-        row3[3] = l3[3] * (Real(2) * l3[3] - Real(1));
-        row4[0] = Real(4) * l0[0] * l1[0];
-        row4[1] = Real(4) * l0[1] * l1[1];
-        row4[2] = Real(4) * l0[2] * l1[2];
-        row4[3] = Real(4) * l0[3] * l1[3];
-        row5[0] = Real(4) * l1[0] * l2[0];
-        row5[1] = Real(4) * l1[1] * l2[1];
-        row5[2] = Real(4) * l1[2] * l2[2];
-        row5[3] = Real(4) * l1[3] * l2[3];
-        row6[0] = Real(4) * l0[0] * l2[0];
-        row6[1] = Real(4) * l0[1] * l2[1];
-        row6[2] = Real(4) * l0[2] * l2[2];
-        row6[3] = Real(4) * l0[3] * l2[3];
-        row7[0] = Real(4) * l0[0] * l3[0];
-        row7[1] = Real(4) * l0[1] * l3[1];
-        row7[2] = Real(4) * l0[2] * l3[2];
-        row7[3] = Real(4) * l0[3] * l3[3];
-        row8[0] = Real(4) * l1[0] * l3[0];
-        row8[1] = Real(4) * l1[1] * l3[1];
-        row8[2] = Real(4) * l1[2] * l3[2];
-        row8[3] = Real(4) * l1[3] * l3[3];
-        row9[0] = Real(4) * l2[0] * l3[0];
-        row9[1] = Real(4) * l2[1] * l3[1];
-        row9[2] = Real(4) * l2[2] * l3[2];
-        row9[3] = Real(4) * l2[3] * l3[3];
-        return;
+void LagrangeBasis::init_equispaced_1d_nodes() {
+    nodes_1d_.resize(static_cast<std::size_t>(order_ + 1));
+    for (int i = 0; i <= order_; ++i) {
+        nodes_1d_[static_cast<std::size_t>(i)] =
+            equispaced_pm_one_coord(i, order_);
     }
+}
 
-    auto write_q = [&](std::size_t q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l3 = xi[2];
-        const Real l0 = Real(1) - l1 - l2 - l3;
-        row0[q] = l0 * (Real(2) * l0 - Real(1));
-        row1[q] = l1 * (Real(2) * l1 - Real(1));
-        row2[q] = l2 * (Real(2) * l2 - Real(1));
-        row3[q] = l3 * (Real(2) * l3 - Real(1));
-        row4[q] = Real(4) * l0 * l1;
-        row5[q] = Real(4) * l1 * l2;
-        row6[q] = Real(4) * l0 * l2;
-        row7[q] = Real(4) * l0 * l3;
-        row8[q] = Real(4) * l1 * l3;
-        row9[q] = Real(4) * l2 * l3;
-    };
+void LagrangeBasis::init_nodes() {
+    nodes_.clear();
+    nodes_1d_.clear();
+    tensor_indices_.clear();
+    simplex_exponents_.clear();
+    wedge_indices_.clear();
 
-    if (points.size() == 4u) {
-        write_q(0u);
-        write_q(1u);
-        write_q(2u);
-        write_q(3u);
-        return;
+    switch (topology_) {
+        case BasisTopology::Point:
+            build_point_nodes();
+            return;
+        case BasisTopology::Line:
+            build_tensor_product_nodes(1);
+            return;
+        case BasisTopology::Quadrilateral:
+            build_tensor_product_nodes(2);
+            return;
+        case BasisTopology::Hexahedron:
+            build_tensor_product_nodes(3);
+            return;
+        case BasisTopology::Triangle:
+        case BasisTopology::Tetrahedron:
+            build_simplex_nodes();
+            return;
+        case BasisTopology::Wedge:
+            build_wedge_nodes();
+            return;
+        default:
+            break;
     }
 
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        write_q(q);
-    }
+    throw BasisElementCompatibilityException("Unsupported element type in LagrangeBasis::init_nodes",
+                                             __FILE__, __LINE__, __func__);
 }
 
-inline void write_tet_order2_gradient_q(Real* SVMP_RESTRICT row,
-                                        std::size_t output_stride,
-                                        std::size_t q,
-                                        Real gx,
-                                        Real gy,
-                                        Real gz) {
-    row[0u * output_stride + q] = gx;
-    row[1u * output_stride + q] = gy;
-    row[2u * output_stride + q] = gz;
+void LagrangeBasis::build_point_nodes() {
+    nodes_.push_back(Vec3{Real(0), Real(0), Real(0)});
 }
 
-SVMP_LAGRANGE_NOINLINE void evaluate_tet_order2_gradients_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    Real* row0 = gradients_out + 0u * 3u * output_stride;
-    Real* row1 = gradients_out + 1u * 3u * output_stride;
-    Real* row2 = gradients_out + 2u * 3u * output_stride;
-    Real* row3 = gradients_out + 3u * 3u * output_stride;
-    Real* row4 = gradients_out + 4u * 3u * output_stride;
-    Real* row5 = gradients_out + 5u * 3u * output_stride;
-    Real* row6 = gradients_out + 6u * 3u * output_stride;
-    Real* row7 = gradients_out + 7u * 3u * output_stride;
-    Real* row8 = gradients_out + 8u * 3u * output_stride;
-    Real* row9 = gradients_out + 9u * 3u * output_stride;
-
-    auto write_q = [&](std::size_t q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l3 = xi[2];
-        const Real l0 = Real(1) - l1 - l2 - l3;
-        const Real four = Real(4);
-        const Real g0 = Real(1) - four * l0;
-
-        write_tet_order2_gradient_q(row0, output_stride, q, g0, g0, g0);
-        write_tet_order2_gradient_q(row1, output_stride, q, four * l1 - Real(1), Real(0), Real(0));
-        write_tet_order2_gradient_q(row2, output_stride, q, Real(0), four * l2 - Real(1), Real(0));
-        write_tet_order2_gradient_q(row3, output_stride, q, Real(0), Real(0), four * l3 - Real(1));
-        write_tet_order2_gradient_q(row4, output_stride, q, four * (l0 - l1), -four * l1, -four * l1);
-        write_tet_order2_gradient_q(row5, output_stride, q, four * l2, four * l1, Real(0));
-        write_tet_order2_gradient_q(row6, output_stride, q, -four * l2, four * (l0 - l2), -four * l2);
-        write_tet_order2_gradient_q(row7, output_stride, q, -four * l3, -four * l3, four * (l0 - l3));
-        write_tet_order2_gradient_q(row8, output_stride, q, four * l3, Real(0), four * l1);
-        write_tet_order2_gradient_q(row9, output_stride, q, Real(0), four * l3, four * l2);
-    };
-
-    if (points.size() == 4u) {
-        write_q(0u);
-        write_q(1u);
-        write_q(2u);
-        write_q(3u);
-        return;
-    }
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        write_q(q);
+void LagrangeBasis::build_tensor_product_nodes(int dimensions) {
+    init_equispaced_1d_nodes();
+    nodes_ = ReferenceNodeLayout::get_lagrange_node_coords(element_type_, order_);
+    tensor_indices_.reserve(nodes_.size());
+    for (const auto& node : nodes_) {
+        TensorNodeIndex idx{0u, 0u, 0u};
+        idx[0] = axis_index_pm_one(node[0], order_);
+        if (dimensions >= 2) {
+            idx[1] = axis_index_pm_one(node[1], order_);
+        }
+        if (dimensions >= 3) {
+            idx[2] = axis_index_pm_one(node[2], order_);
+        }
+        tensor_indices_.push_back(idx);
     }
 }
 
-inline void fill_simplex_order3_factor_values(Real lambda, Real* SVMP_RESTRICT phi) {
-    const Real t = Real(3) * lambda;
-    phi[0] = Real(1);
-    phi[1] = t;
-    phi[2] = phi[1] * (t - Real(1)) * Real(0.5);
-    phi[3] = phi[2] * (t - Real(2)) / Real(3);
-}
-
-void evaluate_tet_order3_values_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    Real* row2 = values_out + 2u * output_stride;
-    Real* row3 = values_out + 3u * output_stride;
-    Real* row4 = values_out + 4u * output_stride;
-    Real* row5 = values_out + 5u * output_stride;
-    Real* row6 = values_out + 6u * output_stride;
-    Real* row7 = values_out + 7u * output_stride;
-    Real* row8 = values_out + 8u * output_stride;
-    Real* row9 = values_out + 9u * output_stride;
-    Real* row10 = values_out + 10u * output_stride;
-    Real* row11 = values_out + 11u * output_stride;
-    Real* row12 = values_out + 12u * output_stride;
-    Real* row13 = values_out + 13u * output_stride;
-    Real* row14 = values_out + 14u * output_stride;
-    Real* row15 = values_out + 15u * output_stride;
-    Real* row16 = values_out + 16u * output_stride;
-    Real* row17 = values_out + 17u * output_stride;
-    Real* row18 = values_out + 18u * output_stride;
-    Real* row19 = values_out + 19u * output_stride;
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l3 = xi[2];
-        const Real l0 = Real(1) - l1 - l2 - l3;
-        Real p0[4];
-        Real p1[4];
-        Real p2[4];
-        Real p3[4];
-        fill_simplex_order3_factor_values(l0, p0);
-        fill_simplex_order3_factor_values(l1, p1);
-        fill_simplex_order3_factor_values(l2, p2);
-        fill_simplex_order3_factor_values(l3, p3);
-
-        row0[q] = p0[3];
-        row1[q] = p1[3];
-        row2[q] = p2[3];
-        row3[q] = p3[3];
-        row4[q] = p0[2] * p1[1];
-        row5[q] = p0[1] * p1[2];
-        row6[q] = p1[2] * p2[1];
-        row7[q] = p1[1] * p2[2];
-        row8[q] = p0[1] * p2[2];
-        row9[q] = p0[2] * p2[1];
-        row10[q] = p0[2] * p3[1];
-        row11[q] = p0[1] * p3[2];
-        row12[q] = p1[2] * p3[1];
-        row13[q] = p1[1] * p3[2];
-        row14[q] = p2[2] * p3[1];
-        row15[q] = p2[1] * p3[2];
-        row16[q] = p0[1] * p1[1] * p2[1];
-        row17[q] = p0[1] * p1[1] * p3[1];
-        row18[q] = p1[1] * p2[1] * p3[1];
-        row19[q] = p0[1] * p2[1] * p3[1];
+void LagrangeBasis::build_simplex_nodes() {
+    nodes_ = ReferenceNodeLayout::get_lagrange_node_coords(element_type_, order_);
+    simplex_exponents_.reserve(nodes_.size());
+    for (const auto& node : nodes_) {
+        simplex_exponents_.push_back(simplex_exponent_from_point(node, topology_, order_));
     }
 }
 
-void evaluate_triangle_order3_gradients_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    Real* rows[10] = {
-        gradients_out + 0u * 3u * output_stride,
-        gradients_out + 1u * 3u * output_stride,
-        gradients_out + 2u * 3u * output_stride,
-        gradients_out + 3u * 3u * output_stride,
-        gradients_out + 4u * 3u * output_stride,
-        gradients_out + 5u * 3u * output_stride,
-        gradients_out + 6u * 3u * output_stride,
-        gradients_out + 7u * 3u * output_stride,
-        gradients_out + 8u * 3u * output_stride,
-        gradients_out + 9u * 3u * output_stride,
-    };
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-
-        const Real p10 = Real(3) * l0;
-        const Real p11 = Real(3) * l1;
-        const Real p12 = Real(3) * l2;
-        const Real p20 = Real(0.5) * p10 * (p10 - Real(1));
-        const Real p21 = Real(0.5) * p11 * (p11 - Real(1));
-        const Real p22 = Real(0.5) * p12 * (p12 - Real(1));
-        const Real d10 = Real(3);
-        const Real d11 = Real(3);
-        const Real d12 = Real(3);
-        const Real d20 = Real(3) * p10 - Real(1.5);
-        const Real d21 = Real(3) * p11 - Real(1.5);
-        const Real d22 = Real(3) * p12 - Real(1.5);
-        const Real d30 = Real(1.5) * p10 * p10 - Real(3) * p10 + Real(1);
-        const Real d31 = Real(1.5) * p11 * p11 - Real(3) * p11 + Real(1);
-        const Real d32 = Real(1.5) * p12 * p12 - Real(3) * p12 + Real(1);
-
-        const Real dl0[10] = {
-            d30,
-            Real(0),
-            Real(0),
-            d20 * p11,
-            d10 * p21,
-            Real(0),
-            Real(0),
-            d10 * p22,
-            d20 * p12,
-            d10 * p11 * p12,
-        };
-        const Real dl1[10] = {
-            Real(0),
-            d31,
-            Real(0),
-            p20 * d11,
-            p10 * d21,
-            d21 * p12,
-            d11 * p22,
-            Real(0),
-            Real(0),
-            p10 * d11 * p12,
-        };
-        const Real dl2[10] = {
-            Real(0),
-            Real(0),
-            d32,
-            Real(0),
-            Real(0),
-            p21 * d12,
-            p11 * d22,
-            p10 * d22,
-            p20 * d12,
-            p10 * p11 * d12,
-        };
+void LagrangeBasis::build_wedge_nodes() {
+    init_equispaced_1d_nodes();
+    nodes_ = ReferenceNodeLayout::get_lagrange_node_coords(element_type_, order_);
+    const auto tri_nodes =
+        ReferenceNodeLayout::get_lagrange_node_coords(ElementType::Triangle3, order_);
+    simplex_exponents_.reserve(tri_nodes.size());
+    for (const auto& tri_node : tri_nodes) {
+        simplex_exponents_.push_back(
+            simplex_exponent_from_point(tri_node, BasisTopology::Triangle, order_));
+    }
 
-        for (std::size_t node = 0; node < 10u; ++node) {
-            Real* g = rows[node];
-            g[0u * output_stride + q] = dl1[node] - dl0[node];
-            g[1u * output_stride + q] = dl2[node] - dl0[node];
-            g[2u * output_stride + q] = Real(0);
+    wedge_indices_.reserve(nodes_.size());
+    for (const auto& node : nodes_) {
+        const auto tri_exp =
+            simplex_exponent_from_point(node, BasisTopology::Triangle, order_);
+        auto it = std::find(simplex_exponents_.begin(), simplex_exponents_.end(), tri_exp);
+        if (it == simplex_exponents_.end()) {
+            throw BasisConstructionException("LagrangeBasis: wedge node triangle index lookup failed",
+                                             __FILE__, __LINE__, __func__);
         }
+        const std::size_t tri_index =
+            static_cast<std::size_t>(std::distance(simplex_exponents_.begin(), it));
+        wedge_indices_.push_back({tri_index, axis_index_pm_one(node[2], order_)});
     }
 }
 
-void evaluate_hex_order1_values_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    Real* row2 = values_out + 2u * output_stride;
-    Real* row3 = values_out + 3u * output_stride;
-    Real* row4 = values_out + 4u * output_stride;
-    Real* row5 = values_out + 5u * output_stride;
-    Real* row6 = values_out + 6u * output_stride;
-    Real* row7 = values_out + 7u * output_stride;
-
-    const auto write_q = [&](std::size_t q) {
-        const auto& xi = points[q];
-        const Real lx = (Real(1) - xi[0]) * Real(0.5);
-        const Real ly = (Real(1) - xi[1]) * Real(0.5);
-        const Real lz = (Real(1) - xi[2]) * Real(0.5);
-        const Real ux = (Real(1) + xi[0]) * Real(0.5);
-        const Real uy = (Real(1) + xi[1]) * Real(0.5);
-        const Real uz = (Real(1) + xi[2]) * Real(0.5);
-        const Real lxly = lx * ly;
-        const Real uxly = ux * ly;
-        const Real uxuy = ux * uy;
-        const Real lxuy = lx * uy;
-        row0[q] = lxly * lz;
-        row1[q] = uxly * lz;
-        row2[q] = uxuy * lz;
-        row3[q] = lxuy * lz;
-        row4[q] = lxly * uz;
-        row5[q] = uxly * uz;
-        row6[q] = uxuy * uz;
-        row7[q] = lxuy * uz;
-    };
-    if (points.size() == 4u) {
-        write_q(0u);
-        write_q(1u);
-        write_q(2u);
-        write_q(3u);
+void LagrangeBasis::evaluate_all_to(const Vec3& xi,
+                                    Real* SVMP_RESTRICT values_out,
+                                    Real* SVMP_RESTRICT gradients_out,
+                                    Real* SVMP_RESTRICT hessians_out) const {
+    if (topology_ == BasisTopology::Point) {
+        if (values_out) {
+            values_out[0] = Real(1);
+        }
+        if (gradients_out) {
+            gradients_out[0] = gradients_out[1] = gradients_out[2] = Real(0);
+        }
+        if (hessians_out) {
+            std::fill_n(hessians_out, 9u, Real(0));
+        }
         return;
     }
 
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        write_q(q);
-    }
-}
-
-template <bool NeedValues, bool NeedGradients, bool NeedHessians>
-void evaluate_hex_order1_outputs_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    constexpr Real half = Real(0.5);
-    constexpr std::array<Real, 8> dx{{-half, half, half, -half, -half, half, half, -half}};
-    constexpr std::array<Real, 8> dy{{-half, -half, half, half, -half, -half, half, half}};
-    constexpr std::array<Real, 8> dz{{-half, -half, -half, -half, half, half, half, half}};
+    if (topology_ == BasisTopology::Line ||
+        topology_ == BasisTopology::Quadrilateral ||
+        topology_ == BasisTopology::Hexahedron) {
+        AxisEval ax;
+        AxisEval ay;
+        AxisEval az;
+        evaluate_1d_lagrange(xi[0], nodes_1d_, ax);
+        if (dimension_ >= 2) {
+            evaluate_1d_lagrange(xi[1], nodes_1d_, ay);
+        }
+        if (dimension_ >= 3) {
+            evaluate_1d_lagrange(xi[2], nodes_1d_, az);
+        }
 
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const Real lx = (Real(1) - xi[0]) * half;
-        const Real ly = (Real(1) - xi[1]) * half;
-        const Real lz = (Real(1) - xi[2]) * half;
-        const Real ux = (Real(1) + xi[0]) * half;
-        const Real uy = (Real(1) + xi[1]) * half;
-        const Real uz = (Real(1) + xi[2]) * half;
-        const Real xval[8] = {lx, ux, ux, lx, lx, ux, ux, lx};
-        const Real yval[8] = {ly, ly, uy, uy, ly, ly, uy, uy};
-        const Real zval[8] = {lz, lz, lz, lz, uz, uz, uz, uz};
+        for (std::size_t node = 0; node < tensor_indices_.size(); ++node) {
+            const auto& idx = tensor_indices_[node];
+            const Real vx = ax.value[idx[0]];
+            const Real dx = ax.first[idx[0]];
+            const Real d2x = ax.second[idx[0]];
+            const Real vy = dimension_ >= 2 ? ay.value[idx[1]] : Real(1);
+            const Real dy = dimension_ >= 2 ? ay.first[idx[1]] : Real(0);
+            const Real d2y = dimension_ >= 2 ? ay.second[idx[1]] : Real(0);
+            const Real vz = dimension_ >= 3 ? az.value[idx[2]] : Real(1);
+            const Real dz = dimension_ >= 3 ? az.first[idx[2]] : Real(0);
+            const Real d2z = dimension_ >= 3 ? az.second[idx[2]] : Real(0);
 
-        for (std::size_t node = 0; node < 8u; ++node) {
-            if constexpr (NeedValues) {
-                values_out[node * output_stride + q] =
-                    xval[node] * yval[node] * zval[node];
+            if (values_out) {
+                values_out[node] = vx * vy * vz;
             }
-            if constexpr (NeedGradients) {
-                Real* SVMP_RESTRICT g = gradients_out + node * 3u * output_stride;
-                g[0u * output_stride + q] = dx[node] * yval[node] * zval[node];
-                g[1u * output_stride + q] = xval[node] * dy[node] * zval[node];
-                g[2u * output_stride + q] = xval[node] * yval[node] * dz[node];
+            if (gradients_out) {
+                Real* g = gradients_out + node * 3u;
+                g[0] = dx * vy * vz;
+                g[1] = vx * dy * vz;
+                g[2] = vx * vy * dz;
             }
-            if constexpr (NeedHessians) {
-                Real* SVMP_RESTRICT H = hessians_out + node * 9u * output_stride;
-                const Real hxy = dx[node] * dy[node] * zval[node];
-                const Real hxz = dx[node] * yval[node] * dz[node];
-                const Real hyz = xval[node] * dy[node] * dz[node];
-                H[0u * output_stride + q] = Real(0);
-                H[1u * output_stride + q] = hxy;
-                H[2u * output_stride + q] = hxz;
-                H[3u * output_stride + q] = hxy;
-                H[4u * output_stride + q] = Real(0);
-                H[5u * output_stride + q] = hyz;
-                H[6u * output_stride + q] = hxz;
-                H[7u * output_stride + q] = hyz;
-                H[8u * output_stride + q] = Real(0);
+            if (hessians_out) {
+                Real* h = hessians_out + node * 9u;
+                h[0] = d2x * vy * vz;
+                h[1] = dx * dy * vz;
+                h[2] = dx * vy * dz;
+                h[3] = h[1];
+                h[4] = vx * d2y * vz;
+                h[5] = vx * dy * dz;
+                h[6] = h[2];
+                h[7] = h[5];
+                h[8] = vx * vy * d2z;
             }
         }
+        return;
     }
-}
-
-void evaluate_quad_order1_values_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    Real* row2 = values_out + 2u * output_stride;
-    Real* row3 = values_out + 3u * output_stride;
 
-    if (points.size() == 4u && output_stride == 4u) {
-        Real lx[4];
-        Real ux[4];
-        Real ly[4];
-        Real uy[4];
-        for (std::size_t q = 0; q < 4u; ++q) {
-            const auto& xi = points[q];
-            lx[q] = (Real(1) - xi[0]) * Real(0.5);
-            ux[q] = (Real(1) + xi[0]) * Real(0.5);
-            ly[q] = (Real(1) - xi[1]) * Real(0.5);
-            uy[q] = (Real(1) + xi[1]) * Real(0.5);
+    if (topology_ == BasisTopology::Triangle || topology_ == BasisTopology::Tetrahedron) {
+        SimplexEval simplex;
+        evaluate_simplex(xi, topology_, order_, simplex_exponents_, simplex);
+        for (std::size_t i = 0; i < simplex.value.size(); ++i) {
+            if (values_out) {
+                values_out[i] = simplex.value[i];
+            }
+            if (gradients_out) {
+                store_gradient(simplex.gradient[i], gradients_out + i * 3u);
+            }
+            if (hessians_out) {
+                store_hessian(simplex.hessian[i], hessians_out + i * 9u);
+            }
         }
-        row0[0] = lx[0] * ly[0];
-        row0[1] = lx[1] * ly[1];
-        row0[2] = lx[2] * ly[2];
-        row0[3] = lx[3] * ly[3];
-        row1[0] = ux[0] * ly[0];
-        row1[1] = ux[1] * ly[1];
-        row1[2] = ux[2] * ly[2];
-        row1[3] = ux[3] * ly[3];
-        row2[0] = ux[0] * uy[0];
-        row2[1] = ux[1] * uy[1];
-        row2[2] = ux[2] * uy[2];
-        row2[3] = ux[3] * uy[3];
-        row3[0] = lx[0] * uy[0];
-        row3[1] = lx[1] * uy[1];
-        row3[2] = lx[2] * uy[2];
-        row3[3] = lx[3] * uy[3];
         return;
     }
 
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const Real lx = (Real(1) - xi[0]) * Real(0.5);
-        const Real ly = (Real(1) - xi[1]) * Real(0.5);
-        const Real ux = (Real(1) + xi[0]) * Real(0.5);
-        const Real uy = (Real(1) + xi[1]) * Real(0.5);
-        row0[q] = lx * ly;
-        row1[q] = ux * ly;
-        row2[q] = ux * uy;
-        row3[q] = lx * uy;
-    }
-}
+    if (topology_ == BasisTopology::Wedge) {
+        SimplexEval tri;
+        AxisEval z_axis;
+        evaluate_simplex(xi, BasisTopology::Triangle, order_, simplex_exponents_, tri);
+        evaluate_1d_lagrange(xi[2], nodes_1d_, z_axis);
 
-void evaluate_quad_order1_gradients_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    Real* row0 = gradients_out + 0u * 3u * output_stride;
-    Real* row1 = gradients_out + 1u * 3u * output_stride;
-    Real* row2 = gradients_out + 2u * 3u * output_stride;
-    Real* row3 = gradients_out + 3u * 3u * output_stride;
+        for (std::size_t node = 0; node < wedge_indices_.size(); ++node) {
+            const auto [tri_idx, z_idx] = wedge_indices_[node];
+            const Real tv = tri.value[tri_idx];
+            const Real zv = z_axis.value[z_idx];
+            const Real dz = z_axis.first[z_idx];
+            const Real d2z = z_axis.second[z_idx];
 
-    if (points.size() == 4u) {
-        Real lx[4];
-        Real ly[4];
-        Real ux[4];
-        Real uy[4];
-        for (std::size_t q = 0; q < 4u; ++q) {
-            const auto& xi = points[q];
-            lx[q] = (Real(1) - xi[0]) * Real(0.5);
-            ly[q] = (Real(1) - xi[1]) * Real(0.5);
-            ux[q] = (Real(1) + xi[0]) * Real(0.5);
-            uy[q] = (Real(1) + xi[1]) * Real(0.5);
+            if (values_out) {
+                values_out[node] = tv * zv;
+            }
+            if (gradients_out) {
+                Real* g = gradients_out + node * 3u;
+                g[0] = tri.gradient[tri_idx][0] * zv;
+                g[1] = tri.gradient[tri_idx][1] * zv;
+                g[2] = tv * dz;
+            }
+            if (hessians_out) {
+                Real* h = hessians_out + node * 9u;
+                const Hessian& th = tri.hessian[tri_idx];
+                const Gradient& tg = tri.gradient[tri_idx];
+                h[0] = th(0, 0) * zv;
+                h[1] = th(0, 1) * zv;
+                h[2] = tg[0] * dz;
+                h[3] = h[1];
+                h[4] = th(1, 1) * zv;
+                h[5] = tg[1] * dz;
+                h[6] = h[2];
+                h[7] = h[5];
+                h[8] = tv * d2z;
+            }
         }
-
-        auto write_component = [](Real* SVMP_RESTRICT row,
-                                  Real a0,
-                                  Real a1,
-                                  Real a2,
-                                  Real a3) {
-            row[0] = a0;
-            row[1] = a1;
-            row[2] = a2;
-            row[3] = a3;
-        };
-
-        write_component(row0, Real(-0.5) * ly[0], Real(-0.5) * ly[1],
-                        Real(-0.5) * ly[2], Real(-0.5) * ly[3]);
-        write_component(row0 + output_stride, Real(-0.5) * lx[0], Real(-0.5) * lx[1],
-                        Real(-0.5) * lx[2], Real(-0.5) * lx[3]);
-        write_component(row0 + 2u * output_stride, Real(0), Real(0), Real(0), Real(0));
-
-        write_component(row1, Real(0.5) * ly[0], Real(0.5) * ly[1],
-                        Real(0.5) * ly[2], Real(0.5) * ly[3]);
-        write_component(row1 + output_stride, Real(-0.5) * ux[0], Real(-0.5) * ux[1],
-                        Real(-0.5) * ux[2], Real(-0.5) * ux[3]);
-        write_component(row1 + 2u * output_stride, Real(0), Real(0), Real(0), Real(0));
-
-        write_component(row2, Real(0.5) * uy[0], Real(0.5) * uy[1],
-                        Real(0.5) * uy[2], Real(0.5) * uy[3]);
-        write_component(row2 + output_stride, Real(0.5) * ux[0], Real(0.5) * ux[1],
-                        Real(0.5) * ux[2], Real(0.5) * ux[3]);
-        write_component(row2 + 2u * output_stride, Real(0), Real(0), Real(0), Real(0));
-
-        write_component(row3, Real(-0.5) * uy[0], Real(-0.5) * uy[1],
-                        Real(-0.5) * uy[2], Real(-0.5) * uy[3]);
-        write_component(row3 + output_stride, Real(0.5) * lx[0], Real(0.5) * lx[1],
-                        Real(0.5) * lx[2], Real(0.5) * lx[3]);
-        write_component(row3 + 2u * output_stride, Real(0), Real(0), Real(0), Real(0));
         return;
     }
 
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const Real lx = (Real(1) - xi[0]) * Real(0.5);
-        const Real ly = (Real(1) - xi[1]) * Real(0.5);
-        const Real ux = (Real(1) + xi[0]) * Real(0.5);
-        const Real uy = (Real(1) + xi[1]) * Real(0.5);
-        row0[0u * output_stride + q] = Real(-0.5) * ly;
-        row0[1u * output_stride + q] = Real(-0.5) * lx;
-        row0[2u * output_stride + q] = Real(0);
-        row1[0u * output_stride + q] = Real( 0.5) * ly;
-        row1[1u * output_stride + q] = Real(-0.5) * ux;
-        row1[2u * output_stride + q] = Real(0);
-        row2[0u * output_stride + q] = Real( 0.5) * uy;
-        row2[1u * output_stride + q] = Real( 0.5) * ux;
-        row2[2u * output_stride + q] = Real(0);
-        row3[0u * output_stride + q] = Real(-0.5) * uy;
-        row3[1u * output_stride + q] = Real( 0.5) * lx;
-        row3[2u * output_stride + q] = Real(0);
-    }
+    throw BasisEvaluationException("Unsupported element in LagrangeBasis evaluation",
+                                   __FILE__, __LINE__, __func__);
 }
 
-inline void write_quad_order1_hessian_q(
-    Real* SVMP_RESTRICT row,
-    std::size_t output_stride,
-    std::size_t q,
-    Real xy) {
-    row[0u * output_stride + q] = Real(0);
-    row[1u * output_stride + q] = xy;
-    row[2u * output_stride + q] = Real(0);
-    row[3u * output_stride + q] = xy;
-    row[4u * output_stride + q] = Real(0);
-    row[5u * output_stride + q] = Real(0);
-    row[6u * output_stride + q] = Real(0);
-    row[7u * output_stride + q] = Real(0);
-    row[8u * output_stride + q] = Real(0);
+void LagrangeBasis::evaluate_values(const Vec3& xi,
+                                    std::vector<Real>& values) const {
+    values.resize(size());
+    evaluate_values_to(xi, values.data());
 }
 
-void evaluate_quad_order1_hessians_strided(
-    std::size_t num_qpts,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT hessians_out) {
-    Real* row0 = hessians_out + 0u * 9u * output_stride;
-    Real* row1 = hessians_out + 1u * 9u * output_stride;
-    Real* row2 = hessians_out + 2u * 9u * output_stride;
-    Real* row3 = hessians_out + 3u * 9u * output_stride;
-
-    constexpr Real positive = Real(0.25);
-    constexpr Real negative = Real(-0.25);
-    for (std::size_t q = 0; q < num_qpts; ++q) {
-        write_quad_order1_hessian_q(row0, output_stride, q, positive);
-        write_quad_order1_hessian_q(row1, output_stride, q, negative);
-        write_quad_order1_hessian_q(row2, output_stride, q, positive);
-        write_quad_order1_hessian_q(row3, output_stride, q, negative);
+void LagrangeBasis::evaluate_gradients(const Vec3& xi,
+                                       std::vector<Gradient>& gradients) const {
+    gradients.resize(size());
+    std::vector<Real> flat(size() * 3u, Real(0));
+    evaluate_gradients_to(xi, flat.data());
+    for (std::size_t i = 0; i < size(); ++i) {
+        gradients[i][0] = flat[i * 3u + 0u];
+        gradients[i][1] = flat[i * 3u + 1u];
+        gradients[i][2] = flat[i * 3u + 2u];
     }
 }
 
-template <std::size_t Q>
-inline void write_quad_order1_all_q4(
-    std::size_t output_stride,
-    std::size_t i,
-    std::size_t j,
-    const Real lx[4][2],
-    const Real ly[4][2],
-    Real* SVMP_RESTRICT value_row,
-    Real* SVMP_RESTRICT grad_row,
-    Real* SVMP_RESTRICT hess_row) {
-    const Real xv = lx[Q][i];
-    const Real yv = ly[Q][j];
-    const Real xd = (i == 0u) ? Real(-0.5) : Real(0.5);
-    const Real yd = (j == 0u) ? Real(-0.5) : Real(0.5);
-    const Real hxy = xd * yd;
-
-    value_row[Q] = xv * yv;
-    grad_row[0u * output_stride + Q] = xd * yv;
-    grad_row[1u * output_stride + Q] = xv * yd;
-    grad_row[2u * output_stride + Q] = Real(0);
-    hess_row[0u * output_stride + Q] = Real(0);
-    hess_row[4u * output_stride + Q] = Real(0);
-    hess_row[8u * output_stride + Q] = Real(0);
-    hess_row[1u * output_stride + Q] = hxy;
-    hess_row[3u * output_stride + Q] = hxy;
-    hess_row[2u * output_stride + Q] = Real(0);
-    hess_row[6u * output_stride + Q] = Real(0);
-    hess_row[5u * output_stride + Q] = Real(0);
-    hess_row[7u * output_stride + Q] = Real(0);
-}
-
-void evaluate_quad_order1_all_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    constexpr auto node_axes = detail::make_quad_tensor_node_axes<1>();
-
-    Real lx[4][2];
-    Real ly[4][2];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        lx[q][0] = (Real(1) - xi[0]) * Real(0.5);
-        lx[q][1] = (Real(1) + xi[0]) * Real(0.5);
-        ly[q][0] = (Real(1) - xi[1]) * Real(0.5);
-        ly[q][1] = (Real(1) + xi[1]) * Real(0.5);
-    }
-
-    for (std::size_t node = 0; node < node_axes.size(); ++node) {
-        const auto& axes = node_axes[node];
-        const std::size_t i = axes[0];
-        const std::size_t j = axes[1];
-        Real* value_row = values_out + node * output_stride;
-        Real* grad_row = gradients_out + node * 3u * output_stride;
-        Real* hess_row = hessians_out + node * 9u * output_stride;
-        write_quad_order1_all_q4<0u>(
-            output_stride, i, j, lx, ly, value_row, grad_row, hess_row);
-        write_quad_order1_all_q4<1u>(
-            output_stride, i, j, lx, ly, value_row, grad_row, hess_row);
-        write_quad_order1_all_q4<2u>(
-            output_stride, i, j, lx, ly, value_row, grad_row, hess_row);
-        write_quad_order1_all_q4<3u>(
-            output_stride, i, j, lx, ly, value_row, grad_row, hess_row);
+void LagrangeBasis::evaluate_hessians(const Vec3& xi,
+                                      std::vector<Hessian>& hessians) const {
+    hessians.resize(size());
+    std::vector<Real> flat(size() * 9u, Real(0));
+    evaluate_hessians_to(xi, flat.data());
+    for (std::size_t i = 0; i < size(); ++i) {
+        hessians[i] = load_hessian(flat.data() + i * 9u);
     }
 }
 
-void evaluate_quad_order2_values_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    Real* row2 = values_out + 2u * output_stride;
-    Real* row3 = values_out + 3u * output_stride;
-    Real* row4 = values_out + 4u * output_stride;
-    Real* row5 = values_out + 5u * output_stride;
-    Real* row6 = values_out + 6u * output_stride;
-    Real* row7 = values_out + 7u * output_stride;
-    Real* row8 = values_out + 8u * output_stride;
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const Real x = xi[0];
-        const Real y = xi[1];
-        const Real x0 = x * (x - Real(1)) * Real(0.5);
-        const Real x1 = x * (x + Real(1)) * Real(0.5);
-        const Real x2 = Real(1) - x * x;
-        const Real y0 = y * (y - Real(1)) * Real(0.5);
-        const Real y1 = y * (y + Real(1)) * Real(0.5);
-        const Real y2 = Real(1) - y * y;
-
-        row0[q] = x0 * y0;
-        row1[q] = x1 * y0;
-        row2[q] = x1 * y1;
-        row3[q] = x0 * y1;
-        row4[q] = x2 * y0;
-        row5[q] = x1 * y2;
-        row6[q] = x2 * y1;
-        row7[q] = x0 * y2;
-        row8[q] = x2 * y2;
+void LagrangeBasis::evaluate_all(const Vec3& xi,
+                                 std::vector<Real>& values,
+                                 std::vector<Gradient>& gradients,
+                                 std::vector<Hessian>& hessians) const {
+    values.resize(size());
+    gradients.resize(size());
+    hessians.resize(size());
+    std::vector<Real> flat_g(size() * 3u, Real(0));
+    std::vector<Real> flat_h(size() * 9u, Real(0));
+    evaluate_all_to(xi, values.data(), flat_g.data(), flat_h.data());
+    for (std::size_t i = 0; i < size(); ++i) {
+        gradients[i][0] = flat_g[i * 3u + 0u];
+        gradients[i][1] = flat_g[i * 3u + 1u];
+        gradients[i][2] = flat_g[i * 3u + 2u];
+        hessians[i] = load_hessian(flat_h.data() + i * 9u);
     }
 }
 
-inline void write_quad_order2_gradient_q(
-    Real* SVMP_RESTRICT row,
-    std::size_t output_stride,
-    std::size_t q,
-    Real dx,
-    Real dy) {
-    row[0u * output_stride + q] = dx;
-    row[1u * output_stride + q] = dy;
-    row[2u * output_stride + q] = Real(0);
+void LagrangeBasis::evaluate_values_to(const Vec3& xi,
+                                       Real* SVMP_RESTRICT values_out) const {
+    evaluate_all_to(xi, values_out, nullptr, nullptr);
 }
 
-void evaluate_quad_order2_gradients_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    if (points.size() == 4u) {
-        Real xv[4][3];
-        Real yv[4][3];
-        Real xd[4][3];
-        Real yd[4][3];
-        for (std::size_t q = 0; q < 4u; ++q) {
-            const auto& xi = points[q];
-            const Real x = xi[0];
-            const Real y = xi[1];
-            xv[q][0] = x * (x - Real(1)) * Real(0.5);
-            xv[q][1] = x * (x + Real(1)) * Real(0.5);
-            xv[q][2] = Real(1) - x * x;
-            yv[q][0] = y * (y - Real(1)) * Real(0.5);
-            yv[q][1] = y * (y + Real(1)) * Real(0.5);
-            yv[q][2] = Real(1) - y * y;
-            xd[q][0] = x - Real(0.5);
-            xd[q][1] = x + Real(0.5);
-            xd[q][2] = Real(-2) * x;
-            yd[q][0] = y - Real(0.5);
-            yd[q][1] = y + Real(0.5);
-            yd[q][2] = Real(-2) * y;
-        }
-
-        auto write_node = [&](std::size_t node, std::size_t i, std::size_t j) {
-            Real* SVMP_RESTRICT row = gradients_out + node * 3u * output_stride;
-            row[0u] = xd[0][i] * yv[0][j];
-            row[1u] = xd[1][i] * yv[1][j];
-            row[2u] = xd[2][i] * yv[2][j];
-            row[3u] = xd[3][i] * yv[3][j];
-            row[output_stride + 0u] = xv[0][i] * yd[0][j];
-            row[output_stride + 1u] = xv[1][i] * yd[1][j];
-            row[output_stride + 2u] = xv[2][i] * yd[2][j];
-            row[output_stride + 3u] = xv[3][i] * yd[3][j];
-            row[2u * output_stride + 0u] = Real(0);
-            row[2u * output_stride + 1u] = Real(0);
-            row[2u * output_stride + 2u] = Real(0);
-            row[2u * output_stride + 3u] = Real(0);
-        };
-
-        write_node(0u, 0u, 0u);
-        write_node(1u, 1u, 0u);
-        write_node(2u, 1u, 1u);
-        write_node(3u, 0u, 1u);
-        write_node(4u, 2u, 0u);
-        write_node(5u, 1u, 2u);
-        write_node(6u, 2u, 1u);
-        write_node(7u, 0u, 2u);
-        write_node(8u, 2u, 2u);
-        return;
-    }
-
-    Real* row0 = gradients_out + 0u * 3u * output_stride;
-    Real* row1 = gradients_out + 1u * 3u * output_stride;
-    Real* row2 = gradients_out + 2u * 3u * output_stride;
-    Real* row3 = gradients_out + 3u * 3u * output_stride;
-    Real* row4 = gradients_out + 4u * 3u * output_stride;
-    Real* row5 = gradients_out + 5u * 3u * output_stride;
-    Real* row6 = gradients_out + 6u * 3u * output_stride;
-    Real* row7 = gradients_out + 7u * 3u * output_stride;
-    Real* row8 = gradients_out + 8u * 3u * output_stride;
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const Real x = xi[0];
-        const Real y = xi[1];
-        const Real x0 = x * (x - Real(1)) * Real(0.5);
-        const Real x1 = x * (x + Real(1)) * Real(0.5);
-        const Real x2 = Real(1) - x * x;
-        const Real y0 = y * (y - Real(1)) * Real(0.5);
-        const Real y1 = y * (y + Real(1)) * Real(0.5);
-        const Real y2 = Real(1) - y * y;
-        const Real dx0 = x - Real(0.5);
-        const Real dx1 = x + Real(0.5);
-        const Real dx2 = Real(-2) * x;
-        const Real dy0 = y - Real(0.5);
-        const Real dy1 = y + Real(0.5);
-        const Real dy2 = Real(-2) * y;
+void LagrangeBasis::evaluate_gradients_to(const Vec3& xi,
+                                          Real* SVMP_RESTRICT gradients_out) const {
+    evaluate_all_to(xi, nullptr, gradients_out, nullptr);
+}
 
-        write_quad_order2_gradient_q(row0, output_stride, q, dx0 * y0, x0 * dy0);
-        write_quad_order2_gradient_q(row1, output_stride, q, dx1 * y0, x1 * dy0);
-        write_quad_order2_gradient_q(row2, output_stride, q, dx1 * y1, x1 * dy1);
-        write_quad_order2_gradient_q(row3, output_stride, q, dx0 * y1, x0 * dy1);
-        write_quad_order2_gradient_q(row4, output_stride, q, dx2 * y0, x2 * dy0);
-        write_quad_order2_gradient_q(row5, output_stride, q, dx1 * y2, x1 * dy2);
-        write_quad_order2_gradient_q(row6, output_stride, q, dx2 * y1, x2 * dy1);
-        write_quad_order2_gradient_q(row7, output_stride, q, dx0 * y2, x0 * dy2);
-        write_quad_order2_gradient_q(row8, output_stride, q, dx2 * y2, x2 * dy2);
-    }
-}
-
-inline void write_quad_order2_hessian_q(
-    Real* SVMP_RESTRICT row,
-    std::size_t output_stride,
-    std::size_t q,
-    Real hxx,
-    Real hxy,
-    Real hyy) {
-    row[0u * output_stride + q] = hxx;
-    row[1u * output_stride + q] = hxy;
-    row[2u * output_stride + q] = Real(0);
-    row[3u * output_stride + q] = hxy;
-    row[4u * output_stride + q] = hyy;
-    row[5u * output_stride + q] = Real(0);
-    row[6u * output_stride + q] = Real(0);
-    row[7u * output_stride + q] = Real(0);
-    row[8u * output_stride + q] = Real(0);
-}
-
-void evaluate_quad_order2_hessians_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT hessians_out) {
-    Real* row0 = hessians_out + 0u * 9u * output_stride;
-    Real* row1 = hessians_out + 1u * 9u * output_stride;
-    Real* row2 = hessians_out + 2u * 9u * output_stride;
-    Real* row3 = hessians_out + 3u * 9u * output_stride;
-    Real* row4 = hessians_out + 4u * 9u * output_stride;
-    Real* row5 = hessians_out + 5u * 9u * output_stride;
-    Real* row6 = hessians_out + 6u * 9u * output_stride;
-    Real* row7 = hessians_out + 7u * 9u * output_stride;
-    Real* row8 = hessians_out + 8u * 9u * output_stride;
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const Real x = xi[0];
-        const Real y = xi[1];
-        const Real x0 = x * (x - Real(1)) * Real(0.5);
-        const Real x1 = x * (x + Real(1)) * Real(0.5);
-        const Real x2 = Real(1) - x * x;
-        const Real y0 = y * (y - Real(1)) * Real(0.5);
-        const Real y1 = y * (y + Real(1)) * Real(0.5);
-        const Real y2 = Real(1) - y * y;
-        const Real dx0 = x - Real(0.5);
-        const Real dx1 = x + Real(0.5);
-        const Real dx2 = Real(-2) * x;
-        const Real dy0 = y - Real(0.5);
-        const Real dy1 = y + Real(0.5);
-        const Real dy2 = Real(-2) * y;
-
-        write_quad_order2_hessian_q(row0, output_stride, q, y0, dx0 * dy0, x0);
-        write_quad_order2_hessian_q(row1, output_stride, q, y0, dx1 * dy0, x1);
-        write_quad_order2_hessian_q(row2, output_stride, q, y1, dx1 * dy1, x1);
-        write_quad_order2_hessian_q(row3, output_stride, q, y1, dx0 * dy1, x0);
-        write_quad_order2_hessian_q(row4, output_stride, q, Real(-2) * y0, dx2 * dy0, x2);
-        write_quad_order2_hessian_q(row5, output_stride, q, y2, dx1 * dy2, Real(-2) * x1);
-        write_quad_order2_hessian_q(row6, output_stride, q, Real(-2) * y1, dx2 * dy1, x2);
-        write_quad_order2_hessian_q(row7, output_stride, q, y2, dx0 * dy2, Real(-2) * x0);
-        write_quad_order2_hessian_q(row8, output_stride, q, Real(-2) * y2, dx2 * dy2, Real(-2) * x2);
-    }
-}
-
-inline void fill_order3_axis_values(Real x, Real* SVMP_RESTRICT values) {
-    const Real x2 = x * x;
-    values[0] = Real(-9.0 / 16.0) * (x - Real(1)) * (x2 - Real(1.0 / 9.0));
-    values[1] = Real( 9.0 / 16.0) * (x + Real(1)) * (x2 - Real(1.0 / 9.0));
-    values[2] = Real(27.0 / 16.0) * (x2 - Real(1)) * (x - Real(1.0 / 3.0));
-    values[3] = Real(-27.0 / 16.0) * (x2 - Real(1)) * (x + Real(1.0 / 3.0));
-}
-
-inline void fill_order3_axis_value_scalars(Real x,
-                                           Real& v0,
-                                           Real& v1,
-                                           Real& v2,
-                                           Real& v3) {
-    const Real x2 = x * x;
-    v0 = Real(-9.0 / 16.0) * (x - Real(1)) * (x2 - Real(1.0 / 9.0));
-    v1 = Real( 9.0 / 16.0) * (x + Real(1)) * (x2 - Real(1.0 / 9.0));
-    v2 = Real(27.0 / 16.0) * (x2 - Real(1)) * (x - Real(1.0 / 3.0));
-    v3 = Real(-27.0 / 16.0) * (x2 - Real(1)) * (x + Real(1.0 / 3.0));
-}
-
-void evaluate_line_order1_values_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const Real x = points[q][0];
-        row0[q] = (Real(1) - x) * Real(0.5);
-        row1[q] = (Real(1) + x) * Real(0.5);
-    }
-}
-
-void evaluate_line_order2_values_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    Real* row2 = values_out + 2u * output_stride;
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const Real x = points[q][0];
-        row0[q] = x * (x - Real(1)) * Real(0.5);
-        row1[q] = x * (x + Real(1)) * Real(0.5);
-        row2[q] = Real(1) - x * x;
-    }
-}
-
-void evaluate_line_order3_values_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    Real* row2 = values_out + 2u * output_stride;
-    Real* row3 = values_out + 3u * output_stride;
-    for (std::size_t q = 0; q < 4u; ++q) {
-        Real values[4];
-        fill_order3_axis_values(points[q][0], values);
-        row0[q] = values[0];
-        row1[q] = values[1];
-        row2[q] = values[2];
-        row3[q] = values[3];
-    }
-}
-
-inline void fill_order3_axis_values_first(Real x,
-                                          Real* SVMP_RESTRICT values,
-                                          Real* SVMP_RESTRICT first);
-
-inline void fill_order3_axis_values_first_second(Real x,
-                                                 Real* SVMP_RESTRICT values,
-                                                 Real* SVMP_RESTRICT first,
-                                                 Real* SVMP_RESTRICT second);
-
-inline void write_line_gradient_q4_row(Real* SVMP_RESTRICT row,
-                                       std::size_t output_stride,
-                                       Real g0,
-                                       Real g1,
-                                       Real g2,
-                                       Real g3) {
-    row[0] = g0;
-    row[1] = g1;
-    row[2] = g2;
-    row[3] = g3;
-    row[output_stride + 0u] = Real(0);
-    row[output_stride + 1u] = Real(0);
-    row[output_stride + 2u] = Real(0);
-    row[output_stride + 3u] = Real(0);
-    row[2u * output_stride + 0u] = Real(0);
-    row[2u * output_stride + 1u] = Real(0);
-    row[2u * output_stride + 2u] = Real(0);
-    row[2u * output_stride + 3u] = Real(0);
-}
-
-inline void write_line_hessian_q4_row(Real* SVMP_RESTRICT row,
-                                      std::size_t output_stride,
-                                      Real h0,
-                                      Real h1,
-                                      Real h2,
-                                      Real h3) {
-    row[0] = h0;
-    row[1] = h1;
-    row[2] = h2;
-    row[3] = h3;
-    for (std::size_t component = 1u; component < 9u; ++component) {
-        Real* slot = row + component * output_stride;
-        slot[0] = Real(0);
-        slot[1] = Real(0);
-        slot[2] = Real(0);
-        slot[3] = Real(0);
-    }
-}
-
-SVMP_LAGRANGE_NOINLINE void evaluate_line_order1_gradients_q4(
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    write_line_gradient_q4_row(gradients_out + 0u * 3u * output_stride,
-                               output_stride,
-                               Real(-0.5), Real(-0.5), Real(-0.5), Real(-0.5));
-    write_line_gradient_q4_row(gradients_out + 1u * 3u * output_stride,
-                               output_stride,
-                               Real(0.5), Real(0.5), Real(0.5), Real(0.5));
-}
-
-SVMP_LAGRANGE_NOINLINE void evaluate_line_order1_hessians_q4(
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT hessians_out) {
-    write_line_hessian_q4_row(hessians_out + 0u * 9u * output_stride,
-                              output_stride, Real(0), Real(0), Real(0), Real(0));
-    write_line_hessian_q4_row(hessians_out + 1u * 9u * output_stride,
-                              output_stride, Real(0), Real(0), Real(0), Real(0));
-}
-
-SVMP_LAGRANGE_NOINLINE void evaluate_line_order1_all_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    evaluate_line_order1_values_q4(points, output_stride, values_out);
-    evaluate_line_order1_gradients_q4(output_stride, gradients_out);
-    evaluate_line_order1_hessians_q4(output_stride, hessians_out);
-}
-
-SVMP_LAGRANGE_NOINLINE void evaluate_line_order2_gradients_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    const Real x0 = points[0][0];
-    const Real x1 = points[1][0];
-    const Real x2 = points[2][0];
-    const Real x3 = points[3][0];
-    write_line_gradient_q4_row(gradients_out + 0u * 3u * output_stride,
-                               output_stride,
-                               x0 - Real(0.5), x1 - Real(0.5),
-                               x2 - Real(0.5), x3 - Real(0.5));
-    write_line_gradient_q4_row(gradients_out + 1u * 3u * output_stride,
-                               output_stride,
-                               x0 + Real(0.5), x1 + Real(0.5),
-                               x2 + Real(0.5), x3 + Real(0.5));
-    write_line_gradient_q4_row(gradients_out + 2u * 3u * output_stride,
-                               output_stride,
-                               Real(-2) * x0, Real(-2) * x1,
-                               Real(-2) * x2, Real(-2) * x3);
-}
-
-SVMP_LAGRANGE_NOINLINE void evaluate_line_order2_hessians_q4(
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT hessians_out) {
-    write_line_hessian_q4_row(hessians_out + 0u * 9u * output_stride,
-                              output_stride, Real(1), Real(1), Real(1), Real(1));
-    write_line_hessian_q4_row(hessians_out + 1u * 9u * output_stride,
-                              output_stride, Real(1), Real(1), Real(1), Real(1));
-    write_line_hessian_q4_row(hessians_out + 2u * 9u * output_stride,
-                              output_stride, Real(-2), Real(-2), Real(-2), Real(-2));
-}
-
-SVMP_LAGRANGE_NOINLINE void evaluate_line_order2_all_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    evaluate_line_order2_values_q4(points, output_stride, values_out);
-    evaluate_line_order2_gradients_q4(points, output_stride, gradients_out);
-    evaluate_line_order2_hessians_q4(output_stride, hessians_out);
-}
-
-SVMP_LAGRANGE_NOINLINE void evaluate_line_order3_gradients_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    Real first[4][4];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        Real values[4];
-        fill_order3_axis_values_first(points[q][0], values, first[q]);
-    }
-    for (std::size_t node = 0; node < 4u; ++node) {
-        write_line_gradient_q4_row(gradients_out + node * 3u * output_stride,
-                                   output_stride,
-                                   first[0][node], first[1][node],
-                                   first[2][node], first[3][node]);
-    }
-}
-
-SVMP_LAGRANGE_NOINLINE void evaluate_line_order3_hessians_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT hessians_out) {
-    Real second[4][4];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        Real values[4];
-        Real first[4];
-        fill_order3_axis_values_first_second(points[q][0], values, first, second[q]);
-    }
-    for (std::size_t node = 0; node < 4u; ++node) {
-        write_line_hessian_q4_row(hessians_out + node * 9u * output_stride,
-                                  output_stride,
-                                  second[0][node], second[1][node],
-                                  second[2][node], second[3][node]);
-    }
-}
-
-SVMP_LAGRANGE_NOINLINE void evaluate_line_order3_all_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    Real values[4][4];
-    Real first[4][4];
-    Real second[4][4];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        fill_order3_axis_values_first_second(points[q][0], values[q], first[q], second[q]);
-    }
-    for (std::size_t node = 0; node < 4u; ++node) {
-        Real* value_row = values_out + node * output_stride;
-        value_row[0] = values[0][node];
-        value_row[1] = values[1][node];
-        value_row[2] = values[2][node];
-        value_row[3] = values[3][node];
-        write_line_gradient_q4_row(gradients_out + node * 3u * output_stride,
-                                   output_stride,
-                                   first[0][node], first[1][node],
-                                   first[2][node], first[3][node]);
-        write_line_hessian_q4_row(hessians_out + node * 9u * output_stride,
-                                  output_stride,
-                                  second[0][node], second[1][node],
-                                  second[2][node], second[3][node]);
-    }
-}
-
-inline void fill_order3_axis_values_first(Real x,
-                                          Real* SVMP_RESTRICT values,
-                                          Real* SVMP_RESTRICT first) {
-    fill_order3_axis_values(x, values);
-    const Real x2 = x * x;
-    first[0] = Real(-9.0 / 16.0) * (Real(3) * x2 - Real(2) * x - Real(1.0 / 9.0));
-    first[1] = Real( 9.0 / 16.0) * (Real(3) * x2 + Real(2) * x - Real(1.0 / 9.0));
-    first[2] = Real(27.0 / 16.0) * (Real(3) * x2 - Real(2.0 / 3.0) * x - Real(1));
-    first[3] = Real(-27.0 / 16.0) * (Real(3) * x2 + Real(2.0 / 3.0) * x - Real(1));
-}
-
-inline void fill_order3_axis_values_first_second(Real x,
-                                                 Real* SVMP_RESTRICT values,
-                                                 Real* SVMP_RESTRICT first,
-                                                 Real* SVMP_RESTRICT second) {
-    fill_order3_axis_values_first(x, values, first);
-    second[0] = Real(-9.0 / 16.0) * (Real(6) * x - Real(2));
-    second[1] = Real( 9.0 / 16.0) * (Real(6) * x + Real(2));
-    second[2] = Real(27.0 / 16.0) * (Real(6) * x - Real(2.0 / 3.0));
-    second[3] = Real(-27.0 / 16.0) * (Real(6) * x + Real(2.0 / 3.0));
-}
-
-inline void write_quad_order3_value_row_q4(Real* SVMP_RESTRICT row,
-                                           const Real lx[4][4],
-                                           const Real ly[4][4],
-                                           std::size_t i,
-                                           std::size_t j) {
-    row[0] = lx[0][i] * ly[0][j];
-    row[1] = lx[1][i] * ly[1][j];
-    row[2] = lx[2][i] * ly[2][j];
-    row[3] = lx[3][i] * ly[3][j];
-}
-
-void evaluate_quad_order3_values_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    if (output_stride == 4u) {
-        Real* row0 = values_out + 0u * 4u;
-        Real* row1 = values_out + 1u * 4u;
-        Real* row2 = values_out + 2u * 4u;
-        Real* row3 = values_out + 3u * 4u;
-        Real* row4 = values_out + 4u * 4u;
-        Real* row5 = values_out + 5u * 4u;
-        Real* row6 = values_out + 6u * 4u;
-        Real* row7 = values_out + 7u * 4u;
-        Real* row8 = values_out + 8u * 4u;
-        Real* row9 = values_out + 9u * 4u;
-        Real* row10 = values_out + 10u * 4u;
-        Real* row11 = values_out + 11u * 4u;
-        Real* row12 = values_out + 12u * 4u;
-        Real* row13 = values_out + 13u * 4u;
-        Real* row14 = values_out + 14u * 4u;
-        Real* row15 = values_out + 15u * 4u;
-
-        auto write_q = [&](std::size_t q) {
-            const auto& xi = points[q];
-            Real x0;
-            Real x1;
-            Real x2;
-            Real x3;
-            Real y0;
-            Real y1;
-            Real y2;
-            Real y3;
-            fill_order3_axis_value_scalars(xi[0], x0, x1, x2, x3);
-            fill_order3_axis_value_scalars(xi[1], y0, y1, y2, y3);
-            row0[q] = x0 * y0;
-            row1[q] = x1 * y0;
-            row2[q] = x1 * y1;
-            row3[q] = x0 * y1;
-            row4[q] = x2 * y0;
-            row5[q] = x3 * y0;
-            row6[q] = x1 * y2;
-            row7[q] = x1 * y3;
-            row8[q] = x3 * y1;
-            row9[q] = x2 * y1;
-            row10[q] = x0 * y3;
-            row11[q] = x0 * y2;
-            row12[q] = x2 * y2;
-            row13[q] = x3 * y2;
-            row14[q] = x2 * y3;
-            row15[q] = x3 * y3;
-        };
-
-        write_q(0u);
-        write_q(1u);
-        write_q(2u);
-        write_q(3u);
-        return;
-    }
-
-    Real lx[4][4];
-    Real ly[4][4];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        fill_order3_axis_values(xi[0], lx[q]);
-        fill_order3_axis_values(xi[1], ly[q]);
-    }
-
-    write_quad_order3_value_row_q4(values_out + 0u * output_stride, lx, ly, 0u, 0u);
-    write_quad_order3_value_row_q4(values_out + 1u * output_stride, lx, ly, 1u, 0u);
-    write_quad_order3_value_row_q4(values_out + 2u * output_stride, lx, ly, 1u, 1u);
-    write_quad_order3_value_row_q4(values_out + 3u * output_stride, lx, ly, 0u, 1u);
-    write_quad_order3_value_row_q4(values_out + 4u * output_stride, lx, ly, 2u, 0u);
-    write_quad_order3_value_row_q4(values_out + 5u * output_stride, lx, ly, 3u, 0u);
-    write_quad_order3_value_row_q4(values_out + 6u * output_stride, lx, ly, 1u, 2u);
-    write_quad_order3_value_row_q4(values_out + 7u * output_stride, lx, ly, 1u, 3u);
-    write_quad_order3_value_row_q4(values_out + 8u * output_stride, lx, ly, 3u, 1u);
-    write_quad_order3_value_row_q4(values_out + 9u * output_stride, lx, ly, 2u, 1u);
-    write_quad_order3_value_row_q4(values_out + 10u * output_stride, lx, ly, 0u, 3u);
-    write_quad_order3_value_row_q4(values_out + 11u * output_stride, lx, ly, 0u, 2u);
-    write_quad_order3_value_row_q4(values_out + 12u * output_stride, lx, ly, 2u, 2u);
-    write_quad_order3_value_row_q4(values_out + 13u * output_stride, lx, ly, 3u, 2u);
-    write_quad_order3_value_row_q4(values_out + 14u * output_stride, lx, ly, 2u, 3u);
-    write_quad_order3_value_row_q4(values_out + 15u * output_stride, lx, ly, 3u, 3u);
-}
-
-void evaluate_quad_order3_values_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    if (points.size() == 4u) {
-        evaluate_quad_order3_values_q4(points, output_stride, values_out);
-        return;
-    }
-
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    Real* row2 = values_out + 2u * output_stride;
-    Real* row3 = values_out + 3u * output_stride;
-    Real* row4 = values_out + 4u * output_stride;
-    Real* row5 = values_out + 5u * output_stride;
-    Real* row6 = values_out + 6u * output_stride;
-    Real* row7 = values_out + 7u * output_stride;
-    Real* row8 = values_out + 8u * output_stride;
-    Real* row9 = values_out + 9u * output_stride;
-    Real* row10 = values_out + 10u * output_stride;
-    Real* row11 = values_out + 11u * output_stride;
-    Real* row12 = values_out + 12u * output_stride;
-    Real* row13 = values_out + 13u * output_stride;
-    Real* row14 = values_out + 14u * output_stride;
-    Real* row15 = values_out + 15u * output_stride;
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        Real lx[4];
-        Real ly[4];
-        fill_order3_axis_values(xi[0], lx);
-        fill_order3_axis_values(xi[1], ly);
-        row0[q] = lx[0] * ly[0];
-        row1[q] = lx[1] * ly[0];
-        row2[q] = lx[1] * ly[1];
-        row3[q] = lx[0] * ly[1];
-        row4[q] = lx[2] * ly[0];
-        row5[q] = lx[3] * ly[0];
-        row6[q] = lx[1] * ly[2];
-        row7[q] = lx[1] * ly[3];
-        row8[q] = lx[3] * ly[1];
-        row9[q] = lx[2] * ly[1];
-        row10[q] = lx[0] * ly[3];
-        row11[q] = lx[0] * ly[2];
-        row12[q] = lx[2] * ly[2];
-        row13[q] = lx[3] * ly[2];
-        row14[q] = lx[2] * ly[3];
-        row15[q] = lx[3] * ly[3];
-    }
-}
-
-template <std::size_t N>
-inline void write_quad_gradient_row_q4(
-    Real* SVMP_RESTRICT row,
-    std::size_t output_stride,
-    const Real (&lx)[4][N],
-    const Real (&ly)[4][N],
-    const Real (&dx)[4][N],
-    const Real (&dy)[4][N],
-    std::size_t i,
-    std::size_t j) {
-    row[0u] = dx[0][i] * ly[0][j];
-    row[1u] = dx[1][i] * ly[1][j];
-    row[2u] = dx[2][i] * ly[2][j];
-    row[3u] = dx[3][i] * ly[3][j];
-    row[output_stride + 0u] = lx[0][i] * dy[0][j];
-    row[output_stride + 1u] = lx[1][i] * dy[1][j];
-    row[output_stride + 2u] = lx[2][i] * dy[2][j];
-    row[output_stride + 3u] = lx[3][i] * dy[3][j];
-    row[2u * output_stride + 0u] = Real(0);
-    row[2u * output_stride + 1u] = Real(0);
-    row[2u * output_stride + 2u] = Real(0);
-    row[2u * output_stride + 3u] = Real(0);
-}
-
-inline void fill_order4_axis_values_first(Real x,
-                                          Real* SVMP_RESTRICT values,
-                                          Real* SVMP_RESTRICT first) {
-    const Real r = (x + Real(1)) * Real(2);
-    const Real r2 = r * r;
-    const Real r3 = r2 * r;
-    const Real f0 = r;
-    const Real f1 = r - Real(1);
-    const Real f2 = r - Real(2);
-    const Real f3 = r - Real(3);
-    const Real f4 = r - Real(4);
-    const Real f01 = f0 * f1;
-    const Real f12 = f1 * f2;
-    const Real f23 = f2 * f3;
-    const Real f34 = f3 * f4;
-
-    values[0] = (f12 * f34) / Real(24);
-    values[1] = (f01 * f23) / Real(24);
-    values[2] = -(f0 * f2 * f34) / Real(6);
-    values[3] = (f01 * f34) / Real(4);
-    values[4] = -(f01 * f2 * f4) / Real(6);
-
-    first[0] = (Real(4) * r3 - Real(30) * r2 + Real(70) * r - Real(50)) / Real(12);
-    first[1] = (Real(4) * r3 - Real(18) * r2 + Real(22) * r - Real(6)) / Real(12);
-    first[2] = (-Real(4) * r3 + Real(27) * r2 - Real(52) * r + Real(24)) / Real(3);
-    first[3] = Real(2) * r3 - Real(12) * r2 + Real(19) * r - Real(6);
-    first[4] = (-Real(4) * r3 + Real(21) * r2 - Real(28) * r + Real(8)) / Real(3);
-}
-
-SVMP_LAGRANGE_NOINLINE void evaluate_quad_order3_gradients_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    constexpr auto node_axes = detail::make_quad_tensor_node_axes<3>();
-
-    Real lx[4][4];
-    Real ly[4][4];
-    Real dx[4][4];
-    Real dy[4][4];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        fill_order3_axis_values_first(xi[0], lx[q], dx[q]);
-        fill_order3_axis_values_first(xi[1], ly[q], dy[q]);
-    }
-
-    for (std::size_t node = 0; node < node_axes.size(); ++node) {
-        const auto& axes = node_axes[node];
-        write_quad_gradient_row_q4(
-            gradients_out + node * 3u * output_stride,
-            output_stride,
-            lx,
-            ly,
-            dx,
-            dy,
-            axes[0],
-            axes[1]);
-    }
-}
-
-SVMP_LAGRANGE_NOINLINE void evaluate_quad_order4_gradients_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    constexpr auto node_axes = detail::make_quad_tensor_node_axes<4>();
-
-    Real lx[4][5];
-    Real ly[4][5];
-    Real dx[4][5];
-    Real dy[4][5];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        fill_order4_axis_values_first(xi[0], lx[q], dx[q]);
-        fill_order4_axis_values_first(xi[1], ly[q], dy[q]);
-    }
-
-    for (std::size_t node = 0; node < node_axes.size(); ++node) {
-        const auto& axes = node_axes[node];
-        write_quad_gradient_row_q4(
-            gradients_out + node * 3u * output_stride,
-            output_stride,
-            lx,
-            ly,
-            dx,
-            dy,
-            axes[0],
-            axes[1]);
-    }
-}
-
-void evaluate_quad_order3_gradients_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    if (points.size() == 4u) {
-        evaluate_quad_order3_gradients_q4(points, output_stride, gradients_out);
-        return;
-    }
-
-    Real* row0 = gradients_out + 0u * 3u * output_stride;
-    Real* row1 = gradients_out + 1u * 3u * output_stride;
-    Real* row2 = gradients_out + 2u * 3u * output_stride;
-    Real* row3 = gradients_out + 3u * 3u * output_stride;
-    Real* row4 = gradients_out + 4u * 3u * output_stride;
-    Real* row5 = gradients_out + 5u * 3u * output_stride;
-    Real* row6 = gradients_out + 6u * 3u * output_stride;
-    Real* row7 = gradients_out + 7u * 3u * output_stride;
-    Real* row8 = gradients_out + 8u * 3u * output_stride;
-    Real* row9 = gradients_out + 9u * 3u * output_stride;
-    Real* row10 = gradients_out + 10u * 3u * output_stride;
-    Real* row11 = gradients_out + 11u * 3u * output_stride;
-    Real* row12 = gradients_out + 12u * 3u * output_stride;
-    Real* row13 = gradients_out + 13u * 3u * output_stride;
-    Real* row14 = gradients_out + 14u * 3u * output_stride;
-    Real* row15 = gradients_out + 15u * 3u * output_stride;
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        Real lx[4];
-        Real ly[4];
-        Real dx[4];
-        Real dy[4];
-        fill_order3_axis_values_first(xi[0], lx, dx);
-        fill_order3_axis_values_first(xi[1], ly, dy);
-        write_quad_order2_gradient_q(row0, output_stride, q, dx[0] * ly[0], lx[0] * dy[0]);
-        write_quad_order2_gradient_q(row1, output_stride, q, dx[1] * ly[0], lx[1] * dy[0]);
-        write_quad_order2_gradient_q(row2, output_stride, q, dx[1] * ly[1], lx[1] * dy[1]);
-        write_quad_order2_gradient_q(row3, output_stride, q, dx[0] * ly[1], lx[0] * dy[1]);
-        write_quad_order2_gradient_q(row4, output_stride, q, dx[2] * ly[0], lx[2] * dy[0]);
-        write_quad_order2_gradient_q(row5, output_stride, q, dx[3] * ly[0], lx[3] * dy[0]);
-        write_quad_order2_gradient_q(row6, output_stride, q, dx[1] * ly[2], lx[1] * dy[2]);
-        write_quad_order2_gradient_q(row7, output_stride, q, dx[1] * ly[3], lx[1] * dy[3]);
-        write_quad_order2_gradient_q(row8, output_stride, q, dx[3] * ly[1], lx[3] * dy[1]);
-        write_quad_order2_gradient_q(row9, output_stride, q, dx[2] * ly[1], lx[2] * dy[1]);
-        write_quad_order2_gradient_q(row10, output_stride, q, dx[0] * ly[3], lx[0] * dy[3]);
-        write_quad_order2_gradient_q(row11, output_stride, q, dx[0] * ly[2], lx[0] * dy[2]);
-        write_quad_order2_gradient_q(row12, output_stride, q, dx[2] * ly[2], lx[2] * dy[2]);
-        write_quad_order2_gradient_q(row13, output_stride, q, dx[3] * ly[2], lx[3] * dy[2]);
-        write_quad_order2_gradient_q(row14, output_stride, q, dx[2] * ly[3], lx[2] * dy[3]);
-        write_quad_order2_gradient_q(row15, output_stride, q, dx[3] * ly[3], lx[3] * dy[3]);
-    }
-}
-
-void evaluate_quad_order3_hessians_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT hessians_out) {
-    Real* row0 = hessians_out + 0u * 9u * output_stride;
-    Real* row1 = hessians_out + 1u * 9u * output_stride;
-    Real* row2 = hessians_out + 2u * 9u * output_stride;
-    Real* row3 = hessians_out + 3u * 9u * output_stride;
-    Real* row4 = hessians_out + 4u * 9u * output_stride;
-    Real* row5 = hessians_out + 5u * 9u * output_stride;
-    Real* row6 = hessians_out + 6u * 9u * output_stride;
-    Real* row7 = hessians_out + 7u * 9u * output_stride;
-    Real* row8 = hessians_out + 8u * 9u * output_stride;
-    Real* row9 = hessians_out + 9u * 9u * output_stride;
-    Real* row10 = hessians_out + 10u * 9u * output_stride;
-    Real* row11 = hessians_out + 11u * 9u * output_stride;
-    Real* row12 = hessians_out + 12u * 9u * output_stride;
-    Real* row13 = hessians_out + 13u * 9u * output_stride;
-    Real* row14 = hessians_out + 14u * 9u * output_stride;
-    Real* row15 = hessians_out + 15u * 9u * output_stride;
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        Real lx[4];
-        Real ly[4];
-        Real dx[4];
-        Real dy[4];
-        Real hx[4];
-        Real hy[4];
-        fill_order3_axis_values_first_second(xi[0], lx, dx, hx);
-        fill_order3_axis_values_first_second(xi[1], ly, dy, hy);
-        write_quad_order2_hessian_q(row0, output_stride, q, hx[0] * ly[0], dx[0] * dy[0], lx[0] * hy[0]);
-        write_quad_order2_hessian_q(row1, output_stride, q, hx[1] * ly[0], dx[1] * dy[0], lx[1] * hy[0]);
-        write_quad_order2_hessian_q(row2, output_stride, q, hx[1] * ly[1], dx[1] * dy[1], lx[1] * hy[1]);
-        write_quad_order2_hessian_q(row3, output_stride, q, hx[0] * ly[1], dx[0] * dy[1], lx[0] * hy[1]);
-        write_quad_order2_hessian_q(row4, output_stride, q, hx[2] * ly[0], dx[2] * dy[0], lx[2] * hy[0]);
-        write_quad_order2_hessian_q(row5, output_stride, q, hx[3] * ly[0], dx[3] * dy[0], lx[3] * hy[0]);
-        write_quad_order2_hessian_q(row6, output_stride, q, hx[1] * ly[2], dx[1] * dy[2], lx[1] * hy[2]);
-        write_quad_order2_hessian_q(row7, output_stride, q, hx[1] * ly[3], dx[1] * dy[3], lx[1] * hy[3]);
-        write_quad_order2_hessian_q(row8, output_stride, q, hx[3] * ly[1], dx[3] * dy[1], lx[3] * hy[1]);
-        write_quad_order2_hessian_q(row9, output_stride, q, hx[2] * ly[1], dx[2] * dy[1], lx[2] * hy[1]);
-        write_quad_order2_hessian_q(row10, output_stride, q, hx[0] * ly[3], dx[0] * dy[3], lx[0] * hy[3]);
-        write_quad_order2_hessian_q(row11, output_stride, q, hx[0] * ly[2], dx[0] * dy[2], lx[0] * hy[2]);
-        write_quad_order2_hessian_q(row12, output_stride, q, hx[2] * ly[2], dx[2] * dy[2], lx[2] * hy[2]);
-        write_quad_order2_hessian_q(row13, output_stride, q, hx[3] * ly[2], dx[3] * dy[2], lx[3] * hy[2]);
-        write_quad_order2_hessian_q(row14, output_stride, q, hx[2] * ly[3], dx[2] * dy[3], lx[2] * hy[3]);
-        write_quad_order2_hessian_q(row15, output_stride, q, hx[3] * ly[3], dx[3] * dy[3], lx[3] * hy[3]);
-    }
-}
-
-template <std::size_t Q>
-inline void write_quad_order3_all_q4(
-    std::size_t output_stride,
-    std::size_t i,
-    std::size_t j,
-    const Real lx[4][4],
-    const Real ly[4][4],
-    const Real dx[4][4],
-    const Real dy[4][4],
-    const Real hx[4][4],
-    const Real hy[4][4],
-    Real* SVMP_RESTRICT value_row,
-    Real* SVMP_RESTRICT grad_row,
-    Real* SVMP_RESTRICT hess_row) {
-    const Real xv = lx[Q][i];
-    const Real yv = ly[Q][j];
-    const Real xd = dx[Q][i];
-    const Real yd = dy[Q][j];
-    const Real hxy = xd * yd;
-
-    value_row[Q] = xv * yv;
-    grad_row[0u * output_stride + Q] = xd * yv;
-    grad_row[1u * output_stride + Q] = xv * yd;
-    grad_row[2u * output_stride + Q] = Real(0);
-    hess_row[0u * output_stride + Q] = hx[Q][i] * yv;
-    hess_row[4u * output_stride + Q] = xv * hy[Q][j];
-    hess_row[8u * output_stride + Q] = Real(0);
-    hess_row[1u * output_stride + Q] = hxy;
-    hess_row[3u * output_stride + Q] = hxy;
-    hess_row[2u * output_stride + Q] = Real(0);
-    hess_row[6u * output_stride + Q] = Real(0);
-    hess_row[5u * output_stride + Q] = Real(0);
-    hess_row[7u * output_stride + Q] = Real(0);
-}
-
-void evaluate_quad_order3_all_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    constexpr auto node_axes = detail::make_quad_tensor_node_axes<3>();
-
-    Real lx[4][4];
-    Real ly[4][4];
-    Real dx[4][4];
-    Real dy[4][4];
-    Real hx[4][4];
-    Real hy[4][4];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        fill_order3_axis_values_first_second(xi[0], lx[q], dx[q], hx[q]);
-        fill_order3_axis_values_first_second(xi[1], ly[q], dy[q], hy[q]);
-    }
-
-    for (std::size_t node = 0; node < node_axes.size(); ++node) {
-        const auto& axes = node_axes[node];
-        const std::size_t i = axes[0];
-        const std::size_t j = axes[1];
-        Real* value_row = values_out + node * output_stride;
-        Real* grad_row = gradients_out + node * 3u * output_stride;
-        Real* hess_row = hessians_out + node * 9u * output_stride;
-        write_quad_order3_all_q4<0u>(
-            output_stride, i, j, lx, ly, dx, dy, hx, hy, value_row, grad_row, hess_row);
-        write_quad_order3_all_q4<1u>(
-            output_stride, i, j, lx, ly, dx, dy, hx, hy, value_row, grad_row, hess_row);
-        write_quad_order3_all_q4<2u>(
-            output_stride, i, j, lx, ly, dx, dy, hx, hy, value_row, grad_row, hess_row);
-        write_quad_order3_all_q4<3u>(
-            output_stride, i, j, lx, ly, dx, dy, hx, hy, value_row, grad_row, hess_row);
-    }
-}
-
-void evaluate_hex_order3_values_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    constexpr auto node_axes = detail::make_hex_tensor_node_axes<3>();
-
-    Real lx[4][4];
-    Real ly[4][4];
-    Real lz[4][4];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        fill_order3_axis_values(xi[0], lx[q]);
-        fill_order3_axis_values(xi[1], ly[q]);
-        fill_order3_axis_values(xi[2], lz[q]);
-    }
-
-    for (std::size_t node = 0; node < node_axes.size(); ++node) {
-        const auto& axes = node_axes[node];
-        const std::size_t i = axes[0];
-        const std::size_t j = axes[1];
-        const std::size_t k = axes[2];
-        Real* row = values_out + node * output_stride;
-        row[0] = lx[0][i] * ly[0][j] * lz[0][k];
-        row[1] = lx[1][i] * ly[1][j] * lz[1][k];
-        row[2] = lx[2][i] * ly[2][j] * lz[2][k];
-        row[3] = lx[3][i] * ly[3][j] * lz[3][k];
-    }
-}
-
-void evaluate_hex_order3_gradients_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    constexpr auto node_axes = detail::make_hex_tensor_node_axes<3>();
-
-    Real lx[4][4];
-    Real ly[4][4];
-    Real lz[4][4];
-    Real dx[4][4];
-    Real dy[4][4];
-    Real dz[4][4];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        fill_order3_axis_values_first(xi[0], lx[q], dx[q]);
-        fill_order3_axis_values_first(xi[1], ly[q], dy[q]);
-        fill_order3_axis_values_first(xi[2], lz[q], dz[q]);
-    }
-
-    for (std::size_t node = 0; node < node_axes.size(); ++node) {
-        const auto& axes = node_axes[node];
-        const std::size_t i = axes[0];
-        const std::size_t j = axes[1];
-        const std::size_t k = axes[2];
-        Real* row = gradients_out + node * 3u * output_stride;
-        row[0] = dx[0][i] * ly[0][j] * lz[0][k];
-        row[1] = dx[1][i] * ly[1][j] * lz[1][k];
-        row[2] = dx[2][i] * ly[2][j] * lz[2][k];
-        row[3] = dx[3][i] * ly[3][j] * lz[3][k];
-        row[output_stride + 0u] = lx[0][i] * dy[0][j] * lz[0][k];
-        row[output_stride + 1u] = lx[1][i] * dy[1][j] * lz[1][k];
-        row[output_stride + 2u] = lx[2][i] * dy[2][j] * lz[2][k];
-        row[output_stride + 3u] = lx[3][i] * dy[3][j] * lz[3][k];
-        row[2u * output_stride + 0u] = lx[0][i] * ly[0][j] * dz[0][k];
-        row[2u * output_stride + 1u] = lx[1][i] * ly[1][j] * dz[1][k];
-        row[2u * output_stride + 2u] = lx[2][i] * ly[2][j] * dz[2][k];
-        row[2u * output_stride + 3u] = lx[3][i] * ly[3][j] * dz[3][k];
-    }
-}
-
-template <std::size_t Q, bool WriteValue, bool WriteGradient>
-inline void write_hex_order3_q4_hessian_outputs(
-    std::size_t output_stride,
-    std::size_t i,
-    std::size_t j,
-    std::size_t k,
-    const Real lx[4][4],
-    const Real ly[4][4],
-    const Real lz[4][4],
-    const Real dx[4][4],
-    const Real dy[4][4],
-    const Real dz[4][4],
-    const Real hx[4][4],
-    const Real hy[4][4],
-    const Real hz[4][4],
-    Real* SVMP_RESTRICT value_row,
-    Real* SVMP_RESTRICT grad_row,
-    Real* SVMP_RESTRICT hess_row) {
-    const Real xv = lx[Q][i];
-    const Real yv = ly[Q][j];
-    const Real zv = lz[Q][k];
-    const Real yz = yv * zv;
-
-    if constexpr (WriteValue) {
-        value_row[Q] = xv * yz;
-    }
-
-    const Real xd = dx[Q][i];
-    const Real yd = dy[Q][j];
-    const Real zd = dz[Q][k];
-    const Real yd_z = yd * zv;
-    const Real yv_zd = yv * zd;
-
-    if constexpr (WriteGradient) {
-        grad_row[0u * output_stride + Q] = xd * yz;
-        grad_row[1u * output_stride + Q] = xv * yd_z;
-        grad_row[2u * output_stride + Q] = xv * yv_zd;
-    }
-
-    const Real hxy = xd * yd_z;
-    const Real hxz = xd * yv_zd;
-    const Real hyz = xv * yd * zd;
-    hess_row[0u * output_stride + Q] = hx[Q][i] * yz;
-    hess_row[4u * output_stride + Q] = xv * hy[Q][j] * zv;
-    hess_row[8u * output_stride + Q] = xv * yv * hz[Q][k];
-    hess_row[1u * output_stride + Q] = hxy;
-    hess_row[3u * output_stride + Q] = hxy;
-    hess_row[2u * output_stride + Q] = hxz;
-    hess_row[6u * output_stride + Q] = hxz;
-    hess_row[5u * output_stride + Q] = hyz;
-    hess_row[7u * output_stride + Q] = hyz;
-}
-
-template <bool WriteValue, bool WriteGradient>
-void evaluate_hex_order3_q4_hessian_outputs(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    constexpr auto node_axes = detail::make_hex_tensor_node_axes<3>();
-
-    Real lx[4][4];
-    Real ly[4][4];
-    Real lz[4][4];
-    Real dx[4][4];
-    Real dy[4][4];
-    Real dz[4][4];
-    Real hx[4][4];
-    Real hy[4][4];
-    Real hz[4][4];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        fill_order3_axis_values_first_second(xi[0], lx[q], dx[q], hx[q]);
-        fill_order3_axis_values_first_second(xi[1], ly[q], dy[q], hy[q]);
-        fill_order3_axis_values_first_second(xi[2], lz[q], dz[q], hz[q]);
-    }
-
-    for (std::size_t node = 0; node < node_axes.size(); ++node) {
-        const auto& axes = node_axes[node];
-        const std::size_t i = axes[0];
-        const std::size_t j = axes[1];
-        const std::size_t k = axes[2];
-        Real* value_row = values_out ? values_out + node * output_stride : nullptr;
-        Real* grad_row = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
-        Real* hess_row = hessians_out + node * 9u * output_stride;
-        write_hex_order3_q4_hessian_outputs<0u, WriteValue, WriteGradient>(
-            output_stride, i, j, k, lx, ly, lz, dx, dy, dz, hx, hy, hz,
-            value_row, grad_row, hess_row);
-        write_hex_order3_q4_hessian_outputs<1u, WriteValue, WriteGradient>(
-            output_stride, i, j, k, lx, ly, lz, dx, dy, dz, hx, hy, hz,
-            value_row, grad_row, hess_row);
-        write_hex_order3_q4_hessian_outputs<2u, WriteValue, WriteGradient>(
-            output_stride, i, j, k, lx, ly, lz, dx, dy, dz, hx, hy, hz,
-            value_row, grad_row, hess_row);
-        write_hex_order3_q4_hessian_outputs<3u, WriteValue, WriteGradient>(
-            output_stride, i, j, k, lx, ly, lz, dx, dy, dz, hx, hy, hz,
-            value_row, grad_row, hess_row);
-    }
-}
-
-void evaluate_hex_order3_hessians_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT hessians_out) {
-    evaluate_hex_order3_q4_hessian_outputs<false, false>(
-        points, output_stride, nullptr, nullptr, hessians_out);
-}
-
-void evaluate_hex_order3_all_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    evaluate_hex_order3_q4_hessian_outputs<true, true>(
-        points, output_stride, values_out, gradients_out, hessians_out);
-}
-
-void evaluate_hex_order2_values_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    Real* row2 = values_out + 2u * output_stride;
-    Real* row3 = values_out + 3u * output_stride;
-    Real* row4 = values_out + 4u * output_stride;
-    Real* row5 = values_out + 5u * output_stride;
-    Real* row6 = values_out + 6u * output_stride;
-    Real* row7 = values_out + 7u * output_stride;
-    Real* row8 = values_out + 8u * output_stride;
-    Real* row9 = values_out + 9u * output_stride;
-    Real* row10 = values_out + 10u * output_stride;
-    Real* row11 = values_out + 11u * output_stride;
-    Real* row12 = values_out + 12u * output_stride;
-    Real* row13 = values_out + 13u * output_stride;
-    Real* row14 = values_out + 14u * output_stride;
-    Real* row15 = values_out + 15u * output_stride;
-    Real* row16 = values_out + 16u * output_stride;
-    Real* row17 = values_out + 17u * output_stride;
-    Real* row18 = values_out + 18u * output_stride;
-    Real* row19 = values_out + 19u * output_stride;
-    Real* row20 = values_out + 20u * output_stride;
-    Real* row21 = values_out + 21u * output_stride;
-    Real* row22 = values_out + 22u * output_stride;
-    Real* row23 = values_out + 23u * output_stride;
-    Real* row24 = values_out + 24u * output_stride;
-    Real* row25 = values_out + 25u * output_stride;
-    Real* row26 = values_out + 26u * output_stride;
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const Real x = xi[0];
-        const Real y = xi[1];
-        const Real z = xi[2];
-        const Real x0 = x * (x - Real(1)) * Real(0.5);
-        const Real x1 = x * (x + Real(1)) * Real(0.5);
-        const Real x2 = Real(1) - x * x;
-        const Real y0 = y * (y - Real(1)) * Real(0.5);
-        const Real y1 = y * (y + Real(1)) * Real(0.5);
-        const Real y2 = Real(1) - y * y;
-        const Real z0 = z * (z - Real(1)) * Real(0.5);
-        const Real z1 = z * (z + Real(1)) * Real(0.5);
-        const Real z2 = Real(1) - z * z;
-        const Real x0y0 = x0 * y0;
-        const Real x1y0 = x1 * y0;
-        const Real x1y1 = x1 * y1;
-        const Real x0y1 = x0 * y1;
-        const Real x2y0 = x2 * y0;
-        const Real x1y2 = x1 * y2;
-        const Real x2y1 = x2 * y1;
-        const Real x0y2 = x0 * y2;
-        const Real x2y2 = x2 * y2;
-
-        row0[q] = x0y0 * z0;
-        row1[q] = x1y0 * z0;
-        row2[q] = x1y1 * z0;
-        row3[q] = x0y1 * z0;
-        row4[q] = x0y0 * z1;
-        row5[q] = x1y0 * z1;
-        row6[q] = x1y1 * z1;
-        row7[q] = x0y1 * z1;
-        row8[q] = x2y0 * z0;
-        row9[q] = x1y2 * z0;
-        row10[q] = x2y1 * z0;
-        row11[q] = x0y2 * z0;
-        row12[q] = x2y0 * z1;
-        row13[q] = x1y2 * z1;
-        row14[q] = x2y1 * z1;
-        row15[q] = x0y2 * z1;
-        row16[q] = x0y0 * z2;
-        row17[q] = x1y0 * z2;
-        row18[q] = x1y1 * z2;
-        row19[q] = x0y1 * z2;
-        row20[q] = x2y2 * z0;
-        row21[q] = x2y2 * z1;
-        row22[q] = x2y0 * z2;
-        row23[q] = x1y2 * z2;
-        row24[q] = x2y1 * z2;
-        row25[q] = x0y2 * z2;
-        row26[q] = x2y2 * z2;
-    }
-}
-
-inline void fill_order2_axis_values_first(Real x,
-                                          Real* SVMP_RESTRICT values,
-                                          Real* SVMP_RESTRICT first) {
-    values[0] = x * (x - Real(1)) * Real(0.5);
-    values[1] = x * (x + Real(1)) * Real(0.5);
-    values[2] = Real(1) - x * x;
-    first[0] = x - Real(0.5);
-    first[1] = x + Real(0.5);
-    first[2] = Real(-2) * x;
-}
-
-inline void fill_order2_axis_values_first_second(Real x,
-                                                 Real* SVMP_RESTRICT values,
-                                                 Real* SVMP_RESTRICT first,
-                                                 Real* SVMP_RESTRICT second) {
-    fill_order2_axis_values_first(x, values, first);
-    second[0] = Real(1);
-    second[1] = Real(1);
-    second[2] = Real(-2);
-}
-
-template <std::size_t Q>
-inline void write_hex_order2_hessian_q4(
-    std::size_t output_stride,
-    std::size_t i,
-    std::size_t j,
-    std::size_t k,
-    const Real lx[4][3],
-    const Real ly[4][3],
-    const Real lz[4][3],
-    const Real dx[4][3],
-    const Real dy[4][3],
-    const Real dz[4][3],
-    const Real hx[4][3],
-    const Real hy[4][3],
-    const Real hz[4][3],
-    Real* SVMP_RESTRICT hess_row) {
-    const Real xv = lx[Q][i];
-    const Real yv = ly[Q][j];
-    const Real zv = lz[Q][k];
-    const Real yz = yv * zv;
-    const Real xd = dx[Q][i];
-    const Real yd = dy[Q][j];
-    const Real zd = dz[Q][k];
-    const Real yd_z = yd * zv;
-    const Real yv_zd = yv * zd;
-    const Real hxy = xd * yd_z;
-    const Real hxz = xd * yv_zd;
-    const Real hyz = xv * yd * zd;
-    hess_row[0u * output_stride + Q] = hx[Q][i] * yz;
-    hess_row[4u * output_stride + Q] = xv * hy[Q][j] * zv;
-    hess_row[8u * output_stride + Q] = xv * yv * hz[Q][k];
-    hess_row[1u * output_stride + Q] = hxy;
-    hess_row[3u * output_stride + Q] = hxy;
-    hess_row[2u * output_stride + Q] = hxz;
-    hess_row[6u * output_stride + Q] = hxz;
-    hess_row[5u * output_stride + Q] = hyz;
-    hess_row[7u * output_stride + Q] = hyz;
-}
-
-void evaluate_hex_order2_hessians_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT hessians_out) {
-    constexpr auto node_axes = detail::make_hex_tensor_node_axes<2>();
-
-    Real lx[4][3];
-    Real ly[4][3];
-    Real lz[4][3];
-    Real dx[4][3];
-    Real dy[4][3];
-    Real dz[4][3];
-    Real hx[4][3];
-    Real hy[4][3];
-    Real hz[4][3];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        fill_order2_axis_values_first_second(xi[0], lx[q], dx[q], hx[q]);
-        fill_order2_axis_values_first_second(xi[1], ly[q], dy[q], hy[q]);
-        fill_order2_axis_values_first_second(xi[2], lz[q], dz[q], hz[q]);
-    }
-
-    for (std::size_t node = 0; node < node_axes.size(); ++node) {
-        const auto& axes = node_axes[node];
-        const std::size_t i = axes[0];
-        const std::size_t j = axes[1];
-        const std::size_t k = axes[2];
-        Real* hess_row = hessians_out + node * 9u * output_stride;
-        write_hex_order2_hessian_q4<0u>(
-            output_stride, i, j, k, lx, ly, lz, dx, dy, dz, hx, hy, hz, hess_row);
-        write_hex_order2_hessian_q4<1u>(
-            output_stride, i, j, k, lx, ly, lz, dx, dy, dz, hx, hy, hz, hess_row);
-        write_hex_order2_hessian_q4<2u>(
-            output_stride, i, j, k, lx, ly, lz, dx, dy, dz, hx, hy, hz, hess_row);
-        write_hex_order2_hessian_q4<3u>(
-            output_stride, i, j, k, lx, ly, lz, dx, dy, dz, hx, hy, hz, hess_row);
-    }
-}
-
-template <std::size_t Q>
-inline void write_quad_order2_all_q4(
-    std::size_t output_stride,
-    std::size_t i,
-    std::size_t j,
-    const Real lx[4][3],
-    const Real ly[4][3],
-    const Real dx[4][3],
-    const Real dy[4][3],
-    const Real hx[4][3],
-    const Real hy[4][3],
-    Real* SVMP_RESTRICT value_row,
-    Real* SVMP_RESTRICT grad_row,
-    Real* SVMP_RESTRICT hess_row) {
-    const Real xv = lx[Q][i];
-    const Real yv = ly[Q][j];
-    const Real xd = dx[Q][i];
-    const Real yd = dy[Q][j];
-    const Real hxy = xd * yd;
-
-    value_row[Q] = xv * yv;
-    grad_row[0u * output_stride + Q] = xd * yv;
-    grad_row[1u * output_stride + Q] = xv * yd;
-    grad_row[2u * output_stride + Q] = Real(0);
-    hess_row[0u * output_stride + Q] = hx[Q][i] * yv;
-    hess_row[4u * output_stride + Q] = xv * hy[Q][j];
-    hess_row[8u * output_stride + Q] = Real(0);
-    hess_row[1u * output_stride + Q] = hxy;
-    hess_row[3u * output_stride + Q] = hxy;
-    hess_row[2u * output_stride + Q] = Real(0);
-    hess_row[6u * output_stride + Q] = Real(0);
-    hess_row[5u * output_stride + Q] = Real(0);
-    hess_row[7u * output_stride + Q] = Real(0);
-}
-
-void evaluate_quad_order2_all_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    constexpr auto node_axes = detail::make_quad_tensor_node_axes<2>();
-
-    Real lx[4][3];
-    Real ly[4][3];
-    Real dx[4][3];
-    Real dy[4][3];
-    Real hx[4][3];
-    Real hy[4][3];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        fill_order2_axis_values_first_second(xi[0], lx[q], dx[q], hx[q]);
-        fill_order2_axis_values_first_second(xi[1], ly[q], dy[q], hy[q]);
-    }
-
-    for (std::size_t node = 0; node < node_axes.size(); ++node) {
-        const auto& axes = node_axes[node];
-        const std::size_t i = axes[0];
-        const std::size_t j = axes[1];
-        Real* value_row = values_out + node * output_stride;
-        Real* grad_row = gradients_out + node * 3u * output_stride;
-        Real* hess_row = hessians_out + node * 9u * output_stride;
-        write_quad_order2_all_q4<0u>(
-            output_stride, i, j, lx, ly, dx, dy, hx, hy, value_row, grad_row, hess_row);
-        write_quad_order2_all_q4<1u>(
-            output_stride, i, j, lx, ly, dx, dy, hx, hy, value_row, grad_row, hess_row);
-        write_quad_order2_all_q4<2u>(
-            output_stride, i, j, lx, ly, dx, dy, hx, hy, value_row, grad_row, hess_row);
-        write_quad_order2_all_q4<3u>(
-            output_stride, i, j, lx, ly, dx, dy, hx, hy, value_row, grad_row, hess_row);
-    }
-}
-
-void evaluate_hex_order2_gradients_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    constexpr std::array<std::array<std::size_t, 3>, 27> node_axes = {{
-        {{0u, 0u, 0u}}, {{1u, 0u, 0u}}, {{1u, 1u, 0u}}, {{0u, 1u, 0u}},
-        {{0u, 0u, 1u}}, {{1u, 0u, 1u}}, {{1u, 1u, 1u}}, {{0u, 1u, 1u}},
-        {{2u, 0u, 0u}}, {{1u, 2u, 0u}}, {{2u, 1u, 0u}}, {{0u, 2u, 0u}},
-        {{2u, 0u, 1u}}, {{1u, 2u, 1u}}, {{2u, 1u, 1u}}, {{0u, 2u, 1u}},
-        {{0u, 0u, 2u}}, {{1u, 0u, 2u}}, {{1u, 1u, 2u}}, {{0u, 1u, 2u}},
-        {{2u, 2u, 0u}}, {{2u, 2u, 1u}}, {{2u, 0u, 2u}}, {{1u, 2u, 2u}},
-        {{2u, 1u, 2u}}, {{0u, 2u, 2u}}, {{2u, 2u, 2u}},
-    }};
-
-    Real lx[4][3];
-    Real ly[4][3];
-    Real lz[4][3];
-    Real dx[4][3];
-    Real dy[4][3];
-    Real dz[4][3];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        fill_order2_axis_values_first(xi[0], lx[q], dx[q]);
-        fill_order2_axis_values_first(xi[1], ly[q], dy[q]);
-        fill_order2_axis_values_first(xi[2], lz[q], dz[q]);
-    }
-
-    for (std::size_t node = 0; node < node_axes.size(); ++node) {
-        const auto& axes = node_axes[node];
-        const std::size_t i = axes[0];
-        const std::size_t j = axes[1];
-        const std::size_t k = axes[2];
-        Real* row = gradients_out + node * 3u * output_stride;
-        row[0] = dx[0][i] * ly[0][j] * lz[0][k];
-        row[1] = dx[1][i] * ly[1][j] * lz[1][k];
-        row[2] = dx[2][i] * ly[2][j] * lz[2][k];
-        row[3] = dx[3][i] * ly[3][j] * lz[3][k];
-        row[output_stride + 0u] = lx[0][i] * dy[0][j] * lz[0][k];
-        row[output_stride + 1u] = lx[1][i] * dy[1][j] * lz[1][k];
-        row[output_stride + 2u] = lx[2][i] * dy[2][j] * lz[2][k];
-        row[output_stride + 3u] = lx[3][i] * dy[3][j] * lz[3][k];
-        row[2u * output_stride + 0u] = lx[0][i] * ly[0][j] * dz[0][k];
-        row[2u * output_stride + 1u] = lx[1][i] * ly[1][j] * dz[1][k];
-        row[2u * output_stride + 2u] = lx[2][i] * ly[2][j] * dz[2][k];
-        row[2u * output_stride + 3u] = lx[3][i] * ly[3][j] * dz[3][k];
-    }
-}
-
-template<typename FastBasis>
-void evaluate_constant_fast_hessians_strided(
-    std::size_t num_qpts,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT hessians_out) {
-    std::array<Hessian, FastBasis::n_dofs> fast_hessians{};
-    FastBasis::evaluate_hessians(math::Vector<Real, 3>{}, fast_hessians);
-    for (std::size_t i = 0; i < fast_hessians.size(); ++i) {
-        const Hessian& hessian = fast_hessians[i];
-        Real* H = hessians_out + i * 9u * output_stride;
-        const Real h00 = hessian(0, 0);
-        const Real h01 = hessian(0, 1);
-        const Real h02 = hessian(0, 2);
-        const Real h10 = hessian(1, 0);
-        const Real h11 = hessian(1, 1);
-        const Real h12 = hessian(1, 2);
-        const Real h20 = hessian(2, 0);
-        const Real h21 = hessian(2, 1);
-        const Real h22 = hessian(2, 2);
-        for (std::size_t q = 0; q < num_qpts; ++q) {
-            H[0u * output_stride + q] = h00;
-            H[1u * output_stride + q] = h01;
-            H[2u * output_stride + q] = h02;
-            H[3u * output_stride + q] = h10;
-            H[4u * output_stride + q] = h11;
-            H[5u * output_stride + q] = h12;
-            H[6u * output_stride + q] = h20;
-            H[7u * output_stride + q] = h21;
-            H[8u * output_stride + q] = h22;
-        }
-    }
-}
-
-template<typename FastBasis>
-void evaluate_fast_outputs_with_constant_hessians_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        if (values_out != nullptr) {
-            std::array<Real, FastBasis::n_dofs> fast_values{};
-            FastBasis::evaluate(xi, fast_values);
-            for (std::size_t i = 0; i < fast_values.size(); ++i) {
-                values_out[i * output_stride + q] = fast_values[i];
-            }
-        }
-        if (gradients_out != nullptr) {
-            std::array<Gradient, FastBasis::n_dofs> fast_gradients{};
-            FastBasis::evaluate_gradients(xi, fast_gradients);
-            for (std::size_t i = 0; i < fast_gradients.size(); ++i) {
-                Real* g = gradients_out + i * 3u * output_stride;
-                g[0u * output_stride + q] = fast_gradients[i][0];
-                g[1u * output_stride + q] = fast_gradients[i][1];
-                g[2u * output_stride + q] = fast_gradients[i][2];
-            }
-        }
-    }
-    evaluate_constant_fast_hessians_strided<FastBasis>(
-        points.size(), output_stride, hessians_out);
-}
-
-template<int Order>
-void evaluate_wedge_fast_outputs_strided(
-    const std::vector<std::array<std::size_t, 2>>& wedge_indices,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    static_assert(Order >= 1 && Order <= 2,
-                  "wedge fast outputs rely on low-order public triangle ordering");
-    using TriFast = LagrangeTriFast<Order>;
-    constexpr std::size_t axis_size = static_cast<std::size_t>(Order + 1);
-    const bool need_grad = gradients_out != nullptr;
-    const bool need_hess = hessians_out != nullptr;
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        std::array<Real, TriFast::n_dofs> tri_values{};
-        std::array<Gradient, TriFast::n_dofs> tri_gradients{};
-        std::array<Hessian, TriFast::n_dofs> tri_hessians{};
-        std::array<Real, axis_size> z_values{};
-        std::array<Real, axis_size> z_first{};
-        std::array<Real, axis_size> z_second{};
-
-        TriFast::evaluate(xi, tri_values);
-        if (need_grad || need_hess) {
-            TriFast::evaluate_gradients(xi, tri_gradients);
-        }
-        if (need_hess) {
-            TriFast::evaluate_hessians(xi, tri_hessians);
-            detail::fill_axis_values_first_second<Order>(xi[2], z_values, z_first, z_second);
-        } else if (need_grad) {
-            detail::fill_axis_values_first<Order>(xi[2], z_values, z_first);
-        } else {
-            detail::fill_axis_values<Order>(xi[2], z_values);
-        }
-
-        for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
-            const auto& index = wedge_indices[node];
-            const std::size_t tri = index[0];
-            const std::size_t z = index[1];
-            const Real tri_v = tri_values[tri];
-            const Real zv = z_values[z];
-
-            if (values_out != nullptr) {
-                values_out[node * output_stride + q] = tri_v * zv;
-            }
-
-            if (gradients_out != nullptr) {
-                Real* g = gradients_out + node * 3u * output_stride;
-                const Gradient& tri_g = tri_gradients[tri];
-                g[0u * output_stride + q] = tri_g[0] * zv;
-                g[1u * output_stride + q] = tri_g[1] * zv;
-                g[2u * output_stride + q] = tri_v * z_first[z];
-            }
-
-            if (hessians_out != nullptr) {
-                Real* H = hessians_out + node * 9u * output_stride;
-                const Gradient& tri_g = tri_gradients[tri];
-                const Hessian& tri_H = tri_hessians[tri];
-                const Real zd = z_first[z];
-                const Real hxz = tri_g[0] * zd;
-                const Real hxy = tri_H(0, 1) * zv;
-                const Real hyz = tri_g[1] * zd;
-                H[0u * output_stride + q] = tri_H(0, 0) * zv;
-                H[1u * output_stride + q] = hxy;
-                H[2u * output_stride + q] = hxz;
-                H[3u * output_stride + q] = hxy;
-                H[4u * output_stride + q] = tri_H(1, 1) * zv;
-                H[5u * output_stride + q] = hyz;
-                H[6u * output_stride + q] = hxz;
-                H[7u * output_stride + q] = hyz;
-                H[8u * output_stride + q] = tri_v * z_second[z];
-            }
-        }
-    }
-}
-
-template <int Order>
-inline void fill_triangle_simplex_product_factors(Real lambda, Real* SVMP_RESTRICT factors) {
-    const Real t = static_cast<Real>(Order) * lambda;
-    factors[0] = Real(1);
-    for (int a = 1; a <= Order; ++a) {
-        factors[a] =
-            factors[a - 1] *
-            (t - static_cast<Real>(a - 1)) /
-            static_cast<Real>(a);
-    }
-}
-
-template <int Order>
-SVMP_LAGRANGE_NOINLINE SVMP_LAGRANGE_ALIGN64 bool evaluate_wedge_values_product_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    const std::vector<std::array<std::size_t, 2>>& wedge_indices,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    constexpr std::size_t tri_count =
-        static_cast<std::size_t>((Order + 1) * (Order + 2) / 2);
-    if (simplex_exponents.size() != tri_count || points.size() != 4u) {
-        return false;
-    }
-
-    Real tri_values[4][tri_count];
-    std::array<Real, Order + 1> z_values[4];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-        Real f0[Order + 1];
-        Real f1[Order + 1];
-        Real f2[Order + 1];
-        fill_triangle_simplex_product_factors<Order>(l0, f0);
-        fill_triangle_simplex_product_factors<Order>(l1, f1);
-        fill_triangle_simplex_product_factors<Order>(l2, f2);
-        detail::fill_axis_values<Order>(xi[2], z_values[q]);
-
-        for (std::size_t tri = 0; tri < tri_count; ++tri) {
-            const auto& e = simplex_exponents[tri];
-            tri_values[q][tri] =
-                f0[static_cast<std::size_t>(e[0])] *
-                f1[static_cast<std::size_t>(e[1])] *
-                f2[static_cast<std::size_t>(e[2])];
-        }
-    }
-
-    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
-        const auto& index = wedge_indices[node];
-        const std::size_t tri = index[0];
-        const std::size_t z = index[1];
-        Real* SVMP_RESTRICT row = values_out + node * output_stride;
-        row[0] = tri_values[0][tri] * z_values[0][z];
-        row[1] = tri_values[1][tri] * z_values[1][z];
-        row[2] = tri_values[2][tri] * z_values[2][z];
-        row[3] = tri_values[3][tri] * z_values[3][z];
-    }
-    return true;
-}
-
-bool try_evaluate_wedge_values_product_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    const std::vector<std::array<std::size_t, 2>>& wedge_indices,
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    switch (order) {
-        case 4:
-            return evaluate_wedge_values_product_q4<4>(
-                simplex_exponents, wedge_indices, points, output_stride, values_out);
-        case 5:
-            return evaluate_wedge_values_product_q4<5>(
-                simplex_exponents, wedge_indices, points, output_stride, values_out);
-        case 6:
-            return evaluate_wedge_values_product_q4<6>(
-                simplex_exponents, wedge_indices, points, output_stride, values_out);
-        case 7:
-            return evaluate_wedge_values_product_q4<7>(
-                simplex_exponents, wedge_indices, points, output_stride, values_out);
-        case 8:
-            return evaluate_wedge_values_product_q4<8>(
-                simplex_exponents, wedge_indices, points, output_stride, values_out);
-        default:
-            return false;
-    }
-}
-
-void evaluate_wedge_order1_values_q4(
-    const std::vector<std::array<std::size_t, 2>>& wedge_indices,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real tri[4][3];
-    Real axis[4][2];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        tri[q][0] = Real(1) - xi[0] - xi[1];
-        tri[q][1] = xi[0];
-        tri[q][2] = xi[1];
-        axis[q][0] = (Real(1) - xi[2]) * Real(0.5);
-        axis[q][1] = (Real(1) + xi[2]) * Real(0.5);
-    }
-
-    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
-        const auto& index = wedge_indices[node];
-        const std::size_t tri_node = index[0];
-        const std::size_t axis_node = index[1];
-        Real* row = values_out + node * output_stride;
-        row[0] = tri[0][tri_node] * axis[0][axis_node];
-        row[1] = tri[1][tri_node] * axis[1][axis_node];
-        row[2] = tri[2][tri_node] * axis[2][axis_node];
-        row[3] = tri[3][tri_node] * axis[3][axis_node];
-    }
-}
-
-bool evaluate_wedge_fast_strided(
-    int order,
-    const std::vector<std::array<std::size_t, 2>>& wedge_indices,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    if (order == 3) {
-        return false;
-    }
-    if (order == 1 &&
-        points.size() == 4u &&
-        values_out != nullptr &&
-        gradients_out == nullptr &&
-        hessians_out == nullptr) {
-        evaluate_wedge_order1_values_q4(wedge_indices, points, output_stride, values_out);
-        return true;
-    }
-
-    switch (order) {
-        case 1:
-            evaluate_wedge_fast_outputs_strided<1>(
-                wedge_indices, points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        case 2:
-            evaluate_wedge_fast_outputs_strided<2>(
-                wedge_indices, points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        default:
-            return false;
-    }
-}
-
-bool evaluate_fixed_lagrange_fast(LagrangeTopology topology,
-                                  int order,
-                                  const math::Vector<Real, 3>& xi,
-                                  std::vector<Real>* values,
-                                  std::vector<Gradient>* gradients,
-                                  std::vector<Hessian>* hessians) {
-    switch (order) {
-        case 1:
-            return evaluate_fixed_lagrange_fast_order<1>(
-                topology, xi, values, gradients, hessians);
-        case 2:
-            return evaluate_fixed_lagrange_fast_order<2>(
-                topology, xi, values, gradients, hessians);
-        case 3:
-            return evaluate_fixed_lagrange_fast_order<3>(
-                topology, xi, values, gradients, hessians);
-        default:
-            return false;
-    }
-}
-
-bool evaluate_fixed_lagrange_fast_strided(LagrangeTopology topology,
-                                          int order,
-                                          const std::vector<math::Vector<Real, 3>>& points,
-                                          std::size_t output_stride,
-                                          Real* SVMP_RESTRICT values_out,
-                                          Real* SVMP_RESTRICT gradients_out,
-                                          Real* SVMP_RESTRICT hessians_out) {
-    if (topology == LagrangeTopology::Line &&
-        points.size() == 4u) {
-        const bool values_only =
-            values_out != nullptr && gradients_out == nullptr && hessians_out == nullptr;
-        const bool gradients_only =
-            values_out == nullptr && gradients_out != nullptr && hessians_out == nullptr;
-        const bool hessians_only =
-            values_out == nullptr && gradients_out == nullptr && hessians_out != nullptr;
-        const bool all_outputs =
-            values_out != nullptr && gradients_out != nullptr && hessians_out != nullptr;
-        if (values_only) {
-            if (order == 1) {
-                evaluate_line_order1_values_q4(points, output_stride, values_out);
-                return true;
-            }
-            if (order == 2) {
-                evaluate_line_order2_values_q4(points, output_stride, values_out);
-                return true;
-            }
-            if (order == 3) {
-                evaluate_line_order3_values_q4(points, output_stride, values_out);
-                return true;
-            }
-        }
-        if (order == 1) {
-            if (gradients_only) {
-                evaluate_line_order1_gradients_q4(output_stride, gradients_out);
-                return true;
-            }
-            if (hessians_only) {
-                evaluate_line_order1_hessians_q4(output_stride, hessians_out);
-                return true;
-            }
-            if (all_outputs) {
-                evaluate_line_order1_all_q4(
-                    points, output_stride, values_out, gradients_out, hessians_out);
-                return true;
-            }
-        }
-        if (order == 2) {
-            if (gradients_only) {
-                evaluate_line_order2_gradients_q4(points, output_stride, gradients_out);
-                return true;
-            }
-            if (hessians_only) {
-                evaluate_line_order2_hessians_q4(output_stride, hessians_out);
-                return true;
-            }
-            if (all_outputs) {
-                evaluate_line_order2_all_q4(
-                    points, output_stride, values_out, gradients_out, hessians_out);
-                return true;
-            }
-        }
-        if (order == 3) {
-            if (gradients_only) {
-                evaluate_line_order3_gradients_q4(points, output_stride, gradients_out);
-                return true;
-            }
-            if (hessians_only) {
-                evaluate_line_order3_hessians_q4(points, output_stride, hessians_out);
-                return true;
-            }
-            if (all_outputs) {
-                evaluate_line_order3_all_q4(
-                    points, output_stride, values_out, gradients_out, hessians_out);
-                return true;
-            }
-        }
-    }
-
-    if (topology == LagrangeTopology::Tetrahedron &&
-        order == 3 &&
-        (gradients_out != nullptr || hessians_out != nullptr)) {
-        return false;
-    }
-    if (topology == LagrangeTopology::Triangle &&
-        order == 3 &&
-        hessians_out != nullptr) {
-        return false;
-    }
-    if (topology == LagrangeTopology::Triangle &&
-        order == 1 &&
-        values_out != nullptr &&
-        gradients_out == nullptr &&
-        hessians_out == nullptr) {
-        evaluate_triangle_order1_values_strided(points, output_stride, values_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Triangle &&
-        order == 1 &&
-        values_out == nullptr &&
-        gradients_out != nullptr &&
-        hessians_out == nullptr) {
-        evaluate_triangle_order1_gradients_strided(points.size(), output_stride, gradients_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Tetrahedron &&
-        order == 1 &&
-        values_out != nullptr &&
-        gradients_out == nullptr &&
-        hessians_out == nullptr) {
-        evaluate_tet_order1_values_strided(points, output_stride, values_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Tetrahedron &&
-        order == 1 &&
-        values_out == nullptr &&
-        gradients_out != nullptr &&
-        hessians_out == nullptr) {
-        evaluate_tet_order1_gradients_strided(points.size(), output_stride, gradients_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Triangle &&
-        order == 1 &&
-        values_out == nullptr &&
-        gradients_out == nullptr &&
-        hessians_out != nullptr) {
-        evaluate_zero_hessians_strided(3u, points.size(), output_stride, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Tetrahedron &&
-        order == 1 &&
-        values_out == nullptr &&
-        gradients_out == nullptr &&
-        hessians_out != nullptr) {
-        evaluate_zero_hessians_strided(4u, points.size(), output_stride, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Triangle &&
-        order == 1 &&
-        values_out != nullptr &&
-        gradients_out != nullptr &&
-        hessians_out != nullptr) {
-        evaluate_triangle_order1_values_strided(points, output_stride, values_out);
-        evaluate_triangle_order1_gradients_strided(points.size(), output_stride, gradients_out);
-        evaluate_zero_hessians_strided(3u, points.size(), output_stride, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Tetrahedron &&
-        order == 1 &&
-        values_out != nullptr &&
-        gradients_out != nullptr &&
-        hessians_out != nullptr) {
-        evaluate_tet_order1_values_strided(points, output_stride, values_out);
-        evaluate_tet_order1_gradients_strided(points.size(), output_stride, gradients_out);
-        evaluate_zero_hessians_strided(4u, points.size(), output_stride, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Triangle &&
-        order == 2 &&
-        values_out != nullptr &&
-        gradients_out == nullptr &&
-        hessians_out == nullptr) {
-        evaluate_triangle_order2_values_strided(points, output_stride, values_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Triangle &&
-        order == 2 &&
-        values_out == nullptr &&
-        gradients_out != nullptr &&
-        hessians_out == nullptr) {
-        evaluate_triangle_order2_gradients_strided(points, output_stride, gradients_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Triangle &&
-        order == 2 &&
-        points.size() == 4u &&
-        values_out == nullptr &&
-        gradients_out == nullptr &&
-        hessians_out != nullptr) {
-        evaluate_triangle_order2_hessians_q4(output_stride, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Triangle &&
-        order == 2 &&
-        points.size() == 4u &&
-        values_out != nullptr &&
-        gradients_out != nullptr &&
-        hessians_out != nullptr) {
-        evaluate_triangle_order2_values_strided(points, output_stride, values_out);
-        evaluate_triangle_order2_gradients_strided(points, output_stride, gradients_out);
-        evaluate_triangle_order2_hessians_q4(output_stride, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Tetrahedron &&
-        order == 2 &&
-        values_out != nullptr &&
-        gradients_out == nullptr &&
-        hessians_out == nullptr) {
-        evaluate_tet_order2_values_strided(points, output_stride, values_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Tetrahedron &&
-        order == 2 &&
-        values_out == nullptr &&
-        gradients_out != nullptr &&
-        hessians_out == nullptr) {
-        evaluate_tet_order2_gradients_strided(points, output_stride, gradients_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Tetrahedron &&
-        order == 2 &&
-        points.size() == 4u &&
-        values_out == nullptr &&
-        gradients_out == nullptr &&
-        hessians_out != nullptr) {
-        evaluate_tet_order2_hessians_q4(output_stride, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Tetrahedron &&
-        order == 2 &&
-        points.size() == 4u &&
-        values_out != nullptr &&
-        gradients_out != nullptr &&
-        hessians_out != nullptr) {
-        evaluate_tet_order2_values_strided(points, output_stride, values_out);
-        evaluate_tet_order2_gradients_strided(points, output_stride, gradients_out);
-        evaluate_tet_order2_hessians_q4(output_stride, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Tetrahedron &&
-        order == 3 &&
-        values_out != nullptr &&
-        gradients_out == nullptr &&
-        hessians_out == nullptr) {
-        evaluate_tet_order3_values_strided(points, output_stride, values_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Triangle &&
-        order == 3 &&
-        values_out != nullptr &&
-        gradients_out == nullptr &&
-        hessians_out == nullptr) {
-        evaluate_triangle_order3_values_strided(points, output_stride, values_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Triangle &&
-        order == 3 &&
-        values_out == nullptr &&
-        gradients_out != nullptr &&
-        hessians_out == nullptr) {
-        evaluate_triangle_order3_gradients_strided(points, output_stride, gradients_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Hexahedron &&
-        order == 1 &&
-        values_out != nullptr &&
-        gradients_out == nullptr &&
-        hessians_out == nullptr) {
-        evaluate_hex_order1_values_strided(points, output_stride, values_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Hexahedron &&
-        order == 1 &&
-        values_out == nullptr &&
-        gradients_out != nullptr &&
-        hessians_out == nullptr) {
-        evaluate_hex_order1_outputs_strided<false, true, false>(
-            points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Hexahedron &&
-        order == 1 &&
-        values_out == nullptr &&
-        gradients_out == nullptr &&
-        hessians_out != nullptr) {
-        evaluate_hex_order1_outputs_strided<false, false, true>(
-            points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Hexahedron &&
-        order == 1 &&
-        values_out != nullptr &&
-        gradients_out != nullptr &&
-        hessians_out != nullptr) {
-        evaluate_hex_order1_outputs_strided<true, true, true>(
-            points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        order == 1 &&
-        values_out != nullptr &&
-        gradients_out == nullptr &&
-        hessians_out == nullptr) {
-        evaluate_quad_order1_values_strided(points, output_stride, values_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        order == 1 &&
-        values_out == nullptr &&
-        gradients_out != nullptr &&
-        hessians_out == nullptr) {
-        evaluate_quad_order1_gradients_strided(points, output_stride, gradients_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        order == 1 &&
-        values_out == nullptr &&
-        gradients_out == nullptr &&
-        hessians_out != nullptr) {
-        evaluate_quad_order1_hessians_strided(points.size(), output_stride, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        order == 1 &&
-        points.size() == 4u &&
-        values_out != nullptr &&
-        gradients_out != nullptr &&
-        hessians_out != nullptr) {
-        evaluate_quad_order1_all_q4(points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        order == 2 &&
-        values_out != nullptr &&
-        gradients_out == nullptr &&
-        hessians_out == nullptr) {
-        evaluate_quad_order2_values_strided(points, output_stride, values_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        order == 2 &&
-        values_out == nullptr &&
-        gradients_out != nullptr &&
-        hessians_out == nullptr) {
-        evaluate_quad_order2_gradients_strided(points, output_stride, gradients_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        order == 2 &&
-        values_out == nullptr &&
-        gradients_out == nullptr &&
-        hessians_out != nullptr) {
-        evaluate_quad_order2_hessians_strided(points, output_stride, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        order == 2 &&
-        points.size() == 4u &&
-        values_out != nullptr &&
-        gradients_out != nullptr &&
-        hessians_out != nullptr) {
-        evaluate_quad_order2_all_q4(points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        order == 3 &&
-        values_out != nullptr &&
-        gradients_out == nullptr &&
-        hessians_out == nullptr) {
-        evaluate_quad_order3_values_strided(points, output_stride, values_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        order == 3 &&
-        values_out == nullptr &&
-        gradients_out != nullptr &&
-        hessians_out == nullptr) {
-        evaluate_quad_order3_gradients_strided(points, output_stride, gradients_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        order == 3 &&
-        values_out == nullptr &&
-        gradients_out == nullptr &&
-        hessians_out != nullptr) {
-        evaluate_quad_order3_hessians_strided(points, output_stride, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        order == 3 &&
-        points.size() == 4u &&
-        values_out != nullptr &&
-        gradients_out != nullptr &&
-        hessians_out != nullptr) {
-        evaluate_quad_order3_all_q4(points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Hexahedron &&
-        order == 2 &&
-        values_out != nullptr &&
-        gradients_out == nullptr &&
-        hessians_out == nullptr) {
-        evaluate_hex_order2_values_strided(points, output_stride, values_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Hexahedron &&
-        order == 2 &&
-        points.size() == 4u &&
-        values_out == nullptr &&
-        gradients_out != nullptr &&
-        hessians_out == nullptr) {
-        evaluate_hex_order2_gradients_q4(points, output_stride, gradients_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Hexahedron &&
-        order == 2 &&
-        points.size() == 4u &&
-        values_out == nullptr &&
-        gradients_out == nullptr &&
-        hessians_out != nullptr) {
-        evaluate_hex_order2_hessians_q4(points, output_stride, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Hexahedron &&
-        order == 2 &&
-        points.size() == 4u &&
-        values_out != nullptr &&
-        gradients_out != nullptr &&
-        hessians_out != nullptr) {
-        evaluate_hex_order2_values_strided(points, output_stride, values_out);
-        evaluate_hex_order2_gradients_q4(points, output_stride, gradients_out);
-        evaluate_hex_order2_hessians_q4(points, output_stride, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Hexahedron &&
-        order == 3 &&
-        points.size() == 4u &&
-        output_stride == 4u &&
-        hessians_out != nullptr) {
-        return false;
-    }
-    if (topology == LagrangeTopology::Hexahedron &&
-        order == 3 &&
-        points.size() == 4u &&
-        values_out != nullptr &&
-        gradients_out == nullptr &&
-        hessians_out == nullptr) {
-        evaluate_hex_order3_values_q4(points, output_stride, values_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Hexahedron &&
-        order == 3 &&
-        points.size() == 4u &&
-        values_out == nullptr &&
-        gradients_out != nullptr &&
-        hessians_out == nullptr) {
-        evaluate_hex_order3_gradients_q4(points, output_stride, gradients_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Hexahedron &&
-        order == 3 &&
-        points.size() == 4u &&
-        values_out == nullptr &&
-        gradients_out == nullptr &&
-        hessians_out != nullptr) {
-        evaluate_hex_order3_hessians_q4(points, output_stride, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Hexahedron &&
-        order == 3 &&
-        points.size() == 4u &&
-        values_out != nullptr &&
-        gradients_out != nullptr &&
-        hessians_out != nullptr) {
-        evaluate_hex_order3_all_q4(points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    }
-    if (hessians_out != nullptr && order > 1 &&
-        (topology == LagrangeTopology::Quadrilateral ||
-         topology == LagrangeTopology::Hexahedron)) {
-        return false;
-    }
-    if (hessians_out != nullptr) {
-        const bool hessians_only = values_out == nullptr && gradients_out == nullptr;
-        if (order == 1) {
-            if (topology == LagrangeTopology::Triangle && hessians_only) {
-                evaluate_fast_outputs_with_constant_hessians_strided<LagrangeTriFast<1>>(
-                    points, output_stride, values_out, gradients_out, hessians_out);
-                return true;
-            }
-            if (topology == LagrangeTopology::Tetrahedron) {
-                evaluate_fast_outputs_with_constant_hessians_strided<LagrangeTetFast<1>>(
-                    points, output_stride, values_out, gradients_out, hessians_out);
-                return true;
-            }
-        } else if (order == 2) {
-            if (topology == LagrangeTopology::Triangle && hessians_only) {
-                evaluate_fast_outputs_with_constant_hessians_strided<LagrangeTriFast<2>>(
-                    points, output_stride, values_out, gradients_out, hessians_out);
-                return true;
-            }
-            if (topology == LagrangeTopology::Tetrahedron) {
-                evaluate_fast_outputs_with_constant_hessians_strided<LagrangeTetFast<2>>(
-                    points, output_stride, values_out, gradients_out, hessians_out);
-                return true;
-            }
-        }
-    }
-
-    switch (order) {
-        case 1:
-            return evaluate_fixed_lagrange_fast_strided_order<1>(
-                topology, points, output_stride, values_out, gradients_out, hessians_out);
-        case 2:
-            return evaluate_fixed_lagrange_fast_strided_order<2>(
-                topology, points, output_stride, values_out, gradients_out, hessians_out);
-        case 3:
-            return evaluate_fixed_lagrange_fast_strided_order<3>(
-                topology, points, output_stride, values_out, gradients_out, hessians_out);
-        default:
-            return false;
-    }
-}
-
-bool evaluate_fixed_lagrange_fast_to(LagrangeTopology topology,
-                                     int order,
-                                     const math::Vector<Real, 3>& xi,
-                                     Real* SVMP_RESTRICT values_out,
-                                     Real* SVMP_RESTRICT gradients_out,
-                                     Real* SVMP_RESTRICT hessians_out) {
-    switch (order) {
-        case 1:
-            return evaluate_fixed_lagrange_fast_to_order<1>(
-                topology, xi, values_out, gradients_out, hessians_out);
-        case 2:
-            return evaluate_fixed_lagrange_fast_to_order<2>(
-                topology, xi, values_out, gradients_out, hessians_out);
-        case 3:
-            return evaluate_fixed_lagrange_fast_to_order<3>(
-                topology, xi, values_out, gradients_out, hessians_out);
-        default:
-            return false;
-    }
-}
-
-template<std::size_t N>
-struct AxisMonomialCoefficientTable {
-    std::array<Real, N * N> values{};
-    std::array<Real, N * (N > 1 ? N - 1 : 0)> first{};
-    std::array<Real, N * (N > 2 ? N - 2 : 0)> second{};
-};
-
-template<std::size_t N>
-constexpr AxisMonomialCoefficientTable<N> make_axis_monomial_coefficient_table() {
-    AxisMonomialCoefficientTable<N> table{};
-    std::array<Real, N> nodes{};
-    constexpr int order = static_cast<int>(N) - 1;
-    for (std::size_t i = 0; i < N; ++i) {
-        nodes[i] = detail::equispaced_pm_one_coord(static_cast<int>(i), order);
-    }
-
-    for (std::size_t i = 0; i < N; ++i) {
-        std::array<Real, N> coeffs{};
-        std::array<Real, N> next{};
-        coeffs[0] = Real(1);
-        std::size_t degree = 0;
-        for (std::size_t j = 0; j < N; ++j) {
-            if (j == i) {
-                continue;
-            }
-            next = {};
-            for (std::size_t k = 0; k <= degree; ++k) {
-                next[k] -= nodes[j] * coeffs[k];
-                next[k + 1] += coeffs[k];
-            }
-            coeffs = next;
-            ++degree;
-        }
-
-        Real denominator = Real(1);
-        for (std::size_t j = 0; j < N; ++j) {
-            if (j != i) {
-                denominator *= nodes[i] - nodes[j];
-            }
-        }
-        const Real inv_denominator = Real(1) / denominator;
-        for (std::size_t k = 0; k < N; ++k) {
-            table.values[i * N + k] = coeffs[k] * inv_denominator;
-        }
-        if constexpr (N >= 2) {
-            for (std::size_t k = 1; k < N; ++k) {
-                table.first[i * (N - 1) + (k - 1)] =
-                    static_cast<Real>(k) * table.values[i * N + k];
-            }
-        }
-        if constexpr (N >= 3) {
-            for (std::size_t k = 2; k < N; ++k) {
-                table.second[i * (N - 2) + (k - 2)] =
-                    static_cast<Real>(k * (k - 1)) * table.values[i * N + k];
-            }
-        }
-    }
-
-    return table;
-}
-
-template<std::size_t N>
-void assign_axis_coefficient_table(const AxisMonomialCoefficientTable<N>& table,
-                                   std::vector<Real>& values,
-                                   std::vector<Real>& first,
-                                   std::vector<Real>& second) {
-    assign_array(values, table.values);
-    assign_array(first, table.first);
-    assign_array(second, table.second);
-}
-
-bool assign_precomputed_axis_coefficients(int n_axis,
-                                          std::vector<Real>& values,
-                                          std::vector<Real>& first,
-                                          std::vector<Real>& second) {
-    static constexpr auto kAxisCoefficients1 = make_axis_monomial_coefficient_table<1>();
-    static constexpr auto kAxisCoefficients2 = make_axis_monomial_coefficient_table<2>();
-    static constexpr auto kAxisCoefficients3 = make_axis_monomial_coefficient_table<3>();
-    static constexpr auto kAxisCoefficients4 = make_axis_monomial_coefficient_table<4>();
-    static constexpr auto kAxisCoefficients5 = make_axis_monomial_coefficient_table<5>();
-
-    switch (n_axis) {
-        case 1:
-            assign_axis_coefficient_table(kAxisCoefficients1, values, first, second);
-            return true;
-        case 2:
-            assign_axis_coefficient_table(kAxisCoefficients2, values, first, second);
-            return true;
-        case 3:
-            assign_axis_coefficient_table(kAxisCoefficients3, values, first, second);
-            return true;
-        case 4:
-            assign_axis_coefficient_table(kAxisCoefficients4, values, first, second);
-            return true;
-        case 5:
-            assign_axis_coefficient_table(kAxisCoefficients5, values, first, second);
-            return true;
-        default:
-            return false;
-    }
-}
-
-LagrangeTopologyTraits lagrange_topology_traits(ElementType type) {
-    const auto topo = topology(type);
-    if (topo != LagrangeTopology::Unknown) {
-        return {topo, reference_dimension(type)};
-    }
-
-    throw BasisElementCompatibilityException("Unsupported element type for LagrangeBasis",
-                                             __FILE__, __LINE__, __func__);
-}
-
-std::size_t lattice_index_pm_one(Real coord, int order, const char* context) {
-    if (order <= 0) {
-        if (!coordinate_matches_expected(coord, Real(0))) {
-            throw BasisNodeOrderingException(context, __FILE__, __LINE__, __func__);
-        }
-        return 0;
-    }
-
-    const Real scaled = (coord + Real(1)) * static_cast<Real>(order) / Real(2);
-    const long idx = std::lround(scaled);
-    if (idx < 0 || idx > order ||
-        !coordinate_matches_expected(
-            coord,
-            detail::equispaced_pm_one_coord(static_cast<int>(idx), order))) {
-        throw BasisNodeOrderingException(context, __FILE__, __LINE__, __func__);
-    }
-    return static_cast<std::size_t>(idx);
-}
-
-int simplex_lattice_index(Real coord, int order, const char* context) {
-    if (order <= 0) {
-        if (!coordinate_matches_expected(coord, Real(0)) &&
-            !coordinate_matches_expected(coord, Real(0.25)) &&
-            !coordinate_matches_expected(coord, Real(1) / Real(3))) {
-            throw BasisNodeOrderingException(context, __FILE__, __LINE__, __func__);
-        }
-        return 0;
-    }
-
-    const Real scaled = coord * static_cast<Real>(order);
-    const long idx = std::lround(scaled);
-    const Real reconstructed = static_cast<Real>(idx) / static_cast<Real>(order);
-    if (idx < 0 || idx > order || !coordinate_matches_expected(coord, reconstructed)) {
-        throw BasisNodeOrderingException(context, __FILE__, __LINE__, __func__);
-    }
-    return static_cast<int>(idx);
-}
-
-std::array<int, 4> triangle_exponents_from_public_node(const math::Vector<Real, 3>& node,
-                                                       int order) {
-    if (order == 0) {
-        return {0, 0, 0, 0};
-    }
-
-    const int j = simplex_lattice_index(node[0], order,
-                                        "LagrangeBasis: invalid triangle node coordinate for public ordering");
-    const int k = simplex_lattice_index(node[1], order,
-                                        "LagrangeBasis: invalid triangle node coordinate for public ordering");
-    const int i = order - j - k;
-    if (i < 0) {
-        throw BasisNodeOrderingException("LagrangeBasis: invalid triangle barycentric coordinates for public ordering",
-                                         __FILE__, __LINE__, __func__);
-    }
-    return {i, j, k, 0};
-}
-
-std::array<int, 4> tetrahedron_exponents_from_public_node(const math::Vector<Real, 3>& node,
-                                                          int order) {
-    if (order == 0) {
-        return {0, 0, 0, 0};
-    }
-
-    const int j = simplex_lattice_index(node[0], order,
-                                        "LagrangeBasis: invalid tetrahedron node x-coordinate for public ordering");
-    const int k = simplex_lattice_index(node[1], order,
-                                        "LagrangeBasis: invalid tetrahedron node y-coordinate for public ordering");
-    const int l = simplex_lattice_index(node[2], order,
-                                        "LagrangeBasis: invalid tetrahedron node z-coordinate for public ordering");
-    const int i = order - j - k - l;
-    if (i < 0) {
-        throw BasisNodeOrderingException("LagrangeBasis: invalid tetrahedron barycentric coordinates for public ordering",
-                                         __FILE__, __LINE__, __func__);
-    }
-    return {i, j, k, l};
-}
-
-struct NormalizedLagrangeRequest {
-    ElementType element_type;
-    int order;
-};
-
-// Non-owning view of the per-axis 1D Lagrange basis evaluations
-// (values, first derivative, second derivative), each of length `size`.
-struct AxisBasisEvaluations {
-    const Real* values;
-    const Real* first;
-    const Real* second;
-    std::size_t size;
-};
-
-AxisBasisEvaluations constant_axis_basis() {
-    static const Real kOne[1]  = {Real(1)};
-    static const Real kZero[1] = {Real(0)};
-    return AxisBasisEvaluations{kOne, kZero, kZero, 1};
-}
-
-// Horner-form evaluator for the precomputed 1D Lagrange basis.
-//
-// Inputs are precomputed monomial coefficients of L_i(x), L_i'(x), L_i''(x)
-// (built once at LagrangeBasis construction). Evaluation is purely
-// multiply-add on the coefficients — no divisions and no node-position
-// lookups in the hot path. Templated on N for compile-time loop unrolling
-// and FMA-friendly straight-line code on the common Hex/Quad/Line orders.
-//
-// Layout:
-//   v_coeffs:  N * N entries; row i holds [c_i0, c_i1, ..., c_i(N-1)]
-//              such that L_i(x) = sum_k c_ik * x^k
-//   d_coeffs:  N * (N-1) entries; row i holds derivative coefficients of L_i'(x)
-//   d2_coeffs: N * (N-2) entries; row i holds coefficients of L_i''(x)
-//              (only valid when N >= 3)
-template<int N>
-inline void evaluate_1d_horner_impl(const Real* v_coeffs,
-                                    const Real* d_coeffs,
-                                    const Real* d2_coeffs,
-                                    Real xi,
-                                    Real* values, Real* first, Real* second) {
-    if constexpr (N == 1) {
-        values[0] = v_coeffs[0];
-        if (first)  first[0]  = Real(0);
-        if (second) second[0] = Real(0);
-        return;
-    } else {
-        // Values: degree N-1 polynomials.
-        for (int i = 0; i < N; ++i) {
-            const Real* c = v_coeffs + i * N;
-            Real r = c[N - 1];
-            for (int k = N - 1; k > 0; --k) {
-                r = r * xi + c[k - 1];
-            }
-            values[i] = r;
-        }
-
-        if (!first && !second) return;
-
-        if (first) {
-            // First derivatives: degree N-2 polynomials (per row of d_coeffs).
-            for (int i = 0; i < N; ++i) {
-                const Real* c = d_coeffs + i * (N - 1);
-                Real r = c[N - 2];
-                for (int k = N - 2; k > 0; --k) {
-                    r = r * xi + c[k - 1];
-                }
-                first[i] = r;
-            }
-        }
-
-        if (!second) return;
-
-        if constexpr (N <= 2) {
-            for (int i = 0; i < N; ++i) second[i] = Real(0);
-        } else {
-            // Second derivatives: degree N-3 polynomials (per row of d2_coeffs).
-            for (int i = 0; i < N; ++i) {
-                const Real* c = d2_coeffs + i * (N - 2);
-                Real r = c[N - 3];
-                for (int k = N - 3; k > 0; --k) {
-                    r = r * xi + c[k - 1];
-                }
-                second[i] = r;
-            }
-        }
-    }
-}
-
-void fill_equispaced_barycentric_weights(int n_axis, Real* weights) {
-    const int order = n_axis - 1;
-    Real weight = (order % 2 == 0) ? Real(1) : Real(-1);
-    Real max_abs = Real(0);
-    for (int i = 0; i < n_axis; ++i) {
-        weights[i] = weight;
-        max_abs = std::max(max_abs, std::abs(weight));
-        if (i < order) {
-            weight *= -static_cast<Real>(order - i) / static_cast<Real>(i + 1);
-        }
-    }
-
-    if (max_abs > Real(0)) {
-        const Real inv_scale = Real(1) / max_abs;
-        for (int i = 0; i < n_axis; ++i) {
-            weights[i] *= inv_scale;
-        }
-    }
-}
-
-bool coordinate_matches_axis_node(Real xi, Real node) {
-    return coordinate_matches_expected(xi, node);
-}
-
-struct CompensatedSum {
-    Real sum{Real(0)};
-    Real compensation{Real(0)};
-
-    void add(Real value) noexcept {
-        const Real y = value - compensation;
-        const Real t = sum + y;
-        compensation = (t - sum) - y;
-        sum = t;
-    }
-};
-
-void distribute_residual_by_abs(int n_axis, Real* values, Real residual) {
-    if (values == nullptr || n_axis <= 0 || residual == Real(0)) {
-        return;
-    }
-
-    CompensatedSum abs_sum;
-    int largest_index = 0;
-    Real largest_abs = Real(0);
-    for (int i = 0; i < n_axis; ++i) {
-        const Real magnitude = std::abs(values[i]);
-        abs_sum.add(magnitude);
-        if (magnitude > largest_abs) {
-            largest_abs = magnitude;
-            largest_index = i;
-        }
-    }
-
-    if (abs_sum.sum <= Real(0)) {
-        values[0] += residual;
-        return;
-    }
-
-    const Real inv_abs_sum = Real(1) / abs_sum.sum;
-    CompensatedSum applied;
-    for (int i = 0; i < n_axis; ++i) {
-        const Real correction = residual * std::abs(values[i]) * inv_abs_sum;
-        values[i] += correction;
-        applied.add(correction);
-    }
-    values[largest_index] += residual - applied.sum;
-}
-
-void evaluate_1d_barycentric_runtime(int n_axis,
-                                     Real xi,
-                                     const Real* weights,
-                                     Real* values,
-                                     Real* first,
-                                     Real* second) {
-    const int order = n_axis - 1;
-    BASIS_CHECK_EVAL(weights != nullptr,
-                     "LagrangeBasis: missing cached barycentric weights for runtime axis evaluation");
-
-    int node_index = -1;
-    for (int i = 0; i < n_axis; ++i) {
-        const Real node = detail::equispaced_pm_one_coord(i, order);
-        if (coordinate_matches_axis_node(xi, node)) {
-            node_index = i;
-            break;
-        }
-    }
-
-    if (node_index >= 0) {
-        std::fill(values, values + n_axis, Real(0));
-        values[node_index] = Real(1);
-        if (!first && !second) {
-            return;
-        }
-
-        const Real xk = detail::equispaced_pm_one_coord(node_index, order);
-        const Real wk = weights[static_cast<std::size_t>(node_index)];
-        Real reciprocal_sum = Real(0);
-        if (second) {
-            for (int m = 0; m < n_axis; ++m) {
-                if (m == node_index) {
-                    continue;
-                }
-                const Real xm = detail::equispaced_pm_one_coord(m, order);
-                reciprocal_sum += Real(1) / (xk - xm);
-            }
-        }
-
-        Real first_diagonal = Real(0);
-        Real second_diagonal = Real(0);
-        if (first) {
-            std::fill(first, first + n_axis, Real(0));
-        }
-        if (second) {
-            std::fill(second, second + n_axis, Real(0));
-        }
-
-        for (int j = 0; j < n_axis; ++j) {
-            if (j == node_index) {
-                continue;
-            }
-            const Real xj = detail::equispaced_pm_one_coord(j, order);
-            const Real distance = xk - xj;
-            const Real offdiag_first = weights[static_cast<std::size_t>(j)] / (wk * distance);
-            first_diagonal -= offdiag_first;
-            if (first) {
-                first[j] = offdiag_first;
-            }
-            if (second) {
-                const Real offdiag_second =
-                    Real(2) * offdiag_first * (reciprocal_sum - Real(1) / distance);
-                second[j] = offdiag_second;
-                second_diagonal -= offdiag_second;
-            }
-        }
-        if (first) {
-            first[node_index] = first_diagonal;
-        }
-        if (second) {
-            second[node_index] = second_diagonal;
-        }
-        return;
-    }
-
-    Real sum0 = Real(0);
-    Real sum1 = Real(0);
-    Real sum2 = Real(0);
-    for (int i = 0; i < n_axis; ++i) {
-        const Real node = detail::equispaced_pm_one_coord(i, order);
-        const Real inv_distance = Real(1) / (xi - node);
-        const Real weighted = weights[static_cast<std::size_t>(i)] * inv_distance;
-        sum0 += weighted;
-        sum1 += weighted * inv_distance;
-        sum2 += weighted * inv_distance * inv_distance;
-    }
-
-    const Real inv_sum0 = Real(1) / sum0;
-    const Real first_ratio = sum1 * inv_sum0;
-    const Real second_ratio = sum2 * inv_sum0;
-    const Real first_ratio_sq = first_ratio * first_ratio;
-
-    CompensatedSum value_sum;
-    CompensatedSum first_sum;
-    CompensatedSum second_sum;
-    for (int i = 0; i < n_axis; ++i) {
-        const Real node = detail::equispaced_pm_one_coord(i, order);
-        const Real inv_distance = Real(1) / (xi - node);
-        const Real value = weights[static_cast<std::size_t>(i)] * inv_distance * inv_sum0;
-        values[i] = value;
-        value_sum.add(value);
-        if (first || second) {
-            const Real derivative_factor = first_ratio - inv_distance;
-            if (first) {
-                first[i] = value * derivative_factor;
-                first_sum.add(first[i]);
-            }
-            if (second) {
-                second[i] = value * (derivative_factor * derivative_factor +
-                                     inv_distance * inv_distance -
-                                     Real(2) * second_ratio +
-                                     first_ratio_sq);
-                second_sum.add(second[i]);
-            }
-        }
-    }
-
-    distribute_residual_by_abs(n_axis, values, Real(1) - value_sum.sum);
-    if (first) {
-        distribute_residual_by_abs(n_axis, first, -first_sum.sum);
-    }
-    if (second) {
-        distribute_residual_by_abs(n_axis, second, -second_sum.sum);
-    }
-}
-
-// 1D Lagrange-basis evaluator. Writes n_axis entries to each non-null output
-// buffer. Dispatches to compile-time Horner specializations for sizes 1..9
-// (orders 0..8 — the Lagrange performance sweep) and uses barycentric
-// evaluation above that threshold to avoid high-order monomial conditioning
-// issues.
-void evaluate_1d_basis_to(const Real* v_coeffs,
-                          const Real* d_coeffs,
-                          const Real* d2_coeffs,
-                          const Real* barycentric_weights,
-                          int n_axis, Real xi,
-                          Real* values, Real* first, Real* second) {
-    switch (n_axis) {
-        case 1: evaluate_1d_horner_impl<1>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); return;
-        case 2: evaluate_1d_horner_impl<2>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); return;
-        case 3: evaluate_1d_horner_impl<3>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); return;
-        case 4: evaluate_1d_horner_impl<4>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); return;
-        case 5: evaluate_1d_horner_impl<5>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); return;
-        case 6: evaluate_1d_horner_impl<6>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); return;
-        case 7: evaluate_1d_horner_impl<7>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); return;
-        case 8: evaluate_1d_horner_impl<8>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); return;
-        case 9: evaluate_1d_horner_impl<9>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); return;
-        default:
-            evaluate_1d_barycentric_runtime(n_axis, xi, barycentric_weights, values, first, second);
-            return;
-    }
-}
-
-// Selects which derivative passes are computed by the 1D evaluator.
-enum class AxisDeriv {
-    ValuesOnly,           // skip first and second
-    ValuesAndFirst,       // for gradients
-    ValuesAndFirstAndSecond, // for hessians or fused evaluate_all
-};
-
-// Per-axis storage (values, first derivative, second derivative). Backed by
-// per-thread scratch that grows lazily; subsequent calls reuse capacity with no
-// reallocation.
-struct AxisScratch {
-    std::vector<Real> values;
-    std::vector<Real> first;
-    std::vector<Real> second;
-
-    void reserveFor(std::size_t n) {
-        if (values.size() < n) values.resize(n);
-        if (first.size() < n) first.resize(n);
-        if (second.size() < n) second.resize(n);
-    }
-};
-
-struct AxisBatchScratch {
-    std::vector<Real> values;
-    std::vector<Real> first;
-    std::vector<Real> second;
-
-    void resizeFor(std::size_t count, AxisDeriv level) {
-        if (values.size() < count) values.resize(count);
-        if (level != AxisDeriv::ValuesOnly && first.size() < count) first.resize(count);
-        if (level == AxisDeriv::ValuesAndFirstAndSecond && second.size() < count) second.resize(count);
-    }
-};
-
-template<int Order, bool NeedFirst, bool NeedSecond>
-inline void fill_simplex_factor_sequence_fixed(Real lambda,
-                                               Real* SVMP_RESTRICT phi,
-                                               Real* SVMP_RESTRICT dphi,
-                                               Real* SVMP_RESTRICT d2phi) {
-    static_assert(!NeedSecond || NeedFirst,
-                  "second derivative factors require first-derivative recurrence state");
-    phi[0] = Real(1);
-    if constexpr (NeedFirst) {
-        dphi[0] = Real(0);
-    }
-    if constexpr (NeedSecond) {
-        d2phi[0] = Real(0);
-    }
-
-    const Real t = static_cast<Real>(Order) * lambda;
-    const Real dt_dlambda = static_cast<Real>(Order);
-    Real dphi_dt_prev = Real(0);
-    Real d2phi_dt2_prev = Real(0);
-    for (int a = 1; a <= Order; ++a) {
-        const std::size_t au = static_cast<std::size_t>(a);
-        const Real inv_a = Real(1) / static_cast<Real>(a);
-        const Real s = (t - static_cast<Real>(a - 1)) * inv_a;
-        phi[au] = s * phi[au - 1];
-
-        if constexpr (NeedFirst) {
-            const Real dphi_dt_old = dphi_dt_prev;
-            const Real dphi_dt = inv_a * phi[au - 1] + s * dphi_dt_old;
-            dphi[au] = dt_dlambda * dphi_dt;
-
-            if constexpr (NeedSecond) {
-                const Real d2phi_dt2 = Real(2) * inv_a * dphi_dt_old + s * d2phi_dt2_prev;
-                d2phi[au] = dt_dlambda * dt_dlambda * d2phi_dt2;
-                d2phi_dt2_prev = d2phi_dt2;
-            }
-            dphi_dt_prev = dphi_dt;
-        }
-    }
-}
-
-template<int Order, bool NeedSecond>
-inline void fill_triangle_factors_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    Real (&phi0)[4][Order + 1],
-    Real (&phi1)[4][Order + 1],
-    Real (&phi2)[4][Order + 1],
-    Real (&dphi0)[4][Order + 1],
-    Real (&dphi1)[4][Order + 1],
-    Real (&dphi2)[4][Order + 1],
-    Real (&d2phi0)[4][Order + 1],
-    Real (&d2phi1)[4][Order + 1],
-    Real (&d2phi2)[4][Order + 1]) {
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-        if constexpr (NeedSecond) {
-            fill_simplex_factor_sequence_fixed<Order, true, true>(
-                l0, phi0[q], dphi0[q], d2phi0[q]);
-            fill_simplex_factor_sequence_fixed<Order, true, true>(
-                l1, phi1[q], dphi1[q], d2phi1[q]);
-            fill_simplex_factor_sequence_fixed<Order, true, true>(
-                l2, phi2[q], dphi2[q], d2phi2[q]);
-        } else {
-            fill_simplex_factor_sequence_fixed<Order, true, false>(
-                l0, phi0[q], dphi0[q], nullptr);
-            fill_simplex_factor_sequence_fixed<Order, true, false>(
-                l1, phi1[q], dphi1[q], nullptr);
-            fill_simplex_factor_sequence_fixed<Order, true, false>(
-                l2, phi2[q], dphi2[q], nullptr);
-        }
-    }
-}
-
-template<std::size_t Q>
-inline void write_wedge_gradient_strided_q(std::size_t tri_stride,
-                                           std::size_t axis_stride,
-                                           std::size_t tri,
-                                           std::size_t z,
-                                           std::size_t output_stride,
-                                           const Real* SVMP_RESTRICT tri_values,
-                                           const Real* SVMP_RESTRICT tri_g,
-                                           const AxisBatchScratch& axis_batch,
-                                           Real* SVMP_RESTRICT g) {
-    const std::size_t tri_q = tri * tri_stride + Q;
-    const std::size_t z_q = Q * axis_stride + z;
-    const Real tri_v = tri_values[tri_q];
-    const Real zv = axis_batch.values[z_q];
-    g[0u * output_stride + Q] = tri_g[0u * tri_stride + Q] * zv;
-    g[1u * output_stride + Q] = tri_g[1u * tri_stride + Q] * zv;
-    g[2u * output_stride + Q] = tri_v * axis_batch.first[z_q];
-}
-
-template<std::size_t Q>
-inline void write_wedge_gradient_stride4_q(std::size_t tri_stride,
-                                           std::size_t axis_stride,
-                                           std::size_t tri,
-                                           std::size_t z,
-                                           const Real* SVMP_RESTRICT tri_values,
-                                           const Real* SVMP_RESTRICT tri_g,
-                                           const AxisBatchScratch& axis_batch,
-                                           Real* SVMP_RESTRICT g) {
-    const std::size_t tri_q = tri * tri_stride + Q;
-    const std::size_t z_q = Q * axis_stride + z;
-    const Real tri_v = tri_values[tri_q];
-    const Real zv = axis_batch.values[z_q];
-    g[Q] = tri_g[0u * tri_stride + Q] * zv;
-    g[4u + Q] = tri_g[1u * tri_stride + Q] * zv;
-    g[8u + Q] = tri_v * axis_batch.first[z_q];
-}
-
-template<std::size_t Q>
-inline void write_wedge_hessian_strided_q(std::size_t tri_stride,
-                                          std::size_t axis_stride,
-                                          std::size_t tri,
-                                          std::size_t z,
-                                          std::size_t output_stride,
-                                          const Real* SVMP_RESTRICT tri_values,
-                                          const Real* SVMP_RESTRICT tri_g,
-                                          const Real* SVMP_RESTRICT tri_H,
-                                          const AxisBatchScratch& axis_batch,
-                                          Real* SVMP_RESTRICT H) {
-    const std::size_t tri_q = tri * tri_stride + Q;
-    const std::size_t z_q = Q * axis_stride + z;
-    const Real tri_v = tri_values[tri_q];
-    const Real zv = axis_batch.values[z_q];
-    const Real zd = axis_batch.first[z_q];
-    const Real tri_gx = tri_g[0u * tri_stride + Q];
-    const Real tri_gy = tri_g[1u * tri_stride + Q];
-    const Real tri_hxx = tri_H[0u * tri_stride + Q];
-    const Real tri_hxy = tri_H[1u * tri_stride + Q];
-    const Real tri_hyy = tri_H[2u * tri_stride + Q];
-    const Real hxz = tri_gx * zd;
-    const Real hxy = tri_hxy * zv;
-    const Real hyz = tri_gy * zd;
-
-    H[0u * output_stride + Q] = tri_hxx * zv;
-    H[1u * output_stride + Q] = hxy;
-    H[2u * output_stride + Q] = hxz;
-    H[3u * output_stride + Q] = hxy;
-    H[4u * output_stride + Q] = tri_hyy * zv;
-    H[5u * output_stride + Q] = hyz;
-    H[6u * output_stride + Q] = hxz;
-    H[7u * output_stride + Q] = hyz;
-    H[8u * output_stride + Q] = tri_v * axis_batch.second[z_q];
-}
-
-template<std::size_t Q>
-inline void write_wedge_hessian_stride4_q(std::size_t tri_stride,
-                                          std::size_t axis_stride,
-                                          std::size_t tri,
-                                          std::size_t z,
-                                          const Real* SVMP_RESTRICT tri_values,
-                                          const Real* SVMP_RESTRICT tri_g,
-                                          const Real* SVMP_RESTRICT tri_H,
-                                          const AxisBatchScratch& axis_batch,
-                                          Real* SVMP_RESTRICT H) {
-    const std::size_t tri_q = tri * tri_stride + Q;
-    const std::size_t z_q = Q * axis_stride + z;
-    const Real tri_v = tri_values[tri_q];
-    const Real zv = axis_batch.values[z_q];
-    const Real zd = axis_batch.first[z_q];
-    const Real tri_gx = tri_g[0u * tri_stride + Q];
-    const Real tri_gy = tri_g[1u * tri_stride + Q];
-    const Real tri_hxx = tri_H[0u * tri_stride + Q];
-    const Real tri_hxy = tri_H[1u * tri_stride + Q];
-    const Real tri_hyy = tri_H[2u * tri_stride + Q];
-    const Real hxz = tri_gx * zd;
-    const Real hxy = tri_hxy * zv;
-    const Real hyz = tri_gy * zd;
-
-    H[Q] = tri_hxx * zv;
-    H[4u + Q] = hxy;
-    H[8u + Q] = hxz;
-    H[12u + Q] = hxy;
-    H[16u + Q] = tri_hyy * zv;
-    H[20u + Q] = hyz;
-    H[24u + Q] = hxz;
-    H[28u + Q] = hyz;
-    H[32u + Q] = tri_v * axis_batch.second[z_q];
-}
-
-template<std::size_t Q>
-inline void write_wedge_all_strided_q(std::size_t tri_stride,
-                                      std::size_t axis_stride,
-                                      std::size_t tri,
-                                      std::size_t z,
-                                      std::size_t output_stride,
-                                      const Real* SVMP_RESTRICT tri_values,
-                                      const Real* SVMP_RESTRICT tri_g,
-                                      const Real* SVMP_RESTRICT tri_H,
-                                      const AxisBatchScratch& axis_batch,
-                                      Real* SVMP_RESTRICT value_row,
-                                      Real* SVMP_RESTRICT g,
-                                      Real* SVMP_RESTRICT H) {
-    const std::size_t tri_q = tri * tri_stride + Q;
-    const std::size_t z_q = Q * axis_stride + z;
-    const Real tri_v = tri_values[tri_q];
-    const Real zv = axis_batch.values[z_q];
-    const Real zd = axis_batch.first[z_q];
-    const Real tri_gx = tri_g[0u * tri_stride + Q];
-    const Real tri_gy = tri_g[1u * tri_stride + Q];
-    const Real tri_hxx = tri_H[0u * tri_stride + Q];
-    const Real tri_hxy = tri_H[1u * tri_stride + Q];
-    const Real tri_hyy = tri_H[2u * tri_stride + Q];
-    const Real hxz = tri_gx * zd;
-    const Real hxy = tri_hxy * zv;
-    const Real hyz = tri_gy * zd;
-
-    value_row[Q] = tri_v * zv;
-    g[0u * output_stride + Q] = tri_gx * zv;
-    g[1u * output_stride + Q] = tri_gy * zv;
-    g[2u * output_stride + Q] = tri_v * zd;
-    H[0u * output_stride + Q] = tri_hxx * zv;
-    H[1u * output_stride + Q] = hxy;
-    H[2u * output_stride + Q] = hxz;
-    H[3u * output_stride + Q] = hxy;
-    H[4u * output_stride + Q] = tri_hyy * zv;
-    H[5u * output_stride + Q] = hyz;
-    H[6u * output_stride + Q] = hxz;
-    H[7u * output_stride + Q] = hyz;
-    H[8u * output_stride + Q] = tri_v * axis_batch.second[z_q];
-}
-
-template<std::size_t Q>
-inline void write_wedge_all_stride4_q(std::size_t tri_stride,
-                                      std::size_t axis_stride,
-                                      std::size_t tri,
-                                      std::size_t z,
-                                      const Real* SVMP_RESTRICT tri_values,
-                                      const Real* SVMP_RESTRICT tri_g,
-                                      const Real* SVMP_RESTRICT tri_H,
-                                      const AxisBatchScratch& axis_batch,
-                                      Real* SVMP_RESTRICT value_row,
-                                      Real* SVMP_RESTRICT g,
-                                      Real* SVMP_RESTRICT H) {
-    const std::size_t tri_q = tri * tri_stride + Q;
-    const std::size_t z_q = Q * axis_stride + z;
-    const Real tri_v = tri_values[tri_q];
-    const Real zv = axis_batch.values[z_q];
-    const Real zd = axis_batch.first[z_q];
-    const Real tri_gx = tri_g[0u * tri_stride + Q];
-    const Real tri_gy = tri_g[1u * tri_stride + Q];
-    const Real tri_hxx = tri_H[0u * tri_stride + Q];
-    const Real tri_hxy = tri_H[1u * tri_stride + Q];
-    const Real tri_hyy = tri_H[2u * tri_stride + Q];
-    const Real hxz = tri_gx * zd;
-    const Real hxy = tri_hxy * zv;
-    const Real hyz = tri_gy * zd;
-
-    value_row[Q] = tri_v * zv;
-    g[Q] = tri_gx * zv;
-    g[4u + Q] = tri_gy * zv;
-    g[8u + Q] = tri_v * zd;
-    H[Q] = tri_hxx * zv;
-    H[4u + Q] = hxy;
-    H[8u + Q] = hxz;
-    H[12u + Q] = hxy;
-    H[16u + Q] = tri_hyy * zv;
-    H[20u + Q] = hyz;
-    H[24u + Q] = hxz;
-    H[28u + Q] = hyz;
-    H[32u + Q] = tri_v * axis_batch.second[z_q];
-}
-
-template<int Order, bool NeedHess>
-bool evaluate_wedge_fused_stride4_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    const std::vector<std::size_t>& wedge_node_by_tri_z,
-    const std::vector<math::Vector<Real, 3>>& points,
-    const AxisBatchScratch& axis_batch,
-    int n_axis,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    static_assert(Order >= 3 && Order <= 8, "fused wedge q4 path covers orders 3..8");
-    const std::size_t tri_count = simplex_exponents.size();
-    const std::size_t z_count = static_cast<std::size_t>(n_axis);
-    if (points.size() != 4u ||
-        z_count != static_cast<std::size_t>(Order + 1) ||
-        wedge_node_by_tri_z.size() != tri_count * z_count) {
-        return false;
-    }
-
-    Real phi0[4][Order + 1];
-    Real phi1[4][Order + 1];
-    Real phi2[4][Order + 1];
-    Real dphi0[4][Order + 1];
-    Real dphi1[4][Order + 1];
-    Real dphi2[4][Order + 1];
-    Real d2phi0[4][Order + 1];
-    Real d2phi1[4][Order + 1];
-    Real d2phi2[4][Order + 1];
-    fill_triangle_factors_q4<Order, NeedHess>(
-        points, phi0, phi1, phi2, dphi0, dphi1, dphi2, d2phi0, d2phi1, d2phi2);
-
-    for (std::size_t tri = 0; tri < tri_count; ++tri) {
-        const auto& e = simplex_exponents[tri];
-        const std::size_t i0 = static_cast<std::size_t>(e[0]);
-        const std::size_t i1 = static_cast<std::size_t>(e[1]);
-        const std::size_t i2 = static_cast<std::size_t>(e[2]);
-
-        Real tri_v[4];
-        Real tri_gx[4];
-        Real tri_gy[4];
-        Real tri_hxx[4];
-        Real tri_hxy[4];
-        Real tri_hyy[4];
-        for (std::size_t q = 0; q < 4u; ++q) {
-            const Real v0 = phi0[q][i0];
-            const Real v1 = phi1[q][i1];
-            const Real v2 = phi2[q][i2];
-            const Real D0 = dphi0[q][i0];
-            const Real D1 = dphi1[q][i1];
-            const Real D2 = dphi2[q][i2];
-            const Real dl0 = D0 * v1 * v2;
-            tri_v[q] = v0 * v1 * v2;
-            tri_gx[q] = v0 * D1 * v2 - dl0;
-            tri_gy[q] = v0 * v1 * D2 - dl0;
-
-            if constexpr (NeedHess) {
-                const Real DD0 = d2phi0[q][i0];
-                const Real DD1 = d2phi1[q][i1];
-                const Real DD2 = d2phi2[q][i2];
-                const Real H00 = DD0 * v1 * v2;
-                const Real H11 = v0 * DD1 * v2;
-                const Real H22 = v0 * v1 * DD2;
-                const Real H01 = D0 * D1 * v2;
-                const Real H02 = D0 * v1 * D2;
-                const Real H12 = v0 * D1 * D2;
-                tri_hxx[q] = H00 - Real(2) * H01 + H11;
-                tri_hxy[q] = H00 - H01 - H02 + H12;
-                tri_hyy[q] = H00 - Real(2) * H02 + H22;
-            }
-        }
-
-        for (std::size_t z = 0; z < z_count; ++z) {
-            const std::size_t node = wedge_node_by_tri_z[tri * z_count + z];
-            Real* SVMP_RESTRICT value_row =
-                values_out != nullptr ? values_out + node * 4u : nullptr;
-            Real* SVMP_RESTRICT g =
-                gradients_out != nullptr ? gradients_out + node * 12u : nullptr;
-            Real* SVMP_RESTRICT H =
-                hessians_out != nullptr ? hessians_out + node * 36u : nullptr;
-
-            const Real z0 = axis_batch.values[z];
-            const Real z1 = axis_batch.values[z_count + z];
-            const Real z2 = axis_batch.values[2u * z_count + z];
-            const Real z3 = axis_batch.values[3u * z_count + z];
-            const Real dz0 = axis_batch.first[z];
-            const Real dz1 = axis_batch.first[z_count + z];
-            const Real dz2 = axis_batch.first[2u * z_count + z];
-            const Real dz3 = axis_batch.first[3u * z_count + z];
-
-            if (value_row != nullptr) {
-                value_row[0] = tri_v[0] * z0;
-                value_row[1] = tri_v[1] * z1;
-                value_row[2] = tri_v[2] * z2;
-                value_row[3] = tri_v[3] * z3;
-            }
-            if (g != nullptr) {
-                g[0] = tri_gx[0] * z0;
-                g[1] = tri_gx[1] * z1;
-                g[2] = tri_gx[2] * z2;
-                g[3] = tri_gx[3] * z3;
-                g[4] = tri_gy[0] * z0;
-                g[5] = tri_gy[1] * z1;
-                g[6] = tri_gy[2] * z2;
-                g[7] = tri_gy[3] * z3;
-                g[8] = tri_v[0] * dz0;
-                g[9] = tri_v[1] * dz1;
-                g[10] = tri_v[2] * dz2;
-                g[11] = tri_v[3] * dz3;
-            }
-            if constexpr (NeedHess) {
-                if (H != nullptr) {
-                    const Real d2z0 = axis_batch.second[z];
-                    const Real d2z1 = axis_batch.second[z_count + z];
-                    const Real d2z2 = axis_batch.second[2u * z_count + z];
-                    const Real d2z3 = axis_batch.second[3u * z_count + z];
-                    const Real hxz0 = tri_gx[0] * dz0;
-                    const Real hxz1 = tri_gx[1] * dz1;
-                    const Real hxz2 = tri_gx[2] * dz2;
-                    const Real hxz3 = tri_gx[3] * dz3;
-                    const Real hyz0 = tri_gy[0] * dz0;
-                    const Real hyz1 = tri_gy[1] * dz1;
-                    const Real hyz2 = tri_gy[2] * dz2;
-                    const Real hyz3 = tri_gy[3] * dz3;
-                    H[0] = tri_hxx[0] * z0;
-                    H[1] = tri_hxx[1] * z1;
-                    H[2] = tri_hxx[2] * z2;
-                    H[3] = tri_hxx[3] * z3;
-                    H[4] = tri_hxy[0] * z0;
-                    H[5] = tri_hxy[1] * z1;
-                    H[6] = tri_hxy[2] * z2;
-                    H[7] = tri_hxy[3] * z3;
-                    H[8] = hxz0;
-                    H[9] = hxz1;
-                    H[10] = hxz2;
-                    H[11] = hxz3;
-                    H[12] = H[4];
-                    H[13] = H[5];
-                    H[14] = H[6];
-                    H[15] = H[7];
-                    H[16] = tri_hyy[0] * z0;
-                    H[17] = tri_hyy[1] * z1;
-                    H[18] = tri_hyy[2] * z2;
-                    H[19] = tri_hyy[3] * z3;
-                    H[20] = hyz0;
-                    H[21] = hyz1;
-                    H[22] = hyz2;
-                    H[23] = hyz3;
-                    H[24] = hxz0;
-                    H[25] = hxz1;
-                    H[26] = hxz2;
-                    H[27] = hxz3;
-                    H[28] = hyz0;
-                    H[29] = hyz1;
-                    H[30] = hyz2;
-                    H[31] = hyz3;
-                    H[32] = tri_v[0] * d2z0;
-                    H[33] = tri_v[1] * d2z1;
-                    H[34] = tri_v[2] * d2z2;
-                    H[35] = tri_v[3] * d2z3;
-                }
-            }
-        }
-    }
-    return true;
-}
-
-template<bool NeedHess>
-bool try_evaluate_wedge_fused_stride4_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    const std::vector<std::size_t>& wedge_node_by_tri_z,
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    const AxisBatchScratch& axis_batch,
-    int n_axis,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    switch (order) {
-        case 3:
-            return evaluate_wedge_fused_stride4_q4<3, NeedHess>(
-                simplex_exponents, wedge_node_by_tri_z, points, axis_batch, n_axis,
-                values_out, gradients_out, hessians_out);
-        case 4:
-            return evaluate_wedge_fused_stride4_q4<4, NeedHess>(
-                simplex_exponents, wedge_node_by_tri_z, points, axis_batch, n_axis,
-                values_out, gradients_out, hessians_out);
-        case 5:
-            return evaluate_wedge_fused_stride4_q4<5, NeedHess>(
-                simplex_exponents, wedge_node_by_tri_z, points, axis_batch, n_axis,
-                values_out, gradients_out, hessians_out);
-        case 6:
-            return evaluate_wedge_fused_stride4_q4<6, NeedHess>(
-                simplex_exponents, wedge_node_by_tri_z, points, axis_batch, n_axis,
-                values_out, gradients_out, hessians_out);
-        case 7:
-            return evaluate_wedge_fused_stride4_q4<7, NeedHess>(
-                simplex_exponents, wedge_node_by_tri_z, points, axis_batch, n_axis,
-                values_out, gradients_out, hessians_out);
-        case 8:
-            return evaluate_wedge_fused_stride4_q4<8, NeedHess>(
-                simplex_exponents, wedge_node_by_tri_z, points, axis_batch, n_axis,
-                values_out, gradients_out, hessians_out);
-        default:
-            return false;
-    }
-}
-
-struct TensorProductTableScratch {
-    std::vector<Real> vv;
-    std::vector<Real> dv;
-    std::vector<Real> vd;
-    std::vector<Real> d2v;
-    std::vector<Real> vd2;
-    std::vector<Real> dd;
-
-    void resizeFor(std::size_t count) {
-        if (vv.size() < count) vv.resize(count);
-        if (dv.size() < count) dv.resize(count);
-        if (vd.size() < count) vd.resize(count);
-        if (d2v.size() < count) d2v.resize(count);
-        if (vd2.size() < count) vd2.resize(count);
-        if (dd.size() < count) dd.resize(count);
-    }
-};
-
-// Caller-provided scratch buffers used by tensor-product evaluation. Three
-// independent axes plus reusable simplex/wedge intermediates.
-struct LagrangeEvaluateScratch {
-    AxisScratch axis_x;
-    AxisScratch axis_y;
-    AxisScratch axis_z;
-    AxisBatchScratch axis_x_batch;
-    AxisBatchScratch axis_y_batch;
-    AxisBatchScratch axis_z_batch;
-    TensorProductTableScratch tensor_tables;
-
-    std::vector<Real> tri_values;
-    std::vector<Gradient> tri_gradients;
-    std::vector<Hessian> tri_hessians;
-    std::vector<Real> tri_gradient_components;
-    std::vector<Real> tri_hessian_components;
-    std::vector<Real> wedge_tri_values_batch;
-    std::vector<Real> wedge_tri_gradient_batch;
-    std::vector<Real> wedge_tri_hessian_batch;
-
-    std::vector<Real> strided_values_tmp;
-    std::vector<Real> strided_gradients_tmp;
-    std::vector<Real> strided_hessians_tmp;
-
-    void prewarm(int max_order, std::size_t max_qpts) {
-        const int clamped_order = std::max(max_order, 0);
-        const std::size_t axis_size = static_cast<std::size_t>(clamped_order) + 1u;
-        const std::size_t axis_batch_size = axis_size * max_qpts;
-        const std::size_t tensor_table_size =
-            axis_size * axis_size * std::max<std::size_t>(max_qpts, 1u);
-        const std::size_t tensor_dofs = tensor_table_size * axis_size;
-        const std::size_t tri_count = axis_size * (axis_size + 1u) / 2u;
-
-        axis_x.reserveFor(axis_size);
-        axis_y.reserveFor(axis_size);
-        axis_z.reserveFor(axis_size);
-        axis_x_batch.resizeFor(axis_batch_size, AxisDeriv::ValuesAndFirstAndSecond);
-        axis_y_batch.resizeFor(axis_batch_size, AxisDeriv::ValuesAndFirstAndSecond);
-        axis_z_batch.resizeFor(axis_batch_size, AxisDeriv::ValuesAndFirstAndSecond);
-        tensor_tables.resizeFor(tensor_table_size);
-        tri_values.reserve(tri_count);
-        tri_gradients.reserve(tri_count);
-        tri_hessians.reserve(tri_count);
-        tri_gradient_components.reserve(tri_count * 3u);
-        tri_hessian_components.reserve(tri_count * 9u);
-        wedge_tri_values_batch.reserve(tri_count * max_qpts);
-        wedge_tri_gradient_batch.reserve(tri_count * 3u * max_qpts);
-        wedge_tri_hessian_batch.reserve(tri_count * 9u * max_qpts);
-        strided_values_tmp.reserve(tensor_dofs);
-        strided_gradients_tmp.reserve(tensor_dofs * 3u);
-        strided_hessians_tmp.reserve(tensor_dofs * 9u);
-    }
-};
-
-LagrangeEvaluateScratch& evaluate_scratch() {
-    // Scratch is intentionally thread-local: assembly and benchmark callers run
-    // evaluation on persistent worker threads, so capacity is reused by thread.
-    static thread_local LagrangeEvaluateScratch s;
-    return s;
-}
-
-// Fill axis scratch and return a non-owning view. Uncomputed slots still have
-// valid pointers to scratch storage (they may hold stale data) — callers must
-// only read the slots they requested via `level`. Common low orders use
-// precomputed Horner coefficients; high orders use barycentric axis evaluation.
-AxisBasisEvaluations fill_axis_scratch(AxisScratch& s,
-                                       const Real* v_coeffs,
-                                       const Real* d_coeffs,
-                                       const Real* d2_coeffs,
-                                       const Real* barycentric_weights,
-                                       int n_axis, Real xi,
-                                       AxisDeriv level) {
-    const std::size_t n = static_cast<std::size_t>(n_axis);
-    s.reserveFor(n);
-    Real* first  = (level == AxisDeriv::ValuesOnly) ? nullptr : s.first.data();
-    Real* second = (level == AxisDeriv::ValuesAndFirstAndSecond) ? s.second.data() : nullptr;
-    evaluate_1d_basis_to(v_coeffs, d_coeffs, d2_coeffs, barycentric_weights,
-                         n_axis, xi, s.values.data(), first, second);
-    return AxisBasisEvaluations{s.values.data(), s.first.data(), s.second.data(), n};
-}
-
-void fill_axis_batch(AxisBatchScratch& scratch,
-                     const std::vector<math::Vector<Real, 3>>& points,
-                     std::size_t component,
-                     const Real* v_coeffs,
-                     const Real* d_coeffs,
-                     const Real* d2_coeffs,
-                     const Real* barycentric_weights,
-                     int n_axis,
-                     AxisDeriv level) {
-    const std::size_t count = points.size() * static_cast<std::size_t>(n_axis);
-    scratch.resizeFor(count, level);
-    Real* first = (level == AxisDeriv::ValuesOnly) ? nullptr : scratch.first.data();
-    Real* second = (level == AxisDeriv::ValuesAndFirstAndSecond) ? scratch.second.data() : nullptr;
-    const std::size_t axis_stride = static_cast<std::size_t>(n_axis);
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        evaluate_1d_basis_to(v_coeffs, d_coeffs, d2_coeffs, barycentric_weights, n_axis,
-                             points[q][component],
-                             scratch.values.data() + q * axis_stride,
-                             first ? first + q * axis_stride : nullptr,
-                             second ? second + q * axis_stride : nullptr);
-    }
-}
-
-// Maximum yz-table footprint that fits comfortably on the stack for the
-// Lagrange performance sweep. Order-8 hex q=4 needs 4*(9x9) entries per table.
-// Higher orders fall back to thread_local heap buffers.
-inline constexpr std::size_t kMaxStackYZ = 384;
-
-struct TensorProductVectorSink {
-    std::vector<Real>* values;
-    std::vector<Gradient>* gradients;
-    std::vector<Hessian>* hessians;
-
-    bool wants_values() const noexcept { return values != nullptr; }
-    bool wants_gradients() const noexcept { return gradients != nullptr; }
-    bool wants_hessians() const noexcept { return hessians != nullptr; }
-
-    void prepare(std::size_t n_nodes) const {
-        if (values)    values->resize(n_nodes);
-        if (gradients) gradients->resize(n_nodes);
-        if (hessians)  hessians->resize(n_nodes);
-    }
-
-    void write_value(std::size_t n, Real value) const {
-        (*values)[n] = value;
-    }
-
-    void write_gradient(std::size_t n, Real dx, Real dy, Real dz) const {
-        auto& g = (*gradients)[n];
-        g[0] = dx;
-        g[1] = dy;
-        g[2] = dz;
-    }
-
-    void write_hessian(std::size_t n,
-                       Real xx,
-                       Real yy,
-                       Real zz,
-                       Real xy,
-                       Real xz,
-                       Real yz) const {
-        (*hessians)[n] = make_symmetric_hessian(xx, yy, zz, xy, xz, yz);
-    }
-};
-
-struct TensorProductRawSink {
-    Real* values;
-    Real* gradients;
-    Real* hessians;
-
-    bool wants_values() const noexcept { return values != nullptr; }
-    bool wants_gradients() const noexcept { return gradients != nullptr; }
-    bool wants_hessians() const noexcept { return hessians != nullptr; }
-
-    void prepare(std::size_t) const {}
-
-    void write_value(std::size_t n, Real value) const {
-        values[n] = value;
-    }
-
-    void write_gradient(std::size_t n, Real dx, Real dy, Real dz) const {
-        Real* g = gradients + n * 3u;
-        g[0] = dx;
-        g[1] = dy;
-        g[2] = dz;
-    }
-
-    void write_hessian(std::size_t n,
-                       Real xx,
-                       Real yy,
-                       Real zz,
-                       Real xy,
-                       Real xz,
-                       Real yz) const {
-        Real* H = hessians + n * 9u;
-        H[0] = xx;
-        H[4] = yy;
-        H[8] = zz;
-        H[1] = xy; H[3] = xy;
-        H[2] = xz; H[6] = xz;
-        H[5] = yz; H[7] = yz;
-    }
-};
-
-// Fused sum-factorized tensor-product evaluator.
-//
-// Precomputes one to six (ny x nz)-shaped tables of partial products
-// `M_xy[j*nz + k]` so that the inner per-node loop performs at most one
-// multiplication per output instead of two. With all three output buffers
-// supplied, this is the fused values + gradients + hessians path that shares
-// every per-axis evaluation.
-//
-// Per-node multiply count (vs. the unfactored variants):
-//   values only       : 1  (was 2)
-//   gradients only    : 3  (was 6)
-//   hessians only     : 6  (was 12)
-//   all three         : 10 (was 20)
-//
-// Dimensional scope: works uniformly for Line/Quadrilateral/Hexahedron with
-// the unused axes' size folded to 1 via constant_axis_basis().
-template <typename Sink>
-void evaluate_tensor_product_factorized_impl(
-    const std::vector<std::array<std::size_t, 3>>& tensor_indices,
-    const AxisBasisEvaluations& x_axis,
-    const AxisBasisEvaluations& y_axis,
-    const AxisBasisEvaluations& z_axis,
-    const Sink& sink) {
-    const std::size_t ny = y_axis.size;
-    const std::size_t nz = z_axis.size;
-    const std::size_t nyz = ny * nz;
-    const bool need_values = sink.wants_values();
-    const bool need_grad = sink.wants_gradients();
-    const bool need_hess = sink.wants_hessians();
-
-    Real Mvv_stack[kMaxStackYZ];
-    Real Mdv_stack[kMaxStackYZ];
-    Real Mvd_stack[kMaxStackYZ];
-    Real Md2v_stack[kMaxStackYZ];
-    Real Mvd2_stack[kMaxStackYZ];
-    Real Mdd_stack[kMaxStackYZ];
-
-    Real* Mvv;
-    Real* Mdv;
-    Real* Mvd;
-    Real* Md2v;
-    Real* Mvd2;
-    Real* Mdd;
-    if (nyz <= kMaxStackYZ) {
-        Mvv = Mvv_stack;
-        Mdv = Mdv_stack;
-        Mvd = Mvd_stack;
-        Md2v = Md2v_stack;
-        Mvd2 = Mvd2_stack;
-        Mdd = Mdd_stack;
-    } else {
-        auto& tables = evaluate_scratch().tensor_tables;
-        tables.resizeFor(nyz);
-        Mvv = tables.vv.data();
-        Mdv = tables.dv.data();
-        Mvd = tables.vd.data();
-        Md2v = tables.d2v.data();
-        Mvd2 = tables.vd2.data();
-        Mdd = tables.dd.data();
-    }
-
-    // M_vv is required by every output (values, ∂ξ, ∂ξ²).
-    for (std::size_t j = 0; j < ny; ++j) {
-        const Real yv = y_axis.values[j];
-        for (std::size_t k = 0; k < nz; ++k) {
-            Mvv[j * nz + k] = yv * z_axis.values[k];
-        }
-    }
-
-    if (need_grad || need_hess) {
-        for (std::size_t j = 0; j < ny; ++j) {
-            const Real yv = y_axis.values[j];
-            const Real yd = y_axis.first[j];
-            for (std::size_t k = 0; k < nz; ++k) {
-                Mdv[j * nz + k] = yd * z_axis.values[k];
-                Mvd[j * nz + k] = yv * z_axis.first[k];
-            }
-        }
-    }
-
-    if (need_hess) {
-        for (std::size_t j = 0; j < ny; ++j) {
-            const Real yv = y_axis.values[j];
-            const Real yd = y_axis.first[j];
-            const Real yd2 = y_axis.second[j];
-            for (std::size_t k = 0; k < nz; ++k) {
-                Md2v[j * nz + k] = yd2 * z_axis.values[k];
-                Mvd2[j * nz + k] = yv  * z_axis.second[k];
-                Mdd[j * nz + k]  = yd  * z_axis.first[k];
-            }
-        }
-    }
-
-    const std::size_t n_nodes = tensor_indices.size();
-    sink.prepare(n_nodes);
-
-    for (std::size_t n = 0; n < n_nodes; ++n) {
-        const auto& idx = tensor_indices[n];
-        const std::size_t i = idx[0];
-        const std::size_t jk = idx[1] * nz + idx[2];
-
-        const Real Lx = x_axis.values[i];
-
-        if (need_values) {
-            sink.write_value(n, Lx * Mvv[jk]);
-        }
-
-        if (need_grad) {
-            const Real dLx = x_axis.first[i];
-            sink.write_gradient(n,
-                                dLx * Mvv[jk],
-                                Lx  * Mdv[jk],
-                                Lx  * Mvd[jk]);
-        }
-
-        if (need_hess) {
-            const Real dLx  = x_axis.first[i];
-            const Real d2Lx = x_axis.second[i];
-            sink.write_hessian(n,
-                               d2Lx * Mvv[jk],
-                               Lx   * Md2v[jk],
-                               Lx   * Mvd2[jk],
-                               dLx  * Mdv[jk],
-                               dLx  * Mvd[jk],
-                               Lx   * Mdd[jk]);
-        }
-    }
-}
-
-void evaluate_tensor_product_factorized(
-    const std::vector<std::array<std::size_t, 3>>& tensor_indices,
-    const AxisBasisEvaluations& x_axis,
-    const AxisBasisEvaluations& y_axis,
-    const AxisBasisEvaluations& z_axis,
-    std::vector<Real>* values_out,
-    std::vector<Gradient>* gradients_out,
-    std::vector<Hessian>* hessians_out) {
-    const TensorProductVectorSink sink{values_out, gradients_out, hessians_out};
-    evaluate_tensor_product_factorized_impl(tensor_indices, x_axis, y_axis, z_axis, sink);
-}
-
-void evaluate_tensor_product_factorized_to(
-    const std::vector<std::array<std::size_t, 3>>& tensor_indices,
-    const AxisBasisEvaluations& x_axis,
-    const AxisBasisEvaluations& y_axis,
-    const AxisBasisEvaluations& z_axis,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    const TensorProductRawSink sink{values_out, gradients_out, hessians_out};
-    evaluate_tensor_product_factorized_impl(tensor_indices, x_axis, y_axis, z_axis, sink);
-}
-
-template <std::size_t Q>
-inline void write_tensor_product_value_strided_q(
-    std::size_t axis_stride,
-    std::size_t nyz,
-    std::size_t i,
-    std::size_t jk,
-    const AxisBatchScratch& x_batch,
-    const Real* SVMP_RESTRICT Mvv,
-    Real* SVMP_RESTRICT value_row) {
-    const std::size_t q_axis = Q * axis_stride;
-    const std::size_t slot = Q * nyz + jk;
-    value_row[Q] = x_batch.values[q_axis + i] * Mvv[slot];
-}
-
-template <std::size_t Q>
-inline void write_tensor_product_hessian_strided_q(
-    std::size_t axis_stride,
-    std::size_t nyz,
-    std::size_t i,
-    std::size_t jk,
-    std::size_t output_stride,
-    const AxisBatchScratch& x_batch,
-    const Real* SVMP_RESTRICT Mvv,
-    const Real* SVMP_RESTRICT Mdv,
-    const Real* SVMP_RESTRICT Mvd,
-    const Real* SVMP_RESTRICT Md2v,
-    const Real* SVMP_RESTRICT Mvd2,
-    const Real* SVMP_RESTRICT Mdd,
-    Real* SVMP_RESTRICT hess_row) {
-    const std::size_t q_axis = Q * axis_stride;
-    const std::size_t slot = Q * nyz + jk;
-    const Real xv = x_batch.values[q_axis + i];
-    const Real xd = x_batch.first[q_axis + i];
-    const Real x2 = x_batch.second[q_axis + i];
-    const Real hxy = xd * Mdv[slot];
-    const Real hxz = xd * Mvd[slot];
-    const Real hyz = xv * Mdd[slot];
-    hess_row[0u * output_stride + Q] = x2 * Mvv[slot];
-    hess_row[4u * output_stride + Q] = xv * Md2v[slot];
-    hess_row[8u * output_stride + Q] = xv * Mvd2[slot];
-    hess_row[1u * output_stride + Q] = hxy;
-    hess_row[3u * output_stride + Q] = hxy;
-    hess_row[2u * output_stride + Q] = hxz;
-    hess_row[6u * output_stride + Q] = hxz;
-    hess_row[5u * output_stride + Q] = hyz;
-    hess_row[7u * output_stride + Q] = hyz;
-}
-
-template <std::size_t Q>
-inline void write_tensor_product_hessian_stride4_q(
-    std::size_t axis_stride,
-    std::size_t nyz,
-    std::size_t i,
-    std::size_t jk,
-    const AxisBatchScratch& x_batch,
-    const Real* SVMP_RESTRICT Mvv,
-    const Real* SVMP_RESTRICT Mdv,
-    const Real* SVMP_RESTRICT Mvd,
-    const Real* SVMP_RESTRICT Md2v,
-    const Real* SVMP_RESTRICT Mvd2,
-    const Real* SVMP_RESTRICT Mdd,
-    Real* SVMP_RESTRICT hess_row) {
-    const std::size_t q_axis = Q * axis_stride;
-    const std::size_t slot = Q * nyz + jk;
-    const Real xv = x_batch.values[q_axis + i];
-    const Real xd = x_batch.first[q_axis + i];
-    const Real x2 = x_batch.second[q_axis + i];
-    const Real hxy = xd * Mdv[slot];
-    const Real hxz = xd * Mvd[slot];
-    const Real hyz = xv * Mdd[slot];
-    hess_row[Q] = x2 * Mvv[slot];
-    hess_row[16u + Q] = xv * Md2v[slot];
-    hess_row[32u + Q] = xv * Mvd2[slot];
-    hess_row[4u + Q] = hxy;
-    hess_row[12u + Q] = hxy;
-    hess_row[8u + Q] = hxz;
-    hess_row[24u + Q] = hxz;
-    hess_row[20u + Q] = hyz;
-    hess_row[28u + Q] = hyz;
-}
-
-template <std::size_t Q>
-inline void write_tensor_product_gradient_strided_q(
-    std::size_t axis_stride,
-    std::size_t nyz,
-    std::size_t i,
-    std::size_t jk,
-    std::size_t output_stride,
-    const AxisBatchScratch& x_batch,
-    const Real* SVMP_RESTRICT Mvv,
-    const Real* SVMP_RESTRICT Mdv,
-    const Real* SVMP_RESTRICT Mvd,
-    Real* SVMP_RESTRICT grad_row) {
-    const std::size_t q_axis = Q * axis_stride;
-    const std::size_t slot = Q * nyz + jk;
-    const Real xv = x_batch.values[q_axis + i];
-    const Real xd = x_batch.first[q_axis + i];
-    grad_row[0u * output_stride + Q] = xd * Mvv[slot];
-    grad_row[1u * output_stride + Q] = xv * Mdv[slot];
-    grad_row[2u * output_stride + Q] = xv * Mvd[slot];
-}
-
-template <std::size_t Q>
-inline void write_tensor_product_gradient_stride4_q(
-    std::size_t axis_stride,
-    std::size_t nyz,
-    std::size_t i,
-    std::size_t jk,
-    const AxisBatchScratch& x_batch,
-    const Real* SVMP_RESTRICT Mvv,
-    const Real* SVMP_RESTRICT Mdv,
-    const Real* SVMP_RESTRICT Mvd,
-    Real* SVMP_RESTRICT grad_row) {
-    const std::size_t q_axis = Q * axis_stride;
-    const std::size_t slot = Q * nyz + jk;
-    const Real xv = x_batch.values[q_axis + i];
-    const Real xd = x_batch.first[q_axis + i];
-    grad_row[Q] = xd * Mvv[slot];
-    grad_row[4u + Q] = xv * Mdv[slot];
-    grad_row[8u + Q] = xv * Mvd[slot];
-}
-
-template <std::size_t Q>
-inline void write_tensor_product_all_strided_q(
-    std::size_t axis_stride,
-    std::size_t nyz,
-    std::size_t i,
-    std::size_t jk,
-    std::size_t output_stride,
-    const AxisBatchScratch& x_batch,
-    const Real* SVMP_RESTRICT Mvv,
-    const Real* SVMP_RESTRICT Mdv,
-    const Real* SVMP_RESTRICT Mvd,
-    const Real* SVMP_RESTRICT Md2v,
-    const Real* SVMP_RESTRICT Mvd2,
-    const Real* SVMP_RESTRICT Mdd,
-    Real* SVMP_RESTRICT value_row,
-    Real* SVMP_RESTRICT grad_row,
-    Real* SVMP_RESTRICT hess_row) {
-    const std::size_t q_axis = Q * axis_stride;
-    const std::size_t slot = Q * nyz + jk;
-    const Real xv = x_batch.values[q_axis + i];
-    const Real xd = x_batch.first[q_axis + i];
-    value_row[Q] = xv * Mvv[slot];
-    grad_row[0u * output_stride + Q] = xd * Mvv[slot];
-    grad_row[1u * output_stride + Q] = xv * Mdv[slot];
-    grad_row[2u * output_stride + Q] = xv * Mvd[slot];
-
-    const Real x2 = x_batch.second[q_axis + i];
-    const Real hxy = xd * Mdv[slot];
-    const Real hxz = xd * Mvd[slot];
-    const Real hyz = xv * Mdd[slot];
-    hess_row[0u * output_stride + Q] = x2 * Mvv[slot];
-    hess_row[4u * output_stride + Q] = xv * Md2v[slot];
-    hess_row[8u * output_stride + Q] = xv * Mvd2[slot];
-    hess_row[1u * output_stride + Q] = hxy;
-    hess_row[3u * output_stride + Q] = hxy;
-    hess_row[2u * output_stride + Q] = hxz;
-    hess_row[6u * output_stride + Q] = hxz;
-    hess_row[5u * output_stride + Q] = hyz;
-    hess_row[7u * output_stride + Q] = hyz;
-}
-
-template <std::size_t Q>
-inline void write_tensor_product_all_stride4_q(
-    std::size_t axis_stride,
-    std::size_t nyz,
-    std::size_t i,
-    std::size_t jk,
-    const AxisBatchScratch& x_batch,
-    const Real* SVMP_RESTRICT Mvv,
-    const Real* SVMP_RESTRICT Mdv,
-    const Real* SVMP_RESTRICT Mvd,
-    const Real* SVMP_RESTRICT Md2v,
-    const Real* SVMP_RESTRICT Mvd2,
-    const Real* SVMP_RESTRICT Mdd,
-    Real* SVMP_RESTRICT value_row,
-    Real* SVMP_RESTRICT grad_row,
-    Real* SVMP_RESTRICT hess_row) {
-    const std::size_t q_axis = Q * axis_stride;
-    const std::size_t slot = Q * nyz + jk;
-    const Real xv = x_batch.values[q_axis + i];
-    const Real xd = x_batch.first[q_axis + i];
-    value_row[Q] = xv * Mvv[slot];
-    grad_row[Q] = xd * Mvv[slot];
-    grad_row[4u + Q] = xv * Mdv[slot];
-    grad_row[8u + Q] = xv * Mvd[slot];
-
-    const Real x2 = x_batch.second[q_axis + i];
-    const Real hxy = xd * Mdv[slot];
-    const Real hxz = xd * Mvd[slot];
-    const Real hyz = xv * Mdd[slot];
-    hess_row[Q] = x2 * Mvv[slot];
-    hess_row[16u + Q] = xv * Md2v[slot];
-    hess_row[32u + Q] = xv * Mvd2[slot];
-    hess_row[4u + Q] = hxy;
-    hess_row[12u + Q] = hxy;
-    hess_row[8u + Q] = hxz;
-    hess_row[24u + Q] = hxz;
-    hess_row[20u + Q] = hyz;
-    hess_row[28u + Q] = hyz;
-}
-
-SVMP_LAGRANGE_NOINLINE SVMP_LAGRANGE_ALIGN64 bool
-evaluate_tensor_product_values_stride4_q4_transposed(
-    const std::vector<std::array<std::size_t, 3>>& tensor_indices,
-    std::size_t axis_stride,
-    const AxisBatchScratch& x_batch,
-    const AxisBatchScratch& y_batch,
-    const AxisBatchScratch& z_batch,
-    Real* SVMP_RESTRICT values_out) {
-    const std::size_t nyz = axis_stride * axis_stride;
-    const std::size_t table_count = 4u * nyz;
-    if (table_count > kMaxStackYZ || values_out == nullptr) {
-        return false;
-    }
-
-    Real Mvv_stack[kMaxStackYZ];
-    for (std::size_t j = 0; j < axis_stride; ++j) {
-        const Real yv0 = y_batch.values[j];
-        const Real yv1 = y_batch.values[axis_stride + j];
-        const Real yv2 = y_batch.values[2u * axis_stride + j];
-        const Real yv3 = y_batch.values[3u * axis_stride + j];
-        for (std::size_t k = 0; k < axis_stride; ++k) {
-            const std::size_t base = (j * axis_stride + k) * 4u;
-            Mvv_stack[base + 0u] = yv0 * z_batch.values[k];
-            Mvv_stack[base + 1u] = yv1 * z_batch.values[axis_stride + k];
-            Mvv_stack[base + 2u] = yv2 * z_batch.values[2u * axis_stride + k];
-            Mvv_stack[base + 3u] = yv3 * z_batch.values[3u * axis_stride + k];
-        }
-    }
-
-    for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-        const auto& idx = tensor_indices[node];
-        const std::size_t i = idx[0];
-        const std::size_t jk = (idx[1] * axis_stride + idx[2]) * 4u;
-        Real* SVMP_RESTRICT value_row = values_out + node * 4u;
-        value_row[0u] = x_batch.values[i] * Mvv_stack[jk + 0u];
-        value_row[1u] = x_batch.values[axis_stride + i] * Mvv_stack[jk + 1u];
-        value_row[2u] = x_batch.values[2u * axis_stride + i] * Mvv_stack[jk + 2u];
-        value_row[3u] = x_batch.values[3u * axis_stride + i] * Mvv_stack[jk + 3u];
-    }
-
-    return true;
-}
-
-SVMP_LAGRANGE_NOINLINE SVMP_LAGRANGE_ALIGN64 bool
-evaluate_tensor_product_gradients_stride4_q4_transposed(
-    const std::vector<std::array<std::size_t, 3>>& tensor_indices,
-    std::size_t axis_stride,
-    const AxisBatchScratch& x_batch,
-    const AxisBatchScratch& y_batch,
-    const AxisBatchScratch& z_batch,
-    Real* SVMP_RESTRICT gradients_out) {
-    const std::size_t nyz = axis_stride * axis_stride;
-    const std::size_t table_count = 4u * nyz;
-    if (table_count > kMaxStackYZ || gradients_out == nullptr) {
-        return false;
-    }
-
-    Real Mvv_stack[kMaxStackYZ];
-    Real Mdv_stack[kMaxStackYZ];
-    Real Mvd_stack[kMaxStackYZ];
-    for (std::size_t j = 0; j < axis_stride; ++j) {
-        const Real yv0 = y_batch.values[j];
-        const Real yv1 = y_batch.values[axis_stride + j];
-        const Real yv2 = y_batch.values[2u * axis_stride + j];
-        const Real yv3 = y_batch.values[3u * axis_stride + j];
-        const Real yd0 = y_batch.first[j];
-        const Real yd1 = y_batch.first[axis_stride + j];
-        const Real yd2 = y_batch.first[2u * axis_stride + j];
-        const Real yd3 = y_batch.first[3u * axis_stride + j];
-        for (std::size_t k = 0; k < axis_stride; ++k) {
-            const std::size_t base = (j * axis_stride + k) * 4u;
-            const Real zv0 = z_batch.values[k];
-            const Real zv1 = z_batch.values[axis_stride + k];
-            const Real zv2 = z_batch.values[2u * axis_stride + k];
-            const Real zv3 = z_batch.values[3u * axis_stride + k];
-            const Real zd0 = z_batch.first[k];
-            const Real zd1 = z_batch.first[axis_stride + k];
-            const Real zd2 = z_batch.first[2u * axis_stride + k];
-            const Real zd3 = z_batch.first[3u * axis_stride + k];
-
-            Mvv_stack[base + 0u] = yv0 * zv0;
-            Mvv_stack[base + 1u] = yv1 * zv1;
-            Mvv_stack[base + 2u] = yv2 * zv2;
-            Mvv_stack[base + 3u] = yv3 * zv3;
-            Mdv_stack[base + 0u] = yd0 * zv0;
-            Mdv_stack[base + 1u] = yd1 * zv1;
-            Mdv_stack[base + 2u] = yd2 * zv2;
-            Mdv_stack[base + 3u] = yd3 * zv3;
-            Mvd_stack[base + 0u] = yv0 * zd0;
-            Mvd_stack[base + 1u] = yv1 * zd1;
-            Mvd_stack[base + 2u] = yv2 * zd2;
-            Mvd_stack[base + 3u] = yv3 * zd3;
-        }
-    }
-
-    for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-        const auto& idx = tensor_indices[node];
-        const std::size_t i = idx[0];
-        const std::size_t jk = (idx[1] * axis_stride + idx[2]) * 4u;
-
-        const Real xv0 = x_batch.values[i];
-        const Real xv1 = x_batch.values[axis_stride + i];
-        const Real xv2 = x_batch.values[2u * axis_stride + i];
-        const Real xv3 = x_batch.values[3u * axis_stride + i];
-        const Real xd0 = x_batch.first[i];
-        const Real xd1 = x_batch.first[axis_stride + i];
-        const Real xd2 = x_batch.first[2u * axis_stride + i];
-        const Real xd3 = x_batch.first[3u * axis_stride + i];
-
-        Real* SVMP_RESTRICT grad_row = gradients_out + node * 12u;
-        grad_row[0u] = xd0 * Mvv_stack[jk + 0u];
-        grad_row[1u] = xd1 * Mvv_stack[jk + 1u];
-        grad_row[2u] = xd2 * Mvv_stack[jk + 2u];
-        grad_row[3u] = xd3 * Mvv_stack[jk + 3u];
-        grad_row[4u] = xv0 * Mdv_stack[jk + 0u];
-        grad_row[5u] = xv1 * Mdv_stack[jk + 1u];
-        grad_row[6u] = xv2 * Mdv_stack[jk + 2u];
-        grad_row[7u] = xv3 * Mdv_stack[jk + 3u];
-        grad_row[8u] = xv0 * Mvd_stack[jk + 0u];
-        grad_row[9u] = xv1 * Mvd_stack[jk + 1u];
-        grad_row[10u] = xv2 * Mvd_stack[jk + 2u];
-        grad_row[11u] = xv3 * Mvd_stack[jk + 3u];
-    }
-
-    return true;
-}
-
-template<bool NeedAllOutputs>
-SVMP_LAGRANGE_NOINLINE SVMP_LAGRANGE_ALIGN64 bool
-evaluate_tensor_product_second_stride4_q4_transposed(
-    const std::vector<std::array<std::size_t, 3>>& tensor_indices,
-    std::size_t axis_stride,
-    const AxisBatchScratch& x_batch,
-    const AxisBatchScratch& y_batch,
-    const AxisBatchScratch& z_batch,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    const std::size_t nyz = axis_stride * axis_stride;
-    const std::size_t table_count = 4u * nyz;
-    if (table_count > kMaxStackYZ || hessians_out == nullptr) {
-        return false;
-    }
-    if constexpr (NeedAllOutputs) {
-        if (values_out == nullptr || gradients_out == nullptr) {
-            return false;
-        }
-    }
-
-    Real Mvv_stack[kMaxStackYZ];
-    Real Mdv_stack[kMaxStackYZ];
-    Real Mvd_stack[kMaxStackYZ];
-    Real Md2v_stack[kMaxStackYZ];
-    Real Mvd2_stack[kMaxStackYZ];
-    Real Mdd_stack[kMaxStackYZ];
-
-    for (std::size_t j = 0; j < axis_stride; ++j) {
-        const Real yv0 = y_batch.values[j];
-        const Real yv1 = y_batch.values[axis_stride + j];
-        const Real yv2 = y_batch.values[2u * axis_stride + j];
-        const Real yv3 = y_batch.values[3u * axis_stride + j];
-        const Real yd0 = y_batch.first[j];
-        const Real yd1 = y_batch.first[axis_stride + j];
-        const Real yd2 = y_batch.first[2u * axis_stride + j];
-        const Real yd3 = y_batch.first[3u * axis_stride + j];
-        const Real y20 = y_batch.second[j];
-        const Real y21 = y_batch.second[axis_stride + j];
-        const Real y22 = y_batch.second[2u * axis_stride + j];
-        const Real y23 = y_batch.second[3u * axis_stride + j];
-
-        for (std::size_t k = 0; k < axis_stride; ++k) {
-            const std::size_t base = (j * axis_stride + k) * 4u;
-            const Real zv0 = z_batch.values[k];
-            const Real zv1 = z_batch.values[axis_stride + k];
-            const Real zv2 = z_batch.values[2u * axis_stride + k];
-            const Real zv3 = z_batch.values[3u * axis_stride + k];
-            const Real zd0 = z_batch.first[k];
-            const Real zd1 = z_batch.first[axis_stride + k];
-            const Real zd2 = z_batch.first[2u * axis_stride + k];
-            const Real zd3 = z_batch.first[3u * axis_stride + k];
-            const Real z20 = z_batch.second[k];
-            const Real z21 = z_batch.second[axis_stride + k];
-            const Real z22 = z_batch.second[2u * axis_stride + k];
-            const Real z23 = z_batch.second[3u * axis_stride + k];
-
-            Mvv_stack[base + 0u] = yv0 * zv0;
-            Mvv_stack[base + 1u] = yv1 * zv1;
-            Mvv_stack[base + 2u] = yv2 * zv2;
-            Mvv_stack[base + 3u] = yv3 * zv3;
-            Mdv_stack[base + 0u] = yd0 * zv0;
-            Mdv_stack[base + 1u] = yd1 * zv1;
-            Mdv_stack[base + 2u] = yd2 * zv2;
-            Mdv_stack[base + 3u] = yd3 * zv3;
-            Mvd_stack[base + 0u] = yv0 * zd0;
-            Mvd_stack[base + 1u] = yv1 * zd1;
-            Mvd_stack[base + 2u] = yv2 * zd2;
-            Mvd_stack[base + 3u] = yv3 * zd3;
-            Md2v_stack[base + 0u] = y20 * zv0;
-            Md2v_stack[base + 1u] = y21 * zv1;
-            Md2v_stack[base + 2u] = y22 * zv2;
-            Md2v_stack[base + 3u] = y23 * zv3;
-            Mvd2_stack[base + 0u] = yv0 * z20;
-            Mvd2_stack[base + 1u] = yv1 * z21;
-            Mvd2_stack[base + 2u] = yv2 * z22;
-            Mvd2_stack[base + 3u] = yv3 * z23;
-            Mdd_stack[base + 0u] = yd0 * zd0;
-            Mdd_stack[base + 1u] = yd1 * zd1;
-            Mdd_stack[base + 2u] = yd2 * zd2;
-            Mdd_stack[base + 3u] = yd3 * zd3;
-        }
-    }
-
-    for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-        const auto& idx = tensor_indices[node];
-        const std::size_t i = idx[0];
-        const std::size_t jk = (idx[1] * axis_stride + idx[2]) * 4u;
-
-        const Real xv0 = x_batch.values[i];
-        const Real xv1 = x_batch.values[axis_stride + i];
-        const Real xv2 = x_batch.values[2u * axis_stride + i];
-        const Real xv3 = x_batch.values[3u * axis_stride + i];
-        const Real xd0 = x_batch.first[i];
-        const Real xd1 = x_batch.first[axis_stride + i];
-        const Real xd2 = x_batch.first[2u * axis_stride + i];
-        const Real xd3 = x_batch.first[3u * axis_stride + i];
-        const Real x20 = x_batch.second[i];
-        const Real x21 = x_batch.second[axis_stride + i];
-        const Real x22 = x_batch.second[2u * axis_stride + i];
-        const Real x23 = x_batch.second[3u * axis_stride + i];
-
-        const Real mvv0 = Mvv_stack[jk + 0u];
-        const Real mvv1 = Mvv_stack[jk + 1u];
-        const Real mvv2 = Mvv_stack[jk + 2u];
-        const Real mvv3 = Mvv_stack[jk + 3u];
-        const Real mdv0 = Mdv_stack[jk + 0u];
-        const Real mdv1 = Mdv_stack[jk + 1u];
-        const Real mdv2 = Mdv_stack[jk + 2u];
-        const Real mdv3 = Mdv_stack[jk + 3u];
-        const Real mvd0 = Mvd_stack[jk + 0u];
-        const Real mvd1 = Mvd_stack[jk + 1u];
-        const Real mvd2 = Mvd_stack[jk + 2u];
-        const Real mvd3 = Mvd_stack[jk + 3u];
-        const Real md2v0 = Md2v_stack[jk + 0u];
-        const Real md2v1 = Md2v_stack[jk + 1u];
-        const Real md2v2 = Md2v_stack[jk + 2u];
-        const Real md2v3 = Md2v_stack[jk + 3u];
-        const Real mvd20 = Mvd2_stack[jk + 0u];
-        const Real mvd21 = Mvd2_stack[jk + 1u];
-        const Real mvd22 = Mvd2_stack[jk + 2u];
-        const Real mvd23 = Mvd2_stack[jk + 3u];
-        const Real mdd0 = Mdd_stack[jk + 0u];
-        const Real mdd1 = Mdd_stack[jk + 1u];
-        const Real mdd2 = Mdd_stack[jk + 2u];
-        const Real mdd3 = Mdd_stack[jk + 3u];
-
-        if constexpr (NeedAllOutputs) {
-            Real* SVMP_RESTRICT value_row = values_out + node * 4u;
-            value_row[0u] = xv0 * mvv0;
-            value_row[1u] = xv1 * mvv1;
-            value_row[2u] = xv2 * mvv2;
-            value_row[3u] = xv3 * mvv3;
-
-            Real* SVMP_RESTRICT grad_row = gradients_out + node * 12u;
-            grad_row[0u] = xd0 * mvv0;
-            grad_row[1u] = xd1 * mvv1;
-            grad_row[2u] = xd2 * mvv2;
-            grad_row[3u] = xd3 * mvv3;
-            grad_row[4u] = xv0 * mdv0;
-            grad_row[5u] = xv1 * mdv1;
-            grad_row[6u] = xv2 * mdv2;
-            grad_row[7u] = xv3 * mdv3;
-            grad_row[8u] = xv0 * mvd0;
-            grad_row[9u] = xv1 * mvd1;
-            grad_row[10u] = xv2 * mvd2;
-            grad_row[11u] = xv3 * mvd3;
-        }
-
-        const Real hxy0 = xd0 * mdv0;
-        const Real hxy1 = xd1 * mdv1;
-        const Real hxy2 = xd2 * mdv2;
-        const Real hxy3 = xd3 * mdv3;
-        const Real hxz0 = xd0 * mvd0;
-        const Real hxz1 = xd1 * mvd1;
-        const Real hxz2 = xd2 * mvd2;
-        const Real hxz3 = xd3 * mvd3;
-        const Real hyz0 = xv0 * mdd0;
-        const Real hyz1 = xv1 * mdd1;
-        const Real hyz2 = xv2 * mdd2;
-        const Real hyz3 = xv3 * mdd3;
-
-        Real* SVMP_RESTRICT hess_row = hessians_out + node * 36u;
-        hess_row[0u] = x20 * mvv0;
-        hess_row[1u] = x21 * mvv1;
-        hess_row[2u] = x22 * mvv2;
-        hess_row[3u] = x23 * mvv3;
-        hess_row[4u] = hxy0;
-        hess_row[5u] = hxy1;
-        hess_row[6u] = hxy2;
-        hess_row[7u] = hxy3;
-        hess_row[8u] = hxz0;
-        hess_row[9u] = hxz1;
-        hess_row[10u] = hxz2;
-        hess_row[11u] = hxz3;
-        hess_row[12u] = hxy0;
-        hess_row[13u] = hxy1;
-        hess_row[14u] = hxy2;
-        hess_row[15u] = hxy3;
-        hess_row[16u] = xv0 * md2v0;
-        hess_row[17u] = xv1 * md2v1;
-        hess_row[18u] = xv2 * md2v2;
-        hess_row[19u] = xv3 * md2v3;
-        hess_row[20u] = hyz0;
-        hess_row[21u] = hyz1;
-        hess_row[22u] = hyz2;
-        hess_row[23u] = hyz3;
-        hess_row[24u] = hxz0;
-        hess_row[25u] = hxz1;
-        hess_row[26u] = hxz2;
-        hess_row[27u] = hxz3;
-        hess_row[28u] = hyz0;
-        hess_row[29u] = hyz1;
-        hess_row[30u] = hyz2;
-        hess_row[31u] = hyz3;
-        hess_row[32u] = xv0 * mvd20;
-        hess_row[33u] = xv1 * mvd21;
-        hess_row[34u] = xv2 * mvd22;
-        hess_row[35u] = xv3 * mvd23;
-    }
-
-    return true;
-}
-
-template<int N>
-constexpr std::size_t line_public_axis_index(std::size_t node) noexcept {
-    return node == 0u ? 0u : (node == 1u ? static_cast<std::size_t>(N - 1) : node - 1u);
-}
-
-template<int N>
-constexpr std::array<Real, N> make_line_axis_inv_denoms() noexcept {
-    std::array<Real, N> inv_denoms{};
-    for (int i = 0; i < N; ++i) {
-        Real denom = Real(1);
-        for (int j = 0; j < N; ++j) {
-            if (j != i) {
-                denom *= static_cast<Real>(i - j);
-            }
-        }
-        inv_denoms[static_cast<std::size_t>(i)] = Real(1) / denom;
-    }
-    return inv_denoms;
-}
-
-template<int N>
-void fill_line_values_product(Real x, Real* SVMP_RESTRICT values) {
-    static constexpr auto inv_denoms = make_line_axis_inv_denoms<N>();
-    const Real p = static_cast<Real>(N - 1);
-    const Real r = (x + Real(1)) * p * Real(0.5);
-    Real prefix[N];
-    Real suffix[N];
-    prefix[0] = Real(1);
-    for (int i = 1; i < N; ++i) {
-        prefix[i] = prefix[i - 1] * (r - static_cast<Real>(i - 1));
-    }
-    suffix[N - 1] = Real(1);
-    for (int i = N - 2; i >= 0; --i) {
-        suffix[i] = suffix[i + 1] * (r - static_cast<Real>(i + 1));
-    }
-    for (int i = 0; i < N; ++i) {
-        const std::size_t slot = static_cast<std::size_t>(i);
-        values[slot] = prefix[i] * suffix[i] * inv_denoms[slot];
-    }
-}
-
-template<int N>
-void fill_line_values_product_derivatives(Real x,
-                                          Real* SVMP_RESTRICT values,
-                                          Real* SVMP_RESTRICT first,
-                                          Real* SVMP_RESTRICT second) {
-    static constexpr auto inv_denoms = make_line_axis_inv_denoms<N>();
-    const Real p = static_cast<Real>(N - 1);
-    const Real drdx = p * Real(0.5);
-    const Real d2rdx2 = drdx * drdx;
-    const Real r = (x + Real(1)) * drdx;
-
-    Real prefix[N + 1];
-    Real prefix_d1[N + 1];
-    Real prefix_d2[N + 1];
-    Real suffix[N + 1];
-    Real suffix_d1[N + 1];
-    Real suffix_d2[N + 1];
-
-    const bool need_second = second != nullptr;
-
-    prefix[0] = Real(1);
-    prefix_d1[0] = Real(0);
-    if (need_second) {
-        prefix_d2[0] = Real(0);
-    }
-    for (int i = 0; i < N; ++i) {
-        const Real factor = r - static_cast<Real>(i);
-        prefix[i + 1] = prefix[i] * factor;
-        prefix_d1[i + 1] = prefix_d1[i] * factor + prefix[i];
-        if (need_second) {
-            prefix_d2[i + 1] = prefix_d2[i] * factor + Real(2) * prefix_d1[i];
-        }
-    }
-
-    suffix[N] = Real(1);
-    suffix_d1[N] = Real(0);
-    if (need_second) {
-        suffix_d2[N] = Real(0);
-    }
-    for (int i = N - 1; i >= 0; --i) {
-        const Real factor = r - static_cast<Real>(i);
-        suffix[i] = suffix[i + 1] * factor;
-        suffix_d1[i] = suffix_d1[i + 1] * factor + suffix[i + 1];
-        if (need_second) {
-            suffix_d2[i] = suffix_d2[i + 1] * factor + Real(2) * suffix_d1[i + 1];
-        }
-    }
-
-    for (int i = 0; i < N; ++i) {
-        const std::size_t slot = static_cast<std::size_t>(i);
-        const Real inv = inv_denoms[slot];
-        const Real pre = prefix[i];
-        const Real suf = suffix[i + 1];
-        const Real pre_d1 = prefix_d1[i];
-        const Real suf_d1 = suffix_d1[i + 1];
-        values[slot] = pre * suf * inv;
-        if (first != nullptr) {
-            first[slot] = (pre_d1 * suf + pre * suf_d1) * inv * drdx;
-        }
-        if (second != nullptr) {
-            const Real d2 =
-                prefix_d2[i] * suf +
-                Real(2) * pre_d1 * suf_d1 +
-                pre * suffix_d2[i + 1];
-            second[slot] = d2 * inv * d2rdx2;
-        }
-    }
-}
-
-template<int N>
-void fill_axis_batch_product_q4(
-    AxisBatchScratch& scratch,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t component,
-    AxisDeriv level) {
-    constexpr std::size_t axis_stride = static_cast<std::size_t>(N);
-    scratch.resizeFor(4u * axis_stride, level);
-    for (std::size_t q = 0; q < 4u; ++q) {
-        Real* values = scratch.values.data() + q * axis_stride;
-        if (level == AxisDeriv::ValuesOnly) {
-            fill_line_values_product<N>(points[q][component], values);
-        } else {
-            Real* first = scratch.first.data() + q * axis_stride;
-            Real* second = level == AxisDeriv::ValuesAndFirstAndSecond
-                ? scratch.second.data() + q * axis_stride
-                : nullptr;
-            fill_line_values_product_derivatives<N>(
-                points[q][component], values, first, second);
-        }
-    }
-}
-
-bool try_fill_axis_batch_product_q4(
-    AxisBatchScratch& scratch,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t component,
-    int n_axis,
-    AxisDeriv level) {
-    switch (n_axis) {
-        case 5:
-            fill_axis_batch_product_q4<5>(scratch, points, component, level);
-            return true;
-        case 6:
-            fill_axis_batch_product_q4<6>(scratch, points, component, level);
-            return true;
-        case 7:
-            fill_axis_batch_product_q4<7>(scratch, points, component, level);
-            return true;
-        case 8:
-            fill_axis_batch_product_q4<8>(scratch, points, component, level);
-            return true;
-        case 9:
-            fill_axis_batch_product_q4<9>(scratch, points, component, level);
-            return true;
-        default:
-            return false;
-    }
-}
-
-template<int N>
-SVMP_LAGRANGE_NOINLINE SVMP_LAGRANGE_ALIGN64 void evaluate_line_values_product_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real q0[N];
-    Real q1[N];
-    Real q2[N];
-    Real q3[N];
-    fill_line_values_product<N>(points[0][0], q0);
-    fill_line_values_product<N>(points[1][0], q1);
-    fill_line_values_product<N>(points[2][0], q2);
-    fill_line_values_product<N>(points[3][0], q3);
-
-    for (std::size_t node = 0; node < static_cast<std::size_t>(N); ++node) {
-        const std::size_t i = line_public_axis_index<N>(node);
-        Real* row = values_out + node * output_stride;
-        row[0] = q0[i];
-        row[1] = q1[i];
-        row[2] = q2[i];
-        row[3] = q3[i];
-    }
-}
-
-FE_ALWAYS_INLINE void write_line_order4_values_q(
-    Real x,
-    std::size_t q,
-    Real* SVMP_RESTRICT row0,
-    Real* SVMP_RESTRICT row1,
-    Real* SVMP_RESTRICT row2,
-    Real* SVMP_RESTRICT row3,
-    Real* SVMP_RESTRICT row4) {
-    const Real r = (x + Real(1)) * Real(2);
-    const Real f0 = r;
-    const Real f1 = r - Real(1);
-    const Real f2 = r - Real(2);
-    const Real f3 = r - Real(3);
-    const Real f4 = r - Real(4);
-    const Real f01 = f0 * f1;
-    const Real f12 = f1 * f2;
-    const Real f23 = f2 * f3;
-    const Real f34 = f3 * f4;
-    const Real v0 = (f12 * f34) / Real(24);
-    const Real v1 = -(f0 * f2 * f34) / Real(6);
-    const Real v2 = (f01 * f34) / Real(4);
-    const Real v3 = -(f01 * f2 * f4) / Real(6);
-    const Real v4 = (f01 * f23) / Real(24);
-    row0[q] = v0;
-    row1[q] = v4;
-    row2[q] = v1;
-    row3[q] = v2;
-    row4[q] = v3;
-}
-
-SVMP_LAGRANGE_NOINLINE SVMP_LAGRANGE_ALIGN64 void evaluate_line_order4_values_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    Real* row2 = values_out + 2u * output_stride;
-    Real* row3 = values_out + 3u * output_stride;
-    Real* row4 = values_out + 4u * output_stride;
-    write_line_order4_values_q(points[0][0], 0u, row0, row1, row2, row3, row4);
-    write_line_order4_values_q(points[1][0], 1u, row0, row1, row2, row3, row4);
-    write_line_order4_values_q(points[2][0], 2u, row0, row1, row2, row3, row4);
-    write_line_order4_values_q(points[3][0], 3u, row0, row1, row2, row3, row4);
-}
-
-SVMP_LAGRANGE_NOINLINE void evaluate_triangle_order1_gradients_strided(
-    std::size_t num_qpts,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    Real* SVMP_RESTRICT row0 = gradients_out + 0u * 3u * output_stride;
-    Real* SVMP_RESTRICT row1 = gradients_out + 1u * 3u * output_stride;
-    Real* SVMP_RESTRICT row2 = gradients_out + 2u * 3u * output_stride;
-
-    for (std::size_t q = 0; q < num_qpts; ++q) {
-        row0[0u * output_stride + q] = Real(-1);
-        row0[1u * output_stride + q] = Real(-1);
-        row0[2u * output_stride + q] = Real(0);
-        row1[0u * output_stride + q] = Real(1);
-        row1[1u * output_stride + q] = Real(0);
-        row1[2u * output_stride + q] = Real(0);
-        row2[0u * output_stride + q] = Real(0);
-        row2[1u * output_stride + q] = Real(1);
-        row2[2u * output_stride + q] = Real(0);
-    }
-}
-
-template<int N>
-SVMP_LAGRANGE_NOINLINE void evaluate_line_hessians_product_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT hessians_out) {
-    Real values[4][N];
-    Real second[4][N];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        fill_line_values_product_derivatives<N>(
-            points[q][0], values[q], nullptr, second[q]);
-    }
-    for (std::size_t node = 0; node < static_cast<std::size_t>(N); ++node) {
-        const std::size_t i = line_public_axis_index<N>(node);
-        write_line_hessian_q4_row(hessians_out + node * 9u * output_stride,
-                                  output_stride,
-                                  second[0][i], second[1][i],
-                                  second[2][i], second[3][i]);
-    }
-}
-
-template<int N>
-SVMP_LAGRANGE_NOINLINE void evaluate_line_all_product_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    Real values[4][N];
-    Real first[4][N];
-    Real second[4][N];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        fill_line_values_product_derivatives<N>(
-            points[q][0], values[q], first[q], second[q]);
-    }
-    for (std::size_t node = 0; node < static_cast<std::size_t>(N); ++node) {
-        const std::size_t i = line_public_axis_index<N>(node);
-        Real* value_row = values_out + node * output_stride;
-        value_row[0] = values[0][i];
-        value_row[1] = values[1][i];
-        value_row[2] = values[2][i];
-        value_row[3] = values[3][i];
-        write_line_gradient_q4_row(gradients_out + node * 3u * output_stride,
-                                   output_stride,
-                                   first[0][i], first[1][i],
-                                   first[2][i], first[3][i]);
-        write_line_hessian_q4_row(hessians_out + node * 9u * output_stride,
-                                  output_stride,
-                                  second[0][i], second[1][i],
-                                  second[2][i], second[3][i]);
-    }
-}
-
-inline void write_quad_product_value_row_q4(
-    Real* SVMP_RESTRICT row,
-    const Real* SVMP_RESTRICT x0,
-    const Real* SVMP_RESTRICT x1,
-    const Real* SVMP_RESTRICT x2,
-    const Real* SVMP_RESTRICT x3,
-    const Real* SVMP_RESTRICT y0,
-    const Real* SVMP_RESTRICT y1,
-    const Real* SVMP_RESTRICT y2,
-    const Real* SVMP_RESTRICT y3,
-    std::size_t i,
-    std::size_t j) {
-    row[0] = x0[i] * y0[j];
-    row[1] = x1[i] * y1[j];
-    row[2] = x2[i] * y2[j];
-    row[3] = x3[i] * y3[j];
-}
-
-template<int N>
-void evaluate_quad_values_product_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real x0[N];
-    Real x1[N];
-    Real x2[N];
-    Real x3[N];
-    Real y0[N];
-    Real y1[N];
-    Real y2[N];
-    Real y3[N];
-    fill_line_values_product<N>(points[0][0], x0);
-    fill_line_values_product<N>(points[1][0], x1);
-    fill_line_values_product<N>(points[2][0], x2);
-    fill_line_values_product<N>(points[3][0], x3);
-    fill_line_values_product<N>(points[0][1], y0);
-    fill_line_values_product<N>(points[1][1], y1);
-    fill_line_values_product<N>(points[2][1], y2);
-    fill_line_values_product<N>(points[3][1], y3);
-
-    constexpr std::size_t p = static_cast<std::size_t>(N - 1);
-    std::size_t node = 0u;
-    write_quad_product_value_row_q4(values_out + node++ * output_stride,
-                                    x0, x1, x2, x3, y0, y1, y2, y3, 0u, 0u);
-    write_quad_product_value_row_q4(values_out + node++ * output_stride,
-                                    x0, x1, x2, x3, y0, y1, y2, y3, p, 0u);
-    write_quad_product_value_row_q4(values_out + node++ * output_stride,
-                                    x0, x1, x2, x3, y0, y1, y2, y3, p, p);
-    write_quad_product_value_row_q4(values_out + node++ * output_stride,
-                                    x0, x1, x2, x3, y0, y1, y2, y3, 0u, p);
-
-    for (std::size_t i = 1u; i < p; ++i) {
-        write_quad_product_value_row_q4(values_out + node++ * output_stride,
-                                        x0, x1, x2, x3, y0, y1, y2, y3, i, 0u);
-    }
-    for (std::size_t j = 1u; j < p; ++j) {
-        write_quad_product_value_row_q4(values_out + node++ * output_stride,
-                                        x0, x1, x2, x3, y0, y1, y2, y3, p, j);
-    }
-    for (std::size_t i = p - 1u; i > 0u; --i) {
-        write_quad_product_value_row_q4(values_out + node++ * output_stride,
-                                        x0, x1, x2, x3, y0, y1, y2, y3, i, p);
-    }
-    for (std::size_t j = p - 1u; j > 0u; --j) {
-        write_quad_product_value_row_q4(values_out + node++ * output_stride,
-                                        x0, x1, x2, x3, y0, y1, y2, y3, 0u, j);
-    }
-    for (std::size_t j = 1u; j < p; ++j) {
-        for (std::size_t i = 1u; i < p; ++i) {
-            write_quad_product_value_row_q4(values_out + node++ * output_stride,
-                                            x0, x1, x2, x3, y0, y1, y2, y3, i, j);
-        }
-    }
-}
-
-template<int N>
-void evaluate_quad_derivatives_product_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    const bool need_grad = gradients_out != nullptr;
-    const bool need_hess = hessians_out != nullptr;
-    Real xv[4][N];
-    Real xd[4][N];
-    Real x2[4][N];
-    Real yv[4][N];
-    Real yd[4][N];
-    Real y2[4][N];
-
-    for (std::size_t q = 0; q < 4u; ++q) {
-        fill_line_values_product_derivatives<N>(
-            points[q][0], xv[q], (need_grad || need_hess) ? xd[q] : nullptr,
-            need_hess ? x2[q] : nullptr);
-        fill_line_values_product_derivatives<N>(
-            points[q][1], yv[q], (need_grad || need_hess) ? yd[q] : nullptr,
-            need_hess ? y2[q] : nullptr);
-    }
-
-    constexpr std::size_t p = static_cast<std::size_t>(N - 1);
-    std::size_t node = 0u;
-    auto write_node = [&](std::size_t i, std::size_t j) {
-        Real* value_row = values_out != nullptr ? values_out + node * output_stride : nullptr;
-        Real* grad_row = gradients_out != nullptr ? gradients_out + node * 3u * output_stride : nullptr;
-        Real* hess_row = hessians_out != nullptr ? hessians_out + node * 9u * output_stride : nullptr;
-        if (grad_row != nullptr) {
-            grad_row[2u * output_stride + 0u] = Real(0);
-            grad_row[2u * output_stride + 1u] = Real(0);
-            grad_row[2u * output_stride + 2u] = Real(0);
-            grad_row[2u * output_stride + 3u] = Real(0);
-        }
-        if (hess_row != nullptr) {
-            hess_row[2u * output_stride + 0u] = Real(0);
-            hess_row[2u * output_stride + 1u] = Real(0);
-            hess_row[2u * output_stride + 2u] = Real(0);
-            hess_row[2u * output_stride + 3u] = Real(0);
-            hess_row[5u * output_stride + 0u] = Real(0);
-            hess_row[5u * output_stride + 1u] = Real(0);
-            hess_row[5u * output_stride + 2u] = Real(0);
-            hess_row[5u * output_stride + 3u] = Real(0);
-            hess_row[6u * output_stride + 0u] = Real(0);
-            hess_row[6u * output_stride + 1u] = Real(0);
-            hess_row[6u * output_stride + 2u] = Real(0);
-            hess_row[6u * output_stride + 3u] = Real(0);
-            hess_row[7u * output_stride + 0u] = Real(0);
-            hess_row[7u * output_stride + 1u] = Real(0);
-            hess_row[7u * output_stride + 2u] = Real(0);
-            hess_row[7u * output_stride + 3u] = Real(0);
-            hess_row[8u * output_stride + 0u] = Real(0);
-            hess_row[8u * output_stride + 1u] = Real(0);
-            hess_row[8u * output_stride + 2u] = Real(0);
-            hess_row[8u * output_stride + 3u] = Real(0);
-        }
-        for (std::size_t q = 0; q < 4u; ++q) {
-            const Real x_value = xv[q][i];
-            const Real y_value = yv[q][j];
-            if (value_row != nullptr) {
-                value_row[q] = x_value * y_value;
-            }
-            if (grad_row != nullptr) {
-                grad_row[0u * output_stride + q] = xd[q][i] * y_value;
-                grad_row[1u * output_stride + q] = x_value * yd[q][j];
-            }
-            if (hess_row != nullptr) {
-                const Real hxy = xd[q][i] * yd[q][j];
-                hess_row[0u * output_stride + q] = x2[q][i] * y_value;
-                hess_row[1u * output_stride + q] = hxy;
-                hess_row[3u * output_stride + q] = hxy;
-                hess_row[4u * output_stride + q] = x_value * y2[q][j];
-            }
-        }
-        ++node;
-    };
-
-    write_node(0u, 0u);
-    write_node(p, 0u);
-    write_node(p, p);
-    write_node(0u, p);
-    for (std::size_t i = 1u; i < p; ++i) {
-        write_node(i, 0u);
-    }
-    for (std::size_t j = 1u; j < p; ++j) {
-        write_node(p, j);
-    }
-    for (std::size_t i = p - 1u; i > 0u; --i) {
-        write_node(i, p);
-    }
-    for (std::size_t j = p - 1u; j > 0u; --j) {
-        write_node(0u, j);
-    }
-    for (std::size_t j = 1u; j < p; ++j) {
-        for (std::size_t i = 1u; i < p; ++i) {
-            write_node(i, j);
-        }
-    }
-}
-
-SVMP_LAGRANGE_NOINLINE SVMP_LAGRANGE_ALIGN64 void evaluate_quad_order8_gradients_product_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    constexpr int N = 9;
-    constexpr std::size_t p = 8u;
-    Real xv[4][N];
-    Real xd[4][N];
-    Real yv[4][N];
-    Real yd[4][N];
-
-    for (std::size_t q = 0; q < 4u; ++q) {
-        fill_line_values_product_derivatives<N>(points[q][0], xv[q], xd[q], nullptr);
-        fill_line_values_product_derivatives<N>(points[q][1], yv[q], yd[q], nullptr);
-    }
-
-    std::size_t node = 0u;
-    auto write_node = [&](std::size_t i, std::size_t j) {
-        Real* SVMP_RESTRICT row = gradients_out + node * 3u * output_stride;
-        row[0u] = xd[0][i] * yv[0][j];
-        row[1u] = xd[1][i] * yv[1][j];
-        row[2u] = xd[2][i] * yv[2][j];
-        row[3u] = xd[3][i] * yv[3][j];
-        row[output_stride + 0u] = xv[0][i] * yd[0][j];
-        row[output_stride + 1u] = xv[1][i] * yd[1][j];
-        row[output_stride + 2u] = xv[2][i] * yd[2][j];
-        row[output_stride + 3u] = xv[3][i] * yd[3][j];
-        row[2u * output_stride + 0u] = Real(0);
-        row[2u * output_stride + 1u] = Real(0);
-        row[2u * output_stride + 2u] = Real(0);
-        row[2u * output_stride + 3u] = Real(0);
-        ++node;
-    };
-
-    write_node(0u, 0u);
-    write_node(p, 0u);
-    write_node(p, p);
-    write_node(0u, p);
-    for (std::size_t i = 1u; i < p; ++i) {
-        write_node(i, 0u);
-    }
-    for (std::size_t j = 1u; j < p; ++j) {
-        write_node(p, j);
-    }
-    for (std::size_t i = p - 1u; i > 0u; --i) {
-        write_node(i, p);
-    }
-    for (std::size_t j = p - 1u; j > 0u; --j) {
-        write_node(0u, j);
-    }
-    for (std::size_t j = 1u; j < p; ++j) {
-        for (std::size_t i = 1u; i < p; ++i) {
-            write_node(i, j);
-        }
-    }
-}
-
-template<int N>
-void evaluate_line_gradients_horner_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    const Real* SVMP_RESTRICT d_coeffs,
-    Real* SVMP_RESTRICT gradients_out) {
-    const Real x0 = points[0][0];
-    const Real x1 = points[1][0];
-    const Real x2 = points[2][0];
-    const Real x3 = points[3][0];
-
-    for (std::size_t node = 0; node < static_cast<std::size_t>(N); ++node) {
-        const std::size_t i = line_public_axis_index<N>(node);
-        const Real* c = d_coeffs + i * static_cast<std::size_t>(N - 1);
-        Real r0 = c[N - 2];
-        Real r1 = c[N - 2];
-        Real r2 = c[N - 2];
-        Real r3 = c[N - 2];
-        for (int k = N - 2; k > 0; --k) {
-            const Real ck = c[k - 1];
-            r0 = r0 * x0 + ck;
-            r1 = r1 * x1 + ck;
-            r2 = r2 * x2 + ck;
-            r3 = r3 * x3 + ck;
-        }
-        Real* row = gradients_out + node * 3u * output_stride;
-        row[0] = r0;
-        row[1] = r1;
-        row[2] = r2;
-        row[3] = r3;
-        row[output_stride + 0u] = Real(0);
-        row[output_stride + 1u] = Real(0);
-        row[output_stride + 2u] = Real(0);
-        row[output_stride + 3u] = Real(0);
-        row[2u * output_stride + 0u] = Real(0);
-        row[2u * output_stride + 1u] = Real(0);
-        row[2u * output_stride + 2u] = Real(0);
-        row[2u * output_stride + 3u] = Real(0);
-    }
-}
-
-bool try_evaluate_line_values_horner_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    const Real* SVMP_RESTRICT v_coeffs,
-    int n_axis,
-    Real* SVMP_RESTRICT values_out) {
-    (void)v_coeffs;
-    switch (n_axis) {
-        case 5:
-            evaluate_line_order4_values_q4(points, output_stride, values_out);
-            return true;
-        case 6:
-            evaluate_line_values_product_q4<6>(points, output_stride, values_out);
-            return true;
-        case 7:
-            evaluate_line_values_product_q4<7>(points, output_stride, values_out);
-            return true;
-        case 8:
-            evaluate_line_values_product_q4<8>(points, output_stride, values_out);
-            return true;
-        case 9:
-            evaluate_line_values_product_q4<9>(points, output_stride, values_out);
-            return true;
-        default:
-            return false;
-    }
-}
-
-bool try_evaluate_line_gradients_horner_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    const Real* SVMP_RESTRICT d_coeffs,
-    int n_axis,
-    Real* SVMP_RESTRICT gradients_out) {
-    switch (n_axis) {
-        case 5:
-            evaluate_line_gradients_horner_q4<5>(points, output_stride, d_coeffs, gradients_out);
-            return true;
-        case 6:
-            evaluate_line_gradients_horner_q4<6>(points, output_stride, d_coeffs, gradients_out);
-            return true;
-        case 7:
-            evaluate_line_gradients_horner_q4<7>(points, output_stride, d_coeffs, gradients_out);
-            return true;
-        case 8:
-            evaluate_line_gradients_horner_q4<8>(points, output_stride, d_coeffs, gradients_out);
-            return true;
-        case 9:
-            evaluate_line_gradients_horner_q4<9>(points, output_stride, d_coeffs, gradients_out);
-            return true;
-        default:
-            return false;
-    }
-}
-
-SVMP_LAGRANGE_NOINLINE bool try_evaluate_line_hessians_product_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    int n_axis,
-    Real* SVMP_RESTRICT hessians_out) {
-    switch (n_axis) {
-        case 5:
-            evaluate_line_hessians_product_q4<5>(points, output_stride, hessians_out);
-            return true;
-        case 6:
-            evaluate_line_hessians_product_q4<6>(points, output_stride, hessians_out);
-            return true;
-        case 7:
-            evaluate_line_hessians_product_q4<7>(points, output_stride, hessians_out);
-            return true;
-        case 8:
-            evaluate_line_hessians_product_q4<8>(points, output_stride, hessians_out);
-            return true;
-        case 9:
-            evaluate_line_hessians_product_q4<9>(points, output_stride, hessians_out);
-            return true;
-        default:
-            return false;
-    }
-}
-
-SVMP_LAGRANGE_NOINLINE bool try_evaluate_line_all_product_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    int n_axis,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    switch (n_axis) {
-        case 5:
-            evaluate_line_all_product_q4<5>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        case 6:
-            evaluate_line_all_product_q4<6>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        case 7:
-            evaluate_line_all_product_q4<7>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        case 8:
-            evaluate_line_all_product_q4<8>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        case 9:
-            evaluate_line_all_product_q4<9>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        default:
-            return false;
-    }
-}
-
-SVMP_LAGRANGE_NOINLINE bool try_evaluate_quad_values_product_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    int n_axis,
-    Real* SVMP_RESTRICT values_out) {
-    switch (n_axis) {
-        case 5:
-            evaluate_quad_values_product_q4<5>(points, output_stride, values_out);
-            return true;
-        case 6:
-            evaluate_quad_values_product_q4<6>(points, output_stride, values_out);
-            return true;
-        case 7:
-            evaluate_quad_values_product_q4<7>(points, output_stride, values_out);
-            return true;
-        case 8:
-            evaluate_quad_values_product_q4<8>(points, output_stride, values_out);
-            return true;
-        case 9:
-            evaluate_quad_values_product_q4<9>(points, output_stride, values_out);
-            return true;
-        default:
-            return false;
-    }
-}
-
-SVMP_LAGRANGE_NOINLINE bool try_evaluate_quad_derivatives_product_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    int n_axis,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    switch (n_axis) {
-        case 5:
-            evaluate_quad_derivatives_product_q4<5>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        case 6:
-            evaluate_quad_derivatives_product_q4<6>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        case 7:
-            evaluate_quad_derivatives_product_q4<7>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        case 8:
-            evaluate_quad_derivatives_product_q4<8>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        case 9:
-            evaluate_quad_derivatives_product_q4<9>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        default:
-            return false;
-    }
-}
-
-void evaluate_tensor_product_points_strided(
-    LagrangeTopology topology,
-    const std::vector<std::array<std::size_t, 3>>& tensor_indices,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    const Real* v_coeffs,
-    const Real* d_coeffs,
-    const Real* d2_coeffs,
-    const Real* barycentric_weights,
-    int n_axis,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    const std::size_t num_qpts = points.size();
-    if (num_qpts == 0 || tensor_indices.empty()) {
-        return;
-    }
-
-    const bool need_grad = gradients_out != nullptr;
-    const bool need_hess = hessians_out != nullptr;
-    const bool values_only = values_out != nullptr && !need_grad && !need_hess;
-    const bool gradients_only = values_out == nullptr && need_grad && !need_hess;
-    const bool hessians_only = values_out == nullptr && gradients_out == nullptr && need_hess;
-    const bool all_outputs = values_out != nullptr && need_grad && need_hess;
-    const AxisDeriv level = need_hess
-        ? AxisDeriv::ValuesAndFirstAndSecond
-        : (need_grad ? AxisDeriv::ValuesAndFirst : AxisDeriv::ValuesOnly);
-
-    if (topology == LagrangeTopology::Line && num_qpts == 4u) {
-        if (values_only &&
-            try_evaluate_line_values_horner_q4(
-                points, output_stride, v_coeffs, n_axis, values_out)) {
-            return;
-        }
-        if (gradients_only &&
-            try_evaluate_line_gradients_horner_q4(
-                points, output_stride, d_coeffs, n_axis, gradients_out)) {
-            return;
-        }
-        if (hessians_only &&
-            try_evaluate_line_hessians_product_q4(
-                points, output_stride, n_axis, hessians_out)) {
-            return;
-        }
-        if (all_outputs &&
-            try_evaluate_line_all_product_q4(
-                points, output_stride, n_axis, values_out, gradients_out, hessians_out)) {
-            return;
-        }
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        values_only &&
-        num_qpts == 4u &&
-        try_evaluate_quad_values_product_q4(points, output_stride, n_axis, values_out)) {
-        return;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        gradients_only &&
-        num_qpts == 4u &&
-        n_axis == 5) {
-        evaluate_quad_order4_gradients_q4(points, output_stride, gradients_out);
-        return;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        gradients_only &&
-        num_qpts == 4u &&
-        n_axis == 9) {
-        evaluate_quad_order8_gradients_product_q4(points, output_stride, gradients_out);
-        return;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        (gradients_only || hessians_only || all_outputs) &&
-        num_qpts == 4u &&
-        try_evaluate_quad_derivatives_product_q4(
-            points, output_stride, n_axis, values_out, gradients_out, hessians_out)) {
-        return;
-    }
-
-    auto& scratch = evaluate_scratch();
-    AxisBatchScratch& x_batch = scratch.axis_x_batch;
-    AxisBatchScratch& y_batch = scratch.axis_y_batch;
-    AxisBatchScratch& z_batch = scratch.axis_z_batch;
-
-    const bool has_y = topology != LagrangeTopology::Line;
-    const bool has_z = topology == LagrangeTopology::Hexahedron;
-    const std::size_t axis_stride = static_cast<std::size_t>(n_axis);
-    const bool use_product_axis_batch =
-        has_z &&
-        gradients_only &&
-        num_qpts == 4u &&
-        n_axis >= 5 &&
-        n_axis <= 9;
-    auto fill_tensor_axis_batch = [&](AxisBatchScratch& batch, std::size_t component) {
-        if (use_product_axis_batch &&
-            try_fill_axis_batch_product_q4(batch, points, component, n_axis, level)) {
-            return;
-        }
-        fill_axis_batch(batch, points, component, v_coeffs, d_coeffs, d2_coeffs,
-                        barycentric_weights, n_axis, level);
-    };
-
-    fill_tensor_axis_batch(x_batch, 0u);
-    if (!has_y) {
-        if (values_only) {
-            if (num_qpts == 4u) {
-                for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-                    const std::size_t i = tensor_indices[node][0];
-                    Real* value_row = values_out + node * output_stride;
-                    value_row[0] = x_batch.values[i];
-                    value_row[1] = x_batch.values[axis_stride + i];
-                    value_row[2] = x_batch.values[2u * axis_stride + i];
-                    value_row[3] = x_batch.values[3u * axis_stride + i];
-                }
-                return;
-            }
-            for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-                const std::size_t i = tensor_indices[node][0];
-                Real* value_row = values_out + node * output_stride;
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    value_row[q] = x_batch.values[q * axis_stride + i];
-                }
-            }
-            return;
-        }
-
-        if (gradients_only) {
-            for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-                const std::size_t i = tensor_indices[node][0];
-                Real* grad_row = gradients_out + node * 3u * output_stride;
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    grad_row[0u * output_stride + q] =
-                        x_batch.first[q * axis_stride + i];
-                    grad_row[1u * output_stride + q] = Real(0);
-                    grad_row[2u * output_stride + q] = Real(0);
-                }
-            }
-            return;
-        }
-
-        for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-            const std::size_t i = tensor_indices[node][0];
-            Real* value_row = values_out ? values_out + node * output_stride : nullptr;
-            Real* grad_row = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
-            Real* hess_row = hessians_out ? hessians_out + node * 9u * output_stride : nullptr;
-
-            for (std::size_t q = 0; q < num_qpts; ++q) {
-                const std::size_t q_axis = q * axis_stride + i;
-                if (value_row != nullptr) {
-                    value_row[q] = x_batch.values[q_axis];
-                }
-                if (need_grad) {
-                    grad_row[0u * output_stride + q] = x_batch.first[q_axis];
-                    grad_row[1u * output_stride + q] = Real(0);
-                    grad_row[2u * output_stride + q] = Real(0);
-                }
-                if (need_hess) {
-                    hess_row[0u * output_stride + q] = x_batch.second[q_axis];
-                    hess_row[1u * output_stride + q] = Real(0);
-                    hess_row[2u * output_stride + q] = Real(0);
-                    hess_row[3u * output_stride + q] = Real(0);
-                    hess_row[4u * output_stride + q] = Real(0);
-                    hess_row[5u * output_stride + q] = Real(0);
-                    hess_row[6u * output_stride + q] = Real(0);
-                    hess_row[7u * output_stride + q] = Real(0);
-                    hess_row[8u * output_stride + q] = Real(0);
-                }
-            }
-        }
-        return;
-    }
-    const bool use_tensor_tables =
-        has_z ||
-        (axis_stride == 8u && !(need_hess && values_out == nullptr && gradients_out == nullptr));
-    if (use_tensor_tables) {
-        fill_tensor_axis_batch(y_batch, 1u);
-    } else if (has_y) {
-        fill_tensor_axis_batch(y_batch, 1u);
-    }
-    if (has_z) {
-        fill_tensor_axis_batch(z_batch, 2u);
-    }
-
-    if (use_tensor_tables) {
-        const std::size_t ny = axis_stride;
-        const std::size_t nz = has_z ? axis_stride : 1u;
-        const std::size_t nyz = ny * nz;
-        const std::size_t table_count = num_qpts * nyz;
-
-        if (has_z && num_qpts == 4u && output_stride == 4u) {
-            if (values_only &&
-                evaluate_tensor_product_values_stride4_q4_transposed(
-                    tensor_indices, axis_stride, x_batch, y_batch, z_batch, values_out)) {
-                return;
-            }
-            if (gradients_only &&
-                evaluate_tensor_product_gradients_stride4_q4_transposed(
-                    tensor_indices, axis_stride, x_batch, y_batch, z_batch, gradients_out)) {
-                return;
-            }
-            if (hessians_only &&
-                evaluate_tensor_product_second_stride4_q4_transposed<false>(
-                    tensor_indices, axis_stride, x_batch, y_batch, z_batch,
-                    nullptr, nullptr, hessians_out)) {
-                return;
-            }
-            if (all_outputs &&
-                evaluate_tensor_product_second_stride4_q4_transposed<true>(
-                    tensor_indices, axis_stride, x_batch, y_batch, z_batch,
-                    values_out, gradients_out, hessians_out)) {
-                return;
-            }
-        }
-
-        Real Mvv_stack[kMaxStackYZ];
-        Real Mdv_stack[kMaxStackYZ];
-        Real Mvd_stack[kMaxStackYZ];
-        Real Md2v_stack[kMaxStackYZ];
-        Real Mvd2_stack[kMaxStackYZ];
-        Real Mdd_stack[kMaxStackYZ];
-
-        Real* Mvv;
-        Real* Mdv;
-        Real* Mvd;
-        Real* Md2v;
-        Real* Mvd2;
-        Real* Mdd;
-        if (table_count <= kMaxStackYZ) {
-            Mvv = Mvv_stack;
-            Mdv = Mdv_stack;
-            Mvd = Mvd_stack;
-            Md2v = Md2v_stack;
-            Mvd2 = Mvd2_stack;
-            Mdd = Mdd_stack;
-        } else {
-            auto& tables = scratch.tensor_tables;
-            tables.resizeFor(table_count);
-            Mvv = tables.vv.data();
-            Mdv = tables.dv.data();
-            Mvd = tables.vd.data();
-            Md2v = tables.d2v.data();
-            Mvd2 = tables.vd2.data();
-            Mdd = tables.dd.data();
-        }
-
-        for (std::size_t q = 0; q < num_qpts; ++q) {
-            const std::size_t q_axis = q * axis_stride;
-            const std::size_t q_table = q * nyz;
-            for (std::size_t j = 0; j < ny; ++j) {
-                const Real yv = y_batch.values[q_axis + j];
-                const Real yd = (need_grad || need_hess) ? y_batch.first[q_axis + j] : Real(0);
-                const Real y2 = need_hess ? y_batch.second[q_axis + j] : Real(0);
-                for (std::size_t k = 0; k < nz; ++k) {
-                    const std::size_t slot = q_table + j * nz + k;
-                    const Real zv = has_z ? z_batch.values[q_axis + k] : Real(1);
-                    Mvv[slot] = yv * zv;
-                    if (need_grad || need_hess) {
-                        const Real zd = has_z ? z_batch.first[q_axis + k] : Real(0);
-                        Mdv[slot] = yd * zv;
-                        Mvd[slot] = yv * zd;
-                    }
-                    if (need_hess) {
-                        const Real zd = has_z ? z_batch.first[q_axis + k] : Real(0);
-                        const Real z2 = has_z ? z_batch.second[q_axis + k] : Real(0);
-                        Md2v[slot] = y2 * zv;
-                        Mvd2[slot] = yv * z2;
-                        Mdd[slot] = yd * zd;
-                    }
-                }
-            }
-        }
-
-        if (values_only) {
-            if (has_z && num_qpts == 4u) {
-                for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-                    const auto& idx = tensor_indices[node];
-                    const std::size_t i = idx[0];
-                    const std::size_t jk = idx[1] * nz + idx[2];
-                    Real* value_row = values_out + node * output_stride;
-
-                    write_tensor_product_value_strided_q<0>(
-                        axis_stride, nyz, i, jk, x_batch, Mvv, value_row);
-                    write_tensor_product_value_strided_q<1>(
-                        axis_stride, nyz, i, jk, x_batch, Mvv, value_row);
-                    write_tensor_product_value_strided_q<2>(
-                        axis_stride, nyz, i, jk, x_batch, Mvv, value_row);
-                    write_tensor_product_value_strided_q<3>(
-                        axis_stride, nyz, i, jk, x_batch, Mvv, value_row);
-                }
-                return;
-            }
-            for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-                const auto& idx = tensor_indices[node];
-                const std::size_t i = idx[0];
-                const std::size_t jk = idx[1] * nz + idx[2];
-                Real* value_row = values_out + node * output_stride;
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    const std::size_t q_axis = q * axis_stride;
-                    const std::size_t slot = q * nyz + jk;
-                    value_row[q] = x_batch.values[q_axis + i] * Mvv[slot];
-                }
-            }
-            return;
-        }
-
-        if (gradients_only) {
-            if (has_z && num_qpts == 4u) {
-                if (output_stride == 4u) {
-                    for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-                        const auto& idx = tensor_indices[node];
-                        const std::size_t i = idx[0];
-                        const std::size_t jk = idx[1] * nz + idx[2];
-                        Real* grad_row = gradients_out + node * 3u * output_stride;
-
-                        write_tensor_product_gradient_stride4_q<0>(
-                            axis_stride, nyz, i, jk, x_batch, Mvv, Mdv, Mvd, grad_row);
-                        write_tensor_product_gradient_stride4_q<1>(
-                            axis_stride, nyz, i, jk, x_batch, Mvv, Mdv, Mvd, grad_row);
-                        write_tensor_product_gradient_stride4_q<2>(
-                            axis_stride, nyz, i, jk, x_batch, Mvv, Mdv, Mvd, grad_row);
-                        write_tensor_product_gradient_stride4_q<3>(
-                            axis_stride, nyz, i, jk, x_batch, Mvv, Mdv, Mvd, grad_row);
-                    }
-                } else {
-                    for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-                        const auto& idx = tensor_indices[node];
-                        const std::size_t i = idx[0];
-                        const std::size_t jk = idx[1] * nz + idx[2];
-                        Real* grad_row = gradients_out + node * 3u * output_stride;
-
-                        write_tensor_product_gradient_strided_q<0>(
-                            axis_stride, nyz, i, jk, output_stride, x_batch,
-                            Mvv, Mdv, Mvd, grad_row);
-                        write_tensor_product_gradient_strided_q<1>(
-                            axis_stride, nyz, i, jk, output_stride, x_batch,
-                            Mvv, Mdv, Mvd, grad_row);
-                        write_tensor_product_gradient_strided_q<2>(
-                            axis_stride, nyz, i, jk, output_stride, x_batch,
-                            Mvv, Mdv, Mvd, grad_row);
-                        write_tensor_product_gradient_strided_q<3>(
-                            axis_stride, nyz, i, jk, output_stride, x_batch,
-                            Mvv, Mdv, Mvd, grad_row);
-                    }
-                }
-                return;
-            }
-
-            for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-                const auto& idx = tensor_indices[node];
-                const std::size_t i = idx[0];
-                const std::size_t jk = idx[1] * nz + idx[2];
-                Real* grad_row = gradients_out + node * 3u * output_stride;
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    const std::size_t q_axis = q * axis_stride;
-                    const std::size_t slot = q * nyz + jk;
-                    const Real xv = x_batch.values[q_axis + i];
-                    const Real xd = x_batch.first[q_axis + i];
-                    grad_row[0u * output_stride + q] = xd * Mvv[slot];
-                    grad_row[1u * output_stride + q] = xv * Mdv[slot];
-                    grad_row[2u * output_stride + q] = xv * Mvd[slot];
-                }
-            }
-            return;
-        }
-
-        if (has_z && num_qpts == 4u && hessians_only) {
-            if (output_stride == 4u) {
-                for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-                    const auto& idx = tensor_indices[node];
-                    const std::size_t i = idx[0];
-                    const std::size_t jk = idx[1] * nz + idx[2];
-                    Real* hess_row = hessians_out + node * 9u * output_stride;
-
-                    write_tensor_product_hessian_stride4_q<0>(
-                        axis_stride, nyz, i, jk, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
-                    write_tensor_product_hessian_stride4_q<1>(
-                        axis_stride, nyz, i, jk, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
-                    write_tensor_product_hessian_stride4_q<2>(
-                        axis_stride, nyz, i, jk, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
-                    write_tensor_product_hessian_stride4_q<3>(
-                        axis_stride, nyz, i, jk, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
-                }
-            } else {
-                for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-                    const auto& idx = tensor_indices[node];
-                    const std::size_t i = idx[0];
-                    const std::size_t jk = idx[1] * nz + idx[2];
-                    Real* hess_row = hessians_out + node * 9u * output_stride;
-
-                    write_tensor_product_hessian_strided_q<0>(
-                        axis_stride, nyz, i, jk, output_stride, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
-                    write_tensor_product_hessian_strided_q<1>(
-                        axis_stride, nyz, i, jk, output_stride, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
-                    write_tensor_product_hessian_strided_q<2>(
-                        axis_stride, nyz, i, jk, output_stride, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
-                    write_tensor_product_hessian_strided_q<3>(
-                        axis_stride, nyz, i, jk, output_stride, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
-                }
-            }
-            return;
-        }
-
-        if (has_z && num_qpts == 4u && all_outputs) {
-            if (output_stride == 4u) {
-                for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-                    const auto& idx = tensor_indices[node];
-                    const std::size_t i = idx[0];
-                    const std::size_t jk = idx[1] * nz + idx[2];
-                    Real* value_row = values_out + node * output_stride;
-                    Real* grad_row = gradients_out + node * 3u * output_stride;
-                    Real* hess_row = hessians_out + node * 9u * output_stride;
-
-                    write_tensor_product_all_stride4_q<0>(
-                        axis_stride, nyz, i, jk, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
-                    write_tensor_product_all_stride4_q<1>(
-                        axis_stride, nyz, i, jk, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
-                    write_tensor_product_all_stride4_q<2>(
-                        axis_stride, nyz, i, jk, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
-                    write_tensor_product_all_stride4_q<3>(
-                        axis_stride, nyz, i, jk, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
-                }
-            } else {
-                for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-                    const auto& idx = tensor_indices[node];
-                    const std::size_t i = idx[0];
-                    const std::size_t jk = idx[1] * nz + idx[2];
-                    Real* value_row = values_out + node * output_stride;
-                    Real* grad_row = gradients_out + node * 3u * output_stride;
-                    Real* hess_row = hessians_out + node * 9u * output_stride;
-
-                    write_tensor_product_all_strided_q<0>(
-                        axis_stride, nyz, i, jk, output_stride, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
-                    write_tensor_product_all_strided_q<1>(
-                        axis_stride, nyz, i, jk, output_stride, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
-                    write_tensor_product_all_strided_q<2>(
-                        axis_stride, nyz, i, jk, output_stride, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
-                    write_tensor_product_all_strided_q<3>(
-                        axis_stride, nyz, i, jk, output_stride, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
-                }
-            }
-            return;
-        }
-
-        for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-            const auto& idx = tensor_indices[node];
-            const std::size_t i = idx[0];
-            const std::size_t jk = idx[1] * nz + idx[2];
-
-            Real* value_row = values_out ? values_out + node * output_stride : nullptr;
-            Real* grad_row = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
-            Real* hess_row = hessians_out ? hessians_out + node * 9u * output_stride : nullptr;
-
-            for (std::size_t q = 0; q < num_qpts; ++q) {
-                const std::size_t q_axis = q * axis_stride;
-                const std::size_t slot = q * nyz + jk;
-                const Real xv = x_batch.values[q_axis + i];
-
-                if (value_row != nullptr) {
-                    value_row[q] = xv * Mvv[slot];
-                }
-
-                if (need_grad) {
-                    const Real xd = x_batch.first[q_axis + i];
-                    grad_row[0u * output_stride + q] = xd * Mvv[slot];
-                    grad_row[1u * output_stride + q] = xv * Mdv[slot];
-                    grad_row[2u * output_stride + q] = xv * Mvd[slot];
-                }
-
-                if (need_hess) {
-                    const Real xd = x_batch.first[q_axis + i];
-                    const Real x2 = x_batch.second[q_axis + i];
-                    const Real hxy = xd * Mdv[slot];
-                    const Real hxz = xd * Mvd[slot];
-                    const Real hyz = xv * Mdd[slot];
-                    hess_row[0u * output_stride + q] = x2 * Mvv[slot];
-                    hess_row[4u * output_stride + q] = xv * Md2v[slot];
-                    hess_row[8u * output_stride + q] = xv * Mvd2[slot];
-                    hess_row[1u * output_stride + q] = hxy;
-                    hess_row[3u * output_stride + q] = hxy;
-                    hess_row[2u * output_stride + q] = hxz;
-                    hess_row[6u * output_stride + q] = hxz;
-                    hess_row[5u * output_stride + q] = hyz;
-                    hess_row[7u * output_stride + q] = hyz;
-                }
-            }
-        }
-        return;
-    }
-
-    for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-        const auto& idx = tensor_indices[node];
-        const std::size_t i = idx[0];
-        const std::size_t j = idx[1];
-
-        Real* value_row = values_out ? values_out + node * output_stride : nullptr;
-        Real* grad_row = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
-        Real* hess_row = hessians_out ? hessians_out + node * 9u * output_stride : nullptr;
-
-        for (std::size_t q = 0; q < num_qpts; ++q) {
-            const std::size_t q_axis = q * axis_stride;
-            const Real xv = x_batch.values[q_axis + i];
-            const Real yv = y_batch.values[q_axis + j];
-
-            if (value_row != nullptr) {
-                value_row[q] = xv * yv;
-            }
-
-            if (need_grad) {
-                const Real xd = x_batch.first[q_axis + i];
-                const Real yd = y_batch.first[q_axis + j];
-                grad_row[0u * output_stride + q] = xd * yv;
-                grad_row[1u * output_stride + q] = xv * yd;
-                grad_row[2u * output_stride + q] = Real(0);
-            }
-
-            if (need_hess) {
-                const Real xd = x_batch.first[q_axis + i];
-                const Real yd = y_batch.first[q_axis + j];
-                const Real x2 = x_batch.second[q_axis + i];
-                const Real y2 = y_batch.second[q_axis + j];
-                const Real hxy = xd * yd;
-
-                hess_row[0u * output_stride + q] = x2 * yv;
-                hess_row[4u * output_stride + q] = xv * y2;
-                hess_row[8u * output_stride + q] = Real(0);
-                hess_row[1u * output_stride + q] = hxy;
-                hess_row[3u * output_stride + q] = hxy;
-                hess_row[2u * output_stride + q] = Real(0);
-                hess_row[6u * output_stride + q] = Real(0);
-                hess_row[5u * output_stride + q] = Real(0);
-                hess_row[7u * output_stride + q] = Real(0);
-            }
-        }
-    }
-}
-
-void evaluate_wedge_points_strided(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    const std::vector<std::array<std::size_t, 2>>& wedge_indices,
-    const std::vector<std::size_t>& wedge_node_by_tri_z,
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    const Real* v_coeffs,
-    const Real* d_coeffs,
-    const Real* d2_coeffs,
-    const Real* barycentric_weights,
-    int n_axis,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    if (points.empty() || wedge_indices.empty()) {
-        return;
-    }
-
-    const bool want_values = values_out != nullptr;
-    const bool need_grad = gradients_out != nullptr;
-    const bool need_hess = hessians_out != nullptr;
-    const bool values_only = want_values && !need_grad && !need_hess;
-    const bool gradients_only = !want_values && need_grad && !need_hess;
-    const bool hessians_only = !want_values && !need_grad && need_hess;
-    const bool all_outputs = want_values && need_grad && need_hess;
-    const bool use_batched_wedge =
-        (values_only && order <= 3) ||
-        (gradients_only && order >= 2) ||
-        (hessians_only && order >= 3) ||
-        (all_outputs && order >= 3);
-    if (values_only &&
-        order >= 4 &&
-        order <= 8 &&
-        try_evaluate_wedge_values_product_q4(
-            simplex_exponents, wedge_indices, order, points, output_stride, values_out)) {
-        return;
-    }
-    const AxisDeriv level = need_hess
-        ? AxisDeriv::ValuesAndFirstAndSecond
-        : (need_grad ? AxisDeriv::ValuesAndFirst : AxisDeriv::ValuesOnly);
-
-    LagrangeEvaluateScratch& scratch = evaluate_scratch();
-    const std::size_t tri_count = simplex_exponents.size();
-    if (use_batched_wedge) {
-        const std::size_t num_qpts = points.size();
-        const std::size_t tri_stride = num_qpts;
-        if (num_qpts == 4u &&
-            output_stride == 4u &&
-            (gradients_only || hessians_only || all_outputs) &&
-            order >= 3 &&
-            order <= 8 &&
-            wedge_node_by_tri_z.size() == tri_count * static_cast<std::size_t>(n_axis)) {
-            const bool use_product_axis_batch =
-                gradients_only &&
-                n_axis >= 5 &&
-                n_axis <= 9;
-            if (!use_product_axis_batch ||
-                !try_fill_axis_batch_product_q4(
-                    scratch.axis_z_batch, points, 2u, n_axis, level)) {
-                fill_axis_batch(scratch.axis_z_batch,
-                                points,
-                                2u,
-                                v_coeffs,
-                                d_coeffs,
-                                d2_coeffs,
-                                barycentric_weights,
-                                n_axis,
-                                level);
-            }
-            if (need_hess) {
-                if (try_evaluate_wedge_fused_stride4_q4<true>(
-                        simplex_exponents, wedge_node_by_tri_z, order, points,
-                        scratch.axis_z_batch, n_axis, values_out, gradients_out, hessians_out)) {
-                    return;
-                }
-            } else if (try_evaluate_wedge_fused_stride4_q4<false>(
-                           simplex_exponents, wedge_node_by_tri_z, order, points,
-                           scratch.axis_z_batch, n_axis, values_out, gradients_out, hessians_out)) {
-                return;
-            }
-        }
-
-        const std::size_t tri_values_size = tri_count * tri_stride;
-        scratch.wedge_tri_values_batch.resize(tri_values_size);
-        if (need_grad || need_hess) {
-            scratch.wedge_tri_gradient_batch.resize(tri_count * 2u * tri_stride);
-        }
-        if (need_hess) {
-            scratch.wedge_tri_hessian_batch.resize(tri_count * 3u * tri_stride);
-        }
-
-        detail::evaluate_triangle_simplex_basis_wedge_components_strided(
-            simplex_exponents,
-            order,
-            points,
-            tri_stride,
-            scratch.wedge_tri_values_batch.data(),
-            (need_grad || need_hess) ? scratch.wedge_tri_gradient_batch.data() : nullptr,
-            need_hess ? scratch.wedge_tri_hessian_batch.data() : nullptr);
-
-        const bool use_product_axis_batch =
-            gradients_only &&
-            points.size() == 4u &&
-            n_axis >= 5 &&
-            n_axis <= 9;
-        if (!use_product_axis_batch ||
-            !try_fill_axis_batch_product_q4(
-                scratch.axis_z_batch, points, 2u, n_axis, level)) {
-            fill_axis_batch(scratch.axis_z_batch,
-                            points,
-                            2u,
-                            v_coeffs,
-                            d_coeffs,
-                            d2_coeffs,
-                            barycentric_weights,
-                            n_axis,
-                            level);
-        }
-
-        const std::size_t axis_stride = static_cast<std::size_t>(n_axis);
-        if (all_outputs) {
-            if (num_qpts == 4u) {
-                if (output_stride == 4u) {
-                    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
-                        const auto& index = wedge_indices[node];
-                        const std::size_t tri = index[0];
-                        const std::size_t z = index[1];
-                        Real* value_row = values_out + node * output_stride;
-                        Real* g = gradients_out + node * 3u * output_stride;
-                        Real* H = hessians_out + node * 9u * output_stride;
-                        const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
-                        const Real* tri_H = scratch.wedge_tri_hessian_batch.data() + tri * 3u * tri_stride;
-                        const Real* tri_values = scratch.wedge_tri_values_batch.data();
-
-                        write_wedge_all_stride4_q<0>(
-                            tri_stride, axis_stride, tri, z,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
-                        write_wedge_all_stride4_q<1>(
-                            tri_stride, axis_stride, tri, z,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
-                        write_wedge_all_stride4_q<2>(
-                            tri_stride, axis_stride, tri, z,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
-                        write_wedge_all_stride4_q<3>(
-                            tri_stride, axis_stride, tri, z,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
-                    }
-                } else {
-                    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
-                        const auto& index = wedge_indices[node];
-                        const std::size_t tri = index[0];
-                        const std::size_t z = index[1];
-                        Real* value_row = values_out + node * output_stride;
-                        Real* g = gradients_out + node * 3u * output_stride;
-                        Real* H = hessians_out + node * 9u * output_stride;
-                        const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
-                        const Real* tri_H = scratch.wedge_tri_hessian_batch.data() + tri * 3u * tri_stride;
-                        const Real* tri_values = scratch.wedge_tri_values_batch.data();
-
-                        write_wedge_all_strided_q<0>(
-                            tri_stride, axis_stride, tri, z, output_stride,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
-                        write_wedge_all_strided_q<1>(
-                            tri_stride, axis_stride, tri, z, output_stride,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
-                        write_wedge_all_strided_q<2>(
-                            tri_stride, axis_stride, tri, z, output_stride,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
-                        write_wedge_all_strided_q<3>(
-                            tri_stride, axis_stride, tri, z, output_stride,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
-                    }
-                }
-                return;
-            }
-
-            for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
-                const auto& index = wedge_indices[node];
-                const std::size_t tri = index[0];
-                const std::size_t z = index[1];
-                Real* value_row = values_out + node * output_stride;
-                Real* g = gradients_out + node * 3u * output_stride;
-                Real* H = hessians_out + node * 9u * output_stride;
-                const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
-                const Real* tri_H = scratch.wedge_tri_hessian_batch.data() + tri * 3u * tri_stride;
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    const std::size_t tri_q = tri * tri_stride + q;
-                    const std::size_t z_q = q * axis_stride + z;
-                    const Real tri_v = scratch.wedge_tri_values_batch[tri_q];
-                    const Real zv = scratch.axis_z_batch.values[z_q];
-                    const Real zd = scratch.axis_z_batch.first[z_q];
-                    const Real tri_gx = tri_g[0u * tri_stride + q];
-                    const Real tri_gy = tri_g[1u * tri_stride + q];
-                    const Real tri_hxx = tri_H[0u * tri_stride + q];
-                    const Real tri_hxy = tri_H[1u * tri_stride + q];
-                    const Real tri_hyy = tri_H[2u * tri_stride + q];
-                    const Real hxz = tri_gx * zd;
-                    const Real hxy = tri_hxy * zv;
-                    const Real hyz = tri_gy * zd;
-
-                    value_row[q] = tri_v * zv;
-                    g[0u * output_stride + q] = tri_gx * zv;
-                    g[1u * output_stride + q] = tri_gy * zv;
-                    g[2u * output_stride + q] = tri_v * zd;
-                    H[0u * output_stride + q] = tri_hxx * zv;
-                    H[1u * output_stride + q] = hxy;
-                    H[2u * output_stride + q] = hxz;
-                    H[3u * output_stride + q] = hxy;
-                    H[4u * output_stride + q] = tri_hyy * zv;
-                    H[5u * output_stride + q] = hyz;
-                    H[6u * output_stride + q] = hxz;
-                    H[7u * output_stride + q] = hyz;
-                    H[8u * output_stride + q] = tri_v * scratch.axis_z_batch.second[z_q];
-                }
-            }
-            return;
-        }
-
-        if (hessians_only) {
-            if (num_qpts == 4u) {
-                if (output_stride == 4u) {
-                    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
-                        const auto& index = wedge_indices[node];
-                        const std::size_t tri = index[0];
-                        const std::size_t z = index[1];
-                        Real* H = hessians_out + node * 9u * output_stride;
-                        const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
-                        const Real* tri_H = scratch.wedge_tri_hessian_batch.data() + tri * 3u * tri_stride;
-                        const Real* tri_values = scratch.wedge_tri_values_batch.data();
-
-                        write_wedge_hessian_stride4_q<0>(
-                            tri_stride, axis_stride, tri, z,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
-                        write_wedge_hessian_stride4_q<1>(
-                            tri_stride, axis_stride, tri, z,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
-                        write_wedge_hessian_stride4_q<2>(
-                            tri_stride, axis_stride, tri, z,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
-                        write_wedge_hessian_stride4_q<3>(
-                            tri_stride, axis_stride, tri, z,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
-                    }
-                } else {
-                    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
-                        const auto& index = wedge_indices[node];
-                        const std::size_t tri = index[0];
-                        const std::size_t z = index[1];
-                        Real* H = hessians_out + node * 9u * output_stride;
-                        const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
-                        const Real* tri_H = scratch.wedge_tri_hessian_batch.data() + tri * 3u * tri_stride;
-                        const Real* tri_values = scratch.wedge_tri_values_batch.data();
-
-                        write_wedge_hessian_strided_q<0>(
-                            tri_stride, axis_stride, tri, z, output_stride,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
-                        write_wedge_hessian_strided_q<1>(
-                            tri_stride, axis_stride, tri, z, output_stride,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
-                        write_wedge_hessian_strided_q<2>(
-                            tri_stride, axis_stride, tri, z, output_stride,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
-                        write_wedge_hessian_strided_q<3>(
-                            tri_stride, axis_stride, tri, z, output_stride,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
-                    }
-                }
-                return;
-            }
-
-            for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
-                const auto& index = wedge_indices[node];
-                const std::size_t tri = index[0];
-                const std::size_t z = index[1];
-                Real* H = hessians_out + node * 9u * output_stride;
-                const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
-                const Real* tri_H = scratch.wedge_tri_hessian_batch.data() + tri * 3u * tri_stride;
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    const std::size_t tri_q = tri * tri_stride + q;
-                    const std::size_t z_q = q * axis_stride + z;
-                    const Real tri_v = scratch.wedge_tri_values_batch[tri_q];
-                    const Real zv = scratch.axis_z_batch.values[z_q];
-                    const Real zd = scratch.axis_z_batch.first[z_q];
-                    const Real tri_gx = tri_g[0u * tri_stride + q];
-                    const Real tri_gy = tri_g[1u * tri_stride + q];
-                    const Real tri_hxx = tri_H[0u * tri_stride + q];
-                    const Real tri_hxy = tri_H[1u * tri_stride + q];
-                    const Real tri_hyy = tri_H[2u * tri_stride + q];
-                    const Real hxz = tri_gx * zd;
-                    const Real hxy = tri_hxy * zv;
-                    const Real hyz = tri_gy * zd;
-
-                    H[0u * output_stride + q] = tri_hxx * zv;
-                    H[1u * output_stride + q] = hxy;
-                    H[2u * output_stride + q] = hxz;
-                    H[3u * output_stride + q] = hxy;
-                    H[4u * output_stride + q] = tri_hyy * zv;
-                    H[5u * output_stride + q] = hyz;
-                    H[6u * output_stride + q] = hxz;
-                    H[7u * output_stride + q] = hyz;
-                    H[8u * output_stride + q] = tri_v * scratch.axis_z_batch.second[z_q];
-                }
-            }
-            return;
-        }
-
-        if (gradients_only) {
-            if (num_qpts == 4u) {
-                if (output_stride == 4u) {
-                    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
-                        const auto& index = wedge_indices[node];
-                        const std::size_t tri = index[0];
-                        const std::size_t z = index[1];
-                        Real* g = gradients_out + node * 3u * output_stride;
-                        const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
-                        const Real* tri_values = scratch.wedge_tri_values_batch.data();
-
-                        write_wedge_gradient_stride4_q<0>(
-                            tri_stride, axis_stride, tri, z,
-                            tri_values, tri_g, scratch.axis_z_batch, g);
-                        write_wedge_gradient_stride4_q<1>(
-                            tri_stride, axis_stride, tri, z,
-                            tri_values, tri_g, scratch.axis_z_batch, g);
-                        write_wedge_gradient_stride4_q<2>(
-                            tri_stride, axis_stride, tri, z,
-                            tri_values, tri_g, scratch.axis_z_batch, g);
-                        write_wedge_gradient_stride4_q<3>(
-                            tri_stride, axis_stride, tri, z,
-                            tri_values, tri_g, scratch.axis_z_batch, g);
-                    }
-                } else {
-                    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
-                        const auto& index = wedge_indices[node];
-                        const std::size_t tri = index[0];
-                        const std::size_t z = index[1];
-                        Real* g = gradients_out + node * 3u * output_stride;
-                        const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
-                        const Real* tri_values = scratch.wedge_tri_values_batch.data();
-
-                        write_wedge_gradient_strided_q<0>(
-                            tri_stride, axis_stride, tri, z, output_stride,
-                            tri_values, tri_g, scratch.axis_z_batch, g);
-                        write_wedge_gradient_strided_q<1>(
-                            tri_stride, axis_stride, tri, z, output_stride,
-                            tri_values, tri_g, scratch.axis_z_batch, g);
-                        write_wedge_gradient_strided_q<2>(
-                            tri_stride, axis_stride, tri, z, output_stride,
-                            tri_values, tri_g, scratch.axis_z_batch, g);
-                        write_wedge_gradient_strided_q<3>(
-                            tri_stride, axis_stride, tri, z, output_stride,
-                            tri_values, tri_g, scratch.axis_z_batch, g);
-                    }
-                }
-                return;
-            }
-
-            for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
-                const auto& index = wedge_indices[node];
-                const std::size_t tri = index[0];
-                const std::size_t z = index[1];
-                Real* g = gradients_out + node * 3u * output_stride;
-                const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    const std::size_t tri_q = tri * tri_stride + q;
-                    const std::size_t z_q = q * axis_stride + z;
-                    const Real tri_v = scratch.wedge_tri_values_batch[tri_q];
-                    const Real zv = scratch.axis_z_batch.values[z_q];
-                    g[0u * output_stride + q] = tri_g[0u * tri_stride + q] * zv;
-                    g[1u * output_stride + q] = tri_g[1u * tri_stride + q] * zv;
-                    g[2u * output_stride + q] = tri_v * scratch.axis_z_batch.first[z_q];
-                }
-            }
-            return;
-        }
-
-        for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
-            const auto& index = wedge_indices[node];
-            const std::size_t tri = index[0];
-            const std::size_t z = index[1];
-            Real* value_row = values_out ? values_out + node * output_stride : nullptr;
-            Real* g = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
-            Real* H = hessians_out ? hessians_out + node * 9u * output_stride : nullptr;
-
-            for (std::size_t q = 0; q < num_qpts; ++q) {
-                const std::size_t tri_q = tri * tri_stride + q;
-                const std::size_t z_q = q * axis_stride + z;
-                const Real tri_v = scratch.wedge_tri_values_batch[tri_q];
-                const Real zv = scratch.axis_z_batch.values[z_q];
-                if (values_out != nullptr) {
-                    value_row[q] = tri_v * zv;
-                }
-
-                if (need_grad) {
-                    const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
-                    g[0u * output_stride + q] = tri_g[0u * tri_stride + q] * zv;
-                    g[1u * output_stride + q] = tri_g[1u * tri_stride + q] * zv;
-                    g[2u * output_stride + q] = tri_v * scratch.axis_z_batch.first[z_q];
-                }
-
-                if (need_hess) {
-                    const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
-                    const Real* tri_H = scratch.wedge_tri_hessian_batch.data() + tri * 3u * tri_stride;
-                    const Real zd = scratch.axis_z_batch.first[z_q];
-                    const Real hxz = tri_g[0u * tri_stride + q] * zd;
-                    const Real hxy = tri_H[1u * tri_stride + q] * zv;
-                    const Real hyz = tri_g[1u * tri_stride + q] * zd;
-                    H[0u * output_stride + q] = tri_H[0u * tri_stride + q] * zv;
-                    H[1u * output_stride + q] = hxy;
-                    H[2u * output_stride + q] = hxz;
-                    H[3u * output_stride + q] = hxy;
-                    H[4u * output_stride + q] = tri_H[2u * tri_stride + q] * zv;
-                    H[5u * output_stride + q] = hyz;
-                    H[6u * output_stride + q] = hxz;
-                    H[7u * output_stride + q] = hyz;
-                    H[8u * output_stride + q] = tri_v * scratch.axis_z_batch.second[z_q];
-                }
-            }
-        }
-
-        return;
-    }
-
-    scratch.tri_values.resize(tri_count);
-    if (need_grad || need_hess) {
-        scratch.tri_gradient_components.resize(tri_count * 3u);
-    }
-    if (need_hess) {
-        scratch.tri_hessian_components.resize(tri_count * 9u);
-    }
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const AxisBasisEvaluations z_axis =
-            fill_axis_scratch(scratch.axis_z,
-                              v_coeffs,
-                              d_coeffs,
-                              d2_coeffs,
-                              barycentric_weights,
-                              n_axis,
-                              xi[2],
-                              level);
-        detail::evaluate_triangle_simplex_basis_to(
-            simplex_exponents,
-            order,
-            xi,
-            scratch.tri_values.data(),
-            (need_grad || need_hess) ? scratch.tri_gradient_components.data() : nullptr,
-            need_hess ? scratch.tri_hessian_components.data() : nullptr);
-
-        for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
-            const auto& index = wedge_indices[node];
-            const std::size_t tri = index[0];
-            const std::size_t z = index[1];
-            const Real tri_v = scratch.tri_values[tri];
-            const Real zv = z_axis.values[z];
-
-            if (values_out != nullptr) {
-                values_out[node * output_stride + q] = tri_v * zv;
-            }
-
-            if (need_grad) {
-                const Real* tri_g = scratch.tri_gradient_components.data() + tri * 3u;
-                Real* g = gradients_out + node * 3u * output_stride;
-                g[0u * output_stride + q] = tri_g[0] * zv;
-                g[1u * output_stride + q] = tri_g[1] * zv;
-                g[2u * output_stride + q] = tri_v * z_axis.first[z];
-            }
-
-            if (need_hess) {
-                const Real* tri_g = scratch.tri_gradient_components.data() + tri * 3u;
-                const Real* tri_H = scratch.tri_hessian_components.data() + tri * 9u;
-                const Real zd = z_axis.first[z];
-                const Real hxz = tri_g[0] * zd;
-                const Real hxy = tri_H[1] * zv;
-                const Real hyz = tri_g[1] * zd;
-                Real* H = hessians_out + node * 9u * output_stride;
-                H[0u * output_stride + q] = tri_H[0] * zv;
-                H[1u * output_stride + q] = hxy;
-                H[2u * output_stride + q] = hxz;
-                H[3u * output_stride + q] = hxy;
-                H[4u * output_stride + q] = tri_H[4] * zv;
-                H[5u * output_stride + q] = hyz;
-                H[6u * output_stride + q] = hxz;
-                H[7u * output_stride + q] = hyz;
-                H[8u * output_stride + q] = tri_v * z_axis.second[z];
-            }
-        }
-    }
-}
-
-NormalizedLagrangeRequest normalize_lagrange_request(ElementType element_type, int order) {
-    switch (element_type) {
-        case ElementType::Line3:
-            return {ElementType::Line2, std::max(order, 2)};
-        case ElementType::Triangle6:
-            return {ElementType::Triangle3, std::max(order, 2)};
-        case ElementType::Quad9:
-            return {ElementType::Quad4, std::max(order, 2)};
-        case ElementType::Quad8:
-            throw BasisElementCompatibilityException(
-                "Quad8 is a serendipity element; use SerendipityBasis for Quad8",
-                __FILE__, __LINE__, __func__);
-        case ElementType::Tetra10:
-            return {ElementType::Tetra4, std::max(order, 2)};
-        case ElementType::Hex27:
-            return {ElementType::Hex8, std::max(order, 2)};
-        case ElementType::Hex20:
-            throw BasisElementCompatibilityException(
-                "Hex20 is a serendipity element; use SerendipityBasis for Hex20",
-                __FILE__, __LINE__, __func__);
-        case ElementType::Wedge18:
-            return {ElementType::Wedge6, std::max(order, 2)};
-        case ElementType::Wedge15:
-            throw BasisElementCompatibilityException(
-                "Wedge15 is a serendipity element; use SerendipityBasis for Wedge15",
-                __FILE__, __LINE__, __func__);
-        case ElementType::Pyramid13:
-            throw BasisElementCompatibilityException(
-                "Pyramid13 is a serendipity variant; use SerendipityBasis (Pyramid13) or the complete-family Lagrange path via LagrangeBasis (Pyramid5, order >= 2)",
-                __FILE__, __LINE__, __func__);
-        case ElementType::Pyramid14:
-            return {ElementType::Pyramid5, std::max(order, 2)};
-        default:
-            return {element_type, order};
-    }
-}
-
-} // namespace
-
-void prewarm_lagrange_basis_scratch(int max_order, std::size_t max_qpts) {
-    evaluate_scratch().prewarm(max_order, max_qpts);
-}
-
-LagrangeBasis::LagrangeBasis(ElementType type, int order)
-    : element_type_(type), dimension_(0), order_(order) {
-    const NormalizedLagrangeRequest normalized = normalize_lagrange_request(element_type_, order_);
-    element_type_ = normalized.element_type;
-    order_ = normalized.order;
-
-    if (order_ < 0) {
-        throw BasisConfigurationException("LagrangeBasis requires non-negative polynomial order",
-                                          __FILE__, __LINE__, __func__);
-    }
-
-    dimension_ = lagrange_topology_traits(element_type_).dimension;
-
-    init_nodes();
-    init_evaluation_dispatch();
-}
-
-void LagrangeBasis::init_nodes() {
-    nodes_.clear();
-    nodes_1d_.clear();
-    tensor_indices_.clear();
-    simplex_exponents_.clear();
-    wedge_indices_.clear();
-    wedge_node_by_tri_z_.clear();
-    axis_v_coeffs_.clear();
-    axis_d_coeffs_.clear();
-    axis_d2_coeffs_.clear();
-    axis_barycentric_weights_.clear();
-    const auto topology = lagrange_topology_traits(element_type_).topology;
-    topology_id_ = static_cast<int>(topology);
-    switch (topology) {
-        case LagrangeTopology::Point:
-            build_point_nodes();
-            return;
-        case LagrangeTopology::Line:
-            build_tensor_product_nodes(1);
-            compute_axis_monomial_coefficients();
-            return;
-        case LagrangeTopology::Quadrilateral:
-            build_tensor_product_nodes(2);
-            compute_axis_monomial_coefficients();
-            return;
-        case LagrangeTopology::Hexahedron:
-            build_tensor_product_nodes(3);
-            compute_axis_monomial_coefficients();
-            return;
-        case LagrangeTopology::Triangle:
-        case LagrangeTopology::Tetrahedron:
-            build_simplex_nodes();
-            return;
-        case LagrangeTopology::Wedge:
-            build_wedge_nodes();
-            compute_axis_monomial_coefficients();
-            return;
-        case LagrangeTopology::Pyramid:
-            build_pyramid_nodes();
-            return;
-        case LagrangeTopology::Unknown:
-            break;
-    }
-
-    throw BasisElementCompatibilityException("Unsupported element type in LagrangeBasis::init_nodes",
-                                             __FILE__, __LINE__, __func__);
-}
-
-void LagrangeBasis::init_evaluation_dispatch() {
-    const auto topology = static_cast<LagrangeTopology>(topology_id_);
-    switch (topology) {
-        case LagrangeTopology::Point:
-            vector_evaluation_dispatch_ = &LagrangeBasis::evaluate_point_vectors;
-            return;
-        case LagrangeTopology::Line:
-        case LagrangeTopology::Quadrilateral:
-        case LagrangeTopology::Hexahedron:
-            vector_evaluation_dispatch_ = &LagrangeBasis::evaluate_tensor_product_vectors;
-            return;
-        case LagrangeTopology::Triangle:
-            vector_evaluation_dispatch_ = &LagrangeBasis::evaluate_triangle_vectors;
-            return;
-        case LagrangeTopology::Tetrahedron:
-            vector_evaluation_dispatch_ = &LagrangeBasis::evaluate_tetrahedron_vectors;
-            return;
-        case LagrangeTopology::Wedge:
-            vector_evaluation_dispatch_ = &LagrangeBasis::evaluate_wedge_vectors;
-            return;
-        case LagrangeTopology::Pyramid:
-            vector_evaluation_dispatch_ = &LagrangeBasis::evaluate_pyramid_vectors;
-            return;
-        case LagrangeTopology::Unknown:
-            break;
-    }
-    vector_evaluation_dispatch_ = &LagrangeBasis::evaluate_unsupported_vectors;
-}
-
-void LagrangeBasis::compute_axis_monomial_coefficients() {
-    const int N = static_cast<int>(nodes_1d_.size());
-    if (N == 0) return;
-
-    axis_barycentric_weights_.resize(static_cast<std::size_t>(N));
-    fill_equispaced_barycentric_weights(N, axis_barycentric_weights_.data());
-
-    if (assign_precomputed_axis_coefficients(N, axis_v_coeffs_, axis_d_coeffs_, axis_d2_coeffs_)) {
-        return;
-    }
-
-    axis_v_coeffs_.assign(static_cast<std::size_t>(N) * static_cast<std::size_t>(N), Real(0));
-    if (N >= 2) {
-        axis_d_coeffs_.assign(static_cast<std::size_t>(N) * static_cast<std::size_t>(N - 1), Real(0));
-    }
-    if (N >= 3) {
-        axis_d2_coeffs_.assign(static_cast<std::size_t>(N) * static_cast<std::size_t>(N - 2), Real(0));
-    }
-
-    if (N == 1) {
-        axis_v_coeffs_[0] = Real(1);
-        return;
-    }
-
-    // For each L_i, compute monomial coefficients of P_i(x) = prod_{j != i} (x - x_j),
-    // then divide by w_i = prod_{j != i} (x_i - x_j).
-    std::vector<Real> coeffs;
-    coeffs.reserve(static_cast<std::size_t>(N));
-    for (int i = 0; i < N; ++i) {
-        coeffs.assign(1, Real(1));  // start with constant polynomial 1
-        for (int j = 0; j < N; ++j) {
-            if (j == i) continue;
-            // Multiply (x - x_j) into coeffs (in-place via temp).
-            std::vector<Real> next(coeffs.size() + 1, Real(0));
-            for (std::size_t k = 0; k < coeffs.size(); ++k) {
-                next[k]     -= nodes_1d_[static_cast<std::size_t>(j)] * coeffs[k];
-                next[k + 1] += coeffs[k];
-            }
-            coeffs.swap(next);
-        }
-        // Divide by w_i.
-        Real denom = Real(1);
-        for (int j = 0; j < N; ++j) {
-            if (j == i) continue;
-            denom *= (nodes_1d_[static_cast<std::size_t>(i)] - nodes_1d_[static_cast<std::size_t>(j)]);
-        }
-        const Real inv_denom = Real(1) / denom;
-        for (int k = 0; k < N; ++k) {
-            axis_v_coeffs_[static_cast<std::size_t>(i) * static_cast<std::size_t>(N) + static_cast<std::size_t>(k)]
-                = coeffs[static_cast<std::size_t>(k)] * inv_denom;
-        }
-
-        // First derivative coefficients: d/dx (sum_k c_ik * x^k) = sum_{k>=1} k*c_ik * x^(k-1).
-        if (N >= 2) {
-            for (int k = 1; k < N; ++k) {
-                axis_d_coeffs_[static_cast<std::size_t>(i) * static_cast<std::size_t>(N - 1)
-                              + static_cast<std::size_t>(k - 1)]
-                    = static_cast<Real>(k)
-                      * axis_v_coeffs_[static_cast<std::size_t>(i) * static_cast<std::size_t>(N)
-                                       + static_cast<std::size_t>(k)];
-            }
-        }
-
-        // Second derivative coefficients: d^2/dx^2 = sum_{k>=2} k*(k-1)*c_ik * x^(k-2).
-        if (N >= 3) {
-            for (int k = 2; k < N; ++k) {
-                axis_d2_coeffs_[static_cast<std::size_t>(i) * static_cast<std::size_t>(N - 2)
-                              + static_cast<std::size_t>(k - 2)]
-                    = static_cast<Real>(k * (k - 1))
-                      * axis_v_coeffs_[static_cast<std::size_t>(i) * static_cast<std::size_t>(N)
-                                       + static_cast<std::size_t>(k)];
-            }
-        }
-    }
-}
-
-void LagrangeBasis::build_point_nodes() {
-    nodes_.push_back(math::Vector<Real, 3>{Real(0), Real(0), Real(0)});
-}
-
-void LagrangeBasis::init_equispaced_1d_nodes() {
-    nodes_1d_.clear();
-    for (int i = 0; i <= std::max(order_, 0); ++i) {
-        nodes_1d_.push_back(detail::equispaced_pm_one_coord(i, order_));
-    }
-}
-
-void LagrangeBasis::build_tensor_product_nodes(int dimensions) {
-    init_equispaced_1d_nodes();
-
-    if (dimensions < 1 || dimensions > 3) {
-        throw BasisConfigurationException("LagrangeBasis::build_tensor_product_nodes requires dimension 1, 2, or 3",
-                                          __FILE__, __LINE__, __func__);
-    }
-
-    nodes_ = ReferenceNodeLayout::get_lagrange_node_coords(element_type_, order_);
-    tensor_indices_.resize(nodes_.size(), TensorNodeIndex{0u, 0u, 0u});
-    for (std::size_t n = 0; n < nodes_.size(); ++n) {
-        tensor_indices_[n][0] = lattice_index_pm_one(
-            nodes_[n][0], order_,
-            "LagrangeBasis: invalid tensor-product x-coordinate in public node ordering");
-        if (dimensions >= 2) {
-            tensor_indices_[n][1] = lattice_index_pm_one(
-                nodes_[n][1], order_,
-                "LagrangeBasis: invalid tensor-product y-coordinate in public node ordering");
-        }
-        if (dimensions == 3) {
-            tensor_indices_[n][2] = lattice_index_pm_one(
-                nodes_[n][2], order_,
-                "LagrangeBasis: invalid tensor-product z-coordinate in public node ordering");
-        }
-    }
-}
-
-void LagrangeBasis::build_simplex_nodes() {
-    nodes_ = ReferenceNodeLayout::get_lagrange_node_coords(element_type_, order_);
-    const auto topology = static_cast<LagrangeTopology>(topology_id_);
-    simplex_exponents_.clear();
-    simplex_exponents_.reserve(nodes_.size());
-    for (const auto& node : nodes_) {
-        switch (topology) {
-            case LagrangeTopology::Triangle:
-                simplex_exponents_.push_back(triangle_exponents_from_public_node(node, order_));
-                break;
-            case LagrangeTopology::Tetrahedron:
-                simplex_exponents_.push_back(tetrahedron_exponents_from_public_node(node, order_));
-                break;
-            default:
-                throw BasisElementCompatibilityException("LagrangeBasis::build_simplex_nodes requires simplex topology",
-                                                         __FILE__, __LINE__, __func__);
-        }
-    }
-}
-
-void LagrangeBasis::build_wedge_nodes() {
-    init_equispaced_1d_nodes();
-    const auto triangle_nodes = ReferenceNodeLayout::get_lagrange_node_coords(ElementType::Triangle3, order_);
-    simplex_exponents_.clear();
-    simplex_exponents_.reserve(triangle_nodes.size());
-    std::unordered_map<std::array<int, 4>, std::size_t, SimplexExponentHash> triangle_index_by_exponent;
-    triangle_index_by_exponent.reserve(triangle_nodes.size());
-    for (std::size_t tri = 0; tri < triangle_nodes.size(); ++tri) {
-        const auto exponents = triangle_exponents_from_public_node(triangle_nodes[tri], order_);
-        simplex_exponents_.push_back(exponents);
-        const auto inserted = triangle_index_by_exponent.emplace(exponents, tri);
-        if (!inserted.second) {
-            throw BasisNodeOrderingException("LagrangeBasis: duplicate wedge triangle descriptor",
-                                             __FILE__, __LINE__, __func__);
-        }
-    }
-
-    nodes_ = ReferenceNodeLayout::get_lagrange_node_coords(element_type_, order_);
-    wedge_indices_.clear();
-    wedge_indices_.reserve(nodes_.size());
-    const std::size_t z_count = static_cast<std::size_t>(order_ + 1);
-    const std::size_t missing_node = nodes_.size();
-    wedge_node_by_tri_z_.assign(triangle_nodes.size() * z_count, missing_node);
-    for (std::size_t node_index = 0; node_index < nodes_.size(); ++node_index) {
-        const auto& node = nodes_[node_index];
-        const auto exponents = triangle_exponents_from_public_node(node, order_);
-        const auto found = triangle_index_by_exponent.find(exponents);
-        if (found == triangle_index_by_exponent.end()) {
-            throw BasisNodeOrderingException("LagrangeBasis: failed to resolve wedge triangle descriptor in public ordering",
-                                             __FILE__, __LINE__, __func__);
-        }
-        const std::size_t tri = found->second;
-        const std::size_t z =
-            lattice_index_pm_one(node[2], order_,
-                                 "LagrangeBasis: invalid wedge z-coordinate in public node ordering");
-        wedge_indices_.push_back(WedgeNodeIndex{tri, z});
-        wedge_node_by_tri_z_[tri * z_count + z] = node_index;
-    }
-    for (std::size_t entry = 0; entry < wedge_node_by_tri_z_.size(); ++entry) {
-        if (wedge_node_by_tri_z_[entry] == missing_node) {
-            throw BasisNodeOrderingException("LagrangeBasis: incomplete wedge tensor-product node map",
-                                             __FILE__, __LINE__, __func__);
-        }
-    }
-}
-
-void LagrangeBasis::build_pyramid_nodes() {
-    nodes_ = detail::lagrange_pyramid::nodes(order_);
-}
-
-void LagrangeBasis::evaluate_point_vectors(const math::Vector<Real, 3>&,
-                                           std::vector<Real>* values,
-                                           std::vector<Gradient>* gradients,
-                                           std::vector<Hessian>* hessians) const {
-    if (values != nullptr) {
-        values->resize(1u);
-        (*values)[0] = Real(1);
-    }
-    if (gradients != nullptr) {
-        gradients->resize(1u);
-        (*gradients)[0] = Gradient{};
-    }
-    if (hessians != nullptr) {
-        hessians->resize(1u);
-        (*hessians)[0] = Hessian{};
-    }
-}
-
-void LagrangeBasis::evaluate_tensor_product_vectors(const math::Vector<Real, 3>& xi,
-                                                    std::vector<Real>* values,
-                                                    std::vector<Gradient>* gradients,
-                                                    std::vector<Hessian>* hessians) const {
-    const auto topology = static_cast<LagrangeTopology>(topology_id_);
-    if (evaluate_fixed_lagrange_fast(topology, order_, xi, values, gradients, hessians)) {
-        return;
-    }
-
-    const int n_axis = static_cast<int>(nodes_1d_.size());
-    const Real* vc = axis_v_coeffs_.data();
-    const Real* dc = axis_d_coeffs_.data();
-    const Real* d2c = axis_d2_coeffs_.data();
-    const Real* bw = axis_barycentric_weights_.data();
-    const AxisDeriv level = hessians != nullptr ? AxisDeriv::ValuesAndFirstAndSecond
-                           : gradients != nullptr ? AxisDeriv::ValuesAndFirst
-                                                  : AxisDeriv::ValuesOnly;
-
-    LagrangeEvaluateScratch& scratch = evaluate_scratch();
-    const AxisBasisEvaluations x_axis =
-        fill_axis_scratch(scratch.axis_x, vc, dc, d2c, bw, n_axis, xi[0], level);
-    AxisBasisEvaluations y_axis = constant_axis_basis();
-    AxisBasisEvaluations z_axis = constant_axis_basis();
-
-    if (topology != LagrangeTopology::Line) {
-        y_axis = fill_axis_scratch(scratch.axis_y, vc, dc, d2c, bw, n_axis, xi[1], level);
-    }
-    if (topology == LagrangeTopology::Hexahedron) {
-        z_axis = fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], level);
-    }
-
-    evaluate_tensor_product_factorized(tensor_indices_, x_axis, y_axis, z_axis,
-                                       values, gradients, hessians);
-}
-
-void LagrangeBasis::evaluate_triangle_vectors(const math::Vector<Real, 3>& xi,
-                                              std::vector<Real>* values,
-                                              std::vector<Gradient>* gradients,
-                                              std::vector<Hessian>* hessians) const {
-    const auto topology = static_cast<LagrangeTopology>(topology_id_);
-    if (evaluate_fixed_lagrange_fast(topology, order_, xi, values, gradients, hessians)) {
-        return;
-    }
-    detail::evaluate_triangle_simplex_basis(simplex_exponents_, order_, xi,
-                                            values, gradients, hessians);
-}
-
-void LagrangeBasis::evaluate_tetrahedron_vectors(const math::Vector<Real, 3>& xi,
-                                                 std::vector<Real>* values,
-                                                 std::vector<Gradient>* gradients,
-                                                 std::vector<Hessian>* hessians) const {
-    const auto topology = static_cast<LagrangeTopology>(topology_id_);
-    if (evaluate_fixed_lagrange_fast(topology, order_, xi, values, gradients, hessians)) {
-        return;
-    }
-    detail::evaluate_tetrahedron_simplex_basis(simplex_exponents_, order_, xi,
-                                               values, gradients, hessians);
-}
-
-void LagrangeBasis::evaluate_wedge_vectors(const math::Vector<Real, 3>& xi,
-                                           std::vector<Real>* values,
-                                           std::vector<Gradient>* gradients,
-                                           std::vector<Hessian>* hessians) const {
-    const int n_axis = static_cast<int>(nodes_1d_.size());
-    const Real* vc = axis_v_coeffs_.data();
-    const Real* dc = axis_d_coeffs_.data();
-    const Real* d2c = axis_d2_coeffs_.data();
-    const Real* bw = axis_barycentric_weights_.data();
-    const AxisDeriv level = hessians != nullptr ? AxisDeriv::ValuesAndFirstAndSecond
-                           : gradients != nullptr ? AxisDeriv::ValuesAndFirst
-                                                  : AxisDeriv::ValuesOnly;
-
-    LagrangeEvaluateScratch& scratch = evaluate_scratch();
-    const AxisBasisEvaluations z_axis =
-        fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], level);
-
-    if (hessians != nullptr) {
-        detail::evaluate_triangle_simplex_basis(
-            simplex_exponents_, order_, xi,
-            &scratch.tri_values, &scratch.tri_gradients, &scratch.tri_hessians);
-    } else if (gradients != nullptr) {
-        detail::evaluate_triangle_simplex_basis(
-            simplex_exponents_, order_, xi,
-            &scratch.tri_values, &scratch.tri_gradients, nullptr);
-    } else {
-        detail::evaluate_triangle_simplex_basis(
-            simplex_exponents_, order_, xi,
-            &scratch.tri_values, nullptr, nullptr);
-    }
-
-    const std::size_t n_nodes = wedge_indices_.size();
-    if (values != nullptr) {
-        values->resize(n_nodes);
-    }
-    if (gradients != nullptr) {
-        gradients->resize(n_nodes);
-    }
-    if (hessians != nullptr) {
-        hessians->resize(n_nodes);
-    }
-
-    for (std::size_t n = 0; n < n_nodes; ++n) {
-        const auto& index = wedge_indices_[n];
-        const std::size_t tri_idx = index[0];
-        const std::size_t z_idx = index[1];
-        const Real zv = z_axis.values[z_idx];
-        const Real tri_v = scratch.tri_values[tri_idx];
-
-        if (values != nullptr) {
-            (*values)[n] = tri_v * zv;
-        }
-        if (gradients != nullptr) {
-            const Real zd = z_axis.first[z_idx];
-            (*gradients)[n][0] = scratch.tri_gradients[tri_idx][0] * zv;
-            (*gradients)[n][1] = scratch.tri_gradients[tri_idx][1] * zv;
-            (*gradients)[n][2] = tri_v * zd;
-        }
-        if (hessians != nullptr) {
-            const Real zd = z_axis.first[z_idx];
-            const Real zd2 = z_axis.second[z_idx];
-            Hessian H{};
-            H(0, 0) = scratch.tri_hessians[tri_idx](0, 0) * zv;
-            H(1, 1) = scratch.tri_hessians[tri_idx](1, 1) * zv;
-            H(0, 1) = scratch.tri_hessians[tri_idx](0, 1) * zv;
-            H(1, 0) = H(0, 1);
-            H(2, 2) = tri_v * zd2;
-            H(0, 2) = scratch.tri_gradients[tri_idx][0] * zd;
-            H(2, 0) = H(0, 2);
-            H(1, 2) = scratch.tri_gradients[tri_idx][1] * zd;
-            H(2, 1) = H(1, 2);
-            (*hessians)[n] = H;
-        }
-    }
-}
-
-void LagrangeBasis::evaluate_pyramid_vectors(const math::Vector<Real, 3>& xi,
-                                             std::vector<Real>* values,
-                                             std::vector<Gradient>* gradients,
-                                             std::vector<Hessian>* hessians) const {
-    if (values != nullptr && gradients != nullptr && hessians != nullptr) {
-        detail::lagrange_pyramid::evaluate_all(order_, xi, *values, *gradients, *hessians);
-        return;
-    }
-    if (values != nullptr) {
-        detail::lagrange_pyramid::evaluate_values(order_, xi, *values);
-    }
-    if (gradients != nullptr) {
-        detail::lagrange_pyramid::evaluate_gradients(order_, xi, *gradients);
-    }
-    if (hessians != nullptr) {
-        detail::lagrange_pyramid::evaluate_hessians(order_, xi, *hessians);
-    }
-}
-
-void LagrangeBasis::evaluate_unsupported_vectors(const math::Vector<Real, 3>&,
-                                                 std::vector<Real>*,
-                                                 std::vector<Gradient>*,
-                                                 std::vector<Hessian>*) const {
-    throw BasisEvaluationException("Unsupported element in LagrangeBasis vector evaluation",
-                                   __FILE__, __LINE__, __func__);
-}
-
-void LagrangeBasis::evaluate_values(const math::Vector<Real, 3>& xi,
-                                    std::vector<Real>& values) const {
-    (this->*vector_evaluation_dispatch_)(xi, &values, nullptr, nullptr);
-}
-
-void LagrangeBasis::evaluate_gradients(const math::Vector<Real, 3>& xi,
-                                       std::vector<Gradient>& gradients) const {
-    (this->*vector_evaluation_dispatch_)(xi, nullptr, &gradients, nullptr);
-}
-
-void LagrangeBasis::evaluate_hessians(const math::Vector<Real, 3>& xi,
-                                      std::vector<Hessian>& hessians) const {
-    (this->*vector_evaluation_dispatch_)(xi, nullptr, nullptr, &hessians);
-}
-
-void LagrangeBasis::evaluate_all(const math::Vector<Real, 3>& xi,
-                                 std::vector<Real>& values,
-                                 std::vector<Gradient>& gradients,
-                                 std::vector<Hessian>& hessians) const {
-    (this->*vector_evaluation_dispatch_)(xi, &values, &gradients, &hessians);
-}
-
-void LagrangeBasis::evaluate_values_to(const math::Vector<Real, 3>& xi,
-                                       Real* SVMP_RESTRICT values_out) const {
-    const auto topology = static_cast<LagrangeTopology>(topology_id_);
-    if (evaluate_fixed_lagrange_fast_to(topology, order_, xi, values_out, nullptr, nullptr)) {
-        return;
-    }
-
-    const int n_axis = static_cast<int>(nodes_1d_.size());
-    const Real* vc = axis_v_coeffs_.data();
-    const Real* dc = axis_d_coeffs_.data();
-    const Real* d2c = axis_d2_coeffs_.data();
-    const Real* bw = axis_barycentric_weights_.data();
-    switch (topology) {
-        case LagrangeTopology::Point:
-            values_out[0] = Real(1);
-            return;
-        case LagrangeTopology::Line:
-        case LagrangeTopology::Quadrilateral:
-        case LagrangeTopology::Hexahedron: {
-            LagrangeEvaluateScratch& scratch = evaluate_scratch();
-            const AxisBasisEvaluations x_axis =
-                fill_axis_scratch(scratch.axis_x, vc, dc, d2c, bw, n_axis, xi[0], AxisDeriv::ValuesOnly);
-            AxisBasisEvaluations y_axis = constant_axis_basis();
-            AxisBasisEvaluations z_axis = constant_axis_basis();
-            if (topology != LagrangeTopology::Line) {
-                y_axis = fill_axis_scratch(scratch.axis_y, vc, dc, d2c, bw, n_axis, xi[1], AxisDeriv::ValuesOnly);
-            }
-            if (topology == LagrangeTopology::Hexahedron) {
-                z_axis = fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], AxisDeriv::ValuesOnly);
-            }
-            evaluate_tensor_product_factorized_to(tensor_indices_, x_axis, y_axis, z_axis,
-                                                  values_out, nullptr, nullptr);
-            return;
-        }
-        case LagrangeTopology::Triangle:
-            detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
-                                                       values_out, nullptr, nullptr);
-            return;
-        case LagrangeTopology::Tetrahedron:
-            detail::evaluate_tetrahedron_simplex_basis_to(simplex_exponents_, order_, xi,
-                                                          values_out, nullptr, nullptr);
-            return;
-        case LagrangeTopology::Wedge: {
-            LagrangeEvaluateScratch& scratch = evaluate_scratch();
-            const AxisBasisEvaluations z_axis =
-                fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], AxisDeriv::ValuesOnly);
-            scratch.tri_values.resize(simplex_exponents_.size());
-            detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
-                                                       scratch.tri_values.data(), nullptr, nullptr);
-            for (std::size_t n = 0; n < wedge_indices_.size(); ++n) {
-                const auto& index = wedge_indices_[n];
-                values_out[n] = scratch.tri_values[index[0]] * z_axis.values[index[1]];
-            }
-            return;
-        }
-        case LagrangeTopology::Pyramid: {
-            detail::lagrange_pyramid::evaluate_values_to(order_, xi, values_out);
-            return;
-        }
-        case LagrangeTopology::Unknown:
-            break;
-    }
-
-    throw BasisEvaluationException("Unsupported element in evaluate_values_to",
-                                   __FILE__, __LINE__, __func__);
-}
-
-void LagrangeBasis::evaluate_gradients_to(const math::Vector<Real, 3>& xi,
-                                          Real* SVMP_RESTRICT gradients_out) const {
-    const auto topology = static_cast<LagrangeTopology>(topology_id_);
-    if (evaluate_fixed_lagrange_fast_to(topology, order_, xi, nullptr, gradients_out, nullptr)) {
-        return;
-    }
-
-    const int n_axis = static_cast<int>(nodes_1d_.size());
-    const Real* vc = axis_v_coeffs_.data();
-    const Real* dc = axis_d_coeffs_.data();
-    const Real* d2c = axis_d2_coeffs_.data();
-    const Real* bw = axis_barycentric_weights_.data();
-    switch (topology) {
-        case LagrangeTopology::Point:
-            gradients_out[0] = Real(0);
-            gradients_out[1] = Real(0);
-            gradients_out[2] = Real(0);
-            return;
-        case LagrangeTopology::Line:
-        case LagrangeTopology::Quadrilateral:
-        case LagrangeTopology::Hexahedron: {
-            LagrangeEvaluateScratch& scratch = evaluate_scratch();
-            const AxisBasisEvaluations x_axis =
-                fill_axis_scratch(scratch.axis_x, vc, dc, d2c, bw, n_axis, xi[0], AxisDeriv::ValuesAndFirst);
-            AxisBasisEvaluations y_axis = constant_axis_basis();
-            AxisBasisEvaluations z_axis = constant_axis_basis();
-            if (topology != LagrangeTopology::Line) {
-                y_axis = fill_axis_scratch(scratch.axis_y, vc, dc, d2c, bw, n_axis, xi[1], AxisDeriv::ValuesAndFirst);
-            }
-            if (topology == LagrangeTopology::Hexahedron) {
-                z_axis = fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], AxisDeriv::ValuesAndFirst);
-            }
-            evaluate_tensor_product_factorized_to(tensor_indices_, x_axis, y_axis, z_axis,
-                                                  nullptr, gradients_out, nullptr);
-            return;
-        }
-        case LagrangeTopology::Triangle:
-            detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
-                                                       nullptr, gradients_out, nullptr);
-            return;
-        case LagrangeTopology::Tetrahedron:
-            detail::evaluate_tetrahedron_simplex_basis_to(simplex_exponents_, order_, xi,
-                                                          nullptr, gradients_out, nullptr);
-            return;
-        case LagrangeTopology::Wedge: {
-            LagrangeEvaluateScratch& scratch = evaluate_scratch();
-            const AxisBasisEvaluations z_axis =
-                fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], AxisDeriv::ValuesAndFirst);
-            const std::size_t tri_count = simplex_exponents_.size();
-            scratch.tri_values.resize(tri_count);
-            scratch.tri_gradient_components.resize(tri_count * 3u);
-            detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
-                                                       scratch.tri_values.data(),
-                                                       scratch.tri_gradient_components.data(),
-                                                       nullptr);
-            for (std::size_t n = 0; n < wedge_indices_.size(); ++n) {
-                const auto& index = wedge_indices_[n];
-                const std::size_t tri = index[0];
-                const std::size_t z = index[1];
-                const Real* tri_g = scratch.tri_gradient_components.data() + tri * 3u;
-                Real* g = gradients_out + n * 3u;
-                g[0] = tri_g[0] * z_axis.values[z];
-                g[1] = tri_g[1] * z_axis.values[z];
-                g[2] = scratch.tri_values[tri] * z_axis.first[z];
-            }
-            return;
-        }
-        case LagrangeTopology::Pyramid: {
-            detail::lagrange_pyramid::evaluate_gradients_to(order_, xi, gradients_out);
-            return;
-        }
-        case LagrangeTopology::Unknown:
-            break;
-    }
-
-    throw BasisEvaluationException("Unsupported element in evaluate_gradients_to",
-                                   __FILE__, __LINE__, __func__);
-}
-
-void LagrangeBasis::evaluate_hessians_to(const math::Vector<Real, 3>& xi,
+void LagrangeBasis::evaluate_hessians_to(const Vec3& xi,
                                          Real* SVMP_RESTRICT hessians_out) const {
-    const auto topology = static_cast<LagrangeTopology>(topology_id_);
-    if (evaluate_fixed_lagrange_fast_to(topology, order_, xi, nullptr, nullptr, hessians_out)) {
-        return;
-    }
-
-    const int n_axis = static_cast<int>(nodes_1d_.size());
-    const Real* vc = axis_v_coeffs_.data();
-    const Real* dc = axis_d_coeffs_.data();
-    const Real* d2c = axis_d2_coeffs_.data();
-    const Real* bw = axis_barycentric_weights_.data();
-    switch (topology) {
-        case LagrangeTopology::Point:
-            for (std::size_t i = 0; i < 9; ++i) {
-                hessians_out[i] = Real(0);
-            }
-            return;
-        case LagrangeTopology::Line:
-        case LagrangeTopology::Quadrilateral:
-        case LagrangeTopology::Hexahedron: {
-            LagrangeEvaluateScratch& scratch = evaluate_scratch();
-            const AxisBasisEvaluations x_axis =
-                fill_axis_scratch(scratch.axis_x, vc, dc, d2c, bw, n_axis, xi[0], AxisDeriv::ValuesAndFirstAndSecond);
-            AxisBasisEvaluations y_axis = constant_axis_basis();
-            AxisBasisEvaluations z_axis = constant_axis_basis();
-            if (topology != LagrangeTopology::Line) {
-                y_axis = fill_axis_scratch(scratch.axis_y, vc, dc, d2c, bw, n_axis, xi[1], AxisDeriv::ValuesAndFirstAndSecond);
-            }
-            if (topology == LagrangeTopology::Hexahedron) {
-                z_axis = fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], AxisDeriv::ValuesAndFirstAndSecond);
-            }
-            evaluate_tensor_product_factorized_to(tensor_indices_, x_axis, y_axis, z_axis,
-                                                  nullptr, nullptr, hessians_out);
-            return;
-        }
-        case LagrangeTopology::Triangle:
-            detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
-                                                       nullptr, nullptr, hessians_out);
-            return;
-        case LagrangeTopology::Tetrahedron:
-            detail::evaluate_tetrahedron_simplex_basis_to(simplex_exponents_, order_, xi,
-                                                          nullptr, nullptr, hessians_out);
-            return;
-        case LagrangeTopology::Wedge: {
-            LagrangeEvaluateScratch& scratch = evaluate_scratch();
-            const AxisBasisEvaluations z_axis =
-                fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], AxisDeriv::ValuesAndFirstAndSecond);
-            const std::size_t tri_count = simplex_exponents_.size();
-            scratch.tri_values.resize(tri_count);
-            scratch.tri_gradient_components.resize(tri_count * 3u);
-            scratch.tri_hessian_components.resize(tri_count * 9u);
-            detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
-                                                       scratch.tri_values.data(),
-                                                       scratch.tri_gradient_components.data(),
-                                                       scratch.tri_hessian_components.data());
-            for (std::size_t n = 0; n < wedge_indices_.size(); ++n) {
-                const auto& index = wedge_indices_[n];
-                const std::size_t tri = index[0];
-                const std::size_t z = index[1];
-                const Real zv = z_axis.values[z];
-                const Real zd = z_axis.first[z];
-                const Real zd2 = z_axis.second[z];
-                const Real* tri_g = scratch.tri_gradient_components.data() + tri * 3u;
-                const Real* tri_H = scratch.tri_hessian_components.data() + tri * 9u;
-                const Real hxy = tri_H[1] * zv;
-                const Real hxz = tri_g[0] * zd;
-                const Real hyz = tri_g[1] * zd;
-                Real* H = hessians_out + n * 9u;
-                H[0] = tri_H[0] * zv;
-                H[4] = tri_H[4] * zv;
-                H[1] = hxy;
-                H[3] = hxy;
-                H[8] = scratch.tri_values[tri] * zd2;
-                H[2] = hxz;
-                H[6] = hxz;
-                H[5] = hyz;
-                H[7] = hyz;
-            }
-            return;
-        }
-        case LagrangeTopology::Pyramid: {
-            detail::lagrange_pyramid::evaluate_hessians_to(order_, xi, hessians_out);
-            return;
-        }
-        case LagrangeTopology::Unknown:
-            break;
-    }
-
-    throw BasisEvaluationException("Unsupported element in evaluate_hessians_to",
-                                   __FILE__, __LINE__, __func__);
-}
-
-void LagrangeBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,
-                                    Real* SVMP_RESTRICT values_out,
-                                    Real* SVMP_RESTRICT gradients_out,
-                                    Real* SVMP_RESTRICT hessians_out) const {
-    const auto topology = static_cast<LagrangeTopology>(topology_id_);
-    if (evaluate_fixed_lagrange_fast_to(topology, order_, xi, values_out, gradients_out, hessians_out)) {
-        return;
-    }
-
-    const int n_axis = static_cast<int>(nodes_1d_.size());
-    const Real* vc = axis_v_coeffs_.data();
-    const Real* dc = axis_d_coeffs_.data();
-    const Real* d2c = axis_d2_coeffs_.data();
-    const Real* bw = axis_barycentric_weights_.data();
-    switch (topology) {
-        case LagrangeTopology::Point:
-            values_out[0] = Real(1);
-            gradients_out[0] = Real(0);
-            gradients_out[1] = Real(0);
-            gradients_out[2] = Real(0);
-            for (std::size_t i = 0; i < 9; ++i) {
-                hessians_out[i] = Real(0);
-            }
-            return;
-        case LagrangeTopology::Line:
-        case LagrangeTopology::Quadrilateral:
-        case LagrangeTopology::Hexahedron: {
-            LagrangeEvaluateScratch& scratch = evaluate_scratch();
-            const AxisBasisEvaluations x_axis =
-                fill_axis_scratch(scratch.axis_x, vc, dc, d2c, bw, n_axis, xi[0], AxisDeriv::ValuesAndFirstAndSecond);
-            AxisBasisEvaluations y_axis = constant_axis_basis();
-            AxisBasisEvaluations z_axis = constant_axis_basis();
-            if (topology != LagrangeTopology::Line) {
-                y_axis = fill_axis_scratch(scratch.axis_y, vc, dc, d2c, bw, n_axis, xi[1], AxisDeriv::ValuesAndFirstAndSecond);
-            }
-            if (topology == LagrangeTopology::Hexahedron) {
-                z_axis = fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], AxisDeriv::ValuesAndFirstAndSecond);
-            }
-            evaluate_tensor_product_factorized_to(tensor_indices_, x_axis, y_axis, z_axis,
-                                                  values_out, gradients_out, hessians_out);
-            return;
-        }
-        case LagrangeTopology::Triangle:
-            detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
-                                                       values_out, gradients_out, hessians_out);
-            return;
-        case LagrangeTopology::Tetrahedron:
-            detail::evaluate_tetrahedron_simplex_basis_to(simplex_exponents_, order_, xi,
-                                                          values_out, gradients_out, hessians_out);
-            return;
-        case LagrangeTopology::Wedge: {
-            LagrangeEvaluateScratch& scratch = evaluate_scratch();
-            const AxisBasisEvaluations z_axis =
-                fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], AxisDeriv::ValuesAndFirstAndSecond);
-            const std::size_t tri_count = simplex_exponents_.size();
-            scratch.tri_values.resize(tri_count);
-            scratch.tri_gradient_components.resize(tri_count * 3u);
-            scratch.tri_hessian_components.resize(tri_count * 9u);
-            detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
-                                                       scratch.tri_values.data(),
-                                                       scratch.tri_gradient_components.data(),
-                                                       scratch.tri_hessian_components.data());
-            for (std::size_t n = 0; n < wedge_indices_.size(); ++n) {
-                const auto& index = wedge_indices_[n];
-                const std::size_t tri = index[0];
-                const std::size_t z = index[1];
-                const Real zv = z_axis.values[z];
-                const Real zd = z_axis.first[z];
-                const Real zd2 = z_axis.second[z];
-                const Real tri_v = scratch.tri_values[tri];
-                const Real* tri_g = scratch.tri_gradient_components.data() + tri * 3u;
-                const Real* tri_H = scratch.tri_hessian_components.data() + tri * 9u;
-                const Real hxy = tri_H[1] * zv;
-                const Real hxz = tri_g[0] * zd;
-                const Real hyz = tri_g[1] * zd;
-
-                values_out[n] = tri_v * zv;
-
-                Real* g = gradients_out + n * 3u;
-                g[0] = tri_g[0] * zv;
-                g[1] = tri_g[1] * zv;
-                g[2] = tri_v * zd;
-
-                Real* H = hessians_out + n * 9u;
-                H[0] = tri_H[0] * zv;
-                H[4] = tri_H[4] * zv;
-                H[1] = hxy;
-                H[3] = hxy;
-                H[8] = tri_v * zd2;
-                H[2] = hxz;
-                H[6] = hxz;
-                H[5] = hyz;
-                H[7] = hyz;
-            }
-            return;
-        }
-        case LagrangeTopology::Pyramid: {
-            detail::lagrange_pyramid::evaluate_all_to(
-                order_, xi, values_out, gradients_out, hessians_out);
-            return;
-        }
-        case LagrangeTopology::Unknown:
-            break;
-    }
-
-    throw BasisEvaluationException("Unsupported element in evaluate_all_to",
-                                   __FILE__, __LINE__, __func__);
-}
-
-void LagrangeBasis::evaluate_at_quadrature_points(
-    const std::vector<math::Vector<Real, 3>>& points,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) const {
-    evaluate_at_quadrature_points_strided(points, points.size(), values_out, gradients_out, hessians_out);
-}
-
-void LagrangeBasis::evaluate_at_quadrature_points_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) const {
-    const std::size_t num_qpts = points.size();
-    const std::size_t num_dofs = size();
-    if (output_stride < num_qpts) {
-        throw BasisConfigurationException("LagrangeBasis strided evaluation requires output_stride >= points.size()",
-                                          __FILE__, __LINE__, __func__);
-    }
-    if (values_out == nullptr && gradients_out == nullptr && hessians_out == nullptr) {
-        return;
-    }
-
-    const auto topology = static_cast<LagrangeTopology>(topology_id_);
-    if (evaluate_fixed_lagrange_fast_strided(topology,
-                                             order_,
-                                             points,
-                                             output_stride,
-                                             values_out,
-                                             gradients_out,
-                                             hessians_out)) {
-        return;
-    }
-
-    if (topology == LagrangeTopology::Line ||
-        topology == LagrangeTopology::Quadrilateral ||
-        topology == LagrangeTopology::Hexahedron) {
-        evaluate_tensor_product_points_strided(topology,
-                                               tensor_indices_,
-                                               points,
-                                               output_stride,
-                                               axis_v_coeffs_.data(),
-                                               axis_d_coeffs_.data(),
-                                               axis_d2_coeffs_.data(),
-                                               axis_barycentric_weights_.data(),
-                                               static_cast<int>(nodes_1d_.size()),
-                                               values_out,
-                                               gradients_out,
-                                               hessians_out);
-        return;
-    }
-
-    if (topology == LagrangeTopology::Triangle) {
-        detail::evaluate_triangle_simplex_basis_strided(
-            simplex_exponents_, order_, points, output_stride, values_out, gradients_out, hessians_out);
-        return;
-    }
-
-    if (topology == LagrangeTopology::Tetrahedron) {
-        detail::evaluate_tetrahedron_simplex_basis_strided(
-            simplex_exponents_, order_, points, output_stride, values_out, gradients_out, hessians_out);
-        return;
-    }
-
-    if (topology == LagrangeTopology::Wedge &&
-        evaluate_wedge_fast_strided(order_,
-                                    wedge_indices_,
-                                    points,
-                                    output_stride,
-                                    values_out,
-                                    gradients_out,
-                                    hessians_out)) {
-        return;
-    }
-
-    const bool wedge_scalar_hessian_fallback =
-        topology == LagrangeTopology::Wedge &&
-        values_out == nullptr &&
-        gradients_out == nullptr &&
-        hessians_out != nullptr &&
-        order_ <= 2;
-    if (topology == LagrangeTopology::Wedge && !wedge_scalar_hessian_fallback) {
-        evaluate_wedge_points_strided(simplex_exponents_,
-                                      wedge_indices_,
-                                      wedge_node_by_tri_z_,
-                                      order_,
-                                      points,
-                                      output_stride,
-                                      axis_v_coeffs_.data(),
-                                      axis_d_coeffs_.data(),
-                                      axis_d2_coeffs_.data(),
-                                      axis_barycentric_weights_.data(),
-                                      static_cast<int>(nodes_1d_.size()),
-                                      values_out,
-                                      gradients_out,
-                                      hessians_out);
-        return;
-    }
-
-    if (topology == LagrangeTopology::Pyramid) {
-        detail::lagrange_pyramid::evaluate_at_quadrature_points_strided(
-            order_, points, output_stride, values_out, gradients_out, hessians_out);
-        return;
-    }
-
-    auto& scratch = evaluate_scratch();
-    auto& v_tmp = scratch.strided_values_tmp;
-    auto& g_tmp = scratch.strided_gradients_tmp;
-    auto& h_tmp = scratch.strided_hessians_tmp;
-
-    if (values_out)    v_tmp.resize(num_dofs);
-    if (gradients_out) g_tmp.resize(num_dofs * 3u);
-    if (hessians_out)  h_tmp.resize(num_dofs * 9u);
-
-    for (std::size_t q = 0; q < num_qpts; ++q) {
-        if (values_out && gradients_out && hessians_out) {
-            evaluate_all_to(points[q], v_tmp.data(), g_tmp.data(), h_tmp.data());
-        } else {
-            if (values_out)    evaluate_values_to(points[q], v_tmp.data());
-            if (gradients_out) evaluate_gradients_to(points[q], g_tmp.data());
-            if (hessians_out)  evaluate_hessians_to(points[q], h_tmp.data());
-        }
-
-        if (values_out) {
-            for (std::size_t d = 0; d < num_dofs; ++d) {
-                values_out[d * output_stride + q] = v_tmp[d];
-            }
-        }
-        if (gradients_out) {
-            for (std::size_t d = 0; d < num_dofs; ++d) {
-                gradients_out[(d * 3u + 0u) * output_stride + q] = g_tmp[d * 3u + 0u];
-                gradients_out[(d * 3u + 1u) * output_stride + q] = g_tmp[d * 3u + 1u];
-                gradients_out[(d * 3u + 2u) * output_stride + q] = g_tmp[d * 3u + 2u];
-            }
-        }
-        if (hessians_out) {
-            for (std::size_t d = 0; d < num_dofs; ++d) {
-                scatter_hessian_components_strided(
-                    h_tmp.data() + d * 9u,
-                    hessians_out + d * 9u * output_stride,
-                    output_stride,
-                    q);
-            }
-        }
-    }
+    evaluate_all_to(xi, nullptr, nullptr, hessians_out);
 }
 
 } // namespace basis
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.h b/Code/Source/solver/FE/Basis/LagrangeBasis.h
index 91f7e379c..a5fe8e0fa 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.h
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.h
@@ -8,12 +8,9 @@
 #ifndef SVMP_FE_BASIS_LAGRANGEBASIS_H
 #define SVMP_FE_BASIS_LAGRANGEBASIS_H
 
-/**
- * @file LagrangeBasis.h
- * @brief Nodal Lagrange polynomial basis on reference elements
- */
-
 #include "BasisFunction.h"
+#include "BasisTraits.h"
+
 #include <array>
 #include <cstddef>
 
@@ -23,33 +20,12 @@ namespace basis {
 
 void prewarm_lagrange_basis_scratch(int max_order, std::size_t max_qpts = 0);
 
-/**
- * @brief Complete nodal H1 Lagrange basis on canonical reference topologies
- *
- * Supports arbitrary polynomial order on the canonical complete families:
- * `Line2`, `Triangle3`, `Quad4`, `Tetra4`, `Hex8`, `Wedge6`, and `Pyramid5`.
- * Low-order complete-family aliases (`Line3`, `Triangle6`, `Quad9`,
- * `Tetra10`, `Hex27`, `Wedge18`, `Pyramid14`) normalize to their canonical
- * topology plus order. Serendipity variants remain intentionally excluded.
- *
- * Node locations are generated on canonical reference elements using
- * equispaced coordinates on tensor-product elements, barycentric grids on
- * simplices, tensorized triangle-line grids on wedges, and a rational nodal
- * pyramid construction on `Pyramid5`.
- *
- * The evaluator is numerically stabilized for those nodes, but the
- * interpolation problem itself remains the equispaced Lagrange problem. For
- * high-order interpolation, especially order >= 4, prefer `SpectralBasis`
- * (GLL / Warp & Blend nodes) unless exact equispaced nodal placement is part
- * of the requested discretization.
- *
- * For the rational pyramid family, basis values remain exact at the apex.
- * Gradients and Hessians are analytic on the supported interior reference
- * domain, but the exact-apex nodal derivative limit is not unique and those
- * derivative queries throw at the exact apex.
- */
 class LagrangeBasis : public BasisFunction {
 public:
+    using TensorNodeIndex = std::array<std::size_t, 3>;
+    using SimplexExponent = std::array<int, 4>;
+    using WedgeNodeIndex = std::array<std::size_t, 2>;
+
     LagrangeBasis(ElementType type, int order);
 
     BasisType basis_type() const noexcept override { return BasisType::Lagrange; }
@@ -57,7 +33,6 @@ class LagrangeBasis : public BasisFunction {
     int dimension() const noexcept override { return dimension_; }
     int order() const noexcept override { return order_; }
     std::size_t size() const noexcept override { return nodes_.size(); }
-    bool cache_identity_is_structural() const noexcept override { return true; }
 
     const std::vector<math::Vector<Real, 3>>& nodes() const noexcept { return nodes_; }
 
@@ -72,96 +47,32 @@ class LagrangeBasis : public BasisFunction {
                       std::vector<Gradient>& gradients,
                       std::vector<Hessian>& hessians) const final;
 
-    void evaluate_at_quadrature_points(
-        const std::vector<math::Vector<Real, 3>>& points,
-        Real* SVMP_RESTRICT values_out,
-        Real* SVMP_RESTRICT gradients_out,
-        Real* SVMP_RESTRICT hessians_out) const final;
-    void evaluate_at_quadrature_points_strided(
-        const std::vector<math::Vector<Real, 3>>& points,
-        std::size_t output_stride,
-        Real* SVMP_RESTRICT values_out,
-        Real* SVMP_RESTRICT gradients_out,
-        Real* SVMP_RESTRICT hessians_out) const final;
-
-    // Raw-pointer output API. Caller must pre-size buffers to size().
-    void evaluate_values_to(const math::Vector<Real, 3>& xi, Real* SVMP_RESTRICT values_out) const final;
-    void evaluate_gradients_to(const math::Vector<Real, 3>& xi, Real* SVMP_RESTRICT gradients_out) const final;
-    void evaluate_hessians_to(const math::Vector<Real, 3>& xi, Real* SVMP_RESTRICT hessians_out) const final;
+    void evaluate_values_to(const math::Vector<Real, 3>& xi,
+                            Real* SVMP_RESTRICT values_out) const final;
+    void evaluate_gradients_to(const math::Vector<Real, 3>& xi,
+                               Real* SVMP_RESTRICT gradients_out) const final;
+    void evaluate_hessians_to(const math::Vector<Real, 3>& xi,
+                              Real* SVMP_RESTRICT hessians_out) const final;
 
 private:
-    using TensorNodeIndex = std::array<std::size_t, 3>;
-    using WedgeNodeIndex = std::array<std::size_t, 2>;
-    using VectorEvaluationDispatch = void (LagrangeBasis::*)(
-        const math::Vector<Real, 3>&,
-        std::vector<Real>*,
-        std::vector<Gradient>*,
-        std::vector<Hessian>*) const;
-
-    // Cached topology encoded as int because the topology enum lives in
-    // the .cpp anon namespace. Set once in init_nodes.
-    int topology_id_ = 0;
-
     ElementType element_type_;
-    int dimension_;
-    int order_;
+    BasisTopology topology_{BasisTopology::Unknown};
+    int dimension_{0};
+    int order_{0};
 
     std::vector<Real> nodes_1d_;
     std::vector<math::Vector<Real, 3>> nodes_;
     std::vector<TensorNodeIndex> tensor_indices_;
-    std::vector<std::array<int, 4>> simplex_exponents_;
+    std::vector<SimplexExponent> simplex_exponents_;
     std::vector<WedgeNodeIndex> wedge_indices_;
-    std::vector<std::size_t> wedge_node_by_tri_z_;
-
-    // Precomputed Horner-form coefficients of the 1D Lagrange basis.
-    // Layout per axis (n_axis = nodes_1d_.size() = order_+1):
-    //   axis_v_coeffs_[i * n_axis + k] = coeff of x^k in L_i(x), 0 <= i,k < n_axis
-    //   axis_d_coeffs_[i * (n_axis - 1) + k] = coeff of x^k in L_i'(x)
-    //   axis_d2_coeffs_[i * (n_axis - 2) + k] = coeff of x^k in L_i''(x)  (only if n_axis >= 3)
-    // Populated by build_tensor_product_nodes / build_wedge_nodes.
-    std::vector<Real> axis_v_coeffs_;
-    std::vector<Real> axis_d_coeffs_;
-    std::vector<Real> axis_d2_coeffs_;
-    std::vector<Real> axis_barycentric_weights_;
-    VectorEvaluationDispatch vector_evaluation_dispatch_{nullptr};
 
     void init_nodes();
-    void init_evaluation_dispatch();
     void build_point_nodes();
     void build_tensor_product_nodes(int dimensions);
     void build_simplex_nodes();
     void build_wedge_nodes();
-    void build_pyramid_nodes();
     void init_equispaced_1d_nodes();
-    void compute_axis_monomial_coefficients();
-    void evaluate_point_vectors(const math::Vector<Real, 3>& xi,
-                                std::vector<Real>* values,
-                                std::vector<Gradient>* gradients,
-                                std::vector<Hessian>* hessians) const;
-    void evaluate_tensor_product_vectors(const math::Vector<Real, 3>& xi,
-                                         std::vector<Real>* values,
-                                         std::vector<Gradient>* gradients,
-                                         std::vector<Hessian>* hessians) const;
-    void evaluate_triangle_vectors(const math::Vector<Real, 3>& xi,
-                                   std::vector<Real>* values,
-                                   std::vector<Gradient>* gradients,
-                                   std::vector<Hessian>* hessians) const;
-    void evaluate_tetrahedron_vectors(const math::Vector<Real, 3>& xi,
-                                      std::vector<Real>* values,
-                                      std::vector<Gradient>* gradients,
-                                      std::vector<Hessian>* hessians) const;
-    void evaluate_wedge_vectors(const math::Vector<Real, 3>& xi,
-                                std::vector<Real>* values,
-                                std::vector<Gradient>* gradients,
-                                std::vector<Hessian>* hessians) const;
-    void evaluate_pyramid_vectors(const math::Vector<Real, 3>& xi,
-                                  std::vector<Real>* values,
-                                  std::vector<Gradient>* gradients,
-                                  std::vector<Hessian>* hessians) const;
-    void evaluate_unsupported_vectors(const math::Vector<Real, 3>& xi,
-                                      std::vector<Real>* values,
-                                      std::vector<Gradient>* gradients,
-                                      std::vector<Hessian>* hessians) const;
+
     void evaluate_all_to(const math::Vector<Real, 3>& xi,
                          Real* SVMP_RESTRICT values_out,
                          Real* SVMP_RESTRICT gradients_out,
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasisFast.h b/Code/Source/solver/FE/Basis/LagrangeBasisFast.h
deleted file mode 100644
index 5b9faae04..000000000
--- a/Code/Source/solver/FE/Basis/LagrangeBasisFast.h
+++ /dev/null
@@ -1,1378 +0,0 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
-
-#ifndef SVMP_FE_BASIS_LAGRANGEBASISFAST_H
-#define SVMP_FE_BASIS_LAGRANGEBASISFAST_H
-
-/**
- * @file LagrangeBasisFast.h
- * @brief Header-only zero-overhead specializations of the Lagrange basis
- *
- * Provides templated static methods for the common nodal Lagrange families
- * with compile-time-known polynomial order. Callers that know their basis
- * type and order at compile time use these directly — there is no virtual
- * dispatch, no std::vector allocation, no scratch lookup, and no topology
- * switch. The output buffers are stack-allocated std::array, sized at
- * compile time. The compiler fully unrolls and constant-folds.
- *
- * These specializations are an alternative entry point to the runtime path
- * provided by `LagrangeBasis`. The runtime path remains the canonical API
- * for generic callers; these specializations serve hot loops that know the
- * element type.
- *
- * Node orderings match `ReferenceNodeLayout::get_lagrange_node_coords(...)` (VTK).
- */
-
-#include "Types.h"
-#include "Math/Vector.h"
-#include "Math/Matrix.h"
-#include <array>
-#include <cstddef>
-
-namespace svmp {
-namespace FE {
-namespace basis {
-
-using Gradient = math::Vector<Real, 3>;
-using Hessian  = math::Matrix<Real, 3, 3>;
-
-namespace detail {
-
-constexpr Gradient scaled_gradient(const Gradient& gradient, Real scale) {
-    return Gradient{scale * gradient[0], scale * gradient[1], scale * gradient[2]};
-}
-
-constexpr Gradient p2_edge_gradient(Real left,
-                                    const Gradient& left_gradient,
-                                    Real right,
-                                    const Gradient& right_gradient) {
-    return Gradient{
-        Real(4) * (left_gradient[0] * right + right_gradient[0] * left),
-        Real(4) * (left_gradient[1] * right + right_gradient[1] * left),
-        Real(4) * (left_gradient[2] * right + right_gradient[2] * left),
-    };
-}
-
-constexpr Hessian p2_vertex_hessian(const Gradient& gradient) {
-    Hessian hessian{};
-    for (std::size_t row = 0; row < 3u; ++row) {
-        for (std::size_t col = 0; col < 3u; ++col) {
-            hessian(row, col) = Real(4) * gradient[row] * gradient[col];
-        }
-    }
-    return hessian;
-}
-
-constexpr Hessian p2_edge_hessian(const Gradient& left_gradient,
-                                  const Gradient& right_gradient) {
-    Hessian hessian{};
-    for (std::size_t row = 0; row < 3u; ++row) {
-        for (std::size_t col = 0; col < 3u; ++col) {
-            hessian(row, col) = Real(4) * (
-                left_gradient[row] * right_gradient[col] +
-                right_gradient[row] * left_gradient[col]);
-        }
-    }
-    return hessian;
-}
-
-constexpr std::size_t public_axis_index(int lattice, int order) noexcept {
-    return lattice == 0 ? 0u :
-           lattice == order ? 1u :
-           static_cast<std::size_t>(lattice + 1);
-}
-
-template<int Order>
-constexpr Real public_axis_coord(std::size_t public_index) noexcept {
-    const int lattice = public_index == 0u ? 0 :
-                        public_index == 1u ? Order :
-                        static_cast<int>(public_index) - 1;
-    return Real(-1) + Real(2) * static_cast<Real>(lattice) / static_cast<Real>(Order);
-}
-
-template<int Order>
-constexpr std::array<Real, Order + 1> make_public_axis_nodes() {
-    std::array<Real, Order + 1> nodes{};
-    for (std::size_t i = 0; i < nodes.size(); ++i) {
-        nodes[i] = public_axis_coord<Order>(i);
-    }
-    return nodes;
-}
-
-template<int Order>
-constexpr std::array<Real, Order + 1> make_public_axis_inverse_denominators() {
-    constexpr auto nodes = make_public_axis_nodes<Order>();
-    std::array<Real, Order + 1> inv_denominators{};
-    for (std::size_t i = 0; i < nodes.size(); ++i) {
-        Real denominator = Real(1);
-        for (std::size_t j = 0; j < nodes.size(); ++j) {
-            if (j != i) {
-                denominator *= nodes[i] - nodes[j];
-            }
-        }
-        inv_denominators[i] = Real(1) / denominator;
-    }
-    return inv_denominators;
-}
-
-template<int Order, bool NeedFirst, bool NeedSecond>
-void fill_axis_lagrange(Real x,
-                        std::array<Real, Order + 1>& values,
-                        std::array<Real, Order + 1>* first,
-                        std::array<Real, Order + 1>* second) {
-    constexpr auto nodes = make_public_axis_nodes<Order>();
-    constexpr auto inv_denominators = make_public_axis_inverse_denominators<Order>();
-    for (std::size_t i = 0; i < nodes.size(); ++i) {
-        Real product = Real(1);
-        for (std::size_t j = 0; j < nodes.size(); ++j) {
-            if (j != i) {
-                product *= x - nodes[j];
-            }
-        }
-        values[i] = product * inv_denominators[i];
-
-        if constexpr (NeedFirst) {
-            Real derivative = Real(0);
-            for (std::size_t m = 0; m < nodes.size(); ++m) {
-                if (m == i) {
-                    continue;
-                }
-                Real term = Real(1);
-                for (std::size_t j = 0; j < nodes.size(); ++j) {
-                    if (j != i && j != m) {
-                        term *= x - nodes[j];
-                    }
-                }
-                derivative += term;
-            }
-            (*first)[i] = derivative * inv_denominators[i];
-        }
-
-        if constexpr (NeedSecond) {
-            Real curvature = Real(0);
-            for (std::size_t m = 0; m < nodes.size(); ++m) {
-                if (m == i) {
-                    continue;
-                }
-                for (std::size_t l = 0; l < nodes.size(); ++l) {
-                    if (l == i || l == m) {
-                        continue;
-                    }
-                    Real term = Real(1);
-                    for (std::size_t j = 0; j < nodes.size(); ++j) {
-                        if (j != i && j != m && j != l) {
-                            term *= x - nodes[j];
-                        }
-                    }
-                    curvature += term;
-                }
-            }
-            (*second)[i] = curvature * inv_denominators[i];
-        }
-    }
-}
-
-template<int Order>
-void fill_axis_values(Real x, std::array<Real, Order + 1>& values) {
-    fill_axis_lagrange<Order, false, false>(x, values, nullptr, nullptr);
-}
-
-template<int Order>
-void fill_axis_values_first(Real x,
-                            std::array<Real, Order + 1>& values,
-                            std::array<Real, Order + 1>& first) {
-    fill_axis_lagrange<Order, true, false>(x, values, &first, nullptr);
-}
-
-template<int Order>
-void fill_axis_values_first_second(Real x,
-                                   std::array<Real, Order + 1>& values,
-                                   std::array<Real, Order + 1>& first,
-                                   std::array<Real, Order + 1>& second) {
-    fill_axis_lagrange<Order, true, true>(x, values, &first, &second);
-}
-
-template<int Order>
-constexpr std::array<std::array<std::size_t, 2>, (Order + 1) * (Order + 1)>
-make_quad_tensor_node_axes() {
-    std::array<std::array<std::size_t, 2>, (Order + 1) * (Order + 1)> axes{};
-    std::size_t n = 0;
-
-    axes[n++] = {{0u, 0u}};
-    axes[n++] = {{1u, 0u}};
-    axes[n++] = {{1u, 1u}};
-    axes[n++] = {{0u, 1u}};
-
-    for (int i = 1; i < Order; ++i) {
-        axes[n++] = {{public_axis_index(i, Order), 0u}};
-    }
-    for (int j = 1; j < Order; ++j) {
-        axes[n++] = {{1u, public_axis_index(j, Order)}};
-    }
-    for (int i = Order - 1; i >= 1; --i) {
-        axes[n++] = {{public_axis_index(i, Order), 1u}};
-    }
-    for (int j = Order - 1; j >= 1; --j) {
-        axes[n++] = {{0u, public_axis_index(j, Order)}};
-    }
-
-    for (int j = 1; j < Order; ++j) {
-        for (int i = 1; i < Order; ++i) {
-            axes[n++] = {{public_axis_index(i, Order), public_axis_index(j, Order)}};
-        }
-    }
-
-    return axes;
-}
-
-template<int Order>
-constexpr std::array<std::array<std::size_t, 3>, (Order + 1) * (Order + 1) * (Order + 1)>
-make_hex_tensor_node_axes() {
-    std::array<std::array<std::size_t, 3>, (Order + 1) * (Order + 1) * (Order + 1)> axes{};
-    std::size_t n = 0;
-
-    axes[n++] = {{0u, 0u, 0u}};
-    axes[n++] = {{1u, 0u, 0u}};
-    axes[n++] = {{1u, 1u, 0u}};
-    axes[n++] = {{0u, 1u, 0u}};
-    axes[n++] = {{0u, 0u, 1u}};
-    axes[n++] = {{1u, 0u, 1u}};
-    axes[n++] = {{1u, 1u, 1u}};
-    axes[n++] = {{0u, 1u, 1u}};
-
-    for (int i = 1; i < Order; ++i) {
-        axes[n++] = {{public_axis_index(i, Order), 0u, 0u}};
-    }
-    for (int j = 1; j < Order; ++j) {
-        axes[n++] = {{1u, public_axis_index(j, Order), 0u}};
-    }
-    for (int i = Order - 1; i >= 1; --i) {
-        axes[n++] = {{public_axis_index(i, Order), 1u, 0u}};
-    }
-    for (int j = Order - 1; j >= 1; --j) {
-        axes[n++] = {{0u, public_axis_index(j, Order), 0u}};
-    }
-    for (int i = 1; i < Order; ++i) {
-        axes[n++] = {{public_axis_index(i, Order), 0u, 1u}};
-    }
-    for (int j = 1; j < Order; ++j) {
-        axes[n++] = {{1u, public_axis_index(j, Order), 1u}};
-    }
-    for (int i = Order - 1; i >= 1; --i) {
-        axes[n++] = {{public_axis_index(i, Order), 1u, 1u}};
-    }
-    for (int j = Order - 1; j >= 1; --j) {
-        axes[n++] = {{0u, public_axis_index(j, Order), 1u}};
-    }
-    for (int k = 1; k < Order; ++k) {
-        axes[n++] = {{0u, 0u, public_axis_index(k, Order)}};
-    }
-    for (int k = 1; k < Order; ++k) {
-        axes[n++] = {{1u, 0u, public_axis_index(k, Order)}};
-    }
-    for (int k = 1; k < Order; ++k) {
-        axes[n++] = {{1u, 1u, public_axis_index(k, Order)}};
-    }
-    for (int k = 1; k < Order; ++k) {
-        axes[n++] = {{0u, 1u, public_axis_index(k, Order)}};
-    }
-
-    for (int j = 1; j < Order; ++j) {
-        for (int i = 1; i < Order; ++i) {
-            axes[n++] = {{public_axis_index(i, Order), public_axis_index(j, Order), 0u}};
-        }
-    }
-    for (int j = 1; j < Order; ++j) {
-        for (int i = 1; i < Order; ++i) {
-            axes[n++] = {{public_axis_index(i, Order), public_axis_index(j, Order), 1u}};
-        }
-    }
-    for (int k = 1; k < Order; ++k) {
-        for (int i = 1; i < Order; ++i) {
-            axes[n++] = {{public_axis_index(i, Order), 0u, public_axis_index(k, Order)}};
-        }
-    }
-    for (int k = 1; k < Order; ++k) {
-        for (int j = 1; j < Order; ++j) {
-            axes[n++] = {{1u, public_axis_index(j, Order), public_axis_index(k, Order)}};
-        }
-    }
-    for (int k = 1; k < Order; ++k) {
-        for (int i = Order - 1; i >= 1; --i) {
-            axes[n++] = {{public_axis_index(i, Order), 1u, public_axis_index(k, Order)}};
-        }
-    }
-    for (int k = 1; k < Order; ++k) {
-        for (int j = Order - 1; j >= 1; --j) {
-            axes[n++] = {{0u, public_axis_index(j, Order), public_axis_index(k, Order)}};
-        }
-    }
-
-    for (int k = 1; k < Order; ++k) {
-        for (int j = 1; j < Order; ++j) {
-            for (int i = 1; i < Order; ++i) {
-                axes[n++] = {{public_axis_index(i, Order),
-                              public_axis_index(j, Order),
-                              public_axis_index(k, Order)}};
-            }
-        }
-    }
-
-    return axes;
-}
-
-template<int Order>
-constexpr std::array<std::array<std::size_t, 3>, (Order + 1) * (Order + 2) / 2>
-make_triangle_simplex_exponents() {
-    std::array<std::array<std::size_t, 3>, (Order + 1) * (Order + 2) / 2> exponents{};
-    std::size_t n = 0;
-
-    exponents[n++] = {{static_cast<std::size_t>(Order), 0u, 0u}};
-    exponents[n++] = {{0u, static_cast<std::size_t>(Order), 0u}};
-    exponents[n++] = {{0u, 0u, static_cast<std::size_t>(Order)}};
-
-    for (int m = 1; m < Order; ++m) {
-        exponents[n++] = {{static_cast<std::size_t>(Order - m), static_cast<std::size_t>(m), 0u}};
-    }
-    for (int m = 1; m < Order; ++m) {
-        exponents[n++] = {{0u, static_cast<std::size_t>(Order - m), static_cast<std::size_t>(m)}};
-    }
-    for (int m = 1; m < Order; ++m) {
-        exponents[n++] = {{static_cast<std::size_t>(m), 0u, static_cast<std::size_t>(Order - m)}};
-    }
-
-    for (int c = 1; c <= Order - 2; ++c) {
-        for (int b = 1; b <= Order - c - 1; ++b) {
-            const int a = Order - b - c;
-            exponents[n++] = {{static_cast<std::size_t>(a),
-                               static_cast<std::size_t>(b),
-                               static_cast<std::size_t>(c)}};
-        }
-    }
-
-    return exponents;
-}
-
-template<int Order>
-constexpr std::array<std::array<std::size_t, 4>, (Order + 1) * (Order + 2) * (Order + 3) / 6>
-make_tetrahedron_simplex_exponents() {
-    std::array<std::array<std::size_t, 4>, (Order + 1) * (Order + 2) * (Order + 3) / 6> exponents{};
-    std::size_t n = 0;
-
-    exponents[n++] = {{static_cast<std::size_t>(Order), 0u, 0u, 0u}};
-    exponents[n++] = {{0u, static_cast<std::size_t>(Order), 0u, 0u}};
-    exponents[n++] = {{0u, 0u, static_cast<std::size_t>(Order), 0u}};
-    exponents[n++] = {{0u, 0u, 0u, static_cast<std::size_t>(Order)}};
-
-    constexpr int edges[6][2] = {
-        {0, 1}, {1, 2}, {2, 0}, {0, 3}, {1, 3}, {2, 3}
-    };
-    for (const auto& edge : edges) {
-        for (int m = 1; m < Order; ++m) {
-            std::array<std::size_t, 4> e{};
-            e[static_cast<std::size_t>(edge[0])] = static_cast<std::size_t>(Order - m);
-            e[static_cast<std::size_t>(edge[1])] = static_cast<std::size_t>(m);
-            exponents[n++] = e;
-        }
-    }
-
-    constexpr int faces[4][3] = {
-        {0, 1, 2},
-        {0, 1, 3},
-        {1, 2, 3},
-        {0, 2, 3},
-    };
-    for (const auto& face : faces) {
-        for (int c = 1; c <= Order - 2; ++c) {
-            for (int b = 1; b <= Order - c - 1; ++b) {
-                const int a = Order - b - c;
-                std::array<std::size_t, 4> e{};
-                e[static_cast<std::size_t>(face[0])] = static_cast<std::size_t>(a);
-                e[static_cast<std::size_t>(face[1])] = static_cast<std::size_t>(b);
-                e[static_cast<std::size_t>(face[2])] = static_cast<std::size_t>(c);
-                exponents[n++] = e;
-            }
-        }
-    }
-
-    for (int l = 1; l <= Order - 3; ++l) {
-        for (int k = 1; k <= Order - l - 2; ++k) {
-            for (int j = 1; j <= Order - l - k - 1; ++j) {
-                const int i = Order - j - k - l;
-                exponents[n++] = {{static_cast<std::size_t>(i),
-                                   static_cast<std::size_t>(j),
-                                   static_cast<std::size_t>(k),
-                                   static_cast<std::size_t>(l)}};
-            }
-        }
-    }
-
-    return exponents;
-}
-
-template<int Order, bool NeedFirst, bool NeedSecond>
-void fill_simplex_factor_sequence(Real lambda,
-                                  std::array<Real, Order + 1>& phi,
-                                  std::array<Real, Order + 1>* dphi,
-                                  std::array<Real, Order + 1>* d2phi) {
-    phi[0] = Real(1);
-    if constexpr (NeedFirst) {
-        (*dphi)[0] = Real(0);
-    }
-    if constexpr (NeedSecond) {
-        (*d2phi)[0] = Real(0);
-    }
-
-    const Real t = static_cast<Real>(Order) * lambda;
-    constexpr Real dt_dlambda = static_cast<Real>(Order);
-    Real dphi_dt_prev = Real(0);
-    Real d2phi_dt2_prev = Real(0);
-
-    for (int a = 1; a <= Order; ++a) {
-        const std::size_t au = static_cast<std::size_t>(a);
-        const Real inv_a = Real(1) / static_cast<Real>(a);
-        const Real s = (t - static_cast<Real>(a - 1)) * inv_a;
-        phi[au] = s * phi[au - 1];
-
-        if constexpr (NeedFirst) {
-            const Real dphi_dt = inv_a * phi[au - 1] + s * dphi_dt_prev;
-            (*dphi)[au] = dt_dlambda * dphi_dt;
-
-            if constexpr (NeedSecond) {
-                const Real d2phi_dt2 = Real(2) * inv_a * dphi_dt_prev + s * d2phi_dt2_prev;
-                (*d2phi)[au] = dt_dlambda * dt_dlambda * d2phi_dt2;
-                d2phi_dt2_prev = d2phi_dt2;
-            }
-
-            dphi_dt_prev = dphi_dt;
-        }
-    }
-}
-
-template<int Order>
-void fill_simplex_factor_values(Real lambda, std::array<Real, Order + 1>& phi) {
-    fill_simplex_factor_sequence<Order, false, false>(lambda, phi, nullptr, nullptr);
-}
-
-template<int Order>
-void fill_simplex_factor_values_first(Real lambda,
-                                      std::array<Real, Order + 1>& phi,
-                                      std::array<Real, Order + 1>& dphi) {
-    fill_simplex_factor_sequence<Order, true, false>(lambda, phi, &dphi, nullptr);
-}
-
-template<int Order>
-void fill_simplex_factor_values_first_second(Real lambda,
-                                             std::array<Real, Order + 1>& phi,
-                                             std::array<Real, Order + 1>& dphi,
-                                             std::array<Real, Order + 1>& d2phi) {
-    fill_simplex_factor_sequence<Order, true, true>(lambda, phi, &dphi, &d2phi);
-}
-
-} // namespace detail
-
-// ---------------------------------------------------------------------------
-// LagrangeLineFast<Order>
-// ---------------------------------------------------------------------------
-template<int Order>
-struct LagrangeLineFast;
-
-template<>
-struct LagrangeLineFast<1> {
-    static constexpr int n_dofs = 2;
-
-    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        out[0] = (Real(1) - xi[0]) * Real(0.5);
-        out[1] = (Real(1) + xi[0]) * Real(0.5);
-    }
-
-    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& /*xi*/,
-                                             std::array<Gradient, n_dofs>& out) {
-        out[0] = Gradient{Real(-0.5), Real(0), Real(0)};
-        out[1] = Gradient{Real( 0.5), Real(0), Real(0)};
-    }
-
-    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& /*xi*/,
-                                            std::array<Hessian, n_dofs>& out) {
-        out[0] = Hessian{};
-        out[1] = Hessian{};
-    }
-};
-
-template<>
-struct LagrangeLineFast<2> {
-    static constexpr int n_dofs = 3;
-
-    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        const Real x = xi[0];
-        out[0] = x * (x - Real(1)) * Real(0.5);
-        out[1] = x * (x + Real(1)) * Real(0.5);
-        out[2] = (Real(1) - x) * (Real(1) + x);
-    }
-
-    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& xi,
-                                             std::array<Gradient, n_dofs>& out) {
-        const Real x = xi[0];
-        out[0] = Gradient{x - Real(0.5), Real(0), Real(0)};
-        out[1] = Gradient{x + Real(0.5), Real(0), Real(0)};
-        out[2] = Gradient{Real(-2) * x, Real(0), Real(0)};
-    }
-
-    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& /*xi*/,
-                                            std::array<Hessian, n_dofs>& out) {
-        out[0] = Hessian{};
-        out[1] = Hessian{};
-        out[2] = Hessian{};
-        out[0](0, 0) = Real(1);
-        out[1](0, 0) = Real(1);
-        out[2](0, 0) = Real(-2);
-    }
-};
-
-template<>
-struct LagrangeLineFast<3> {
-    static constexpr int n_dofs = 4;
-
-    static void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        detail::fill_axis_values<3>(xi[0], out);
-    }
-
-    static void evaluate_gradients(const math::Vector<Real, 3>& xi,
-                                   std::array<Gradient, n_dofs>& out) {
-        std::array<Real, n_dofs> values{};
-        std::array<Real, n_dofs> first{};
-        detail::fill_axis_values_first<3>(xi[0], values, first);
-        for (std::size_t i = 0; i < first.size(); ++i) {
-            out[i] = Gradient{first[i], Real(0), Real(0)};
-        }
-    }
-
-    static void evaluate_hessians(const math::Vector<Real, 3>& xi,
-                                  std::array<Hessian, n_dofs>& out) {
-        std::array<Real, n_dofs> values{};
-        std::array<Real, n_dofs> first{};
-        std::array<Real, n_dofs> second{};
-        detail::fill_axis_values_first_second<3>(xi[0], values, first, second);
-        for (std::size_t i = 0; i < second.size(); ++i) {
-            Hessian H{};
-            H(0, 0) = second[i];
-            out[i] = H;
-        }
-    }
-};
-
-// ---------------------------------------------------------------------------
-// LagrangeQuadFast<Order>
-// ---------------------------------------------------------------------------
-template<int Order>
-struct LagrangeQuadFast;
-
-template<>
-struct LagrangeQuadFast<1> {
-    static constexpr int n_dofs = 4;
-
-    // VTK Quad4 corner ordering: (-,-), (+,-), (+,+), (-,+).
-    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        const Real lx = (Real(1) - xi[0]) * Real(0.5);
-        const Real ly = (Real(1) - xi[1]) * Real(0.5);
-        const Real ux = (Real(1) + xi[0]) * Real(0.5);
-        const Real uy = (Real(1) + xi[1]) * Real(0.5);
-        out[0] = lx * ly;
-        out[1] = ux * ly;
-        out[2] = ux * uy;
-        out[3] = lx * uy;
-    }
-
-    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& xi,
-                                             std::array<Gradient, n_dofs>& out) {
-        const Real lx = (Real(1) - xi[0]) * Real(0.5);
-        const Real ly = (Real(1) - xi[1]) * Real(0.5);
-        const Real ux = (Real(1) + xi[0]) * Real(0.5);
-        const Real uy = (Real(1) + xi[1]) * Real(0.5);
-        out[0] = Gradient{Real(-0.5) * ly, Real(-0.5) * lx, Real(0)};
-        out[1] = Gradient{Real( 0.5) * ly, Real(-0.5) * ux, Real(0)};
-        out[2] = Gradient{Real( 0.5) * uy, Real( 0.5) * ux, Real(0)};
-        out[3] = Gradient{Real(-0.5) * uy, Real( 0.5) * lx, Real(0)};
-    }
-
-    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& /*xi*/,
-                                            std::array<Hessian, n_dofs>& out) {
-        out[0] = Hessian{};
-        out[1] = Hessian{};
-        out[2] = Hessian{};
-        out[3] = Hessian{};
-        constexpr Real qrt = Real(0.25);
-        out[0](0, 1) = qrt;  out[0](1, 0) = qrt;
-        out[1](0, 1) = -qrt; out[1](1, 0) = -qrt;
-        out[2](0, 1) = qrt;  out[2](1, 0) = qrt;
-        out[3](0, 1) = -qrt; out[3](1, 0) = -qrt;
-    }
-};
-
-template<>
-struct LagrangeQuadFast<2> {
-    static constexpr int n_dofs = 9;
-
-    static constexpr std::array<std::array<std::size_t, 2>, n_dofs> node_axes = {{
-        {{0u, 0u}}, {{1u, 0u}}, {{1u, 1u}}, {{0u, 1u}},
-        {{2u, 0u}}, {{1u, 2u}}, {{2u, 1u}}, {{0u, 2u}},
-        {{2u, 2u}},
-    }};
-
-    static void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        std::array<Real, LagrangeLineFast<2>::n_dofs> lx{};
-        std::array<Real, LagrangeLineFast<2>::n_dofs> ly{};
-        LagrangeLineFast<2>::evaluate({xi[0], Real(0), Real(0)}, lx);
-        LagrangeLineFast<2>::evaluate({xi[1], Real(0), Real(0)}, ly);
-        for (std::size_t n = 0; n < node_axes.size(); ++n) {
-            out[n] = lx[node_axes[n][0]] * ly[node_axes[n][1]];
-        }
-    }
-
-    static void evaluate_gradients(const math::Vector<Real, 3>& xi,
-                                   std::array<Gradient, n_dofs>& out) {
-        std::array<Real, LagrangeLineFast<2>::n_dofs> lx{};
-        std::array<Real, LagrangeLineFast<2>::n_dofs> ly{};
-        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gx{};
-        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gy{};
-        LagrangeLineFast<2>::evaluate({xi[0], Real(0), Real(0)}, lx);
-        LagrangeLineFast<2>::evaluate({xi[1], Real(0), Real(0)}, ly);
-        LagrangeLineFast<2>::evaluate_gradients({xi[0], Real(0), Real(0)}, gx);
-        LagrangeLineFast<2>::evaluate_gradients({xi[1], Real(0), Real(0)}, gy);
-        for (std::size_t n = 0; n < node_axes.size(); ++n) {
-            const auto i = node_axes[n][0];
-            const auto j = node_axes[n][1];
-            out[n] = Gradient{gx[i][0] * ly[j], lx[i] * gy[j][0], Real(0)};
-        }
-    }
-
-    static void evaluate_hessians(const math::Vector<Real, 3>& xi,
-                                  std::array<Hessian, n_dofs>& out) {
-        std::array<Real, LagrangeLineFast<2>::n_dofs> lx{};
-        std::array<Real, LagrangeLineFast<2>::n_dofs> ly{};
-        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gx{};
-        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gy{};
-        std::array<Hessian, LagrangeLineFast<2>::n_dofs> hx{};
-        std::array<Hessian, LagrangeLineFast<2>::n_dofs> hy{};
-        LagrangeLineFast<2>::evaluate({xi[0], Real(0), Real(0)}, lx);
-        LagrangeLineFast<2>::evaluate({xi[1], Real(0), Real(0)}, ly);
-        LagrangeLineFast<2>::evaluate_gradients({xi[0], Real(0), Real(0)}, gx);
-        LagrangeLineFast<2>::evaluate_gradients({xi[1], Real(0), Real(0)}, gy);
-        LagrangeLineFast<2>::evaluate_hessians({xi[0], Real(0), Real(0)}, hx);
-        LagrangeLineFast<2>::evaluate_hessians({xi[1], Real(0), Real(0)}, hy);
-        for (std::size_t n = 0; n < node_axes.size(); ++n) {
-            const auto i = node_axes[n][0];
-            const auto j = node_axes[n][1];
-            Hessian H{};
-            H(0, 0) = hx[i](0, 0) * ly[j];
-            H(1, 1) = lx[i] * hy[j](0, 0);
-            H(0, 1) = gx[i][0] * gy[j][0];
-            H(1, 0) = H(0, 1);
-            out[n] = H;
-        }
-    }
-};
-
-template<>
-struct LagrangeQuadFast<3> {
-    static constexpr int n_dofs = 16;
-
-    static constexpr std::array<std::array<std::size_t, 2>, n_dofs> node_axes =
-        detail::make_quad_tensor_node_axes<3>();
-
-    static void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        std::array<Real, LagrangeLineFast<3>::n_dofs> lx{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> ly{};
-        detail::fill_axis_values<3>(xi[0], lx);
-        detail::fill_axis_values<3>(xi[1], ly);
-        for (std::size_t n = 0; n < node_axes.size(); ++n) {
-            out[n] = lx[node_axes[n][0]] * ly[node_axes[n][1]];
-        }
-    }
-
-    static void evaluate_gradients(const math::Vector<Real, 3>& xi,
-                                   std::array<Gradient, n_dofs>& out) {
-        std::array<Real, LagrangeLineFast<3>::n_dofs> lx{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> ly{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> gx{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> gy{};
-        detail::fill_axis_values_first<3>(xi[0], lx, gx);
-        detail::fill_axis_values_first<3>(xi[1], ly, gy);
-        for (std::size_t n = 0; n < node_axes.size(); ++n) {
-            const auto i = node_axes[n][0];
-            const auto j = node_axes[n][1];
-            out[n] = Gradient{gx[i] * ly[j], lx[i] * gy[j], Real(0)};
-        }
-    }
-
-    static void evaluate_hessians(const math::Vector<Real, 3>& xi,
-                                  std::array<Hessian, n_dofs>& out) {
-        std::array<Real, LagrangeLineFast<3>::n_dofs> lx{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> ly{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> gx{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> gy{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> hx{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> hy{};
-        detail::fill_axis_values_first_second<3>(xi[0], lx, gx, hx);
-        detail::fill_axis_values_first_second<3>(xi[1], ly, gy, hy);
-        for (std::size_t n = 0; n < node_axes.size(); ++n) {
-            const auto i = node_axes[n][0];
-            const auto j = node_axes[n][1];
-            Hessian H{};
-            H(0, 0) = hx[i] * ly[j];
-            H(1, 1) = lx[i] * hy[j];
-            H(0, 1) = gx[i] * gy[j];
-            H(1, 0) = H(0, 1);
-            out[n] = H;
-        }
-    }
-};
-
-// ---------------------------------------------------------------------------
-// LagrangeHexFast<Order>
-// ---------------------------------------------------------------------------
-template<int Order>
-struct LagrangeHexFast;
-
-template<>
-struct LagrangeHexFast<1> {
-    static constexpr int n_dofs = 8;
-
-    // VTK Hex8 corner ordering: (-,-,-), (+,-,-), (+,+,-), (-,+,-),
-    //                           (-,-,+), (+,-,+), (+,+,+), (-,+,+).
-    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        const Real lx = (Real(1) - xi[0]) * Real(0.5);
-        const Real ly = (Real(1) - xi[1]) * Real(0.5);
-        const Real lz = (Real(1) - xi[2]) * Real(0.5);
-        const Real ux = (Real(1) + xi[0]) * Real(0.5);
-        const Real uy = (Real(1) + xi[1]) * Real(0.5);
-        const Real uz = (Real(1) + xi[2]) * Real(0.5);
-        // Precompute z-plane partial products (sum factorization).
-        const Real lxly = lx * ly;
-        const Real uxly = ux * ly;
-        const Real uxuy = ux * uy;
-        const Real lxuy = lx * uy;
-        out[0] = lxly * lz;
-        out[1] = uxly * lz;
-        out[2] = uxuy * lz;
-        out[3] = lxuy * lz;
-        out[4] = lxly * uz;
-        out[5] = uxly * uz;
-        out[6] = uxuy * uz;
-        out[7] = lxuy * uz;
-    }
-
-    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& xi,
-                                             std::array<Gradient, n_dofs>& out) {
-        const Real lx = (Real(1) - xi[0]) * Real(0.5);
-        const Real ly = (Real(1) - xi[1]) * Real(0.5);
-        const Real lz = (Real(1) - xi[2]) * Real(0.5);
-        const Real ux = (Real(1) + xi[0]) * Real(0.5);
-        const Real uy = (Real(1) + xi[1]) * Real(0.5);
-        const Real uz = (Real(1) + xi[2]) * Real(0.5);
-        // dL_0(x)/dx = -0.5, dL_1(x)/dx = +0.5 along each axis.
-        out[0] = Gradient{Real(-0.5) * ly * lz, Real(-0.5) * lx * lz, Real(-0.5) * lx * ly};
-        out[1] = Gradient{Real( 0.5) * ly * lz, Real(-0.5) * ux * lz, Real(-0.5) * ux * ly};
-        out[2] = Gradient{Real( 0.5) * uy * lz, Real( 0.5) * ux * lz, Real(-0.5) * ux * uy};
-        out[3] = Gradient{Real(-0.5) * uy * lz, Real( 0.5) * lx * lz, Real(-0.5) * lx * uy};
-        out[4] = Gradient{Real(-0.5) * ly * uz, Real(-0.5) * lx * uz, Real( 0.5) * lx * ly};
-        out[5] = Gradient{Real( 0.5) * ly * uz, Real(-0.5) * ux * uz, Real( 0.5) * ux * ly};
-        out[6] = Gradient{Real( 0.5) * uy * uz, Real( 0.5) * ux * uz, Real( 0.5) * ux * uy};
-        out[7] = Gradient{Real(-0.5) * uy * uz, Real( 0.5) * lx * uz, Real( 0.5) * lx * uy};
-    }
-
-    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& xi,
-                                            std::array<Hessian, n_dofs>& out) {
-        const Real lx = (Real(1) - xi[0]) * Real(0.5);
-        const Real ly = (Real(1) - xi[1]) * Real(0.5);
-        const Real lz = (Real(1) - xi[2]) * Real(0.5);
-        const Real ux = (Real(1) + xi[0]) * Real(0.5);
-        const Real uy = (Real(1) + xi[1]) * Real(0.5);
-        const Real uz = (Real(1) + xi[2]) * Real(0.5);
-        const Real ax[8] = {lx, ux, ux, lx, lx, ux, ux, lx};
-        const Real ay[8] = {ly, ly, uy, uy, ly, ly, uy, uy};
-        const Real az[8] = {lz, lz, lz, lz, uz, uz, uz, uz};
-        const int sx[8] = {-1, 1, 1, -1, -1, 1, 1, -1};
-        const int sy[8] = {-1, -1, 1, 1, -1, -1, 1, 1};
-        const int sz[8] = {-1, -1, -1, -1, 1, 1, 1, 1};
-        constexpr Real qrt = Real(0.25);
-        for (std::size_t n = 0; n < static_cast<std::size_t>(n_dofs); ++n) {
-            out[n] = Hessian{};
-            out[n](0, 1) = static_cast<Real>(sx[n] * sy[n]) * qrt * az[n];
-            out[n](1, 0) = out[n](0, 1);
-            out[n](0, 2) = static_cast<Real>(sx[n] * sz[n]) * qrt * ay[n];
-            out[n](2, 0) = out[n](0, 2);
-            out[n](1, 2) = static_cast<Real>(sy[n] * sz[n]) * qrt * ax[n];
-            out[n](2, 1) = out[n](1, 2);
-        }
-    }
-};
-
-template<>
-struct LagrangeHexFast<2> {
-    static constexpr int n_dofs = 27;
-
-    static constexpr std::array<std::array<std::size_t, 3>, n_dofs> node_axes = {{
-        {{0u, 0u, 0u}}, {{1u, 0u, 0u}}, {{1u, 1u, 0u}}, {{0u, 1u, 0u}},
-        {{0u, 0u, 1u}}, {{1u, 0u, 1u}}, {{1u, 1u, 1u}}, {{0u, 1u, 1u}},
-        {{2u, 0u, 0u}}, {{1u, 2u, 0u}}, {{2u, 1u, 0u}}, {{0u, 2u, 0u}},
-        {{2u, 0u, 1u}}, {{1u, 2u, 1u}}, {{2u, 1u, 1u}}, {{0u, 2u, 1u}},
-        {{0u, 0u, 2u}}, {{1u, 0u, 2u}}, {{1u, 1u, 2u}}, {{0u, 1u, 2u}},
-        {{2u, 2u, 0u}}, {{2u, 2u, 1u}}, {{2u, 0u, 2u}}, {{1u, 2u, 2u}},
-        {{2u, 1u, 2u}}, {{0u, 2u, 2u}}, {{2u, 2u, 2u}},
-    }};
-
-    static void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        std::array<Real, LagrangeLineFast<2>::n_dofs> lx{};
-        std::array<Real, LagrangeLineFast<2>::n_dofs> ly{};
-        std::array<Real, LagrangeLineFast<2>::n_dofs> lz{};
-        LagrangeLineFast<2>::evaluate({xi[0], Real(0), Real(0)}, lx);
-        LagrangeLineFast<2>::evaluate({xi[1], Real(0), Real(0)}, ly);
-        LagrangeLineFast<2>::evaluate({xi[2], Real(0), Real(0)}, lz);
-        for (std::size_t n = 0; n < node_axes.size(); ++n) {
-            out[n] = lx[node_axes[n][0]] * ly[node_axes[n][1]] * lz[node_axes[n][2]];
-        }
-    }
-
-    static void evaluate_gradients(const math::Vector<Real, 3>& xi,
-                                   std::array<Gradient, n_dofs>& out) {
-        std::array<Real, LagrangeLineFast<2>::n_dofs> lx{};
-        std::array<Real, LagrangeLineFast<2>::n_dofs> ly{};
-        std::array<Real, LagrangeLineFast<2>::n_dofs> lz{};
-        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gx{};
-        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gy{};
-        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gz{};
-        LagrangeLineFast<2>::evaluate({xi[0], Real(0), Real(0)}, lx);
-        LagrangeLineFast<2>::evaluate({xi[1], Real(0), Real(0)}, ly);
-        LagrangeLineFast<2>::evaluate({xi[2], Real(0), Real(0)}, lz);
-        LagrangeLineFast<2>::evaluate_gradients({xi[0], Real(0), Real(0)}, gx);
-        LagrangeLineFast<2>::evaluate_gradients({xi[1], Real(0), Real(0)}, gy);
-        LagrangeLineFast<2>::evaluate_gradients({xi[2], Real(0), Real(0)}, gz);
-        for (std::size_t n = 0; n < node_axes.size(); ++n) {
-            const auto i = node_axes[n][0];
-            const auto j = node_axes[n][1];
-            const auto k = node_axes[n][2];
-            out[n] = Gradient{
-                gx[i][0] * ly[j] * lz[k],
-                lx[i] * gy[j][0] * lz[k],
-                lx[i] * ly[j] * gz[k][0],
-            };
-        }
-    }
-
-    static void evaluate_hessians(const math::Vector<Real, 3>& xi,
-                                  std::array<Hessian, n_dofs>& out) {
-        std::array<Real, LagrangeLineFast<2>::n_dofs> lx{};
-        std::array<Real, LagrangeLineFast<2>::n_dofs> ly{};
-        std::array<Real, LagrangeLineFast<2>::n_dofs> lz{};
-        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gx{};
-        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gy{};
-        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gz{};
-        std::array<Hessian, LagrangeLineFast<2>::n_dofs> hx{};
-        std::array<Hessian, LagrangeLineFast<2>::n_dofs> hy{};
-        std::array<Hessian, LagrangeLineFast<2>::n_dofs> hz{};
-        LagrangeLineFast<2>::evaluate({xi[0], Real(0), Real(0)}, lx);
-        LagrangeLineFast<2>::evaluate({xi[1], Real(0), Real(0)}, ly);
-        LagrangeLineFast<2>::evaluate({xi[2], Real(0), Real(0)}, lz);
-        LagrangeLineFast<2>::evaluate_gradients({xi[0], Real(0), Real(0)}, gx);
-        LagrangeLineFast<2>::evaluate_gradients({xi[1], Real(0), Real(0)}, gy);
-        LagrangeLineFast<2>::evaluate_gradients({xi[2], Real(0), Real(0)}, gz);
-        LagrangeLineFast<2>::evaluate_hessians({xi[0], Real(0), Real(0)}, hx);
-        LagrangeLineFast<2>::evaluate_hessians({xi[1], Real(0), Real(0)}, hy);
-        LagrangeLineFast<2>::evaluate_hessians({xi[2], Real(0), Real(0)}, hz);
-        for (std::size_t n = 0; n < node_axes.size(); ++n) {
-            const auto i = node_axes[n][0];
-            const auto j = node_axes[n][1];
-            const auto k = node_axes[n][2];
-            Hessian H{};
-            H(0, 0) = hx[i](0, 0) * ly[j] * lz[k];
-            H(1, 1) = lx[i] * hy[j](0, 0) * lz[k];
-            H(2, 2) = lx[i] * ly[j] * hz[k](0, 0);
-            H(0, 1) = gx[i][0] * gy[j][0] * lz[k];
-            H(1, 0) = H(0, 1);
-            H(0, 2) = gx[i][0] * ly[j] * gz[k][0];
-            H(2, 0) = H(0, 2);
-            H(1, 2) = lx[i] * gy[j][0] * gz[k][0];
-            H(2, 1) = H(1, 2);
-            out[n] = H;
-        }
-    }
-};
-
-template<>
-struct LagrangeHexFast<3> {
-    static constexpr int n_dofs = 64;
-
-    static constexpr std::array<std::array<std::size_t, 3>, n_dofs> node_axes =
-        detail::make_hex_tensor_node_axes<3>();
-
-    static void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        std::array<Real, LagrangeLineFast<3>::n_dofs> lx{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> ly{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> lz{};
-        detail::fill_axis_values<3>(xi[0], lx);
-        detail::fill_axis_values<3>(xi[1], ly);
-        detail::fill_axis_values<3>(xi[2], lz);
-        for (std::size_t n = 0; n < node_axes.size(); ++n) {
-            out[n] = lx[node_axes[n][0]] * ly[node_axes[n][1]] * lz[node_axes[n][2]];
-        }
-    }
-
-    static void evaluate_gradients(const math::Vector<Real, 3>& xi,
-                                   std::array<Gradient, n_dofs>& out) {
-        std::array<Real, LagrangeLineFast<3>::n_dofs> lx{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> ly{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> lz{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> gx{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> gy{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> gz{};
-        detail::fill_axis_values_first<3>(xi[0], lx, gx);
-        detail::fill_axis_values_first<3>(xi[1], ly, gy);
-        detail::fill_axis_values_first<3>(xi[2], lz, gz);
-        for (std::size_t n = 0; n < node_axes.size(); ++n) {
-            const auto i = node_axes[n][0];
-            const auto j = node_axes[n][1];
-            const auto k = node_axes[n][2];
-            out[n] = Gradient{
-                gx[i] * ly[j] * lz[k],
-                lx[i] * gy[j] * lz[k],
-                lx[i] * ly[j] * gz[k],
-            };
-        }
-    }
-
-    static void evaluate_hessians(const math::Vector<Real, 3>& xi,
-                                  std::array<Hessian, n_dofs>& out) {
-        std::array<Real, LagrangeLineFast<3>::n_dofs> lx{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> ly{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> lz{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> gx{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> gy{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> gz{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> hx{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> hy{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> hz{};
-        detail::fill_axis_values_first_second<3>(xi[0], lx, gx, hx);
-        detail::fill_axis_values_first_second<3>(xi[1], ly, gy, hy);
-        detail::fill_axis_values_first_second<3>(xi[2], lz, gz, hz);
-        for (std::size_t n = 0; n < node_axes.size(); ++n) {
-            const auto i = node_axes[n][0];
-            const auto j = node_axes[n][1];
-            const auto k = node_axes[n][2];
-            Hessian H{};
-            H(0, 0) = hx[i] * ly[j] * lz[k];
-            H(1, 1) = lx[i] * hy[j] * lz[k];
-            H(2, 2) = lx[i] * ly[j] * hz[k];
-            H(0, 1) = gx[i] * gy[j] * lz[k];
-            H(1, 0) = H(0, 1);
-            H(0, 2) = gx[i] * ly[j] * gz[k];
-            H(2, 0) = H(0, 2);
-            H(1, 2) = lx[i] * gy[j] * gz[k];
-            H(2, 1) = H(1, 2);
-            out[n] = H;
-        }
-    }
-};
-
-// ---------------------------------------------------------------------------
-// LagrangeTriFast<Order>
-// ---------------------------------------------------------------------------
-template<int Order>
-struct LagrangeTriFast;
-
-template<>
-struct LagrangeTriFast<1> {
-    static constexpr int n_dofs = 3;
-
-    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        out[0] = Real(1) - xi[0] - xi[1];
-        out[1] = xi[0];
-        out[2] = xi[1];
-    }
-
-    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& /*xi*/,
-                                             std::array<Gradient, n_dofs>& out) {
-        out[0] = Gradient{Real(-1), Real(-1), Real(0)};
-        out[1] = Gradient{Real( 1), Real( 0), Real(0)};
-        out[2] = Gradient{Real( 0), Real( 1), Real(0)};
-    }
-
-    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& /*xi*/,
-                                            std::array<Hessian, n_dofs>& out) {
-        out[0] = Hessian{};
-        out[1] = Hessian{};
-        out[2] = Hessian{};
-    }
-};
-
-template<>
-struct LagrangeTriFast<2> {
-    static constexpr int n_dofs = 6;
-
-    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-
-        out[0] = l0 * (Real(2) * l0 - Real(1));
-        out[1] = l1 * (Real(2) * l1 - Real(1));
-        out[2] = l2 * (Real(2) * l2 - Real(1));
-        out[3] = Real(4) * l0 * l1;
-        out[4] = Real(4) * l1 * l2;
-        out[5] = Real(4) * l0 * l2;
-    }
-
-    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& xi,
-                                             std::array<Gradient, n_dofs>& out) {
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-        constexpr Gradient g0{Real(-1), Real(-1), Real(0)};
-        constexpr Gradient g1{Real( 1), Real( 0), Real(0)};
-        constexpr Gradient g2{Real( 0), Real( 1), Real(0)};
-
-        out[0] = detail::scaled_gradient(g0, Real(4) * l0 - Real(1));
-        out[1] = detail::scaled_gradient(g1, Real(4) * l1 - Real(1));
-        out[2] = detail::scaled_gradient(g2, Real(4) * l2 - Real(1));
-        out[3] = detail::p2_edge_gradient(l0, g0, l1, g1);
-        out[4] = detail::p2_edge_gradient(l1, g1, l2, g2);
-        out[5] = detail::p2_edge_gradient(l0, g0, l2, g2);
-    }
-
-    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& /*xi*/,
-                                            std::array<Hessian, n_dofs>& out) {
-        constexpr Gradient g0{Real(-1), Real(-1), Real(0)};
-        constexpr Gradient g1{Real( 1), Real( 0), Real(0)};
-        constexpr Gradient g2{Real( 0), Real( 1), Real(0)};
-
-        out[0] = detail::p2_vertex_hessian(g0);
-        out[1] = detail::p2_vertex_hessian(g1);
-        out[2] = detail::p2_vertex_hessian(g2);
-        out[3] = detail::p2_edge_hessian(g0, g1);
-        out[4] = detail::p2_edge_hessian(g1, g2);
-        out[5] = detail::p2_edge_hessian(g0, g2);
-    }
-};
-
-template<>
-struct LagrangeTriFast<3> {
-    static constexpr int n_dofs = 10;
-
-    static constexpr std::array<std::array<std::size_t, 3>, n_dofs> exponents =
-        detail::make_triangle_simplex_exponents<3>();
-
-    static void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-        std::array<Real, 4> phi0{};
-        std::array<Real, 4> phi1{};
-        std::array<Real, 4> phi2{};
-        detail::fill_simplex_factor_values<3>(l0, phi0);
-        detail::fill_simplex_factor_values<3>(l1, phi1);
-        detail::fill_simplex_factor_values<3>(l2, phi2);
-
-        for (std::size_t n = 0; n < exponents.size(); ++n) {
-            const auto& e = exponents[n];
-            out[n] = phi0[e[0]] * phi1[e[1]] * phi2[e[2]];
-        }
-    }
-
-    static void evaluate_gradients(const math::Vector<Real, 3>& xi,
-                                   std::array<Gradient, n_dofs>& out) {
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-        std::array<Real, 4> phi0{};
-        std::array<Real, 4> phi1{};
-        std::array<Real, 4> phi2{};
-        std::array<Real, 4> dphi0{};
-        std::array<Real, 4> dphi1{};
-        std::array<Real, 4> dphi2{};
-        detail::fill_simplex_factor_values_first<3>(l0, phi0, dphi0);
-        detail::fill_simplex_factor_values_first<3>(l1, phi1, dphi1);
-        detail::fill_simplex_factor_values_first<3>(l2, phi2, dphi2);
-
-        for (std::size_t n = 0; n < exponents.size(); ++n) {
-            const auto& e = exponents[n];
-            const Real v0 = phi0[e[0]];
-            const Real v1 = phi1[e[1]];
-            const Real v2 = phi2[e[2]];
-            const Real dl0 = dphi0[e[0]] * v1 * v2;
-            const Real dl1 = v0 * dphi1[e[1]] * v2;
-            const Real dl2 = v0 * v1 * dphi2[e[2]];
-            out[n] = Gradient{dl1 - dl0, dl2 - dl0, Real(0)};
-        }
-    }
-
-    static void evaluate_hessians(const math::Vector<Real, 3>& xi,
-                                  std::array<Hessian, n_dofs>& out) {
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-        std::array<Real, 4> phi0{};
-        std::array<Real, 4> phi1{};
-        std::array<Real, 4> phi2{};
-        std::array<Real, 4> dphi0{};
-        std::array<Real, 4> dphi1{};
-        std::array<Real, 4> dphi2{};
-        std::array<Real, 4> d2phi0{};
-        std::array<Real, 4> d2phi1{};
-        std::array<Real, 4> d2phi2{};
-        detail::fill_simplex_factor_values_first_second<3>(l0, phi0, dphi0, d2phi0);
-        detail::fill_simplex_factor_values_first_second<3>(l1, phi1, dphi1, d2phi1);
-        detail::fill_simplex_factor_values_first_second<3>(l2, phi2, dphi2, d2phi2);
-
-        for (std::size_t n = 0; n < exponents.size(); ++n) {
-            const auto& e = exponents[n];
-            const Real v0 = phi0[e[0]];
-            const Real v1 = phi1[e[1]];
-            const Real v2 = phi2[e[2]];
-            const Real D0 = dphi0[e[0]];
-            const Real D1 = dphi1[e[1]];
-            const Real D2 = dphi2[e[2]];
-            const Real H00 = d2phi0[e[0]] * v1 * v2;
-            const Real H11 = v0 * d2phi1[e[1]] * v2;
-            const Real H22 = v0 * v1 * d2phi2[e[2]];
-            const Real H01 = D0 * D1 * v2;
-            const Real H02 = D0 * v1 * D2;
-            const Real H12 = v0 * D1 * D2;
-
-            Hessian H{};
-            H(0, 0) = H00 - Real(2) * H01 + H11;
-            H(1, 1) = H00 - Real(2) * H02 + H22;
-            H(0, 1) = H00 - H01 - H02 + H12;
-            H(1, 0) = H(0, 1);
-            out[n] = H;
-        }
-    }
-};
-
-// ---------------------------------------------------------------------------
-// LagrangeTetFast<Order>
-// ---------------------------------------------------------------------------
-template<int Order>
-struct LagrangeTetFast;
-
-template<>
-struct LagrangeTetFast<1> {
-    static constexpr int n_dofs = 4;
-
-    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        out[0] = Real(1) - xi[0] - xi[1] - xi[2];
-        out[1] = xi[0];
-        out[2] = xi[1];
-        out[3] = xi[2];
-    }
-
-    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& /*xi*/,
-                                             std::array<Gradient, n_dofs>& out) {
-        out[0] = Gradient{Real(-1), Real(-1), Real(-1)};
-        out[1] = Gradient{Real( 1), Real( 0), Real( 0)};
-        out[2] = Gradient{Real( 0), Real( 1), Real( 0)};
-        out[3] = Gradient{Real( 0), Real( 0), Real( 1)};
-    }
-
-    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& /*xi*/,
-                                            std::array<Hessian, n_dofs>& out) {
-        out[0] = Hessian{};
-        out[1] = Hessian{};
-        out[2] = Hessian{};
-        out[3] = Hessian{};
-    }
-};
-
-template<>
-struct LagrangeTetFast<2> {
-    static constexpr int n_dofs = 10;
-
-    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l3 = xi[2];
-        const Real l0 = Real(1) - l1 - l2 - l3;
-
-        out[0] = l0 * (Real(2) * l0 - Real(1));
-        out[1] = l1 * (Real(2) * l1 - Real(1));
-        out[2] = l2 * (Real(2) * l2 - Real(1));
-        out[3] = l3 * (Real(2) * l3 - Real(1));
-        out[4] = Real(4) * l0 * l1;
-        out[5] = Real(4) * l1 * l2;
-        out[6] = Real(4) * l0 * l2;
-        out[7] = Real(4) * l0 * l3;
-        out[8] = Real(4) * l1 * l3;
-        out[9] = Real(4) * l2 * l3;
-    }
-
-    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& xi,
-                                             std::array<Gradient, n_dofs>& out) {
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l3 = xi[2];
-        const Real l0 = Real(1) - l1 - l2 - l3;
-        constexpr Gradient g0{Real(-1), Real(-1), Real(-1)};
-        constexpr Gradient g1{Real( 1), Real( 0), Real( 0)};
-        constexpr Gradient g2{Real( 0), Real( 1), Real( 0)};
-        constexpr Gradient g3{Real( 0), Real( 0), Real( 1)};
-
-        out[0] = detail::scaled_gradient(g0, Real(4) * l0 - Real(1));
-        out[1] = detail::scaled_gradient(g1, Real(4) * l1 - Real(1));
-        out[2] = detail::scaled_gradient(g2, Real(4) * l2 - Real(1));
-        out[3] = detail::scaled_gradient(g3, Real(4) * l3 - Real(1));
-        out[4] = detail::p2_edge_gradient(l0, g0, l1, g1);
-        out[5] = detail::p2_edge_gradient(l1, g1, l2, g2);
-        out[6] = detail::p2_edge_gradient(l0, g0, l2, g2);
-        out[7] = detail::p2_edge_gradient(l0, g0, l3, g3);
-        out[8] = detail::p2_edge_gradient(l1, g1, l3, g3);
-        out[9] = detail::p2_edge_gradient(l2, g2, l3, g3);
-    }
-
-    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& /*xi*/,
-                                            std::array<Hessian, n_dofs>& out) {
-        constexpr Gradient g0{Real(-1), Real(-1), Real(-1)};
-        constexpr Gradient g1{Real( 1), Real( 0), Real( 0)};
-        constexpr Gradient g2{Real( 0), Real( 1), Real( 0)};
-        constexpr Gradient g3{Real( 0), Real( 0), Real( 1)};
-
-        out[0] = detail::p2_vertex_hessian(g0);
-        out[1] = detail::p2_vertex_hessian(g1);
-        out[2] = detail::p2_vertex_hessian(g2);
-        out[3] = detail::p2_vertex_hessian(g3);
-        out[4] = detail::p2_edge_hessian(g0, g1);
-        out[5] = detail::p2_edge_hessian(g1, g2);
-        out[6] = detail::p2_edge_hessian(g0, g2);
-        out[7] = detail::p2_edge_hessian(g0, g3);
-        out[8] = detail::p2_edge_hessian(g1, g3);
-        out[9] = detail::p2_edge_hessian(g2, g3);
-    }
-};
-
-template<>
-struct LagrangeTetFast<3> {
-    static constexpr int n_dofs = 20;
-
-    static constexpr std::array<std::array<std::size_t, 4>, n_dofs> exponents =
-        detail::make_tetrahedron_simplex_exponents<3>();
-
-    static void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l3 = xi[2];
-        const Real l0 = Real(1) - l1 - l2 - l3;
-        std::array<Real, 4> phi0{};
-        std::array<Real, 4> phi1{};
-        std::array<Real, 4> phi2{};
-        std::array<Real, 4> phi3{};
-        detail::fill_simplex_factor_values<3>(l0, phi0);
-        detail::fill_simplex_factor_values<3>(l1, phi1);
-        detail::fill_simplex_factor_values<3>(l2, phi2);
-        detail::fill_simplex_factor_values<3>(l3, phi3);
-
-        for (std::size_t n = 0; n < exponents.size(); ++n) {
-            const auto& e = exponents[n];
-            out[n] = phi0[e[0]] * phi1[e[1]] * phi2[e[2]] * phi3[e[3]];
-        }
-    }
-
-    static void evaluate_gradients(const math::Vector<Real, 3>& xi,
-                                   std::array<Gradient, n_dofs>& out) {
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l3 = xi[2];
-        const Real l0 = Real(1) - l1 - l2 - l3;
-        std::array<Real, 4> phi0{};
-        std::array<Real, 4> phi1{};
-        std::array<Real, 4> phi2{};
-        std::array<Real, 4> phi3{};
-        std::array<Real, 4> dphi0{};
-        std::array<Real, 4> dphi1{};
-        std::array<Real, 4> dphi2{};
-        std::array<Real, 4> dphi3{};
-        detail::fill_simplex_factor_values_first<3>(l0, phi0, dphi0);
-        detail::fill_simplex_factor_values_first<3>(l1, phi1, dphi1);
-        detail::fill_simplex_factor_values_first<3>(l2, phi2, dphi2);
-        detail::fill_simplex_factor_values_first<3>(l3, phi3, dphi3);
-
-        for (std::size_t n = 0; n < exponents.size(); ++n) {
-            const auto& e = exponents[n];
-            const Real v0 = phi0[e[0]];
-            const Real v1 = phi1[e[1]];
-            const Real v2 = phi2[e[2]];
-            const Real v3 = phi3[e[3]];
-            const Real dl0 = dphi0[e[0]] * v1 * v2 * v3;
-            const Real dl1 = v0 * dphi1[e[1]] * v2 * v3;
-            const Real dl2 = v0 * v1 * dphi2[e[2]] * v3;
-            const Real dl3 = v0 * v1 * v2 * dphi3[e[3]];
-            out[n] = Gradient{dl1 - dl0, dl2 - dl0, dl3 - dl0};
-        }
-    }
-
-    static void evaluate_hessians(const math::Vector<Real, 3>& xi,
-                                  std::array<Hessian, n_dofs>& out) {
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l3 = xi[2];
-        const Real l0 = Real(1) - l1 - l2 - l3;
-        std::array<Real, 4> phi0{};
-        std::array<Real, 4> phi1{};
-        std::array<Real, 4> phi2{};
-        std::array<Real, 4> phi3{};
-        std::array<Real, 4> dphi0{};
-        std::array<Real, 4> dphi1{};
-        std::array<Real, 4> dphi2{};
-        std::array<Real, 4> dphi3{};
-        std::array<Real, 4> d2phi0{};
-        std::array<Real, 4> d2phi1{};
-        std::array<Real, 4> d2phi2{};
-        std::array<Real, 4> d2phi3{};
-        detail::fill_simplex_factor_values_first_second<3>(l0, phi0, dphi0, d2phi0);
-        detail::fill_simplex_factor_values_first_second<3>(l1, phi1, dphi1, d2phi1);
-        detail::fill_simplex_factor_values_first_second<3>(l2, phi2, dphi2, d2phi2);
-        detail::fill_simplex_factor_values_first_second<3>(l3, phi3, dphi3, d2phi3);
-
-        for (std::size_t n = 0; n < exponents.size(); ++n) {
-            const auto& e = exponents[n];
-            const Real v0 = phi0[e[0]];
-            const Real v1 = phi1[e[1]];
-            const Real v2 = phi2[e[2]];
-            const Real v3 = phi3[e[3]];
-            const Real D0 = dphi0[e[0]];
-            const Real D1 = dphi1[e[1]];
-            const Real D2 = dphi2[e[2]];
-            const Real D3 = dphi3[e[3]];
-
-            const Real H00 = d2phi0[e[0]] * v1 * v2 * v3;
-            const Real H11 = v0 * d2phi1[e[1]] * v2 * v3;
-            const Real H22 = v0 * v1 * d2phi2[e[2]] * v3;
-            const Real H33 = v0 * v1 * v2 * d2phi3[e[3]];
-            const Real H01 = D0 * D1 * v2 * v3;
-            const Real H02 = D0 * v1 * D2 * v3;
-            const Real H03 = D0 * v1 * v2 * D3;
-            const Real H12 = v0 * D1 * D2 * v3;
-            const Real H13 = v0 * D1 * v2 * D3;
-            const Real H23 = v0 * v1 * D2 * D3;
-
-            Hessian H{};
-            H(0, 0) = H00 - Real(2) * H01 + H11;
-            H(1, 1) = H00 - Real(2) * H02 + H22;
-            H(2, 2) = H00 - Real(2) * H03 + H33;
-            H(0, 1) = H00 - H01 - H02 + H12;
-            H(1, 0) = H(0, 1);
-            H(0, 2) = H00 - H01 - H03 + H13;
-            H(2, 0) = H(0, 2);
-            H(1, 2) = H00 - H02 - H03 + H23;
-            H(2, 1) = H(1, 2);
-            out[n] = H;
-        }
-    }
-};
-
-} // namespace basis
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_BASIS_LAGRANGEBASISFAST_H
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasisPyramid.cpp b/Code/Source/solver/FE/Basis/LagrangeBasisPyramid.cpp
deleted file mode 100644
index 4a332621e..000000000
--- a/Code/Source/solver/FE/Basis/LagrangeBasisPyramid.cpp
+++ /dev/null
@@ -1,2069 +0,0 @@
-#include "LagrangeBasisPyramid.h"
-
-#include <algorithm>
-#include <array>
-#include <cmath>
-#include <map>
-#include <memory>
-#include <mutex>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "Basis/BasisExceptions.h"
-#include "BasisTolerance.h"
-#include "Math/DenseLinearAlgebra.h"
-#include "Math/DenseTransformKernels.h"
-#include "LagrangeBasisUtility.h"
-#include "PyramidModalBasis.h"
-
-namespace svmp {
-namespace FE {
-namespace basis {
-namespace detail {
-
-class PyramidLagrangeCache {
-public:
-    using ModalTerm = pyramid_modal::Term;
-
-    struct UvPolynomial {
-        using Power = std::pair<int, int>;
-        std::vector<std::pair<Power, Real>> coeffs;
-
-        void add_term(int pu, int pv, Real coeff, Real tol = Real(1e-14)) {
-            if (std::abs(coeff) <= tol) {
-                return;
-            }
-            const auto key = std::make_pair(pu, pv);
-            const auto found = std::lower_bound(
-                coeffs.begin(),
-                coeffs.end(),
-                key,
-                [](const auto& entry, const Power& value) { return entry.first < value; });
-            if (found == coeffs.end() || found->first != key) {
-                coeffs.insert(found, {key, coeff});
-                return;
-            }
-
-            found->second += coeff;
-            if (std::abs(found->second) <= tol) {
-                coeffs.erase(found);
-            }
-        }
-
-        void add_scaled(const UvPolynomial& other, Real scale, Real tol = Real(1e-14)) {
-            if (std::abs(scale) <= tol) {
-                return;
-            }
-            for (const auto& [powers, coeff] : other.coeffs) {
-                add_term(powers.first, powers.second, scale * coeff, tol);
-            }
-        }
-
-        bool empty(Real tol = Real(1e-12)) const {
-            for (const auto& [powers, coeff] : coeffs) {
-                (void)powers;
-                if (std::abs(coeff) > tol) {
-                    return false;
-                }
-            }
-            return true;
-        }
-
-        bool is_constant(Real tol = Real(1e-12)) const {
-            for (const auto& [powers, coeff] : coeffs) {
-                if (std::abs(coeff) <= tol) {
-                    continue;
-                }
-                if (powers.first != 0 || powers.second != 0) {
-                    return false;
-                }
-            }
-            return true;
-        }
-
-        Real constant_value(Real tol = Real(1e-12)) const {
-            Real value = Real(0);
-            for (const auto& [powers, coeff] : coeffs) {
-                if (std::abs(coeff) <= tol) {
-                    continue;
-                }
-                if (powers.first == 0 && powers.second == 0) {
-                    value += coeff;
-                }
-            }
-            return value;
-        }
-    };
-
-    struct ApexSeries {
-        std::vector<std::pair<int, UvPolynomial>> by_power;
-
-        void add_term(int beta, int pu, int pv, Real coeff, Real tol = Real(1e-14)) {
-            const auto found = find_or_insert(beta);
-            found->second.add_term(pu, pv, coeff, tol);
-            if (found->second.empty(tol)) {
-                by_power.erase(found);
-            }
-        }
-
-        void add_scaled(const ApexSeries& other, Real scale, Real tol = Real(1e-14)) {
-            if (std::abs(scale) <= tol) {
-                return;
-            }
-            for (const auto& [beta, poly] : other.by_power) {
-                const auto found = find_or_insert(beta);
-                found->second.add_scaled(poly, scale, tol);
-                if (found->second.empty(tol)) {
-                    by_power.erase(found);
-                }
-            }
-        }
-
-    private:
-        std::vector<std::pair<int, UvPolynomial>>::iterator find_or_insert(int beta) {
-            const auto found = std::lower_bound(
-                by_power.begin(),
-                by_power.end(),
-                beta,
-                [](const auto& entry, int value) { return entry.first < value; });
-            if (found != by_power.end() && found->first == beta) {
-                return found;
-            }
-            return by_power.insert(found, {beta, UvPolynomial{}});
-        }
-    };
-
-    using GradientSeries = std::array<ApexSeries, 3>;
-    using HessianSeries = std::array<std::array<ApexSeries, 3>, 3>;
-
-    enum class ApexLimitKind {
-        Constant,
-        DirectionDependent,
-        Singular,
-    };
-
-    enum class ApexRankStatus {
-        Exact,
-        DirectionDependent,
-        Singular,
-    };
-
-    struct ApexClassification {
-        ApexLimitKind kind{ApexLimitKind::Constant};
-        Real constant_value{0};
-        int leading_power{1};
-    };
-
-    struct ApexData {
-        std::vector<Real> values;
-        std::vector<Gradient> gradients;
-        std::vector<Hessian> hessians;
-        ApexRankStatus gradient_status{ApexRankStatus::Exact};
-        ApexRankStatus hessian_status{ApexRankStatus::Exact};
-    };
-
-    struct OrderData {
-        int order{0};
-        std::vector<math::Vector<Real, 3>> nodes;
-        std::vector<ModalTerm> modal_terms;
-        std::vector<Real> modal_to_nodal;
-        ApexData apex;
-    };
-
-    struct EvaluationScratch {
-        std::vector<Real> modal_values;
-        std::vector<Real> modal_gradient_components;
-        std::vector<Real> modal_hessian_components;
-        std::vector<Gradient> modal_gradients;
-        std::vector<Hessian> modal_hessians;
-        pyramid_modal::EvaluationPoint modal_point;
-
-        void prewarm(std::size_t max_size, std::size_t max_qpts) {
-            const std::size_t batched_size = max_size * std::max<std::size_t>(max_qpts, 1u);
-            modal_values.reserve(batched_size);
-            modal_gradient_components.reserve(batched_size * 3u);
-            modal_hessian_components.reserve(batched_size * 9u);
-            modal_gradients.reserve(max_size);
-            modal_hessians.reserve(max_size);
-        }
-    };
-
-    static EvaluationScratch& evaluation_scratch() {
-        // Scratch is intentionally thread-local: production assembly uses a
-        // persistent worker-thread team, so buffers stay warm on each worker.
-        static thread_local EvaluationScratch scratch;
-        return scratch;
-    }
-
-    static void prewarm_scratch(std::size_t max_size, std::size_t max_qpts) {
-        evaluation_scratch().prewarm(max_size, max_qpts);
-    }
-
-    static bool is_apex_point(const math::Vector<Real, 3>& xi) {
-        const Real tol = apex_coord_tolerance();
-        return std::abs(xi[0]) <= tol &&
-               std::abs(xi[1]) <= tol &&
-               std::abs(Real(1) - xi[2]) <= tol;
-    }
-
-    static bool on_degenerate_top_plane(const math::Vector<Real, 3>& xi) {
-        return basis_near_zero(Real(1) - xi[2]);
-    }
-
-    static void validate_top_plane_query(const math::Vector<Real, 3>& xi) {
-        if (on_degenerate_top_plane(xi) && !is_apex_point(xi)) [[unlikely]] {
-            throw BasisEvaluationException(
-                "Pyramid reference evaluation on the degenerate z=1 plane is only defined at the apex",
-                __FILE__, __LINE__, __func__);
-        }
-    }
-
-    static OrderData build_order_data(int order) {
-        OrderData data;
-        data.order = order;
-
-        data.nodes = build_public_nodes(order);
-        data.modal_terms = pyramid_modal::build_terms(order);
-
-        const std::size_t n = data.nodes.size();
-        if (data.modal_terms.size() != n) {
-            throw BasisConstructionException("LagrangeBasis pyramid modal basis size mismatch",
-                                             __FILE__, __LINE__, __func__);
-        }
-
-        std::vector<Real> vandermonde(n * n, Real(0));
-        for (std::size_t row = 0; row < n; ++row) {
-            pyramid_modal::EvaluationPoint modal_point;
-            pyramid_modal::prepare_evaluation_point(
-                data.modal_terms, data.nodes[row], modal_point);
-            for (std::size_t col = 0; col < n; ++col) {
-                Real value = Real(0);
-                pyramid_modal::evaluate_term(data.modal_terms[col], modal_point, value);
-                vandermonde[row * n + col] = value;
-            }
-        }
-
-        const auto inverse_result = math::invert_dense_matrix_with_diagnostics(
-            std::move(vandermonde),
-            n,
-            "LagrangeBasis pyramid Vandermonde");
-        math::validate_dense_inverse_diagnostics(
-            inverse_result,
-            n,
-            "LagrangeBasis pyramid Vandermonde");
-        const std::vector<Real>& inverse = inverse_result.inverse;
-
-        data.modal_to_nodal.assign(n * n, Real(0));
-        for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
-            for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
-                data.modal_to_nodal[basis_i * n + modal_j] =
-                    inverse[modal_j * n + basis_i];
-            }
-        }
-        data.apex = build_apex_data(data);
-        return data;
-    }
-
-    static bool has_low_order_fast_modal_to_nodal(const OrderData& data) noexcept {
-        return data.order == 1 || data.order == 2;
-    }
-
-    static const OrderData& get(int order) {
-        constexpr int kMaxOnceCachedOrder = 12;
-        if (order >= 0 && order <= kMaxOnceCachedOrder) {
-            static std::array<std::once_flag, kMaxOnceCachedOrder + 1> flags;
-            static std::array<std::unique_ptr<OrderData>, kMaxOnceCachedOrder + 1> cache;
-            const auto idx = static_cast<std::size_t>(order);
-            std::call_once(flags[idx], [idx, order]() {
-                cache[idx] = std::make_unique<OrderData>(build_order_data(order));
-            });
-            return *cache[idx];
-        }
-
-        static std::mutex fallback_mutex;
-        static std::map<int, std::unique_ptr<OrderData>> fallback_cache;
-
-        std::lock_guard<std::mutex> lock(fallback_mutex);
-        const auto found = fallback_cache.find(order);
-        if (found != fallback_cache.end()) {
-            return *found->second;
-        }
-
-        auto data = std::make_unique<OrderData>(build_order_data(order));
-        const auto [it, inserted] = fallback_cache.emplace(order, std::move(data));
-        (void)inserted;
-        return *it->second;
-    }
-
-    static void evaluate_values(const OrderData& data,
-                                const math::Vector<Real, 3>& xi,
-                                std::vector<Real>& values) {
-        validate_top_plane_query(xi);
-        if (is_apex_point(xi)) {
-            values = data.apex.values;
-            return;
-        }
-
-        auto& scratch = evaluation_scratch();
-        auto& modal = scratch.modal_values;
-        auto& modal_point = scratch.modal_point;
-        modal.resize(data.modal_terms.size());
-        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
-        for (std::size_t m = 0; m < data.modal_terms.size(); ++m) {
-            pyramid_modal::evaluate_term(data.modal_terms[m], modal_point, modal[m]);
-        }
-        if (has_low_order_fast_modal_to_nodal(data)) {
-            apply_sparse_basis_to_nodal(data, modal, values);
-        } else {
-            apply_modal_to_nodal(data, modal, values);
-        }
-    }
-
-    static void evaluate_gradients(const OrderData& data,
-                                   const math::Vector<Real, 3>& xi,
-                                   std::vector<Gradient>& gradients) {
-        validate_top_plane_query(xi);
-        if (is_apex_point(xi)) {
-            if (data.apex.gradient_status != ApexRankStatus::Exact) {
-                throw BasisEvaluationException(
-                    apex_status_message("gradient", data.apex.gradient_status),
-                    __FILE__, __LINE__, __func__);
-            }
-            gradients = data.apex.gradients;
-            return;
-        }
-
-        auto& scratch = evaluation_scratch();
-        auto& modal_gradients = scratch.modal_gradients;
-        auto& modal_point = scratch.modal_point;
-        modal_gradients.resize(data.modal_terms.size());
-        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
-        for (std::size_t m = 0; m < data.modal_terms.size(); ++m) {
-            Real value = Real(0);
-            pyramid_modal::evaluate_term(data.modal_terms[m], modal_point, value, &modal_gradients[m]);
-        }
-        if (has_low_order_fast_modal_to_nodal(data)) {
-            apply_sparse_basis_to_nodal(data, modal_gradients, gradients);
-        } else {
-            apply_modal_to_nodal(data, modal_gradients, gradients);
-        }
-    }
-
-    static void evaluate_hessians(const OrderData& data,
-                                  const math::Vector<Real, 3>& xi,
-                                  std::vector<Hessian>& hessians) {
-        validate_top_plane_query(xi);
-        if (is_apex_point(xi)) {
-            if (data.apex.hessian_status != ApexRankStatus::Exact) {
-                throw BasisEvaluationException(
-                    apex_status_message("Hessian", data.apex.hessian_status),
-                    __FILE__, __LINE__, __func__);
-            }
-            hessians = data.apex.hessians;
-            return;
-        }
-
-        auto& scratch = evaluation_scratch();
-        auto& modal_hessians = scratch.modal_hessians;
-        auto& modal_point = scratch.modal_point;
-        modal_hessians.resize(data.modal_terms.size());
-        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
-        for (std::size_t m = 0; m < data.modal_terms.size(); ++m) {
-            Real value = Real(0);
-            pyramid_modal::evaluate_term(data.modal_terms[m], modal_point, value, nullptr, &modal_hessians[m]);
-        }
-        if (has_low_order_fast_modal_to_nodal(data)) {
-            apply_sparse_basis_to_nodal(data, modal_hessians, hessians);
-        } else {
-            apply_modal_to_nodal(data, modal_hessians, hessians);
-        }
-    }
-
-    static void evaluate_all(const OrderData& data,
-                             const math::Vector<Real, 3>& xi,
-                             std::vector<Real>& values,
-                             std::vector<Gradient>& gradients,
-                             std::vector<Hessian>& hessians) {
-        validate_top_plane_query(xi);
-        if (is_apex_point(xi)) {
-            if (data.apex.gradient_status != ApexRankStatus::Exact) {
-                throw BasisEvaluationException(
-                    apex_status_message("gradient", data.apex.gradient_status),
-                    __FILE__, __LINE__, __func__);
-            }
-            if (data.apex.hessian_status != ApexRankStatus::Exact) {
-                throw BasisEvaluationException(
-                    apex_status_message("Hessian", data.apex.hessian_status),
-                    __FILE__, __LINE__, __func__);
-            }
-            values = data.apex.values;
-            gradients = data.apex.gradients;
-            hessians = data.apex.hessians;
-            return;
-        }
-
-        const std::size_t n = data.modal_terms.size();
-        auto& scratch = evaluation_scratch();
-        auto& modal_values = scratch.modal_values;
-        auto& modal_gradients = scratch.modal_gradients;
-        auto& modal_hessians = scratch.modal_hessians;
-        auto& modal_point = scratch.modal_point;
-        modal_values.resize(n);
-        modal_gradients.resize(n);
-        modal_hessians.resize(n);
-        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
-
-        for (std::size_t m = 0; m < n; ++m) {
-            pyramid_modal::evaluate_term(
-                data.modal_terms[m], modal_point, modal_values[m], &modal_gradients[m], &modal_hessians[m]);
-        }
-
-        if (has_low_order_fast_modal_to_nodal(data)) {
-            apply_sparse_basis_to_nodal_all(
-                data, modal_values, modal_gradients, modal_hessians, values, gradients, hessians);
-            return;
-        }
-
-        values.resize(n);
-        gradients.resize(n);
-        hessians.resize(n);
-        for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
-            const Real* row = data.modal_to_nodal.data() + basis_i * n;
-            Gradient gradient{};
-            Hessian hessian{};
-            Real value = Real(0);
-            for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
-                const Real coeff = row[modal_j];
-                value += coeff * modal_values[modal_j];
-
-                const Real* modal_gradient = modal_gradients[modal_j].data();
-                gradient[0] += coeff * modal_gradient[0];
-                gradient[1] += coeff * modal_gradient[1];
-                gradient[2] += coeff * modal_gradient[2];
-
-                const Real* modal_hessian = modal_hessians[modal_j].data();
-                Real* hessian_data = hessian.data();
-                hessian_data[0] += coeff * modal_hessian[0];
-                hessian_data[1] += coeff * modal_hessian[1];
-                hessian_data[2] += coeff * modal_hessian[2];
-                hessian_data[4] += coeff * modal_hessian[4];
-                hessian_data[5] += coeff * modal_hessian[5];
-                hessian_data[8] += coeff * modal_hessian[8];
-            }
-            values[basis_i] = value;
-            gradients[basis_i] = gradient;
-            Real* hessian_data = hessian.data();
-            hessian_data[3] = hessian_data[1];
-            hessian_data[6] = hessian_data[2];
-            hessian_data[7] = hessian_data[5];
-            hessians[basis_i] = hessian;
-        }
-    }
-
-    static void evaluate_values_to(const OrderData& data,
-                                   const math::Vector<Real, 3>& xi,
-                                   Real* SVMP_RESTRICT values_out) {
-        validate_top_plane_query(xi);
-        if (is_apex_point(xi)) {
-            std::copy(data.apex.values.begin(), data.apex.values.end(), values_out);
-            return;
-        }
-
-        auto& scratch = evaluation_scratch();
-        auto& modal = scratch.modal_values;
-        auto& modal_point = scratch.modal_point;
-        modal.resize(data.modal_terms.size());
-        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
-        for (std::size_t m = 0; m < data.modal_terms.size(); ++m) {
-            pyramid_modal::evaluate_term(data.modal_terms[m], modal_point, modal[m]);
-        }
-        if (has_low_order_fast_modal_to_nodal(data)) {
-            apply_sparse_basis_to_nodal_to(data, modal, values_out);
-        } else {
-            apply_modal_to_nodal_to(data, modal, values_out);
-        }
-    }
-
-    static void evaluate_gradients_to(const OrderData& data,
-                                      const math::Vector<Real, 3>& xi,
-                                      Real* SVMP_RESTRICT gradients_out) {
-        validate_top_plane_query(xi);
-        if (is_apex_point(xi)) {
-            if (data.apex.gradient_status != ApexRankStatus::Exact) {
-                throw BasisEvaluationException(
-                    apex_status_message("gradient", data.apex.gradient_status),
-                    __FILE__, __LINE__, __func__);
-            }
-            for (std::size_t i = 0; i < data.apex.gradients.size(); ++i) {
-                gradients_out[i * 3u + 0u] = data.apex.gradients[i][0];
-                gradients_out[i * 3u + 1u] = data.apex.gradients[i][1];
-                gradients_out[i * 3u + 2u] = data.apex.gradients[i][2];
-            }
-            return;
-        }
-
-        auto& scratch = evaluation_scratch();
-        auto& modal_gradients = scratch.modal_gradients;
-        auto& modal_point = scratch.modal_point;
-        modal_gradients.resize(data.modal_terms.size());
-        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
-        for (std::size_t m = 0; m < data.modal_terms.size(); ++m) {
-            Real value = Real(0);
-            pyramid_modal::evaluate_term(data.modal_terms[m], modal_point, value, &modal_gradients[m]);
-        }
-        if (has_low_order_fast_modal_to_nodal(data)) {
-            apply_sparse_basis_to_nodal_to(data, modal_gradients, gradients_out);
-        } else {
-            apply_modal_to_nodal_to(data, modal_gradients, gradients_out);
-        }
-    }
-
-    static void evaluate_hessians_to(const OrderData& data,
-                                     const math::Vector<Real, 3>& xi,
-                                     Real* SVMP_RESTRICT hessians_out) {
-        validate_top_plane_query(xi);
-        if (is_apex_point(xi)) {
-            if (data.apex.hessian_status != ApexRankStatus::Exact) {
-                throw BasisEvaluationException(
-                    apex_status_message("Hessian", data.apex.hessian_status),
-                    __FILE__, __LINE__, __func__);
-            }
-            for (std::size_t i = 0; i < data.apex.hessians.size(); ++i) {
-                store_hessian(data.apex.hessians[i], hessians_out + i * 9u);
-            }
-            return;
-        }
-
-        auto& scratch = evaluation_scratch();
-        auto& modal_hessians = scratch.modal_hessians;
-        auto& modal_point = scratch.modal_point;
-        modal_hessians.resize(data.modal_terms.size());
-        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
-        for (std::size_t m = 0; m < data.modal_terms.size(); ++m) {
-            Real value = Real(0);
-            pyramid_modal::evaluate_term(data.modal_terms[m], modal_point, value, nullptr, &modal_hessians[m]);
-        }
-        if (has_low_order_fast_modal_to_nodal(data)) {
-            apply_sparse_basis_to_nodal_to(data, modal_hessians, hessians_out);
-        } else {
-            apply_modal_to_nodal_to(data, modal_hessians, hessians_out);
-        }
-    }
-
-    static void evaluate_all_to(const OrderData& data,
-                                const math::Vector<Real, 3>& xi,
-                                Real* SVMP_RESTRICT values_out,
-                                Real* SVMP_RESTRICT gradients_out,
-                                Real* SVMP_RESTRICT hessians_out) {
-        validate_top_plane_query(xi);
-        if (is_apex_point(xi)) {
-            if (data.apex.gradient_status != ApexRankStatus::Exact) {
-                throw BasisEvaluationException(
-                    apex_status_message("gradient", data.apex.gradient_status),
-                    __FILE__, __LINE__, __func__);
-            }
-            if (data.apex.hessian_status != ApexRankStatus::Exact) {
-                throw BasisEvaluationException(
-                    apex_status_message("Hessian", data.apex.hessian_status),
-                    __FILE__, __LINE__, __func__);
-            }
-            std::copy(data.apex.values.begin(), data.apex.values.end(), values_out);
-            for (std::size_t i = 0; i < data.apex.gradients.size(); ++i) {
-                gradients_out[i * 3u + 0u] = data.apex.gradients[i][0];
-                gradients_out[i * 3u + 1u] = data.apex.gradients[i][1];
-                gradients_out[i * 3u + 2u] = data.apex.gradients[i][2];
-            }
-            for (std::size_t i = 0; i < data.apex.hessians.size(); ++i) {
-                const Real* hessian = data.apex.hessians[i].data();
-                std::copy(hessian, hessian + 9u, hessians_out + i * 9u);
-            }
-            return;
-        }
-
-        const std::size_t n = data.modal_terms.size();
-        auto& scratch = evaluation_scratch();
-        auto& modal_values = scratch.modal_values;
-        auto& modal_gradients = scratch.modal_gradients;
-        auto& modal_hessians = scratch.modal_hessians;
-        auto& modal_point = scratch.modal_point;
-        modal_values.resize(n);
-        modal_gradients.resize(n);
-        modal_hessians.resize(n);
-        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
-
-        for (std::size_t m = 0; m < n; ++m) {
-            pyramid_modal::evaluate_term(
-                data.modal_terms[m], modal_point, modal_values[m], &modal_gradients[m], &modal_hessians[m]);
-        }
-
-        if (has_low_order_fast_modal_to_nodal(data)) {
-            apply_sparse_basis_to_nodal_all_to(
-                data, modal_values, modal_gradients, modal_hessians, values_out, gradients_out, hessians_out);
-            return;
-        }
-
-        for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
-            const Real* row = data.modal_to_nodal.data() + basis_i * n;
-            Real value = Real(0);
-            Real gradient[3] = {Real(0), Real(0), Real(0)};
-            Real hessian[9] = {};
-            for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
-                const Real coeff = row[modal_j];
-                value += coeff * modal_values[modal_j];
-
-                const Real* modal_gradient = modal_gradients[modal_j].data();
-                gradient[0] += coeff * modal_gradient[0];
-                gradient[1] += coeff * modal_gradient[1];
-                gradient[2] += coeff * modal_gradient[2];
-
-                const Real* modal_hessian = modal_hessians[modal_j].data();
-                hessian[0] += coeff * modal_hessian[0];
-                hessian[1] += coeff * modal_hessian[1];
-                hessian[2] += coeff * modal_hessian[2];
-                hessian[4] += coeff * modal_hessian[4];
-                hessian[5] += coeff * modal_hessian[5];
-                hessian[8] += coeff * modal_hessian[8];
-            }
-
-            values_out[basis_i] = value;
-            Real* gradient_out = gradients_out + basis_i * 3u;
-            gradient_out[0] = gradient[0];
-            gradient_out[1] = gradient[1];
-            gradient_out[2] = gradient[2];
-
-            Real* hessian_out = hessians_out + basis_i * 9u;
-            hessian_out[0] = hessian[0];
-            hessian_out[1] = hessian[1];
-            hessian_out[2] = hessian[2];
-            hessian_out[3] = hessian[1];
-            hessian_out[4] = hessian[4];
-            hessian_out[5] = hessian[5];
-            hessian_out[6] = hessian[2];
-            hessian_out[7] = hessian[5];
-            hessian_out[8] = hessian[8];
-        }
-    }
-
-    static void evaluate_at_quadrature_points_strided(
-        const OrderData& data,
-        const std::vector<math::Vector<Real, 3>>& points,
-        std::size_t output_stride,
-        Real* SVMP_RESTRICT values_out,
-        Real* SVMP_RESTRICT gradients_out,
-        Real* SVMP_RESTRICT hessians_out) {
-        const unsigned mask = (values_out != nullptr ? 1u : 0u) |
-                              (gradients_out != nullptr ? 2u : 0u) |
-                              (hessians_out != nullptr ? 4u : 0u);
-        switch (mask) {
-            case 0u:
-                validate_strided_points(points);
-                return;
-            case 1u:
-                evaluate_at_quadrature_points_strided_impl<true, false, false>(
-                    data, points, output_stride, values_out, gradients_out, hessians_out);
-                return;
-            case 2u:
-                evaluate_at_quadrature_points_strided_impl<false, true, false>(
-                    data, points, output_stride, values_out, gradients_out, hessians_out);
-                return;
-            case 3u:
-                evaluate_at_quadrature_points_strided_impl<true, true, false>(
-                    data, points, output_stride, values_out, gradients_out, hessians_out);
-                return;
-            case 4u:
-                evaluate_at_quadrature_points_strided_impl<false, false, true>(
-                    data, points, output_stride, values_out, gradients_out, hessians_out);
-                return;
-            case 5u:
-                evaluate_at_quadrature_points_strided_impl<true, false, true>(
-                    data, points, output_stride, values_out, gradients_out, hessians_out);
-                return;
-            case 6u:
-                evaluate_at_quadrature_points_strided_impl<false, true, true>(
-                    data, points, output_stride, values_out, gradients_out, hessians_out);
-                return;
-            case 7u:
-                evaluate_at_quadrature_points_strided_impl<true, true, true>(
-                    data, points, output_stride, values_out, gradients_out, hessians_out);
-                return;
-            default:
-                return;
-        }
-    }
-
-private:
-    static void validate_strided_points(const std::vector<math::Vector<Real, 3>>& points) {
-        for (const auto& xi : points) {
-            validate_top_plane_query(xi);
-        }
-    }
-
-    template <bool NeedValues, bool NeedGradients, bool NeedHessians>
-    static void write_apex_strided(const OrderData& data,
-                                   std::size_t q,
-                                   std::size_t output_stride,
-                                   Real* SVMP_RESTRICT values_out,
-                                   Real* SVMP_RESTRICT gradients_out,
-                                   Real* SVMP_RESTRICT hessians_out) {
-        const std::size_t n = data.modal_terms.size();
-        if constexpr (NeedValues) {
-            for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
-                values_out[basis_i * output_stride + q] = data.apex.values[basis_i];
-            }
-        }
-        if constexpr (NeedGradients) {
-            if (data.apex.gradient_status != ApexRankStatus::Exact) {
-                throw BasisEvaluationException(
-                    apex_status_message("gradient", data.apex.gradient_status),
-                    __FILE__, __LINE__, __func__);
-            }
-            for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
-                Real* g = gradients_out + basis_i * 3u * output_stride;
-                g[0u * output_stride + q] = data.apex.gradients[basis_i][0];
-                g[1u * output_stride + q] = data.apex.gradients[basis_i][1];
-                g[2u * output_stride + q] = data.apex.gradients[basis_i][2];
-            }
-        }
-        if constexpr (NeedHessians) {
-            if (data.apex.hessian_status != ApexRankStatus::Exact) {
-                throw BasisEvaluationException(
-                    apex_status_message("Hessian", data.apex.hessian_status),
-                    __FILE__, __LINE__, __func__);
-            }
-            for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
-                const Real* hessian = data.apex.hessians[basis_i].data();
-                Real* H = hessians_out + basis_i * 9u * output_stride;
-                for (std::size_t component = 0; component < 9u; ++component) {
-                    H[component * output_stride + q] = hessian[component];
-                }
-            }
-        }
-    }
-
-    template <int Px,
-              int Py,
-              int Pz,
-              int DenomPower,
-              bool NeedValues,
-              bool NeedGradients,
-              bool NeedHessians>
-    static void fill_low_order_modal_jet(std::size_t modal_i,
-                                         const Real* SVMP_RESTRICT xp,
-                                         const Real* SVMP_RESTRICT yp,
-                                         const Real* SVMP_RESTRICT zp,
-                                         const Real* SVMP_RESTRICT inv_tp,
-                                         Real* SVMP_RESTRICT modal_values,
-                                         Real (*SVMP_RESTRICT modal_gradients)[3],
-                                         Real (*SVMP_RESTRICT modal_hessians)[9]) {
-        const Real xy_base = xp[Px] * yp[Py];
-        const Real base = xy_base * zp[Pz];
-        const Real inv_denom = inv_tp[DenomPower];
-        const Real value = base * inv_denom;
-
-        if constexpr (NeedValues) {
-            modal_values[modal_i] = value;
-        }
-        if constexpr (NeedGradients) {
-            Real* g = modal_gradients[modal_i];
-            if constexpr (Px > 0) {
-                g[0] = static_cast<Real>(Px) * xp[Px - 1] * yp[Py] * zp[Pz] * inv_denom;
-            } else {
-                g[0] = Real(0);
-            }
-            if constexpr (Py > 0) {
-                g[1] = static_cast<Real>(Py) * xp[Px] * yp[Py - 1] * zp[Pz] * inv_denom;
-            } else {
-                g[1] = Real(0);
-            }
-            Real gz = Real(0);
-            if constexpr (Pz > 0) {
-                gz += static_cast<Real>(Pz) * xy_base * zp[Pz - 1] * inv_denom;
-            }
-            if constexpr (DenomPower > 0) {
-                gz += static_cast<Real>(DenomPower) * base * inv_tp[DenomPower + 1];
-            }
-            g[2] = gz;
-        }
-        if constexpr (NeedHessians) {
-            Real* H = modal_hessians[modal_i];
-            if constexpr (Px > 1) {
-                H[0] = static_cast<Real>(Px * (Px - 1)) *
-                       xp[Px - 2] * yp[Py] * zp[Pz] * inv_denom;
-            } else {
-                H[0] = Real(0);
-            }
-            if constexpr (Py > 1) {
-                H[4] = static_cast<Real>(Py * (Py - 1)) *
-                       xp[Px] * yp[Py - 2] * zp[Pz] * inv_denom;
-            } else {
-                H[4] = Real(0);
-            }
-            Real hxy = Real(0);
-            if constexpr (Px > 0 && Py > 0) {
-                hxy = static_cast<Real>(Px * Py) *
-                      xp[Px - 1] * yp[Py - 1] * zp[Pz] * inv_denom;
-            }
-            H[1] = hxy;
-            H[3] = hxy;
-
-            Real hxz = Real(0);
-            if constexpr (Px > 0) {
-                constexpr Real px_real = static_cast<Real>(Px);
-                const Real x_deriv_y = px_real * xp[Px - 1] * yp[Py];
-                if constexpr (Pz > 0) {
-                    hxz += x_deriv_y * static_cast<Real>(Pz) *
-                           zp[Pz - 1] * inv_denom;
-                }
-                if constexpr (DenomPower > 0) {
-                    hxz += x_deriv_y * static_cast<Real>(DenomPower) *
-                           zp[Pz] * inv_tp[DenomPower + 1];
-                }
-            }
-            H[2] = hxz;
-            H[6] = hxz;
-
-            Real hyz = Real(0);
-            if constexpr (Py > 0) {
-                constexpr Real py_real = static_cast<Real>(Py);
-                const Real x_y_deriv = py_real * xp[Px] * yp[Py - 1];
-                if constexpr (Pz > 0) {
-                    hyz += x_y_deriv * static_cast<Real>(Pz) *
-                           zp[Pz - 1] * inv_denom;
-                }
-                if constexpr (DenomPower > 0) {
-                    hyz += x_y_deriv * static_cast<Real>(DenomPower) *
-                           zp[Pz] * inv_tp[DenomPower + 1];
-                }
-            }
-            H[5] = hyz;
-            H[7] = hyz;
-
-            Real hzz = Real(0);
-            if constexpr (Pz > 1) {
-                hzz += static_cast<Real>(Pz * (Pz - 1)) *
-                       xy_base * zp[Pz - 2] * inv_denom;
-            }
-            if constexpr (Pz > 0 && DenomPower > 0) {
-                hzz += static_cast<Real>(2 * Pz * DenomPower) * xy_base *
-                       zp[Pz - 1] * inv_tp[DenomPower + 1];
-            }
-            if constexpr (DenomPower > 0) {
-                hzz += static_cast<Real>(DenomPower * (DenomPower + 1)) *
-                       base * inv_tp[DenomPower + 2];
-            }
-            H[8] = hzz;
-        }
-    }
-
-    template <bool NeedValues, bool NeedGradients, bool NeedHessians>
-    static void evaluate_low_order_modal_jets(const OrderData& data,
-                                              const math::Vector<Real, 3>& xi,
-                                              Real* SVMP_RESTRICT modal_values,
-                                              Real (*SVMP_RESTRICT modal_gradients)[3],
-                                              Real (*SVMP_RESTRICT modal_hessians)[9]) {
-        const Real x = xi[0];
-        const Real y = xi[1];
-        const Real z = xi[2];
-        const Real inv_t = Real(1) / (Real(1) - z);
-        const Real xp[3] = {Real(1), x, x * x};
-        const Real yp[3] = {Real(1), y, y * y};
-        const Real zp[3] = {Real(1), z, z * z};
-        Real inv_tp[5] = {Real(1), inv_t, Real(0), Real(0), Real(0)};
-        inv_tp[2] = inv_tp[1] * inv_t;
-        inv_tp[3] = inv_tp[2] * inv_t;
-        inv_tp[4] = inv_tp[3] * inv_t;
-
-        fill_low_order_modal_jet<0, 0, 0, 0, NeedValues, NeedGradients, NeedHessians>(
-            0u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-        fill_low_order_modal_jet<1, 0, 0, 0, NeedValues, NeedGradients, NeedHessians>(
-            1u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-        if (data.order == 1) {
-            fill_low_order_modal_jet<0, 1, 0, 0, NeedValues, NeedGradients, NeedHessians>(
-                2u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-            fill_low_order_modal_jet<1, 1, 0, 1, NeedValues, NeedGradients, NeedHessians>(
-                3u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-            fill_low_order_modal_jet<0, 0, 1, 0, NeedValues, NeedGradients, NeedHessians>(
-                4u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-            return;
-        }
-
-        fill_low_order_modal_jet<2, 0, 0, 0, NeedValues, NeedGradients, NeedHessians>(
-            2u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-        fill_low_order_modal_jet<0, 1, 0, 0, NeedValues, NeedGradients, NeedHessians>(
-            3u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-        fill_low_order_modal_jet<1, 1, 0, 1, NeedValues, NeedGradients, NeedHessians>(
-            4u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-        fill_low_order_modal_jet<2, 1, 0, 1, NeedValues, NeedGradients, NeedHessians>(
-            5u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-        fill_low_order_modal_jet<0, 2, 0, 0, NeedValues, NeedGradients, NeedHessians>(
-            6u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-        fill_low_order_modal_jet<1, 2, 0, 1, NeedValues, NeedGradients, NeedHessians>(
-            7u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-        fill_low_order_modal_jet<2, 2, 0, 2, NeedValues, NeedGradients, NeedHessians>(
-            8u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-        fill_low_order_modal_jet<0, 0, 1, 0, NeedValues, NeedGradients, NeedHessians>(
-            9u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-        fill_low_order_modal_jet<1, 0, 1, 0, NeedValues, NeedGradients, NeedHessians>(
-            10u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-        fill_low_order_modal_jet<0, 1, 1, 0, NeedValues, NeedGradients, NeedHessians>(
-            11u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-        fill_low_order_modal_jet<1, 1, 1, 1, NeedValues, NeedGradients, NeedHessians>(
-            12u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-        fill_low_order_modal_jet<0, 0, 2, 0, NeedValues, NeedGradients, NeedHessians>(
-            13u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-    }
-
-    template <bool NeedValues, bool NeedGradients, bool NeedHessians>
-    static bool try_evaluate_low_order_strided(
-        const OrderData& data,
-        const std::vector<math::Vector<Real, 3>>& points,
-        std::size_t output_stride,
-        Real* SVMP_RESTRICT values_out,
-        Real* SVMP_RESTRICT gradients_out,
-        Real* SVMP_RESTRICT hessians_out) {
-        if (!has_low_order_fast_modal_to_nodal(data)) {
-            return false;
-        }
-        for (const auto& xi : points) {
-            validate_top_plane_query(xi);
-            if (is_apex_point(xi)) {
-                return false;
-            }
-        }
-
-        Real modal_values[14];
-        Real modal_gradients[14][3];
-        Real modal_hessians[14][9];
-        for (std::size_t q = 0; q < points.size(); ++q) {
-            evaluate_low_order_modal_jets<NeedValues, NeedGradients, NeedHessians>(
-                data, points[q], modal_values, modal_gradients, modal_hessians);
-            if constexpr (NeedValues) {
-                apply_low_order_combination(
-                    data,
-                    1u,
-                    [&](std::size_t modal_i, std::size_t) {
-                        return modal_values[modal_i];
-                    },
-                    [&](std::size_t basis_i, std::size_t, Real value) {
-                        values_out[basis_i * output_stride + q] = value;
-                    });
-            }
-            if constexpr (NeedGradients) {
-                apply_low_order_combination(
-                    data,
-                    3u,
-                    [&](std::size_t modal_i, std::size_t component) {
-                        return modal_gradients[modal_i][component];
-                    },
-                    [&](std::size_t basis_i, std::size_t component, Real value) {
-                        gradients_out[basis_i * 3u * output_stride +
-                                      component * output_stride + q] = value;
-                    });
-            }
-            if constexpr (NeedHessians) {
-                apply_low_order_combination(
-                    data,
-                    9u,
-                    [&](std::size_t modal_i, std::size_t component) {
-                        return modal_hessians[modal_i][component];
-                    },
-                    [&](std::size_t basis_i, std::size_t component, Real value) {
-                        hessians_out[basis_i * 9u * output_stride +
-                                     component * output_stride + q] = value;
-                    });
-            }
-        }
-        return true;
-    }
-
-    template <bool NeedValues, bool NeedGradients, bool NeedHessians>
-    static void evaluate_at_quadrature_points_strided_impl(
-        const OrderData& data,
-        const std::vector<math::Vector<Real, 3>>& points,
-        std::size_t output_stride,
-        Real* SVMP_RESTRICT values_out,
-        Real* SVMP_RESTRICT gradients_out,
-        Real* SVMP_RESTRICT hessians_out) {
-        const std::size_t n = data.modal_terms.size();
-        if (points.empty() || n == 0u) {
-            return;
-        }
-        if (try_evaluate_low_order_strided<NeedValues, NeedGradients, NeedHessians>(
-                data, points, output_stride, values_out, gradients_out, hessians_out)) {
-            return;
-        }
-
-        auto& scratch = evaluation_scratch();
-        auto& modal_values = scratch.modal_values;
-        auto& modal_gradients = scratch.modal_gradients;
-        auto& modal_hessians = scratch.modal_hessians;
-        auto& modal_point = scratch.modal_point;
-        if constexpr (NeedValues) {
-            modal_values.resize(n);
-        }
-        if constexpr (NeedGradients) {
-            modal_gradients.resize(n);
-        }
-        if constexpr (NeedHessians) {
-            modal_hessians.resize(n);
-        }
-        const bool use_fast_modal_to_nodal = has_low_order_fast_modal_to_nodal(data);
-
-        if (!use_fast_modal_to_nodal) {
-            bool has_apex_query = false;
-            for (const auto& xi : points) {
-                validate_top_plane_query(xi);
-                has_apex_query = has_apex_query || is_apex_point(xi);
-            }
-
-            if (!has_apex_query) {
-                const std::size_t num_qpts = points.size();
-                if constexpr (NeedValues) {
-                    modal_values.resize(n * num_qpts);
-                }
-                if constexpr (NeedGradients) {
-                    scratch.modal_gradient_components.resize(n * 3u * num_qpts);
-                }
-                if constexpr (NeedHessians) {
-                    scratch.modal_hessian_components.resize(n * 9u * num_qpts);
-                }
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    const auto& xi = points[q];
-                    pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
-                    for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
-                        Real modal_value = Real(0);
-                        Gradient modal_gradient{};
-                        Hessian modal_hessian{};
-                        pyramid_modal::evaluate_term(
-                            data.modal_terms[modal_j],
-                            modal_point,
-                            modal_value,
-                            NeedGradients ? &modal_gradient : nullptr,
-                            NeedHessians ? &modal_hessian : nullptr);
-                        if constexpr (NeedValues) {
-                            modal_values[modal_j * num_qpts + q] = modal_value;
-                        }
-                        if constexpr (NeedGradients) {
-                            for (std::size_t component = 0; component < 3u; ++component) {
-                                scratch.modal_gradient_components[
-                                    (modal_j * 3u + component) * num_qpts + q] =
-                                    modal_gradient[component];
-                            }
-                        }
-                        if constexpr (NeedHessians) {
-                            for (std::size_t component = 0; component < 9u; ++component) {
-                                scratch.modal_hessian_components[
-                                    (modal_j * 9u + component) * num_qpts + q] =
-                                    modal_hessian.data()[component];
-                            }
-                        }
-                    }
-                }
-
-                const Real* transform = data.modal_to_nodal.data();
-                if constexpr (NeedValues) {
-                    math::dense_transform_batched_row_major(
-                        transform,
-                        n,
-                        n,
-                        modal_values.data(),
-                        num_qpts,
-                        values_out,
-                        output_stride,
-                        num_qpts);
-                }
-                if constexpr (NeedGradients) {
-                    for (std::size_t component = 0; component < 3u; ++component) {
-                        math::dense_transform_batched_row_major(
-                            transform,
-                            n,
-                            n,
-                            scratch.modal_gradient_components.data() + component * num_qpts,
-                            3u * num_qpts,
-                            gradients_out + component * output_stride,
-                            3u * output_stride,
-                            num_qpts);
-                    }
-                }
-                if constexpr (NeedHessians) {
-                    for (std::size_t component = 0; component < 9u; ++component) {
-                        math::dense_transform_batched_row_major(
-                            transform,
-                            n,
-                            n,
-                            scratch.modal_hessian_components.data() + component * num_qpts,
-                            9u * num_qpts,
-                            hessians_out + component * output_stride,
-                            9u * output_stride,
-                            num_qpts);
-                    }
-                }
-                return;
-            }
-        }
-
-        for (std::size_t q = 0; q < points.size(); ++q) {
-            const auto& xi = points[q];
-            validate_top_plane_query(xi);
-
-            if (is_apex_point(xi)) {
-                write_apex_strided<NeedValues, NeedGradients, NeedHessians>(
-                    data, q, output_stride, values_out, gradients_out, hessians_out);
-                continue;
-            }
-
-            pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
-            for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
-                Gradient* gradient_out = nullptr;
-                Hessian* hessian_out = nullptr;
-                if constexpr (NeedGradients) {
-                    gradient_out = &modal_gradients[modal_j];
-                }
-                if constexpr (NeedHessians) {
-                    hessian_out = &modal_hessians[modal_j];
-                }
-                if constexpr (NeedValues) {
-                    pyramid_modal::evaluate_term(
-                        data.modal_terms[modal_j],
-                        modal_point,
-                        modal_values[modal_j],
-                        gradient_out,
-                        hessian_out);
-                } else {
-                    Real value = Real(0);
-                    pyramid_modal::evaluate_term(
-                        data.modal_terms[modal_j],
-                        modal_point,
-                        value,
-                        gradient_out,
-                        hessian_out);
-                }
-            }
-
-            if (use_fast_modal_to_nodal) {
-                if constexpr (NeedValues) {
-                    apply_low_order_combination(
-                        data,
-                        1u,
-                        [&](std::size_t modal_i, std::size_t) {
-                            return modal_values[modal_i];
-                        },
-                        [&](std::size_t basis_i, std::size_t, Real value) {
-                            values_out[basis_i * output_stride + q] = value;
-                        });
-                }
-                if constexpr (NeedGradients) {
-                    apply_low_order_combination(
-                        data,
-                        3u,
-                        [&](std::size_t modal_i, std::size_t component) {
-                            return modal_gradients[modal_i][component];
-                        },
-                        [&](std::size_t basis_i, std::size_t component, Real value) {
-                            gradients_out[basis_i * 3u * output_stride +
-                                          component * output_stride + q] = value;
-                        });
-                }
-                if constexpr (NeedHessians) {
-                    apply_low_order_combination(
-                        data,
-                        9u,
-                        [&](std::size_t modal_i, std::size_t component) {
-                            return modal_hessians[modal_i].data()[component];
-                        },
-                        [&](std::size_t basis_i, std::size_t component, Real value) {
-                            hessians_out[basis_i * 9u * output_stride +
-                                         component * output_stride + q] = value;
-                        });
-                }
-                continue;
-            }
-
-            for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
-                const Real* matrix_row = data.modal_to_nodal.data() + basis_i * n;
-                [[maybe_unused]] Real value = Real(0);
-                [[maybe_unused]] std::array<Real, 3> gradient{};
-                [[maybe_unused]] std::array<Real, 9> hessian{};
-
-                for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
-                    const Real coeff = matrix_row[modal_j];
-                    if constexpr (NeedValues) {
-                        value += coeff * modal_values[modal_j];
-                    }
-                    if constexpr (NeedGradients) {
-                        const Real* modal_gradient = modal_gradients[modal_j].data();
-                        gradient[0] += coeff * modal_gradient[0];
-                        gradient[1] += coeff * modal_gradient[1];
-                        gradient[2] += coeff * modal_gradient[2];
-                    }
-                    if constexpr (NeedHessians) {
-                        const Real* modal_hessian = modal_hessians[modal_j].data();
-                        for (std::size_t component = 0; component < 9u; ++component) {
-                            hessian[component] += coeff * modal_hessian[component];
-                        }
-                    }
-                }
-
-                if constexpr (NeedValues) {
-                    values_out[basis_i * output_stride + q] = value;
-                }
-                if constexpr (NeedGradients) {
-                    Real* g = gradients_out + basis_i * 3u * output_stride;
-                    g[0u * output_stride + q] = gradient[0];
-                    g[1u * output_stride + q] = gradient[1];
-                    g[2u * output_stride + q] = gradient[2];
-                }
-                if constexpr (NeedHessians) {
-                    Real* H = hessians_out + basis_i * 9u * output_stride;
-                    for (std::size_t component = 0; component < 9u; ++component) {
-                        H[component * output_stride + q] = hessian[component];
-                    }
-                }
-            }
-        }
-    }
-
-    static Real apex_coord_tolerance() noexcept {
-        return basis_scaled_tolerance();
-    }
-
-    // Coefficient pruning for symbolic apex series, not a reference-coordinate
-    // roundoff test. Keep this strict and separate from BasisTolerance.
-    static constexpr Real kSeriesTolerance = Real(1e-12);
-
-    static Real binomial_coeff(int n, int k) {
-        if (k < 0 || k > n) {
-            return Real(0);
-        }
-        if (k == 0 || k == n) {
-            return Real(1);
-        }
-        k = std::min(k, n - k);
-        Real coeff = Real(1);
-        for (int i = 1; i <= k; ++i) {
-            coeff *= static_cast<Real>(n - (k - i));
-            coeff /= static_cast<Real>(i);
-        }
-        return coeff;
-    }
-
-    static void add_z_expansion(ApexSeries& series,
-                                int z_power,
-                                int beta0,
-                                int pu,
-                                int pv,
-                                Real coeff) {
-        for (int q = 0; q <= z_power; ++q) {
-            const Real z_coeff = coeff * binomial_coeff(z_power, q) *
-                                 ((q % 2 == 0) ? Real(1) : Real(-1));
-            series.add_term(beta0 + q, pu, pv, z_coeff, kSeriesTolerance);
-        }
-    }
-
-    static ApexSeries modal_value_asymptotic(const ModalTerm& term) {
-        ApexSeries series;
-        add_z_expansion(series,
-                        term.pz,
-                        term.px + term.py - term.denom_power,
-                        term.px,
-                        term.py,
-                        Real(1));
-        return series;
-    }
-
-    static GradientSeries modal_gradient_asymptotic(const ModalTerm& term) {
-        GradientSeries gradient_series{};
-
-        if (term.px > 0) {
-            add_z_expansion(gradient_series[0],
-                            term.pz,
-                            term.px - 1 + term.py - term.denom_power,
-                            term.px - 1,
-                            term.py,
-                            static_cast<Real>(term.px));
-        }
-
-        if (term.py > 0) {
-            add_z_expansion(gradient_series[1],
-                            term.pz,
-                            term.px + term.py - 1 - term.denom_power,
-                            term.px,
-                            term.py - 1,
-                            static_cast<Real>(term.py));
-        }
-
-        if (term.pz > 0) {
-            add_z_expansion(gradient_series[2],
-                            term.pz - 1,
-                            term.px + term.py - term.denom_power,
-                            term.px,
-                            term.py,
-                            static_cast<Real>(term.pz));
-        }
-        if (term.denom_power > 0) {
-            add_z_expansion(gradient_series[2],
-                            term.pz,
-                            term.px + term.py - term.denom_power - 1,
-                            term.px,
-                            term.py,
-                            static_cast<Real>(term.denom_power));
-        }
-
-        return gradient_series;
-    }
-
-    static HessianSeries modal_hessian_asymptotic(const ModalTerm& term) {
-        HessianSeries hessian_series{};
-
-        if (term.px > 1) {
-            add_z_expansion(hessian_series[0][0],
-                            term.pz,
-                            term.px - 2 + term.py - term.denom_power,
-                            term.px - 2,
-                            term.py,
-                            static_cast<Real>(term.px * (term.px - 1)));
-        }
-
-        if (term.py > 1) {
-            add_z_expansion(hessian_series[1][1],
-                            term.pz,
-                            term.px + term.py - 2 - term.denom_power,
-                            term.px,
-                            term.py - 2,
-                            static_cast<Real>(term.py * (term.py - 1)));
-        }
-
-        if (term.px > 0 && term.py > 0) {
-            add_z_expansion(hessian_series[0][1],
-                            term.pz,
-                            term.px + term.py - 2 - term.denom_power,
-                            term.px - 1,
-                            term.py - 1,
-                            static_cast<Real>(term.px * term.py));
-            hessian_series[1][0] = hessian_series[0][1];
-        }
-
-        if (term.px > 0 && term.pz > 0) {
-            add_z_expansion(hessian_series[0][2],
-                            term.pz - 1,
-                            term.px - 1 + term.py - term.denom_power,
-                            term.px - 1,
-                            term.py,
-                            static_cast<Real>(term.px * term.pz));
-        }
-        if (term.px > 0 && term.denom_power > 0) {
-            add_z_expansion(hessian_series[0][2],
-                            term.pz,
-                            term.px - 1 + term.py - term.denom_power - 1,
-                            term.px - 1,
-                            term.py,
-                            static_cast<Real>(term.px * term.denom_power));
-        }
-        hessian_series[2][0] = hessian_series[0][2];
-
-        if (term.py > 0 && term.pz > 0) {
-            add_z_expansion(hessian_series[1][2],
-                            term.pz - 1,
-                            term.px + term.py - 1 - term.denom_power,
-                            term.px,
-                            term.py - 1,
-                            static_cast<Real>(term.py * term.pz));
-        }
-        if (term.py > 0 && term.denom_power > 0) {
-            add_z_expansion(hessian_series[1][2],
-                            term.pz,
-                            term.px + term.py - 1 - term.denom_power - 1,
-                            term.px,
-                            term.py - 1,
-                            static_cast<Real>(term.py * term.denom_power));
-        }
-        hessian_series[2][1] = hessian_series[1][2];
-
-        if (term.pz > 1) {
-            add_z_expansion(hessian_series[2][2],
-                            term.pz - 2,
-                            term.px + term.py - term.denom_power,
-                            term.px,
-                            term.py,
-                            static_cast<Real>(term.pz * (term.pz - 1)));
-        }
-        if (term.pz > 0 && term.denom_power > 0) {
-            add_z_expansion(hessian_series[2][2],
-                            term.pz - 1,
-                            term.px + term.py - term.denom_power - 1,
-                            term.px,
-                            term.py,
-                            static_cast<Real>(2 * term.pz * term.denom_power));
-        }
-        if (term.denom_power > 0) {
-            add_z_expansion(hessian_series[2][2],
-                            term.pz,
-                            term.px + term.py - term.denom_power - 2,
-                            term.px,
-                            term.py,
-                            static_cast<Real>(term.denom_power * (term.denom_power + 1)));
-        }
-
-        return hessian_series;
-    }
-
-    static ApexClassification classify_series(const ApexSeries& series) {
-        for (const auto& [beta, poly] : series.by_power) {
-            if (poly.empty(kSeriesTolerance)) {
-                continue;
-            }
-            if (beta < 0) {
-                return {ApexLimitKind::Singular, Real(0), beta};
-            }
-            if (beta > 0) {
-                return {ApexLimitKind::Constant, Real(0), beta};
-            }
-            if (poly.is_constant(kSeriesTolerance)) {
-                return {ApexLimitKind::Constant, poly.constant_value(kSeriesTolerance), beta};
-            }
-            return {ApexLimitKind::DirectionDependent, Real(0), beta};
-        }
-        return {ApexLimitKind::Constant, Real(0), 1};
-    }
-
-    static void accumulate_rank_status(ApexRankStatus& status,
-                                       const ApexClassification& classification) {
-        if (classification.kind == ApexLimitKind::Singular) {
-            status = ApexRankStatus::Singular;
-            return;
-        }
-        if (classification.kind == ApexLimitKind::DirectionDependent &&
-            status != ApexRankStatus::Singular) {
-            status = ApexRankStatus::DirectionDependent;
-        }
-    }
-
-    static std::string apex_status_message(const char* rank,
-                                           ApexRankStatus status) {
-        switch (status) {
-            case ApexRankStatus::DirectionDependent:
-                return std::string("Pyramid rational nodal ") + rank +
-                       " at the exact apex is not uniquely defined under admissible interior approaches";
-            case ApexRankStatus::Singular:
-                return std::string("Pyramid rational nodal ") + rank +
-                       " at the exact apex is singular for this basis family";
-            case ApexRankStatus::Exact:
-                return std::string("Pyramid rational nodal ") + rank +
-                       " apex evaluation unexpectedly reported non-exact status";
-        }
-        return std::string("Pyramid rational nodal ") + rank +
-               " apex evaluation is not available";
-    }
-
-    static ApexData build_apex_data(const OrderData& data) {
-        const std::size_t n = data.modal_terms.size();
-
-        std::vector<ApexSeries> modal_values(n);
-        std::vector<GradientSeries> modal_gradients(n);
-        std::vector<HessianSeries> modal_hessians(n);
-        for (std::size_t m = 0; m < n; ++m) {
-            modal_values[m] = modal_value_asymptotic(data.modal_terms[m]);
-            modal_gradients[m] = modal_gradient_asymptotic(data.modal_terms[m]);
-            modal_hessians[m] = modal_hessian_asymptotic(data.modal_terms[m]);
-        }
-
-        std::vector<ApexSeries> nodal_values(n);
-        std::vector<GradientSeries> nodal_gradients(n);
-        std::vector<HessianSeries> nodal_hessians(n);
-        for (std::size_t i = 0; i < n; ++i) {
-            for (std::size_t m = 0; m < n; ++m) {
-                const Real coeff = data.modal_to_nodal[i * n + m];
-                nodal_values[i].add_scaled(modal_values[m], coeff, kSeriesTolerance);
-                for (int d = 0; d < 3; ++d) {
-                    nodal_gradients[i][static_cast<std::size_t>(d)].add_scaled(
-                        modal_gradients[m][static_cast<std::size_t>(d)], coeff, kSeriesTolerance);
-                }
-                for (int r = 0; r < 3; ++r) {
-                    for (int c = 0; c < 3; ++c) {
-                        nodal_hessians[i][static_cast<std::size_t>(r)][static_cast<std::size_t>(c)]
-                            .add_scaled(
-                                modal_hessians[m][static_cast<std::size_t>(r)][static_cast<std::size_t>(c)],
-                                coeff,
-                                kSeriesTolerance);
-                    }
-                }
-            }
-        }
-
-        ApexData apex;
-        apex.values.assign(n, Real(0));
-        apex.gradients.assign(n, Gradient{});
-        apex.hessians.assign(n, Hessian{});
-
-        for (std::size_t i = 0; i < n; ++i) {
-            const ApexClassification value_class = classify_series(nodal_values[i]);
-            if (value_class.kind != ApexLimitKind::Constant) {
-                throw BasisConstructionException(
-                    "Pyramid nodal value at apex is not uniquely defined for basis index " +
-                    std::to_string(i),
-                    __FILE__, __LINE__, __func__);
-            }
-            apex.values[i] = value_class.constant_value;
-
-            for (int d = 0; d < 3; ++d) {
-                const ApexClassification grad_class = classify_series(
-                    nodal_gradients[i][static_cast<std::size_t>(d)]);
-                accumulate_rank_status(apex.gradient_status, grad_class);
-                if (grad_class.kind == ApexLimitKind::Constant) {
-                    apex.gradients[i][static_cast<std::size_t>(d)] = grad_class.constant_value;
-                }
-            }
-
-            for (int r = 0; r < 3; ++r) {
-                for (int c = 0; c < 3; ++c) {
-                    const ApexClassification hess_class = classify_series(
-                        nodal_hessians[i][static_cast<std::size_t>(r)][static_cast<std::size_t>(c)]);
-                    accumulate_rank_status(apex.hessian_status, hess_class);
-                    if (hess_class.kind == ApexLimitKind::Constant) {
-                        apex.hessians[i](static_cast<std::size_t>(r),
-                                         static_cast<std::size_t>(c)) = hess_class.constant_value;
-                    }
-                }
-            }
-        }
-
-        if (apex.gradient_status != ApexRankStatus::Exact) {
-            apex.gradients.clear();
-        }
-        if (apex.hessian_status != ApexRankStatus::Exact) {
-            apex.hessians.clear();
-        }
-
-        return apex;
-    }
-
-    static std::vector<math::Vector<Real, 3>> build_public_nodes(int order) {
-        if (order == 0) {
-            return {math::Vector<Real, 3>{Real(0), Real(0), Real(0.25)}};
-        }
-
-        std::vector<math::Vector<Real, 3>> nodes;
-        nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 2) * (2 * order + 3) / 6));
-
-        nodes.push_back(math::Vector<Real, 3>{Real(-1), Real(-1), Real(0)});
-        nodes.push_back(math::Vector<Real, 3>{Real(1), Real(-1), Real(0)});
-        nodes.push_back(math::Vector<Real, 3>{Real(1), Real(1), Real(0)});
-        nodes.push_back(math::Vector<Real, 3>{Real(-1), Real(1), Real(0)});
-        nodes.push_back(math::Vector<Real, 3>{Real(0), Real(0), Real(1)});
-
-        for (int m = 1; m < order; ++m) {
-            nodes.push_back(math::Vector<Real, 3>{equispaced_pm_one_coord(m, order), Real(-1), Real(0)});
-        }
-        for (int m = 1; m < order; ++m) {
-            nodes.push_back(math::Vector<Real, 3>{Real(1), equispaced_pm_one_coord(m, order), Real(0)});
-        }
-        for (int m = order - 1; m >= 1; --m) {
-            nodes.push_back(math::Vector<Real, 3>{equispaced_pm_one_coord(m, order), Real(1), Real(0)});
-        }
-        for (int m = order - 1; m >= 1; --m) {
-            nodes.push_back(math::Vector<Real, 3>{Real(-1), equispaced_pm_one_coord(m, order), Real(0)});
-        }
-
-        for (int level = 1; level < order; ++level) {
-            const Real z = static_cast<Real>(level) / static_cast<Real>(order);
-            const Real scale = Real(1) - z;
-            nodes.push_back(math::Vector<Real, 3>{-scale, -scale, z});
-            nodes.push_back(math::Vector<Real, 3>{scale, -scale, z});
-            nodes.push_back(math::Vector<Real, 3>{scale, scale, z});
-            nodes.push_back(math::Vector<Real, 3>{-scale, scale, z});
-        }
-
-        for (int j = 1; j < order; ++j) {
-            for (int i = 1; i < order; ++i) {
-                nodes.push_back(math::Vector<Real, 3>{equispaced_pm_one_coord(i, order),
-                                                      equispaced_pm_one_coord(j, order),
-                                                      Real(0)});
-            }
-        }
-
-        for (int level = 1; level < order - 1; ++level) {
-            const int n = order - level;
-            const Real z = static_cast<Real>(level) / static_cast<Real>(order);
-            const Real scale = Real(1) - z;
-
-            for (int m = 1; m < n; ++m) {
-                const Real s = equispaced_pm_one_coord(m, n) * scale;
-                nodes.push_back(math::Vector<Real, 3>{s, -scale, z});
-            }
-            for (int m = 1; m < n; ++m) {
-                const Real s = equispaced_pm_one_coord(m, n) * scale;
-                nodes.push_back(math::Vector<Real, 3>{scale, s, z});
-            }
-            for (int m = n - 1; m >= 1; --m) {
-                const Real s = equispaced_pm_one_coord(m, n) * scale;
-                nodes.push_back(math::Vector<Real, 3>{s, scale, z});
-            }
-            for (int m = n - 1; m >= 1; --m) {
-                const Real s = equispaced_pm_one_coord(m, n) * scale;
-                nodes.push_back(math::Vector<Real, 3>{-scale, s, z});
-            }
-        }
-
-        for (int level = 1; level < order - 1; ++level) {
-            const int n = order - level;
-            const Real z = static_cast<Real>(level) / static_cast<Real>(order);
-            const Real scale = Real(1) - z;
-            for (int j = 1; j < n; ++j) {
-                for (int i = 1; i < n; ++i) {
-                    nodes.push_back(math::Vector<Real, 3>{equispaced_pm_one_coord(i, n) * scale,
-                                                          equispaced_pm_one_coord(j, n) * scale,
-                                                          z});
-                }
-            }
-        }
-
-        return nodes;
-    }
-
-    struct VectorValueSink {
-        std::vector<Real>& output;
-        void resize(std::size_t n) const { output.resize(n); }
-        void write(std::size_t i, Real value) const { output[i] = value; }
-    };
-
-    struct RawValueSink {
-        Real* output;
-        void resize(std::size_t) const {}
-        void write(std::size_t i, Real value) const { output[i] = value; }
-    };
-
-    struct VectorGradientSink {
-        std::vector<Gradient>& output;
-        void resize(std::size_t n) const { output.resize(n); }
-        void write(std::size_t i, const Gradient& value) const { output[i] = value; }
-    };
-
-    struct RawGradientSink {
-        Real* output;
-        void resize(std::size_t) const {}
-        void write(std::size_t i, const Gradient& value) const {
-            Real* dst = output + i * 3u;
-            dst[0] = value[0];
-            dst[1] = value[1];
-            dst[2] = value[2];
-        }
-    };
-
-    struct VectorHessianSink {
-        std::vector<Hessian>& output;
-        void resize(std::size_t n) const { output.resize(n); }
-        void write(std::size_t i, const Hessian& value) const { output[i] = value; }
-    };
-
-    struct RawHessianSink {
-        Real* output;
-        void resize(std::size_t) const {}
-        void write(std::size_t i, const Hessian& value) const {
-            store_hessian(value, output + i * 9u);
-        }
-    };
-
-    template <typename Get, typename Set>
-    static void apply_order1_combination(std::size_t components,
-                                         const Get& get,
-                                         const Set& set) {
-        for (std::size_t c = 0; c < components; ++c) {
-            const Real m0 = get(0u, c);
-            const Real m1 = get(1u, c);
-            const Real m2 = get(2u, c);
-            const Real m3 = get(3u, c);
-            const Real m4 = get(4u, c);
-            set(0u, c, Real(0.25) * (m0 - m1 - m2 + m3 - m4));
-            set(1u, c, Real(0.25) * (m0 + m1 - m2 - m3 - m4));
-            set(2u, c, Real(0.25) * (m0 + m1 + m2 + m3 - m4));
-            set(3u, c, Real(0.25) * (m0 - m1 + m2 - m3 - m4));
-            set(4u, c, m4);
-        }
-    }
-
-    template <typename Get, typename Set>
-    static void apply_order2_combination(std::size_t components,
-                                         const Get& get,
-                                         const Set& set) {
-        for (std::size_t c = 0; c < components; ++c) {
-            const Real m0 = get(0u, c);
-            const Real m1 = get(1u, c);
-            const Real m2 = get(2u, c);
-            const Real m3 = get(3u, c);
-            const Real m4 = get(4u, c);
-            const Real m5 = get(5u, c);
-            const Real m6 = get(6u, c);
-            const Real m7 = get(7u, c);
-            const Real m8 = get(8u, c);
-            const Real m9 = get(9u, c);
-            const Real m10 = get(10u, c);
-            const Real m11 = get(11u, c);
-            const Real m12 = get(12u, c);
-            const Real m13 = get(13u, c);
-            set(0u, c, Real(0.25) * (m4 - m5 - m7 + m8 - m9 + m10 + m11 - Real(2) * m12 + m13));
-            set(1u, c, Real(0.25) * (-m4 - m5 + m7 + m8 - m9 - m10 + m11 + Real(2) * m12 + m13));
-            set(2u, c, Real(0.25) * (m4 + m5 + m7 + m8 - m9 - m10 - m11 - Real(2) * m12 + m13));
-            set(3u, c, Real(0.25) * (-m4 + m5 - m7 + m8 - m9 + m10 - m11 + Real(2) * m12 + m13));
-            set(4u, c, -m9 + Real(2) * m13);
-            set(5u, c, Real(0.5) * (-m3 + m5 + m6 - m8 + m11));
-            set(6u, c, Real(0.5) * (m1 + m2 - m7 - m8 - m10));
-            set(7u, c, Real(0.5) * (m3 - m5 + m6 - m8 - m11));
-            set(8u, c, Real(0.5) * (-m1 + m2 + m7 - m8 + m10));
-            set(9u, c, m9 - m10 - m11 + m12 - m13);
-            set(10u, c, m9 + m10 - m11 - m12 - m13);
-            set(11u, c, m9 + m10 + m11 + m12 - m13);
-            set(12u, c, m9 - m10 + m11 - m12 - m13);
-            set(13u, c, m0 - m2 - m6 + m8 - Real(2) * m9 + m13);
-        }
-    }
-
-    template <typename Get, typename Set>
-    static void apply_low_order_combination(const OrderData& data,
-                                            std::size_t components,
-                                            const Get& get,
-                                            const Set& set) {
-        if (data.order == 1) {
-            apply_order1_combination(components, get, set);
-            return;
-        }
-        apply_order2_combination(components, get, set);
-    }
-
-    static void apply_sparse_basis_to_nodal(const OrderData& data,
-                                            const std::vector<Real>& modal_values,
-                                            std::vector<Real>& nodal_values) {
-        const std::size_t n = modal_values.size();
-        nodal_values.resize(n);
-        apply_low_order_combination(
-            data,
-            1u,
-            [&](std::size_t modal_i, std::size_t) { return modal_values[modal_i]; },
-            [&](std::size_t basis_i, std::size_t, Real value) { nodal_values[basis_i] = value; });
-    }
-
-    static void apply_sparse_basis_to_nodal_to(const OrderData& data,
-                                               const std::vector<Real>& modal_values,
-                                               Real* SVMP_RESTRICT nodal_values) {
-        apply_low_order_combination(
-            data,
-            1u,
-            [&](std::size_t modal_i, std::size_t) { return modal_values[modal_i]; },
-            [&](std::size_t basis_i, std::size_t, Real value) { nodal_values[basis_i] = value; });
-    }
-
-    static void apply_sparse_basis_to_nodal(const OrderData& data,
-                                            const std::vector<Gradient>& modal_gradients,
-                                            std::vector<Gradient>& nodal_gradients) {
-        const std::size_t n = modal_gradients.size();
-        nodal_gradients.resize(n);
-        apply_low_order_combination(
-            data,
-            3u,
-            [&](std::size_t modal_i, std::size_t component) {
-                return modal_gradients[modal_i][component];
-            },
-            [&](std::size_t basis_i, std::size_t component, Real value) {
-                nodal_gradients[basis_i][component] = value;
-            });
-    }
-
-    static void apply_sparse_basis_to_nodal_to(const OrderData& data,
-                                               const std::vector<Gradient>& modal_gradients,
-                                               Real* SVMP_RESTRICT nodal_gradients) {
-        apply_low_order_combination(
-            data,
-            3u,
-            [&](std::size_t modal_i, std::size_t component) {
-                return modal_gradients[modal_i][component];
-            },
-            [&](std::size_t basis_i, std::size_t component, Real value) {
-                nodal_gradients[basis_i * 3u + component] = value;
-            });
-    }
-
-    static void apply_sparse_basis_to_nodal(const OrderData& data,
-                                            const std::vector<Hessian>& modal_hessians,
-                                            std::vector<Hessian>& nodal_hessians) {
-        const std::size_t n = modal_hessians.size();
-        nodal_hessians.resize(n);
-        apply_low_order_combination(
-            data,
-            9u,
-            [&](std::size_t modal_i, std::size_t component) {
-                return modal_hessians[modal_i].data()[component];
-            },
-            [&](std::size_t basis_i, std::size_t component, Real value) {
-                nodal_hessians[basis_i].data()[component] = value;
-            });
-    }
-
-    static void apply_sparse_basis_to_nodal_to(const OrderData& data,
-                                               const std::vector<Hessian>& modal_hessians,
-                                               Real* SVMP_RESTRICT nodal_hessians) {
-        apply_low_order_combination(
-            data,
-            9u,
-            [&](std::size_t modal_i, std::size_t component) {
-                return modal_hessians[modal_i].data()[component];
-            },
-            [&](std::size_t basis_i, std::size_t component, Real value) {
-                nodal_hessians[basis_i * 9u + component] = value;
-            });
-    }
-
-    static void apply_sparse_basis_to_nodal_all(
-        const OrderData& data,
-        const std::vector<Real>& modal_values,
-        const std::vector<Gradient>& modal_gradients,
-        const std::vector<Hessian>& modal_hessians,
-        std::vector<Real>& nodal_values,
-        std::vector<Gradient>& nodal_gradients,
-        std::vector<Hessian>& nodal_hessians) {
-        const std::size_t n = modal_values.size();
-        nodal_values.resize(n);
-        nodal_gradients.resize(n);
-        nodal_hessians.resize(n);
-        apply_low_order_combination(
-            data,
-            1u,
-            [&](std::size_t modal_i, std::size_t) { return modal_values[modal_i]; },
-            [&](std::size_t basis_i, std::size_t, Real value) { nodal_values[basis_i] = value; });
-        apply_low_order_combination(
-            data,
-            3u,
-            [&](std::size_t modal_i, std::size_t component) {
-                return modal_gradients[modal_i][component];
-            },
-            [&](std::size_t basis_i, std::size_t component, Real value) {
-                nodal_gradients[basis_i][component] = value;
-            });
-        apply_low_order_combination(
-            data,
-            9u,
-            [&](std::size_t modal_i, std::size_t component) {
-                return modal_hessians[modal_i].data()[component];
-            },
-            [&](std::size_t basis_i, std::size_t component, Real value) {
-                nodal_hessians[basis_i].data()[component] = value;
-            });
-    }
-
-    static void apply_sparse_basis_to_nodal_all_to(
-        const OrderData& data,
-        const std::vector<Real>& modal_values,
-        const std::vector<Gradient>& modal_gradients,
-        const std::vector<Hessian>& modal_hessians,
-        Real* SVMP_RESTRICT nodal_values,
-        Real* SVMP_RESTRICT nodal_gradients,
-        Real* SVMP_RESTRICT nodal_hessians) {
-        apply_low_order_combination(
-            data,
-            1u,
-            [&](std::size_t modal_i, std::size_t) { return modal_values[modal_i]; },
-            [&](std::size_t basis_i, std::size_t, Real value) { nodal_values[basis_i] = value; });
-        apply_low_order_combination(
-            data,
-            3u,
-            [&](std::size_t modal_i, std::size_t component) {
-                return modal_gradients[modal_i][component];
-            },
-            [&](std::size_t basis_i, std::size_t component, Real value) {
-                nodal_gradients[basis_i * 3u + component] = value;
-            });
-        apply_low_order_combination(
-            data,
-            9u,
-            [&](std::size_t modal_i, std::size_t component) {
-                return modal_hessians[modal_i].data()[component];
-            },
-            [&](std::size_t basis_i, std::size_t component, Real value) {
-                nodal_hessians[basis_i * 9u + component] = value;
-            });
-    }
-
-    template <typename Sink>
-    // Keep modal transform helpers free of forced-inline attributes unless
-    // compiler-versioned benchmarks and LLVM IR checks show a stable benefit.
-    static void apply_modal_values_to_nodal(const OrderData& data,
-                                            const std::vector<Real>& modal_values,
-                                            const Sink& sink) {
-        const std::size_t n = modal_values.size();
-        sink.resize(n);
-        for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
-            const Real* row = data.modal_to_nodal.data() + basis_i * n;
-            Real value = Real(0);
-            for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
-                value += row[modal_j] * modal_values[modal_j];
-            }
-            sink.write(basis_i, value);
-        }
-    }
-
-    template <typename Sink>
-    static void apply_modal_gradients_to_nodal(const OrderData& data,
-                                               const std::vector<Gradient>& modal_gradients,
-                                               const Sink& sink) {
-        const std::size_t n = modal_gradients.size();
-        sink.resize(n);
-        for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
-            const Real* row = data.modal_to_nodal.data() + basis_i * n;
-            Gradient gradient{};
-            for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
-                const Real coeff = row[modal_j];
-                for (std::size_t component = 0; component < 3u; ++component) {
-                    gradient[component] += coeff * modal_gradients[modal_j][component];
-                }
-            }
-            sink.write(basis_i, gradient);
-        }
-    }
-
-    template <typename Sink>
-    static void apply_modal_hessians_to_nodal(const OrderData& data,
-                                              const std::vector<Hessian>& modal_hessians,
-                                              const Sink& sink) {
-        const std::size_t n = modal_hessians.size();
-        sink.resize(n);
-        for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
-            const Real* matrix_row = data.modal_to_nodal.data() + basis_i * n;
-            Hessian hessian{};
-            for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
-                const Real coeff = matrix_row[modal_j];
-                for (std::size_t row = 0; row < 3u; ++row) {
-                    for (std::size_t col = 0; col < 3u; ++col) {
-                        hessian(row, col) += coeff * modal_hessians[modal_j](row, col);
-                    }
-                }
-            }
-            sink.write(basis_i, hessian);
-        }
-    }
-
-    static void apply_modal_to_nodal(const OrderData& data,
-                                     const std::vector<Real>& modal_values,
-                                     std::vector<Real>& nodal_values) {
-        apply_modal_values_to_nodal(data, modal_values, VectorValueSink{nodal_values});
-    }
-
-    static void apply_modal_to_nodal(const OrderData& data,
-                                     const std::vector<Gradient>& modal_gradients,
-                                     std::vector<Gradient>& nodal_gradients) {
-        apply_modal_gradients_to_nodal(data, modal_gradients, VectorGradientSink{nodal_gradients});
-    }
-
-    static void apply_modal_to_nodal(const OrderData& data,
-                                     const std::vector<Hessian>& modal_hessians,
-                                     std::vector<Hessian>& nodal_hessians) {
-        apply_modal_hessians_to_nodal(data, modal_hessians, VectorHessianSink{nodal_hessians});
-    }
-
-    static void apply_modal_to_nodal_to(const OrderData& data,
-                                        const std::vector<Real>& modal_values,
-                                        Real* nodal_values) {
-        apply_modal_values_to_nodal(data, modal_values, RawValueSink{nodal_values});
-    }
-
-    static void apply_modal_to_nodal_to(const OrderData& data,
-                                        const std::vector<Gradient>& modal_gradients,
-                                        Real* nodal_gradients) {
-        apply_modal_gradients_to_nodal(data, modal_gradients, RawGradientSink{nodal_gradients});
-    }
-
-    static void apply_modal_to_nodal_to(const OrderData& data,
-                                        const std::vector<Hessian>& modal_hessians,
-                                        Real* nodal_hessians) {
-        apply_modal_hessians_to_nodal(data, modal_hessians, RawHessianSink{nodal_hessians});
-    }
-};
-
-namespace lagrange_pyramid {
-
-const std::vector<math::Vector<Real, 3>>& nodes(int order) {
-    return PyramidLagrangeCache::get(order).nodes;
-}
-
-void prewarm_scratch(int order, std::size_t max_qpts) {
-    const auto& data = PyramidLagrangeCache::get(order);
-    PyramidLagrangeCache::prewarm_scratch(data.modal_terms.size(), max_qpts);
-}
-
-void evaluate_values(int order,
-                     const math::Vector<Real, 3>& xi,
-                     std::vector<Real>& values) {
-    const auto& data = PyramidLagrangeCache::get(order);
-    PyramidLagrangeCache::evaluate_values(data, xi, values);
-}
-
-void evaluate_gradients(int order,
-                        const math::Vector<Real, 3>& xi,
-                        std::vector<Gradient>& gradients) {
-    const auto& data = PyramidLagrangeCache::get(order);
-    PyramidLagrangeCache::evaluate_gradients(data, xi, gradients);
-}
-
-void evaluate_hessians(int order,
-                       const math::Vector<Real, 3>& xi,
-                       std::vector<Hessian>& hessians) {
-    const auto& data = PyramidLagrangeCache::get(order);
-    PyramidLagrangeCache::evaluate_hessians(data, xi, hessians);
-}
-
-void evaluate_all(int order,
-                  const math::Vector<Real, 3>& xi,
-                  std::vector<Real>& values,
-                  std::vector<Gradient>& gradients,
-                  std::vector<Hessian>& hessians) {
-    const auto& data = PyramidLagrangeCache::get(order);
-    PyramidLagrangeCache::evaluate_all(data, xi, values, gradients, hessians);
-}
-
-void evaluate_values_to(int order,
-                        const math::Vector<Real, 3>& xi,
-                        Real* SVMP_RESTRICT values_out) {
-    const auto& data = PyramidLagrangeCache::get(order);
-    PyramidLagrangeCache::evaluate_values_to(data, xi, values_out);
-}
-
-void evaluate_gradients_to(int order,
-                           const math::Vector<Real, 3>& xi,
-                           Real* SVMP_RESTRICT gradients_out) {
-    const auto& data = PyramidLagrangeCache::get(order);
-    PyramidLagrangeCache::evaluate_gradients_to(data, xi, gradients_out);
-}
-
-void evaluate_hessians_to(int order,
-                          const math::Vector<Real, 3>& xi,
-                          Real* SVMP_RESTRICT hessians_out) {
-    const auto& data = PyramidLagrangeCache::get(order);
-    PyramidLagrangeCache::evaluate_hessians_to(data, xi, hessians_out);
-}
-
-void evaluate_all_to(int order,
-                     const math::Vector<Real, 3>& xi,
-                     Real* SVMP_RESTRICT values_out,
-                     Real* SVMP_RESTRICT gradients_out,
-                     Real* SVMP_RESTRICT hessians_out) {
-    const auto& data = PyramidLagrangeCache::get(order);
-    PyramidLagrangeCache::evaluate_all_to(data, xi, values_out, gradients_out, hessians_out);
-}
-
-void evaluate_at_quadrature_points_strided(
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    const auto& data = PyramidLagrangeCache::get(order);
-    PyramidLagrangeCache::evaluate_at_quadrature_points_strided(
-        data, points, output_stride, values_out, gradients_out, hessians_out);
-}
-
-} // namespace lagrange_pyramid
-
-} // namespace detail
-} // namespace basis
-} // namespace FE
-} // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasisPyramid.h b/Code/Source/solver/FE/Basis/LagrangeBasisPyramid.h
deleted file mode 100644
index 76859501c..000000000
--- a/Code/Source/solver/FE/Basis/LagrangeBasisPyramid.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef SVMP_FE_BASIS_LAGRANGEBASISPYRAMID_H
-#define SVMP_FE_BASIS_LAGRANGEBASISPYRAMID_H
-
-// Private declarations for the rational pyramid Lagrange helper implemented in
-// LagrangeBasisPyramid.cpp. This header is intentionally small so the large
-// construction and apex-classification code stays out of LagrangeBasis.cpp.
-
-#include "BasisFunction.h"
-
-#include <cstddef>
-#include <vector>
-
-namespace svmp {
-namespace FE {
-namespace basis {
-namespace detail {
-namespace lagrange_pyramid {
-
-const std::vector<math::Vector<Real, 3>>& nodes(int order);
-
-void prewarm_scratch(int order, std::size_t max_qpts = 0);
-
-void evaluate_values(int order,
-                     const math::Vector<Real, 3>& xi,
-                     std::vector<Real>& values);
-void evaluate_gradients(int order,
-                        const math::Vector<Real, 3>& xi,
-                        std::vector<Gradient>& gradients);
-void evaluate_hessians(int order,
-                       const math::Vector<Real, 3>& xi,
-                       std::vector<Hessian>& hessians);
-void evaluate_all(int order,
-                  const math::Vector<Real, 3>& xi,
-                  std::vector<Real>& values,
-                  std::vector<Gradient>& gradients,
-                  std::vector<Hessian>& hessians);
-
-void evaluate_values_to(int order,
-                        const math::Vector<Real, 3>& xi,
-                        Real* SVMP_RESTRICT values_out);
-void evaluate_gradients_to(int order,
-                           const math::Vector<Real, 3>& xi,
-                           Real* SVMP_RESTRICT gradients_out);
-void evaluate_hessians_to(int order,
-                          const math::Vector<Real, 3>& xi,
-                          Real* SVMP_RESTRICT hessians_out);
-void evaluate_all_to(int order,
-                     const math::Vector<Real, 3>& xi,
-                     Real* SVMP_RESTRICT values_out,
-                     Real* SVMP_RESTRICT gradients_out,
-                     Real* SVMP_RESTRICT hessians_out);
-
-void evaluate_at_quadrature_points_strided(
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out);
-
-} // namespace lagrange_pyramid
-} // namespace detail
-} // namespace basis
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_BASIS_LAGRANGEBASISPYRAMID_H
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasisSimplex.cpp b/Code/Source/solver/FE/Basis/LagrangeBasisSimplex.cpp
deleted file mode 100644
index 36325576a..000000000
--- a/Code/Source/solver/FE/Basis/LagrangeBasisSimplex.cpp
+++ /dev/null
@@ -1,2457 +0,0 @@
-#include "LagrangeBasisSimplex.h"
-
-#include <array>
-#include <vector>
-
-namespace svmp {
-namespace FE {
-namespace basis {
-namespace detail {
-
-// Falling-factorial (equispaced barycentric) Lagrange factors for simplex nodes.
-//
-// For a fixed polynomial order p and barycentric coordinate lambda in [0, 1],
-// define
-//   phi_a(lambda) = product_{m=0}^{a-1} (p * lambda - m) / (a - m), a = 0..p
-// Then for a multi-index (i0, i1, ..., id) with sum i_k = p, the simplex
-// Lagrange basis function is product_k phi_{i_k}(lambda_k), nodal on the
-// barycentric lattice.
-//
-// Output buffers must each be sized to at least p+1 entries; the function
-// writes every output slot (no pre-zero required by the caller).
-template <bool NeedFirst, bool NeedSecond>
-void simplex_lagrange_factor_sequence_impl(int p,
-                                           Real lambda,
-                                           Real* phi,
-                                           Real* dphi,
-                                           Real* d2phi) {
-    static_assert(!NeedSecond || NeedFirst,
-                  "second derivative factors require first-derivative recurrence state");
-
-    phi[0] = Real(1);
-    if constexpr (NeedFirst) {
-        dphi[0] = Real(0);
-    }
-    if constexpr (NeedSecond) {
-        d2phi[0] = Real(0);
-    }
-    if (p == 0) {
-        return;
-    }
-
-    const Real t = static_cast<Real>(p) * lambda;
-    const Real dt_dlambda = static_cast<Real>(p);
-
-    Real dphi_dt_prev = Real(0);
-    Real d2phi_dt2_prev = Real(0);
-
-    for (int a = 1; a <= p; ++a) {
-        const std::size_t au = static_cast<std::size_t>(a);
-        const Real inv_a = Real(1) / static_cast<Real>(a);
-        const Real s = (t - static_cast<Real>(a - 1)) * inv_a;
-
-        phi[au] = s * phi[au - 1];
-
-        if constexpr (NeedFirst) {
-            const Real dphi_dt_old = dphi_dt_prev;
-            const Real dphi_dt = inv_a * phi[au - 1] + s * dphi_dt_old;
-            dphi[au] = dt_dlambda * dphi_dt;
-
-            if constexpr (NeedSecond) {
-                const Real d2phi_dt2 = Real(2) * inv_a * dphi_dt_old + s * d2phi_dt2_prev;
-                d2phi[au] = dt_dlambda * dt_dlambda * d2phi_dt2;
-                d2phi_dt2_prev = d2phi_dt2;
-            }
-
-            dphi_dt_prev = dphi_dt;
-        }
-    }
-}
-
-void simplex_lagrange_factor_sequence(int p,
-                                      Real lambda,
-                                      Real* phi,
-                                      Real* dphi,
-                                      Real* d2phi) {
-    if (d2phi != nullptr) {
-        simplex_lagrange_factor_sequence_impl<true, true>(p, lambda, phi, dphi, d2phi);
-    } else if (dphi != nullptr) {
-        simplex_lagrange_factor_sequence_impl<true, false>(p, lambda, phi, dphi, nullptr);
-    } else {
-        simplex_lagrange_factor_sequence_impl<false, false>(p, lambda, phi, nullptr, nullptr);
-    }
-}
-
-constexpr int kFixedSimplexAxisOrder = 12;
-constexpr std::size_t kFixedSimplexAxisSize =
-    static_cast<std::size_t>(kFixedSimplexAxisOrder + 1);
-constexpr std::size_t kFixedSimplexBatchEntries = 512;
-
-template <int Order>
-inline void simplex_lagrange_factor_values_product(Real lambda,
-                                                   Real* SVMP_RESTRICT values) {
-    static_assert(Order >= 0, "simplex order must be non-negative");
-    values[0] = Real(1);
-    const Real t = static_cast<Real>(Order) * lambda;
-    for (int a = 1; a <= Order; ++a) {
-        const Real inv_a = Real(1) / static_cast<Real>(a);
-        values[a] = values[a - 1] * (t - static_cast<Real>(a - 1)) * inv_a;
-    }
-}
-
-template <int Order>
-void evaluate_triangle_simplex_values_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    static_assert(Order >= 4 && Order <= 8, "specialized simplex path covers orders 4..8");
-
-    Real phi0[4][Order + 1];
-    Real phi1[4][Order + 1];
-    Real phi2[4][Order + 1];
-
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-        simplex_lagrange_factor_values_product<Order>(l0, phi0[q]);
-        simplex_lagrange_factor_values_product<Order>(l1, phi1[q]);
-        simplex_lagrange_factor_values_product<Order>(l2, phi2[q]);
-    }
-
-    const std::size_t num_nodes = simplex_exponents.size();
-    for (std::size_t node = 0; node < num_nodes; ++node) {
-        const auto& e = simplex_exponents[node];
-        const std::size_t i0 = static_cast<std::size_t>(e[0]);
-        const std::size_t i1 = static_cast<std::size_t>(e[1]);
-        const std::size_t i2 = static_cast<std::size_t>(e[2]);
-        Real* SVMP_RESTRICT row = values_out + node * output_stride;
-        row[0] = phi0[0][i0] * phi1[0][i1] * phi2[0][i2];
-        row[1] = phi0[1][i0] * phi1[1][i1] * phi2[1][i2];
-        row[2] = phi0[2][i0] * phi1[2][i1] * phi2[2][i2];
-        row[3] = phi0[3][i0] * phi1[3][i1] * phi2[3][i2];
-    }
-}
-
-bool try_evaluate_triangle_simplex_values_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    switch (order) {
-    case 4:
-        evaluate_triangle_simplex_values_q4<4>(
-            simplex_exponents, points, output_stride, values_out);
-        return true;
-    case 5:
-        evaluate_triangle_simplex_values_q4<5>(
-            simplex_exponents, points, output_stride, values_out);
-        return true;
-    case 6:
-        evaluate_triangle_simplex_values_q4<6>(
-            simplex_exponents, points, output_stride, values_out);
-        return true;
-    case 7:
-        evaluate_triangle_simplex_values_q4<7>(
-            simplex_exponents, points, output_stride, values_out);
-        return true;
-    case 8:
-        evaluate_triangle_simplex_values_q4<8>(
-            simplex_exponents, points, output_stride, values_out);
-        return true;
-    default:
-        return false;
-    }
-}
-
-template <int Order>
-void evaluate_tetrahedron_simplex_values_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    static_assert(Order >= 4 && Order <= 8, "specialized simplex path covers orders 4..8");
-
-    Real phi0[4][Order + 1];
-    Real phi1[4][Order + 1];
-    Real phi2[4][Order + 1];
-    Real phi3[4][Order + 1];
-
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l3 = xi[2];
-        const Real l0 = Real(1) - l1 - l2 - l3;
-        simplex_lagrange_factor_values_product<Order>(l0, phi0[q]);
-        simplex_lagrange_factor_values_product<Order>(l1, phi1[q]);
-        simplex_lagrange_factor_values_product<Order>(l2, phi2[q]);
-        simplex_lagrange_factor_values_product<Order>(l3, phi3[q]);
-    }
-
-    const std::size_t num_nodes = simplex_exponents.size();
-    for (std::size_t node = 0; node < num_nodes; ++node) {
-        const auto& e = simplex_exponents[node];
-        const std::size_t i0 = static_cast<std::size_t>(e[0]);
-        const std::size_t i1 = static_cast<std::size_t>(e[1]);
-        const std::size_t i2 = static_cast<std::size_t>(e[2]);
-        const std::size_t i3 = static_cast<std::size_t>(e[3]);
-        Real* SVMP_RESTRICT row = values_out + node * output_stride;
-        row[0] = phi0[0][i0] * phi1[0][i1] * phi2[0][i2] * phi3[0][i3];
-        row[1] = phi0[1][i0] * phi1[1][i1] * phi2[1][i2] * phi3[1][i3];
-        row[2] = phi0[2][i0] * phi1[2][i1] * phi2[2][i2] * phi3[2][i3];
-        row[3] = phi0[3][i0] * phi1[3][i1] * phi2[3][i2] * phi3[3][i3];
-    }
-}
-
-bool try_evaluate_tetrahedron_simplex_values_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    switch (order) {
-    case 4:
-        evaluate_tetrahedron_simplex_values_q4<4>(
-            simplex_exponents, points, output_stride, values_out);
-        return true;
-    case 5:
-        evaluate_tetrahedron_simplex_values_q4<5>(
-            simplex_exponents, points, output_stride, values_out);
-        return true;
-    case 6:
-        evaluate_tetrahedron_simplex_values_q4<6>(
-            simplex_exponents, points, output_stride, values_out);
-        return true;
-    case 7:
-        evaluate_tetrahedron_simplex_values_q4<7>(
-            simplex_exponents, points, output_stride, values_out);
-        return true;
-    case 8:
-        evaluate_tetrahedron_simplex_values_q4<8>(
-            simplex_exponents, points, output_stride, values_out);
-        return true;
-    default:
-        return false;
-    }
-}
-
-template <int Order>
-void evaluate_tetrahedron_simplex_gradients_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    static_assert(Order >= 3 && Order <= 8,
-                  "specialized tetrahedron gradient path covers orders 3..8");
-
-    Real phi0[4][Order + 1];
-    Real phi1[4][Order + 1];
-    Real phi2[4][Order + 1];
-    Real phi3[4][Order + 1];
-    Real dphi0[4][Order + 1];
-    Real dphi1[4][Order + 1];
-    Real dphi2[4][Order + 1];
-    Real dphi3[4][Order + 1];
-
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l3 = xi[2];
-        const Real l0 = Real(1) - l1 - l2 - l3;
-        simplex_lagrange_factor_sequence_impl<true, false>(
-            Order, l0, phi0[q], dphi0[q], nullptr);
-        simplex_lagrange_factor_sequence_impl<true, false>(
-            Order, l1, phi1[q], dphi1[q], nullptr);
-        simplex_lagrange_factor_sequence_impl<true, false>(
-            Order, l2, phi2[q], dphi2[q], nullptr);
-        simplex_lagrange_factor_sequence_impl<true, false>(
-            Order, l3, phi3[q], dphi3[q], nullptr);
-    }
-
-    const std::size_t num_nodes = simplex_exponents.size();
-    for (std::size_t node = 0; node < num_nodes; ++node) {
-        const auto& e = simplex_exponents[node];
-        const std::size_t i0 = static_cast<std::size_t>(e[0]);
-        const std::size_t i1 = static_cast<std::size_t>(e[1]);
-        const std::size_t i2 = static_cast<std::size_t>(e[2]);
-        const std::size_t i3 = static_cast<std::size_t>(e[3]);
-        Real gx[4];
-        Real gy[4];
-        Real gz[4];
-
-        for (std::size_t q = 0; q < 4u; ++q) {
-            const Real v0 = phi0[q][i0];
-            const Real v1 = phi1[q][i1];
-            const Real v2 = phi2[q][i2];
-            const Real v3 = phi3[q][i3];
-            const Real D0 = dphi0[q][i0];
-            const Real D1 = dphi1[q][i1];
-            const Real D2 = dphi2[q][i2];
-            const Real D3 = dphi3[q][i3];
-            const Real v23 = v2 * v3;
-            const Real v01 = v0 * v1;
-            const Real dl0 = D0 * v1 * v23;
-            gx[q] = v0 * D1 * v23 - dl0;
-            gy[q] = v01 * D2 * v3 - dl0;
-            gz[q] = v01 * v2 * D3 - dl0;
-        }
-
-        Real* SVMP_RESTRICT g = gradients_out + node * 3u * output_stride;
-        g[0u] = gx[0];
-        g[1u] = gx[1];
-        g[2u] = gx[2];
-        g[3u] = gx[3];
-        g[output_stride + 0u] = gy[0];
-        g[output_stride + 1u] = gy[1];
-        g[output_stride + 2u] = gy[2];
-        g[output_stride + 3u] = gy[3];
-        g[2u * output_stride + 0u] = gz[0];
-        g[2u * output_stride + 1u] = gz[1];
-        g[2u * output_stride + 2u] = gz[2];
-        g[2u * output_stride + 3u] = gz[3];
-    }
-}
-
-template <int Order>
-void evaluate_triangle_simplex_gradients_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    static_assert((Order == 2) || (Order >= 4 && Order <= 8),
-                  "specialized simplex path covers order 2 and orders 4..8");
-
-    Real phi0[4][Order + 1];
-    Real phi1[4][Order + 1];
-    Real phi2[4][Order + 1];
-    Real dphi0[4][Order + 1];
-    Real dphi1[4][Order + 1];
-    Real dphi2[4][Order + 1];
-
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-        simplex_lagrange_factor_sequence_impl<true, false>(
-            Order, l0, phi0[q], dphi0[q], nullptr);
-        simplex_lagrange_factor_sequence_impl<true, false>(
-            Order, l1, phi1[q], dphi1[q], nullptr);
-        simplex_lagrange_factor_sequence_impl<true, false>(
-            Order, l2, phi2[q], dphi2[q], nullptr);
-    }
-
-    const std::size_t num_nodes = simplex_exponents.size();
-    for (std::size_t node = 0; node < num_nodes; ++node) {
-        const auto& e = simplex_exponents[node];
-        const std::size_t i0 = static_cast<std::size_t>(e[0]);
-        const std::size_t i1 = static_cast<std::size_t>(e[1]);
-        const std::size_t i2 = static_cast<std::size_t>(e[2]);
-        Real* SVMP_RESTRICT g = gradients_out + node * 3u * output_stride;
-
-        for (std::size_t q = 0; q < 4u; ++q) {
-            const Real v0 = phi0[q][i0];
-            const Real v1 = phi1[q][i1];
-            const Real v2 = phi2[q][i2];
-            const Real D0 = dphi0[q][i0];
-            const Real D1 = dphi1[q][i1];
-            const Real D2 = dphi2[q][i2];
-            const Real dl0 = D0 * v1 * v2;
-            g[0u * output_stride + q] = v0 * D1 * v2 - dl0;
-            g[1u * output_stride + q] = v0 * v1 * D2 - dl0;
-            g[2u * output_stride + q] = Real(0);
-        }
-    }
-}
-
-bool try_evaluate_triangle_simplex_gradients_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    switch (order) {
-    case 2:
-        evaluate_triangle_simplex_gradients_q4<2>(
-            simplex_exponents, points, output_stride, gradients_out);
-        return true;
-    case 4:
-        evaluate_triangle_simplex_gradients_q4<4>(
-            simplex_exponents, points, output_stride, gradients_out);
-        return true;
-    case 5:
-        evaluate_triangle_simplex_gradients_q4<5>(
-            simplex_exponents, points, output_stride, gradients_out);
-        return true;
-    case 6:
-        evaluate_triangle_simplex_gradients_q4<6>(
-            simplex_exponents, points, output_stride, gradients_out);
-        return true;
-    case 7:
-        evaluate_triangle_simplex_gradients_q4<7>(
-            simplex_exponents, points, output_stride, gradients_out);
-        return true;
-    case 8:
-        evaluate_triangle_simplex_gradients_q4<8>(
-            simplex_exponents, points, output_stride, gradients_out);
-        return true;
-    default:
-        return false;
-    }
-}
-
-template <int Order>
-void evaluate_triangle_simplex_hessian_outputs_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    static_assert(Order >= 2 && Order <= 8, "specialized simplex path covers orders 2..8");
-
-    Real phi0[4][Order + 1];
-    Real phi1[4][Order + 1];
-    Real phi2[4][Order + 1];
-    Real dphi0[4][Order + 1];
-    Real dphi1[4][Order + 1];
-    Real dphi2[4][Order + 1];
-    Real d2phi0[4][Order + 1];
-    Real d2phi1[4][Order + 1];
-    Real d2phi2[4][Order + 1];
-
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-        simplex_lagrange_factor_sequence_impl<true, true>(
-            Order, l0, phi0[q], dphi0[q], d2phi0[q]);
-        simplex_lagrange_factor_sequence_impl<true, true>(
-            Order, l1, phi1[q], dphi1[q], d2phi1[q]);
-        simplex_lagrange_factor_sequence_impl<true, true>(
-            Order, l2, phi2[q], dphi2[q], d2phi2[q]);
-    }
-
-    const std::size_t num_nodes = simplex_exponents.size();
-    for (std::size_t node = 0; node < num_nodes; ++node) {
-        const auto& e = simplex_exponents[node];
-        const std::size_t i0 = static_cast<std::size_t>(e[0]);
-        const std::size_t i1 = static_cast<std::size_t>(e[1]);
-        const std::size_t i2 = static_cast<std::size_t>(e[2]);
-        Real* SVMP_RESTRICT value_row = values_out ? values_out + node * output_stride : nullptr;
-        Real* SVMP_RESTRICT g = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
-        Real* SVMP_RESTRICT H = hessians_out + node * 9u * output_stride;
-        H[2u * output_stride + 0u] = Real(0);
-        H[2u * output_stride + 1u] = Real(0);
-        H[2u * output_stride + 2u] = Real(0);
-        H[2u * output_stride + 3u] = Real(0);
-        H[5u * output_stride + 0u] = Real(0);
-        H[5u * output_stride + 1u] = Real(0);
-        H[5u * output_stride + 2u] = Real(0);
-        H[5u * output_stride + 3u] = Real(0);
-        H[6u * output_stride + 0u] = Real(0);
-        H[6u * output_stride + 1u] = Real(0);
-        H[6u * output_stride + 2u] = Real(0);
-        H[6u * output_stride + 3u] = Real(0);
-        H[7u * output_stride + 0u] = Real(0);
-        H[7u * output_stride + 1u] = Real(0);
-        H[7u * output_stride + 2u] = Real(0);
-        H[7u * output_stride + 3u] = Real(0);
-        H[8u * output_stride + 0u] = Real(0);
-        H[8u * output_stride + 1u] = Real(0);
-        H[8u * output_stride + 2u] = Real(0);
-        H[8u * output_stride + 3u] = Real(0);
-
-        for (std::size_t q = 0; q < 4u; ++q) {
-            const Real v0 = phi0[q][i0];
-            const Real v1 = phi1[q][i1];
-            const Real v2 = phi2[q][i2];
-            if (value_row != nullptr) {
-                value_row[q] = v0 * v1 * v2;
-            }
-
-            const Real D0 = dphi0[q][i0];
-            const Real D1 = dphi1[q][i1];
-            const Real D2 = dphi2[q][i2];
-            if (g != nullptr) {
-                const Real dl0 = D0 * v1 * v2;
-                g[0u * output_stride + q] = v0 * D1 * v2 - dl0;
-                g[1u * output_stride + q] = v0 * v1 * D2 - dl0;
-                g[2u * output_stride + q] = Real(0);
-            }
-
-            const Real DD0 = d2phi0[q][i0];
-            const Real DD1 = d2phi1[q][i1];
-            const Real DD2 = d2phi2[q][i2];
-            const Real H00 = DD0 * v1 * v2;
-            const Real H11 = v0 * DD1 * v2;
-            const Real H22 = v0 * v1 * DD2;
-            const Real H01 = D0 * D1 * v2;
-            const Real H02 = D0 * v1 * D2;
-            const Real H12 = v0 * D1 * D2;
-            const Real h01 = H00 - H01 - H02 + H12;
-            H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
-            H[1u * output_stride + q] = h01;
-            H[3u * output_stride + q] = h01;
-            H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
-        }
-    }
-}
-
-bool try_evaluate_triangle_simplex_hessian_outputs_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    switch (order) {
-    case 2:
-        evaluate_triangle_simplex_hessian_outputs_q4<2>(
-            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    case 3:
-        evaluate_triangle_simplex_hessian_outputs_q4<3>(
-            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    case 4:
-        evaluate_triangle_simplex_hessian_outputs_q4<4>(
-            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    case 5:
-        evaluate_triangle_simplex_hessian_outputs_q4<5>(
-            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    case 6:
-        evaluate_triangle_simplex_hessian_outputs_q4<6>(
-            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    case 7:
-        evaluate_triangle_simplex_hessian_outputs_q4<7>(
-            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    case 8:
-        evaluate_triangle_simplex_hessian_outputs_q4<8>(
-            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    default:
-        return false;
-    }
-}
-
-template <int Order, std::size_t Q>
-inline void write_tetrahedron_simplex_hessian_q4(
-    const Real (&phi0)[4][Order + 1],
-    const Real (&phi1)[4][Order + 1],
-    const Real (&phi2)[4][Order + 1],
-    const Real (&phi3)[4][Order + 1],
-    const Real (&dphi0)[4][Order + 1],
-    const Real (&dphi1)[4][Order + 1],
-    const Real (&dphi2)[4][Order + 1],
-    const Real (&dphi3)[4][Order + 1],
-    const Real (&d2phi0)[4][Order + 1],
-    const Real (&d2phi1)[4][Order + 1],
-    const Real (&d2phi2)[4][Order + 1],
-    const Real (&d2phi3)[4][Order + 1],
-    std::size_t i0,
-    std::size_t i1,
-    std::size_t i2,
-    std::size_t i3,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT H) {
-    const Real v0 = phi0[Q][i0];
-    const Real v1 = phi1[Q][i1];
-    const Real v2 = phi2[Q][i2];
-    const Real v3 = phi3[Q][i3];
-    const Real D0 = dphi0[Q][i0];
-    const Real D1 = dphi1[Q][i1];
-    const Real D2 = dphi2[Q][i2];
-    const Real D3 = dphi3[Q][i3];
-    const Real DD0 = d2phi0[Q][i0];
-    const Real DD1 = d2phi1[Q][i1];
-    const Real DD2 = d2phi2[Q][i2];
-    const Real DD3 = d2phi3[Q][i3];
-    const Real H00 = DD0 * v1 * v2 * v3;
-    const Real H11 = v0 * DD1 * v2 * v3;
-    const Real H22 = v0 * v1 * DD2 * v3;
-    const Real H33 = v0 * v1 * v2 * DD3;
-    const Real H01 = D0 * D1 * v2 * v3;
-    const Real H02 = D0 * v1 * D2 * v3;
-    const Real H03 = D0 * v1 * v2 * D3;
-    const Real H12 = v0 * D1 * D2 * v3;
-    const Real H13 = v0 * D1 * v2 * D3;
-    const Real H23 = v0 * v1 * D2 * D3;
-    const Real h01 = H00 - H01 - H02 + H12;
-    const Real h02 = H00 - H01 - H03 + H13;
-    const Real h12 = H00 - H02 - H03 + H23;
-    H[0u * output_stride + Q] = H00 - Real(2) * H01 + H11;
-    H[1u * output_stride + Q] = h01;
-    H[2u * output_stride + Q] = h02;
-    H[3u * output_stride + Q] = h01;
-    H[4u * output_stride + Q] = H00 - Real(2) * H02 + H22;
-    H[5u * output_stride + Q] = h12;
-    H[6u * output_stride + Q] = h02;
-    H[7u * output_stride + Q] = h12;
-    H[8u * output_stride + Q] = H00 - Real(2) * H03 + H33;
-}
-
-template <int Order, std::size_t Q>
-inline void write_tetrahedron_simplex_hessian_stride4_q(
-    const Real (&phi0)[4][Order + 1],
-    const Real (&phi1)[4][Order + 1],
-    const Real (&phi2)[4][Order + 1],
-    const Real (&phi3)[4][Order + 1],
-    const Real (&dphi0)[4][Order + 1],
-    const Real (&dphi1)[4][Order + 1],
-    const Real (&dphi2)[4][Order + 1],
-    const Real (&dphi3)[4][Order + 1],
-    const Real (&d2phi0)[4][Order + 1],
-    const Real (&d2phi1)[4][Order + 1],
-    const Real (&d2phi2)[4][Order + 1],
-    const Real (&d2phi3)[4][Order + 1],
-    std::size_t i0,
-    std::size_t i1,
-    std::size_t i2,
-    std::size_t i3,
-    Real* SVMP_RESTRICT H) {
-    const Real v0 = phi0[Q][i0];
-    const Real v1 = phi1[Q][i1];
-    const Real v2 = phi2[Q][i2];
-    const Real v3 = phi3[Q][i3];
-    const Real D0 = dphi0[Q][i0];
-    const Real D1 = dphi1[Q][i1];
-    const Real D2 = dphi2[Q][i2];
-    const Real D3 = dphi3[Q][i3];
-    const Real DD0 = d2phi0[Q][i0];
-    const Real DD1 = d2phi1[Q][i1];
-    const Real DD2 = d2phi2[Q][i2];
-    const Real DD3 = d2phi3[Q][i3];
-    const Real v12 = v1 * v2;
-    const Real v13 = v1 * v3;
-    const Real v23 = v2 * v3;
-    const Real v123 = v1 * v23;
-    const Real v023 = v0 * v23;
-    const Real v013 = v0 * v13;
-    const Real v012 = v0 * v12;
-    const Real H00 = DD0 * v123;
-    const Real H11 = DD1 * v023;
-    const Real H22 = DD2 * v013;
-    const Real H33 = DD3 * v012;
-    const Real H01 = D0 * D1 * v23;
-    const Real H02 = D0 * D2 * v13;
-    const Real H03 = D0 * D3 * v12;
-    const Real H12 = D1 * D2 * v0 * v3;
-    const Real H13 = D1 * D3 * v0 * v2;
-    const Real H23 = D2 * D3 * v0 * v1;
-    const Real h01 = H00 - H01 - H02 + H12;
-    const Real h02 = H00 - H01 - H03 + H13;
-    const Real h12 = H00 - H02 - H03 + H23;
-    H[Q] = H00 - Real(2) * H01 + H11;
-    H[4u + Q] = h01;
-    H[8u + Q] = h02;
-    H[12u + Q] = h01;
-    H[16u + Q] = H00 - Real(2) * H02 + H22;
-    H[20u + Q] = h12;
-    H[24u + Q] = h02;
-    H[28u + Q] = h12;
-    H[32u + Q] = H00 - Real(2) * H03 + H33;
-}
-
-template <int Order, std::size_t Q>
-inline void write_tetrahedron_simplex_all_stride4_q(
-    const Real (&phi0)[4][Order + 1],
-    const Real (&phi1)[4][Order + 1],
-    const Real (&phi2)[4][Order + 1],
-    const Real (&phi3)[4][Order + 1],
-    const Real (&dphi0)[4][Order + 1],
-    const Real (&dphi1)[4][Order + 1],
-    const Real (&dphi2)[4][Order + 1],
-    const Real (&dphi3)[4][Order + 1],
-    const Real (&d2phi0)[4][Order + 1],
-    const Real (&d2phi1)[4][Order + 1],
-    const Real (&d2phi2)[4][Order + 1],
-    const Real (&d2phi3)[4][Order + 1],
-    std::size_t i0,
-    std::size_t i1,
-    std::size_t i2,
-    std::size_t i3,
-    Real* SVMP_RESTRICT value_row,
-    Real* SVMP_RESTRICT g,
-    Real* SVMP_RESTRICT H) {
-    const Real v0 = phi0[Q][i0];
-    const Real v1 = phi1[Q][i1];
-    const Real v2 = phi2[Q][i2];
-    const Real v3 = phi3[Q][i3];
-    const Real D0 = dphi0[Q][i0];
-    const Real D1 = dphi1[Q][i1];
-    const Real D2 = dphi2[Q][i2];
-    const Real D3 = dphi3[Q][i3];
-    const Real DD0 = d2phi0[Q][i0];
-    const Real DD1 = d2phi1[Q][i1];
-    const Real DD2 = d2phi2[Q][i2];
-    const Real DD3 = d2phi3[Q][i3];
-    const Real v12 = v1 * v2;
-    const Real v13 = v1 * v3;
-    const Real v23 = v2 * v3;
-    const Real v123 = v1 * v23;
-    const Real v023 = v0 * v23;
-    const Real v013 = v0 * v13;
-    const Real v012 = v0 * v12;
-    const Real dl0 = D0 * v123;
-    const Real H00 = DD0 * v123;
-    const Real H11 = DD1 * v023;
-    const Real H22 = DD2 * v013;
-    const Real H33 = DD3 * v012;
-    const Real H01 = D0 * D1 * v23;
-    const Real H02 = D0 * D2 * v13;
-    const Real H03 = D0 * D3 * v12;
-    const Real H12 = D1 * D2 * v0 * v3;
-    const Real H13 = D1 * D3 * v0 * v2;
-    const Real H23 = D2 * D3 * v0 * v1;
-    const Real h01 = H00 - H01 - H02 + H12;
-    const Real h02 = H00 - H01 - H03 + H13;
-    const Real h12 = H00 - H02 - H03 + H23;
-
-    value_row[Q] = v0 * v123;
-    g[Q] = D1 * v023 - dl0;
-    g[4u + Q] = D2 * v013 - dl0;
-    g[8u + Q] = D3 * v012 - dl0;
-    H[Q] = H00 - Real(2) * H01 + H11;
-    H[4u + Q] = h01;
-    H[8u + Q] = h02;
-    H[12u + Q] = h01;
-    H[16u + Q] = H00 - Real(2) * H02 + H22;
-    H[20u + Q] = h12;
-    H[24u + Q] = h02;
-    H[28u + Q] = h12;
-    H[32u + Q] = H00 - Real(2) * H03 + H33;
-}
-
-template <int Order>
-void evaluate_tetrahedron_simplex_hessian_outputs_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    static_assert(Order >= 2 && Order <= 8, "specialized simplex path covers orders 2..8");
-
-    Real phi0[4][Order + 1];
-    Real phi1[4][Order + 1];
-    Real phi2[4][Order + 1];
-    Real phi3[4][Order + 1];
-    Real dphi0[4][Order + 1];
-    Real dphi1[4][Order + 1];
-    Real dphi2[4][Order + 1];
-    Real dphi3[4][Order + 1];
-    Real d2phi0[4][Order + 1];
-    Real d2phi1[4][Order + 1];
-    Real d2phi2[4][Order + 1];
-    Real d2phi3[4][Order + 1];
-
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l3 = xi[2];
-        const Real l0 = Real(1) - l1 - l2 - l3;
-        simplex_lagrange_factor_sequence_impl<true, true>(
-            Order, l0, phi0[q], dphi0[q], d2phi0[q]);
-        simplex_lagrange_factor_sequence_impl<true, true>(
-            Order, l1, phi1[q], dphi1[q], d2phi1[q]);
-        simplex_lagrange_factor_sequence_impl<true, true>(
-            Order, l2, phi2[q], dphi2[q], d2phi2[q]);
-        simplex_lagrange_factor_sequence_impl<true, true>(
-            Order, l3, phi3[q], dphi3[q], d2phi3[q]);
-    }
-
-    const std::size_t num_nodes = simplex_exponents.size();
-    if (values_out == nullptr && gradients_out == nullptr) {
-        if (output_stride == 4u) {
-            for (std::size_t node = 0; node < num_nodes; ++node) {
-                const auto& e = simplex_exponents[node];
-                const std::size_t i0 = static_cast<std::size_t>(e[0]);
-                const std::size_t i1 = static_cast<std::size_t>(e[1]);
-                const std::size_t i2 = static_cast<std::size_t>(e[2]);
-                const std::size_t i3 = static_cast<std::size_t>(e[3]);
-                Real* SVMP_RESTRICT H = hessians_out + node * 36u;
-                write_tetrahedron_simplex_hessian_stride4_q<Order, 0>(
-                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
-                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, H);
-                write_tetrahedron_simplex_hessian_stride4_q<Order, 1>(
-                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
-                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, H);
-                write_tetrahedron_simplex_hessian_stride4_q<Order, 2>(
-                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
-                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, H);
-                write_tetrahedron_simplex_hessian_stride4_q<Order, 3>(
-                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
-                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, H);
-            }
-        } else {
-            for (std::size_t node = 0; node < num_nodes; ++node) {
-                const auto& e = simplex_exponents[node];
-                const std::size_t i0 = static_cast<std::size_t>(e[0]);
-                const std::size_t i1 = static_cast<std::size_t>(e[1]);
-                const std::size_t i2 = static_cast<std::size_t>(e[2]);
-                const std::size_t i3 = static_cast<std::size_t>(e[3]);
-                Real* SVMP_RESTRICT H = hessians_out + node * 9u * output_stride;
-                write_tetrahedron_simplex_hessian_q4<Order, 0>(
-                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
-                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, output_stride, H);
-                write_tetrahedron_simplex_hessian_q4<Order, 1>(
-                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
-                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, output_stride, H);
-                write_tetrahedron_simplex_hessian_q4<Order, 2>(
-                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
-                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, output_stride, H);
-                write_tetrahedron_simplex_hessian_q4<Order, 3>(
-                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
-                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, output_stride, H);
-            }
-        }
-        return;
-    }
-
-    if (values_out != nullptr && gradients_out != nullptr) {
-        if (output_stride == 4u) {
-            for (std::size_t node = 0; node < num_nodes; ++node) {
-                const auto& e = simplex_exponents[node];
-                const std::size_t i0 = static_cast<std::size_t>(e[0]);
-                const std::size_t i1 = static_cast<std::size_t>(e[1]);
-                const std::size_t i2 = static_cast<std::size_t>(e[2]);
-                const std::size_t i3 = static_cast<std::size_t>(e[3]);
-                Real* SVMP_RESTRICT value_row = values_out + node * output_stride;
-                Real* SVMP_RESTRICT g = gradients_out + node * 3u * output_stride;
-                Real* SVMP_RESTRICT H = hessians_out + node * 9u * output_stride;
-                write_tetrahedron_simplex_all_stride4_q<Order, 0>(
-                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
-                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, value_row, g, H);
-                write_tetrahedron_simplex_all_stride4_q<Order, 1>(
-                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
-                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, value_row, g, H);
-                write_tetrahedron_simplex_all_stride4_q<Order, 2>(
-                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
-                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, value_row, g, H);
-                write_tetrahedron_simplex_all_stride4_q<Order, 3>(
-                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
-                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, value_row, g, H);
-            }
-            return;
-        }
-
-        for (std::size_t node = 0; node < num_nodes; ++node) {
-            const auto& e = simplex_exponents[node];
-            const std::size_t i0 = static_cast<std::size_t>(e[0]);
-            const std::size_t i1 = static_cast<std::size_t>(e[1]);
-            const std::size_t i2 = static_cast<std::size_t>(e[2]);
-            const std::size_t i3 = static_cast<std::size_t>(e[3]);
-            Real* SVMP_RESTRICT value_row = values_out + node * output_stride;
-            Real* SVMP_RESTRICT g = gradients_out + node * 3u * output_stride;
-            Real* SVMP_RESTRICT H = hessians_out + node * 9u * output_stride;
-
-            for (std::size_t q = 0; q < 4u; ++q) {
-                const Real v0 = phi0[q][i0];
-                const Real v1 = phi1[q][i1];
-                const Real v2 = phi2[q][i2];
-                const Real v3 = phi3[q][i3];
-                const Real D0 = dphi0[q][i0];
-                const Real D1 = dphi1[q][i1];
-                const Real D2 = dphi2[q][i2];
-                const Real D3 = dphi3[q][i3];
-                const Real DD0 = d2phi0[q][i0];
-                const Real DD1 = d2phi1[q][i1];
-                const Real DD2 = d2phi2[q][i2];
-                const Real DD3 = d2phi3[q][i3];
-                const Real v12 = v1 * v2;
-                const Real v13 = v1 * v3;
-                const Real v23 = v2 * v3;
-                const Real v123 = v1 * v23;
-                const Real v023 = v0 * v23;
-                const Real v013 = v0 * v13;
-                const Real v012 = v0 * v12;
-                const Real dl0 = D0 * v123;
-                const Real H00 = DD0 * v123;
-                const Real H11 = DD1 * v023;
-                const Real H22 = DD2 * v013;
-                const Real H33 = DD3 * v012;
-                const Real H01 = D0 * D1 * v23;
-                const Real H02 = D0 * D2 * v13;
-                const Real H03 = D0 * D3 * v12;
-                const Real H12 = D1 * D2 * v0 * v3;
-                const Real H13 = D1 * D3 * v0 * v2;
-                const Real H23 = D2 * D3 * v0 * v1;
-                const Real h01 = H00 - H01 - H02 + H12;
-                const Real h02 = H00 - H01 - H03 + H13;
-                const Real h12 = H00 - H02 - H03 + H23;
-
-                value_row[q] = v0 * v123;
-                g[0u * output_stride + q] = D1 * v023 - dl0;
-                g[1u * output_stride + q] = D2 * v013 - dl0;
-                g[2u * output_stride + q] = D3 * v012 - dl0;
-                H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
-                H[1u * output_stride + q] = h01;
-                H[2u * output_stride + q] = h02;
-                H[3u * output_stride + q] = h01;
-                H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
-                H[5u * output_stride + q] = h12;
-                H[6u * output_stride + q] = h02;
-                H[7u * output_stride + q] = h12;
-                H[8u * output_stride + q] = H00 - Real(2) * H03 + H33;
-            }
-        }
-        return;
-    }
-
-    for (std::size_t node = 0; node < num_nodes; ++node) {
-        const auto& e = simplex_exponents[node];
-        const std::size_t i0 = static_cast<std::size_t>(e[0]);
-        const std::size_t i1 = static_cast<std::size_t>(e[1]);
-        const std::size_t i2 = static_cast<std::size_t>(e[2]);
-        const std::size_t i3 = static_cast<std::size_t>(e[3]);
-        Real* SVMP_RESTRICT value_row = values_out ? values_out + node * output_stride : nullptr;
-        Real* SVMP_RESTRICT g = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
-        Real* SVMP_RESTRICT H = hessians_out + node * 9u * output_stride;
-
-        for (std::size_t q = 0; q < 4u; ++q) {
-            const Real v0 = phi0[q][i0];
-            const Real v1 = phi1[q][i1];
-            const Real v2 = phi2[q][i2];
-            const Real v3 = phi3[q][i3];
-            if (value_row != nullptr) {
-                value_row[q] = v0 * v1 * v2 * v3;
-            }
-
-            const Real D0 = dphi0[q][i0];
-            const Real D1 = dphi1[q][i1];
-            const Real D2 = dphi2[q][i2];
-            const Real D3 = dphi3[q][i3];
-            if (g != nullptr) {
-                const Real dl0 = D0 * v1 * v2 * v3;
-                g[0u * output_stride + q] = v0 * D1 * v2 * v3 - dl0;
-                g[1u * output_stride + q] = v0 * v1 * D2 * v3 - dl0;
-                g[2u * output_stride + q] = v0 * v1 * v2 * D3 - dl0;
-            }
-
-            const Real DD0 = d2phi0[q][i0];
-            const Real DD1 = d2phi1[q][i1];
-            const Real DD2 = d2phi2[q][i2];
-            const Real DD3 = d2phi3[q][i3];
-            const Real H00 = DD0 * v1 * v2 * v3;
-            const Real H11 = v0 * DD1 * v2 * v3;
-            const Real H22 = v0 * v1 * DD2 * v3;
-            const Real H33 = v0 * v1 * v2 * DD3;
-            const Real H01 = D0 * D1 * v2 * v3;
-            const Real H02 = D0 * v1 * D2 * v3;
-            const Real H03 = D0 * v1 * v2 * D3;
-            const Real H12 = v0 * D1 * D2 * v3;
-            const Real H13 = v0 * D1 * v2 * D3;
-            const Real H23 = v0 * v1 * D2 * D3;
-            const Real h01 = H00 - H01 - H02 + H12;
-            const Real h02 = H00 - H01 - H03 + H13;
-            const Real h12 = H00 - H02 - H03 + H23;
-            H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
-            H[1u * output_stride + q] = h01;
-            H[2u * output_stride + q] = h02;
-            H[3u * output_stride + q] = h01;
-            H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
-            H[5u * output_stride + q] = h12;
-            H[6u * output_stride + q] = h02;
-            H[7u * output_stride + q] = h12;
-            H[8u * output_stride + q] = H00 - Real(2) * H03 + H33;
-        }
-    }
-}
-
-bool try_evaluate_tetrahedron_simplex_hessian_outputs_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    switch (order) {
-    case 2:
-        evaluate_tetrahedron_simplex_hessian_outputs_q4<2>(
-            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    case 3:
-        evaluate_tetrahedron_simplex_hessian_outputs_q4<3>(
-            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    case 4:
-        evaluate_tetrahedron_simplex_hessian_outputs_q4<4>(
-            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    case 5:
-        evaluate_tetrahedron_simplex_hessian_outputs_q4<5>(
-            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    case 6:
-        evaluate_tetrahedron_simplex_hessian_outputs_q4<6>(
-            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    case 7:
-        evaluate_tetrahedron_simplex_hessian_outputs_q4<7>(
-            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    case 8:
-        evaluate_tetrahedron_simplex_hessian_outputs_q4<8>(
-            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    default:
-        return false;
-    }
-}
-
-// Per-thread scratch space for simplex factor sequences. Common low orders use
-// fixed storage; higher orders fall back to dynamic vectors.
-struct SimplexAxisScratch {
-    std::size_t size{0};
-    std::array<Real, kFixedSimplexAxisSize> phi_fixed{};
-    std::array<Real, kFixedSimplexAxisSize> dphi_fixed{};
-    std::array<Real, kFixedSimplexAxisSize> d2phi_fixed{};
-    std::vector<Real> phi_dynamic;
-    std::vector<Real> dphi_dynamic;
-    std::vector<Real> d2phi_dynamic;
-
-    void reserveFor(std::size_t n) {
-        size = n;
-        if (n <= kFixedSimplexAxisSize) {
-            return;
-        }
-        if (phi_dynamic.size() < n) phi_dynamic.resize(n);
-        if (dphi_dynamic.size() < n) dphi_dynamic.resize(n);
-        if (d2phi_dynamic.size() < n) d2phi_dynamic.resize(n);
-    }
-
-    Real* phi() noexcept {
-        return size <= kFixedSimplexAxisSize ? phi_fixed.data() : phi_dynamic.data();
-    }
-
-    Real* dphi() noexcept {
-        return size <= kFixedSimplexAxisSize ? dphi_fixed.data() : dphi_dynamic.data();
-    }
-
-    Real* d2phi() noexcept {
-        return size <= kFixedSimplexAxisSize ? d2phi_fixed.data() : d2phi_dynamic.data();
-    }
-
-    const Real* phi() const noexcept {
-        return size <= kFixedSimplexAxisSize ? phi_fixed.data() : phi_dynamic.data();
-    }
-
-    const Real* dphi() const noexcept {
-        return size <= kFixedSimplexAxisSize ? dphi_fixed.data() : dphi_dynamic.data();
-    }
-
-    const Real* d2phi() const noexcept {
-        return size <= kFixedSimplexAxisSize ? d2phi_fixed.data() : d2phi_dynamic.data();
-    }
-};
-
-SimplexAxisScratch& simplex_axis_scratch_slot(int slot) {
-    thread_local SimplexAxisScratch s[4];
-    return s[slot];
-}
-
-struct SimplexVectorSink {
-    std::vector<Real>* values;
-    std::vector<Gradient>* gradients;
-    std::vector<Hessian>* hessians;
-
-    bool wants_values() const noexcept { return values != nullptr; }
-    bool wants_gradients() const noexcept { return gradients != nullptr; }
-    bool wants_hessians() const noexcept { return hessians != nullptr; }
-
-    void prepare(std::size_t n_nodes) const {
-        if (values)    values->resize(n_nodes);
-        if (gradients) gradients->resize(n_nodes);
-        if (hessians)  hessians->resize(n_nodes);
-    }
-
-    void write_value(std::size_t n, Real value) const {
-        (*values)[n] = value;
-    }
-
-    void write_gradient(std::size_t n, Real x, Real y, Real z) const {
-        auto& gradient = (*gradients)[n];
-        gradient[0] = x;
-        gradient[1] = y;
-        gradient[2] = z;
-    }
-
-    void write_hessian(std::size_t n,
-                       Real xx,
-                       Real yy,
-                       Real zz,
-                       Real xy,
-                       Real xz,
-                       Real yz) const {
-        Hessian hessian{};
-        hessian(0, 0) = xx;
-        hessian(1, 1) = yy;
-        hessian(2, 2) = zz;
-        hessian(0, 1) = xy; hessian(1, 0) = xy;
-        hessian(0, 2) = xz; hessian(2, 0) = xz;
-        hessian(1, 2) = yz; hessian(2, 1) = yz;
-        (*hessians)[n] = hessian;
-    }
-};
-
-struct SimplexRawSink {
-    Real* values;
-    Real* gradients;
-    Real* hessians;
-
-    bool wants_values() const noexcept { return values != nullptr; }
-    bool wants_gradients() const noexcept { return gradients != nullptr; }
-    bool wants_hessians() const noexcept { return hessians != nullptr; }
-
-    void prepare(std::size_t) const {}
-
-    void write_value(std::size_t n, Real value) const {
-        values[n] = value;
-    }
-
-    void write_gradient(std::size_t n, Real x, Real y, Real z) const {
-        Real* gradient = gradients + n * 3u;
-        gradient[0] = x;
-        gradient[1] = y;
-        gradient[2] = z;
-    }
-
-    void write_hessian(std::size_t n,
-                       Real xx,
-                       Real yy,
-                       Real zz,
-                       Real xy,
-                       Real xz,
-                       Real yz) const {
-        Real* hessian = hessians + n * 9u;
-        hessian[0] = xx;
-        hessian[1] = xy;
-        hessian[2] = xz;
-        hessian[3] = xy;
-        hessian[4] = yy;
-        hessian[5] = yz;
-        hessian[6] = xz;
-        hessian[7] = yz;
-        hessian[8] = zz;
-    }
-};
-
-template <typename Sink>
-void evaluate_triangle_simplex_basis_impl(const std::vector<std::array<int, 4>>& simplex_exponents,
-                                          int order,
-                                          const math::Vector<Real, 3>& xi,
-                                          const Sink& sink) {
-    const Real l1 = xi[0];
-    const Real l2 = xi[1];
-    const Real l0 = Real(1) - l1 - l2;
-
-    const std::size_t n = static_cast<std::size_t>(order + 1);
-    SimplexAxisScratch& s0 = simplex_axis_scratch_slot(0);
-    SimplexAxisScratch& s1 = simplex_axis_scratch_slot(1);
-    SimplexAxisScratch& s2 = simplex_axis_scratch_slot(2);
-    s0.reserveFor(n);
-    s1.reserveFor(n);
-    s2.reserveFor(n);
-
-    const std::size_t num_nodes = simplex_exponents.size();
-    sink.prepare(num_nodes);
-    const bool need_values = sink.wants_values();
-    const bool need_gradients = sink.wants_gradients();
-    const bool need_hessians = sink.wants_hessians();
-    Real* d0_out = (need_gradients || need_hessians) ? s0.dphi() : nullptr;
-    Real* d1_out = (need_gradients || need_hessians) ? s1.dphi() : nullptr;
-    Real* d2_out = (need_gradients || need_hessians) ? s2.dphi() : nullptr;
-    Real* d20_out = need_hessians ? s0.d2phi() : nullptr;
-    Real* d21_out = need_hessians ? s1.d2phi() : nullptr;
-    Real* d22_out = need_hessians ? s2.d2phi() : nullptr;
-
-    simplex_lagrange_factor_sequence(order, l0, s0.phi(), d0_out, d20_out);
-    simplex_lagrange_factor_sequence(order, l1, s1.phi(), d1_out, d21_out);
-    simplex_lagrange_factor_sequence(order, l2, s2.phi(), d2_out, d22_out);
-    const Real* phi0 = s0.phi();
-    const Real* phi1 = s1.phi();
-    const Real* phi2 = s2.phi();
-    const Real* dphi0 = s0.dphi();
-    const Real* dphi1 = s1.dphi();
-    const Real* dphi2 = s2.dphi();
-    const Real* d2phi0 = s0.d2phi();
-    const Real* d2phi1 = s1.d2phi();
-    const Real* d2phi2 = s2.d2phi();
-
-    for (std::size_t n_idx = 0; n_idx < num_nodes; ++n_idx) {
-        const auto& e = simplex_exponents[n_idx];
-        const std::size_t i0 = static_cast<std::size_t>(e[0]);
-        const std::size_t i1 = static_cast<std::size_t>(e[1]);
-        const std::size_t i2 = static_cast<std::size_t>(e[2]);
-
-        const Real v0 = phi0[i0];
-        const Real v1 = phi1[i1];
-        const Real v2 = phi2[i2];
-        if (need_values) {
-            sink.write_value(n_idx, v0 * v1 * v2);
-        }
-        if (!need_gradients && !need_hessians) {
-            continue;
-        }
-
-        const Real D0 = dphi0[i0];
-        const Real D1 = dphi1[i1];
-        const Real D2 = dphi2[i2];
-
-        if (need_gradients) {
-            const Real dl0 = D0 * v1 * v2;
-            const Real dl1 = v0 * D1 * v2;
-            const Real dl2 = v0 * v1 * D2;
-            sink.write_gradient(n_idx, dl1 - dl0, dl2 - dl0, Real(0));
-        }
-
-        if (need_hessians) {
-            const Real DD0 = d2phi0[i0];
-            const Real DD1 = d2phi1[i1];
-            const Real DD2 = d2phi2[i2];
-
-            const Real H00 = DD0 * v1 * v2;
-            const Real H11 = v0 * DD1 * v2;
-            const Real H22 = v0 * v1 * DD2;
-            const Real H01 = D0 * D1 * v2;
-            const Real H02 = D0 * v1 * D2;
-            const Real H12 = v0 * D1 * D2;
-
-            sink.write_hessian(n_idx,
-                               H00 - Real(2) * H01 + H11,
-                               H00 - Real(2) * H02 + H22,
-                               Real(0),
-                               H00 - H01 - H02 + H12,
-                               Real(0),
-                               Real(0));
-        }
-    }
-}
-
-void evaluate_triangle_simplex_basis(const std::vector<std::array<int, 4>>& simplex_exponents,
-                                     int order,
-                                     const math::Vector<Real, 3>& xi,
-                                     std::vector<Real>* values,
-                                     std::vector<Gradient>* gradients,
-                                     std::vector<Hessian>* hessians) {
-    const SimplexVectorSink sink{values, gradients, hessians};
-    evaluate_triangle_simplex_basis_impl(simplex_exponents, order, xi, sink);
-}
-
-void evaluate_triangle_simplex_basis_to(const std::vector<std::array<int, 4>>& simplex_exponents,
-                                        int order,
-                                        const math::Vector<Real, 3>& xi,
-                                        Real* SVMP_RESTRICT values_out,
-                                        Real* SVMP_RESTRICT gradients_out,
-                                        Real* SVMP_RESTRICT hessians_out) {
-    const SimplexRawSink sink{values_out, gradients_out, hessians_out};
-    evaluate_triangle_simplex_basis_impl(simplex_exponents, order, xi, sink);
-}
-
-void evaluate_triangle_simplex_basis_strided(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    const std::size_t num_nodes = simplex_exponents.size();
-    if (points.empty() || num_nodes == 0u) {
-        return;
-    }
-
-    const std::size_t sequence_size = static_cast<std::size_t>(order + 1);
-    const std::size_t num_qpts = points.size();
-    const bool need_gradients = gradients_out != nullptr;
-    const bool need_hessians = hessians_out != nullptr;
-    if (num_qpts == 4u &&
-        values_out != nullptr &&
-        !need_gradients &&
-        !need_hessians &&
-        try_evaluate_triangle_simplex_values_q4(
-            simplex_exponents, order, points, output_stride, values_out)) {
-        return;
-    }
-    if (num_qpts == 4u &&
-        values_out == nullptr &&
-        need_gradients &&
-        !need_hessians &&
-        try_evaluate_triangle_simplex_gradients_q4(
-            simplex_exponents, order, points, output_stride, gradients_out)) {
-        return;
-    }
-    if (num_qpts == 4u &&
-        need_hessians &&
-        try_evaluate_triangle_simplex_hessian_outputs_q4(
-            simplex_exponents, order, points, output_stride,
-            values_out, gradients_out, hessians_out)) {
-        return;
-    }
-    const std::size_t batch_entries = sequence_size * num_qpts;
-    if (batch_entries <= kFixedSimplexBatchEntries) {
-        if (values_out != nullptr && gradients_out == nullptr && hessians_out == nullptr) {
-            std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
-
-            for (std::size_t q = 0; q < num_qpts; ++q) {
-                const auto& xi = points[q];
-                const Real l1 = xi[0];
-                const Real l2 = xi[1];
-                const Real l0 = Real(1) - l1 - l2;
-                const std::size_t offset = q * sequence_size;
-                simplex_lagrange_factor_sequence(
-                    order, l0, phi0_batch.data() + offset, nullptr, nullptr);
-                simplex_lagrange_factor_sequence(
-                    order, l1, phi1_batch.data() + offset, nullptr, nullptr);
-                simplex_lagrange_factor_sequence(
-                    order, l2, phi2_batch.data() + offset, nullptr, nullptr);
-            }
-
-            for (std::size_t node = 0; node < num_nodes; ++node) {
-                const auto& e = simplex_exponents[node];
-                const std::size_t i0 = static_cast<std::size_t>(e[0]);
-                const std::size_t i1 = static_cast<std::size_t>(e[1]);
-                const std::size_t i2 = static_cast<std::size_t>(e[2]);
-                Real* value_row = values_out + node * output_stride;
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    const std::size_t offset = q * sequence_size;
-                    value_row[q] =
-                        phi0_batch[offset + i0] *
-                        phi1_batch[offset + i1] *
-                        phi2_batch[offset + i2];
-                }
-            }
-            return;
-        }
-
-        if (values_out == nullptr && gradients_out != nullptr && hessians_out == nullptr) {
-            std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
-
-            for (std::size_t q = 0; q < num_qpts; ++q) {
-                const auto& xi = points[q];
-                const Real l1 = xi[0];
-                const Real l2 = xi[1];
-                const Real l0 = Real(1) - l1 - l2;
-                const std::size_t offset = q * sequence_size;
-                simplex_lagrange_factor_sequence(
-                    order, l0, phi0_batch.data() + offset, dphi0_batch.data() + offset, nullptr);
-                simplex_lagrange_factor_sequence(
-                    order, l1, phi1_batch.data() + offset, dphi1_batch.data() + offset, nullptr);
-                simplex_lagrange_factor_sequence(
-                    order, l2, phi2_batch.data() + offset, dphi2_batch.data() + offset, nullptr);
-            }
-
-            for (std::size_t node = 0; node < num_nodes; ++node) {
-                const auto& e = simplex_exponents[node];
-                const std::size_t i0 = static_cast<std::size_t>(e[0]);
-                const std::size_t i1 = static_cast<std::size_t>(e[1]);
-                const std::size_t i2 = static_cast<std::size_t>(e[2]);
-                Real* g = gradients_out + node * 3u * output_stride;
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    const std::size_t offset = q * sequence_size;
-                    const Real v0 = phi0_batch[offset + i0];
-                    const Real v1 = phi1_batch[offset + i1];
-                    const Real v2 = phi2_batch[offset + i2];
-                    const Real D0 = dphi0_batch[offset + i0];
-                    const Real D1 = dphi1_batch[offset + i1];
-                    const Real D2 = dphi2_batch[offset + i2];
-                    const Real dl0 = D0 * v1 * v2;
-                    g[0u * output_stride + q] = v0 * D1 * v2 - dl0;
-                    g[1u * output_stride + q] = v0 * v1 * D2 - dl0;
-                    g[2u * output_stride + q] = Real(0);
-                }
-            }
-            return;
-        }
-
-        if (order >= 4 &&
-            values_out == nullptr &&
-            gradients_out == nullptr &&
-            hessians_out != nullptr) {
-            std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
-            std::array<Real, kFixedSimplexBatchEntries> d2phi0_batch;
-            std::array<Real, kFixedSimplexBatchEntries> d2phi1_batch;
-            std::array<Real, kFixedSimplexBatchEntries> d2phi2_batch;
-
-            for (std::size_t q = 0; q < num_qpts; ++q) {
-                const auto& xi = points[q];
-                const Real l1 = xi[0];
-                const Real l2 = xi[1];
-                const Real l0 = Real(1) - l1 - l2;
-                const std::size_t offset = q * sequence_size;
-                simplex_lagrange_factor_sequence(
-                    order, l0, phi0_batch.data() + offset,
-                    dphi0_batch.data() + offset, d2phi0_batch.data() + offset);
-                simplex_lagrange_factor_sequence(
-                    order, l1, phi1_batch.data() + offset,
-                    dphi1_batch.data() + offset, d2phi1_batch.data() + offset);
-                simplex_lagrange_factor_sequence(
-                    order, l2, phi2_batch.data() + offset,
-                    dphi2_batch.data() + offset, d2phi2_batch.data() + offset);
-            }
-
-            for (std::size_t node = 0; node < num_nodes; ++node) {
-                const auto& e = simplex_exponents[node];
-                const std::size_t i0 = static_cast<std::size_t>(e[0]);
-                const std::size_t i1 = static_cast<std::size_t>(e[1]);
-                const std::size_t i2 = static_cast<std::size_t>(e[2]);
-                Real* H = hessians_out + node * 9u * output_stride;
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    const std::size_t offset = q * sequence_size;
-                    const Real v0 = phi0_batch[offset + i0];
-                    const Real v1 = phi1_batch[offset + i1];
-                    const Real v2 = phi2_batch[offset + i2];
-                    const Real D0 = dphi0_batch[offset + i0];
-                    const Real D1 = dphi1_batch[offset + i1];
-                    const Real D2 = dphi2_batch[offset + i2];
-                    const Real DD0 = d2phi0_batch[offset + i0];
-                    const Real DD1 = d2phi1_batch[offset + i1];
-                    const Real DD2 = d2phi2_batch[offset + i2];
-                    const Real H00 = DD0 * v1 * v2;
-                    const Real H11 = v0 * DD1 * v2;
-                    const Real H22 = v0 * v1 * DD2;
-                    const Real H01 = D0 * D1 * v2;
-                    const Real H02 = D0 * v1 * D2;
-                    const Real H12 = v0 * D1 * D2;
-                    const Real h01 = H00 - H01 - H02 + H12;
-
-                    H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
-                    H[1u * output_stride + q] = h01;
-                    H[2u * output_stride + q] = Real(0);
-                    H[3u * output_stride + q] = h01;
-                    H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
-                    H[5u * output_stride + q] = Real(0);
-                    H[6u * output_stride + q] = Real(0);
-                    H[7u * output_stride + q] = Real(0);
-                    H[8u * output_stride + q] = Real(0);
-                }
-            }
-            return;
-        }
-
-        std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
-        std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
-        std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
-        std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
-        std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
-        std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
-        std::array<Real, kFixedSimplexBatchEntries> d2phi0_batch;
-        std::array<Real, kFixedSimplexBatchEntries> d2phi1_batch;
-        std::array<Real, kFixedSimplexBatchEntries> d2phi2_batch;
-
-        for (std::size_t q = 0; q < num_qpts; ++q) {
-            const auto& xi = points[q];
-            const Real l1 = xi[0];
-            const Real l2 = xi[1];
-            const Real l0 = Real(1) - l1 - l2;
-            const std::size_t offset = q * sequence_size;
-            Real* d0_out = (need_gradients || need_hessians) ? dphi0_batch.data() + offset : nullptr;
-            Real* d1_out = (need_gradients || need_hessians) ? dphi1_batch.data() + offset : nullptr;
-            Real* d2_out = (need_gradients || need_hessians) ? dphi2_batch.data() + offset : nullptr;
-            Real* d20_out = need_hessians ? d2phi0_batch.data() + offset : nullptr;
-            Real* d21_out = need_hessians ? d2phi1_batch.data() + offset : nullptr;
-            Real* d22_out = need_hessians ? d2phi2_batch.data() + offset : nullptr;
-            simplex_lagrange_factor_sequence(order, l0, phi0_batch.data() + offset, d0_out, d20_out);
-            simplex_lagrange_factor_sequence(order, l1, phi1_batch.data() + offset, d1_out, d21_out);
-            simplex_lagrange_factor_sequence(order, l2, phi2_batch.data() + offset, d2_out, d22_out);
-        }
-
-        for (std::size_t node = 0; node < num_nodes; ++node) {
-            const auto& e = simplex_exponents[node];
-            const std::size_t i0 = static_cast<std::size_t>(e[0]);
-            const std::size_t i1 = static_cast<std::size_t>(e[1]);
-            const std::size_t i2 = static_cast<std::size_t>(e[2]);
-            Real* value_row = values_out ? values_out + node * output_stride : nullptr;
-            Real* g = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
-            Real* H = hessians_out ? hessians_out + node * 9u * output_stride : nullptr;
-
-            for (std::size_t q = 0; q < num_qpts; ++q) {
-                const std::size_t offset = q * sequence_size;
-                const Real v0 = phi0_batch[offset + i0];
-                const Real v1 = phi1_batch[offset + i1];
-                const Real v2 = phi2_batch[offset + i2];
-                if (value_row != nullptr) {
-                    value_row[q] = v0 * v1 * v2;
-                }
-                if (!need_gradients && !need_hessians) {
-                    continue;
-                }
-
-                const Real D0 = dphi0_batch[offset + i0];
-                const Real D1 = dphi1_batch[offset + i1];
-                const Real D2 = dphi2_batch[offset + i2];
-
-                if (gradients_out != nullptr) {
-                    const Real dl0 = D0 * v1 * v2;
-                    const Real dl1 = v0 * D1 * v2;
-                    const Real dl2 = v0 * v1 * D2;
-                    g[0u * output_stride + q] = dl1 - dl0;
-                    g[1u * output_stride + q] = dl2 - dl0;
-                    g[2u * output_stride + q] = Real(0);
-                }
-
-                if (hessians_out != nullptr) {
-                    const Real DD0 = d2phi0_batch[offset + i0];
-                    const Real DD1 = d2phi1_batch[offset + i1];
-                    const Real DD2 = d2phi2_batch[offset + i2];
-                    const Real H00 = DD0 * v1 * v2;
-                    const Real H11 = v0 * DD1 * v2;
-                    const Real H22 = v0 * v1 * DD2;
-                    const Real H01 = D0 * D1 * v2;
-                    const Real H02 = D0 * v1 * D2;
-                    const Real H12 = v0 * D1 * D2;
-                    const Real h01 = H00 - H01 - H02 + H12;
-                    H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
-                    H[1u * output_stride + q] = h01;
-                    H[2u * output_stride + q] = Real(0);
-                    H[3u * output_stride + q] = h01;
-                    H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
-                    H[5u * output_stride + q] = Real(0);
-                    H[6u * output_stride + q] = Real(0);
-                    H[7u * output_stride + q] = Real(0);
-                    H[8u * output_stride + q] = Real(0);
-                }
-            }
-        }
-        return;
-    }
-
-    SimplexAxisScratch& s0 = simplex_axis_scratch_slot(0);
-    SimplexAxisScratch& s1 = simplex_axis_scratch_slot(1);
-    SimplexAxisScratch& s2 = simplex_axis_scratch_slot(2);
-    s0.reserveFor(sequence_size);
-    s1.reserveFor(sequence_size);
-    s2.reserveFor(sequence_size);
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-
-        Real* d0_out = (need_gradients || need_hessians) ? s0.dphi() : nullptr;
-        Real* d1_out = (need_gradients || need_hessians) ? s1.dphi() : nullptr;
-        Real* d2_out = (need_gradients || need_hessians) ? s2.dphi() : nullptr;
-        Real* d20_out = need_hessians ? s0.d2phi() : nullptr;
-        Real* d21_out = need_hessians ? s1.d2phi() : nullptr;
-        Real* d22_out = need_hessians ? s2.d2phi() : nullptr;
-
-        simplex_lagrange_factor_sequence(order, l0, s0.phi(), d0_out, d20_out);
-        simplex_lagrange_factor_sequence(order, l1, s1.phi(), d1_out, d21_out);
-        simplex_lagrange_factor_sequence(order, l2, s2.phi(), d2_out, d22_out);
-        const Real* phi0 = s0.phi();
-        const Real* phi1 = s1.phi();
-        const Real* phi2 = s2.phi();
-        const Real* dphi0 = s0.dphi();
-        const Real* dphi1 = s1.dphi();
-        const Real* dphi2 = s2.dphi();
-        const Real* d2phi0 = s0.d2phi();
-        const Real* d2phi1 = s1.d2phi();
-        const Real* d2phi2 = s2.d2phi();
-
-        for (std::size_t node = 0; node < num_nodes; ++node) {
-            const auto& e = simplex_exponents[node];
-            const std::size_t i0 = static_cast<std::size_t>(e[0]);
-            const std::size_t i1 = static_cast<std::size_t>(e[1]);
-            const std::size_t i2 = static_cast<std::size_t>(e[2]);
-
-            const Real v0 = phi0[i0];
-            const Real v1 = phi1[i1];
-            const Real v2 = phi2[i2];
-            const Real value = v0 * v1 * v2;
-            if (values_out != nullptr) {
-                values_out[node * output_stride + q] = value;
-            }
-            if (!need_gradients && !need_hessians) {
-                continue;
-            }
-
-            const Real D0 = dphi0[i0];
-            const Real D1 = dphi1[i1];
-            const Real D2 = dphi2[i2];
-
-            if (gradients_out != nullptr) {
-                const Real dl0 = D0 * v1 * v2;
-                const Real dl1 = v0 * D1 * v2;
-                const Real dl2 = v0 * v1 * D2;
-                Real* g = gradients_out + node * 3u * output_stride;
-                g[0u * output_stride + q] = dl1 - dl0;
-                g[1u * output_stride + q] = dl2 - dl0;
-                g[2u * output_stride + q] = Real(0);
-            }
-
-            if (hessians_out != nullptr) {
-                const Real DD0 = d2phi0[i0];
-                const Real DD1 = d2phi1[i1];
-                const Real DD2 = d2phi2[i2];
-
-                const Real H00 = DD0 * v1 * v2;
-                const Real H11 = v0 * DD1 * v2;
-                const Real H22 = v0 * v1 * DD2;
-                const Real H01 = D0 * D1 * v2;
-                const Real H02 = D0 * v1 * D2;
-                const Real H12 = v0 * D1 * D2;
-
-                Real* H = hessians_out + node * 9u * output_stride;
-                const Real h01 = H00 - H01 - H02 + H12;
-                H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
-                H[1u * output_stride + q] = h01;
-                H[2u * output_stride + q] = Real(0);
-                H[3u * output_stride + q] = h01;
-                H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
-                H[5u * output_stride + q] = Real(0);
-                H[6u * output_stride + q] = Real(0);
-                H[7u * output_stride + q] = Real(0);
-                H[8u * output_stride + q] = Real(0);
-            }
-        }
-    }
-}
-
-void evaluate_triangle_simplex_basis_wedge_components_strided(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_xy_out,
-    Real* SVMP_RESTRICT hessians_xx_xy_yy_out) {
-    const std::size_t num_nodes = simplex_exponents.size();
-    if (points.empty() || num_nodes == 0u) {
-        return;
-    }
-
-    const std::size_t sequence_size = static_cast<std::size_t>(order + 1);
-    const std::size_t num_qpts = points.size();
-    const bool need_gradients = gradients_xy_out != nullptr;
-    const bool need_hessians = hessians_xx_xy_yy_out != nullptr;
-    const std::size_t batch_entries = sequence_size * num_qpts;
-
-    if (batch_entries <= kFixedSimplexBatchEntries) {
-        if (values_out != nullptr &&
-            gradients_xy_out != nullptr &&
-            hessians_xx_xy_yy_out == nullptr) {
-            std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
-
-            for (std::size_t q = 0; q < num_qpts; ++q) {
-                const auto& xi = points[q];
-                const Real l1 = xi[0];
-                const Real l2 = xi[1];
-                const Real l0 = Real(1) - l1 - l2;
-                const std::size_t offset = q * sequence_size;
-                simplex_lagrange_factor_sequence(
-                    order, l0, phi0_batch.data() + offset, dphi0_batch.data() + offset, nullptr);
-                simplex_lagrange_factor_sequence(
-                    order, l1, phi1_batch.data() + offset, dphi1_batch.data() + offset, nullptr);
-                simplex_lagrange_factor_sequence(
-                    order, l2, phi2_batch.data() + offset, dphi2_batch.data() + offset, nullptr);
-            }
-
-            for (std::size_t node = 0; node < num_nodes; ++node) {
-                const auto& e = simplex_exponents[node];
-                const std::size_t i0 = static_cast<std::size_t>(e[0]);
-                const std::size_t i1 = static_cast<std::size_t>(e[1]);
-                const std::size_t i2 = static_cast<std::size_t>(e[2]);
-                Real* value_row = values_out + node * output_stride;
-                Real* g = gradients_xy_out + node * 2u * output_stride;
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    const std::size_t offset = q * sequence_size;
-                    const Real v0 = phi0_batch[offset + i0];
-                    const Real v1 = phi1_batch[offset + i1];
-                    const Real v2 = phi2_batch[offset + i2];
-                    const Real D0 = dphi0_batch[offset + i0];
-                    const Real D1 = dphi1_batch[offset + i1];
-                    const Real D2 = dphi2_batch[offset + i2];
-                    const Real dl0 = D0 * v1 * v2;
-                    value_row[q] = v0 * v1 * v2;
-                    g[0u * output_stride + q] = v0 * D1 * v2 - dl0;
-                    g[1u * output_stride + q] = v0 * v1 * D2 - dl0;
-                }
-            }
-            return;
-        }
-
-        if (values_out != nullptr &&
-            gradients_xy_out != nullptr &&
-            hessians_xx_xy_yy_out != nullptr) {
-            std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
-            std::array<Real, kFixedSimplexBatchEntries> d2phi0_batch;
-            std::array<Real, kFixedSimplexBatchEntries> d2phi1_batch;
-            std::array<Real, kFixedSimplexBatchEntries> d2phi2_batch;
-
-            for (std::size_t q = 0; q < num_qpts; ++q) {
-                const auto& xi = points[q];
-                const Real l1 = xi[0];
-                const Real l2 = xi[1];
-                const Real l0 = Real(1) - l1 - l2;
-                const std::size_t offset = q * sequence_size;
-                simplex_lagrange_factor_sequence_impl<true, true>(
-                    order, l0, phi0_batch.data() + offset,
-                    dphi0_batch.data() + offset, d2phi0_batch.data() + offset);
-                simplex_lagrange_factor_sequence_impl<true, true>(
-                    order, l1, phi1_batch.data() + offset,
-                    dphi1_batch.data() + offset, d2phi1_batch.data() + offset);
-                simplex_lagrange_factor_sequence_impl<true, true>(
-                    order, l2, phi2_batch.data() + offset,
-                    dphi2_batch.data() + offset, d2phi2_batch.data() + offset);
-            }
-
-            for (std::size_t node = 0; node < num_nodes; ++node) {
-                const auto& e = simplex_exponents[node];
-                const std::size_t i0 = static_cast<std::size_t>(e[0]);
-                const std::size_t i1 = static_cast<std::size_t>(e[1]);
-                const std::size_t i2 = static_cast<std::size_t>(e[2]);
-                Real* SVMP_RESTRICT value_row = values_out + node * output_stride;
-                Real* SVMP_RESTRICT g = gradients_xy_out + node * 2u * output_stride;
-                Real* SVMP_RESTRICT H = hessians_xx_xy_yy_out + node * 3u * output_stride;
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    const std::size_t offset = q * sequence_size;
-                    const Real v0 = phi0_batch[offset + i0];
-                    const Real v1 = phi1_batch[offset + i1];
-                    const Real v2 = phi2_batch[offset + i2];
-                    const Real D0 = dphi0_batch[offset + i0];
-                    const Real D1 = dphi1_batch[offset + i1];
-                    const Real D2 = dphi2_batch[offset + i2];
-                    const Real dl0 = D0 * v1 * v2;
-                    const Real dl1 = v0 * D1 * v2;
-                    const Real dl2 = v0 * v1 * D2;
-                    const Real DD0 = d2phi0_batch[offset + i0];
-                    const Real DD1 = d2phi1_batch[offset + i1];
-                    const Real DD2 = d2phi2_batch[offset + i2];
-                    const Real H00 = DD0 * v1 * v2;
-                    const Real H11 = v0 * DD1 * v2;
-                    const Real H22 = v0 * v1 * DD2;
-                    const Real H01 = D0 * D1 * v2;
-                    const Real H02 = D0 * v1 * D2;
-                    const Real H12 = v0 * D1 * D2;
-
-                    value_row[q] = v0 * v1 * v2;
-                    g[0u * output_stride + q] = dl1 - dl0;
-                    g[1u * output_stride + q] = dl2 - dl0;
-                    H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
-                    H[1u * output_stride + q] = H00 - H01 - H02 + H12;
-                    H[2u * output_stride + q] = H00 - Real(2) * H02 + H22;
-                }
-            }
-            return;
-        }
-
-        std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
-        std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
-        std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
-        std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
-        std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
-        std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
-        std::array<Real, kFixedSimplexBatchEntries> d2phi0_batch;
-        std::array<Real, kFixedSimplexBatchEntries> d2phi1_batch;
-        std::array<Real, kFixedSimplexBatchEntries> d2phi2_batch;
-
-        for (std::size_t q = 0; q < num_qpts; ++q) {
-            const auto& xi = points[q];
-            const Real l1 = xi[0];
-            const Real l2 = xi[1];
-            const Real l0 = Real(1) - l1 - l2;
-            const std::size_t offset = q * sequence_size;
-            Real* d0_out = (need_gradients || need_hessians) ? dphi0_batch.data() + offset : nullptr;
-            Real* d1_out = (need_gradients || need_hessians) ? dphi1_batch.data() + offset : nullptr;
-            Real* d2_out = (need_gradients || need_hessians) ? dphi2_batch.data() + offset : nullptr;
-            Real* d20_out = need_hessians ? d2phi0_batch.data() + offset : nullptr;
-            Real* d21_out = need_hessians ? d2phi1_batch.data() + offset : nullptr;
-            Real* d22_out = need_hessians ? d2phi2_batch.data() + offset : nullptr;
-            simplex_lagrange_factor_sequence(order, l0, phi0_batch.data() + offset, d0_out, d20_out);
-            simplex_lagrange_factor_sequence(order, l1, phi1_batch.data() + offset, d1_out, d21_out);
-            simplex_lagrange_factor_sequence(order, l2, phi2_batch.data() + offset, d2_out, d22_out);
-        }
-
-        for (std::size_t node = 0; node < num_nodes; ++node) {
-            const auto& e = simplex_exponents[node];
-            const std::size_t i0 = static_cast<std::size_t>(e[0]);
-            const std::size_t i1 = static_cast<std::size_t>(e[1]);
-            const std::size_t i2 = static_cast<std::size_t>(e[2]);
-            Real* value_row = values_out ? values_out + node * output_stride : nullptr;
-            Real* g = gradients_xy_out ? gradients_xy_out + node * 2u * output_stride : nullptr;
-            Real* H = hessians_xx_xy_yy_out ? hessians_xx_xy_yy_out + node * 3u * output_stride : nullptr;
-
-            for (std::size_t q = 0; q < num_qpts; ++q) {
-                const std::size_t offset = q * sequence_size;
-                const Real v0 = phi0_batch[offset + i0];
-                const Real v1 = phi1_batch[offset + i1];
-                const Real v2 = phi2_batch[offset + i2];
-                if (value_row != nullptr) {
-                    value_row[q] = v0 * v1 * v2;
-                }
-                if (!need_gradients && !need_hessians) {
-                    continue;
-                }
-
-                const Real D0 = dphi0_batch[offset + i0];
-                const Real D1 = dphi1_batch[offset + i1];
-                const Real D2 = dphi2_batch[offset + i2];
-                const Real dl0 = D0 * v1 * v2;
-                const Real dl1 = v0 * D1 * v2;
-                const Real dl2 = v0 * v1 * D2;
-
-                if (gradients_xy_out != nullptr) {
-                    g[0u * output_stride + q] = dl1 - dl0;
-                    g[1u * output_stride + q] = dl2 - dl0;
-                }
-
-                if (hessians_xx_xy_yy_out != nullptr) {
-                    const Real DD0 = d2phi0_batch[offset + i0];
-                    const Real DD1 = d2phi1_batch[offset + i1];
-                    const Real DD2 = d2phi2_batch[offset + i2];
-                    const Real H00 = DD0 * v1 * v2;
-                    const Real H11 = v0 * DD1 * v2;
-                    const Real H22 = v0 * v1 * DD2;
-                    const Real H01 = D0 * D1 * v2;
-                    const Real H02 = D0 * v1 * D2;
-                    const Real H12 = v0 * D1 * D2;
-                    H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
-                    H[1u * output_stride + q] = H00 - H01 - H02 + H12;
-                    H[2u * output_stride + q] = H00 - Real(2) * H02 + H22;
-                }
-            }
-        }
-        return;
-    }
-
-    SimplexAxisScratch& s0 = simplex_axis_scratch_slot(0);
-    SimplexAxisScratch& s1 = simplex_axis_scratch_slot(1);
-    SimplexAxisScratch& s2 = simplex_axis_scratch_slot(2);
-    s0.reserveFor(sequence_size);
-    s1.reserveFor(sequence_size);
-    s2.reserveFor(sequence_size);
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-
-        Real* d0_out = (need_gradients || need_hessians) ? s0.dphi() : nullptr;
-        Real* d1_out = (need_gradients || need_hessians) ? s1.dphi() : nullptr;
-        Real* d2_out = (need_gradients || need_hessians) ? s2.dphi() : nullptr;
-        Real* d20_out = need_hessians ? s0.d2phi() : nullptr;
-        Real* d21_out = need_hessians ? s1.d2phi() : nullptr;
-        Real* d22_out = need_hessians ? s2.d2phi() : nullptr;
-        simplex_lagrange_factor_sequence(order, l0, s0.phi(), d0_out, d20_out);
-        simplex_lagrange_factor_sequence(order, l1, s1.phi(), d1_out, d21_out);
-        simplex_lagrange_factor_sequence(order, l2, s2.phi(), d2_out, d22_out);
-
-        const Real* phi0 = s0.phi();
-        const Real* phi1 = s1.phi();
-        const Real* phi2 = s2.phi();
-        const Real* dphi0 = s0.dphi();
-        const Real* dphi1 = s1.dphi();
-        const Real* dphi2 = s2.dphi();
-        const Real* d2phi0 = s0.d2phi();
-        const Real* d2phi1 = s1.d2phi();
-        const Real* d2phi2 = s2.d2phi();
-
-        for (std::size_t node = 0; node < num_nodes; ++node) {
-            const auto& e = simplex_exponents[node];
-            const std::size_t i0 = static_cast<std::size_t>(e[0]);
-            const std::size_t i1 = static_cast<std::size_t>(e[1]);
-            const std::size_t i2 = static_cast<std::size_t>(e[2]);
-            const Real v0 = phi0[i0];
-            const Real v1 = phi1[i1];
-            const Real v2 = phi2[i2];
-
-            if (values_out != nullptr) {
-                values_out[node * output_stride + q] = v0 * v1 * v2;
-            }
-            if (!need_gradients && !need_hessians) {
-                continue;
-            }
-
-            const Real D0 = dphi0[i0];
-            const Real D1 = dphi1[i1];
-            const Real D2 = dphi2[i2];
-            const Real dl0 = D0 * v1 * v2;
-            const Real dl1 = v0 * D1 * v2;
-            const Real dl2 = v0 * v1 * D2;
-
-            if (gradients_xy_out != nullptr) {
-                Real* g = gradients_xy_out + node * 2u * output_stride;
-                g[0u * output_stride + q] = dl1 - dl0;
-                g[1u * output_stride + q] = dl2 - dl0;
-            }
-
-            if (hessians_xx_xy_yy_out != nullptr) {
-                const Real DD0 = d2phi0[i0];
-                const Real DD1 = d2phi1[i1];
-                const Real DD2 = d2phi2[i2];
-                const Real H00 = DD0 * v1 * v2;
-                const Real H11 = v0 * DD1 * v2;
-                const Real H22 = v0 * v1 * DD2;
-                const Real H01 = D0 * D1 * v2;
-                const Real H02 = D0 * v1 * D2;
-                const Real H12 = v0 * D1 * D2;
-                Real* H = hessians_xx_xy_yy_out + node * 3u * output_stride;
-                H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
-                H[1u * output_stride + q] = H00 - H01 - H02 + H12;
-                H[2u * output_stride + q] = H00 - Real(2) * H02 + H22;
-            }
-        }
-    }
-}
-
-template <typename Sink>
-void evaluate_tetrahedron_simplex_basis_impl(const std::vector<std::array<int, 4>>& simplex_exponents,
-                                             int order,
-                                             const math::Vector<Real, 3>& xi,
-                                             const Sink& sink) {
-    const Real l1 = xi[0];
-    const Real l2 = xi[1];
-    const Real l3 = xi[2];
-    const Real l0 = Real(1) - l1 - l2 - l3;
-
-    const std::size_t n = static_cast<std::size_t>(order + 1);
-    SimplexAxisScratch& s0 = simplex_axis_scratch_slot(0);
-    SimplexAxisScratch& s1 = simplex_axis_scratch_slot(1);
-    SimplexAxisScratch& s2 = simplex_axis_scratch_slot(2);
-    SimplexAxisScratch& s3 = simplex_axis_scratch_slot(3);
-    s0.reserveFor(n);
-    s1.reserveFor(n);
-    s2.reserveFor(n);
-    s3.reserveFor(n);
-
-    const std::size_t num_nodes = simplex_exponents.size();
-    sink.prepare(num_nodes);
-    const bool need_values = sink.wants_values();
-    const bool need_gradients = sink.wants_gradients();
-    const bool need_hessians = sink.wants_hessians();
-    Real* d0_out = (need_gradients || need_hessians) ? s0.dphi() : nullptr;
-    Real* d1_out = (need_gradients || need_hessians) ? s1.dphi() : nullptr;
-    Real* d2_out = (need_gradients || need_hessians) ? s2.dphi() : nullptr;
-    Real* d3_out = (need_gradients || need_hessians) ? s3.dphi() : nullptr;
-    Real* d20_out = need_hessians ? s0.d2phi() : nullptr;
-    Real* d21_out = need_hessians ? s1.d2phi() : nullptr;
-    Real* d22_out = need_hessians ? s2.d2phi() : nullptr;
-    Real* d23_out = need_hessians ? s3.d2phi() : nullptr;
-
-    simplex_lagrange_factor_sequence(order, l0, s0.phi(), d0_out, d20_out);
-    simplex_lagrange_factor_sequence(order, l1, s1.phi(), d1_out, d21_out);
-    simplex_lagrange_factor_sequence(order, l2, s2.phi(), d2_out, d22_out);
-    simplex_lagrange_factor_sequence(order, l3, s3.phi(), d3_out, d23_out);
-    const Real* phi0 = s0.phi();
-    const Real* phi1 = s1.phi();
-    const Real* phi2 = s2.phi();
-    const Real* phi3 = s3.phi();
-    const Real* dphi0 = s0.dphi();
-    const Real* dphi1 = s1.dphi();
-    const Real* dphi2 = s2.dphi();
-    const Real* dphi3 = s3.dphi();
-    const Real* d2phi0 = s0.d2phi();
-    const Real* d2phi1 = s1.d2phi();
-    const Real* d2phi2 = s2.d2phi();
-    const Real* d2phi3 = s3.d2phi();
-
-    for (std::size_t n_idx = 0; n_idx < num_nodes; ++n_idx) {
-        const auto& e = simplex_exponents[n_idx];
-        const std::size_t i0 = static_cast<std::size_t>(e[0]);
-        const std::size_t i1 = static_cast<std::size_t>(e[1]);
-        const std::size_t i2 = static_cast<std::size_t>(e[2]);
-        const std::size_t i3 = static_cast<std::size_t>(e[3]);
-
-        const Real v0 = phi0[i0];
-        const Real v1 = phi1[i1];
-        const Real v2 = phi2[i2];
-        const Real v3 = phi3[i3];
-        if (need_values) {
-            sink.write_value(n_idx, v0 * v1 * v2 * v3);
-        }
-        if (!need_gradients && !need_hessians) {
-            continue;
-        }
-
-        const Real D0 = dphi0[i0];
-        const Real D1 = dphi1[i1];
-        const Real D2 = dphi2[i2];
-        const Real D3 = dphi3[i3];
-
-        if (need_gradients) {
-            const Real dl0 = D0 * v1 * v2 * v3;
-            const Real dl1 = v0 * D1 * v2 * v3;
-            const Real dl2 = v0 * v1 * D2 * v3;
-            const Real dl3 = v0 * v1 * v2 * D3;
-            sink.write_gradient(n_idx, dl1 - dl0, dl2 - dl0, dl3 - dl0);
-        }
-
-        if (need_hessians) {
-            const Real DD0 = d2phi0[i0];
-            const Real DD1 = d2phi1[i1];
-            const Real DD2 = d2phi2[i2];
-            const Real DD3 = d2phi3[i3];
-
-            const Real H00 = DD0 * v1 * v2 * v3;
-            const Real H11 = v0 * DD1 * v2 * v3;
-            const Real H22 = v0 * v1 * DD2 * v3;
-            const Real H33 = v0 * v1 * v2 * DD3;
-
-            const Real H01 = D0 * D1 * v2 * v3;
-            const Real H02 = D0 * v1 * D2 * v3;
-            const Real H03 = D0 * v1 * v2 * D3;
-            const Real H12 = v0 * D1 * D2 * v3;
-            const Real H13 = v0 * D1 * v2 * D3;
-            const Real H23 = v0 * v1 * D2 * D3;
-
-            sink.write_hessian(n_idx,
-                               H00 - Real(2) * H01 + H11,
-                               H00 - Real(2) * H02 + H22,
-                               H00 - Real(2) * H03 + H33,
-                               H00 - H01 - H02 + H12,
-                               H00 - H01 - H03 + H13,
-                               H00 - H02 - H03 + H23);
-        }
-    }
-}
-
-void evaluate_tetrahedron_simplex_basis(const std::vector<std::array<int, 4>>& simplex_exponents,
-                                        int order,
-                                        const math::Vector<Real, 3>& xi,
-                                        std::vector<Real>* values,
-                                        std::vector<Gradient>* gradients,
-                                        std::vector<Hessian>* hessians) {
-    const SimplexVectorSink sink{values, gradients, hessians};
-    evaluate_tetrahedron_simplex_basis_impl(simplex_exponents, order, xi, sink);
-}
-
-void evaluate_tetrahedron_simplex_basis_to(const std::vector<std::array<int, 4>>& simplex_exponents,
-                                           int order,
-                                           const math::Vector<Real, 3>& xi,
-                                           Real* SVMP_RESTRICT values_out,
-                                           Real* SVMP_RESTRICT gradients_out,
-                                           Real* SVMP_RESTRICT hessians_out) {
-    const SimplexRawSink sink{values_out, gradients_out, hessians_out};
-    evaluate_tetrahedron_simplex_basis_impl(simplex_exponents, order, xi, sink);
-}
-
-void evaluate_tetrahedron_simplex_basis_strided(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    const std::size_t num_nodes = simplex_exponents.size();
-    if (points.empty() || num_nodes == 0u) {
-        return;
-    }
-
-    const std::size_t sequence_size = static_cast<std::size_t>(order + 1);
-    const std::size_t num_qpts = points.size();
-    const bool need_gradients = gradients_out != nullptr;
-    const bool need_hessians = hessians_out != nullptr;
-    if (num_qpts == 4u &&
-        values_out != nullptr &&
-        !need_gradients &&
-        !need_hessians &&
-        try_evaluate_tetrahedron_simplex_values_q4(
-            simplex_exponents, order, points, output_stride, values_out)) {
-        return;
-    }
-    if (num_qpts == 4u &&
-        values_out == nullptr &&
-        need_gradients &&
-        !need_hessians) {
-        switch (order) {
-        case 3:
-            evaluate_tetrahedron_simplex_gradients_q4<3>(
-                simplex_exponents, points, output_stride, gradients_out);
-            return;
-        case 4:
-            evaluate_tetrahedron_simplex_gradients_q4<4>(
-                simplex_exponents, points, output_stride, gradients_out);
-            return;
-        case 5:
-            evaluate_tetrahedron_simplex_gradients_q4<5>(
-                simplex_exponents, points, output_stride, gradients_out);
-            return;
-        case 6:
-            evaluate_tetrahedron_simplex_gradients_q4<6>(
-                simplex_exponents, points, output_stride, gradients_out);
-            return;
-        case 7:
-            evaluate_tetrahedron_simplex_gradients_q4<7>(
-                simplex_exponents, points, output_stride, gradients_out);
-            return;
-        case 8:
-            evaluate_tetrahedron_simplex_gradients_q4<8>(
-                simplex_exponents, points, output_stride, gradients_out);
-            return;
-        default:
-            break;
-        }
-    }
-    if (num_qpts == 4u &&
-        need_hessians &&
-        try_evaluate_tetrahedron_simplex_hessian_outputs_q4(
-            simplex_exponents, order, points, output_stride,
-            values_out, gradients_out, hessians_out)) {
-        return;
-    }
-    const std::size_t batch_entries = sequence_size * num_qpts;
-    if (batch_entries <= kFixedSimplexBatchEntries) {
-        if (values_out != nullptr && gradients_out == nullptr && hessians_out == nullptr) {
-            std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi3_batch;
-
-            for (std::size_t q = 0; q < num_qpts; ++q) {
-                const auto& xi = points[q];
-                const Real l1 = xi[0];
-                const Real l2 = xi[1];
-                const Real l3 = xi[2];
-                const Real l0 = Real(1) - l1 - l2 - l3;
-                const std::size_t offset = q * sequence_size;
-                simplex_lagrange_factor_sequence(
-                    order, l0, phi0_batch.data() + offset, nullptr, nullptr);
-                simplex_lagrange_factor_sequence(
-                    order, l1, phi1_batch.data() + offset, nullptr, nullptr);
-                simplex_lagrange_factor_sequence(
-                    order, l2, phi2_batch.data() + offset, nullptr, nullptr);
-                simplex_lagrange_factor_sequence(
-                    order, l3, phi3_batch.data() + offset, nullptr, nullptr);
-            }
-
-            for (std::size_t node = 0; node < num_nodes; ++node) {
-                const auto& e = simplex_exponents[node];
-                const std::size_t i0 = static_cast<std::size_t>(e[0]);
-                const std::size_t i1 = static_cast<std::size_t>(e[1]);
-                const std::size_t i2 = static_cast<std::size_t>(e[2]);
-                const std::size_t i3 = static_cast<std::size_t>(e[3]);
-                Real* value_row = values_out + node * output_stride;
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    const std::size_t offset = q * sequence_size;
-                    value_row[q] =
-                        phi0_batch[offset + i0] *
-                        phi1_batch[offset + i1] *
-                        phi2_batch[offset + i2] *
-                        phi3_batch[offset + i3];
-                }
-            }
-            return;
-        }
-
-        if (values_out == nullptr && gradients_out != nullptr && hessians_out == nullptr) {
-            std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi3_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi3_batch;
-
-            for (std::size_t q = 0; q < num_qpts; ++q) {
-                const auto& xi = points[q];
-                const Real l1 = xi[0];
-                const Real l2 = xi[1];
-                const Real l3 = xi[2];
-                const Real l0 = Real(1) - l1 - l2 - l3;
-                const std::size_t offset = q * sequence_size;
-                simplex_lagrange_factor_sequence(
-                    order, l0, phi0_batch.data() + offset, dphi0_batch.data() + offset, nullptr);
-                simplex_lagrange_factor_sequence(
-                    order, l1, phi1_batch.data() + offset, dphi1_batch.data() + offset, nullptr);
-                simplex_lagrange_factor_sequence(
-                    order, l2, phi2_batch.data() + offset, dphi2_batch.data() + offset, nullptr);
-                simplex_lagrange_factor_sequence(
-                    order, l3, phi3_batch.data() + offset, dphi3_batch.data() + offset, nullptr);
-            }
-
-            for (std::size_t node = 0; node < num_nodes; ++node) {
-                const auto& e = simplex_exponents[node];
-                const std::size_t i0 = static_cast<std::size_t>(e[0]);
-                const std::size_t i1 = static_cast<std::size_t>(e[1]);
-                const std::size_t i2 = static_cast<std::size_t>(e[2]);
-                const std::size_t i3 = static_cast<std::size_t>(e[3]);
-                Real* g = gradients_out + node * 3u * output_stride;
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    const std::size_t offset = q * sequence_size;
-                    const Real v0 = phi0_batch[offset + i0];
-                    const Real v1 = phi1_batch[offset + i1];
-                    const Real v2 = phi2_batch[offset + i2];
-                    const Real v3 = phi3_batch[offset + i3];
-                    const Real D0 = dphi0_batch[offset + i0];
-                    const Real D1 = dphi1_batch[offset + i1];
-                    const Real D2 = dphi2_batch[offset + i2];
-                    const Real D3 = dphi3_batch[offset + i3];
-                    const Real v23 = v2 * v3;
-                    const Real dl0 = D0 * v1 * v23;
-                    g[0u * output_stride + q] = v0 * D1 * v23 - dl0;
-                    g[1u * output_stride + q] = v0 * v1 * D2 * v3 - dl0;
-                    g[2u * output_stride + q] = v0 * v1 * v2 * D3 - dl0;
-                }
-            }
-            return;
-        }
-
-        std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
-        std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
-        std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
-        std::array<Real, kFixedSimplexBatchEntries> phi3_batch;
-        std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
-        std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
-        std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
-        std::array<Real, kFixedSimplexBatchEntries> dphi3_batch;
-        std::array<Real, kFixedSimplexBatchEntries> d2phi0_batch;
-        std::array<Real, kFixedSimplexBatchEntries> d2phi1_batch;
-        std::array<Real, kFixedSimplexBatchEntries> d2phi2_batch;
-        std::array<Real, kFixedSimplexBatchEntries> d2phi3_batch;
-
-        for (std::size_t q = 0; q < num_qpts; ++q) {
-            const auto& xi = points[q];
-            const Real l1 = xi[0];
-            const Real l2 = xi[1];
-            const Real l3 = xi[2];
-            const Real l0 = Real(1) - l1 - l2 - l3;
-            const std::size_t offset = q * sequence_size;
-            Real* d0_out = (need_gradients || need_hessians) ? dphi0_batch.data() + offset : nullptr;
-            Real* d1_out = (need_gradients || need_hessians) ? dphi1_batch.data() + offset : nullptr;
-            Real* d2_out = (need_gradients || need_hessians) ? dphi2_batch.data() + offset : nullptr;
-            Real* d3_out = (need_gradients || need_hessians) ? dphi3_batch.data() + offset : nullptr;
-            Real* d20_out = need_hessians ? d2phi0_batch.data() + offset : nullptr;
-            Real* d21_out = need_hessians ? d2phi1_batch.data() + offset : nullptr;
-            Real* d22_out = need_hessians ? d2phi2_batch.data() + offset : nullptr;
-            Real* d23_out = need_hessians ? d2phi3_batch.data() + offset : nullptr;
-            simplex_lagrange_factor_sequence(order, l0, phi0_batch.data() + offset, d0_out, d20_out);
-            simplex_lagrange_factor_sequence(order, l1, phi1_batch.data() + offset, d1_out, d21_out);
-            simplex_lagrange_factor_sequence(order, l2, phi2_batch.data() + offset, d2_out, d22_out);
-            simplex_lagrange_factor_sequence(order, l3, phi3_batch.data() + offset, d3_out, d23_out);
-        }
-
-        for (std::size_t node = 0; node < num_nodes; ++node) {
-            const auto& e = simplex_exponents[node];
-            const std::size_t i0 = static_cast<std::size_t>(e[0]);
-            const std::size_t i1 = static_cast<std::size_t>(e[1]);
-            const std::size_t i2 = static_cast<std::size_t>(e[2]);
-            const std::size_t i3 = static_cast<std::size_t>(e[3]);
-            Real* value_row = values_out ? values_out + node * output_stride : nullptr;
-            Real* g = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
-            Real* H = hessians_out ? hessians_out + node * 9u * output_stride : nullptr;
-
-            for (std::size_t q = 0; q < num_qpts; ++q) {
-                const std::size_t offset = q * sequence_size;
-                const Real v0 = phi0_batch[offset + i0];
-                const Real v1 = phi1_batch[offset + i1];
-                const Real v2 = phi2_batch[offset + i2];
-                const Real v3 = phi3_batch[offset + i3];
-                if (value_row != nullptr) {
-                    value_row[q] = v0 * v1 * v2 * v3;
-                }
-                if (!need_gradients && !need_hessians) {
-                    continue;
-                }
-
-                const Real D0 = dphi0_batch[offset + i0];
-                const Real D1 = dphi1_batch[offset + i1];
-                const Real D2 = dphi2_batch[offset + i2];
-                const Real D3 = dphi3_batch[offset + i3];
-
-                if (gradients_out != nullptr) {
-                    const Real dl0 = D0 * v1 * v2 * v3;
-                    const Real dl1 = v0 * D1 * v2 * v3;
-                    const Real dl2 = v0 * v1 * D2 * v3;
-                    const Real dl3 = v0 * v1 * v2 * D3;
-                    g[0u * output_stride + q] = dl1 - dl0;
-                    g[1u * output_stride + q] = dl2 - dl0;
-                    g[2u * output_stride + q] = dl3 - dl0;
-                }
-
-                if (hessians_out != nullptr) {
-                    const Real DD0 = d2phi0_batch[offset + i0];
-                    const Real DD1 = d2phi1_batch[offset + i1];
-                    const Real DD2 = d2phi2_batch[offset + i2];
-                    const Real DD3 = d2phi3_batch[offset + i3];
-                    const Real H00 = DD0 * v1 * v2 * v3;
-                    const Real H11 = v0 * DD1 * v2 * v3;
-                    const Real H22 = v0 * v1 * DD2 * v3;
-                    const Real H33 = v0 * v1 * v2 * DD3;
-                    const Real H01 = D0 * D1 * v2 * v3;
-                    const Real H02 = D0 * v1 * D2 * v3;
-                    const Real H03 = D0 * v1 * v2 * D3;
-                    const Real H12 = v0 * D1 * D2 * v3;
-                    const Real H13 = v0 * D1 * v2 * D3;
-                    const Real H23 = v0 * v1 * D2 * D3;
-                    const Real h01 = H00 - H01 - H02 + H12;
-                    const Real h02 = H00 - H01 - H03 + H13;
-                    const Real h12 = H00 - H02 - H03 + H23;
-                    H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
-                    H[1u * output_stride + q] = h01;
-                    H[2u * output_stride + q] = h02;
-                    H[3u * output_stride + q] = h01;
-                    H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
-                    H[5u * output_stride + q] = h12;
-                    H[6u * output_stride + q] = h02;
-                    H[7u * output_stride + q] = h12;
-                    H[8u * output_stride + q] = H00 - Real(2) * H03 + H33;
-                }
-            }
-        }
-        return;
-    }
-
-    SimplexAxisScratch& s0 = simplex_axis_scratch_slot(0);
-    SimplexAxisScratch& s1 = simplex_axis_scratch_slot(1);
-    SimplexAxisScratch& s2 = simplex_axis_scratch_slot(2);
-    SimplexAxisScratch& s3 = simplex_axis_scratch_slot(3);
-    s0.reserveFor(sequence_size);
-    s1.reserveFor(sequence_size);
-    s2.reserveFor(sequence_size);
-    s3.reserveFor(sequence_size);
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l3 = xi[2];
-        const Real l0 = Real(1) - l1 - l2 - l3;
-
-        Real* d0_out = (need_gradients || need_hessians) ? s0.dphi() : nullptr;
-        Real* d1_out = (need_gradients || need_hessians) ? s1.dphi() : nullptr;
-        Real* d2_out = (need_gradients || need_hessians) ? s2.dphi() : nullptr;
-        Real* d3_out = (need_gradients || need_hessians) ? s3.dphi() : nullptr;
-        Real* d20_out = need_hessians ? s0.d2phi() : nullptr;
-        Real* d21_out = need_hessians ? s1.d2phi() : nullptr;
-        Real* d22_out = need_hessians ? s2.d2phi() : nullptr;
-        Real* d23_out = need_hessians ? s3.d2phi() : nullptr;
-
-        simplex_lagrange_factor_sequence(order, l0, s0.phi(), d0_out, d20_out);
-        simplex_lagrange_factor_sequence(order, l1, s1.phi(), d1_out, d21_out);
-        simplex_lagrange_factor_sequence(order, l2, s2.phi(), d2_out, d22_out);
-        simplex_lagrange_factor_sequence(order, l3, s3.phi(), d3_out, d23_out);
-        const Real* phi0 = s0.phi();
-        const Real* phi1 = s1.phi();
-        const Real* phi2 = s2.phi();
-        const Real* phi3 = s3.phi();
-        const Real* dphi0 = s0.dphi();
-        const Real* dphi1 = s1.dphi();
-        const Real* dphi2 = s2.dphi();
-        const Real* dphi3 = s3.dphi();
-        const Real* d2phi0 = s0.d2phi();
-        const Real* d2phi1 = s1.d2phi();
-        const Real* d2phi2 = s2.d2phi();
-        const Real* d2phi3 = s3.d2phi();
-
-        for (std::size_t node = 0; node < num_nodes; ++node) {
-            const auto& e = simplex_exponents[node];
-            const std::size_t i0 = static_cast<std::size_t>(e[0]);
-            const std::size_t i1 = static_cast<std::size_t>(e[1]);
-            const std::size_t i2 = static_cast<std::size_t>(e[2]);
-            const std::size_t i3 = static_cast<std::size_t>(e[3]);
-
-            const Real v0 = phi0[i0];
-            const Real v1 = phi1[i1];
-            const Real v2 = phi2[i2];
-            const Real v3 = phi3[i3];
-            if (values_out != nullptr) {
-                values_out[node * output_stride + q] = v0 * v1 * v2 * v3;
-            }
-            if (!need_gradients && !need_hessians) {
-                continue;
-            }
-
-            const Real D0 = dphi0[i0];
-            const Real D1 = dphi1[i1];
-            const Real D2 = dphi2[i2];
-            const Real D3 = dphi3[i3];
-
-            if (gradients_out != nullptr) {
-                const Real dl0 = D0 * v1 * v2 * v3;
-                const Real dl1 = v0 * D1 * v2 * v3;
-                const Real dl2 = v0 * v1 * D2 * v3;
-                const Real dl3 = v0 * v1 * v2 * D3;
-                Real* g = gradients_out + node * 3u * output_stride;
-                g[0u * output_stride + q] = dl1 - dl0;
-                g[1u * output_stride + q] = dl2 - dl0;
-                g[2u * output_stride + q] = dl3 - dl0;
-            }
-
-            if (hessians_out != nullptr) {
-                const Real DD0 = d2phi0[i0];
-                const Real DD1 = d2phi1[i1];
-                const Real DD2 = d2phi2[i2];
-                const Real DD3 = d2phi3[i3];
-
-                const Real H00 = DD0 * v1 * v2 * v3;
-                const Real H11 = v0 * DD1 * v2 * v3;
-                const Real H22 = v0 * v1 * DD2 * v3;
-                const Real H33 = v0 * v1 * v2 * DD3;
-
-                const Real H01 = D0 * D1 * v2 * v3;
-                const Real H02 = D0 * v1 * D2 * v3;
-                const Real H03 = D0 * v1 * v2 * D3;
-                const Real H12 = v0 * D1 * D2 * v3;
-                const Real H13 = v0 * D1 * v2 * D3;
-                const Real H23 = v0 * v1 * D2 * D3;
-
-                const Real h01 = H00 - H01 - H02 + H12;
-                const Real h02 = H00 - H01 - H03 + H13;
-                const Real h12 = H00 - H02 - H03 + H23;
-
-                Real* H = hessians_out + node * 9u * output_stride;
-                H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
-                H[1u * output_stride + q] = h01;
-                H[2u * output_stride + q] = h02;
-                H[3u * output_stride + q] = h01;
-                H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
-                H[5u * output_stride + q] = h12;
-                H[6u * output_stride + q] = h02;
-                H[7u * output_stride + q] = h12;
-                H[8u * output_stride + q] = H00 - Real(2) * H03 + H33;
-            }
-        }
-    }
-}
-
-} // namespace detail
-} // namespace basis
-} // namespace FE
-} // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasisSimplex.h b/Code/Source/solver/FE/Basis/LagrangeBasisSimplex.h
deleted file mode 100644
index 19cf725bd..000000000
--- a/Code/Source/solver/FE/Basis/LagrangeBasisSimplex.h
+++ /dev/null
@@ -1,78 +0,0 @@
-#ifndef SVMP_FE_BASIS_LAGRANGEBASISSIMPLEX_H
-#define SVMP_FE_BASIS_LAGRANGEBASISSIMPLEX_H
-
-// Private declarations for simplex Lagrange evaluation helpers implemented in
-// LagrangeBasisSimplex.cpp.
-
-#include "BasisFunction.h"
-
-#include <array>
-#include <cstddef>
-#include <vector>
-
-namespace svmp {
-namespace FE {
-namespace basis {
-namespace detail {
-
-void evaluate_triangle_simplex_basis(const std::vector<std::array<int, 4>>& simplex_exponents,
-                                     int order,
-                                     const math::Vector<Real, 3>& xi,
-                                     std::vector<Real>* values,
-                                     std::vector<Gradient>* gradients,
-                                     std::vector<Hessian>* hessians);
-
-void evaluate_triangle_simplex_basis_to(const std::vector<std::array<int, 4>>& simplex_exponents,
-                                        int order,
-                                        const math::Vector<Real, 3>& xi,
-                                        Real* SVMP_RESTRICT values_out,
-                                        Real* SVMP_RESTRICT gradients_out,
-                                        Real* SVMP_RESTRICT hessians_out);
-
-void evaluate_triangle_simplex_basis_strided(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out);
-
-void evaluate_triangle_simplex_basis_wedge_components_strided(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_xy_out,
-    Real* SVMP_RESTRICT hessians_xx_xy_yy_out);
-
-void evaluate_tetrahedron_simplex_basis(const std::vector<std::array<int, 4>>& simplex_exponents,
-                                        int order,
-                                        const math::Vector<Real, 3>& xi,
-                                        std::vector<Real>* values,
-                                        std::vector<Gradient>* gradients,
-                                        std::vector<Hessian>* hessians);
-
-void evaluate_tetrahedron_simplex_basis_to(const std::vector<std::array<int, 4>>& simplex_exponents,
-                                           int order,
-                                           const math::Vector<Real, 3>& xi,
-                                           Real* SVMP_RESTRICT values_out,
-                                           Real* SVMP_RESTRICT gradients_out,
-                                           Real* SVMP_RESTRICT hessians_out);
-
-void evaluate_tetrahedron_simplex_basis_strided(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out);
-
-} // namespace detail
-} // namespace basis
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_BASIS_LAGRANGEBASISSIMPLEX_H
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasisUtility.h b/Code/Source/solver/FE/Basis/LagrangeBasisUtility.h
deleted file mode 100644
index e622de1c6..000000000
--- a/Code/Source/solver/FE/Basis/LagrangeBasisUtility.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef SVMP_FE_BASIS_LAGRANGEBASISUTILITY_H
-#define SVMP_FE_BASIS_LAGRANGEBASISUTILITY_H
-
-// Private helper for LagrangeBasis internals.
-// This header is only intended to be included after the FE basis scalar types
-// are already available.
-
-namespace svmp {
-namespace FE {
-namespace basis {
-namespace detail {
-
-inline constexpr Real equispaced_pm_one_coord(int i, int order) {
-    if (order <= 0) {
-        return Real(0);
-    }
-    return Real(-1) + Real(2) * static_cast<Real>(i) / static_cast<Real>(order);
-}
-
-} // namespace detail
-} // namespace basis
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_BASIS_LAGRANGEBASISUTILITY_H
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
index 20f743916..ae3ea8ed3 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
@@ -6,8 +6,8 @@
  */
 
 #include "NodeOrderingConventions.h"
-#include "Basis/BasisExceptions.h"
-#include "Basis/BasisTraits.h"
+#include "BasisExceptions.h"
+#include "BasisTraits.h"
 
 #include <array>
 
@@ -18,160 +18,7 @@ namespace basis {
 namespace {
 
 using Point = math::Vector<Real, 3>;
-using RawPoint = std::array<Real, 3>;
 
-template<std::size_t N>
-using NodeTable = std::array<RawPoint, N>;
-
-struct NodeTableView {
-    const RawPoint* data{nullptr};
-    std::size_t size{0};
-};
-
-inline constexpr NodeTable<2> kLine2Nodes = {{
-    {Real(-1), Real(0), Real(0)},
-    {Real(1), Real(0), Real(0)},
-}};
-
-inline constexpr NodeTable<3> kLine3Nodes = {{
-    {Real(-1), Real(0), Real(0)},
-    {Real(1), Real(0), Real(0)},
-    {Real(0), Real(0), Real(0)},
-}};
-
-inline constexpr NodeTable<3> kTriangle3Nodes = {{
-    {Real(0), Real(0), Real(0)},
-    {Real(1), Real(0), Real(0)},
-    {Real(0), Real(1), Real(0)},
-}};
-
-inline constexpr NodeTable<6> kTriangle6Nodes = {{
-    {Real(0), Real(0), Real(0)},
-    {Real(1), Real(0), Real(0)},
-    {Real(0), Real(1), Real(0)},
-    {Real(0.5), Real(0), Real(0)},
-    {Real(0.5), Real(0.5), Real(0)},
-    {Real(0), Real(0.5), Real(0)},
-}};
-
-inline constexpr NodeTable<4> kQuad4Nodes = {{
-    {Real(-1), Real(-1), Real(0)},
-    {Real(1), Real(-1), Real(0)},
-    {Real(1), Real(1), Real(0)},
-    {Real(-1), Real(1), Real(0)},
-}};
-
-inline constexpr NodeTable<9> kQuad9Nodes = {{
-    {Real(-1), Real(-1), Real(0)},
-    {Real(1), Real(-1), Real(0)},
-    {Real(1), Real(1), Real(0)},
-    {Real(-1), Real(1), Real(0)},
-    {Real(0), Real(-1), Real(0)},
-    {Real(1), Real(0), Real(0)},
-    {Real(0), Real(1), Real(0)},
-    {Real(-1), Real(0), Real(0)},
-    {Real(0), Real(0), Real(0)},
-}};
-
-inline constexpr NodeTable<8> kQuad8Nodes = {{
-    {Real(-1), Real(-1), Real(0)},
-    {Real(1), Real(-1), Real(0)},
-    {Real(1), Real(1), Real(0)},
-    {Real(-1), Real(1), Real(0)},
-    {Real(0), Real(-1), Real(0)},
-    {Real(1), Real(0), Real(0)},
-    {Real(0), Real(1), Real(0)},
-    {Real(-1), Real(0), Real(0)},
-}};
-
-inline constexpr NodeTable<4> kTetra4Nodes = {{
-    {Real(0), Real(0), Real(0)},
-    {Real(1), Real(0), Real(0)},
-    {Real(0), Real(1), Real(0)},
-    {Real(0), Real(0), Real(1)},
-}};
-
-inline constexpr NodeTable<10> kTetra10Nodes = {{
-    {Real(0), Real(0), Real(0)},
-    {Real(1), Real(0), Real(0)},
-    {Real(0), Real(1), Real(0)},
-    {Real(0), Real(0), Real(1)},
-    {Real(0.5), Real(0), Real(0)},
-    {Real(0.5), Real(0.5), Real(0)},
-    {Real(0), Real(0.5), Real(0)},
-    {Real(0), Real(0), Real(0.5)},
-    {Real(0.5), Real(0), Real(0.5)},
-    {Real(0), Real(0.5), Real(0.5)},
-}};
-
-inline constexpr NodeTable<8> kHex8Nodes = {{
-    {Real(-1), Real(-1), Real(-1)},
-    {Real(1), Real(-1), Real(-1)},
-    {Real(1), Real(1), Real(-1)},
-    {Real(-1), Real(1), Real(-1)},
-    {Real(-1), Real(-1), Real(1)},
-    {Real(1), Real(-1), Real(1)},
-    {Real(1), Real(1), Real(1)},
-    {Real(-1), Real(1), Real(1)},
-}};
-
-inline constexpr NodeTable<27> kHex27Nodes = {{
-    {Real(-1), Real(-1), Real(-1)},
-    {Real(1), Real(-1), Real(-1)},
-    {Real(1), Real(1), Real(-1)},
-    {Real(-1), Real(1), Real(-1)},
-    {Real(-1), Real(-1), Real(1)},
-    {Real(1), Real(-1), Real(1)},
-    {Real(1), Real(1), Real(1)},
-    {Real(-1), Real(1), Real(1)},
-    {Real(0), Real(-1), Real(-1)},
-    {Real(1), Real(0), Real(-1)},
-    {Real(0), Real(1), Real(-1)},
-    {Real(-1), Real(0), Real(-1)},
-    {Real(0), Real(-1), Real(1)},
-    {Real(1), Real(0), Real(1)},
-    {Real(0), Real(1), Real(1)},
-    {Real(-1), Real(0), Real(1)},
-    {Real(-1), Real(-1), Real(0)},
-    {Real(1), Real(-1), Real(0)},
-    {Real(1), Real(1), Real(0)},
-    {Real(-1), Real(1), Real(0)},
-    {Real(0), Real(0), Real(-1)},
-    {Real(0), Real(0), Real(1)},
-    {Real(0), Real(-1), Real(0)},
-    {Real(1), Real(0), Real(0)},
-    {Real(0), Real(1), Real(0)},
-    {Real(-1), Real(0), Real(0)},
-    {Real(0), Real(0), Real(0)},
-}};
-
-inline constexpr NodeTable<20> kHex20Nodes = {{
-    {Real(-1), Real(-1), Real(-1)},
-    {Real(1), Real(-1), Real(-1)},
-    {Real(1), Real(1), Real(-1)},
-    {Real(-1), Real(1), Real(-1)},
-    {Real(-1), Real(-1), Real(1)},
-    {Real(1), Real(-1), Real(1)},
-    {Real(1), Real(1), Real(1)},
-    {Real(-1), Real(1), Real(1)},
-    {Real(0), Real(-1), Real(-1)},
-    {Real(1), Real(0), Real(-1)},
-    {Real(0), Real(1), Real(-1)},
-    {Real(-1), Real(0), Real(-1)},
-    {Real(0), Real(-1), Real(1)},
-    {Real(1), Real(0), Real(1)},
-    {Real(0), Real(1), Real(1)},
-    {Real(-1), Real(0), Real(1)},
-    {Real(-1), Real(-1), Real(0)},
-    {Real(1), Real(-1), Real(0)},
-    {Real(1), Real(1), Real(0)},
-    {Real(-1), Real(1), Real(0)},
-}};
-
-// Mesh uses conventional Hex20 ordering: corners first, then edge midpoints in
-// {bottom, top, vertical} groups. The quadratic Hex20 serendipity polynomial
-// table uses an axis-grouped edge order. This maps public mesh/reference index
-// to the internal polynomial-table index.
 constexpr std::array<std::size_t, 20> kHex20MeshToBasisOrder = {
     0, 1, 2, 3, 4, 5, 6, 7,
     8, 13, 10, 12,
@@ -179,157 +26,6 @@ constexpr std::array<std::size_t, 20> kHex20MeshToBasisOrder = {
     16, 17, 19, 18
 };
 
-inline constexpr NodeTable<6> kWedge6Nodes = {{
-    {Real(0), Real(0), Real(-1)},
-    {Real(1), Real(0), Real(-1)},
-    {Real(0), Real(1), Real(-1)},
-    {Real(0), Real(0), Real(1)},
-    {Real(1), Real(0), Real(1)},
-    {Real(0), Real(1), Real(1)},
-}};
-
-inline constexpr NodeTable<18> kWedge18Nodes = {{
-    {Real(0), Real(0), Real(-1)},
-    {Real(1), Real(0), Real(-1)},
-    {Real(0), Real(1), Real(-1)},
-    {Real(0), Real(0), Real(1)},
-    {Real(1), Real(0), Real(1)},
-    {Real(0), Real(1), Real(1)},
-    {Real(0.5), Real(0), Real(-1)},
-    {Real(0.5), Real(0.5), Real(-1)},
-    {Real(0), Real(0.5), Real(-1)},
-    {Real(0.5), Real(0), Real(1)},
-    {Real(0.5), Real(0.5), Real(1)},
-    {Real(0), Real(0.5), Real(1)},
-    {Real(0), Real(0), Real(0)},
-    {Real(1), Real(0), Real(0)},
-    {Real(0), Real(1), Real(0)},
-    {Real(0.5), Real(0), Real(0)},
-    {Real(0.5), Real(0.5), Real(0)},
-    {Real(0), Real(0.5), Real(0)},
-}};
-
-inline constexpr NodeTable<15> kWedge15Nodes = {{
-    {Real(0), Real(0), Real(-1)},
-    {Real(1), Real(0), Real(-1)},
-    {Real(0), Real(1), Real(-1)},
-    {Real(0), Real(0), Real(1)},
-    {Real(1), Real(0), Real(1)},
-    {Real(0), Real(1), Real(1)},
-    {Real(0.5), Real(0), Real(-1)},
-    {Real(0.5), Real(0.5), Real(-1)},
-    {Real(0), Real(0.5), Real(-1)},
-    {Real(0.5), Real(0), Real(1)},
-    {Real(0.5), Real(0.5), Real(1)},
-    {Real(0), Real(0.5), Real(1)},
-    {Real(0), Real(0), Real(0)},
-    {Real(1), Real(0), Real(0)},
-    {Real(0), Real(1), Real(0)},
-}};
-
-inline constexpr NodeTable<5> kPyramid5Nodes = {{
-    {Real(-1), Real(-1), Real(0)},
-    {Real(1), Real(-1), Real(0)},
-    {Real(1), Real(1), Real(0)},
-    {Real(-1), Real(1), Real(0)},
-    {Real(0), Real(0), Real(1)},
-}};
-
-inline constexpr NodeTable<14> kPyramid14Nodes = {{
-    {Real(-1), Real(-1), Real(0)},
-    {Real(1), Real(-1), Real(0)},
-    {Real(1), Real(1), Real(0)},
-    {Real(-1), Real(1), Real(0)},
-    {Real(0), Real(0), Real(1)},
-    {Real(0), Real(-1), Real(0)},
-    {Real(1), Real(0), Real(0)},
-    {Real(0), Real(1), Real(0)},
-    {Real(-1), Real(0), Real(0)},
-    {Real(-0.5), Real(-0.5), Real(0.5)},
-    {Real(0.5), Real(-0.5), Real(0.5)},
-    {Real(0.5), Real(0.5), Real(0.5)},
-    {Real(-0.5), Real(0.5), Real(0.5)},
-    {Real(0), Real(0), Real(0)},
-}};
-
-inline constexpr NodeTable<13> kPyramid13Nodes = {{
-    {Real(-1), Real(-1), Real(0)},
-    {Real(1), Real(-1), Real(0)},
-    {Real(1), Real(1), Real(0)},
-    {Real(-1), Real(1), Real(0)},
-    {Real(0), Real(0), Real(1)},
-    {Real(0), Real(-1), Real(0)},
-    {Real(1), Real(0), Real(0)},
-    {Real(0), Real(1), Real(0)},
-    {Real(-1), Real(0), Real(0)},
-    {Real(-0.5), Real(-0.5), Real(0.5)},
-    {Real(0.5), Real(-0.5), Real(0.5)},
-    {Real(0.5), Real(0.5), Real(0.5)},
-    {Real(-0.5), Real(0.5), Real(0.5)},
-}};
-
-template<std::size_t N>
-constexpr NodeTableView view(const NodeTable<N>& table) noexcept {
-    return NodeTableView{table.data(), table.size()};
-}
-
-Point to_point(const RawPoint& raw) {
-    return Point{raw[0], raw[1], raw[2]};
-}
-
-constexpr NodeTableView fixed_node_table(ElementType elem_type) noexcept {
-    switch (elem_type) {
-        case ElementType::Line2:     return view(kLine2Nodes);
-        case ElementType::Line3:     return view(kLine3Nodes);
-        case ElementType::Triangle3: return view(kTriangle3Nodes);
-        case ElementType::Triangle6: return view(kTriangle6Nodes);
-        case ElementType::Quad4:     return view(kQuad4Nodes);
-        case ElementType::Quad8:     return view(kQuad8Nodes);
-        case ElementType::Quad9:     return view(kQuad9Nodes);
-        case ElementType::Tetra4:    return view(kTetra4Nodes);
-        case ElementType::Tetra10:   return view(kTetra10Nodes);
-        case ElementType::Hex8:      return view(kHex8Nodes);
-        case ElementType::Hex20:     return view(kHex20Nodes);
-        case ElementType::Hex27:     return view(kHex27Nodes);
-        case ElementType::Wedge6:    return view(kWedge6Nodes);
-        case ElementType::Wedge15:   return view(kWedge15Nodes);
-        case ElementType::Wedge18:   return view(kWedge18Nodes);
-        case ElementType::Pyramid5:  return view(kPyramid5Nodes);
-        case ElementType::Pyramid13: return view(kPyramid13Nodes);
-        case ElementType::Pyramid14: return view(kPyramid14Nodes);
-        default:                     return {};
-    }
-}
-
-constexpr NodeTableView fixed_complete_lagrange_table(ElementType canonical_type,
-                                                      int order) noexcept {
-    switch (canonical_type) {
-        case ElementType::Line2:
-            return order == 1 ? view(kLine2Nodes) :
-                   order == 2 ? view(kLine3Nodes) : NodeTableView{};
-        case ElementType::Triangle3:
-            return order == 1 ? view(kTriangle3Nodes) :
-                   order == 2 ? view(kTriangle6Nodes) : NodeTableView{};
-        case ElementType::Quad4:
-            return order == 1 ? view(kQuad4Nodes) :
-                   order == 2 ? view(kQuad9Nodes) : NodeTableView{};
-        case ElementType::Tetra4:
-            return order == 1 ? view(kTetra4Nodes) :
-                   order == 2 ? view(kTetra10Nodes) : NodeTableView{};
-        case ElementType::Hex8:
-            return order == 1 ? view(kHex8Nodes) :
-                   order == 2 ? view(kHex27Nodes) : NodeTableView{};
-        case ElementType::Wedge6:
-            return order == 1 ? view(kWedge6Nodes) :
-                   order == 2 ? view(kWedge18Nodes) : NodeTableView{};
-        case ElementType::Pyramid5:
-            return order == 1 ? view(kPyramid5Nodes) :
-                   order == 2 ? view(kPyramid14Nodes) : NodeTableView{};
-        default:
-            return {};
-    }
-}
-
 Real line_coord_pm_one(int i, int order) {
     if (order <= 0) {
         return Real(0);
@@ -352,10 +48,10 @@ void append_triangle_face_interior(std::vector<Point>& nodes,
     for (int c = 1; c <= order - 2; ++c) {
         for (int b = 1; b <= order - c - 1; ++b) {
             const int a = order - b - c;
-            const Real la = static_cast<Real>(a) / static_cast<Real>(order);
-            const Real lb = static_cast<Real>(b) / static_cast<Real>(order);
-            const Real lc = static_cast<Real>(c) / static_cast<Real>(order);
-            nodes.push_back(v0 * la + v1 * lb + v2 * lc);
+            const Real inv = Real(1) / Real(order);
+            nodes.push_back(v0 * (Real(a) * inv) +
+                            v1 * (Real(b) * inv) +
+                            v2 * (Real(c) * inv));
         }
     }
 }
@@ -382,7 +78,6 @@ std::vector<Point> generate_triangle_nodes(int order) {
 
     std::vector<Point> nodes;
     nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 2) / 2));
-
     nodes.push_back(Point{Real(0), Real(0), Real(0)});
     nodes.push_back(Point{Real(1), Real(0), Real(0)});
     nodes.push_back(Point{Real(0), Real(1), Real(0)});
@@ -398,13 +93,11 @@ std::vector<Point> generate_triangle_nodes(int order) {
         nodes.push_back(Point{Real(0), line_coord_zero_one(order - m, order), Real(0)});
     }
 
-    append_triangle_face_interior(
-        nodes,
-        Point{Real(0), Real(0), Real(0)},
-        Point{Real(1), Real(0), Real(0)},
-        Point{Real(0), Real(1), Real(0)},
-        order);
-
+    append_triangle_face_interior(nodes,
+                                  Point{Real(0), Real(0), Real(0)},
+                                  Point{Real(1), Real(0), Real(0)},
+                                  Point{Real(0), Real(1), Real(0)},
+                                  order);
     return nodes;
 }
 
@@ -415,7 +108,6 @@ std::vector<Point> generate_quad_nodes(int order) {
 
     std::vector<Point> nodes;
     nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 1)));
-
     nodes.push_back(Point{Real(-1), Real(-1), Real(0)});
     nodes.push_back(Point{Real(1), Real(-1), Real(0)});
     nodes.push_back(Point{Real(1), Real(1), Real(0)});
@@ -433,13 +125,12 @@ std::vector<Point> generate_quad_nodes(int order) {
     for (int j = order - 1; j >= 1; --j) {
         nodes.push_back(Point{Real(-1), line_coord_pm_one(j, order), Real(0)});
     }
-
     for (int j = 1; j < order; ++j) {
         for (int i = 1; i < order; ++i) {
-            nodes.push_back(Point{line_coord_pm_one(i, order), line_coord_pm_one(j, order), Real(0)});
+            nodes.push_back(Point{line_coord_pm_one(i, order),
+                                  line_coord_pm_one(j, order), Real(0)});
         }
     }
-
     return nodes;
 }
 
@@ -448,22 +139,20 @@ std::vector<Point> generate_tetra_nodes(int order) {
         return {Point{Real(0.25), Real(0.25), Real(0.25)}};
     }
 
-    std::vector<Point> nodes;
-    nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 2) * (order + 3) / 6));
-
     const Point verts[] = {
         Point{Real(0), Real(0), Real(0)},
         Point{Real(1), Real(0), Real(0)},
         Point{Real(0), Real(1), Real(0)},
         Point{Real(0), Real(0), Real(1)},
     };
+
+    std::vector<Point> nodes;
+    nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 2) * (order + 3) / 6));
     for (const auto& v : verts) {
         nodes.push_back(v);
     }
 
-    const int edges[6][2] = {
-        {0, 1}, {1, 2}, {2, 0}, {0, 3}, {1, 3}, {2, 3}
-    };
+    const int edges[6][2] = {{0, 1}, {1, 2}, {2, 0}, {0, 3}, {1, 3}, {2, 3}};
     for (const auto& edge : edges) {
         for (int m = 1; m < order; ++m) {
             const Real t = static_cast<Real>(m) / static_cast<Real>(order);
@@ -471,32 +160,24 @@ std::vector<Point> generate_tetra_nodes(int order) {
         }
     }
 
-    const int faces[4][3] = {
-        {0, 1, 2},
-        {0, 1, 3},
-        {1, 2, 3},
-        {0, 2, 3},
-    };
+    const int faces[4][3] = {{0, 1, 2}, {0, 1, 3}, {1, 2, 3}, {0, 2, 3}};
     for (const auto& face : faces) {
-        append_triangle_face_interior(
-            nodes,
-            verts[face[0]],
-            verts[face[1]],
-            verts[face[2]],
-            order);
+        append_triangle_face_interior(nodes,
+                                      verts[face[0]],
+                                      verts[face[1]],
+                                      verts[face[2]],
+                                      order);
     }
 
     for (int l = 1; l <= order - 3; ++l) {
         for (int k = 1; k <= order - l - 2; ++k) {
             for (int j = 1; j <= order - l - k - 1; ++j) {
-                const Real x = static_cast<Real>(j) / static_cast<Real>(order);
-                const Real y = static_cast<Real>(k) / static_cast<Real>(order);
-                const Real z = static_cast<Real>(l) / static_cast<Real>(order);
-                nodes.push_back(Point{x, y, z});
+                nodes.push_back(Point{Real(j) / Real(order),
+                                      Real(k) / Real(order),
+                                      Real(l) / Real(order)});
             }
         }
     }
-
     return nodes;
 }
 
@@ -505,9 +186,6 @@ std::vector<Point> generate_hex_nodes(int order) {
         return {Point{Real(0), Real(0), Real(0)}};
     }
 
-    std::vector<Point> nodes;
-    nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 1) * (order + 1)));
-
     const Point verts[] = {
         Point{Real(-1), Real(-1), Real(-1)},
         Point{Real(1), Real(-1), Real(-1)},
@@ -518,6 +196,9 @@ std::vector<Point> generate_hex_nodes(int order) {
         Point{Real(1), Real(1), Real(1)},
         Point{Real(-1), Real(1), Real(1)},
     };
+
+    std::vector<Point> nodes;
+    nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 1) * (order + 1)));
     for (const auto& v : verts) {
         nodes.push_back(v);
     }
@@ -564,7 +245,6 @@ std::vector<Point> generate_hex_nodes(int order) {
             nodes.push_back(Point{Real(-1), line_coord_pm_one(j, order), line_coord_pm_one(k, order)});
         }
     }
-
     for (int k = 1; k < order; ++k) {
         for (int j = 1; j < order; ++j) {
             for (int i = 1; i < order; ++i) {
@@ -574,7 +254,6 @@ std::vector<Point> generate_hex_nodes(int order) {
             }
         }
     }
-
     return nodes;
 }
 
@@ -583,9 +262,6 @@ std::vector<Point> generate_wedge_nodes(int order) {
         return {Point{Real(1) / Real(3), Real(1) / Real(3), Real(0)}};
     }
 
-    std::vector<Point> nodes;
-    nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 1) * (order + 2) / 2));
-
     const Point verts[] = {
         Point{Real(0), Real(0), Real(-1)},
         Point{Real(1), Real(0), Real(-1)},
@@ -594,6 +270,9 @@ std::vector<Point> generate_wedge_nodes(int order) {
         Point{Real(1), Real(0), Real(1)},
         Point{Real(0), Real(1), Real(1)},
     };
+
+    std::vector<Point> nodes;
+    nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 1) * (order + 2) / 2));
     for (const auto& v : verts) {
         nodes.push_back(v);
     }
@@ -610,10 +289,8 @@ std::vector<Point> generate_wedge_nodes(int order) {
         }
     }
 
-    append_triangle_face_interior(
-        nodes, verts[0], verts[1], verts[2], order);
-    append_triangle_face_interior(
-        nodes, verts[3], verts[4], verts[5], order);
+    append_triangle_face_interior(nodes, verts[0], verts[1], verts[2], order);
+    append_triangle_face_interior(nodes, verts[3], verts[4], verts[5], order);
 
     for (int r = 1; r < order; ++r) {
         const Real z = line_coord_pm_one(r, order);
@@ -635,138 +312,21 @@ std::vector<Point> generate_wedge_nodes(int order) {
         const Real z = line_coord_pm_one(r, order);
         for (int c = 1; c <= order - 2; ++c) {
             for (int b = 1; b <= order - c - 1; ++b) {
-                const Real x = static_cast<Real>(b) / static_cast<Real>(order);
-                const Real y = static_cast<Real>(c) / static_cast<Real>(order);
-                nodes.push_back(Point{x, y, z});
-            }
-        }
-    }
-
-    return nodes;
-}
-
-std::vector<Point> generate_pyramid_nodes(int order) {
-    if (order == 0) {
-        return {Point{Real(0), Real(0), Real(0.25)}};
-    }
-
-    std::vector<Point> nodes;
-    nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 2) * (2 * order + 3) / 6));
-
-    nodes.push_back(Point{Real(-1), Real(-1), Real(0)});
-    nodes.push_back(Point{Real(1), Real(-1), Real(0)});
-    nodes.push_back(Point{Real(1), Real(1), Real(0)});
-    nodes.push_back(Point{Real(-1), Real(1), Real(0)});
-    nodes.push_back(Point{Real(0), Real(0), Real(1)});
-
-    for (int m = 1; m < order; ++m) {
-        nodes.push_back(Point{line_coord_pm_one(m, order), Real(-1), Real(0)});
-    }
-    for (int m = 1; m < order; ++m) {
-        nodes.push_back(Point{Real(1), line_coord_pm_one(m, order), Real(0)});
-    }
-    for (int m = order - 1; m >= 1; --m) {
-        nodes.push_back(Point{line_coord_pm_one(m, order), Real(1), Real(0)});
-    }
-    for (int m = order - 1; m >= 1; --m) {
-        nodes.push_back(Point{Real(-1), line_coord_pm_one(m, order), Real(0)});
-    }
-
-    for (int level = 1; level < order; ++level) {
-        const Real z = static_cast<Real>(level) / static_cast<Real>(order);
-        const Real scale = Real(1) - z;
-        nodes.push_back(Point{-scale, -scale, z});
-        nodes.push_back(Point{scale, -scale, z});
-        nodes.push_back(Point{scale, scale, z});
-        nodes.push_back(Point{-scale, scale, z});
-    }
-
-    for (int j = 1; j < order; ++j) {
-        for (int i = 1; i < order; ++i) {
-            nodes.push_back(Point{line_coord_pm_one(i, order), line_coord_pm_one(j, order), Real(0)});
-        }
-    }
-
-    for (int level = 1; level < order - 1; ++level) {
-        const int n = order - level;
-        const Real z = static_cast<Real>(level) / static_cast<Real>(order);
-        const Real scale = Real(1) - z;
-
-        for (int m = 1; m < n; ++m) {
-            const Real s = line_coord_pm_one(m, n) * scale;
-            nodes.push_back(Point{s, -scale, z});
-        }
-        for (int m = 1; m < n; ++m) {
-            const Real s = line_coord_pm_one(m, n) * scale;
-            nodes.push_back(Point{scale, s, z});
-        }
-        for (int m = n - 1; m >= 1; --m) {
-            const Real s = line_coord_pm_one(m, n) * scale;
-            nodes.push_back(Point{s, scale, z});
-        }
-        for (int m = n - 1; m >= 1; --m) {
-            const Real s = line_coord_pm_one(m, n) * scale;
-            nodes.push_back(Point{-scale, s, z});
-        }
-    }
-
-    for (int level = 1; level < order - 1; ++level) {
-        const int n = order - level;
-        const Real z = static_cast<Real>(level) / static_cast<Real>(order);
-        const Real scale = Real(1) - z;
-        for (int j = 1; j < n; ++j) {
-            for (int i = 1; i < n; ++i) {
-                nodes.push_back(Point{line_coord_pm_one(i, n) * scale,
-                                      line_coord_pm_one(j, n) * scale,
+                nodes.push_back(Point{Real(b) / Real(order),
+                                      Real(c) / Real(order),
                                       z});
             }
         }
     }
-
     return nodes;
 }
 
-} // namespace
-
-math::Vector<Real, 3> ReferenceNodeLayout::get_node_coords(ElementType elem_type,
-                                                     std::size_t local_node) {
-    const auto table = fixed_node_table(elem_type);
-    if (table.data != nullptr && local_node < table.size) {
-        return to_point(table.data[local_node]);
-    }
-
-    throw BasisNodeOrderingException("Invalid element type or node index in ReferenceNodeLayout::get_node_coords",
-                                     __FILE__, __LINE__, __func__);
-}
-
-std::size_t ReferenceNodeLayout::num_nodes(ElementType elem_type) {
-    const auto table = fixed_node_table(elem_type);
-    if (table.data != nullptr) {
-        return table.size;
-    }
-
-    throw BasisNodeOrderingException("Unknown element type in ReferenceNodeLayout::num_nodes",
-                                     __FILE__, __LINE__, __func__);
-}
-
-std::vector<math::Vector<Real, 3>>
-ReferenceNodeLayout::get_lagrange_node_coords(ElementType canonical_type, int order) {
+std::vector<Point> complete_lagrange_nodes(ElementType canonical_type, int order) {
     if (order < 0) {
-        throw BasisNodeOrderingException("ReferenceNodeLayout::get_lagrange_node_coords requires non-negative order",
+        throw BasisNodeOrderingException("ReferenceNodeLayout requires non-negative Lagrange order",
                                          __FILE__, __LINE__, __func__);
     }
-
     const ElementType type = canonical_lagrange_type(canonical_type);
-    const auto fixed_table = fixed_complete_lagrange_table(type, order);
-    if (fixed_table.data != nullptr) {
-        std::vector<Point> nodes;
-        nodes.reserve(fixed_table.size);
-        for (std::size_t i = 0; i < fixed_table.size; ++i) {
-            nodes.push_back(to_point(fixed_table.data[i]));
-        }
-        return nodes;
-    }
-
     switch (type) {
         case ElementType::Point1:
             return {Point{Real(0), Real(0), Real(0)}};
@@ -783,24 +343,70 @@ ReferenceNodeLayout::get_lagrange_node_coords(ElementType canonical_type, int or
         case ElementType::Wedge6:
             return generate_wedge_nodes(order);
         case ElementType::Pyramid5:
-            return generate_pyramid_nodes(order);
-        case ElementType::Quad8:
-        case ElementType::Hex20:
-        case ElementType::Wedge15:
+            throw BasisNodeOrderingException("ReferenceNodeLayout: pyramid node ordering is disabled",
+                                             __FILE__, __LINE__, __func__);
+        default:
+            throw BasisNodeOrderingException("ReferenceNodeLayout: unsupported Lagrange topology",
+                                             __FILE__, __LINE__, __func__);
+    }
+}
+
+std::vector<Point> element_nodes(ElementType elem_type) {
+    const int order = complete_lagrange_alias_order(elem_type);
+    if (order >= 0) {
+        return complete_lagrange_nodes(elem_type, order);
+    }
+
+    switch (elem_type) {
+        case ElementType::Quad8: {
+            auto nodes = generate_quad_nodes(2);
+            nodes.resize(8u);
+            return nodes;
+        }
+        case ElementType::Hex20: {
+            auto nodes = generate_hex_nodes(2);
+            nodes.resize(20u);
+            return nodes;
+        }
+        case ElementType::Wedge15: {
+            auto nodes = generate_wedge_nodes(2);
+            nodes.resize(15u);
+            return nodes;
+        }
         case ElementType::Pyramid13:
-            throw BasisNodeOrderingException("ReferenceNodeLayout::get_lagrange_node_coords does not support serendipity topologies",
+            throw BasisNodeOrderingException("ReferenceNodeLayout: pyramid node ordering is disabled",
                                              __FILE__, __LINE__, __func__);
         default:
-            throw BasisNodeOrderingException("ReferenceNodeLayout::get_lagrange_node_coords: unsupported topology",
+            throw BasisNodeOrderingException("ReferenceNodeLayout: unknown element type",
                                              __FILE__, __LINE__, __func__);
     }
 }
 
+} // namespace
+
+math::Vector<Real, 3> ReferenceNodeLayout::get_node_coords(ElementType elem_type,
+                                                           std::size_t local_node) {
+    const auto nodes = element_nodes(elem_type);
+    if (local_node >= nodes.size()) {
+        throw BasisNodeOrderingException("ReferenceNodeLayout::get_node_coords: node index out of range",
+                                         __FILE__, __LINE__, __func__);
+    }
+    return nodes[local_node];
+}
+
+std::size_t ReferenceNodeLayout::num_nodes(ElementType elem_type) {
+    return element_nodes(elem_type).size();
+}
+
+std::vector<math::Vector<Real, 3>>
+ReferenceNodeLayout::get_lagrange_node_coords(ElementType canonical_type, int order) {
+    return complete_lagrange_nodes(canonical_type, order);
+}
+
 std::span<const std::size_t> ReferenceNodeLayout::mesh_to_basis_ordering(ElementType elem_type) {
     if (elem_type == ElementType::Hex20) {
-        return std::span<const std::size_t>(
-            kHex20MeshToBasisOrder.data(),
-            kHex20MeshToBasisOrder.size());
+        return std::span<const std::size_t>(kHex20MeshToBasisOrder.data(),
+                                            kHex20MeshToBasisOrder.size());
     }
     return {};
 }
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
index 52af4d932..8a43cc4e3 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
@@ -8,526 +8,28 @@
 #ifndef SVMP_FE_BASIS_NODEORDERINGCONVENTIONS_H
 #define SVMP_FE_BASIS_NODEORDERINGCONVENTIONS_H
 
-#include "Types.h"
 #include "Math/Vector.h"
-#include <cstddef>
-#include <vector>
-
-/**
- * @file NodeOrderingConventions.h
- * @brief Documentation of node ordering conventions for all element types
- *
- * This file provides comprehensive documentation of the node ordering
- * conventions used throughout the FE library. These orderings are consistent
- * with VTK conventions and must be matched exactly when interfacing with
- * the Mesh library.
- *
- * IMPORTANT: The FE library (Basis, Quadrature, Geometry) uses "node" to refer
- * to degrees of freedom locations on reference elements. The Mesh library uses
- * "vertex" for geometry vertices and "cell" for mesh elements. When interfacing
- * between the two, ensure consistent ordering.
- *
- * Reference Element Conventions:
- * - Line:       xi in [-1, 1]
- * - Quad:       (xi, eta) in [-1, 1] x [-1, 1]
- * - Hex:        (xi, eta, zeta) in [-1, 1]^3
- * - Triangle:   (xi, eta) in simplex with vertices (0,0), (1,0), (0,1)
- * - Tetrahedron: (xi, eta, zeta) in simplex with vertices
- *                (0,0,0), (1,0,0), (0,1,0), (0,0,1)
- * - Wedge:      Triangle base x line height, zeta in [-1, 1]
- * - Pyramid:    Quad base at z=0, apex at (0, 0, 1)
- *
- *
- * =============================================================================
- * 1D ELEMENTS
- * =============================================================================
- *
- * Line2 (Linear Line)
- * -------------------
- *   0---------1
- *   |         |
- *  xi=-1     xi=+1
- *
- * Node 0: xi = -1
- * Node 1: xi = +1
- *
- *
- * Line3 (Quadratic Line)
- * ----------------------
- *   0----2----1
- *   |    |    |
- *  xi=-1 0   xi=+1
- *
- * Node 0: xi = -1
- * Node 1: xi = +1
- * Node 2: xi =  0 (mid-edge)
- *
- *
- * =============================================================================
- * 2D QUADRILATERAL ELEMENTS
- * =============================================================================
- *
- * Quad4 (Bilinear Quadrilateral)
- * ------------------------------
- *
- *   3-----------2
- *   |           |
- *   |           |
- *   |           |
- *   0-----------1
- *
- * Node 0: (xi, eta) = (-1, -1)
- * Node 1: (xi, eta) = (+1, -1)
- * Node 2: (xi, eta) = (+1, +1)
- * Node 3: (xi, eta) = (-1, +1)
- *
- *
- * Quad8 (Serendipity Quadrilateral)
- * ---------------------------------
- *
- *   3-----6-----2
- *   |           |
- *   7           5
- *   |           |
- *   0-----4-----1
- *
- * Corners (same as Quad4):
- *   Node 0: (-1, -1)
- *   Node 1: (+1, -1)
- *   Node 2: (+1, +1)
- *   Node 3: (-1, +1)
- *
- * Mid-edge nodes:
- *   Node 4: ( 0, -1)  (edge 0-1)
- *   Node 5: (+1,  0)  (edge 1-2)
- *   Node 6: ( 0, +1)  (edge 2-3)
- *   Node 7: (-1,  0)  (edge 3-0)
- *
- *
- * Quad9 (Biquadratic Quadrilateral)
- * ---------------------------------
- *
- *   3-----6-----2
- *   |           |
- *   7     8     5
- *   |           |
- *   0-----4-----1
- *
- * Same as Quad8 plus:
- *   Node 8: (0, 0)  (center)
- *
- *
- * =============================================================================
- * 3D HEXAHEDRAL ELEMENTS
- * =============================================================================
- *
- * Hex8 (Trilinear Hexahedron)
- * ---------------------------
- *
- *        7-----------6
- *       /|          /|
- *      / |         / |
- *     4-----------5  |
- *     |  |        |  |
- *     |  3--------|--2
- *     | /         | /
- *     |/          |/
- *     0-----------1
- *
- * Bottom face (zeta = -1):
- *   Node 0: (xi, eta, zeta) = (-1, -1, -1)
- *   Node 1: (xi, eta, zeta) = (+1, -1, -1)
- *   Node 2: (xi, eta, zeta) = (+1, +1, -1)
- *   Node 3: (xi, eta, zeta) = (-1, +1, -1)
- *
- * Top face (zeta = +1):
- *   Node 4: (xi, eta, zeta) = (-1, -1, +1)
- *   Node 5: (xi, eta, zeta) = (+1, -1, +1)
- *   Node 6: (xi, eta, zeta) = (+1, +1, +1)
- *   Node 7: (xi, eta, zeta) = (-1, +1, +1)
- *
- *
- * Hex20 (Serendipity Hexahedron)
- * ------------------------------
- *
- *        7-----14-----6
- *       /|           /|
- *     15 |         13 |
- *     /  19        /  18
- *    4-----12-----5   |
- *    |   |        |   |
- *    |   3-----10-|---2
- *   16  /        17  /
- *    | 11         | 9
- *    |/           |/
- *    0------8-----1
- *
- * Corners (same as Hex8): Nodes 0-7
- *
- * Mid-edge nodes on bottom face (zeta = -1):
- *   Node 8:  ( 0, -1, -1)  (edge 0-1)
- *   Node 9:  (+1,  0, -1)  (edge 1-2)
- *   Node 10: ( 0, +1, -1)  (edge 2-3)
- *   Node 11: (-1,  0, -1)  (edge 3-0)
- *
- * Mid-edge nodes on top face (zeta = +1):
- *   Node 12: ( 0, -1, +1)  (edge 4-5)
- *   Node 13: (+1,  0, +1)  (edge 5-6)
- *   Node 14: ( 0, +1, +1)  (edge 6-7)
- *   Node 15: (-1,  0, +1)  (edge 7-4)
- *
- * Mid-edge nodes on vertical edges:
- *   Node 16: (-1, -1,  0)  (edge 0-4)
- *   Node 17: (+1, -1,  0)  (edge 1-5)
- *   Node 18: (+1, +1,  0)  (edge 2-6)
- *   Node 19: (-1, +1,  0)  (edge 3-7)
- *
- *
- * Hex27 (Triquadratic Hexahedron)
- * -------------------------------
- * Same as Hex20 plus face-center and body-center nodes:
- *
- * Face centers:
- *   Node 20: ( 0,  0, -1)  (bottom face)
- *   Node 21: ( 0,  0, +1)  (top face)
- *   Node 22: ( 0, -1,  0)  (front face)
- *   Node 23: (+1,  0,  0)  (right face)
- *   Node 24: ( 0, +1,  0)  (back face)
- *   Node 25: (-1,  0,  0)  (left face)
- *
- * Body center:
- *   Node 26: (0, 0, 0)
- *
- *
- * =============================================================================
- * 2D TRIANGULAR ELEMENTS
- * =============================================================================
- *
- * Triangle3 (Linear Triangle)
- * ---------------------------
- *
- *   2
- *   |\
- *   | \
- *   |  \
- *   |   \
- *   0----1
- *
- * Reference: (xi, eta) simplex with vertices at:
- *   Node 0: (xi, eta) = (0, 0)
- *   Node 1: (xi, eta) = (1, 0)
- *   Node 2: (xi, eta) = (0, 1)
- *
- *
- * Triangle6 (Quadratic Triangle)
- * ------------------------------
- *
- *   2
- *   |\
- *   | \
- *   5  4
- *   |   \
- *   0--3--1
- *
- * Corners: Nodes 0-2 (same as Triangle3)
- *
- * Mid-edge nodes:
- *   Node 3: (0.5,   0)  (edge 0-1)
- *   Node 4: (0.5, 0.5)  (edge 1-2)
- *   Node 5: (  0, 0.5)  (edge 2-0)
- *
- *
- * =============================================================================
- * 3D TETRAHEDRAL ELEMENTS
- * =============================================================================
- *
- * Tetrahedron4 (Linear Tetrahedron)
- * ---------------------------------
- *
- *             3
- *            /|\
- *           / | \
- *          /  |  \
- *         /   |   \
- *        /    |    \
- *       0-----|-----2
- *        \    |    /
- *         \   |   /
- *          \  |  /
- *           \ | /
- *            \|/
- *             1
- *
- * Reference: (xi, eta, zeta) simplex with vertices at:
- *   Node 0: (0, 0, 0)
- *   Node 1: (1, 0, 0)
- *   Node 2: (0, 1, 0)
- *   Node 3: (0, 0, 1)
- *
- *
- * Tetrahedron10 (Quadratic Tetrahedron)
- * -------------------------------------
- * Corners: Nodes 0-3 (same as Tet4)
- *
- * Mid-edge nodes:
- *   Node 4: (0.5,   0,   0)  (edge 0-1)
- *   Node 5: (0.5, 0.5,   0)  (edge 1-2)
- *   Node 6: (  0, 0.5,   0)  (edge 2-0)
- *   Node 7: (  0,   0, 0.5)  (edge 0-3)
- *   Node 8: (0.5,   0, 0.5)  (edge 1-3)
- *   Node 9: (  0, 0.5, 0.5)  (edge 2-3)
- *
- *
- * =============================================================================
- * 3D WEDGE (PRISM) ELEMENTS
- * =============================================================================
- *
- * Wedge6 (Linear Wedge)
- * ---------------------
- *
- *         5
- *        /|\
- *       / | \
- *      /  |  \
- *     3---|---4
- *     |   2   |
- *     |  / \  |
- *     | /   \ |
- *     |/     \|
- *     0-------1
- *
- * Reference: Triangle base at zeta = -1, top at zeta = +1
- *
- * Bottom face (zeta = -1):
- *   Node 0: (0, 0, -1)
- *   Node 1: (1, 0, -1)
- *   Node 2: (0, 1, -1)
- *
- * Top face (zeta = +1):
- *   Node 3: (0, 0, +1)
- *   Node 4: (1, 0, +1)
- *   Node 5: (0, 1, +1)
- *
- *
- * Wedge15 (Quadratic Wedge)
- * -------------------------
- * Corners: Nodes 0-5 (same as Wedge6)
- *
- * Mid-edge nodes on bottom face:
- *   Node 6:  (0.5,   0, -1)  (edge 0-1)
- *   Node 7:  (0.5, 0.5, -1)  (edge 1-2)
- *   Node 8:  (  0, 0.5, -1)  (edge 2-0)
- *
- * Mid-edge nodes on top face:
- *   Node 9:  (0.5,   0, +1)  (edge 3-4)
- *   Node 10: (0.5, 0.5, +1)  (edge 4-5)
- *   Node 11: (  0, 0.5, +1)  (edge 5-3)
- *
- * Mid-edge nodes on vertical edges:
- *   Node 12: (0, 0, 0)  (edge 0-3)
- *   Node 13: (1, 0, 0)  (edge 1-4)
- *   Node 14: (0, 1, 0)  (edge 2-5)
- *
- *
- * Wedge18 (Complete Quadratic Wedge)
- * ----------------------------------
- * Corners and mid-edges: Nodes 0-14 (same as Wedge15)
- *
- * Face-center nodes on quadrilateral faces:
- *   Node 15: (0.5, 0.0, 0.0)  (face with vertices 0-1-4-3, y = 0)
- *   Node 16: (0.5, 0.5, 0.0)  (face with vertices 1-2-5-4, x + y = 1)
- *   Node 17: (0.0, 0.5, 0.0)  (face with vertices 2-0-3-5, x = 0)
- *
- *
- * =============================================================================
- * 3D PYRAMID ELEMENTS
- * =============================================================================
- *
- * Pyramid5 (Linear Pyramid)
- * -------------------------
- *
- *           4
- *          /|\
- *         / | \
- *        /  |  \
- *       /   |   \
- *      3----|----2
- *      |    |    |
- *      |    +    |   (apex projects to center of base)
- *      |         |
- *      0---------1
- *
- * Reference: Quad base in xi-eta plane at zeta = 0, apex at zeta = 1
- *
- * Base (zeta = 0):
- *   Node 0: (-1, -1, 0)
- *   Node 1: (+1, -1, 0)
- *   Node 2: (+1, +1, 0)
- *   Node 3: (-1, +1, 0)
- *
- * Apex:
- *   Node 4: (0, 0, 1)
- *
- *
- * Pyramid13 (Quadratic Pyramid)
- * -----------------------------
- * Corners: Nodes 0-4 (same as Pyramid5)
- *
- * Mid-edge nodes on base:
- *   Node 5: ( 0, -1, 0)  (edge 0-1)
- *   Node 6: (+1,  0, 0)  (edge 1-2)
- *   Node 7: ( 0, +1, 0)  (edge 2-3)
- *   Node 8: (-1,  0, 0)  (edge 3-0)
- *
- * Mid-edge nodes to apex:
- *   Node 9:  (-0.5, -0.5, 0.5)  (edge 0-4)
- *   Node 10: (+0.5, -0.5, 0.5)  (edge 1-4)
- *   Node 11: (+0.5, +0.5, 0.5)  (edge 2-4)
- *   Node 12: (-0.5, +0.5, 0.5)  (edge 3-4)
- *
- *
- * Pyramid14 (Quadratic Rational Pyramid)
- * --------------------------------------
- *
- * This retained low-order compatibility layout matches the generated
- * complete-family quadratic Lagrange ordering for the reference pyramid with
- * base (-1,-1,0)..(1,1,0) and apex at (0,0,1). Nodes 0-12 coincide with the
- * Pyramid13 layout; node 13 is the base center.
- *
- *   Base corners (same as Pyramid5):
- *     Node 0: (-1, -1, 0)
- *     Node 1: (+1, -1, 0)
- *     Node 2: (+1, +1, 0)
- *     Node 3: (-1, +1, 0)
- *
- *   Apex:
- *     Node 4: (0, 0, 1)
- *
- *   Base mid-edges (same as Pyramid13):
- *     Node 5:  ( 0, -1, 0)   (edge 0-1)
- *     Node 6:  (+1,  0, 0)   (edge 1-2)
- *     Node 7:  ( 0, +1, 0)   (edge 2-3)
- *     Node 8:  (-1,  0, 0)   (edge 3-0)
- *
- *   Mid-edges to apex (same as Pyramid13):
- *     Node 9:  (-0.5, -0.5, 0.5)  (edge 0-4)
- *     Node 10: (+0.5, -0.5, 0.5)  (edge 1-4)
- *     Node 11: (+0.5, +0.5, 0.5)  (edge 2-4)
- *     Node 12: (-0.5, +0.5, 0.5)  (edge 3-4)
- *
- *   Base center:
- *     Node 13: (0, 0, 0)
- *
- *
- * =============================================================================
- * NOTES ON VTK COMPATIBILITY
- * =============================================================================
- *
- * The node orderings above are consistent with VTK cell types:
- *
- *   VTK_LINE           (3)  -> Line2
- *   VTK_QUADRATIC_EDGE (21) -> Line3
- *   VTK_TRIANGLE       (5)  -> Triangle3
- *   VTK_QUADRATIC_TRIANGLE (22) -> Triangle6
- *   VTK_QUAD           (9)  -> Quad4
- *   VTK_QUADRATIC_QUAD (23) -> Quad8
- *   VTK_BIQUADRATIC_QUAD (28) -> Quad9
- *   VTK_TETRA          (10) -> Tetrahedron4
- *   VTK_QUADRATIC_TETRA (24) -> Tetrahedron10
- *   VTK_HEXAHEDRON     (12) -> Hex8
- *   VTK_QUADRATIC_HEXAHEDRON (25) -> Hex20
- *   VTK_TRIQUADRATIC_HEXAHEDRON (29) -> Hex27
- *   VTK_WEDGE          (13) -> Wedge6
- *   VTK_QUADRATIC_WEDGE (26) -> Wedge15
- *   VTK_BIQUADRATIC_QUADRATIC_WEDGE (32) -> Wedge18
- *   VTK_PYRAMID        (14) -> Pyramid5
- *   VTK_QUADRATIC_PYRAMID (27) -> Pyramid13
- *
- *
- * =============================================================================
- * BARYCENTRIC COORDINATES
- * =============================================================================
- *
- * For simplex elements, barycentric coordinates (lambda_0, ..., lambda_n)
- * satisfy sum(lambda_i) = 1.
- *
- * Triangle:
- *   lambda_0 = 1 - xi - eta
- *   lambda_1 = xi
- *   lambda_2 = eta
- *
- * Tetrahedron:
- *   lambda_0 = 1 - xi - eta - zeta
- *   lambda_1 = xi
- *   lambda_2 = eta
- *   lambda_3 = zeta
- *
- */
+#include "Types.h"
 
+#include <cstddef>
 #include <span>
+#include <vector>
 
 namespace svmp {
 namespace FE {
 namespace basis {
 
-/**
- * @brief Basis-side reference node coordinate queries
- *
- * This is intentionally named differently from `svmp::NodeOrdering` in Mesh,
- * which handles mesh-format permutations rather than reference basis layouts.
- */
 class ReferenceNodeLayout {
 public:
-    /**
-     * @brief Get reference coordinates for a node
-     * @param elem_type Element type
-     * @param local_node Local node index (0-based)
-     *
-     * Complete-family low-order Lagrange aliases (`Line2/3`, `Triangle3/6`,
-     * `Quad4/9`, `Tetra4/10`, `Hex8/27`, `Wedge6/18`, `Pyramid5/14`) are
-     * served by the generated arbitrary-order Lagrange ordering path. Explicit
-     * hard-coded tables remain only for serendipity-only enums such as
-     * `Quad8`, `Hex20`, `Wedge15`, and `Pyramid13`.
-     *
-     * @return Reference coordinates (xi, eta, zeta)
-     */
-    static math::Vector<Real, 3> get_node_coords(ElementType elem_type, std::size_t local_node);
-
-    /**
-     * @brief Get number of nodes for an element type
-     *
-     * The low-order complete-family Lagrange aliases share the same generated
-     * ordering path used by `get_node_coords`.
-     */
+    static math::Vector<Real, 3> get_node_coords(ElementType elem_type,
+                                                 std::size_t local_node);
     static std::size_t num_nodes(ElementType elem_type);
 
-    /**
-     * @brief Generate complete-family Lagrange node coordinates for a canonical topology and order
-     *
-     * This covers arbitrary-order complete nodal Lagrange spaces on the
-     * canonical topologies `Line2`, `Triangle3`, `Quad4`, `Tetra4`, `Hex8`,
-     * `Wedge6`, and `Pyramid5`. Serendipity variants are intentionally
-     * excluded.
-     */
     static std::vector<math::Vector<Real, 3>>
     get_lagrange_node_coords(ElementType canonical_type, int order);
 
-    /**
-     * @brief Optional mapping from mesh/reference node order to internal basis order
-     *
-     * Returns an empty span when the public node order is already the basis
-     * table order or no special mapping is registered.
-     */
     static std::span<const std::size_t> mesh_to_basis_ordering(ElementType elem_type);
-
-    /**
-     * @brief Check if element is a simplex (triangle, tetrahedron)
-     */
     static bool is_simplex(ElementType elem_type);
-
-    /**
-     * @brief Check if element uses tensor-product topology
-     */
     static bool is_tensor_product(ElementType elem_type);
 };
 
diff --git a/Code/Source/solver/FE/Basis/PyramidModalBasis.h b/Code/Source/solver/FE/Basis/PyramidModalBasis.h
deleted file mode 100644
index 1ecdae282..000000000
--- a/Code/Source/solver/FE/Basis/PyramidModalBasis.h
+++ /dev/null
@@ -1,265 +0,0 @@
-#ifndef SVMP_FE_BASIS_PYRAMIDMODALBASIS_H
-#define SVMP_FE_BASIS_PYRAMIDMODALBASIS_H
-
-// Shared rational/modal pyramid helpers for scalar complete-family and spectral
-// pyramid bases. The degenerate z=1 top plane is evaluated by its apex limit;
-// callers that reject non-apex top-plane queries must validate before calling.
-
-#include "BasisFunction.h"
-#include "BasisTolerance.h"
-#include <algorithm>
-#include <cmath>
-#include <vector>
-
-namespace svmp {
-namespace FE {
-namespace basis {
-namespace pyramid_modal {
-
-struct Term {
-    int px{0};
-    int py{0};
-    int pz{0};
-    int denom_power{0};
-};
-
-struct EvaluationPoint {
-    Real x{Real(0)};
-    Real y{Real(0)};
-    Real z{Real(0)};
-    Real t{Real(1)};
-    bool top_plane{false};
-    std::vector<Real> x_powers;
-    std::vector<Real> y_powers;
-    std::vector<Real> z_powers;
-    std::vector<Real> t_powers;
-};
-
-inline std::vector<Term> build_terms(int order) {
-    std::vector<Term> terms;
-    terms.reserve(static_cast<std::size_t>((order + 1) * (order + 2) *
-                                           (2 * order + 3) / 6));
-    for (int pz = 0; pz <= order; ++pz) {
-        const int n = order - pz;
-        for (int py = 0; py <= n; ++py) {
-            for (int px = 0; px <= n; ++px) {
-                terms.push_back(Term{px, py, pz, std::min(px, py)});
-            }
-        }
-    }
-    return terms;
-}
-
-inline bool on_degenerate_top_plane(const math::Vector<Real, 3>& xi,
-                                    Real tolerance = detail::basis_scaled_tolerance()) {
-    return std::abs(Real(1) - xi[2]) <= tolerance;
-}
-
-inline void fill_powers(Real base, int max_power, std::vector<Real>& powers) {
-    powers.assign(static_cast<std::size_t>(max_power + 1), Real(1));
-    for (int p = 1; p <= max_power; ++p) {
-        powers[static_cast<std::size_t>(p)] =
-            powers[static_cast<std::size_t>(p - 1)] * base;
-    }
-}
-
-inline void prepare_evaluation_point(const math::Vector<Real, 3>& xi,
-                                     int max_px,
-                                     int max_py,
-                                     int max_pz,
-                                     int max_denom_power,
-                                     EvaluationPoint& point) {
-    point.x = xi[0];
-    point.y = xi[1];
-    point.z = xi[2];
-    point.t = Real(1) - point.z;
-    point.top_plane = on_degenerate_top_plane(xi);
-
-    fill_powers(point.x, std::max(max_px, 0), point.x_powers);
-    fill_powers(point.y, std::max(max_py, 0), point.y_powers);
-    fill_powers(point.z, std::max(max_pz, 0), point.z_powers);
-    if (point.top_plane) [[unlikely]] {
-        point.t_powers.assign(1u, Real(1));
-    } else {
-        fill_powers(point.t, std::max(max_denom_power + 2, 0), point.t_powers);
-    }
-}
-
-inline void prepare_evaluation_point(const std::vector<Term>& terms,
-                                     const math::Vector<Real, 3>& xi,
-                                     EvaluationPoint& point) {
-    int max_px = 0;
-    int max_py = 0;
-    int max_pz = 0;
-    int max_denom_power = 0;
-    for (const Term& term : terms) {
-        max_px = std::max(max_px, term.px);
-        max_py = std::max(max_py, term.py);
-        max_pz = std::max(max_pz, term.pz);
-        max_denom_power = std::max(max_denom_power, term.denom_power);
-    }
-    prepare_evaluation_point(xi, max_px, max_py, max_pz, max_denom_power, point);
-}
-
-inline void evaluate_term(const Term& term,
-                          const EvaluationPoint& point,
-                          Real& value,
-                          Gradient* gradient = nullptr,
-                          Hessian* hessian = nullptr) {
-    const auto pow_x = [&](int p) -> Real {
-        return point.x_powers[static_cast<std::size_t>(p)];
-    };
-    const auto pow_y = [&](int p) -> Real {
-        return point.y_powers[static_cast<std::size_t>(p)];
-    };
-    const auto pow_z = [&](int p) -> Real {
-        return point.z_powers[static_cast<std::size_t>(p)];
-    };
-    const auto pow_t = [&](int p) -> Real {
-        return point.t_powers[static_cast<std::size_t>(p)];
-    };
-
-    if (point.top_plane) [[unlikely]] {
-        if (term.px == 0 && term.py == 0) {
-            value = pow_z(term.pz);
-        } else {
-            value = Real(0);
-        }
-        if (gradient != nullptr) {
-            *gradient = Gradient{};
-            if (term.px == 0 && term.py == 0 && term.pz > 0) {
-                (*gradient)[2] = static_cast<Real>(term.pz) * pow_z(term.pz - 1);
-            }
-        }
-        if (hessian != nullptr) {
-            *hessian = Hessian{};
-            if (term.px == 0 && term.py == 0 && term.pz > 1) {
-                (*hessian)(2, 2) =
-                    static_cast<Real>(term.pz * (term.pz - 1)) *
-                    pow_z(term.pz - 2);
-            }
-        }
-        return;
-    }
-
-    const Real base = pow_x(term.px) * pow_y(term.py) * pow_z(term.pz);
-    const Real denom = pow_t(term.denom_power);
-    value = base / denom;
-
-    if (gradient != nullptr) {
-        *gradient = Gradient{};
-        if (term.px > 0) {
-            (*gradient)[0] =
-                static_cast<Real>(term.px) * pow_x(term.px - 1) *
-                pow_y(term.py) * pow_z(term.pz) / denom;
-        }
-        if (term.py > 0) {
-            (*gradient)[1] =
-                static_cast<Real>(term.py) * pow_x(term.px) *
-                pow_y(term.py - 1) * pow_z(term.pz) / denom;
-        }
-
-        Real gz = Real(0);
-        if (term.pz > 0) {
-            gz += static_cast<Real>(term.pz) * pow_x(term.px) *
-                  pow_y(term.py) * pow_z(term.pz - 1) / denom;
-        }
-        if (term.denom_power > 0) {
-            gz += static_cast<Real>(term.denom_power) * base / pow_t(term.denom_power + 1);
-        }
-        (*gradient)[2] = gz;
-    }
-
-    if (hessian == nullptr) {
-        return;
-    }
-
-    *hessian = Hessian{};
-    if (term.px > 1) {
-        (*hessian)(0, 0) =
-            static_cast<Real>(term.px * (term.px - 1)) *
-            pow_x(term.px - 2) * pow_y(term.py) * pow_z(term.pz) / denom;
-    }
-    if (term.py > 1) {
-        (*hessian)(1, 1) =
-            static_cast<Real>(term.py * (term.py - 1)) *
-            pow_x(term.px) * pow_y(term.py - 2) * pow_z(term.pz) / denom;
-    }
-    if (term.px > 0 && term.py > 0) {
-        const Real hxy =
-            static_cast<Real>(term.px * term.py) *
-            pow_x(term.px - 1) * pow_y(term.py - 1) * pow_z(term.pz) / denom;
-        (*hessian)(0, 1) = hxy;
-        (*hessian)(1, 0) = hxy;
-    }
-
-    if (term.px > 0) {
-        Real hxz =
-            static_cast<Real>(term.px) * pow_x(term.px - 1) *
-            pow_y(term.py) / denom;
-        if (term.pz > 0) {
-            hxz *= static_cast<Real>(term.pz) * pow_z(term.pz - 1);
-        } else {
-            hxz = Real(0);
-        }
-        if (term.denom_power > 0) {
-            hxz += static_cast<Real>(term.px * term.denom_power) *
-                   pow_x(term.px - 1) * pow_y(term.py) *
-                   pow_z(term.pz) / pow_t(term.denom_power + 1);
-        }
-        (*hessian)(0, 2) = hxz;
-        (*hessian)(2, 0) = hxz;
-    }
-
-    if (term.py > 0) {
-        Real hyz =
-            static_cast<Real>(term.py) * pow_x(term.px) *
-            pow_y(term.py - 1) / denom;
-        if (term.pz > 0) {
-            hyz *= static_cast<Real>(term.pz) * pow_z(term.pz - 1);
-        } else {
-            hyz = Real(0);
-        }
-        if (term.denom_power > 0) {
-            hyz += static_cast<Real>(term.py * term.denom_power) *
-                   pow_x(term.px) * pow_y(term.py - 1) *
-                   pow_z(term.pz) / pow_t(term.denom_power + 1);
-        }
-        (*hessian)(1, 2) = hyz;
-        (*hessian)(2, 1) = hyz;
-    }
-
-    Real hzz = Real(0);
-    if (term.pz > 1) {
-        hzz += static_cast<Real>(term.pz * (term.pz - 1)) *
-               pow_x(term.px) * pow_y(term.py) * pow_z(term.pz - 2) / denom;
-    }
-    if (term.pz > 0 && term.denom_power > 0) {
-        hzz += static_cast<Real>(2 * term.pz * term.denom_power) *
-               pow_x(term.px) * pow_y(term.py) *
-               pow_z(term.pz - 1) / pow_t(term.denom_power + 1);
-    }
-    if (term.denom_power > 0) {
-        hzz += static_cast<Real>(term.denom_power * (term.denom_power + 1)) *
-               base / pow_t(term.denom_power + 2);
-    }
-    (*hessian)(2, 2) = hzz;
-}
-
-inline void evaluate_term(const Term& term,
-                          const math::Vector<Real, 3>& xi,
-                          Real& value,
-                          Gradient* gradient = nullptr,
-                          Hessian* hessian = nullptr) {
-    EvaluationPoint point;
-    prepare_evaluation_point(
-        xi, term.px, term.py, term.pz, term.denom_power, point);
-    evaluate_term(term, point, value, gradient, hessian);
-}
-
-} // namespace pyramid_modal
-} // namespace basis
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_BASIS_PYRAMIDMODALBASIS_H
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index 309fd18be..e6395cee4 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -157,13 +157,6 @@ std::vector<Real> quad_serendipity_inverse_vandermonde(
     const std::string label = "Quad order " + std::to_string(order);
     return invert_dense_matrix(std::move(vandermonde), n, label.c_str());
 }
-constexpr std::array<Real, 13> kPyramid13CenterRedistribution = {
-    Real(-0.25), Real(-0.25), Real(-0.25), Real(-0.25),
-    Real(0),
-    Real(0.5), Real(0.5), Real(0.5), Real(0.5),
-    Real(0), Real(0), Real(0), Real(0)
-};
-
 constexpr std::array<std::array<int, 3>, 15> kWedge15MonomialExponents = {{
     {{0, 0, 0}},
     {{0, 0, 1}},
@@ -497,20 +490,8 @@ SerendipityBasis::SerendipityBasis(ElementType type, int order, bool geometry_mo
                 "SerendipityBasis supports up to quadratic on wedge15",
                 __FILE__, __LINE__, __func__);
         }
-    } else if (type == ElementType::Pyramid13) {
-        dimension_ = 3;
-        if (order_ < 2) {
-            order_ = 2;
-        }
-        if (order_ == 2) {
-            size_ = 13;
-        } else {
-            throw BasisConfigurationException(
-                "SerendipityBasis supports up to quadratic on pyramid13",
-                __FILE__, __LINE__, __func__);
-        }
     } else {
-        throw BasisElementCompatibilityException("SerendipityBasis supports Quad4/Quad8, Hex8/Hex20, Wedge15, and Pyramid13 elements",
+        throw BasisElementCompatibilityException("SerendipityBasis supports Quad4/Quad8, Hex8/Hex20, and Wedge15 elements",
                                                  __FILE__, __LINE__, __func__);
     }
 
@@ -522,17 +503,6 @@ SerendipityBasis::SerendipityBasis(ElementType type, int order, bool geometry_mo
     }
 }
 
-bool SerendipityBasis::cache_identity_words(std::vector<std::uint64_t>& words) const {
-    words.push_back(0x736572656e646970ULL);
-    words.push_back(static_cast<std::uint64_t>(basis_type()));
-    words.push_back(static_cast<std::uint64_t>(element_type_));
-    words.push_back(static_cast<std::uint64_t>(dimension_));
-    words.push_back(static_cast<std::uint64_t>(order_));
-    words.push_back(static_cast<std::uint64_t>(size_));
-    words.push_back(geometry_mode_ ? 1u : 0u);
-    return true;
-}
-
 void SerendipityBasis::evaluate_values(const math::Vector<Real, 3>& xi,
                                        std::vector<Real>& values) const {
     values.assign(size_, Real(0));
@@ -617,15 +587,6 @@ void SerendipityBasis::evaluate_values(const math::Vector<Real, 3>& xi,
         return;
     }
 
-    if (element_type_ == ElementType::Pyramid13) {
-        static const LagrangeBasis parent(ElementType::Pyramid14, 2);
-        std::array<Real, 14> parent_values{};
-        parent.evaluate_values_to(xi, parent_values.data());
-        for (std::size_t i = 0; i < 13; ++i) {
-            values[i] = parent_values[i] + kPyramid13CenterRedistribution[i] * parent_values[13];
-        }
-        return;
-    }
 }
 
 void SerendipityBasis::evaluate_gradients(const math::Vector<Real, 3>& xi,
@@ -762,25 +723,6 @@ void SerendipityBasis::evaluate_gradients(const math::Vector<Real, 3>& xi,
         return;
     }
 
-    if (element_type_ == ElementType::Pyramid13) {
-        static const LagrangeBasis parent(ElementType::Pyramid14, 2);
-        std::array<Real, 14u * 3u> parent_gradients{};
-        // Pyramid13 inherits the complete-family pyramid apex contract from the
-        // parent basis rather than introducing a separate regularized path.
-        parent.evaluate_gradients_to(xi, parent_gradients.data());
-        const auto parent_gradient = [&](std::size_t node, std::size_t component) {
-            return parent_gradients[node * 3u + component];
-        };
-        for (std::size_t i = 0; i < 13; ++i) {
-            for (std::size_t c = 0; c < 3u; ++c) {
-                gradients[i][c] =
-                    parent_gradient(i, c) +
-                    kPyramid13CenterRedistribution[i] * parent_gradient(13u, c);
-            }
-        }
-        return;
-    }
-
     throw BasisEvaluationException("SerendipityBasis::evaluate_gradients: unsupported serendipity configuration",
                                    __FILE__, __LINE__, __func__);
 }
@@ -859,20 +801,6 @@ void SerendipityBasis::evaluate_hessians(const math::Vector<Real, 3>& xi,
         return;
     }
 
-    if (element_type_ == ElementType::Pyramid13) {
-        static const LagrangeBasis parent(ElementType::Pyramid14, 2);
-        std::array<Real, 14u * 9u> parent_hessians{};
-        // Pyramid13 inherits the complete-family pyramid apex contract from the
-        // parent basis rather than introducing a separate regularized path.
-        parent.evaluate_hessians_to(xi, parent_hessians.data());
-        const Hessian center_hessian = load_hessian(parent_hessians.data() + 13u * 9u);
-        for (std::size_t i = 0; i < 13; ++i) {
-            hessians[i] = load_hessian(parent_hessians.data() + i * 9u);
-            add_scaled_hessian(hessians[i], center_hessian, kPyramid13CenterRedistribution[i]);
-        }
-        return;
-    }
-
     throw BasisEvaluationException("SerendipityBasis::evaluate_hessians: unsupported serendipity configuration",
                                    __FILE__, __LINE__, __func__);
 }
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.h b/Code/Source/solver/FE/Basis/SerendipityBasis.h
index 98c01415a..10e426164 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.h
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.h
@@ -11,17 +11,11 @@
 /**
  * @file SerendipityBasis.h
  * @brief Reduced-degree-of-freedom serendipity bases
- *
- * `Pyramid13` inherits its apex contract from the complete-family rational
- * pyramid basis: values remain exact at the apex, while exact-apex gradient
- * and Hessian queries throw because the inherited nodal derivative limit is
- * not unique.
  */
 
 #include "BasisFunction.h"
 
 #include <array>
-#include <cstdint>
 
 namespace svmp {
 namespace FE {
@@ -37,7 +31,6 @@ class SerendipityBasis : public BasisFunction {
     int order() const noexcept override { return order_; }
     std::size_t size() const noexcept override { return size_; }
     const std::vector<math::Vector<Real, 3>>& nodes() const noexcept { return nodes_; }
-    bool cache_identity_words(std::vector<std::uint64_t>& words) const override;
 
     void evaluate_values(const math::Vector<Real, 3>& xi,
                          std::vector<Real>& values) const override;
diff --git a/Code/Source/solver/FE/Basis/VectorBasis.h b/Code/Source/solver/FE/Basis/VectorBasis.h
deleted file mode 100644
index d442c2160..000000000
--- a/Code/Source/solver/FE/Basis/VectorBasis.h
+++ /dev/null
@@ -1,255 +0,0 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
-
-#ifndef SVMP_FE_BASIS_VECTORBASIS_H
-#define SVMP_FE_BASIS_VECTORBASIS_H
-
-/**
- * @file VectorBasis.h
- * @brief Vector-valued bases for H(div) and H(curl) conforming spaces
- */
-
-#include "BasisFunction.h"
-#include "VectorBasisModalPolynomial.h"
-#include <array>
-#include <cstddef>
-#include <vector>
-
-namespace svmp {
-namespace FE {
-namespace basis {
-
-/**
- * @brief DOF entity type for vector-valued basis functions
- */
-enum class DofEntity {
-    Vertex,   ///< DOF associated with a vertex
-    Edge,     ///< DOF associated with an edge (tangential moments for H(curl))
-    Face,     ///< DOF associated with a face (normal moments for H(div), tangential for H(curl))
-    Interior  ///< DOF associated with element interior
-};
-
-/**
- * @brief DOF association metadata for a single DOF
- */
-struct DofAssociation {
-    DofEntity entity_type{DofEntity::Interior};
-    int entity_id{-1};      ///< Local index of the entity (edge/face/vertex)
-    int moment_index{0};    ///< Index within the entity's moment space
-};
-
-struct SparseModalCoefficientMatrix {
-    std::size_t rows{0};
-    std::size_t cols{0};
-    std::vector<std::size_t> row_offsets;
-    std::vector<std::size_t> dofs;
-    std::vector<Real> coefficients;
-};
-
-class VectorBasisFunction : public BasisFunction {
-public:
-    bool is_vector_valued() const noexcept override { return true; }
-    bool supports_vector_jacobians() const noexcept override { return true; }
-    void evaluate_values(const math::Vector<Real, 3>&,
-                         std::vector<Real>&) const override {
-        throw BasisEvaluationException("Vector basis uses evaluate_vector_values",
-                                       __FILE__, __LINE__, __func__);
-    }
-
-    void evaluate_vector_at_quadrature_points_strided(
-        const std::vector<math::Vector<Real, 3>>& points,
-        std::size_t output_stride,
-        Real* SVMP_RESTRICT values_out,
-        Real* SVMP_RESTRICT jacobians_out,
-        Real* SVMP_RESTRICT curls_out,
-        Real* SVMP_RESTRICT divergence_out) const override;
-
-    /**
-     * @brief Get DOF association metadata for all basis functions
-     *
-     * Returns a vector of size(), where each entry describes which
-     * geometric entity (vertex/edge/face/interior) the corresponding
-     * DOF is associated with. This is essential for orientation-aware
-     * assembly of H(div) and H(curl) spaces.
-     */
-    virtual std::vector<DofAssociation> dof_associations() const {
-        // Default: all interior DOFs (subclasses should override)
-        std::vector<DofAssociation> result(size());
-        for (std::size_t i = 0; i < size(); ++i) {
-            result[i].entity_type = DofEntity::Interior;
-            result[i].entity_id = 0;
-            result[i].moment_index = static_cast<int>(i);
-        }
-        return result;
-    }
-};
-
-/**
- * @brief Raviart-Thomas H(div) basis on supported element families
- */
-class RaviartThomasBasis : public VectorBasisFunction {
-public:
-    RaviartThomasBasis(ElementType type, int order = 0);
-
-    BasisType basis_type() const noexcept override { return BasisType::RaviartThomas; }
-    ElementType element_type() const noexcept override { return element_type_; }
-    int dimension() const noexcept override { return dimension_; }
-    int order() const noexcept override { return order_; }
-    std::size_t size() const noexcept override { return size_; }
-    bool cache_identity_is_structural() const noexcept override { return true; }
-
-    void evaluate_vector_values(const math::Vector<Real, 3>& xi,
-                                std::vector<math::Vector<Real, 3>>& values) const override;
-    void evaluate_vector_jacobians(const math::Vector<Real, 3>& xi,
-                                   std::vector<VectorJacobian>& jacobians) const override;
-    void evaluate_divergence(const math::Vector<Real, 3>& xi,
-                             std::vector<Real>& divergence) const override;
-    bool supports_divergence() const noexcept override { return true; }
-    void evaluate_vector_at_quadrature_points_strided(
-        const std::vector<math::Vector<Real, 3>>& points,
-        std::size_t output_stride,
-        Real* SVMP_RESTRICT values_out,
-        Real* SVMP_RESTRICT jacobians_out,
-        Real* SVMP_RESTRICT curls_out,
-        Real* SVMP_RESTRICT divergence_out) const override;
-
-    /// Get DOF associations (face/edge DOFs for 2D, face DOFs for 3D H(div))
-    std::vector<DofAssociation> dof_associations() const override;
-
-private:
-    using ModalTerm = VectorBasisModalTerm;
-    using ModalPolynomial = VectorBasisModalPolynomial;
-    using SeedJacobianEvaluator = void (*)(
-        const math::Vector<Real, 3>&,
-        std::vector<VectorJacobian>&);
-
-    ElementType element_type_;
-    int dimension_;
-    int order_;
-    std::size_t size_{0};
-
-    bool nodal_generated_{false};
-    bool use_transformed_direct_seed_{false};  ///< True for wedge/pyramid RT(k=1,2) transformed from direct seed functions
-    std::vector<int> transformed_seed_indices_;
-    std::vector<std::array<int, 4>> transformed_monomial_candidates_; ///< {component, px, py, pz}
-    std::vector<ModalPolynomial> monomials_;
-    std::array<int, 3> modal_power_limits_{{0, 0, 0}};
-    std::array<int, 3> transformed_power_limits_{{0, 0, 0}};
-    SeedJacobianEvaluator transformed_seed_jacobian_evaluator_{nullptr};
-    // Sparse coefficients for nodal basis in modal monomial basis:
-    //   phi_j = sum_p c(p,j) * modal_p.
-    // Rows index modal functions; entries target nodal DOFs.
-    SparseModalCoefficientMatrix modal_sparse_coeffs_;
-    SparseModalCoefficientMatrix transformed_sparse_coeffs_;
-};
-
-/**
- * @brief First-kind Nedelec H(curl) basis on supported element families
- */
-class NedelecBasis : public VectorBasisFunction {
-public:
-    NedelecBasis(ElementType type, int order = 0);
-
-    BasisType basis_type() const noexcept override { return BasisType::Nedelec; }
-    ElementType element_type() const noexcept override { return element_type_; }
-    int dimension() const noexcept override { return dimension_; }
-    int order() const noexcept override { return order_; }
-    std::size_t size() const noexcept override { return size_; }
-    bool cache_identity_is_structural() const noexcept override { return true; }
-
-    void evaluate_vector_values(const math::Vector<Real, 3>& xi,
-                                std::vector<math::Vector<Real, 3>>& values) const override;
-    void evaluate_vector_jacobians(const math::Vector<Real, 3>& xi,
-                                   std::vector<VectorJacobian>& jacobians) const override;
-    void evaluate_curl(const math::Vector<Real, 3>& xi,
-                       std::vector<math::Vector<Real, 3>>& curl) const override;
-    bool supports_curl() const noexcept override { return true; }
-    void evaluate_vector_at_quadrature_points_strided(
-        const std::vector<math::Vector<Real, 3>>& points,
-        std::size_t output_stride,
-        Real* SVMP_RESTRICT values_out,
-        Real* SVMP_RESTRICT jacobians_out,
-        Real* SVMP_RESTRICT curls_out,
-        Real* SVMP_RESTRICT divergence_out) const override;
-
-    /// Get DOF associations (edge DOFs for H(curl), face DOFs for 3D interior)
-    std::vector<DofAssociation> dof_associations() const override;
-
-private:
-    using ModalTerm = VectorBasisModalTerm;
-    using ModalPolynomial = VectorBasisModalPolynomial;
-    using SeedJacobianEvaluator = void (*)(
-        const math::Vector<Real, 3>&,
-        std::vector<VectorJacobian>&);
-
-    ElementType element_type_;
-    int dimension_;
-    int order_;
-    std::size_t size_{0};
-
-    bool nodal_generated_{false};
-    bool use_transformed_direct_seed_{false};  ///< True for wedge/pyramid ND(k=1,2) transformed from direct seed/candidate functions
-    std::vector<std::array<int, 4>> transformed_monomial_candidates_; ///< {component, px, py, pz}
-    std::vector<ModalPolynomial> monomials_;
-    SparseModalCoefficientMatrix modal_sparse_coeffs_;
-    SparseModalCoefficientMatrix transformed_sparse_coeffs_;
-    std::array<int, 3> modal_power_limits_{{0, 0, 0}};
-    std::array<int, 3> transformed_power_limits_{{0, 0, 0}};
-    SeedJacobianEvaluator transformed_seed_jacobian_evaluator_{nullptr};
-};
-
-/**
- * @brief Brezzi-Douglas-Marini basis (simple linear variant)
- */
-class BDMBasis : public VectorBasisFunction {
-public:
-    BDMBasis(ElementType type, int order = 1);
-
-    BasisType basis_type() const noexcept override { return BasisType::BDM; }
-    ElementType element_type() const noexcept override { return element_type_; }
-    int dimension() const noexcept override { return dimension_; }
-    int order() const noexcept override { return order_; }
-    std::size_t size() const noexcept override { return size_; }
-    bool cache_identity_is_structural() const noexcept override { return true; }
-
-    void evaluate_vector_values(const math::Vector<Real, 3>& xi,
-                                std::vector<math::Vector<Real, 3>>& values) const override;
-    void evaluate_vector_jacobians(const math::Vector<Real, 3>& xi,
-                                   std::vector<VectorJacobian>& jacobians) const override;
-    void evaluate_divergence(const math::Vector<Real, 3>& xi,
-                             std::vector<Real>& divergence) const override;
-    bool supports_divergence() const noexcept override { return true; }
-    void evaluate_vector_at_quadrature_points_strided(
-        const std::vector<math::Vector<Real, 3>>& points,
-        std::size_t output_stride,
-        Real* SVMP_RESTRICT values_out,
-        Real* SVMP_RESTRICT jacobians_out,
-        Real* SVMP_RESTRICT curls_out,
-        Real* SVMP_RESTRICT divergence_out) const override;
-
-    /// Get DOF associations (face/edge DOFs for H(div))
-    std::vector<DofAssociation> dof_associations() const override;
-
-private:
-    using ModalTerm = VectorBasisModalTerm;
-    using ModalPolynomial = VectorBasisModalPolynomial;
-
-    ElementType element_type_;
-    int dimension_;
-    int order_;
-    std::size_t size_{0};
-    bool nodal_generated_{false};
-    std::vector<ModalPolynomial> monomials_;
-    SparseModalCoefficientMatrix modal_sparse_coeffs_;
-    std::array<int, 3> modal_power_limits_{{0, 0, 0}};
-};
-
-} // namespace basis
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_BASIS_VECTORBASIS_H
diff --git a/Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.cpp b/Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.cpp
deleted file mode 100644
index 7ec848633..000000000
--- a/Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.cpp
+++ /dev/null
@@ -1,593 +0,0 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
-
-#include "VectorBasisEvaluationHelpers.h"
-
-#include <algorithm>
-#include <cmath>
-#include <limits>
-#include <string>
-
-namespace svmp {
-namespace FE {
-namespace basis {
-namespace detail {
-namespace vector_common {
-
-VectorBasisScratch& vector_basis_scratch() {
-    // Scratch is intentionally thread-local: production assembly uses a
-    // persistent worker-thread team, so buffers stay warm on each worker.
-    static thread_local VectorBasisScratch scratch;
-    return scratch;
-}
-
-void prewarm_vector_basis_scratch(std::size_t max_size, std::size_t max_qpts) {
-    vector_basis_scratch().prewarm(max_size, max_qpts);
-}
-
-void fill_powers(Real x, int max_p, std::vector<Real>& out) {
-    BASIS_CHECK_CONSTRUCTION(max_p >= 0, "powers: negative max_p");
-    out.assign(static_cast<std::size_t>(max_p + 1), Real(1));
-    for (int i = 1; i <= max_p; ++i) {
-        out[static_cast<std::size_t>(i)] =
-            out[static_cast<std::size_t>(i - 1)] * x;
-    }
-}
-
-void fill_power_tables(const Vec3& xi,
-                       const std::array<int, 3>& limits,
-                       VectorBasisScratch& scratch) {
-    fill_powers(xi[0], limits[0], scratch.px);
-    fill_powers(xi[1], limits[1], scratch.py);
-    fill_powers(xi[2], limits[2], scratch.pz);
-}
-
-namespace {
-
-constexpr Real kSparseCoefficientRelativeTolerance =
-    Real(256) * std::numeric_limits<Real>::epsilon();
-
-void fill_batched_axis_powers(const std::vector<Vec3>& points,
-                              std::size_t axis,
-                              int max_power,
-                              std::vector<Real>& out) {
-    BASIS_CHECK_CONSTRUCTION(max_power >= 0, "batched powers: negative max_p");
-    const std::size_t num_qpts = points.size();
-    out.assign(static_cast<std::size_t>(max_power + 1) * num_qpts, Real(1));
-    if (num_qpts == 0 || max_power == 0) {
-        return;
-    }
-
-    Real* first_power = out.data() + num_qpts;
-    for (std::size_t q = 0; q < num_qpts; ++q) {
-        first_power[q] = points[q][axis];
-    }
-    for (int power = 2; power <= max_power; ++power) {
-        const Real* previous =
-            out.data() + static_cast<std::size_t>(power - 1) * num_qpts;
-        Real* current = out.data() + static_cast<std::size_t>(power) * num_qpts;
-        for (std::size_t q = 0; q < num_qpts; ++q) {
-            current[q] = previous[q] * points[q][axis];
-        }
-    }
-}
-
-} // namespace
-
-void fill_batched_power_tables(const std::vector<Vec3>& points,
-                               const std::array<int, 3>& limits,
-                               VectorBasisScratch& scratch) {
-    fill_batched_axis_powers(points, 0u, limits[0], scratch.batched_px);
-    fill_batched_axis_powers(points, 1u, limits[1], scratch.batched_py);
-    fill_batched_axis_powers(points, 2u, limits[2], scratch.batched_pz);
-}
-
-void validate_vector_strided_outputs(std::size_t num_qpts,
-                                     std::size_t output_stride,
-                                     const char* family_name) {
-    if (output_stride < num_qpts) {
-        throw BasisConfigurationException(
-            std::string(family_name) +
-                " strided vector evaluation requires output_stride >= points.size()",
-            __FILE__, __LINE__, __func__);
-    }
-}
-
-void zero_active_strided_rows(Real* output,
-                              std::size_t rows,
-                              std::size_t output_stride,
-                              std::size_t num_qpts) {
-    for (std::size_t row = 0; row < rows; ++row) {
-        std::fill_n(output + row * output_stride, num_qpts, Real(0));
-    }
-}
-
-SparseModalCoefficientMatrix build_sparse_modal_coefficients(
-    const std::vector<Real>& dense_coefficients,
-    std::size_t rows,
-    std::size_t cols) {
-    BASIS_CHECK_CONSTRUCTION(dense_coefficients.size() == rows * cols,
-                 "build_sparse_modal_coefficients: dense coefficient size mismatch");
-
-    SparseModalCoefficientMatrix sparse;
-    sparse.rows = rows;
-    sparse.cols = cols;
-    sparse.row_offsets.reserve(rows + 1u);
-    sparse.row_offsets.push_back(0u);
-
-    Real max_abs = Real(0);
-    for (const Real coefficient : dense_coefficients) {
-        max_abs = std::max(max_abs, std::abs(coefficient));
-    }
-    const Real prune_threshold = kSparseCoefficientRelativeTolerance * max_abs;
-
-    for (std::size_t row = 0; row < rows; ++row) {
-        const Real* dense_row = dense_coefficients.data() + row * cols;
-        for (std::size_t col = 0; col < cols; ++col) {
-            const Real coefficient = dense_row[col];
-            if (std::abs(coefficient) > prune_threshold) {
-                sparse.dofs.push_back(col);
-                sparse.coefficients.push_back(coefficient);
-            }
-        }
-        sparse.row_offsets.push_back(sparse.dofs.size());
-    }
-
-    return sparse;
-}
-
-Vec3 curl_from_jacobian(const VectorJacobian& J) noexcept {
-    return Vec3{J(2u, 1u) - J(1u, 2u),
-                J(0u, 2u) - J(2u, 0u),
-                J(1u, 0u) - J(0u, 1u)};
-}
-
-Real divergence_from_jacobian(const VectorJacobian& J) noexcept {
-    return J(0u, 0u) + J(1u, 1u) + J(2u, 2u);
-}
-
-void write_vector_values_strided(const std::vector<Vec3>& values,
-                                 std::size_t num_dofs,
-                                 std::size_t output_stride,
-                                 std::size_t q,
-                                 Real* SVMP_RESTRICT values_out) {
-    if (values_out == nullptr) {
-        return;
-    }
-    BASIS_CHECK_CONSTRUCTION(values.size() == num_dofs,
-                 "vector value evaluation returned the wrong number of DOFs");
-    for (std::size_t dof = 0; dof < num_dofs; ++dof) {
-        for (std::size_t component = 0; component < 3u; ++component) {
-            values_out[(dof * 3u + component) * output_stride + q] =
-                values[dof][component];
-        }
-    }
-}
-
-void write_vector_jacobians_strided(const std::vector<VectorJacobian>& jacobians,
-                                    std::size_t num_dofs,
-                                    std::size_t output_stride,
-                                    std::size_t q,
-                                    Real* SVMP_RESTRICT jacobians_out) {
-    if (jacobians_out == nullptr) {
-        return;
-    }
-    BASIS_CHECK_CONSTRUCTION(jacobians.size() == num_dofs,
-                 "vector Jacobian evaluation returned the wrong number of DOFs");
-    for (std::size_t dof = 0; dof < num_dofs; ++dof) {
-        const auto& J = jacobians[dof];
-        for (std::size_t component = 0; component < 3u; ++component) {
-            for (std::size_t derivative = 0; derivative < 3u; ++derivative) {
-                jacobians_out[(dof * 9u + component * 3u + derivative) *
-                                  output_stride + q] = J(component, derivative);
-            }
-        }
-    }
-}
-
-void write_vector_curl_strided(const std::vector<Vec3>& curl,
-                               std::size_t num_dofs,
-                               std::size_t output_stride,
-                               std::size_t q,
-                               Real* SVMP_RESTRICT curls_out) {
-    if (curls_out == nullptr) {
-        return;
-    }
-    BASIS_CHECK_CONSTRUCTION(curl.size() == num_dofs,
-                 "vector curl evaluation returned the wrong number of DOFs");
-    for (std::size_t dof = 0; dof < num_dofs; ++dof) {
-        for (std::size_t component = 0; component < 3u; ++component) {
-            curls_out[(dof * 3u + component) * output_stride + q] =
-                curl[dof][component];
-        }
-    }
-}
-
-void write_vector_divergence_strided(const std::vector<Real>& divergence,
-                                     std::size_t num_dofs,
-                                     std::size_t output_stride,
-                                     std::size_t q,
-                                     Real* SVMP_RESTRICT divergence_out) {
-    if (divergence_out == nullptr) {
-        return;
-    }
-    BASIS_CHECK_CONSTRUCTION(divergence.size() == num_dofs,
-                 "vector divergence evaluation returned the wrong number of DOFs");
-    for (std::size_t dof = 0; dof < num_dofs; ++dof) {
-        divergence_out[dof * output_stride + q] = divergence[dof];
-    }
-}
-
-void write_curl_and_divergence_from_jacobians_strided(
-    const std::vector<VectorJacobian>& jacobians,
-    std::size_t num_dofs,
-    std::size_t output_stride,
-    std::size_t q,
-    Real* SVMP_RESTRICT curls_out,
-    Real* SVMP_RESTRICT divergence_out) {
-    BASIS_CHECK_CONSTRUCTION(jacobians.size() == num_dofs,
-                 "vector Jacobian evaluation returned the wrong number of DOFs");
-    for (std::size_t dof = 0; dof < num_dofs; ++dof) {
-        const auto& J = jacobians[dof];
-        if (curls_out != nullptr) {
-            const Vec3 curl = curl_from_jacobian(J);
-            for (std::size_t component = 0; component < 3u; ++component) {
-                curls_out[(dof * 3u + component) * output_stride + q] =
-                    curl[component];
-            }
-        }
-        if (divergence_out != nullptr) {
-            divergence_out[dof * output_stride + q] = divergence_from_jacobian(J);
-        }
-    }
-}
-
-Vec3 lerp(const Vec3& a, const Vec3& b, Real s) {
-    const Real t = (s + Real(1)) * Real(0.5);
-    return a * (Real(1) - t) + b * t;
-}
-
-Vec3 bilinear(const std::array<Vec3, 4>& v, Real u, Real w) {
-    const Real N0 = Real(0.25) * (Real(1) - u) * (Real(1) - w);
-    const Real N1 = Real(0.25) * (Real(1) + u) * (Real(1) - w);
-    const Real N2 = Real(0.25) * (Real(1) + u) * (Real(1) + w);
-    const Real N3 = Real(0.25) * (Real(1) - u) * (Real(1) + w);
-    return v[0] * N0 + v[1] * N1 + v[2] * N2 + v[3] * N3;
-}
-
-Vec3 bilinear_du(const std::array<Vec3, 4>& v, Real u, Real w) {
-    (void)u;
-    const Real dN0 = -Real(0.25) * (Real(1) - w);
-    const Real dN1 =  Real(0.25) * (Real(1) - w);
-    const Real dN2 =  Real(0.25) * (Real(1) + w);
-    const Real dN3 = -Real(0.25) * (Real(1) + w);
-    return v[0] * dN0 + v[1] * dN1 + v[2] * dN2 + v[3] * dN3;
-}
-
-Vec3 bilinear_dw(const std::array<Vec3, 4>& v, Real u, Real w) {
-    (void)w;
-    const Real dN0 = -Real(0.25) * (Real(1) - u);
-    const Real dN1 = -Real(0.25) * (Real(1) + u);
-    const Real dN2 =  Real(0.25) * (Real(1) + u);
-    const Real dN3 =  Real(0.25) * (Real(1) - u);
-    return v[0] * dN0 + v[1] * dN1 + v[2] * dN2 + v[3] * dN3;
-}
-
-Vec3 cross3(const Vec3& a, const Vec3& b) {
-    return Vec3{a[1] * b[2] - a[2] * b[1],
-                a[2] * b[0] - a[0] * b[2],
-                a[0] * b[1] - a[1] * b[0]};
-}
-
-Vec3 normalize3(const Vec3& v) {
-    const Real n = v.norm();
-    BASIS_CHECK_CONSTRUCTION(n > std::numeric_limits<Real>::epsilon(),
-                 "normalize3: zero-length vector");
-    return v / n;
-}
-
-std::array<int, 3> component_monomial_power_limits(
-    const std::vector<std::array<int, 4>>& candidates) {
-    std::array<int, 3> limits{{0, 0, 0}};
-    for (const auto& mono : candidates) {
-        limits[0] = std::max(limits[0], mono[1]);
-        limits[1] = std::max(limits[1], mono[2]);
-        limits[2] = std::max(limits[2], mono[3]);
-    }
-    return limits;
-}
-
-std::size_t triangle_poly_dim(std::size_t k) {
-    return (k + 1u) * (k + 2u) / 2u;
-}
-
-std::size_t tetra_poly_dim(std::size_t k) {
-    return (k + 1u) * (k + 2u) * (k + 3u) / 6u;
-}
-
-std::size_t rt_wedge_size(int order) {
-    const std::size_t k = static_cast<std::size_t>(order);
-    const std::size_t face_dofs =
-        2u * triangle_poly_dim(k) + 3u * (k + 1u) * (k + 1u);
-    const std::size_t interior_dofs =
-        (k >= 1u) ? (3u * k * (k + 1u) * (k + 1u) / 2u) : 0u;
-    return face_dofs + interior_dofs;
-}
-
-std::size_t rt_pyramid_size(int order) {
-    const std::size_t k = static_cast<std::size_t>(order);
-    const std::size_t face_dofs = (k + 1u) * (k + 1u) + 4u * triangle_poly_dim(k);
-    const std::size_t interior_dofs = (k >= 1u) ? (3u * k * k * k) : 0u;
-    return face_dofs + interior_dofs;
-}
-
-std::size_t nd_wedge_size(int order) {
-    const std::size_t k = static_cast<std::size_t>(order);
-    const std::size_t edge_dofs = 9u * (k + 1u);
-    const std::size_t face_dofs = (k >= 1u) ? (8u * k * (k + 1u)) : 0u;
-    const std::size_t interior_dofs =
-        (k >= 2u) ? (3u * k * (k - 1u) * (k + 1u) / 2u) : 0u;
-    return edge_dofs + face_dofs + interior_dofs;
-}
-
-std::size_t nd_pyramid_size(int order) {
-    const std::size_t k = static_cast<std::size_t>(order);
-    const std::size_t edge_dofs = 8u * (k + 1u);
-    const std::size_t face_dofs = (k >= 1u) ? (6u * k * (k + 1u)) : 0u;
-    const std::size_t interior_dofs =
-        (k >= 2u) ? (k * (k - 1u) * (k + 1u) / 2u) : 0u;
-    return edge_dofs + face_dofs + interior_dofs;
-}
-
-void ensure_supported_hybrid_vector_order(ElementType type,
-                                          int order,
-                                          const char* family_name) {
-    (void)type;
-    (void)order;
-    (void)family_name;
-}
-
-std::vector<std::array<int, 4>> make_component_monomial_candidates(
-    int max_total_degree) {
-    BASIS_CHECK_CONSTRUCTION(max_total_degree >= 0,
-                 "make_component_monomial_candidates: negative total degree");
-
-    std::vector<std::array<int, 4>> candidates;
-    for (int component = 0; component < 3; ++component) {
-        for (int total = 0; total <= max_total_degree; ++total) {
-            for (int pz = 0; pz <= total; ++pz) {
-                for (int py = 0; py <= total - pz; ++py) {
-                    const int px = total - py - pz;
-                    candidates.push_back({component, px, py, pz});
-                }
-            }
-        }
-    }
-    return candidates;
-}
-
-std::vector<std::array<int, 4>> make_rt_extra_monomial_candidates(ElementType type,
-                                                                  int order) {
-    if (order >= 3) {
-        return make_component_monomial_candidates(3 * order);
-    }
-
-    std::vector<std::array<int, 4>> candidates;
-    if (!is_pyramid(type) || order != 2) {
-        return candidates;
-    }
-
-    for (int component = 0; component < 3; ++component) {
-        for (int pz = 0; pz <= 2; ++pz) {
-            for (int py = 0; py <= 2 - pz; ++py) {
-                for (int px = 0; px <= 2 - py - pz; ++px) {
-                    candidates.push_back({component, px, py, pz});
-                }
-            }
-        }
-    }
-    return candidates;
-}
-
-Real eval_transformed_rt_monomial_scalar(const std::array<int, 4>& mono,
-                                         const std::vector<Real>& px,
-                                         const std::vector<Real>& py,
-                                         const std::vector<Real>& pz) {
-    return px[static_cast<std::size_t>(mono[1])] *
-           py[static_cast<std::size_t>(mono[2])] *
-           pz[static_cast<std::size_t>(mono[3])];
-}
-
-Real eval_transformed_rt_monomial_divergence(const std::array<int, 4>& mono,
-                                             const std::vector<Real>& px,
-                                             const std::vector<Real>& py,
-                                             const std::vector<Real>& pz) {
-    const int component = mono[0];
-    const int px_pow = mono[1];
-    const int py_pow = mono[2];
-    const int pz_pow = mono[3];
-
-    if (component == 0) {
-        if (px_pow == 0) {
-            return Real(0);
-        }
-        return Real(px_pow) *
-               px[static_cast<std::size_t>(px_pow - 1)] *
-               py[static_cast<std::size_t>(py_pow)] *
-               pz[static_cast<std::size_t>(pz_pow)];
-    }
-    if (component == 1) {
-        if (py_pow == 0) {
-            return Real(0);
-        }
-        return Real(py_pow) *
-               px[static_cast<std::size_t>(px_pow)] *
-               py[static_cast<std::size_t>(py_pow - 1)] *
-               pz[static_cast<std::size_t>(pz_pow)];
-    }
-    if (pz_pow == 0) {
-        return Real(0);
-    }
-    return Real(pz_pow) *
-           px[static_cast<std::size_t>(px_pow)] *
-           py[static_cast<std::size_t>(py_pow)] *
-           pz[static_cast<std::size_t>(pz_pow - 1)];
-}
-
-void add_component_monomial_jacobian(VectorJacobian& J,
-                                     int component,
-                                     int px_pow,
-                                     int py_pow,
-                                     int pz_pow,
-                                     Real coefficient,
-                                     const std::vector<Real>& px,
-                                     const std::vector<Real>& py,
-                                     const std::vector<Real>& pz) {
-    const auto comp = static_cast<std::size_t>(component);
-    if (px_pow > 0) {
-        J(comp, 0) += coefficient * Real(px_pow) *
-                      px[static_cast<std::size_t>(px_pow - 1)] *
-                      py[static_cast<std::size_t>(py_pow)] *
-                      pz[static_cast<std::size_t>(pz_pow)];
-    }
-    if (py_pow > 0) {
-        J(comp, 1) += coefficient * Real(py_pow) *
-                      px[static_cast<std::size_t>(px_pow)] *
-                      py[static_cast<std::size_t>(py_pow - 1)] *
-                      pz[static_cast<std::size_t>(pz_pow)];
-    }
-    if (pz_pow > 0) {
-        J(comp, 2) += coefficient * Real(pz_pow) *
-                      px[static_cast<std::size_t>(px_pow)] *
-                      py[static_cast<std::size_t>(py_pow)] *
-                      pz[static_cast<std::size_t>(pz_pow - 1)];
-    }
-}
-
-VectorJacobian eval_transformed_component_monomial_jacobian(
-    const std::array<int, 4>& mono,
-    const std::vector<Real>& px,
-    const std::vector<Real>& py,
-    const std::vector<Real>& pz) {
-    VectorJacobian J{};
-    add_component_monomial_jacobian(
-        J, mono[0], mono[1], mono[2], mono[3], Real(1), px, py, pz);
-    return J;
-}
-
-void add_component_monomial_curl(Vec3& curl,
-                                 int component,
-                                 int px_pow,
-                                 int py_pow,
-                                 int pz_pow,
-                                 Real coefficient,
-                                 const std::vector<Real>& px,
-                                 const std::vector<Real>& py,
-                                 const std::vector<Real>& pz) {
-    const Real dphidx = (px_pow == 0)
-        ? Real(0)
-        : coefficient * Real(px_pow) *
-              px[static_cast<std::size_t>(px_pow - 1)] *
-              py[static_cast<std::size_t>(py_pow)] *
-              pz[static_cast<std::size_t>(pz_pow)];
-    const Real dphidy = (py_pow == 0)
-        ? Real(0)
-        : coefficient * Real(py_pow) *
-              px[static_cast<std::size_t>(px_pow)] *
-              py[static_cast<std::size_t>(py_pow - 1)] *
-              pz[static_cast<std::size_t>(pz_pow)];
-    const Real dphidz = (pz_pow == 0)
-        ? Real(0)
-        : coefficient * Real(pz_pow) *
-              px[static_cast<std::size_t>(px_pow)] *
-              py[static_cast<std::size_t>(py_pow)] *
-              pz[static_cast<std::size_t>(pz_pow - 1)];
-
-    if (component == 0) {
-        curl[1] += dphidz;
-        curl[2] -= dphidy;
-    } else if (component == 1) {
-        curl[0] -= dphidz;
-        curl[2] += dphidx;
-    } else {
-        curl[0] += dphidy;
-        curl[1] -= dphidx;
-    }
-}
-
-std::vector<std::array<int, 4>> make_nd_extra_monomial_candidates(ElementType,
-                                                                  int order) {
-    if (order >= 3) {
-        return make_component_monomial_candidates(3 * order);
-    }
-
-    std::vector<std::array<int, 4>> candidates;
-    const int max_total_degree = (order == 1) ? 4 : 5;
-    for (int component = 0; component < 3; ++component) {
-        for (int total = 0; total <= max_total_degree; ++total) {
-            for (int pz = 0; pz <= total; ++pz) {
-                for (int py = 0; py <= total - pz; ++py) {
-                    const int px = total - py - pz;
-                    candidates.push_back({component, px, py, pz});
-                }
-            }
-        }
-    }
-    return candidates;
-}
-
-Real eval_transformed_nd_monomial_scalar(const std::array<int, 4>& mono,
-                                         const std::vector<Real>& px,
-                                         const std::vector<Real>& py,
-                                         const std::vector<Real>& pz) {
-    return px[static_cast<std::size_t>(mono[1])] *
-           py[static_cast<std::size_t>(mono[2])] *
-           pz[static_cast<std::size_t>(mono[3])];
-}
-
-Vec3 eval_transformed_nd_monomial_curl(const std::array<int, 4>& mono,
-                                       const std::vector<Real>& px,
-                                       const std::vector<Real>& py,
-                                       const std::vector<Real>& pz) {
-    const int component = mono[0];
-    const int px_pow = mono[1];
-    const int py_pow = mono[2];
-    const int pz_pow = mono[3];
-
-    const Real dphidx = (px_pow == 0)
-        ? Real(0)
-        : Real(px_pow) *
-              px[static_cast<std::size_t>(px_pow - 1)] *
-              py[static_cast<std::size_t>(py_pow)] *
-              pz[static_cast<std::size_t>(pz_pow)];
-    const Real dphidy = (py_pow == 0)
-        ? Real(0)
-        : Real(py_pow) *
-              px[static_cast<std::size_t>(px_pow)] *
-              py[static_cast<std::size_t>(py_pow - 1)] *
-              pz[static_cast<std::size_t>(pz_pow)];
-    const Real dphidz = (pz_pow == 0)
-        ? Real(0)
-        : Real(pz_pow) *
-              px[static_cast<std::size_t>(px_pow)] *
-              py[static_cast<std::size_t>(py_pow)] *
-              pz[static_cast<std::size_t>(pz_pow - 1)];
-
-    if (component == 0) {
-        return Vec3{Real(0), dphidz, -dphidy};
-    }
-    if (component == 1) {
-        return Vec3{-dphidz, Real(0), dphidx};
-    }
-    return Vec3{dphidy, -dphidx, Real(0)};
-}
-
-} // namespace vector_common
-} // namespace detail
-} // namespace basis
-} // namespace FE
-} // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.h b/Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.h
deleted file mode 100644
index e0e6daa10..000000000
--- a/Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.h
+++ /dev/null
@@ -1,751 +0,0 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
-
-#ifndef SVMP_FE_BASIS_VECTORBASISEVALUATIONHELPERS_H
-#define SVMP_FE_BASIS_VECTORBASISEVALUATIONHELPERS_H
-
-#include "VectorBasis.h"
-#include "Basis/BasisTraits.h"
-
-#include <algorithm>
-#include <array>
-#include <limits>
-#include <string>
-#include <vector>
-
-namespace svmp {
-namespace FE {
-namespace basis {
-namespace detail {
-namespace vector_common {
-
-using Vec3 = math::Vector<Real, 3>;
-
-struct VectorBasisScratch {
-    std::vector<Real> px;
-    std::vector<Real> py;
-    std::vector<Real> pz;
-    std::vector<Real> batched_px;
-    std::vector<Real> batched_py;
-    std::vector<Real> batched_pz;
-    std::vector<Real> candidate_values;
-    std::vector<Real> candidate_dx;
-    std::vector<Real> candidate_dy;
-    std::vector<Real> candidate_dz;
-    std::vector<Real> modal_values_batched;
-    std::vector<Real> modal_jacobians_batched;
-    std::vector<Real> modal_curls_batched;
-    std::vector<Real> modal_divergence_batched;
-    std::vector<Vec3> vector_values;
-    std::vector<VectorJacobian> vector_jacobians;
-    std::vector<Real> scalars;
-    std::vector<Vec3> api_values;
-    std::vector<VectorJacobian> api_jacobians;
-    std::vector<Vec3> api_curl;
-    std::vector<Real> api_divergence;
-
-    void prewarm(std::size_t max_size, std::size_t max_qpts) {
-        const std::size_t batched_size = max_size * std::max<std::size_t>(max_qpts, 1u);
-        px.reserve(max_size);
-        py.reserve(max_size);
-        pz.reserve(max_size);
-        batched_px.reserve(batched_size);
-        batched_py.reserve(batched_size);
-        batched_pz.reserve(batched_size);
-        candidate_values.reserve(max_size);
-        candidate_dx.reserve(max_size);
-        candidate_dy.reserve(max_size);
-        candidate_dz.reserve(max_size);
-        modal_values_batched.reserve(batched_size * 3u);
-        modal_jacobians_batched.reserve(batched_size * 9u);
-        modal_curls_batched.reserve(batched_size * 3u);
-        modal_divergence_batched.reserve(batched_size);
-        vector_values.reserve(max_size);
-        vector_jacobians.reserve(max_size);
-        scalars.reserve(max_size);
-        api_values.reserve(max_size);
-        api_jacobians.reserve(max_size);
-        api_curl.reserve(max_size);
-        api_divergence.reserve(max_size);
-    }
-};
-
-VectorBasisScratch& vector_basis_scratch();
-void prewarm_vector_basis_scratch(std::size_t max_size, std::size_t max_qpts = 0);
-
-void fill_powers(Real x, int max_p, std::vector<Real>& out);
-void fill_power_tables(const Vec3& xi,
-                       const std::array<int, 3>& limits,
-                       VectorBasisScratch& scratch);
-void fill_batched_power_tables(const std::vector<Vec3>& points,
-                               const std::array<int, 3>& limits,
-                               VectorBasisScratch& scratch);
-void validate_vector_strided_outputs(std::size_t num_qpts,
-                                     std::size_t output_stride,
-                                     const char* family_name);
-void zero_active_strided_rows(Real* output,
-                              std::size_t rows,
-                              std::size_t output_stride,
-                              std::size_t num_qpts);
-SparseModalCoefficientMatrix build_sparse_modal_coefficients(
-    const std::vector<Real>& dense_coefficients,
-    std::size_t rows,
-    std::size_t cols);
-Vec3 curl_from_jacobian(const VectorJacobian& J) noexcept;
-Real divergence_from_jacobian(const VectorJacobian& J) noexcept;
-
-inline Real batched_power_product(const std::vector<Real>& px,
-                                  const std::vector<Real>& py,
-                                  const std::vector<Real>& pz,
-                                  std::size_t stride,
-                                  int px_pow,
-                                  int py_pow,
-                                  int pz_pow,
-                                  std::size_t q) noexcept {
-    return px[static_cast<std::size_t>(px_pow) * stride + q] *
-           py[static_cast<std::size_t>(py_pow) * stride + q] *
-           pz[static_cast<std::size_t>(pz_pow) * stride + q];
-}
-
-inline Real batched_component_partial(const std::vector<Real>& px,
-                                      const std::vector<Real>& py,
-                                      const std::vector<Real>& pz,
-                                      std::size_t stride,
-                                      int px_pow,
-                                      int py_pow,
-                                      int pz_pow,
-                                      int derivative_axis,
-                                      std::size_t q) noexcept {
-    if (derivative_axis == 0) {
-        if (px_pow == 0) {
-            return Real(0);
-        }
-        return Real(px_pow) *
-               px[static_cast<std::size_t>(px_pow - 1) * stride + q] *
-               py[static_cast<std::size_t>(py_pow) * stride + q] *
-               pz[static_cast<std::size_t>(pz_pow) * stride + q];
-    }
-    if (derivative_axis == 1) {
-        if (py_pow == 0) {
-            return Real(0);
-        }
-        return Real(py_pow) *
-               px[static_cast<std::size_t>(px_pow) * stride + q] *
-               py[static_cast<std::size_t>(py_pow - 1) * stride + q] *
-               pz[static_cast<std::size_t>(pz_pow) * stride + q];
-    }
-    if (pz_pow == 0) {
-        return Real(0);
-    }
-    return Real(pz_pow) *
-           px[static_cast<std::size_t>(px_pow) * stride + q] *
-           py[static_cast<std::size_t>(py_pow) * stride + q] *
-           pz[static_cast<std::size_t>(pz_pow - 1) * stride + q];
-}
-
-inline Vec3 curl_from_component_gradient(int component,
-                                         Real dphidx,
-                                         Real dphidy,
-                                         Real dphidz) noexcept {
-    if (component == 0) {
-        return Vec3{Real(0), dphidz, -dphidy};
-    }
-    if (component == 1) {
-        return Vec3{-dphidz, Real(0), dphidx};
-    }
-    return Vec3{dphidy, -dphidx, Real(0)};
-}
-
-inline void axpy_qpoints(Real* target,
-                         const Real* source,
-                         Real coefficient,
-                         std::size_t num_qpts) noexcept {
-    for (std::size_t q = 0; q < num_qpts; ++q) {
-        target[q] += coefficient * source[q];
-    }
-}
-
-void write_vector_values_strided(const std::vector<Vec3>& values,
-                                 std::size_t num_dofs,
-                                 std::size_t output_stride,
-                                 std::size_t q,
-                                 Real* SVMP_RESTRICT values_out);
-void write_vector_jacobians_strided(const std::vector<VectorJacobian>& jacobians,
-                                    std::size_t num_dofs,
-                                    std::size_t output_stride,
-                                    std::size_t q,
-                                    Real* SVMP_RESTRICT jacobians_out);
-void write_vector_curl_strided(const std::vector<Vec3>& curl,
-                               std::size_t num_dofs,
-                               std::size_t output_stride,
-                               std::size_t q,
-                               Real* SVMP_RESTRICT curls_out);
-void write_vector_divergence_strided(const std::vector<Real>& divergence,
-                                     std::size_t num_dofs,
-                                     std::size_t output_stride,
-                                     std::size_t q,
-                                     Real* SVMP_RESTRICT divergence_out);
-void write_curl_and_divergence_from_jacobians_strided(
-    const std::vector<VectorJacobian>& jacobians,
-    std::size_t num_dofs,
-    std::size_t output_stride,
-    std::size_t q,
-    Real* SVMP_RESTRICT curls_out,
-    Real* SVMP_RESTRICT divergence_out);
-
-template <typename BasisLike>
-void evaluate_vector_public_api_strided(
-    const BasisLike& basis,
-    const std::vector<Vec3>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT jacobians_out,
-    Real* SVMP_RESTRICT curls_out,
-    Real* SVMP_RESTRICT divergence_out,
-    bool use_direct_curl,
-    bool use_direct_divergence,
-    const char* family_name) {
-    const std::size_t num_qpts = points.size();
-    const std::size_t num_dofs = basis.size();
-    validate_vector_strided_outputs(num_qpts, output_stride, family_name);
-
-    auto& scratch = vector_basis_scratch();
-    for (std::size_t q = 0; q < num_qpts; ++q) {
-        if (values_out != nullptr) {
-            basis.evaluate_vector_values(points[q], scratch.api_values);
-            write_vector_values_strided(
-                scratch.api_values, num_dofs, output_stride, q, values_out);
-        }
-
-        const bool needs_jacobians =
-            jacobians_out != nullptr ||
-            (curls_out != nullptr && !use_direct_curl) ||
-            (divergence_out != nullptr && !use_direct_divergence);
-
-        if (needs_jacobians) {
-            basis.evaluate_vector_jacobians(points[q], scratch.api_jacobians);
-            write_vector_jacobians_strided(
-                scratch.api_jacobians, num_dofs, output_stride, q, jacobians_out);
-            write_curl_and_divergence_from_jacobians_strided(
-                scratch.api_jacobians,
-                num_dofs,
-                output_stride,
-                q,
-                curls_out,
-                divergence_out);
-            continue;
-        }
-
-        if (curls_out != nullptr) {
-            basis.evaluate_curl(points[q], scratch.api_curl);
-            write_vector_curl_strided(
-                scratch.api_curl, num_dofs, output_stride, q, curls_out);
-        }
-        if (divergence_out != nullptr) {
-            basis.evaluate_divergence(points[q], scratch.api_divergence);
-            write_vector_divergence_strided(
-                scratch.api_divergence, num_dofs, output_stride, q, divergence_out);
-        }
-    }
-}
-
-Vec3 lerp(const Vec3& a, const Vec3& b, Real s);
-Vec3 bilinear(const std::array<Vec3, 4>& v, Real u, Real w);
-Vec3 bilinear_du(const std::array<Vec3, 4>& v, Real u, Real w);
-Vec3 bilinear_dw(const std::array<Vec3, 4>& v, Real u, Real w);
-Vec3 cross3(const Vec3& a, const Vec3& b);
-Vec3 normalize3(const Vec3& v);
-
-template <typename ModalPolynomials>
-std::array<int, 3> modal_power_limits(const ModalPolynomials& monomials) {
-    std::array<int, 3> limits{{0, 0, 0}};
-    for (const auto& poly : monomials) {
-        for (int t = 0; t < poly.num_terms; ++t) {
-            const auto& m = poly.terms[static_cast<std::size_t>(t)];
-            limits[0] = std::max(limits[0], m.px);
-            limits[1] = std::max(limits[1], m.py);
-            limits[2] = std::max(limits[2], m.pz);
-        }
-    }
-    return limits;
-}
-
-std::array<int, 3> component_monomial_power_limits(
-    const std::vector<std::array<int, 4>>& candidates);
-std::size_t triangle_poly_dim(std::size_t k);
-std::size_t tetra_poly_dim(std::size_t k);
-std::size_t rt_wedge_size(int order);
-std::size_t rt_pyramid_size(int order);
-std::size_t nd_wedge_size(int order);
-std::size_t nd_pyramid_size(int order);
-void ensure_supported_hybrid_vector_order(ElementType type,
-                                          int order,
-                                          const char* family_name);
-std::vector<std::array<int, 4>> make_component_monomial_candidates(int max_total_degree);
-std::vector<std::array<int, 4>> make_rt_extra_monomial_candidates(ElementType type,
-                                                                  int order);
-Real eval_transformed_rt_monomial_scalar(const std::array<int, 4>& mono,
-                                         const std::vector<Real>& px,
-                                         const std::vector<Real>& py,
-                                         const std::vector<Real>& pz);
-Real eval_transformed_rt_monomial_divergence(const std::array<int, 4>& mono,
-                                             const std::vector<Real>& px,
-                                             const std::vector<Real>& py,
-                                             const std::vector<Real>& pz);
-
-void add_component_monomial_jacobian(VectorJacobian& J,
-                                     int component,
-                                     int px_pow,
-                                     int py_pow,
-                                     int pz_pow,
-                                     Real coefficient,
-                                     const std::vector<Real>& px,
-                                     const std::vector<Real>& py,
-                                     const std::vector<Real>& pz);
-VectorJacobian eval_transformed_component_monomial_jacobian(
-    const std::array<int, 4>& mono,
-    const std::vector<Real>& px,
-    const std::vector<Real>& py,
-    const std::vector<Real>& pz);
-void add_component_monomial_curl(Vec3& curl,
-                                 int component,
-                                 int px_pow,
-                                 int py_pow,
-                                 int pz_pow,
-                                 Real coefficient,
-                                 const std::vector<Real>& px,
-                                 const std::vector<Real>& py,
-                                 const std::vector<Real>& pz);
-
-template <typename ModalPolynomials>
-void evaluate_nodal_modal_vector_values_with_limits(const ModalPolynomials& monomials,
-                                                    const SparseModalCoefficientMatrix& sparse_coeffs,
-                                                    std::size_t n,
-                                                    const Vec3& xi,
-                                                    const std::array<int, 3>& power_limits,
-                                                    std::vector<Vec3>& values) {
-    values.assign(n, Vec3{});
-
-    auto& scratch = vector_basis_scratch();
-    fill_power_tables(xi, power_limits, scratch);
-    const auto& px = scratch.px;
-    const auto& py = scratch.py;
-    const auto& pz = scratch.pz;
-
-    auto& modal_vals = scratch.vector_values;
-    modal_vals.assign(n, Vec3{});
-    for (std::size_t p = 0; p < n; ++p) {
-        const auto& poly = monomials[p];
-        auto& v = modal_vals[p];
-        for (int t = 0; t < poly.num_terms; ++t) {
-            const auto& m = poly.terms[static_cast<std::size_t>(t)];
-            const Real mv =
-                px[static_cast<std::size_t>(m.px)] *
-                py[static_cast<std::size_t>(m.py)] *
-                pz[static_cast<std::size_t>(m.pz)];
-            v[static_cast<std::size_t>(m.component)] += m.coefficient * mv;
-        }
-    }
-
-    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.rows == n &&
-                     sparse_coeffs.cols == n &&
-                     sparse_coeffs.row_offsets.size() == n + 1u,
-                 "evaluate_nodal_modal_vector_values: sparse coefficient size mismatch");
-    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.dofs.size() == sparse_coeffs.coefficients.size(),
-                 "evaluate_nodal_modal_vector_values: sparse coefficient entry mismatch");
-    for (std::size_t p = 0; p < n; ++p) {
-        const Vec3& mv = modal_vals[p];
-        const std::size_t row_begin = sparse_coeffs.row_offsets[p];
-        const std::size_t row_end = sparse_coeffs.row_offsets[p + 1u];
-        for (std::size_t entry = row_begin; entry < row_end; ++entry) {
-            const std::size_t dof = sparse_coeffs.dofs[entry];
-            const Real c = sparse_coeffs.coefficients[entry];
-            values[dof][0] += c * mv[0];
-            values[dof][1] += c * mv[1];
-            values[dof][2] += c * mv[2];
-        }
-    }
-}
-
-template <typename ModalPolynomials>
-void evaluate_nodal_modal_vector_jacobians_with_limits(const ModalPolynomials& monomials,
-                                                       const SparseModalCoefficientMatrix& sparse_coeffs,
-                                                       std::size_t n,
-                                                       const Vec3& xi,
-                                                       const std::array<int, 3>& power_limits,
-                                                       std::vector<VectorJacobian>& jacobians) {
-    jacobians.assign(n, VectorJacobian{});
-
-    auto& scratch = vector_basis_scratch();
-    fill_power_tables(xi, power_limits, scratch);
-    const auto& px = scratch.px;
-    const auto& py = scratch.py;
-    const auto& pz = scratch.pz;
-
-    auto& modal_jacs = scratch.vector_jacobians;
-    modal_jacs.assign(n, VectorJacobian{});
-    for (std::size_t p = 0; p < n; ++p) {
-        const auto& poly = monomials[p];
-        auto& J = modal_jacs[p];
-        for (int t = 0; t < poly.num_terms; ++t) {
-            const auto& m = poly.terms[static_cast<std::size_t>(t)];
-            add_component_monomial_jacobian(J, m.component, m.px, m.py, m.pz, m.coefficient, px, py, pz);
-        }
-    }
-
-    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.rows == n &&
-                     sparse_coeffs.cols == n &&
-                     sparse_coeffs.row_offsets.size() == n + 1u,
-                 "evaluate_nodal_modal_vector_jacobians: sparse coefficient size mismatch");
-    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.dofs.size() == sparse_coeffs.coefficients.size(),
-                 "evaluate_nodal_modal_vector_jacobians: sparse coefficient entry mismatch");
-    for (std::size_t p = 0; p < n; ++p) {
-        const auto& Jp = modal_jacs[p];
-        const std::size_t row_begin = sparse_coeffs.row_offsets[p];
-        const std::size_t row_end = sparse_coeffs.row_offsets[p + 1u];
-        for (std::size_t entry = row_begin; entry < row_end; ++entry) {
-            const std::size_t dof = sparse_coeffs.dofs[entry];
-            const Real c = sparse_coeffs.coefficients[entry];
-            for (std::size_t r = 0; r < 3; ++r) {
-                for (std::size_t col = 0; col < 3; ++col) {
-                    jacobians[dof](r, col) += c * Jp(r, col);
-                }
-            }
-        }
-    }
-}
-
-template <typename ModalPolynomials>
-void evaluate_nodal_modal_vector_curl_with_limits(const ModalPolynomials& monomials,
-                                                  const SparseModalCoefficientMatrix& sparse_coeffs,
-                                                  std::size_t n,
-                                                  const Vec3& xi,
-                                                  const std::array<int, 3>& power_limits,
-                                                  std::vector<Vec3>& curl) {
-    curl.assign(n, Vec3{});
-
-    auto& scratch = vector_basis_scratch();
-    fill_power_tables(xi, power_limits, scratch);
-    const auto& px = scratch.px;
-    const auto& py = scratch.py;
-    const auto& pz = scratch.pz;
-
-    auto& modal_curl = scratch.vector_values;
-    modal_curl.assign(n, Vec3{});
-    for (std::size_t p = 0; p < n; ++p) {
-        const auto& poly = monomials[p];
-        auto& c = modal_curl[p];
-        for (int t = 0; t < poly.num_terms; ++t) {
-            const auto& m = poly.terms[static_cast<std::size_t>(t)];
-            add_component_monomial_curl(c, m.component, m.px, m.py, m.pz, m.coefficient, px, py, pz);
-        }
-    }
-
-    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.rows == n &&
-                     sparse_coeffs.cols == n &&
-                     sparse_coeffs.row_offsets.size() == n + 1u,
-                 "evaluate_nodal_modal_vector_curl: sparse coefficient size mismatch");
-    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.dofs.size() == sparse_coeffs.coefficients.size(),
-                 "evaluate_nodal_modal_vector_curl: sparse coefficient entry mismatch");
-    for (std::size_t p = 0; p < n; ++p) {
-        const Vec3& cm = modal_curl[p];
-        const std::size_t row_begin = sparse_coeffs.row_offsets[p];
-        const std::size_t row_end = sparse_coeffs.row_offsets[p + 1u];
-        for (std::size_t entry = row_begin; entry < row_end; ++entry) {
-            const std::size_t dof = sparse_coeffs.dofs[entry];
-            const Real c = sparse_coeffs.coefficients[entry];
-            curl[dof][0] += c * cm[0];
-            curl[dof][1] += c * cm[1];
-            curl[dof][2] += c * cm[2];
-        }
-    }
-}
-
-template <typename ModalPolynomials>
-void evaluate_nodal_modal_divergence_with_limits(const ModalPolynomials& monomials,
-                                                 const SparseModalCoefficientMatrix& sparse_coeffs,
-                                                 std::size_t n,
-                                                 const Vec3& xi,
-                                                 const std::array<int, 3>& power_limits,
-                                                 std::vector<Real>& divergence) {
-    divergence.assign(n, Real(0));
-
-    auto& scratch = vector_basis_scratch();
-    fill_power_tables(xi, power_limits, scratch);
-    const auto& px = scratch.px;
-    const auto& py = scratch.py;
-    const auto& pz = scratch.pz;
-
-    auto& modal_divergence = scratch.scalars;
-    modal_divergence.assign(n, Real(0));
-    for (std::size_t p = 0; p < n; ++p) {
-        const auto& poly = monomials[p];
-        Real div = Real(0);
-        for (int t = 0; t < poly.num_terms; ++t) {
-            const auto& m = poly.terms[static_cast<std::size_t>(t)];
-            if (m.component == 0 && m.px > 0) {
-                div += m.coefficient * Real(m.px) *
-                       px[static_cast<std::size_t>(m.px - 1)] *
-                       py[static_cast<std::size_t>(m.py)] *
-                       pz[static_cast<std::size_t>(m.pz)];
-            } else if (m.component == 1 && m.py > 0) {
-                div += m.coefficient * Real(m.py) *
-                       px[static_cast<std::size_t>(m.px)] *
-                       py[static_cast<std::size_t>(m.py - 1)] *
-                       pz[static_cast<std::size_t>(m.pz)];
-            } else if (m.component == 2 && m.pz > 0) {
-                div += m.coefficient * Real(m.pz) *
-                       px[static_cast<std::size_t>(m.px)] *
-                       py[static_cast<std::size_t>(m.py)] *
-                       pz[static_cast<std::size_t>(m.pz - 1)];
-            }
-        }
-        modal_divergence[p] = div;
-    }
-
-    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.rows == n &&
-                     sparse_coeffs.cols == n &&
-                     sparse_coeffs.row_offsets.size() == n + 1u,
-                 "evaluate_nodal_modal_divergence: sparse coefficient size mismatch");
-    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.dofs.size() == sparse_coeffs.coefficients.size(),
-                 "evaluate_nodal_modal_divergence: sparse coefficient entry mismatch");
-    for (std::size_t p = 0; p < n; ++p) {
-        const Real div = modal_divergence[p];
-        if (div == Real(0)) {
-            continue;
-        }
-        const std::size_t row_begin = sparse_coeffs.row_offsets[p];
-        const std::size_t row_end = sparse_coeffs.row_offsets[p + 1u];
-        for (std::size_t entry = row_begin; entry < row_end; ++entry) {
-            divergence[sparse_coeffs.dofs[entry]] +=
-                sparse_coeffs.coefficients[entry] * div;
-        }
-    }
-}
-
-template <typename ModalPolynomials>
-void evaluate_nodal_modal_vector_strided_with_limits(
-    const ModalPolynomials& monomials,
-    const SparseModalCoefficientMatrix& sparse_coeffs,
-    std::size_t n,
-    const std::vector<Vec3>& points,
-    std::size_t output_stride,
-    const std::array<int, 3>& power_limits,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT jacobians_out,
-    Real* SVMP_RESTRICT curls_out,
-    Real* SVMP_RESTRICT divergence_out,
-    const char* family_name) {
-    const std::size_t num_qpts = points.size();
-    validate_vector_strided_outputs(num_qpts, output_stride, family_name);
-    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.rows == n &&
-                     sparse_coeffs.cols == n &&
-                     sparse_coeffs.row_offsets.size() == n + 1u,
-                 "evaluate_nodal_modal_vector_strided: sparse coefficient size mismatch");
-    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.dofs.size() == sparse_coeffs.coefficients.size(),
-                 "evaluate_nodal_modal_vector_strided: sparse coefficient entry mismatch");
-
-    auto& scratch = vector_basis_scratch();
-    const bool need_values = values_out != nullptr;
-    const bool need_jacobians = jacobians_out != nullptr;
-    const bool need_curls = curls_out != nullptr;
-    const bool need_divergence = divergence_out != nullptr;
-
-    if (need_values) {
-        zero_active_strided_rows(values_out, n * 3u, output_stride, num_qpts);
-    }
-    if (need_jacobians) {
-        zero_active_strided_rows(jacobians_out, n * 9u, output_stride, num_qpts);
-    }
-    if (need_curls) {
-        zero_active_strided_rows(curls_out, n * 3u, output_stride, num_qpts);
-    }
-    if (need_divergence) {
-        zero_active_strided_rows(divergence_out, n, output_stride, num_qpts);
-    }
-    if (num_qpts == 0 || n == 0) {
-        return;
-    }
-
-    fill_batched_power_tables(points, power_limits, scratch);
-    const auto& px = scratch.batched_px;
-    const auto& py = scratch.batched_py;
-    const auto& pz = scratch.batched_pz;
-    const std::size_t power_stride = num_qpts;
-    const bool need_modal_gradient = need_jacobians || need_curls || need_divergence;
-
-    auto& modal_values = scratch.modal_values_batched;
-    auto& modal_jacobians = scratch.modal_jacobians_batched;
-    auto& modal_curls = scratch.modal_curls_batched;
-    auto& modal_divergence = scratch.modal_divergence_batched;
-
-    for (std::size_t p = 0; p < n; ++p) {
-        if (need_values) {
-            modal_values.assign(3u * num_qpts, Real(0));
-        }
-        if (need_jacobians) {
-            modal_jacobians.assign(9u * num_qpts, Real(0));
-        }
-        if (need_curls) {
-            modal_curls.assign(3u * num_qpts, Real(0));
-        }
-        if (need_divergence) {
-            modal_divergence.assign(num_qpts, Real(0));
-        }
-
-        const auto& poly = monomials[p];
-        for (int term_index = 0; term_index < poly.num_terms; ++term_index) {
-            const auto& term = poly.terms[static_cast<std::size_t>(term_index)];
-            const std::size_t component = static_cast<std::size_t>(term.component);
-            Real* modal_value_row = need_values
-                ? modal_values.data() + component * num_qpts
-                : nullptr;
-            Real* modal_jacobian_row = need_jacobians
-                ? modal_jacobians.data() + component * 3u * num_qpts
-                : nullptr;
-            Real* modal_curl_rows = need_curls ? modal_curls.data() : nullptr;
-            Real* modal_divergence_row =
-                need_divergence ? modal_divergence.data() : nullptr;
-
-            if (need_values) {
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    modal_value_row[q] +=
-                        term.coefficient *
-                        batched_power_product(px,
-                                              py,
-                                              pz,
-                                              power_stride,
-                                              term.px,
-                                              term.py,
-                                              term.pz,
-                                              q);
-                }
-            }
-
-            if (need_modal_gradient) {
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    const Real dphidx =
-                        term.coefficient *
-                        batched_component_partial(px,
-                                                  py,
-                                                  pz,
-                                                  power_stride,
-                                                  term.px,
-                                                  term.py,
-                                                  term.pz,
-                                                  0,
-                                                  q);
-                    const Real dphidy =
-                        term.coefficient *
-                        batched_component_partial(px,
-                                                  py,
-                                                  pz,
-                                                  power_stride,
-                                                  term.px,
-                                                  term.py,
-                                                  term.pz,
-                                                  1,
-                                                  q);
-                    const Real dphidz =
-                        term.coefficient *
-                        batched_component_partial(px,
-                                                  py,
-                                                  pz,
-                                                  power_stride,
-                                                  term.px,
-                                                  term.py,
-                                                  term.pz,
-                                                  2,
-                                                  q);
-
-                    if (need_jacobians) {
-                        modal_jacobian_row[q] += dphidx;
-                        modal_jacobian_row[num_qpts + q] += dphidy;
-                        modal_jacobian_row[2u * num_qpts + q] += dphidz;
-                    }
-                    if (need_curls) {
-                        const Vec3 curl =
-                            curl_from_component_gradient(term.component,
-                                                         dphidx,
-                                                         dphidy,
-                                                         dphidz);
-                        modal_curl_rows[q] += curl[0];
-                        modal_curl_rows[num_qpts + q] += curl[1];
-                        modal_curl_rows[2u * num_qpts + q] += curl[2];
-                    }
-                    if (need_divergence) {
-                        const Real div = term.component == 0 ? dphidx
-                                       : term.component == 1 ? dphidy
-                                                            : dphidz;
-                        modal_divergence_row[q] += div;
-                    }
-                }
-            }
-        }
-
-        const std::size_t row_begin = sparse_coeffs.row_offsets[p];
-        const std::size_t row_end = sparse_coeffs.row_offsets[p + 1u];
-        for (std::size_t entry = row_begin; entry < row_end; ++entry) {
-            const std::size_t dof = sparse_coeffs.dofs[entry];
-            const Real c = sparse_coeffs.coefficients[entry];
-            if (need_values) {
-                for (std::size_t component = 0; component < 3u; ++component) {
-                    axpy_qpoints(values_out + (dof * 3u + component) * output_stride,
-                                 modal_values.data() + component * num_qpts,
-                                 c,
-                                 num_qpts);
-                }
-            }
-            if (need_jacobians) {
-                for (std::size_t row = 0; row < 3u; ++row) {
-                    for (std::size_t col = 0; col < 3u; ++col) {
-                        axpy_qpoints(jacobians_out +
-                                         (dof * 9u + row * 3u + col) * output_stride,
-                                     modal_jacobians.data() +
-                                         (row * 3u + col) * num_qpts,
-                                     c,
-                                     num_qpts);
-                    }
-                }
-            }
-            if (need_curls) {
-                for (std::size_t component = 0; component < 3u; ++component) {
-                    axpy_qpoints(curls_out + (dof * 3u + component) * output_stride,
-                                 modal_curls.data() + component * num_qpts,
-                                 c,
-                                 num_qpts);
-                }
-            }
-            if (need_divergence) {
-                axpy_qpoints(divergence_out + dof * output_stride,
-                             modal_divergence.data(),
-                             c,
-                             num_qpts);
-            }
-        }
-    }
-}
-
-std::vector<std::array<int, 4>> make_nd_extra_monomial_candidates(ElementType type,
-                                                                  int order);
-Real eval_transformed_nd_monomial_scalar(const std::array<int, 4>& mono,
-                                         const std::vector<Real>& px,
-                                         const std::vector<Real>& py,
-                                         const std::vector<Real>& pz);
-Vec3 eval_transformed_nd_monomial_curl(const std::array<int, 4>& mono,
-                                       const std::vector<Real>& px,
-                                       const std::vector<Real>& py,
-                                       const std::vector<Real>& pz);
-
-
-} // namespace vector_common
-} // namespace detail
-} // namespace basis
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_BASIS_VECTORBASISEVALUATIONHELPERS_H
diff --git a/Code/Source/solver/FE/Basis/VectorBasisModalPolynomial.h b/Code/Source/solver/FE/Basis/VectorBasisModalPolynomial.h
deleted file mode 100644
index 6e1a7202b..000000000
--- a/Code/Source/solver/FE/Basis/VectorBasisModalPolynomial.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
-
-#ifndef SVMP_FE_BASIS_VECTORBASISMODALPOLYNOMIAL_H
-#define SVMP_FE_BASIS_VECTORBASISMODALPOLYNOMIAL_H
-
-#include "Types.h"
-
-#include <algorithm>
-#include <array>
-#include <vector>
-
-namespace svmp {
-namespace FE {
-namespace basis {
-
-struct VectorBasisModalTerm {
-    int component{0}; // 0=x, 1=y, 2=z
-    int px{0};
-    int py{0};
-    int pz{0};
-    Real coefficient{Real(1)};
-};
-
-struct VectorBasisModalPolynomial {
-    std::array<VectorBasisModalTerm, 4> terms{};
-    int num_terms{0};
-};
-
-inline bool modal_terms_equal(const VectorBasisModalTerm& lhs,
-                              const VectorBasisModalTerm& rhs) noexcept {
-    return lhs.component == rhs.component &&
-           lhs.px == rhs.px &&
-           lhs.py == rhs.py &&
-           lhs.pz == rhs.pz &&
-           lhs.coefficient == rhs.coefficient;
-}
-
-inline bool modal_polynomials_equal(const VectorBasisModalPolynomial& lhs,
-                                    const VectorBasisModalPolynomial& rhs) noexcept {
-    if (lhs.num_terms != rhs.num_terms) {
-        return false;
-    }
-    for (int term = 0; term < lhs.num_terms; ++term) {
-        const auto index = static_cast<std::size_t>(term);
-        if (!modal_terms_equal(lhs.terms[index], rhs.terms[index])) {
-            return false;
-        }
-    }
-    return true;
-}
-
-inline bool append_unique_modal_polynomial(
-    std::vector<VectorBasisModalPolynomial>& polynomials,
-    const VectorBasisModalPolynomial& polynomial) {
-    const auto found = std::find_if(
-        polynomials.begin(),
-        polynomials.end(),
-        [&](const VectorBasisModalPolynomial& existing) {
-            return modal_polynomials_equal(existing, polynomial);
-        });
-    if (found != polynomials.end()) {
-        return false;
-    }
-    polynomials.push_back(polynomial);
-    return true;
-}
-
-} // namespace basis
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_BASIS_VECTORBASISMODALPOLYNOMIAL_H
diff --git a/Code/Source/solver/FE/Common/Alignment.h b/Code/Source/solver/FE/Common/Alignment.h
deleted file mode 100644
index 8d33a7a7a..000000000
--- a/Code/Source/solver/FE/Common/Alignment.h
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef SVMP_FE_CORE_ALIGNMENT_H
-#define SVMP_FE_CORE_ALIGNMENT_H
-
-/**
- * @file Alignment.h
- * @brief Global alignment constants used across FE modules.
- */
-
-#include <cstddef>
-
-namespace svmp {
-namespace FE {
-
-/// Preferred cache-line/SIMD alignment for performance-critical arrays.
-inline constexpr std::size_t kFEPreferredAlignmentBytes = 64u;
-
-/// Alignment for small fixed-size math objects that are commonly passed by value.
-inline constexpr std::size_t kFEFixedObjectAlignmentBytes = 32u;
-
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_CORE_ALIGNMENT_H
diff --git a/Code/Source/solver/FE/Common/Types.h b/Code/Source/solver/FE/Common/Types.h
index 60312a524..bb3f23bca 100644
--- a/Code/Source/solver/FE/Common/Types.h
+++ b/Code/Source/solver/FE/Common/Types.h
@@ -70,8 +70,9 @@ enum class CellFamily {
 #endif
 } // namespace svmp
 #endif
-#include <cstdint>
 #include <array>
+#include <cstddef>
+#include <cstdint>
 #include <string>
 #include <type_traits>
 #include <limits>
@@ -174,6 +175,12 @@ constexpr BlockId INVALID_BLOCK_ID = std::numeric_limits<BlockId>::max();
  */
 constexpr FieldId CURRENT_SOLUTION_FIELD_ID = std::numeric_limits<FieldId>::max();
 
+/// Preferred cache-line/SIMD alignment for performance-critical arrays.
+inline constexpr std::size_t kFEPreferredAlignmentBytes = 64u;
+
+/// Alignment for small fixed-size math objects that are commonly passed by value.
+inline constexpr std::size_t kFEFixedObjectAlignmentBytes = 32u;
+
 // ============================================================================
 // Field Value Entry (for point evaluation of field-dependent expressions)
 // ============================================================================
diff --git a/Code/Source/solver/FE/Math/Matrix.h b/Code/Source/solver/FE/Math/Matrix.h
index 0b80091f9..6058ab943 100644
--- a/Code/Source/solver/FE/Math/Matrix.h
+++ b/Code/Source/solver/FE/Math/Matrix.h
@@ -14,7 +14,7 @@
 #include "MatrixExpr.h"
 #include "Vector.h"
 #include "MathConstants.h"
-#include "../Common/Alignment.h"
+#include "../Common/Types.h"
 #include <algorithm>
 #include <array>
 #include <cmath>
diff --git a/Code/Source/solver/FE/Math/Vector.h b/Code/Source/solver/FE/Math/Vector.h
index e272bd6dd..76c7be152 100644
--- a/Code/Source/solver/FE/Math/Vector.h
+++ b/Code/Source/solver/FE/Math/Vector.h
@@ -12,7 +12,7 @@
 
 #include "VectorExpr.h"
 #include "MathConstants.h"
-#include "../Common/Alignment.h"
+#include "../Common/Types.h"
 #include <algorithm>
 #include <array>
 #include <cmath>
diff --git a/Code/Source/solver/FE/Quadrature/QuadratureRule.h b/Code/Source/solver/FE/Quadrature/QuadratureRule.h
deleted file mode 100644
index f7d186891..000000000
--- a/Code/Source/solver/FE/Quadrature/QuadratureRule.h
+++ /dev/null
@@ -1,237 +0,0 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
-
-#ifndef SVMP_FE_QUADRATURE_RULE_H
-#define SVMP_FE_QUADRATURE_RULE_H
-
-/**
- * @file QuadratureRule.h
- * @brief Abstracted quadrature rule representation for FE integration
- *
- * This header defines the base class for all quadrature rules used by the
- * finite element infrastructure. Rules are expressed in reference element
- * space only; mapping to physical space is handled by the Geometry module.
- *
- * The interface is intentionally lightweight and header-only to avoid coupling
- * Quadrature to other modules while remaining compatible with the Mesh library
- * through shared type aliases provided by FE/Common/Types.h.
- */
-
-#include "Types.h"
-#include "FEException.h"
-#include "Math/Vector.h"
-#include <algorithm>
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <cstring>
-#include <iomanip>
-#include <limits>
-#include <sstream>
-#include <string>
-#include <vector>
-
-namespace svmp {
-namespace FE {
-namespace quadrature {
-
-/// Convenience alias for quadrature point representation in reference space
-using QuadPoint = math::Vector<Real, 3>;
-
-struct QuadraturePointFingerprint {
-    int dimension{0};
-    std::size_t num_points{0};
-    std::uint64_t points_hash_a{0};
-    std::uint64_t points_hash_b{0};
-};
-
-/**
- * @brief Base class for quadrature rules over reference elements
- *
- * Derived classes populate the point/weight data via the protected setters.
- * The class performs lightweight consistency checks (size agreement, basic
- * reference-measure validation) but leaves element-specific checks to callers.
- */
-class QuadratureRule {
-public:
-    virtual ~QuadratureRule() = default;
-
-    /// Number of quadrature points
-    std::size_t num_points() const noexcept { return points_.size(); }
-
-    /// Polynomial exactness degree reported by the rule
-    int order() const noexcept { return order_; }
-
-    /// Spatial dimension of the reference domain
-    int dimension() const noexcept { return dimension_; }
-
-    /// Cell family the rule integrates over (line, tri, quad, ...)
-    svmp::CellFamily cell_family() const noexcept { return cell_family_; }
-
-    /// Access a single quadrature point (no bounds checking)
-    QuadPoint point(std::size_t i) const noexcept { return points_[i]; }
-
-    /// Access a single quadrature weight (no bounds checking)
-    Real weight(std::size_t i) const noexcept { return weights_[i]; }
-
-    /// Bulk accessors
-    const std::vector<QuadPoint>& points() const noexcept { return points_; }
-    const std::vector<Real>& weights() const noexcept { return weights_; }
-
-    /// Cached coordinate-only fingerprint for consumers whose values depend on
-    /// reference points but not quadrature weights.
-    QuadraturePointFingerprint point_fingerprint() const noexcept { return point_fingerprint_; }
-
-    /// Stable semantic identity used by BasisCache
-    virtual std::string cache_identity() const;
-
-    /**
-     * @brief Validate rule data for basic consistency
-     * @param tol Relative tolerance for weight sum check
-     * @return True if rule passes size and weight checks
-     */
-    virtual bool is_valid(Real tol = 1e-12) const;
-
-    /**
-     * @brief Reference-domain measure for the element family
-     *
-     * Length/area/volume of the canonical reference element:
-     * - Line   [-1,1]            -> 2
-     * - Quad   [-1,1]^2          -> 4
-     * - Hex    [-1,1]^3          -> 8
-     * - Tri    (0,0)-(1,0)-(0,1) -> 0.5
-     * - Tet    simplex at origin -> 1/6
-     * - Wedge  (triangle x line) -> 1
-     * - Pyramid (x,y in [-1,1], z in [0,1]) -> 4/3
-     */
-    Real reference_measure() const noexcept;
-
-protected:
-    QuadratureRule(svmp::CellFamily family, int dimension, int order = 0)
-        : cell_family_(family), dimension_(dimension), order_(order) {}
-
-    /// Assign point and weight storage (sizes must match)
-    void set_data(std::vector<QuadPoint> pts, std::vector<Real> wts);
-
-    /// Override computed order in derived classes
-    void set_order(int ord) noexcept { order_ = ord; }
-
-private:
-    std::string build_cache_identity() const;
-    QuadraturePointFingerprint build_point_fingerprint() const noexcept;
-
-    svmp::CellFamily cell_family_;
-    int dimension_;
-    int order_;
-    std::vector<QuadPoint> points_;
-    std::vector<Real> weights_;
-    std::string cache_identity_;
-    QuadraturePointFingerprint point_fingerprint_;
-};
-
-// --------------------------------------------------------------------------------
-// Inline implementations
-// --------------------------------------------------------------------------------
-
-inline void QuadratureRule::set_data(std::vector<QuadPoint> pts, std::vector<Real> wts) {
-    if (pts.size() != wts.size()) {
-        throw FEException("QuadratureRule: points/weights size mismatch",
-                          StatusCode::InvalidArgument,
-                          __FILE__, __LINE__, __func__);
-    }
-    points_ = std::move(pts);
-    weights_ = std::move(wts);
-    point_fingerprint_ = build_point_fingerprint();
-    cache_identity_ = build_cache_identity();
-}
-
-inline bool QuadratureRule::is_valid(Real tol) const {
-    if (points_.empty() || points_.size() != weights_.size()) {
-        return false;
-    }
-    Real sum_w = Real(0);
-    for (Real w : weights_) {
-        if (!std::isfinite(w)) {
-            return false;
-        }
-        sum_w += w;
-    }
-    const Real ref = reference_measure();
-    const Real denom = std::max(Real(1), std::abs(ref));
-    return std::abs(sum_w - ref) <= tol * denom;
-}
-
-inline std::string QuadratureRule::cache_identity() const {
-    if (!cache_identity_.empty()) {
-        return cache_identity_;
-    }
-    return build_cache_identity();
-}
-
-inline std::string QuadratureRule::build_cache_identity() const {
-    std::ostringstream oss;
-    oss << "dim=" << dimension_
-        << "|npts=" << points_.size();
-
-    oss << std::setprecision(std::numeric_limits<Real>::max_digits10);
-    for (const auto& pt : points_) {
-        oss << "|pt=" << pt[0] << ',' << pt[1] << ',' << pt[2];
-    }
-    return oss.str();
-}
-
-inline QuadraturePointFingerprint QuadratureRule::build_point_fingerprint() const noexcept {
-    auto real_bits = [](Real value) noexcept {
-        static_assert(sizeof(Real) <= sizeof(std::uint64_t),
-                      "Quadrature point fingerprints assume Real fits in 64 bits");
-        std::uint64_t bits = 0;
-        std::memcpy(&bits, &value, sizeof(Real));
-        return bits;
-    };
-    auto mix_hash = [](std::uint64_t& seed, std::uint64_t value) noexcept {
-        seed ^= value + 0x9e3779b97f4a7c15ULL + (seed << 6u) + (seed >> 2u);
-    };
-
-    QuadraturePointFingerprint fingerprint;
-    fingerprint.dimension = dimension_;
-    fingerprint.num_points = points_.size();
-    fingerprint.points_hash_a = 1469598103934665603ULL;
-    fingerprint.points_hash_b = 1099511628211ULL;
-
-    mix_hash(fingerprint.points_hash_a, static_cast<std::uint64_t>(fingerprint.dimension));
-    mix_hash(fingerprint.points_hash_a, static_cast<std::uint64_t>(fingerprint.num_points));
-    mix_hash(fingerprint.points_hash_b, static_cast<std::uint64_t>(fingerprint.num_points));
-    mix_hash(fingerprint.points_hash_b, static_cast<std::uint64_t>(fingerprint.dimension));
-    for (const auto& point : points_) {
-        for (std::size_t component = 0; component < 3u; ++component) {
-            const std::uint64_t bits = real_bits(point[component]);
-            mix_hash(fingerprint.points_hash_a, bits);
-            mix_hash(fingerprint.points_hash_b, bits ^ (0xbf58476d1ce4e5b9ULL + component));
-        }
-    }
-    return fingerprint;
-}
-
-inline Real QuadratureRule::reference_measure() const noexcept {
-    switch (cell_family_) {
-        case svmp::CellFamily::Line:      return Real(2);
-        case svmp::CellFamily::Quad:      return Real(4);
-        case svmp::CellFamily::Hex:       return Real(8);
-        case svmp::CellFamily::Triangle:  return Real(0.5);
-        case svmp::CellFamily::Tetra:     return Real(1.0 / 6.0);
-        case svmp::CellFamily::Wedge:     return Real(1.0);     // 0.5 area * length 2
-        case svmp::CellFamily::Pyramid:   return Real(4.0 / 3.0);
-        case svmp::CellFamily::Point:     return Real(1.0);
-        default:                          return Real(1.0);
-    }
-}
-
-} // namespace quadrature
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_QUADRATURE_RULE_H
diff --git a/Code/Source/solver/Timer.h b/Code/Source/solver/Timer.h
index 6810ae17c..b8ffa29df 100644
--- a/Code/Source/solver/Timer.h
+++ b/Code/Source/solver/Timer.h
@@ -4,28 +4,24 @@
 #ifndef TIMER_H 
 #define TIMER_H 
 
-#include <chrono>
-#include <iostream>
-#include <string>
+#include <sys/time.h>
 
 /// @brief Keep track of time
 class Timer 
 {
   public:
 
-    double get_elapsed_time()
+    double get_elapsed_time() const
     {
       return get_time() - current_time;
     }
 
-    double get_time()
+    double get_time() const
     {
-      auto now = std::chrono::system_clock::now();
-      auto now_ms = std::chrono::time_point_cast<std::chrono::milliseconds>(now);
-
-      auto value = now_ms.time_since_epoch();
-      auto duration = value.count() / 1000.0;
-      return static_cast<double>(duration);
+      timeval now{};
+      gettimeofday(&now, nullptr);
+      return static_cast<double>(now.tv_sec) +
+             static_cast<double>(now.tv_usec) * 1.0e-6;
     }
 
     void set_time()
@@ -33,8 +29,7 @@ class Timer
       current_time = get_time();
     }
 
-    double current_time;
+    double current_time{0.0};
 };
 
 #endif
-
diff --git a/Code/Source/solver/load_msh.cpp b/Code/Source/solver/load_msh.cpp
index c7c5a62ba..50d0ca858 100644
--- a/Code/Source/solver/load_msh.cpp
+++ b/Code/Source/solver/load_msh.cpp
@@ -13,7 +13,6 @@
 #include <iostream>
 #include <fstream>
 #include <sstream>
-#include <chrono>
 #include <unordered_map>
 #include <string>
 #include <iomanip>
@@ -300,4 +299,3 @@ void read_sv(Simulation* simulation, mshType& mesh, const MeshParameters* mesh_p
         }
     }
 };
-
diff --git a/Code/Source/solver/utils.cpp b/Code/Source/solver/utils.cpp
index 4d5b847cd..233d35474 100644
--- a/Code/Source/solver/utils.cpp
+++ b/Code/Source/solver/utils.cpp
@@ -4,7 +4,6 @@
 #include "utils.h"
 
 #include <bitset>
-#include <chrono>
 #include <cmath> 
 #include <limits>
 
@@ -13,6 +12,7 @@
 #include <iostream>
 #include <fstream>
 #include <sys/resource.h>
+#include <sys/time.h>
 
 /* MacOS
 #include <mach/task.h>
@@ -35,12 +35,10 @@ int CountBits(int n)
 
 double cput()
 {
-  auto now = std::chrono::system_clock::now();
-  auto now_ms = std::chrono::time_point_cast<std::chrono::milliseconds>(now);
-
-  auto value = now_ms.time_since_epoch();
-  auto duration = value.count() / 1000.0;
-  return static_cast<double>(duration);
+  timeval now{};
+  gettimeofday(&now, nullptr);
+  return static_cast<double>(now.tv_sec) +
+         static_cast<double>(now.tv_usec) * 1.0e-6;
 }
 
 Vector<double> 
@@ -386,4 +384,4 @@ void find_loc(const Array<int>& array, int value, std::array<int, 2>& ind)
   }
 }
 
-};
\ No newline at end of file
+};
diff --git a/Code/ThirdParty/eigen/include/eigen3/unsupported/Eigen/CXX11/Tensor b/Code/ThirdParty/eigen/include/eigen3/unsupported/Eigen/CXX11/Tensor
index 0938bb554..45b176fe7 100644
--- a/Code/ThirdParty/eigen/include/eigen3/unsupported/Eigen/CXX11/Tensor
+++ b/Code/ThirdParty/eigen/include/eigen3/unsupported/Eigen/CXX11/Tensor
@@ -34,7 +34,9 @@
   */
 
 #include <atomic>
+#ifdef EIGEN_USE_GPU
 #include <chrono>
+#endif
 #include <cmath>
 #include <cstddef>
 #include <cstring>
diff --git a/tests/unitTests/FE/Basis/test_BasisCacheFactory.cpp b/tests/unitTests/FE/Basis/test_BasisCacheFactory.cpp
deleted file mode 100644
index 216fd0401..000000000
--- a/tests/unitTests/FE/Basis/test_BasisCacheFactory.cpp
+++ /dev/null
@@ -1,256 +0,0 @@
-/**
- * @file test_BasisCacheFactory.cpp
- * @brief Tests for the migrated Basis cache and factory subset.
- */
-
-#include <gtest/gtest.h>
-
-#include "FE/Basis/BasisCache.h"
-#include "FE/Basis/BasisFactory.h"
-#include "FE/Basis/LagrangeBasis.h"
-#include "FE/Basis/SerendipityBasis.h"
-#include "FE/Quadrature/QuadratureRule.h"
-
-#include <memory>
-#include <vector>
-
-using namespace svmp::FE;
-using namespace svmp::FE::basis;
-using namespace svmp::FE::quadrature;
-
-namespace {
-
-class CustomQuadratureRule final : public QuadratureRule {
-public:
-    CustomQuadratureRule(svmp::CellFamily family,
-                         int dimension,
-                         int order,
-                         std::vector<QuadPoint> points,
-                         std::vector<Real> weights)
-        : QuadratureRule(family, dimension, order)
-    {
-        set_data(std::move(points), std::move(weights));
-    }
-};
-
-CustomQuadratureRule line_rule() {
-    return CustomQuadratureRule(
-        svmp::CellFamily::Line, 1, 3,
-        {
-            QuadPoint{Real(-0.5), Real(0), Real(0)},
-            QuadPoint{Real(0.5), Real(0), Real(0)}
-        },
-        {Real(1), Real(1)});
-}
-
-CustomQuadratureRule quad_rule(Real first_weight = Real(1)) {
-    return CustomQuadratureRule(
-        svmp::CellFamily::Quad, 2, 3,
-        {
-            QuadPoint{Real(-0.5), Real(-0.5), Real(0)},
-            QuadPoint{Real(0.5), Real(-0.25), Real(0)},
-            QuadPoint{Real(0.0), Real(0.5), Real(0)}
-        },
-        {first_weight, Real(1), Real(2)});
-}
-
-class TestCustomScalarBasis final : public BasisFunction {
-public:
-    explicit TestCustomScalarBasis(int tag)
-        : tag_(tag)
-    {
-    }
-
-    BasisType basis_type() const noexcept override { return BasisType::Custom; }
-    ElementType element_type() const noexcept override { return ElementType::Line2; }
-    int dimension() const noexcept override { return 1; }
-    int order() const noexcept override { return 1; }
-    std::size_t size() const noexcept override { return 2u; }
-
-    std::string cache_identity() const override {
-        return BasisFunction::cache_identity() + "|tag=" + std::to_string(tag_);
-    }
-
-    void evaluate_values(const math::Vector<Real, 3>& xi,
-                         std::vector<Real>& values) const override
-    {
-        values.resize(2u);
-        const Real shift = Real(tag_) * Real(0.125);
-        values[0] = Real(0.5) * (Real(1) - xi[0]) + shift;
-        values[1] = Real(0.5) * (Real(1) + xi[0]) - shift;
-    }
-
-    void evaluate_gradients(const math::Vector<Real, 3>&,
-                            std::vector<Gradient>& gradients) const override
-    {
-        gradients.assign(2u, Gradient{});
-        gradients[0][0] = Real(-0.5);
-        gradients[1][0] = Real(0.5);
-    }
-
-private:
-    int tag_{0};
-};
-
-class StructuredIdentityScalarBasis final : public BasisFunction {
-public:
-    explicit StructuredIdentityScalarBasis(int tag)
-        : tag_(tag)
-    {
-    }
-
-    BasisType basis_type() const noexcept override { return BasisType::Custom; }
-    ElementType element_type() const noexcept override { return ElementType::Line2; }
-    int dimension() const noexcept override { return 1; }
-    int order() const noexcept override { return 1; }
-    std::size_t size() const noexcept override { return 2u; }
-
-    bool cache_identity_words(std::vector<std::uint64_t>& words) const override {
-        words.push_back(0x7374727563746964ULL);
-        words.push_back(static_cast<std::uint64_t>(tag_));
-        return true;
-    }
-
-    std::string cache_identity() const override {
-        ++string_identity_calls;
-        return BasisFunction::cache_identity() + "|structured-tag=" + std::to_string(tag_);
-    }
-
-    void evaluate_values(const math::Vector<Real, 3>& xi,
-                         std::vector<Real>& values) const override
-    {
-        values.resize(2u);
-        values[0] = Real(1) - xi[0] + Real(tag_);
-        values[1] = xi[0] - Real(tag_);
-    }
-
-    mutable std::size_t string_identity_calls{0};
-
-private:
-    int tag_{0};
-};
-
-} // namespace
-
-TEST(BasisFactory, CreatesLagrangeAndSerendipityBases) {
-    auto lagrange = basis_factory::create(
-        BasisRequest{ElementType::Line2, BasisType::Lagrange, 2});
-    ASSERT_NE(lagrange, nullptr);
-    EXPECT_EQ(lagrange->basis_type(), BasisType::Lagrange);
-    EXPECT_EQ(lagrange->element_type(), ElementType::Line2);
-    EXPECT_EQ(lagrange->order(), 2);
-
-    auto serendipity = basis_factory::create(
-        BasisRequest{ElementType::Quad8, BasisType::Serendipity, 2});
-    ASSERT_NE(serendipity, nullptr);
-    EXPECT_EQ(serendipity->basis_type(), BasisType::Serendipity);
-    EXPECT_EQ(serendipity->element_type(), ElementType::Quad8);
-    EXPECT_EQ(serendipity->size(), 8u);
-}
-
-TEST(BasisFactory, RejectsOutOfScopeAndInvalidRequests) {
-    EXPECT_THROW(
-        (void)basis_factory::create(BasisRequest{ElementType::Line2, BasisType::Lagrange}),
-        BasisConfigurationException);
-    EXPECT_THROW(
-        (void)basis_factory::create(
-            BasisRequest{ElementType::Line2, BasisType::Lagrange, -1}),
-        BasisConfigurationException);
-    EXPECT_THROW(
-        (void)basis_factory::create(
-            BasisRequest{ElementType::Line2, BasisType::Bernstein, 1}),
-        BasisConfigurationException);
-    EXPECT_THROW(
-        (void)basis_factory::create(
-            BasisRequest{ElementType::Line2,
-                         BasisType::Lagrange,
-                         1,
-                         Continuity::H_div,
-                         FieldType::Vector}),
-        BasisConfigurationException);
-}
-
-TEST(BasisFactory, SupportsCustomFactoryRegistration) {
-    basis_factory::clear_custom_registry_for_tests();
-    basis_factory::register_custom(
-        "test-custom",
-        [](const BasisRequest& req) {
-            const int tag = req.order.value_or(0);
-            return std::make_shared<TestCustomScalarBasis>(tag);
-        });
-
-    BasisRequest req{ElementType::Line2, BasisType::Custom, 7};
-    req.custom_id = "test-custom";
-    auto custom = basis_factory::create(req);
-    ASSERT_NE(custom, nullptr);
-    EXPECT_EQ(custom->basis_type(), BasisType::Custom);
-    EXPECT_EQ(custom->size(), 2u);
-
-    basis_factory::unregister_custom("test-custom");
-    EXPECT_THROW((void)basis_factory::create(req), BasisConfigurationException);
-    basis_factory::clear_custom_registry_for_tests();
-}
-
-TEST(BasisCache, ReusesEntriesForSameBasisAndQuadratureCoordinates) {
-    LagrangeBasis basis(ElementType::Line2, 2);
-    const auto quad = line_rule();
-
-    auto& cache = BasisCache::instance();
-    cache.clear();
-    const auto& entry1 = cache.get_or_compute(basis, quad, true, true);
-    const auto& entry2 = cache.get_or_compute(basis, quad, true, true);
-
-    EXPECT_EQ(&entry1, &entry2);
-    EXPECT_EQ(entry1.num_qpts, quad.num_points());
-    EXPECT_EQ(entry1.num_dofs, basis.size());
-    ASSERT_EQ(entry1.scalar_values.size(), basis.size() * quad.num_points());
-    ASSERT_EQ(entry1.gradients.size(), basis.size() * 3u * quad.num_points());
-    ASSERT_EQ(entry1.hessians.size(), basis.size() * 9u * quad.num_points());
-    EXPECT_EQ(cache.size(), 1u);
-}
-
-TEST(BasisCache, ReusesCoordinateIdenticalQuadratureRulesIgnoringWeights) {
-    SerendipityBasis basis(ElementType::Quad8, 2);
-    const auto quad_a = quad_rule(Real(1));
-    const auto quad_b = quad_rule(Real(0.25));
-
-    auto& cache = BasisCache::instance();
-    cache.clear();
-    const auto& entry_a = cache.get_or_compute(basis, quad_a, true, false);
-    const auto& entry_b = cache.get_or_compute(basis, quad_b, true, false);
-
-    EXPECT_EQ(&entry_a, &entry_b);
-    EXPECT_EQ(cache.size(), 1u);
-}
-
-TEST(BasisCache, SeparatesStringIdentityCustomBases) {
-    TestCustomScalarBasis custom_a(1);
-    TestCustomScalarBasis custom_b(2);
-    const auto quad = line_rule();
-
-    auto& cache = BasisCache::instance();
-    cache.clear();
-    const auto& entry_a = cache.get_or_compute(custom_a, quad, false, false);
-    const auto& entry_b = cache.get_or_compute(custom_b, quad, false, false);
-
-    EXPECT_NE(&entry_a, &entry_b);
-    EXPECT_NE(entry_a.scalar_values, entry_b.scalar_values);
-    EXPECT_EQ(cache.size(), 2u);
-}
-
-TEST(BasisCache, StructuredIdentityAvoidsStringFallbackAndSeparatesBases) {
-    StructuredIdentityScalarBasis custom_a(1);
-    StructuredIdentityScalarBasis custom_b(2);
-    const auto quad = line_rule();
-
-    auto& cache = BasisCache::instance();
-    cache.clear();
-    const auto& entry_a = cache.get_or_compute(custom_a, quad, false, false);
-    const auto& entry_b = cache.get_or_compute(custom_b, quad, false, false);
-
-    EXPECT_NE(&entry_a, &entry_b);
-    EXPECT_EQ(custom_a.string_identity_calls, 0u);
-    EXPECT_EQ(custom_b.string_identity_calls, 0u);
-    EXPECT_EQ(cache.size(), 2u);
-}
-
diff --git a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
index 967f078aa..7838702b0 100644
--- a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
@@ -21,7 +21,7 @@ namespace {
 
 class MinimalScalarBasis : public BasisFunction {
 public:
-    BasisType basis_type() const noexcept override { return BasisType::Custom; }
+    BasisType basis_type() const noexcept override { return BasisType::Lagrange; }
     ElementType element_type() const noexcept override { return ElementType::Line2; }
     int dimension() const noexcept override { return 1; }
     int order() const noexcept override { return 1; }
@@ -36,7 +36,7 @@ class MinimalScalarBasis : public BasisFunction {
 
 class CompleteFallbackBasis : public BasisFunction {
 public:
-    BasisType basis_type() const noexcept override { return BasisType::Custom; }
+    BasisType basis_type() const noexcept override { return BasisType::Lagrange; }
     ElementType element_type() const noexcept override { return ElementType::Triangle3; }
     int dimension() const noexcept override { return 2; }
     int order() const noexcept override { return 1; }
@@ -90,6 +90,8 @@ TEST(BasisErrorPaths, SerendipityInvalidRequestsThrowBasisExceptions) {
                  BasisElementCompatibilityException);
     EXPECT_THROW(SerendipityBasis(ElementType::Quad8, 3),
                  BasisConfigurationException);
+    EXPECT_THROW(SerendipityBasis(ElementType::Pyramid13, 2),
+                 BasisElementCompatibilityException);
     EXPECT_THROW(SerendipityBasis(ElementType::Pyramid14, 2),
                  BasisElementCompatibilityException);
 }
@@ -104,6 +106,13 @@ TEST(BasisErrorPaths, BasisFactoryInvalidRequestsThrowBasisExceptions) {
     EXPECT_THROW((void)basis_factory::create(
                      BasisRequest{ElementType::Line2, BasisType::Bernstein, 1}),
                  BasisConfigurationException);
+    EXPECT_THROW((void)basis_factory::create(
+                     BasisRequest{ElementType::Pyramid5, BasisType::Lagrange, 1}),
+                 BasisElementCompatibilityException);
+
+    BasisRequest vector_req{ElementType::Line2, BasisType::Lagrange, 1};
+    vector_req.field_type = FieldType::Vector;
+    EXPECT_THROW((void)basis_factory::create(vector_req), BasisConfigurationException);
 
     auto serendipity = basis_factory::create(
         BasisRequest{ElementType::Quad8, BasisType::Serendipity, 2});
@@ -130,6 +139,8 @@ TEST(BasisErrorPaths, NodeOrderingInvalidNodeThrows) {
                  BasisNodeOrderingException);
     EXPECT_THROW((void)ReferenceNodeLayout::get_lagrange_node_coords(ElementType::Quad8, 2),
                  BasisNodeOrderingException);
+    EXPECT_THROW((void)ReferenceNodeLayout::num_nodes(ElementType::Pyramid5),
+                 BasisNodeOrderingException);
 }
 
 TEST(BasisErrorPaths, BasisFunctionDefaultsThrowForMissingDerivatives) {
@@ -142,25 +153,22 @@ TEST(BasisErrorPaths, BasisFunctionDefaultsThrowForMissingDerivatives) {
     EXPECT_THROW(basis.evaluate_hessians(xi, hessians), BasisEvaluationException);
 }
 
-TEST(BasisErrorPaths, BasisFunctionFallbackWritesFlatAndStridedLayouts) {
+TEST(BasisErrorPaths, BasisFunctionFallbackWritesRawLayouts) {
     CompleteFallbackBasis basis;
-    const std::vector<math::Vector<Real, 3>> points = {
-        {Real(0.25), Real(0.5), Real(-0.25)},
-        {Real(-0.5), Real(0.75), Real(0.125)}
-    };
-    prewarm_basis_function_scratch(basis.size(), points.size());
+    const math::Vector<Real, 3> point{Real(0.25), Real(0.5), Real(-0.25)};
+    prewarm_basis_function_scratch(basis.size());
 
     std::vector<Real> flat_values(basis.size());
     std::vector<Real> flat_gradients(basis.size() * 3u);
     std::vector<Real> flat_hessians(basis.size() * 9u);
-    basis.evaluate_values_to(points.front(), flat_values.data());
-    basis.evaluate_gradients_to(points.front(), flat_gradients.data());
-    basis.evaluate_hessians_to(points.front(), flat_hessians.data());
+    basis.evaluate_values_to(point, flat_values.data());
+    basis.evaluate_gradients_to(point, flat_gradients.data());
+    basis.evaluate_hessians_to(point, flat_hessians.data());
 
     std::vector<Real> expected_values;
     std::vector<Gradient> expected_gradients;
     std::vector<Hessian> expected_hessians;
-    basis.evaluate_all(points.front(), expected_values, expected_gradients, expected_hessians);
+    basis.evaluate_all(point, expected_values, expected_gradients, expected_hessians);
     for (std::size_t d = 0; d < basis.size(); ++d) {
         EXPECT_EQ(flat_values[d], expected_values[d]);
         for (std::size_t c = 0; c < 3u; ++c) {
@@ -172,32 +180,4 @@ TEST(BasisErrorPaths, BasisFunctionFallbackWritesFlatAndStridedLayouts) {
             }
         }
     }
-
-    constexpr std::size_t output_stride = 3u;
-    std::vector<Real> values(basis.size() * output_stride, Real(-99));
-    std::vector<Real> gradients(basis.size() * 3u * output_stride, Real(-99));
-    std::vector<Real> hessians(basis.size() * 9u * output_stride, Real(-99));
-    basis.evaluate_at_quadrature_points_strided(
-        points, output_stride, values.data(), gradients.data(), hessians.data());
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        basis.evaluate_all(points[q], expected_values, expected_gradients, expected_hessians);
-        for (std::size_t d = 0; d < basis.size(); ++d) {
-            EXPECT_EQ(values[d * output_stride + q], expected_values[d]);
-            for (std::size_t c = 0; c < 3u; ++c) {
-                EXPECT_EQ(gradients[(d * 3u + c) * output_stride + q],
-                          expected_gradients[d][c]);
-            }
-            for (std::size_t r = 0; r < 3u; ++r) {
-                for (std::size_t c = 0; c < 3u; ++c) {
-                    EXPECT_EQ(hessians[(d * 9u + r * 3u + c) * output_stride + q],
-                              expected_hessians[d](r, c));
-                }
-            }
-        }
-    }
-
-    for (std::size_t d = 0; d < basis.size(); ++d) {
-        EXPECT_EQ(values[d * output_stride + 2u], Real(-99));
-    }
 }
diff --git a/tests/unitTests/FE/Basis/test_BasisHessians.cpp b/tests/unitTests/FE/Basis/test_BasisHessians.cpp
index 0899ce358..f786b07cd 100644
--- a/tests/unitTests/FE/Basis/test_BasisHessians.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisHessians.cpp
@@ -61,8 +61,6 @@ std::vector<math::Vector<Real, 3>> sample_points_for(ElementType type) {
             return {{Real(0.1), Real(-0.2), Real(0.3)}, {Real(-0.35), Real(0.25), Real(-0.15)}};
         case ElementType::Wedge6:
             return {{Real(0.18), Real(0.22), Real(-0.2)}, {Real(0.12), Real(0.16), Real(0.1)}};
-        case ElementType::Pyramid5:
-            return {{Real(0.0), Real(0.0), Real(0.2)}, {Real(0.12), Real(-0.08), Real(0.24)}};
         default:
             return {{Real(0), Real(0), Real(0)}};
     }
@@ -200,8 +198,6 @@ TEST(BasisHessians, LagrangeCanonicalTopologiesMatchNumericalHessians) {
         {ElementType::Tetra4, 2, Real(1e-6), Real(1e-5)},
         {ElementType::Hex8, 2, Real(1e-6), Real(1e-5)},
         {ElementType::Wedge6, 2, Real(1e-5), Real(1e-5)},
-        {ElementType::Pyramid5, 1, Real(2e-6), Real(1e-5)},
-        {ElementType::Pyramid5, 3, Real(4e-4), Real(2e-5)},
     };
 
     for (const auto& c : cases) {
@@ -223,7 +219,6 @@ TEST(BasisHessians, LagrangeHessiansSumToZeroAndAreSymmetric) {
         {ElementType::Tetra4, 2, {Real(0.15), Real(0.2), Real(0.1)}, Real(1e-10)},
         {ElementType::Hex8, 2, {Real(0.1), Real(-0.2), Real(0.3)}, Real(1e-12)},
         {ElementType::Wedge6, 2, {Real(0.2), Real(0.15), Real(-0.3)}, Real(1e-10)},
-        {ElementType::Pyramid5, 1, {Real(0.1), Real(-0.2), Real(0.3)}, Real(1e-8)},
     };
 
     for (const auto& c : cases) {
@@ -233,25 +228,6 @@ TEST(BasisHessians, LagrangeHessiansSumToZeroAndAreSymmetric) {
     }
 }
 
-TEST(BasisHessians, LagrangePyramidExactApexHessianThrows) {
-    const struct Case {
-        ElementType type;
-        int order;
-    } cases[] = {
-        {ElementType::Pyramid5, 1},
-        {ElementType::Pyramid14, 2},
-        {ElementType::Pyramid5, 4},
-    };
-
-    const math::Vector<Real, 3> apex{Real(0), Real(0), Real(1)};
-    for (const auto& c : cases) {
-        LagrangeBasis basis(c.type, c.order);
-        std::vector<Hessian> hessians;
-        EXPECT_THROW(basis.evaluate_hessians(apex, hessians), BasisEvaluationException)
-            << "order " << c.order;
-    }
-}
-
 TEST(BasisHessians, SerendipityHessiansSumToZeroAndAreSymmetric) {
     const struct Case {
         ElementType type;
@@ -262,7 +238,6 @@ TEST(BasisHessians, SerendipityHessiansSumToZeroAndAreSymmetric) {
         {ElementType::Quad8, 2, {Real(0.17), Real(-0.31), Real(0)}, Real(1e-10)},
         {ElementType::Hex20, 2, {Real(0.2), Real(-0.1), Real(0.3)}, Real(1e-10)},
         {ElementType::Wedge15, 2, {Real(0.2), Real(0.3), Real(0.1)}, Real(1e-10)},
-        {ElementType::Pyramid13, 2, {Real(0.1), Real(-0.2), Real(0.4)}, Real(1e-8)},
     };
 
     for (const auto& c : cases) {
@@ -272,13 +247,6 @@ TEST(BasisHessians, SerendipityHessiansSumToZeroAndAreSymmetric) {
     }
 }
 
-TEST(BasisHessians, SerendipityPyramidExactApexHessianThrows) {
-    SerendipityBasis basis(ElementType::Pyramid13, 2);
-    std::vector<Hessian> hessians;
-    EXPECT_THROW(basis.evaluate_hessians({Real(0), Real(0), Real(1)}, hessians),
-                 BasisEvaluationException);
-}
-
 TEST(BasisHessians, SolverMappedVolumeSelectionsSatisfyInvariants) {
     const struct Case {
         ElementType type;
diff --git a/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp b/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
index a1031fa76..44e588fdc 100644
--- a/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
@@ -1,21 +1,16 @@
 /**
  * @file test_ConstexprBasis.cpp
- * @brief Compile-time and lightweight runtime checks for migrated Basis helpers.
+ * @brief Compile-time and lightweight runtime checks for reduced Basis helpers.
  */
 
-#include "FE/Basis/BasisTolerance.h"
+#include "FE/Basis/BasisExceptions.h"
 #include "FE/Basis/BasisTraits.h"
-#include "FE/Basis/LagrangeBasis.h"
-#include "FE/Basis/LagrangeBasisFast.h"
 #include "FE/Basis/NodeOrderingConventions.h"
 
 #include <gtest/gtest.h>
 
-#include <array>
 #include <limits>
 #include <tuple>
-#include <type_traits>
-#include <utility>
 #include <vector>
 
 namespace svmp {
@@ -30,26 +25,27 @@ static_assert(is_quadrilateral(ElementType::Quad8));
 static_assert(is_tetrahedron(ElementType::Tetra10));
 static_assert(is_hexahedron(ElementType::Hex20));
 static_assert(is_wedge(ElementType::Wedge18));
-static_assert(is_pyramid(ElementType::Pyramid14));
+static_assert(!is_pyramid(ElementType::Pyramid5));
+static_assert(!is_pyramid(ElementType::Pyramid14));
 static_assert(is_simplex(ElementType::Triangle3));
 static_assert(is_simplex(ElementType::Tetra4));
 static_assert(!is_simplex(ElementType::Wedge6));
 static_assert(is_tensor_product(ElementType::Line2));
 static_assert(is_tensor_product(ElementType::Quad9));
 static_assert(is_tensor_product(ElementType::Hex27));
-static_assert(!is_tensor_product(ElementType::Pyramid5));
-static_assert(reference_dimension(ElementType::Pyramid14) == 3);
+static_assert(!is_tensor_product(ElementType::Wedge6));
+static_assert(topology(ElementType::Pyramid5) == BasisTopology::Unknown);
 static_assert(canonical_lagrange_type(ElementType::Hex27) == ElementType::Hex8);
 static_assert(canonical_lagrange_type(ElementType::Pyramid13) == ElementType::Pyramid13);
 static_assert(complete_lagrange_alias_order(ElementType::Wedge18) == 2);
-static_assert(complete_lagrange_alias_order(ElementType::Hex20) == -1);
+static_assert(complete_lagrange_alias_order(ElementType::Pyramid14) == -1);
 static_assert(line_lagrange_size(2) == 3u);
 static_assert(triangle_lagrange_size(2) == 6u);
 static_assert(quad_lagrange_size(2) == 9u);
 static_assert(tetra_lagrange_size(2) == 10u);
 static_assert(hex_lagrange_size(2) == 27u);
 static_assert(wedge_lagrange_size(2) == 18u);
-static_assert(pyramid_lagrange_size(2) == 14u);
+static_assert(complete_lagrange_alias_size(ElementType::Pyramid14) == 0u);
 static_assert(detail::basis_abs(Real(-2)) == Real(2));
 static_assert(detail::basis_max(Real(2), Real(3)) == Real(3));
 static_assert(detail::basis_near_zero(std::numeric_limits<Real>::epsilon() * Real(32)));
@@ -57,73 +53,7 @@ static_assert(detail::basis_nearly_equal(
     Real(1),
     Real(1) + std::numeric_limits<Real>::epsilon() * Real(32)));
 
-constexpr auto kLineFastValues = [] {
-    math::Vector<Real, 3> xi{Real(0), Real(0), Real(0)};
-    std::array<Real, LagrangeLineFast<1>::n_dofs> values{};
-    LagrangeLineFast<1>::evaluate(xi, values);
-    return values;
-}();
-static_assert(kLineFastValues[0] == Real(0.5));
-static_assert(kLineFastValues[1] == Real(0.5));
-
-constexpr auto kLineP2FastHessians = [] {
-    math::Vector<Real, 3> xi{Real(0), Real(0), Real(0)};
-    std::array<Hessian, LagrangeLineFast<2>::n_dofs> hessians{};
-    LagrangeLineFast<2>::evaluate_hessians(xi, hessians);
-    return hessians;
-}();
-static_assert(kLineP2FastHessians[0](0, 0) == Real(1));
-static_assert(kLineP2FastHessians[1](0, 0) == Real(1));
-static_assert(kLineP2FastHessians[2](0, 0) == Real(-2));
-
-constexpr auto kTriP2FastValues = [] {
-    math::Vector<Real, 3> xi{Real(0.25), Real(0.25), Real(0)};
-    std::array<Real, LagrangeTriFast<2>::n_dofs> values{};
-    LagrangeTriFast<2>::evaluate(xi, values);
-    return values;
-}();
-static_assert(kTriP2FastValues[0] == Real(0));
-static_assert(kTriP2FastValues[3] == Real(0.5));
-static_assert(kTriP2FastValues[4] == Real(0.25));
-
-template<typename Basis>
-constexpr bool overrides_scalar_strided_v =
-    !std::is_same_v<decltype(&Basis::evaluate_at_quadrature_points_strided),
-                    decltype(&BasisFunction::evaluate_at_quadrature_points_strided)>;
-
-template<typename FastBasis>
-void expect_fast_matches_lagrange(ElementType type,
-                                  int order,
-                                  const std::vector<math::Vector<Real, 3>>& points)
-{
-    LagrangeBasis basis(type, order);
-    for (const auto& xi : points) {
-        std::vector<Real> expected_values;
-        std::vector<Gradient> expected_gradients;
-        std::vector<Hessian> expected_hessians;
-        basis.evaluate_all(xi, expected_values, expected_gradients, expected_hessians);
-
-        std::array<Real, FastBasis::n_dofs> values{};
-        std::array<Gradient, FastBasis::n_dofs> gradients{};
-        std::array<Hessian, FastBasis::n_dofs> hessians{};
-        FastBasis::evaluate(xi, values);
-        FastBasis::evaluate_gradients(xi, gradients);
-        FastBasis::evaluate_hessians(xi, hessians);
-
-        ASSERT_EQ(expected_values.size(), values.size());
-        for (std::size_t i = 0; i < values.size(); ++i) {
-            EXPECT_NEAR(values[i], expected_values[i], Real(1e-14));
-            for (std::size_t d = 0; d < 3u; ++d) {
-                EXPECT_NEAR(gradients[i][d], expected_gradients[i][d], Real(1e-14));
-                for (std::size_t e = 0; e < 3u; ++e) {
-                    EXPECT_NEAR(hessians[i](d, e), expected_hessians[i](d, e), Real(1e-14));
-                }
-            }
-        }
-    }
-}
-
-TEST(ConstexprBasis, FixedNodeTableSizes) {
+TEST(ConstexprBasis, FixedNodeTableSizesForSupportedLayouts) {
     const std::vector<std::pair<ElementType, std::size_t>> expected = {
         {ElementType::Line2, 2u},
         {ElementType::Line3, 3u},
@@ -140,9 +70,6 @@ TEST(ConstexprBasis, FixedNodeTableSizes) {
         {ElementType::Wedge6, 6u},
         {ElementType::Wedge15, 15u},
         {ElementType::Wedge18, 18u},
-        {ElementType::Pyramid5, 5u},
-        {ElementType::Pyramid13, 13u},
-        {ElementType::Pyramid14, 14u},
     };
 
     for (const auto& [type, size] : expected) {
@@ -150,7 +77,7 @@ TEST(ConstexprBasis, FixedNodeTableSizes) {
     }
 }
 
-TEST(ConstexprBasis, BasisToleranceScalesWithRealPrecision) {
+TEST(ConstexprBasis, TraitToleranceScalesWithRealPrecision) {
     const Real eps = std::numeric_limits<Real>::epsilon();
     EXPECT_GT(detail::basis_scaled_tolerance(), eps);
     EXPECT_TRUE(detail::basis_near_zero(eps * Real(32)));
@@ -159,37 +86,6 @@ TEST(ConstexprBasis, BasisToleranceScalesWithRealPrecision) {
     EXPECT_FALSE(detail::basis_nearly_equal(Real(1), Real(1) + eps * Real(128)));
 }
 
-TEST(ConstexprBasis, LagrangeOverridesStridedEvaluation) {
-    EXPECT_TRUE(overrides_scalar_strided_v<LagrangeBasis>);
-}
-
-TEST(ConstexprBasis, FastSidecarsMatchRuntimeLagrangeBasis) {
-    expect_fast_matches_lagrange<LagrangeLineFast<1>>(
-        ElementType::Line2, 1,
-        {{Real(-0.2), Real(0), Real(0)}, {Real(0.35), Real(0), Real(0)}});
-    expect_fast_matches_lagrange<LagrangeLineFast<2>>(
-        ElementType::Line2, 2,
-        {{Real(-0.2), Real(0), Real(0)}, {Real(0.35), Real(0), Real(0)}});
-    expect_fast_matches_lagrange<LagrangeQuadFast<1>>(
-        ElementType::Quad4, 1,
-        {{Real(-0.2), Real(0.3), Real(0)}, {Real(0.35), Real(-0.45), Real(0)}});
-    expect_fast_matches_lagrange<LagrangeHexFast<1>>(
-        ElementType::Hex8, 1,
-        {{Real(-0.2), Real(0.3), Real(0.1)}, {Real(0.35), Real(-0.45), Real(0.25)}});
-    expect_fast_matches_lagrange<LagrangeTriFast<1>>(
-        ElementType::Triangle3, 1,
-        {{Real(0.2), Real(0.3), Real(0)}, {Real(0.1), Real(0.6), Real(0)}});
-    expect_fast_matches_lagrange<LagrangeTriFast<2>>(
-        ElementType::Triangle3, 2,
-        {{Real(0.2), Real(0.3), Real(0)}, {Real(0.1), Real(0.6), Real(0)}});
-    expect_fast_matches_lagrange<LagrangeTetFast<1>>(
-        ElementType::Tetra4, 1,
-        {{Real(0.2), Real(0.3), Real(0.1)}, {Real(0.1), Real(0.2), Real(0.4)}});
-    expect_fast_matches_lagrange<LagrangeTetFast<2>>(
-        ElementType::Tetra4, 2,
-        {{Real(0.2), Real(0.3), Real(0.1)}, {Real(0.1), Real(0.2), Real(0.4)}});
-}
-
 TEST(ConstexprBasis, CompleteAliasTablesMatchGeneratedLagrangeNodes) {
     const std::vector<std::tuple<ElementType, ElementType, int>> aliases = {
         {ElementType::Line2, ElementType::Line2, 1},
@@ -204,8 +100,6 @@ TEST(ConstexprBasis, CompleteAliasTablesMatchGeneratedLagrangeNodes) {
         {ElementType::Hex27, ElementType::Hex8, 2},
         {ElementType::Wedge6, ElementType::Wedge6, 1},
         {ElementType::Wedge18, ElementType::Wedge6, 2},
-        {ElementType::Pyramid5, ElementType::Pyramid5, 1},
-        {ElementType::Pyramid14, ElementType::Pyramid5, 2},
     };
 
     for (const auto& [alias, canonical_type, order] : aliases) {
@@ -220,6 +114,15 @@ TEST(ConstexprBasis, CompleteAliasTablesMatchGeneratedLagrangeNodes) {
     }
 }
 
+TEST(ConstexprBasis, PyramidNodeOrderingIsOutsideCurrentScope) {
+    EXPECT_THROW((void)ReferenceNodeLayout::num_nodes(ElementType::Pyramid5),
+                 BasisNodeOrderingException);
+    EXPECT_THROW((void)ReferenceNodeLayout::num_nodes(ElementType::Pyramid13),
+                 BasisNodeOrderingException);
+    EXPECT_THROW((void)ReferenceNodeLayout::get_lagrange_node_coords(ElementType::Pyramid5, 1),
+                 BasisNodeOrderingException);
+}
+
 } // namespace
 } // namespace basis
 } // namespace FE
diff --git a/tests/unitTests/FE/Basis/test_HigherOrderWedgePyramid.cpp b/tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp
similarity index 64%
rename from tests/unitTests/FE/Basis/test_HigherOrderWedgePyramid.cpp
rename to tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp
index 26efc4070..3faffd9e0 100644
--- a/tests/unitTests/FE/Basis/test_HigherOrderWedgePyramid.cpp
+++ b/tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp
@@ -1,6 +1,6 @@
 /**
- * @file test_HigherOrderWedgePyramid.cpp
- * @brief Focused higher-order wedge and pyramid checks for LagrangeBasis.
+ * @file test_HigherOrderWedge.cpp
+ * @brief Focused higher-order wedge checks for LagrangeBasis.
  */
 
 #include <gtest/gtest.h>
@@ -9,8 +9,6 @@
 #include "FE/Basis/NodeOrderingConventions.h"
 
 #include <cmath>
-#include <tuple>
-#include <utility>
 #include <vector>
 
 using namespace svmp::FE;
@@ -107,28 +105,18 @@ void expect_all_entries_finite(const LagrangeBasis& basis,
 
 } // namespace
 
-TEST(HigherOrderWedgePyramid, CompleteAliasesMatchGeneratedNodeLayouts) {
-    const std::vector<std::tuple<ElementType, ElementType, int>> cases = {
-        {ElementType::Wedge18, ElementType::Wedge6, 2},
-        {ElementType::Pyramid14, ElementType::Pyramid5, 2},
-    };
-
-    for (const auto& [alias, canonical, order] : cases) {
-        LagrangeBasis alias_basis(alias, order);
-        const auto generated = ReferenceNodeLayout::get_lagrange_node_coords(canonical, order);
-        ASSERT_EQ(generated.size(), ReferenceNodeLayout::num_nodes(alias));
-        expect_nodes_close(alias_basis.nodes(), generated, Real(1e-14));
-
-        for (std::size_t i = 0; i < generated.size(); ++i) {
-            const auto public_node = ReferenceNodeLayout::get_node_coords(alias, i);
-            EXPECT_NEAR(public_node[0], generated[i][0], Real(1e-14)) << "node " << i;
-            EXPECT_NEAR(public_node[1], generated[i][1], Real(1e-14)) << "node " << i;
-            EXPECT_NEAR(public_node[2], generated[i][2], Real(1e-14)) << "node " << i;
-        }
-    }
+TEST(HigherOrderWedge, CompleteAliasMatchesGeneratedNodeLayout) {
+    LagrangeBasis alias_basis(ElementType::Wedge18, 1);
+    const auto generated =
+        ReferenceNodeLayout::get_lagrange_node_coords(ElementType::Wedge6, 2);
+
+    ASSERT_EQ(generated.size(), ReferenceNodeLayout::num_nodes(ElementType::Wedge18));
+    EXPECT_EQ(alias_basis.element_type(), ElementType::Wedge6);
+    EXPECT_EQ(alias_basis.order(), 2);
+    expect_nodes_close(alias_basis.nodes(), generated, Real(1e-14));
 }
 
-TEST(HigherOrderWedgePyramid, WedgeOrderThreeIsNodalAndPartitionsUnity) {
+TEST(HigherOrderWedge, OrderThreeIsNodalAndPartitionsUnity) {
     LagrangeBasis wedge(ElementType::Wedge6, 3);
 
     expect_kronecker_at_nodes(wedge, Real(2e-10));
@@ -143,31 +131,9 @@ TEST(HigherOrderWedgePyramid, WedgeOrderThreeIsNodalAndPartitionsUnity) {
         Real(1e-9));
 }
 
-TEST(HigherOrderWedgePyramid, PyramidOrderThreeIsNodalAndPartitionsUnity) {
-    LagrangeBasis pyramid(ElementType::Pyramid5, 3);
+TEST(HigherOrderWedge, OrderFourEvaluationsRemainFinite) {
+    LagrangeBasis wedge(ElementType::Wedge6, 4);
 
-    expect_kronecker_at_nodes(pyramid, Real(5e-8));
-    expect_partition_gradient_hessian_sums(
-        pyramid,
-        {
-            {Real(0), Real(0), Real(0.2)},
-            {Real(0.12), Real(-0.08), Real(0.24)},
-            {Real(-0.08), Real(0.1), Real(0.55)},
-        },
-        Real(1e-11),
-        Real(5e-7));
-}
-
-TEST(HigherOrderWedgePyramid, PyramidNearApexDerivativeQueriesRemainFinite) {
-    const std::vector<std::pair<ElementType, int>> cases = {
-        {ElementType::Pyramid5, 1},
-        {ElementType::Pyramid14, 2},
-        {ElementType::Pyramid5, 4},
-    };
-
-    for (const auto& [type, order] : cases) {
-        LagrangeBasis basis(type, order);
-        expect_all_entries_finite(basis, {Real(0.01), Real(-0.005), Real(0.92)});
-        expect_all_entries_finite(basis, {Real(-0.004), Real(0.007), Real(0.98)});
-    }
+    expect_all_entries_finite(wedge, {Real(0.2), Real(0.1), Real(-0.6)});
+    expect_all_entries_finite(wedge, {Real(0.05), Real(0.8), Real(0.3)});
 }
diff --git a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
index a88d860e9..9d93f8931 100644
--- a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
@@ -1,2249 +1,98 @@
 /**
  * @file test_LagrangeBasis.cpp
- * @brief Unit tests for Lagrange basis functions
+ * @brief Unit tests for the reduced scalar Lagrange basis implementation.
  */
 
 #include <gtest/gtest.h>
-#include "FE/Basis/BasisFactory.h"
-#include "FE/Basis/LagrangeBasis.h"
-#include "FE/Basis/NodeOrderingConventions.h"
-#include "FE/Basis/SerendipityBasis.h"
-#include "fs.h"
-#include "nn.h"
-#include <array>
-#include <cmath>
-#include <functional>
-#include <limits>
-#include <map>
-#include <math.h>
-#include <numeric>
-#include <string>
-#include <vector>
-
-namespace legacy_solver_nn {
-using namespace consts;
-#include "nn_elem_gip.h"
-#include "nn_elem_gnn.h"
-#include "nn_elem_gnnxx.h"
-} // namespace legacy_solver_nn
-
-using svmp::FE::basis::LagrangeBasis;
-using svmp::FE::ElementType;
-using svmp::FE::Real;
-using svmp::FE::basis::Gradient;
-using svmp::FE::basis::Hessian;
-using svmp::FE::basis::ReferenceNodeLayout;
-
-namespace {
-
-using Point = svmp::FE::math::Vector<Real, 3>;
-
-struct SolverBasisAdapterCase {
-    consts::ElementType type;
-    consts::ElementType quadrature_type;
-    int insd;
-    int eNoN;
-    int nG;
-};
-
-std::vector<SolverBasisAdapterCase> solver_basis_adapter_cases() {
-    using consts::ElementType;
-    return {
-        {ElementType::LIN1, ElementType::LIN1, 1, 2, 2},
-        {ElementType::LIN2, ElementType::LIN2, 1, 3, 3},
-        {ElementType::TRI3, ElementType::TRI3, 2, 3, 3},
-        {ElementType::TRI6, ElementType::TRI6, 2, 6, 7},
-        {ElementType::QUD4, ElementType::QUD4, 2, 4, 4},
-        {ElementType::QUD8, ElementType::QUD9, 2, 8, 9},
-        {ElementType::QUD9, ElementType::QUD9, 2, 9, 9},
-        {ElementType::TET4, ElementType::TET4, 3, 4, 4},
-        {ElementType::TET10, ElementType::TET10, 3, 10, 15},
-        {ElementType::HEX8, ElementType::HEX8, 3, 8, 8},
-        {ElementType::HEX20, ElementType::HEX20, 3, 20, 27},
-        {ElementType::HEX27, ElementType::HEX27, 3, 27, 27},
-        {ElementType::WDG, ElementType::WDG, 3, 6, 6},
-    };
-}
-
-std::vector<SolverBasisAdapterCase> solver_face_basis_adapter_cases() {
-    using consts::ElementType;
-    return {
-        {ElementType::LIN1, ElementType::LIN1, 1, 2, 2},
-        {ElementType::LIN2, ElementType::LIN2, 1, 3, 3},
-        {ElementType::TRI3, ElementType::TRI3, 2, 3, 3},
-        {ElementType::TRI6, ElementType::TRI6, 2, 6, 7},
-        {ElementType::QUD4, ElementType::QUD4, 2, 4, 4},
-        {ElementType::QUD8, ElementType::QUD8, 2, 8, 9},
-        {ElementType::QUD9, ElementType::QUD9, 2, 9, 9},
-    };
-}
-
-std::vector<SolverBasisAdapterCase> solver_hessian_adapter_cases() {
-    return solver_basis_adapter_cases();
-}
-
-std::vector<SolverBasisAdapterCase> solver_legacy_hessian_parity_cases() {
-    using consts::ElementType;
-    return {
-        {ElementType::TRI6, ElementType::TRI6, 2, 6, 7},
-        {ElementType::QUD9, ElementType::QUD9, 2, 9, 9},
-        {ElementType::TET10, ElementType::TET10, 3, 10, 15},
-    };
-}
-
-int packed_hessian_components(int insd) {
-    if (insd == 1) {
-        return 1;
-    }
-    if (insd == 2) {
-        return 3;
-    }
-    return 6;
-}
-
-void fill_legacy_quadrature(const SolverBasisAdapterCase& c,
-                            Vector<double>& w,
-                            Array<double>& xi) {
-    mshType mesh;
-    mesh.eType = c.quadrature_type;
-    mesh.eNoN = c.eNoN;
-    mesh.nG = c.nG;
-    mesh.w.resize(c.nG);
-    mesh.xi.resize(c.insd, c.nG);
-    legacy_solver_nn::set_element_gauss_int_data.at(c.quadrature_type)(mesh);
-    w = mesh.w;
-    xi = mesh.xi;
-}
-
-faceType initialized_face_for_case(const SolverBasisAdapterCase& c) {
-    faceType face;
-    face.eType = c.type;
-    face.eNoN = c.eNoN;
-    face.nG = c.nG;
-    face.w.resize(c.nG);
-    face.xi.resize(c.insd, c.nG);
-    legacy_solver_nn::set_face_gauss_int_data.at(c.quadrature_type)(face);
-    face.N.resize(c.eNoN, c.nG);
-    face.Nx.resize(c.insd, c.eNoN, c.nG);
-    return face;
-}
-
-void expect_arrays_near(const Array<double>& actual,
-                        const Array<double>& expected,
-                        double tol) {
-    ASSERT_EQ(actual.nrows(), expected.nrows());
-    ASSERT_EQ(actual.ncols(), expected.ncols());
-    for (int col = 0; col < actual.ncols(); ++col) {
-        for (int row = 0; row < actual.nrows(); ++row) {
-            EXPECT_NEAR(actual(row, col), expected(row, col), tol)
-                << "row=" << row << ", col=" << col;
-        }
-    }
-}
-
-void expect_vectors_near(const Vector<double>& actual,
-                         const Vector<double>& expected,
-                         double tol) {
-    ASSERT_EQ(actual.size(), expected.size());
-    for (int i = 0; i < actual.size(); ++i) {
-        EXPECT_NEAR(actual(i), expected(i), tol) << "index=" << i;
-    }
-}
-
-void expect_array3_near(const Array3<double>& actual,
-                        const Array3<double>& expected,
-                        double tol) {
-    ASSERT_EQ(actual.nrows(), expected.nrows());
-    ASSERT_EQ(actual.ncols(), expected.ncols());
-    ASSERT_EQ(actual.nslices(), expected.nslices());
-    for (int slice = 0; slice < actual.nslices(); ++slice) {
-        for (int col = 0; col < actual.ncols(); ++col) {
-            for (int row = 0; row < actual.nrows(); ++row) {
-                EXPECT_NEAR(actual(row, col, slice), expected(row, col, slice), tol)
-                    << "row=" << row << ", col=" << col << ", slice=" << slice;
-            }
-        }
-    }
-}
-
-void fill_array3(Array3<double>& values, double value) {
-    for (int slice = 0; slice < values.nslices(); ++slice) {
-        for (int col = 0; col < values.ncols(); ++col) {
-            for (int row = 0; row < values.nrows(); ++row) {
-                values(row, col, slice) = value;
-            }
-        }
-    }
-}
-
-void expect_face_partition_identities(const SolverBasisAdapterCase& c,
-                                      const faceType& face,
-                                      int g,
-                                      double tol) {
-    double partition = 0.0;
-    std::array<double, 3> gradient_sum{0.0, 0.0, 0.0};
-
-    for (int a = 0; a < c.eNoN; ++a) {
-        EXPECT_TRUE(std::isfinite(face.N(a, g)))
-            << "element=" << static_cast<int>(c.type)
-            << ", node=" << a
-            << ", g=" << g;
-        partition += face.N(a, g);
-
-        for (int d = 0; d < c.insd; ++d) {
-            EXPECT_TRUE(std::isfinite(face.Nx(d, a, g)))
-                << "element=" << static_cast<int>(c.type)
-                << ", d=" << d
-                << ", node=" << a
-                << ", g=" << g;
-            gradient_sum[static_cast<std::size_t>(d)] += face.Nx(d, a, g);
-        }
-    }
-
-    EXPECT_NEAR(partition, 1.0, tol)
-        << "element=" << static_cast<int>(c.type) << ", g=" << g;
-    for (int d = 0; d < c.insd; ++d) {
-        EXPECT_NEAR(gradient_sum[static_cast<std::size_t>(d)], 0.0, tol)
-            << "element=" << static_cast<int>(c.type) << ", d=" << d << ", g=" << g;
-    }
-}
-
-bool array3_has_nonzero_component(const Array3<double>& values,
-                                  int row,
-                                  double tol) {
-    for (int slice = 0; slice < values.nslices(); ++slice) {
-        for (int col = 0; col < values.ncols(); ++col) {
-            if (std::abs(values(row, col, slice)) > tol) {
-                return true;
-            }
-        }
-    }
-    return false;
-}
-
-Array<double> single_point_xi(const SolverBasisAdapterCase& c,
-                              const Array<double>& xi,
-                              int g) {
-    Array<double> point(c.insd, 1);
-    for (int d = 0; d < c.insd; ++d) {
-        point(d, 0) = xi(d, g);
-    }
-    return point;
-}
-
-std::vector<double> finite_difference_solver_second_derivative(
-    const SolverBasisAdapterCase& c,
-    const Array<double>& point,
-    int gradient_component,
-    int coordinate_component,
-    double eps) {
-    Array<double> xi_plus = point;
-    Array<double> xi_minus = point;
-    xi_plus(coordinate_component, 0) += eps;
-    xi_minus(coordinate_component, 0) -= eps;
-
-    Array<double> N_plus(c.eNoN, 1);
-    Array<double> N_minus(c.eNoN, 1);
-    Array3<double> Nx_plus(c.insd, c.eNoN, 1);
-    Array3<double> Nx_minus(c.insd, c.eNoN, 1);
-
-    nn::get_gnn(c.insd, c.type, c.eNoN, 0, xi_plus, N_plus, Nx_plus);
-    nn::get_gnn(c.insd, c.type, c.eNoN, 0, xi_minus, N_minus, Nx_minus);
-
-    std::vector<double> values(static_cast<std::size_t>(c.eNoN));
-    for (int a = 0; a < c.eNoN; ++a) {
-        values[static_cast<std::size_t>(a)] =
-            (Nx_plus(gradient_component, a, 0) - Nx_minus(gradient_component, a, 0)) /
-            (2.0 * eps);
-    }
-    return values;
-}
-
-void expect_packed_hessian_component_matches_finite_difference(
-    const SolverBasisAdapterCase& c,
-    const Array<double>& point,
-    const Array3<double>& Nxx,
-    int g,
-    int packed_row,
-    int first_derivative_component,
-    int second_derivative_component,
-    double tol) {
-    const double eps = 2e-6;
-    const auto numerical = finite_difference_solver_second_derivative(
-        c, point, first_derivative_component, second_derivative_component, eps);
-    for (int a = 0; a < c.eNoN; ++a) {
-        EXPECT_NEAR(Nxx(packed_row, a, g), numerical[static_cast<std::size_t>(a)], tol)
-            << "element=" << static_cast<int>(c.type)
-            << ", packed_row=" << packed_row
-            << ", node=" << a
-            << ", g=" << g;
-    }
-
-    if (first_derivative_component != second_derivative_component) {
-        const auto symmetric_numerical = finite_difference_solver_second_derivative(
-            c, point, second_derivative_component, first_derivative_component, eps);
-        for (int a = 0; a < c.eNoN; ++a) {
-            EXPECT_NEAR(Nxx(packed_row, a, g),
-                        symmetric_numerical[static_cast<std::size_t>(a)],
-                        tol)
-                << "element=" << static_cast<int>(c.type)
-                << ", symmetry packed_row=" << packed_row
-                << ", node=" << a
-                << ", g=" << g;
-        }
-    }
-}
-
-void expect_solver_hessian_matches_gradient_finite_difference(
-    const SolverBasisAdapterCase& c,
-    const Array<double>& xi,
-    int g,
-    const Array3<double>& Nxx,
-    double tol) {
-    const Array<double> point = single_point_xi(c, xi, g);
-
-    expect_packed_hessian_component_matches_finite_difference(c, point, Nxx, g, 0, 0, 0, tol);
-    if (c.insd >= 2) {
-        expect_packed_hessian_component_matches_finite_difference(c, point, Nxx, g, 1, 1, 1, tol);
-    }
-    if (c.insd == 2) {
-        expect_packed_hessian_component_matches_finite_difference(c, point, Nxx, g, 2, 0, 1, tol);
-    } else if (c.insd >= 3) {
-        expect_packed_hessian_component_matches_finite_difference(c, point, Nxx, g, 2, 2, 2, tol);
-        expect_packed_hessian_component_matches_finite_difference(c, point, Nxx, g, 3, 0, 1, tol);
-        expect_packed_hessian_component_matches_finite_difference(c, point, Nxx, g, 4, 1, 2, tol);
-        expect_packed_hessian_component_matches_finite_difference(c, point, Nxx, g, 5, 0, 2, tol);
-    }
-}
-
-void expect_partition_hessian_identity(const SolverBasisAdapterCase& c,
-                                       const Array3<double>& Nxx,
-                                       int g,
-                                       double tol) {
-    for (int row = 0; row < Nxx.nrows(); ++row) {
-        double sum = 0.0;
-        for (int a = 0; a < c.eNoN; ++a) {
-            sum += Nxx(row, a, g);
-        }
-        EXPECT_NEAR(sum, 0.0, tol)
-            << "element=" << static_cast<int>(c.type)
-            << ", packed_row=" << row
-            << ", g=" << g;
-    }
-}
-
-void expect_all_hessians_zero(const SolverBasisAdapterCase& c,
-                              const Array3<double>& Nxx,
-                              int g,
-                              double tol) {
-    for (int row = 0; row < Nxx.nrows(); ++row) {
-        for (int a = 0; a < c.eNoN; ++a) {
-            EXPECT_NEAR(Nxx(row, a, g), 0.0, tol)
-                << "element=" << static_cast<int>(c.type)
-                << ", packed_row=" << row
-                << ", node=" << a
-                << ", g=" << g;
-        }
-    }
-}
-
-mshType initialized_mesh_for_case(const SolverBasisAdapterCase& c, bool force_lShpF) {
-    mshType mesh;
-    mesh.nFs = 1;
-    mesh.eType = c.type;
-    mesh.eNoN = c.eNoN;
-    mesh.nG = c.nG;
-    mesh.lShpF = force_lShpF;
-    mesh.w.resize(c.nG);
-    mesh.xi.resize(c.insd, c.nG);
-    mesh.N.resize(c.eNoN, c.nG);
-    mesh.Nx.resize(c.insd, c.eNoN, c.nG);
-    mesh.xib.resize(2, c.insd);
-    mesh.Nb.resize(2, c.eNoN);
-
-    nn::get_gip(c.insd, c.quadrature_type, c.nG, mesh.w, mesh.xi);
-    for (int g = 0; g < c.nG; ++g) {
-        nn::get_gnn(c.insd, c.type, c.eNoN, g, mesh.xi, mesh.N, mesh.Nx);
-    }
-    nn::get_nn_bnds(c.insd, c.type, c.eNoN, mesh.xib, mesh.Nb);
-    return mesh;
-}
-
-enum class PyramidFace {
-    Base,
-    South,
-    East,
-    North,
-    West
-};
-
-enum class PyramidEdge {
-    BaseSouth,
-    BaseEast,
-    BaseNorth,
-    BaseWest,
-    VerticalSW,
-    VerticalSE,
-    VerticalNE,
-    VerticalNW
-};
-
-struct LagrangeAccuracyCase {
-    ElementType type;
-    int order;
-    std::vector<Point> points;
-};
-
-std::size_t expected_lagrange_size(ElementType type, int order) {
-    switch (type) {
-        case ElementType::Point1:
-            return 1u;
-        case ElementType::Line2:
-        case ElementType::Line3:
-            return static_cast<std::size_t>(order + 1);
-        case ElementType::Triangle3:
-        case ElementType::Triangle6:
-            return static_cast<std::size_t>(order + 1) * static_cast<std::size_t>(order + 2) / 2;
-        case ElementType::Quad4:
-        case ElementType::Quad9:
-            return static_cast<std::size_t>(order + 1) * static_cast<std::size_t>(order + 1);
-        case ElementType::Tetra4:
-        case ElementType::Tetra10:
-            return static_cast<std::size_t>(order + 1) *
-                   static_cast<std::size_t>(order + 2) *
-                   static_cast<std::size_t>(order + 3) / 6;
-        case ElementType::Hex8:
-        case ElementType::Hex27:
-            return static_cast<std::size_t>(order + 1) *
-                   static_cast<std::size_t>(order + 1) *
-                   static_cast<std::size_t>(order + 1);
-        case ElementType::Wedge6:
-        case ElementType::Wedge18:
-            return static_cast<std::size_t>(order + 1) *
-                   static_cast<std::size_t>(order + 1) *
-                   static_cast<std::size_t>(order + 2) / 2;
-        case ElementType::Pyramid5:
-        case ElementType::Pyramid14:
-            return static_cast<std::size_t>(order + 1) *
-                   static_cast<std::size_t>(order + 2) *
-                   static_cast<std::size_t>(2 * order + 3) / 6;
-        default:
-            return 0u;
-    }
-}
-
-int expected_dimension(ElementType type) {
-    switch (type) {
-        case ElementType::Point1:
-            return 0;
-        case ElementType::Line2:
-        case ElementType::Line3:
-            return 1;
-        case ElementType::Triangle3:
-        case ElementType::Triangle6:
-        case ElementType::Quad4:
-        case ElementType::Quad9:
-            return 2;
-        default:
-            return 3;
-    }
-}
-
-bool points_close(const Point& a,
-                  const Point& b,
-                  Real tol = Real(1e-12)) {
-    return std::abs(a[0] - b[0]) <= tol &&
-           std::abs(a[1] - b[1]) <= tol &&
-           std::abs(a[2] - b[2]) <= tol;
-}
-
-std::vector<Point> reference_node_coords(ElementType type) {
-    switch (type) {
-        case ElementType::Line2:
-            return {
-                Point{Real(-1), Real(0), Real(0)},
-                Point{Real(1), Real(0), Real(0)},
-            };
-        case ElementType::Line3:
-            return {
-                Point{Real(-1), Real(0), Real(0)},
-                Point{Real(1), Real(0), Real(0)},
-                Point{Real(0), Real(0), Real(0)},
-            };
-        case ElementType::Triangle3:
-            return {
-                Point{Real(0), Real(0), Real(0)},
-                Point{Real(1), Real(0), Real(0)},
-                Point{Real(0), Real(1), Real(0)},
-            };
-        case ElementType::Triangle6:
-            return {
-                Point{Real(0), Real(0), Real(0)},
-                Point{Real(1), Real(0), Real(0)},
-                Point{Real(0), Real(1), Real(0)},
-                Point{Real(0.5), Real(0), Real(0)},
-                Point{Real(0.5), Real(0.5), Real(0)},
-                Point{Real(0), Real(0.5), Real(0)},
-            };
-        case ElementType::Quad4:
-            return {
-                Point{Real(-1), Real(-1), Real(0)},
-                Point{Real(1), Real(-1), Real(0)},
-                Point{Real(1), Real(1), Real(0)},
-                Point{Real(-1), Real(1), Real(0)},
-            };
-        case ElementType::Quad8:
-            return {
-                Point{Real(-1), Real(-1), Real(0)},
-                Point{Real(1), Real(-1), Real(0)},
-                Point{Real(1), Real(1), Real(0)},
-                Point{Real(-1), Real(1), Real(0)},
-                Point{Real(0), Real(-1), Real(0)},
-                Point{Real(1), Real(0), Real(0)},
-                Point{Real(0), Real(1), Real(0)},
-                Point{Real(-1), Real(0), Real(0)},
-            };
-        case ElementType::Quad9:
-            return {
-                Point{Real(-1), Real(-1), Real(0)},
-                Point{Real(1), Real(-1), Real(0)},
-                Point{Real(1), Real(1), Real(0)},
-                Point{Real(-1), Real(1), Real(0)},
-                Point{Real(0), Real(-1), Real(0)},
-                Point{Real(1), Real(0), Real(0)},
-                Point{Real(0), Real(1), Real(0)},
-                Point{Real(-1), Real(0), Real(0)},
-                Point{Real(0), Real(0), Real(0)},
-            };
-        case ElementType::Tetra4:
-            return {
-                Point{Real(0), Real(0), Real(0)},
-                Point{Real(1), Real(0), Real(0)},
-                Point{Real(0), Real(1), Real(0)},
-                Point{Real(0), Real(0), Real(1)},
-            };
-        case ElementType::Tetra10:
-            return {
-                Point{Real(0), Real(0), Real(0)},
-                Point{Real(1), Real(0), Real(0)},
-                Point{Real(0), Real(1), Real(0)},
-                Point{Real(0), Real(0), Real(1)},
-                Point{Real(0.5), Real(0), Real(0)},
-                Point{Real(0.5), Real(0.5), Real(0)},
-                Point{Real(0), Real(0.5), Real(0)},
-                Point{Real(0), Real(0), Real(0.5)},
-                Point{Real(0.5), Real(0), Real(0.5)},
-                Point{Real(0), Real(0.5), Real(0.5)},
-            };
-        case ElementType::Hex8:
-            return {
-                Point{Real(-1), Real(-1), Real(-1)},
-                Point{Real(1), Real(-1), Real(-1)},
-                Point{Real(1), Real(1), Real(-1)},
-                Point{Real(-1), Real(1), Real(-1)},
-                Point{Real(-1), Real(-1), Real(1)},
-                Point{Real(1), Real(-1), Real(1)},
-                Point{Real(1), Real(1), Real(1)},
-                Point{Real(-1), Real(1), Real(1)},
-            };
-        case ElementType::Hex20:
-            return {
-                Point{Real(-1), Real(-1), Real(-1)},
-                Point{Real(1), Real(-1), Real(-1)},
-                Point{Real(1), Real(1), Real(-1)},
-                Point{Real(-1), Real(1), Real(-1)},
-                Point{Real(-1), Real(-1), Real(1)},
-                Point{Real(1), Real(-1), Real(1)},
-                Point{Real(1), Real(1), Real(1)},
-                Point{Real(-1), Real(1), Real(1)},
-                Point{Real(0), Real(-1), Real(-1)},
-                Point{Real(1), Real(0), Real(-1)},
-                Point{Real(0), Real(1), Real(-1)},
-                Point{Real(-1), Real(0), Real(-1)},
-                Point{Real(0), Real(-1), Real(1)},
-                Point{Real(1), Real(0), Real(1)},
-                Point{Real(0), Real(1), Real(1)},
-                Point{Real(-1), Real(0), Real(1)},
-                Point{Real(-1), Real(-1), Real(0)},
-                Point{Real(1), Real(-1), Real(0)},
-                Point{Real(1), Real(1), Real(0)},
-                Point{Real(-1), Real(1), Real(0)},
-            };
-        case ElementType::Hex27:
-            return {
-                Point{Real(-1), Real(-1), Real(-1)},
-                Point{Real(1), Real(-1), Real(-1)},
-                Point{Real(1), Real(1), Real(-1)},
-                Point{Real(-1), Real(1), Real(-1)},
-                Point{Real(-1), Real(-1), Real(1)},
-                Point{Real(1), Real(-1), Real(1)},
-                Point{Real(1), Real(1), Real(1)},
-                Point{Real(-1), Real(1), Real(1)},
-                Point{Real(0), Real(-1), Real(-1)},
-                Point{Real(1), Real(0), Real(-1)},
-                Point{Real(0), Real(1), Real(-1)},
-                Point{Real(-1), Real(0), Real(-1)},
-                Point{Real(0), Real(-1), Real(1)},
-                Point{Real(1), Real(0), Real(1)},
-                Point{Real(0), Real(1), Real(1)},
-                Point{Real(-1), Real(0), Real(1)},
-                Point{Real(-1), Real(-1), Real(0)},
-                Point{Real(1), Real(-1), Real(0)},
-                Point{Real(1), Real(1), Real(0)},
-                Point{Real(-1), Real(1), Real(0)},
-                Point{Real(0), Real(0), Real(-1)},
-                Point{Real(0), Real(0), Real(1)},
-                Point{Real(0), Real(-1), Real(0)},
-                Point{Real(1), Real(0), Real(0)},
-                Point{Real(0), Real(1), Real(0)},
-                Point{Real(-1), Real(0), Real(0)},
-                Point{Real(0), Real(0), Real(0)},
-            };
-        case ElementType::Wedge6:
-            return {
-                Point{Real(0), Real(0), Real(-1)},
-                Point{Real(1), Real(0), Real(-1)},
-                Point{Real(0), Real(1), Real(-1)},
-                Point{Real(0), Real(0), Real(1)},
-                Point{Real(1), Real(0), Real(1)},
-                Point{Real(0), Real(1), Real(1)},
-            };
-        case ElementType::Wedge15:
-            return {
-                Point{Real(0), Real(0), Real(-1)},
-                Point{Real(1), Real(0), Real(-1)},
-                Point{Real(0), Real(1), Real(-1)},
-                Point{Real(0), Real(0), Real(1)},
-                Point{Real(1), Real(0), Real(1)},
-                Point{Real(0), Real(1), Real(1)},
-                Point{Real(0.5), Real(0), Real(-1)},
-                Point{Real(0.5), Real(0.5), Real(-1)},
-                Point{Real(0), Real(0.5), Real(-1)},
-                Point{Real(0.5), Real(0), Real(1)},
-                Point{Real(0.5), Real(0.5), Real(1)},
-                Point{Real(0), Real(0.5), Real(1)},
-                Point{Real(0), Real(0), Real(0)},
-                Point{Real(1), Real(0), Real(0)},
-                Point{Real(0), Real(1), Real(0)},
-            };
-        case ElementType::Wedge18:
-            return {
-                Point{Real(0), Real(0), Real(-1)},
-                Point{Real(1), Real(0), Real(-1)},
-                Point{Real(0), Real(1), Real(-1)},
-                Point{Real(0), Real(0), Real(1)},
-                Point{Real(1), Real(0), Real(1)},
-                Point{Real(0), Real(1), Real(1)},
-                Point{Real(0.5), Real(0), Real(-1)},
-                Point{Real(0.5), Real(0.5), Real(-1)},
-                Point{Real(0), Real(0.5), Real(-1)},
-                Point{Real(0.5), Real(0), Real(1)},
-                Point{Real(0.5), Real(0.5), Real(1)},
-                Point{Real(0), Real(0.5), Real(1)},
-                Point{Real(0), Real(0), Real(0)},
-                Point{Real(1), Real(0), Real(0)},
-                Point{Real(0), Real(1), Real(0)},
-                Point{Real(0.5), Real(0), Real(0)},
-                Point{Real(0.5), Real(0.5), Real(0)},
-                Point{Real(0), Real(0.5), Real(0)},
-            };
-        case ElementType::Pyramid5:
-            return {
-                Point{Real(-1), Real(-1), Real(0)},
-                Point{Real(1), Real(-1), Real(0)},
-                Point{Real(1), Real(1), Real(0)},
-                Point{Real(-1), Real(1), Real(0)},
-                Point{Real(0), Real(0), Real(1)},
-            };
-        case ElementType::Pyramid13:
-            return {
-                Point{Real(-1), Real(-1), Real(0)},
-                Point{Real(1), Real(-1), Real(0)},
-                Point{Real(1), Real(1), Real(0)},
-                Point{Real(-1), Real(1), Real(0)},
-                Point{Real(0), Real(0), Real(1)},
-                Point{Real(0), Real(-1), Real(0)},
-                Point{Real(1), Real(0), Real(0)},
-                Point{Real(0), Real(1), Real(0)},
-                Point{Real(-1), Real(0), Real(0)},
-                Point{Real(-0.5), Real(-0.5), Real(0.5)},
-                Point{Real(0.5), Real(-0.5), Real(0.5)},
-                Point{Real(0.5), Real(0.5), Real(0.5)},
-                Point{Real(-0.5), Real(0.5), Real(0.5)},
-            };
-        case ElementType::Pyramid14:
-            return {
-                Point{Real(-1), Real(-1), Real(0)},
-                Point{Real(1), Real(-1), Real(0)},
-                Point{Real(1), Real(1), Real(0)},
-                Point{Real(-1), Real(1), Real(0)},
-                Point{Real(0), Real(0), Real(1)},
-                Point{Real(0), Real(-1), Real(0)},
-                Point{Real(1), Real(0), Real(0)},
-                Point{Real(0), Real(1), Real(0)},
-                Point{Real(-1), Real(0), Real(0)},
-                Point{Real(-0.5), Real(-0.5), Real(0.5)},
-                Point{Real(0.5), Real(-0.5), Real(0.5)},
-                Point{Real(0.5), Real(0.5), Real(0.5)},
-                Point{Real(-0.5), Real(0.5), Real(0.5)},
-                Point{Real(0), Real(0), Real(0)},
-            };
-        default:
-            return {};
-    }
-}
-
-void expect_nodes_match_node_ordering(ElementType canonical_type,
-                                      int order,
-                                      ElementType node_ordering_type) {
-    LagrangeBasis basis(canonical_type, order);
-    const auto& nodes = basis.nodes();
-
-    ASSERT_EQ(nodes.size(), ReferenceNodeLayout::num_nodes(node_ordering_type));
-    ASSERT_EQ(nodes.size(), basis.size());
-
-    for (std::size_t i = 0; i < nodes.size(); ++i) {
-        const auto expected = ReferenceNodeLayout::get_node_coords(node_ordering_type, i);
-        EXPECT_NEAR(nodes[i][0], expected[0], 1e-14);
-        EXPECT_NEAR(nodes[i][1], expected[1], 1e-14);
-        EXPECT_NEAR(nodes[i][2], expected[2], 1e-14);
-
-        std::vector<Real> vals;
-        basis.evaluate_values(expected, vals);
-        ASSERT_EQ(vals.size(), nodes.size());
-        for (std::size_t j = 0; j < vals.size(); ++j) {
-            const double expected_delta = (i == j) ? 1.0 : 0.0;
-            EXPECT_NEAR(vals[j], expected_delta, 1e-12);
-        }
-    }
-}
-
-void expect_alias_matches_canonical(ElementType alias_type,
-                                    ElementType canonical_type,
-                                    int canonical_order,
-                                    const std::vector<Point>& points,
-                                    Real tol = Real(1e-12)) {
-    LagrangeBasis alias(alias_type, canonical_order);
-    LagrangeBasis canonical(canonical_type, canonical_order);
-
-    ASSERT_EQ(alias.element_type(), canonical.element_type());
-    ASSERT_EQ(alias.order(), canonical.order());
-    ASSERT_EQ(alias.size(), canonical.size());
-    ASSERT_EQ(alias.nodes().size(), canonical.nodes().size());
-
-    for (std::size_t i = 0; i < alias.nodes().size(); ++i) {
-        EXPECT_NEAR(alias.nodes()[i][0], canonical.nodes()[i][0], tol);
-        EXPECT_NEAR(alias.nodes()[i][1], canonical.nodes()[i][1], tol);
-        EXPECT_NEAR(alias.nodes()[i][2], canonical.nodes()[i][2], tol);
-    }
-
-    for (const auto& xi : points) {
-        std::vector<Real> alias_values;
-        std::vector<Real> canonical_values;
-        std::vector<Gradient> alias_gradients;
-        std::vector<Gradient> canonical_gradients;
-        std::vector<Hessian> alias_hessians;
-        std::vector<Hessian> canonical_hessians;
-
-        alias.evaluate_values(xi, alias_values);
-        canonical.evaluate_values(xi, canonical_values);
-        alias.evaluate_gradients(xi, alias_gradients);
-        canonical.evaluate_gradients(xi, canonical_gradients);
-        alias.evaluate_hessians(xi, alias_hessians);
-        canonical.evaluate_hessians(xi, canonical_hessians);
-
-        ASSERT_EQ(alias_values.size(), canonical_values.size());
-        ASSERT_EQ(alias_gradients.size(), canonical_gradients.size());
-        ASSERT_EQ(alias_hessians.size(), canonical_hessians.size());
-
-        for (std::size_t i = 0; i < alias_values.size(); ++i) {
-            EXPECT_NEAR(alias_values[i], canonical_values[i], tol);
-            for (int d = 0; d < canonical.dimension(); ++d) {
-                const std::size_t sd = static_cast<std::size_t>(d);
-                EXPECT_NEAR(alias_gradients[i][sd], canonical_gradients[i][sd], tol);
-                for (int e = 0; e < canonical.dimension(); ++e) {
-                    const std::size_t se = static_cast<std::size_t>(e);
-                    EXPECT_NEAR(alias_hessians[i](sd, se), canonical_hessians[i](sd, se), Real(5) * tol);
-                }
-            }
-        }
-    }
-}
-
-std::vector<Point> sample_points_for(ElementType type) {
-    switch (type) {
-        case ElementType::Line2:
-        case ElementType::Line3:
-            return {
-                Point{Real(-0.7), Real(0), Real(0)},
-                Point{Real(0.1), Real(0), Real(0)},
-                Point{Real(0.65), Real(0), Real(0)}
-            };
-        case ElementType::Triangle3:
-        case ElementType::Triangle6:
-            return {
-                Point{Real(0.15), Real(0.2), Real(0)},
-                Point{Real(0.25), Real(0.1), Real(0)},
-                Point{Real(0.2), Real(0.3), Real(0)}
-            };
-        case ElementType::Quad4:
-        case ElementType::Quad9:
-            return {
-                Point{Real(0.2), Real(-0.35), Real(0)},
-                Point{Real(-0.4), Real(0.25), Real(0)},
-                Point{Real(0.55), Real(0.1), Real(0)}
-            };
-        case ElementType::Tetra4:
-        case ElementType::Tetra10:
-            return {
-                Point{Real(0.1), Real(0.2), Real(0.15)},
-                Point{Real(0.2), Real(0.1), Real(0.25)},
-                Point{Real(0.15), Real(0.15), Real(0.2)}
-            };
-        case ElementType::Hex8:
-        case ElementType::Hex27:
-            return {
-                Point{Real(0.2), Real(-0.3), Real(0.25)},
-                Point{Real(-0.5), Real(0.4), Real(-0.2)},
-                Point{Real(0.1), Real(0.15), Real(0.6)}
-            };
-        case ElementType::Wedge6:
-        case ElementType::Wedge18:
-            return {
-                Point{Real(0.2), Real(0.25), Real(0.0)},
-                Point{Real(0.1), Real(0.2), Real(-0.45)},
-                Point{Real(0.3), Real(0.15), Real(0.5)}
-            };
-        case ElementType::Pyramid5:
-        case ElementType::Pyramid14:
-            return {
-                Point{Real(0.0), Real(0.0), Real(0.25)},
-                Point{Real(0.15), Real(-0.1), Real(0.3)},
-                Point{Real(-0.1), Real(0.2), Real(0.4)}
-            };
-        default:
-            return {Point{Real(0), Real(0), Real(0)}};
-    }
-}
-
-std::vector<Point> boundary_stress_points_for(ElementType type);
-
-std::vector<Point> dense_sample_points_for(ElementType type) {
-    const auto interior = sample_points_for(type);
-    const auto boundary = boundary_stress_points_for(type);
-
-    std::vector<Point> points;
-    points.reserve(interior.size() + boundary.size());
-    points.insert(points.end(), interior.begin(), interior.end());
-    points.insert(points.end(), boundary.begin(), boundary.end());
-
-    if (type == ElementType::Pyramid5 || type == ElementType::Pyramid14) {
-        points.push_back(Point{Real(0.0), Real(0.0), Real(0.85)});
-        points.push_back(Point{Real(0.02), Real(-0.015), Real(0.95)});
-    }
-    return points;
-}
-
-std::vector<Point> boundary_stress_points_for(ElementType type) {
-    switch (type) {
-        case ElementType::Line2:
-        case ElementType::Line3:
-            return {
-                Point{Real(-0.999), Real(0), Real(0)},
-                Point{Real(-0.75), Real(0), Real(0)},
-                Point{Real(0.0), Real(0), Real(0)},
-                Point{Real(0.8), Real(0), Real(0)},
-                Point{Real(0.999), Real(0), Real(0)}
-            };
-        case ElementType::Triangle3:
-        case ElementType::Triangle6:
-            return {
-                Point{Real(1e-6), Real(1e-6), Real(0)},
-                Point{Real(0.98), Real(0.01), Real(0)},
-                Point{Real(0.01), Real(0.98), Real(0)},
-                Point{Real(0.25), Real(1e-4), Real(0)},
-                Point{Real(0.49), Real(0.49), Real(0)}
-            };
-        case ElementType::Quad4:
-        case ElementType::Quad9:
-            return {
-                Point{Real(-0.99), Real(-0.99), Real(0)},
-                Point{Real(0.99), Real(-0.99), Real(0)},
-                Point{Real(0.99), Real(0.99), Real(0)},
-                Point{Real(-0.99), Real(0.99), Real(0)},
-                Point{Real(0.0), Real(0.95), Real(0)}
-            };
-        case ElementType::Tetra4:
-        case ElementType::Tetra10:
-            return {
-                Point{Real(1e-6), Real(1e-6), Real(1e-6)},
-                Point{Real(0.97), Real(0.01), Real(0.01)},
-                Point{Real(0.01), Real(0.97), Real(0.01)},
-                Point{Real(0.01), Real(0.01), Real(0.97)},
-                Point{Real(0.32), Real(0.33), Real(0.01)}
-            };
-        case ElementType::Hex8:
-        case ElementType::Hex27:
-            return {
-                Point{Real(-0.99), Real(-0.99), Real(-0.99)},
-                Point{Real(0.99), Real(-0.99), Real(0.99)},
-                Point{Real(0.99), Real(0.99), Real(-0.99)},
-                Point{Real(-0.99), Real(0.99), Real(0.99)},
-                Point{Real(0.0), Real(0.0), Real(0.95)}
-            };
-        case ElementType::Wedge6:
-        case ElementType::Wedge18:
-            return {
-                Point{Real(1e-6), Real(1e-6), Real(-0.99)},
-                Point{Real(0.98), Real(0.01), Real(-0.99)},
-                Point{Real(0.01), Real(0.98), Real(0.99)},
-                Point{Real(0.49), Real(0.49), Real(0.0)},
-                Point{Real(0.25), Real(1e-4), Real(0.95)}
-            };
-        case ElementType::Pyramid5:
-        case ElementType::Pyramid14:
-            return {
-                Point{Real(0.0), Real(0.0), Real(0.95)},
-                Point{Real(0.01), Real(-0.01), Real(0.98)},
-                Point{Real(0.6), Real(-0.6), Real(0.2)},
-                Point{Real(0.79), Real(0.0), Real(0.2)},
-                Point{Real(0.0), Real(0.79), Real(0.2)}
-            };
-        default:
-            return {Point{Real(0), Real(0), Real(0)}};
-    }
-}
-
-Real monomial_value(const Point& xi, int px, int py, int pz) {
-    return std::pow(xi[0], px) * std::pow(xi[1], py) * std::pow(xi[2], pz);
-}
-
-void expect_gradients_match_finite_difference(const LagrangeAccuracyCase& c,
-                                              Real eps,
-                                              Real tol) {
-    LagrangeBasis basis(c.type, c.order);
-
-    for (const auto& xi : c.points) {
-        std::vector<Gradient> gradients;
-        basis.evaluate_gradients(xi, gradients);
-        ASSERT_EQ(gradients.size(), basis.size());
-
-        for (int d = 0; d < basis.dimension(); ++d) {
-            Point xp = xi;
-            Point xm = xi;
-            xp[d] += eps;
-            xm[d] -= eps;
-
-            std::vector<Real> values_p;
-            std::vector<Real> values_m;
-            basis.evaluate_values(xp, values_p);
-            basis.evaluate_values(xm, values_m);
-
-            ASSERT_EQ(values_p.size(), basis.size());
-            ASSERT_EQ(values_m.size(), basis.size());
-            for (std::size_t i = 0; i < basis.size(); ++i) {
-                const Real fd = (values_p[i] - values_m[i]) / (Real(2) * eps);
-                EXPECT_NEAR(gradients[i][d], fd, tol)
-                    << "type=" << static_cast<int>(c.type)
-                    << ", order=" << c.order
-                    << ", dim=" << d
-                    << ", basis_i=" << i
-                    << ", xi=(" << xi[0] << "," << xi[1] << "," << xi[2] << ")";
-            }
-        }
-    }
-}
-
-void expect_polynomial_reproduction(const LagrangeAccuracyCase& c,
-                                    const std::vector<std::array<int, 3>>& exponents,
-                                    Real tol) {
-    LagrangeBasis basis(c.type, c.order);
-    const auto& nodes = basis.nodes();
-    ASSERT_EQ(nodes.size(), basis.size());
-
-    for (const auto& exp : exponents) {
-        std::vector<Real> coeffs(basis.size(), Real(0));
-        for (std::size_t i = 0; i < basis.size(); ++i) {
-            coeffs[i] = monomial_value(nodes[i], exp[0], exp[1], exp[2]);
-        }
-
-        for (const auto& xi : c.points) {
-            std::vector<Real> values;
-            basis.evaluate_values(xi, values);
-            ASSERT_EQ(values.size(), basis.size());
-
-            Real interpolated = Real(0);
-            for (std::size_t i = 0; i < basis.size(); ++i) {
-                interpolated += coeffs[i] * values[i];
-            }
-
-            const Real exact = monomial_value(xi, exp[0], exp[1], exp[2]);
-            EXPECT_NEAR(interpolated, exact, tol)
-                << "type=" << static_cast<int>(c.type)
-                << ", order=" << c.order
-                << ", monomial=(" << exp[0] << "," << exp[1] << "," << exp[2] << ")"
-                << ", xi=(" << xi[0] << "," << xi[1] << "," << xi[2] << ")";
-        }
-    }
-}
-
-template<typename Container>
-void expect_all_finite(const Container& values) {
-    for (const auto& value : values) {
-        for (std::size_t d = 0; d < 3; ++d) {
-            EXPECT_TRUE(std::isfinite(value[d]));
-        }
-    }
-}
-
-void expect_hessians_finite(const std::vector<Hessian>& hessians,
-                            int dimension) {
-    for (const auto& H : hessians) {
-        for (int i = 0; i < dimension; ++i) {
-            for (int j = 0; j < dimension; ++j) {
-                EXPECT_TRUE(std::isfinite(H(static_cast<std::size_t>(i),
-                                            static_cast<std::size_t>(j))));
-            }
-        }
-    }
-}
-
-void expect_partition_gradient_hessian_sums(const LagrangeBasis& basis,
-                                            const std::vector<Point>& points,
-                                            Real value_tol,
-                                            Real derivative_tol) {
-    for (const auto& xi : points) {
-        std::vector<Real> values;
-        std::vector<Gradient> gradients;
-        std::vector<Hessian> hessians;
-        basis.evaluate_values(xi, values);
-        basis.evaluate_gradients(xi, gradients);
-        basis.evaluate_hessians(xi, hessians);
-
-        ASSERT_EQ(values.size(), basis.size());
-        ASSERT_EQ(gradients.size(), basis.size());
-        ASSERT_EQ(hessians.size(), basis.size());
-
-        Real value_sum = Real(0);
-        Gradient gradient_sum{};
-        Hessian hessian_sum{};
-        for (std::size_t i = 0; i < basis.size(); ++i) {
-            value_sum += values[i];
-            for (int d = 0; d < basis.dimension(); ++d) {
-                const std::size_t sd = static_cast<std::size_t>(d);
-                gradient_sum[sd] += gradients[i][sd];
-                for (int e = 0; e < basis.dimension(); ++e) {
-                    const std::size_t se = static_cast<std::size_t>(e);
-                    hessian_sum(sd, se) += hessians[i](sd, se);
-                }
-            }
-        }
-
-        EXPECT_NEAR(value_sum, Real(1), value_tol)
-            << "Element type " << static_cast<int>(basis.element_type())
-            << ", order " << basis.order()
-            << ", xi=(" << xi[0] << "," << xi[1] << "," << xi[2] << ")";
-
-        for (int d = 0; d < basis.dimension(); ++d) {
-            const std::size_t sd = static_cast<std::size_t>(d);
-            EXPECT_NEAR(gradient_sum[sd], Real(0), derivative_tol)
-                << "Gradient sum mismatch for element type " << static_cast<int>(basis.element_type())
-                << ", order " << basis.order()
-                << ", dim " << d;
-            for (int e = 0; e < basis.dimension(); ++e) {
-                const std::size_t se = static_cast<std::size_t>(e);
-                EXPECT_NEAR(hessian_sum(sd, se), Real(0), derivative_tol)
-                    << "Hessian sum mismatch for element type " << static_cast<int>(basis.element_type())
-                    << ", order " << basis.order()
-                    << ", component (" << d << "," << e << ")";
-            }
-        }
-    }
-}
-
-bool is_on_pyramid_face(const Point& point,
-                        PyramidFace face,
-                        Real tol = Real(1e-12)) {
-    const Real scale = Real(1) - point[2];
-    switch (face) {
-        case PyramidFace::Base:
-            return std::abs(point[2]) <= tol;
-        case PyramidFace::South:
-            return std::abs(point[1] + scale) <= tol;
-        case PyramidFace::East:
-            return std::abs(point[0] - scale) <= tol;
-        case PyramidFace::North:
-            return std::abs(point[1] - scale) <= tol;
-        case PyramidFace::West:
-            return std::abs(point[0] + scale) <= tol;
-    }
-    return false;
-}
-
-Point map_pyramid_face_to_reference(PyramidFace face,
-                                    const Point& point) {
-    const Real scale = Real(1) - point[2];
-    switch (face) {
-        case PyramidFace::Base:
-            return Point{point[0], point[1], Real(0)};
-        case PyramidFace::South:
-            return Point{(scale - point[0]) / Real(2), point[2], Real(0)};
-        case PyramidFace::East:
-            return Point{(scale + point[1]) / Real(2), point[2], Real(0)};
-        case PyramidFace::North:
-            return Point{(scale + point[0]) / Real(2), point[2], Real(0)};
-        case PyramidFace::West:
-            return Point{(scale - point[1]) / Real(2), point[2], Real(0)};
-    }
-    return Point{};
-}
-
-std::vector<Point> sample_points_for_pyramid_face(PyramidFace face) {
-    switch (face) {
-        case PyramidFace::Base:
-            return {
-                Point{Real(0.15), Real(-0.2), Real(0)},
-                Point{Real(-0.55), Real(0.35), Real(0)}
-            };
-        case PyramidFace::South:
-            return {
-                Point{Real(-0.2), Real(-0.8), Real(0.2)},
-                Point{Real(0.05), Real(-0.35), Real(0.65)}
-            };
-        case PyramidFace::East:
-            return {
-                Point{Real(0.8), Real(-0.25), Real(0.2)},
-                Point{Real(0.3), Real(0.08), Real(0.7)}
-            };
-        case PyramidFace::North:
-            return {
-                Point{Real(0.25), Real(0.8), Real(0.2)},
-                Point{Real(-0.08), Real(0.35), Real(0.65)}
-            };
-        case PyramidFace::West:
-            return {
-                Point{Real(-0.8), Real(0.2), Real(0.2)},
-                Point{Real(-0.3), Real(-0.05), Real(0.7)}
-            };
-    }
-    return {};
-}
-
-bool is_on_pyramid_edge(const Point& point,
-                        PyramidEdge edge,
-                        Real tol = Real(1e-12)) {
-    const Real scale = Real(1) - point[2];
-    switch (edge) {
-        case PyramidEdge::BaseSouth:
-            return std::abs(point[2]) <= tol && std::abs(point[1] + Real(1)) <= tol;
-        case PyramidEdge::BaseEast:
-            return std::abs(point[2]) <= tol && std::abs(point[0] - Real(1)) <= tol;
-        case PyramidEdge::BaseNorth:
-            return std::abs(point[2]) <= tol && std::abs(point[1] - Real(1)) <= tol;
-        case PyramidEdge::BaseWest:
-            return std::abs(point[2]) <= tol && std::abs(point[0] + Real(1)) <= tol;
-        case PyramidEdge::VerticalSW:
-            return std::abs(point[0] + scale) <= tol && std::abs(point[1] + scale) <= tol;
-        case PyramidEdge::VerticalSE:
-            return std::abs(point[0] - scale) <= tol && std::abs(point[1] + scale) <= tol;
-        case PyramidEdge::VerticalNE:
-            return std::abs(point[0] - scale) <= tol && std::abs(point[1] - scale) <= tol;
-        case PyramidEdge::VerticalNW:
-            return std::abs(point[0] + scale) <= tol && std::abs(point[1] - scale) <= tol;
-    }
-    return false;
-}
-
-Point map_pyramid_edge_to_reference(PyramidEdge edge,
-                                    const Point& point) {
-    switch (edge) {
-        case PyramidEdge::BaseSouth:
-        case PyramidEdge::BaseNorth:
-            return Point{point[0], Real(0), Real(0)};
-        case PyramidEdge::BaseEast:
-        case PyramidEdge::BaseWest:
-            return Point{point[1], Real(0), Real(0)};
-        case PyramidEdge::VerticalSW:
-        case PyramidEdge::VerticalSE:
-        case PyramidEdge::VerticalNE:
-        case PyramidEdge::VerticalNW:
-            return Point{Real(2) * point[2] - Real(1), Real(0), Real(0)};
-    }
-    return Point{};
-}
-
-std::vector<Point> sample_points_for_pyramid_edge(PyramidEdge edge) {
-    switch (edge) {
-        case PyramidEdge::BaseSouth:
-            return {Point{Real(-0.65), Real(-1), Real(0)}, Point{Real(0.35), Real(-1), Real(0)}};
-        case PyramidEdge::BaseEast:
-            return {Point{Real(1), Real(-0.45), Real(0)}, Point{Real(1), Real(0.55), Real(0)}};
-        case PyramidEdge::BaseNorth:
-            return {Point{Real(-0.55), Real(1), Real(0)}, Point{Real(0.45), Real(1), Real(0)}};
-        case PyramidEdge::BaseWest:
-            return {Point{Real(-1), Real(-0.55), Real(0)}, Point{Real(-1), Real(0.45), Real(0)}};
-        case PyramidEdge::VerticalSW:
-            return {Point{Real(-0.75), Real(-0.75), Real(0.25)}, Point{Real(-0.3), Real(-0.3), Real(0.7)}};
-        case PyramidEdge::VerticalSE:
-            return {Point{Real(0.75), Real(-0.75), Real(0.25)}, Point{Real(0.3), Real(-0.3), Real(0.7)}};
-        case PyramidEdge::VerticalNE:
-            return {Point{Real(0.75), Real(0.75), Real(0.25)}, Point{Real(0.3), Real(0.3), Real(0.7)}};
-        case PyramidEdge::VerticalNW:
-            return {Point{Real(-0.75), Real(0.75), Real(0.25)}, Point{Real(-0.3), Real(0.3), Real(0.7)}};
-    }
-    return {};
-}
-
-std::vector<int> map_pyramid_nodes_to_lower_basis_nodes(
-    const std::vector<Point>& pyramid_nodes,
-    const std::vector<Point>& lower_basis_nodes,
-    const std::function<bool(const Point&)>& selector,
-    const std::function<Point(const Point&)>& mapper) {
-    std::vector<int> mapping(pyramid_nodes.size(), -1);
-    std::size_t face_count = 0;
-    for (std::size_t i = 0; i < pyramid_nodes.size(); ++i) {
-        if (!selector(pyramid_nodes[i])) {
-            continue;
-        }
-
-        ++face_count;
-        const Point mapped = mapper(pyramid_nodes[i]);
-        bool found = false;
-        for (std::size_t j = 0; j < lower_basis_nodes.size(); ++j) {
-            if (points_close(mapped, lower_basis_nodes[j])) {
-                mapping[i] = static_cast<int>(j);
-                found = true;
-                break;
-            }
-        }
-        EXPECT_TRUE(found)
-            << "Failed to match pyramid trace node at (" << pyramid_nodes[i][0] << ","
-            << pyramid_nodes[i][1] << "," << pyramid_nodes[i][2] << ")";
-    }
-
-    EXPECT_EQ(face_count, lower_basis_nodes.size());
-    return mapping;
-}
-
-void expect_pyramid_face_trace_matches_lower_basis(int order,
-                                                   PyramidFace face,
-                                                   Real tol = Real(2e-10)) {
-    LagrangeBasis pyramid(ElementType::Pyramid5, order);
-    const bool base_face = face == PyramidFace::Base;
-    LagrangeBasis lower(base_face ? ElementType::Quad4 : ElementType::Triangle3, order);
-
-    const auto mapping = map_pyramid_nodes_to_lower_basis_nodes(
-        pyramid.nodes(),
-        lower.nodes(),
-        [&](const Point& point) { return is_on_pyramid_face(point, face); },
-        [&](const Point& point) { return map_pyramid_face_to_reference(face, point); });
-
-    for (const auto& face_point : sample_points_for_pyramid_face(face)) {
-        std::vector<Real> pyramid_values;
-        std::vector<Real> lower_values;
-        pyramid.evaluate_values(face_point, pyramid_values);
-        lower.evaluate_values(map_pyramid_face_to_reference(face, face_point), lower_values);
-
-        ASSERT_EQ(pyramid_values.size(), pyramid.size());
-        ASSERT_EQ(lower_values.size(), lower.size());
-
-        for (std::size_t i = 0; i < pyramid.size(); ++i) {
-            if (mapping[i] >= 0) {
-                EXPECT_NEAR(pyramid_values[i], lower_values[static_cast<std::size_t>(mapping[i])], tol)
-                    << "Face trace mismatch for order " << order
-                    << ", face " << static_cast<int>(face)
-                    << ", basis " << i;
-            } else {
-                EXPECT_NEAR(pyramid_values[i], Real(0), tol)
-                    << "Off-face pyramid basis should vanish on face for order " << order
-                    << ", face " << static_cast<int>(face)
-                    << ", basis " << i;
-            }
-        }
-    }
-}
-
-void expect_pyramid_edge_trace_matches_line_basis(int order,
-                                                  PyramidEdge edge,
-                                                  Real tol = Real(2e-10)) {
-    LagrangeBasis pyramid(ElementType::Pyramid5, order);
-    LagrangeBasis line(ElementType::Line2, order);
-
-    const auto mapping = map_pyramid_nodes_to_lower_basis_nodes(
-        pyramid.nodes(),
-        line.nodes(),
-        [&](const Point& point) { return is_on_pyramid_edge(point, edge); },
-        [&](const Point& point) { return map_pyramid_edge_to_reference(edge, point); });
-
-    for (const auto& edge_point : sample_points_for_pyramid_edge(edge)) {
-        std::vector<Real> pyramid_values;
-        std::vector<Real> line_values;
-        pyramid.evaluate_values(edge_point, pyramid_values);
-        line.evaluate_values(map_pyramid_edge_to_reference(edge, edge_point), line_values);
-
-        ASSERT_EQ(pyramid_values.size(), pyramid.size());
-        ASSERT_EQ(line_values.size(), line.size());
-
-        for (std::size_t i = 0; i < pyramid.size(); ++i) {
-            if (mapping[i] >= 0) {
-                EXPECT_NEAR(pyramid_values[i], line_values[static_cast<std::size_t>(mapping[i])], tol)
-                    << "Edge trace mismatch for order " << order
-                    << ", edge " << static_cast<int>(edge)
-                    << ", basis " << i;
-            } else {
-                EXPECT_NEAR(pyramid_values[i], Real(0), tol)
-                    << "Off-edge pyramid basis should vanish on edge for order " << order
-                    << ", edge " << static_cast<int>(edge)
-                    << ", basis " << i;
-            }
-        }
-    }
-}
-
-struct StridedOutputRequest {
-    bool values;
-    bool gradients;
-    bool hessians;
-};
-
-void expect_strided_matches_pointwise(ElementType type,
-                                      int order,
-                                      const StridedOutputRequest& request) {
-    LagrangeBasis basis(type, order);
-    const auto points = dense_sample_points_for(type);
-    const std::size_t stride = points.size() + 3u;
-    constexpr Real sentinel = Real(-12345.25);
-
-    std::vector<Real> values(request.values ? basis.size() * stride : 0u, sentinel);
-    std::vector<Real> gradients(request.gradients ? basis.size() * 3u * stride : 0u, sentinel);
-    std::vector<Real> hessians(request.hessians ? basis.size() * 9u * stride : 0u, sentinel);
-
-    basis.evaluate_at_quadrature_points_strided(
-        points,
-        stride,
-        request.values ? values.data() : nullptr,
-        request.gradients ? gradients.data() : nullptr,
-        request.hessians ? hessians.data() : nullptr);
-
-    const Real tol = (type == ElementType::Pyramid5 || type == ElementType::Pyramid14)
-        ? Real(5e-10)
-        : Real(1e-12);
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        if (request.values) {
-            std::vector<Real> expected;
-            basis.evaluate_values(points[q], expected);
-            ASSERT_EQ(expected.size(), basis.size());
-            for (std::size_t d = 0; d < basis.size(); ++d) {
-                EXPECT_NEAR(values[d * stride + q], expected[d], tol)
-                    << "type=" << static_cast<int>(type)
-                    << ", order=" << order
-                    << ", dof=" << d
-                    << ", q=" << q;
-            }
-        }
-
-        if (request.gradients) {
-            std::vector<Gradient> expected;
-            basis.evaluate_gradients(points[q], expected);
-            ASSERT_EQ(expected.size(), basis.size());
-            for (std::size_t d = 0; d < basis.size(); ++d) {
-                for (std::size_t c = 0; c < 3u; ++c) {
-                    EXPECT_NEAR(gradients[(d * 3u + c) * stride + q], expected[d][c], tol)
-                        << "type=" << static_cast<int>(type)
-                        << ", order=" << order
-                        << ", dof=" << d
-                        << ", component=" << c
-                        << ", q=" << q;
-                }
-            }
-        }
-
-        if (request.hessians) {
-            std::vector<Hessian> expected;
-            basis.evaluate_hessians(points[q], expected);
-            ASSERT_EQ(expected.size(), basis.size());
-            for (std::size_t d = 0; d < basis.size(); ++d) {
-                for (std::size_t r = 0; r < 3u; ++r) {
-                    for (std::size_t c = 0; c < 3u; ++c) {
-                        EXPECT_NEAR(hessians[(d * 9u + r * 3u + c) * stride + q],
-                                    expected[d](r, c),
-                                    Real(4) * tol)
-                            << "type=" << static_cast<int>(type)
-                            << ", order=" << order
-                            << ", dof=" << d
-                            << ", hessian=(" << r << "," << c << ")"
-                            << ", q=" << q;
-                    }
-                }
-            }
-        }
-    }
-
-    const auto expect_padding_untouched = [&](const std::vector<Real>& buffer,
-                                              std::size_t rows) {
-        for (std::size_t row = 0; row < rows; ++row) {
-            for (std::size_t q = points.size(); q < stride; ++q) {
-                EXPECT_EQ(buffer[row * stride + q], sentinel)
-                    << "type=" << static_cast<int>(type)
-                    << ", order=" << order
-                    << ", row=" << row
-                    << ", padding q=" << q;
-            }
-        }
-    };
-
-    if (request.values) {
-        expect_padding_untouched(values, basis.size());
-    }
-    if (request.gradients) {
-        expect_padding_untouched(gradients, basis.size() * 3u);
-    }
-    if (request.hessians) {
-        expect_padding_untouched(hessians, basis.size() * 9u);
-    }
-}
-
-void expect_raw_to_matches_vector_evaluation(ElementType type, int order) {
-    LagrangeBasis basis(type, order);
-    const Real tol = (type == ElementType::Pyramid5 || type == ElementType::Pyramid14)
-        ? Real(5e-10)
-        : Real(1e-12);
-
-    for (const auto& point : sample_points_for(type)) {
-        std::vector<Real> values;
-        std::vector<Gradient> gradients;
-        std::vector<Hessian> hessians;
-        basis.evaluate_all(point, values, gradients, hessians);
-
-        std::vector<Real> raw_values(basis.size());
-        std::vector<Real> raw_gradients(basis.size() * 3u);
-        std::vector<Real> raw_hessians(basis.size() * 9u);
-        basis.evaluate_values_to(point, raw_values.data());
-        basis.evaluate_gradients_to(point, raw_gradients.data());
-        basis.evaluate_hessians_to(point, raw_hessians.data());
-
-        for (std::size_t i = 0; i < basis.size(); ++i) {
-            EXPECT_NEAR(raw_values[i], values[i], tol)
-                << "type=" << static_cast<int>(type) << ", order=" << order << ", dof=" << i;
-            for (std::size_t c = 0; c < 3u; ++c) {
-                EXPECT_NEAR(raw_gradients[i * 3u + c], gradients[i][c], tol)
-                    << "type=" << static_cast<int>(type)
-                    << ", order=" << order
-                    << ", dof=" << i
-                    << ", gradient component=" << c;
-            }
-            for (std::size_t r = 0; r < 3u; ++r) {
-                for (std::size_t c = 0; c < 3u; ++c) {
-                    EXPECT_NEAR(raw_hessians[i * 9u + r * 3u + c], hessians[i](r, c), Real(4) * tol)
-                        << "type=" << static_cast<int>(type)
-                        << ", order=" << order
-                        << ", dof=" << i
-                        << ", hessian=(" << r << "," << c << ")";
-                }
-            }
-        }
-    }
-}
-
-} // namespace
-
-TEST(SolverBasisAdapter, ShapeValuesGradientsAndMeshOverloadMatchLegacy) {
-    constexpr double tol = 2e-12;
-
-    for (const auto& c : solver_basis_adapter_cases()) {
-        SCOPED_TRACE("element=" + std::to_string(static_cast<int>(c.type)));
-        Vector<double> weights;
-        Array<double> xi;
-        fill_legacy_quadrature(c, weights, xi);
-
-        Array<double> legacy_N(c.eNoN, c.nG);
-        Array<double> adapter_N(c.eNoN, c.nG);
-        Array3<double> legacy_Nx(c.insd, c.eNoN, c.nG);
-        Array3<double> adapter_Nx(c.insd, c.eNoN, c.nG);
-        auto legacy_shape = legacy_solver_nn::get_element_shape_data.find(c.type);
-
-        faceType legacy_face;
-        if (legacy_shape == legacy_solver_nn::get_element_shape_data.end()) {
-            ASSERT_EQ(c.type, consts::ElementType::QUD8);
-            legacy_face.eType = c.type;
-            legacy_face.eNoN = c.eNoN;
-            legacy_face.nG = c.nG;
-            legacy_face.xi = xi;
-            legacy_face.N.resize(c.eNoN, c.nG);
-            legacy_face.Nx.resize(c.insd, c.eNoN, c.nG);
-        }
-
-        for (int g = 0; g < c.nG; ++g) {
-            if (legacy_shape != legacy_solver_nn::get_element_shape_data.end()) {
-                legacy_shape->second(c.insd, c.eNoN, g, xi, legacy_N, legacy_Nx);
-            } else {
-                legacy_solver_nn::set_face_shape_data.at(c.type)(g, legacy_face);
-            }
-            nn::get_gnn(c.insd, c.type, c.eNoN, g, xi, adapter_N, adapter_Nx);
-
-            double partition = 0.0;
-            std::array<double, 3> gradient_sum{0.0, 0.0, 0.0};
-            for (int a = 0; a < c.eNoN; ++a) {
-                partition += adapter_N(a, g);
-                for (int d = 0; d < c.insd; ++d) {
-                    gradient_sum[static_cast<std::size_t>(d)] += adapter_Nx(d, a, g);
-                }
-            }
-
-            EXPECT_NEAR(partition, 1.0, tol)
-                << "element=" << static_cast<int>(c.type) << ", g=" << g;
-            for (int d = 0; d < c.insd; ++d) {
-                EXPECT_NEAR(gradient_sum[static_cast<std::size_t>(d)], 0.0, tol)
-                    << "element=" << static_cast<int>(c.type) << ", d=" << d << ", g=" << g;
-            }
-        }
-
-        if (legacy_shape == legacy_solver_nn::get_element_shape_data.end()) {
-            legacy_N = legacy_face.N;
-            legacy_Nx = legacy_face.Nx;
-        }
-
-        expect_arrays_near(adapter_N, legacy_N, tol);
-        expect_array3_near(adapter_Nx, legacy_Nx, tol);
-
-        mshType mesh;
-        mesh.eType = c.type;
-        mesh.eNoN = c.eNoN;
-        mesh.nG = c.nG;
-        mesh.xi = xi;
-        mesh.N.resize(c.eNoN, c.nG);
-        mesh.Nx.resize(c.insd, c.eNoN, c.nG);
-        for (int g = 0; g < c.nG; ++g) {
-            nn::get_gnn(g, mesh);
-        }
-
-        expect_arrays_near(mesh.N, legacy_N, tol);
-        expect_array3_near(mesh.Nx, legacy_Nx, tol);
-    }
-}
-
-TEST(SolverFaceBasisAdapter, ShapeValuesGradientsAndDispatchMatchLegacyFaceTable) {
-    constexpr double tol = 2e-12;
-
-    int covered = 0;
-    for (const auto& c : solver_face_basis_adapter_cases()) {
-        SCOPED_TRACE("face element=" + std::to_string(static_cast<int>(c.type)));
-
-        faceType legacy_face = initialized_face_for_case(c);
-        faceType basis_face = initialized_face_for_case(c);
-
-        for (int g = 0; g < c.nG; ++g) {
-            legacy_solver_nn::set_face_shape_data.at(c.type)(g, legacy_face);
-            nn::get_gnn(nullptr, g, basis_face);
-            expect_face_partition_identities(c, basis_face, g, tol);
-        }
-
-        expect_arrays_near(basis_face.N, legacy_face.N, tol);
-        expect_array3_near(basis_face.Nx, legacy_face.Nx, tol);
-        ++covered;
-    }
-
-    EXPECT_EQ(covered, 7);
-}
-
-TEST(SolverFaceBasisAdapter, MappedFacesFailClosedWithoutLegacyFallback) {
-    using consts::ElementType;
-
-    SolverBasisAdapterCase c{ElementType::LIN1, ElementType::LIN1, 1, 3, 2};
-    faceType face = initialized_face_for_case(c);
-
-    try {
-        nn::get_gnn(nullptr, 0, face);
-        FAIL() << "Expected mapped face dispatch to reject mismatched eNoN";
-    } catch (const svmp::FE::basis::BasisEvaluationException& exception) {
-        const std::string message = exception.what();
-        EXPECT_NE(message.find("legacy fallback was not attempted"), std::string::npos)
-            << message;
-    }
-}
-
-TEST(SolverFaceBasisAdapter, PointFaceRemainsLegacyValuePath) {
-    faceType face;
-    face.eType = consts::ElementType::PNT;
-    face.eNoN = 1;
-    face.nG = 1;
-    face.N.resize(1, 1);
-    face.Nx.resize(1, 1, 1);
-    face.N(0, 0) = -7.0;
-    face.Nx(0, 0, 0) = 42.0;
-
-    nn::get_gnn(nullptr, 0, face);
-
-    EXPECT_DOUBLE_EQ(face.N(0, 0), 1.0);
-    EXPECT_DOUBLE_EQ(face.Nx(0, 0, 0), 42.0);
-}
-
-TEST(SolverFaceBasisAdapter, UnsupportedFacesThrowClearErrors) {
-    faceType nrb_face;
-    nrb_face.eType = consts::ElementType::NRB;
-    nrb_face.eNoN = 1;
-    nrb_face.nG = 1;
-    nrb_face.N.resize(1, 1);
-    nrb_face.Nx.resize(1, 1, 1);
-    EXPECT_THROW(nn::get_gnn(nullptr, 0, nrb_face), svmp::FE::NotImplementedException);
-
-    faceType unknown_face;
-    unknown_face.eType = consts::ElementType::NA;
-    unknown_face.eNoN = 1;
-    unknown_face.nG = 1;
-    unknown_face.N.resize(1, 1);
-    unknown_face.Nx.resize(1, 1, 1);
-    EXPECT_THROW(nn::get_gnn(nullptr, 0, unknown_face), svmp::FE::InvalidElementException);
-}
-
-TEST(SolverBasisAdapter, QuadraturePathsRemainLegacyCompatible) {
-    constexpr double tol = 0.0;
-
-    for (const auto& c : solver_basis_adapter_cases()) {
-        auto mesh_it = legacy_solver_nn::set_element_gauss_int_data.find(c.type);
-        if (mesh_it != legacy_solver_nn::set_element_gauss_int_data.end()) {
-            mshType legacy_mesh;
-            legacy_mesh.eType = c.type;
-            legacy_mesh.eNoN = c.eNoN;
-            legacy_mesh.nG = c.nG;
-            legacy_mesh.w.resize(c.nG);
-            legacy_mesh.xi.resize(c.insd, c.nG);
-            mesh_it->second(legacy_mesh);
-
-            mshType adapter_mesh;
-            adapter_mesh.eType = c.type;
-            adapter_mesh.eNoN = c.eNoN;
-            adapter_mesh.nG = c.nG;
-            adapter_mesh.w.resize(c.nG);
-            adapter_mesh.xi.resize(c.insd, c.nG);
-            nn::get_gip(adapter_mesh);
-
-            expect_vectors_near(adapter_mesh.w, legacy_mesh.w, tol);
-            expect_arrays_near(adapter_mesh.xi, legacy_mesh.xi, tol);
-        }
-
-        auto scalar_it = legacy_solver_nn::get_element_gauss_int_data.find(c.type);
-        if (scalar_it != legacy_solver_nn::get_element_gauss_int_data.end()) {
-            Vector<double> legacy_w(c.nG);
-            Vector<double> adapter_w(c.nG);
-            Array<double> legacy_xi(c.insd, c.nG);
-            Array<double> adapter_xi(c.insd, c.nG);
-
-            scalar_it->second(c.insd, c.nG, legacy_w, legacy_xi);
-            nn::get_gip(c.insd, c.type, c.nG, adapter_w, adapter_xi);
-
-            expect_vectors_near(adapter_w, legacy_w, tol);
-            expect_arrays_near(adapter_xi, legacy_xi, tol);
-        }
-    }
-
-    mshType legacy_tet;
-    legacy_tet.eType = consts::ElementType::TET4;
-    legacy_tet.eNoN = 4;
-    legacy_tet.nG = 4;
-    legacy_tet.qmTET4 = 0.25;
-    legacy_tet.w.resize(4);
-    legacy_tet.xi.resize(3, 4);
-    legacy_solver_nn::set_element_gauss_int_data.at(consts::ElementType::TET4)(legacy_tet);
-
-    mshType adapter_tet;
-    adapter_tet.eType = consts::ElementType::TET4;
-    adapter_tet.eNoN = 4;
-    adapter_tet.nG = 4;
-    adapter_tet.qmTET4 = 0.25;
-    adapter_tet.w.resize(4);
-    adapter_tet.xi.resize(3, 4);
-    nn::get_gip(adapter_tet);
-
-    expect_vectors_near(adapter_tet.w, legacy_tet.w, tol);
-    expect_arrays_near(adapter_tet.xi, legacy_tet.xi, tol);
-}
-
-TEST(SolverBasisAdapter, HessiansCoverEveryMappedScalarVolumeElement) {
-    constexpr double partition_tol = 2e-10;
-    constexpr double finite_difference_tol = 2e-5;
-    constexpr double zero_tol = 2e-12;
-
-    int covered = 0;
-    for (const auto& c : solver_hessian_adapter_cases()) {
-        SCOPED_TRACE("element=" + std::to_string(static_cast<int>(c.type)));
-        Vector<double> weights;
-        Array<double> xi;
-        fill_legacy_quadrature(c, weights, xi);
-
-        const int ind2 = packed_hessian_components(c.insd);
-        Array3<double> adapter_Nxx(ind2, c.eNoN, c.nG);
-        fill_array3(adapter_Nxx, std::numeric_limits<double>::quiet_NaN());
-
-        for (int g = 0; g < c.nG; ++g) {
-            nn::get_gn_nxx(c.insd, ind2, c.type, c.eNoN, g, xi, adapter_Nxx);
-            expect_partition_hessian_identity(c, adapter_Nxx, g, partition_tol);
-            expect_solver_hessian_matches_gradient_finite_difference(
-                c, xi, g, adapter_Nxx, finite_difference_tol);
-
-            if (c.type == consts::ElementType::LIN1 ||
-                c.type == consts::ElementType::TRI3 ||
-                c.type == consts::ElementType::TET4) {
-                expect_all_hessians_zero(c, adapter_Nxx, g, zero_tol);
-            }
-        }
-
-        if (c.type == consts::ElementType::QUD4) {
-            EXPECT_TRUE(array3_has_nonzero_component(adapter_Nxx, 2, zero_tol));
-        } else if (c.type == consts::ElementType::HEX8) {
-            EXPECT_TRUE(array3_has_nonzero_component(adapter_Nxx, 3, zero_tol));
-            EXPECT_TRUE(array3_has_nonzero_component(adapter_Nxx, 4, zero_tol));
-            EXPECT_TRUE(array3_has_nonzero_component(adapter_Nxx, 5, zero_tol));
-        } else if (c.type == consts::ElementType::WDG) {
-            EXPECT_TRUE(array3_has_nonzero_component(adapter_Nxx, 5, zero_tol));
-        }
-        ++covered;
-    }
-
-    EXPECT_EQ(covered, 13);
-}
-
-TEST(SolverBasisAdapter, HessianPackingMatchesLegacyWhereLegacyIsApproved) {
-    constexpr double tol = 2e-12;
-
-    for (const auto& c : solver_legacy_hessian_parity_cases()) {
-        Vector<double> weights;
-        Array<double> xi;
-        fill_legacy_quadrature(c, weights, xi);
-
-        const int ind2 = packed_hessian_components(c.insd);
-        Array3<double> legacy_Nxx(ind2, c.eNoN, c.nG);
-        Array3<double> adapter_Nxx(ind2, c.eNoN, c.nG);
-
-        for (int g = 0; g < c.nG; ++g) {
-            legacy_solver_nn::get_element_2nd_derivs.at(c.type)(
-                c.insd, ind2, c.eNoN, g, xi, legacy_Nxx);
-            nn::get_gn_nxx(c.insd, ind2, c.type, c.eNoN, g, xi, adapter_Nxx);
-        }
-
-        expect_array3_near(adapter_Nxx, legacy_Nxx, tol);
-    }
-}
-
-TEST(SolverBasisAdapter, Qud8HessiansDoNotUseLegacyFallback) {
-    using consts::ElementType;
-    SolverBasisAdapterCase c{ElementType::QUD8, ElementType::QUD9, 2, 8, 9};
-
-    Vector<double> weights;
-    Array<double> xi;
-    fill_legacy_quadrature(c, weights, xi);
-
-    const int ind2 = packed_hessian_components(c.insd);
-    Array3<double> legacy_Nxx(ind2, c.eNoN, c.nG);
-    Array3<double> adapter_Nxx(ind2, c.eNoN, c.nG);
-    fill_array3(legacy_Nxx, 0.0);
-    fill_array3(adapter_Nxx, 0.0);
-
-    for (int g = 0; g < c.nG; ++g) {
-        legacy_solver_nn::get_element_2nd_derivs.at(c.type)(
-            c.insd, ind2, c.eNoN, g, xi, legacy_Nxx);
-        nn::get_gn_nxx(c.insd, ind2, c.type, c.eNoN, g, xi, adapter_Nxx);
-    }
-
-    double max_abs_difference = 0.0;
-    for (int g = 0; g < c.nG; ++g) {
-        for (int a = 0; a < c.eNoN; ++a) {
-            for (int row = 0; row < ind2; ++row) {
-                max_abs_difference = std::max(
-                    max_abs_difference,
-                    std::abs(adapter_Nxx(row, a, g) - legacy_Nxx(row, a, g)));
-            }
-        }
-    }
-
-    EXPECT_GT(max_abs_difference, 1e-8);
-}
-
-TEST(SolverBasisAdapter, UnsupportedHessianFamiliesRemainNoOp) {
-    Array<double> xi(1, 1);
-    xi(0, 0) = 0.0;
-    Array3<double> Nxx(1, 1, 1);
-
-    for (const auto unsupported : {consts::ElementType::NRB, consts::ElementType::PNT}) {
-        fill_array3(Nxx, 42.0);
-        nn::get_gn_nxx(1, 1, unsupported, 1, 0, xi, Nxx);
-        EXPECT_DOUBLE_EQ(Nxx(0, 0, 0), 42.0)
-            << "element=" << static_cast<int>(unsupported);
-    }
-}
-
-TEST(SolverBasisAdapter, InitFsMshPopulatesMappedHessiansWithoutLShpFGate) {
-    using consts::ElementType;
-    const SolverBasisAdapterCase cases[] = {
-        {ElementType::QUD4, ElementType::QUD4, 2, 4, 4},
-        {ElementType::HEX8, ElementType::HEX8, 3, 8, 8},
-        {ElementType::HEX20, ElementType::HEX20, 3, 20, 27},
-        {ElementType::HEX27, ElementType::HEX27, 3, 27, 27},
-        {ElementType::WDG, ElementType::WDG, 3, 6, 6},
-    };
-
-    for (const auto& c : cases) {
-        SCOPED_TRACE("element=" + std::to_string(static_cast<int>(c.type)));
-        ComMod com_mod;
-        com_mod.nsd = c.insd;
-        mshType mesh = initialized_mesh_for_case(c, true);
-
-        fs::init_fs_msh(com_mod, mesh);
-
-        ASSERT_EQ(mesh.fs.size(), 1u);
-        ASSERT_EQ(mesh.fs[0].Nxx.nrows(), packed_hessian_components(c.insd));
-        if (c.type == ElementType::QUD4) {
-            EXPECT_TRUE(array3_has_nonzero_component(mesh.fs[0].Nxx, 2, 2e-12));
-        } else if (c.type == ElementType::HEX8) {
-            EXPECT_TRUE(array3_has_nonzero_component(mesh.fs[0].Nxx, 3, 2e-12));
-        } else if (c.type == ElementType::WDG) {
-            EXPECT_TRUE(array3_has_nonzero_component(mesh.fs[0].Nxx, 5, 2e-12));
-        } else {
-            bool has_nonzero = false;
-            for (int row = 0; row < mesh.fs[0].Nxx.nrows(); ++row) {
-                has_nonzero = has_nonzero ||
-                    array3_has_nonzero_component(mesh.fs[0].Nxx, row, 2e-12);
-            }
-            EXPECT_TRUE(has_nonzero);
-        }
-    }
-}
-
-TEST(LagrangeBasis, QuadPartitionOfUnity) {
-    LagrangeBasis basis(ElementType::Quad4, 1);
-    svmp::FE::math::Vector<Real, 3> xi{0.2, -0.3, 0.0};
-
-    std::vector<Real> values;
-    basis.evaluate_values(xi, values);
-
-    double sum = std::accumulate(values.begin(), values.end(), 0.0);
-    EXPECT_NEAR(sum, 1.0, 1e-12);
-}
-
-TEST(LagrangeBasis, LineGradientLinear) {
-    LagrangeBasis basis(ElementType::Line2, 1);
-    svmp::FE::math::Vector<Real, 3> xi{0.0, 0.0, 0.0};
-    std::vector<Gradient> grad;
-    basis.evaluate_gradients(xi, grad);
-
-    ASSERT_EQ(grad.size(), 2u);
-    EXPECT_NEAR(grad[0][0], -0.5, 1e-12);
-    EXPECT_NEAR(grad[1][0], 0.5, 1e-12);
-}
-
-TEST(LagrangeBasis, TrianglePartitionOfUnity) {
-    LagrangeBasis basis(ElementType::Triangle3, 1);
-    svmp::FE::math::Vector<Real, 3> xi{0.2, 0.3, 0.0};
-    std::vector<Real> values;
-    basis.evaluate_values(xi, values);
-
-    double sum = std::accumulate(values.begin(), values.end(), 0.0);
-    EXPECT_NEAR(sum, 1.0, 1e-12);
-}
-
-TEST(LagrangeBasis, SizeFormulasPerElement) {
-    for (int order = 0; order <= 3; ++order) {
-        {
-            LagrangeBasis line(ElementType::Line2, order);
-            EXPECT_EQ(line.size(), static_cast<std::size_t>(order + 1));
-        }
-        {
-            LagrangeBasis quad(ElementType::Quad4, order);
-            const std::size_t n1d = static_cast<std::size_t>(order + 1);
-            EXPECT_EQ(quad.size(), n1d * n1d);
-        }
-        {
-            LagrangeBasis hex(ElementType::Hex8, order);
-            const std::size_t n1d = static_cast<std::size_t>(order + 1);
-            EXPECT_EQ(hex.size(), n1d * n1d * n1d);
-        }
-        {
-            LagrangeBasis tri(ElementType::Triangle3, order);
-            const std::size_t expected =
-                static_cast<std::size_t>(order + 1) *
-                static_cast<std::size_t>(order + 2) / 2;
-            EXPECT_EQ(tri.size(), expected);
-        }
-        {
-            LagrangeBasis tet(ElementType::Tetra4, order);
-            const std::size_t expected =
-                static_cast<std::size_t>(order + 1) *
-                static_cast<std::size_t>(order + 2) *
-                static_cast<std::size_t>(order + 3) / 6;
-            EXPECT_EQ(tet.size(), expected);
-        }
-    }
-}
-
-TEST(LagrangeBasis, KroneckerDeltaAtNodes) {
-    const std::vector<std::pair<ElementType, int>> cases = {
-        {ElementType::Line2, 1},
-        {ElementType::Quad4, 1},
-        {ElementType::Triangle3, 1},
-        {ElementType::Tetra4, 1},
-        {ElementType::Hex8, 1},
-        {ElementType::Triangle3, 2},
-        {ElementType::Tetra4, 2},
-        {ElementType::Quad4, 2},
-        {ElementType::Hex8, 2},
-        {ElementType::Wedge6, 2}
-    };
-
-    for (const auto& c : cases) {
-        LagrangeBasis basis(c.first, c.second);
-        const auto& nodes = basis.nodes();
-        ASSERT_EQ(nodes.size(), basis.size());
-
-        for (std::size_t i = 0; i < nodes.size(); ++i) {
-            std::vector<Real> vals;
-            basis.evaluate_values(nodes[i], vals);
-            ASSERT_EQ(vals.size(), nodes.size());
-            for (std::size_t j = 0; j < nodes.size(); ++j) {
-                if (i == j) {
-                    EXPECT_NEAR(vals[j], 1.0, 1e-12);
-                } else {
-                    EXPECT_NEAR(vals[j], 0.0, 1e-12);
-                }
-            }
-        }
-    }
-}
-
-TEST(LagrangeBasis, MatchesNodeOrderingConventionsForLinearAndQuadratic) {
-    // Tensor-product elements
-    expect_nodes_match_node_ordering(ElementType::Line2, 1, ElementType::Line2);
-    expect_nodes_match_node_ordering(ElementType::Line2, 2, ElementType::Line3);
-    expect_nodes_match_node_ordering(ElementType::Quad4, 1, ElementType::Quad4);
-    expect_nodes_match_node_ordering(ElementType::Quad4, 2, ElementType::Quad9);
-    expect_nodes_match_node_ordering(ElementType::Hex8, 1, ElementType::Hex8);
-    expect_nodes_match_node_ordering(ElementType::Hex8, 2, ElementType::Hex27);
-
-    // Simplex elements
-    expect_nodes_match_node_ordering(ElementType::Triangle3, 1, ElementType::Triangle3);
-    expect_nodes_match_node_ordering(ElementType::Triangle3, 2, ElementType::Triangle6);
-    expect_nodes_match_node_ordering(ElementType::Tetra4, 1, ElementType::Tetra4);
-    expect_nodes_match_node_ordering(ElementType::Tetra4, 2, ElementType::Tetra10);
-
-    // Mixed topology
-    expect_nodes_match_node_ordering(ElementType::Wedge6, 1, ElementType::Wedge6);
-    expect_nodes_match_node_ordering(ElementType::Wedge6, 2, ElementType::Wedge18);
-
-    // Pyramid
-    expect_nodes_match_node_ordering(ElementType::Pyramid5, 1, ElementType::Pyramid5);
-    expect_nodes_match_node_ordering(ElementType::Pyramid14, 2, ElementType::Pyramid14);
-}
-
-TEST(LagrangeBasis, WedgeAndPyramidPartitionOfUnity) {
-    {
-        LagrangeBasis wedge(ElementType::Wedge6, 1);
-        svmp::FE::math::Vector<Real, 3> xi{Real(0.2), Real(0.1), Real(0.3)};
-        std::vector<Real> vals;
-        wedge.evaluate_values(xi, vals);
-        const double sum = std::accumulate(vals.begin(), vals.end(), 0.0);
-        EXPECT_NEAR(sum, 1.0, 1e-12);
-    }
-
-    {
-        LagrangeBasis wedge_q(ElementType::Wedge18, 2);
-        svmp::FE::math::Vector<Real, 3> xi{Real(0.2), Real(0.1), Real(-0.25)};
-        std::vector<Real> vals;
-        wedge_q.evaluate_values(xi, vals);
-        const double sum = std::accumulate(vals.begin(), vals.end(), 0.0);
-        EXPECT_NEAR(sum, 1.0, 1e-12);
-
-        // Wedge18 should report 18 nodes in ReferenceNodeLayout
-        EXPECT_EQ(ReferenceNodeLayout::num_nodes(ElementType::Wedge18), 18u);
-        // Corner nodes should match Wedge6 vertices
-        auto v0 = ReferenceNodeLayout::get_node_coords(ElementType::Wedge18, 0);
-        auto v1 = ReferenceNodeLayout::get_node_coords(ElementType::Wedge18, 1);
-        auto v2 = ReferenceNodeLayout::get_node_coords(ElementType::Wedge18, 2);
-        EXPECT_NEAR(v0[0], Real(0), 1e-14);
-        EXPECT_NEAR(v0[1], Real(0), 1e-14);
-        EXPECT_NEAR(v0[2], Real(-1), 1e-14);
-        EXPECT_NEAR(v1[0], Real(1), 1e-14);
-        EXPECT_NEAR(v1[1], Real(0), 1e-14);
-        EXPECT_NEAR(v1[2], Real(-1), 1e-14);
-        EXPECT_NEAR(v2[0], Real(0), 1e-14);
-        EXPECT_NEAR(v2[1], Real(1), 1e-14);
-        EXPECT_NEAR(v2[2], Real(-1), 1e-14);
-    }
-
-    {
-        LagrangeBasis pyr(ElementType::Pyramid5, 1);
-        svmp::FE::math::Vector<Real, 3> xi{Real(0.1), Real(-0.2), Real(0.4)};
-        std::vector<Real> vals;
-        pyr.evaluate_values(xi, vals);
-        const double sum = std::accumulate(vals.begin(), vals.end(), 0.0);
-        EXPECT_NEAR(sum, 1.0, 1e-12);
-    }
-}
-
-TEST(LagrangeBasis, NonTensorStridedEvaluationMatchesPointwise) {
-    const std::vector<std::pair<ElementType, int>> cases = {
-        {ElementType::Triangle3, 3},
-        {ElementType::Tetra4, 3},
-        {ElementType::Wedge6, 3},
-        {ElementType::Pyramid5, 3},
-    };
-    const std::vector<StridedOutputRequest> requests = {
-        {true, false, false},
-        {false, true, false},
-        {false, false, true},
-        {true, true, false},
-        {true, false, true},
-        {false, true, true},
-        {true, true, true},
-    };
-
-    for (const auto& [type, order] : cases) {
-        for (const auto& request : requests) {
-            SCOPED_TRACE(static_cast<int>(type));
-            SCOPED_TRACE(order);
-            SCOPED_TRACE(request.values ? "values" : "no values");
-            SCOPED_TRACE(request.gradients ? "gradients" : "no gradients");
-            SCOPED_TRACE(request.hessians ? "hessians" : "no hessians");
-            expect_strided_matches_pointwise(type, order, request);
-        }
-    }
-}
-
-TEST(LagrangeBasis, RawOutputSinksMatchVectorEvaluationAcrossTopologies) {
-    const std::vector<std::pair<ElementType, int>> cases = {
-        {ElementType::Line2, 4},
-        {ElementType::Quad4, 3},
-        {ElementType::Hex8, 3},
-        {ElementType::Triangle3, 4},
-        {ElementType::Tetra4, 3},
-        {ElementType::Wedge6, 3},
-        {ElementType::Pyramid5, 3},
-    };
-
-    for (const auto& [type, order] : cases) {
-        SCOPED_TRACE(static_cast<int>(type));
-        SCOPED_TRACE(order);
-        expect_raw_to_matches_vector_evaluation(type, order);
-    }
-}
-
-TEST(LagrangeBasis, CanonicalConstructorsSupportArbitraryOrders) {
-    const struct Case {
-        ElementType type;
-        int max_order;
-    } cases[] = {
-        {ElementType::Line2, 8},
-        {ElementType::Triangle3, 6},
-        {ElementType::Quad4, 6},
-        {ElementType::Tetra4, 5},
-        {ElementType::Hex8, 5},
-        {ElementType::Wedge6, 5},
-        {ElementType::Pyramid5, 5},
-    };
-
-    for (const auto& c : cases) {
-        for (int order = 0; order <= c.max_order; ++order) {
-            LagrangeBasis basis(c.type, order);
-            EXPECT_EQ(basis.element_type(), c.type);
-            EXPECT_EQ(basis.order(), order);
-            EXPECT_EQ(basis.dimension(), expected_dimension(c.type));
-            EXPECT_EQ(basis.size(), expected_lagrange_size(c.type, order));
-            EXPECT_EQ(basis.nodes().size(), basis.size());
-        }
-    }
-}
-
-TEST(LagrangeBasis, AliasVariantsNormalizeToCanonicalPaths) {
-    expect_alias_matches_canonical(
-        ElementType::Line3, ElementType::Line2, 2, sample_points_for(ElementType::Line2));
-    expect_alias_matches_canonical(
-        ElementType::Triangle6, ElementType::Triangle3, 2, sample_points_for(ElementType::Triangle3));
-    expect_alias_matches_canonical(
-        ElementType::Quad9, ElementType::Quad4, 2, sample_points_for(ElementType::Quad4));
-    expect_alias_matches_canonical(
-        ElementType::Tetra10, ElementType::Tetra4, 2, sample_points_for(ElementType::Tetra4));
-    expect_alias_matches_canonical(
-        ElementType::Hex27, ElementType::Hex8, 2, sample_points_for(ElementType::Hex8));
-    expect_alias_matches_canonical(
-        ElementType::Wedge18, ElementType::Wedge6, 2, sample_points_for(ElementType::Wedge6));
-    expect_alias_matches_canonical(
-        ElementType::Pyramid14, ElementType::Pyramid5, 2, sample_points_for(ElementType::Pyramid5),
-        Real(2e-10));
-}
 
-TEST(LagrangeBasis, SerendipityVariantsRemainRejected) {
-    EXPECT_THROW((void)LagrangeBasis(ElementType::Quad8, 2), svmp::FE::FEException);
-    EXPECT_THROW((void)LagrangeBasis(ElementType::Hex20, 2), svmp::FE::FEException);
-    EXPECT_THROW((void)LagrangeBasis(ElementType::Wedge15, 2), svmp::FE::FEException);
-    EXPECT_THROW((void)LagrangeBasis(ElementType::Pyramid13, 2), svmp::FE::FEException);
-}
+#include "FE/Basis/BasisExceptions.h"
+#include "FE/Basis/BasisFactory.h"
+#include "FE/Basis/LagrangeBasis.h"
+#include "FE/Basis/NodeOrderingConventions.h"
 
-TEST(LagrangeBasis, GeneratedNodeOrderingIsDeterministicAcrossOrders) {
-    const struct Case {
-        ElementType type;
-        int max_order;
-    } cases[] = {
-        {ElementType::Line2, 8},
-        {ElementType::Triangle3, 6},
-        {ElementType::Quad4, 6},
-        {ElementType::Tetra4, 5},
-        {ElementType::Hex8, 5},
-        {ElementType::Wedge6, 5},
-        {ElementType::Pyramid5, 5},
-    };
+#include <array>
+#include <tuple>
+#include <vector>
 
-    for (const auto& c : cases) {
-        for (int order = 0; order <= c.max_order; ++order) {
-            const auto generated_a = ReferenceNodeLayout::get_lagrange_node_coords(c.type, order);
-            const auto generated_b = ReferenceNodeLayout::get_lagrange_node_coords(c.type, order);
-            ASSERT_EQ(generated_a.size(), expected_lagrange_size(c.type, order));
-            ASSERT_EQ(generated_a.size(), generated_b.size());
-            for (std::size_t i = 0; i < generated_a.size(); ++i) {
-                EXPECT_TRUE(points_close(generated_a[i], generated_b[i]));
-            }
-        }
-    }
-}
+using namespace svmp::FE;
+using namespace svmp::FE::basis;
 
-TEST(LagrangeBasis, NodeOrderingMatchesReferenceCoordinateOracles) {
-    const std::array<ElementType, 18> cases = {
-        ElementType::Line2, ElementType::Line3,
-        ElementType::Triangle3, ElementType::Triangle6,
-        ElementType::Quad4, ElementType::Quad8, ElementType::Quad9,
-        ElementType::Tetra4, ElementType::Tetra10,
-        ElementType::Hex8, ElementType::Hex20, ElementType::Hex27,
-        ElementType::Wedge6, ElementType::Wedge15, ElementType::Wedge18,
-        ElementType::Pyramid5, ElementType::Pyramid13, ElementType::Pyramid14,
-    };
+namespace {
 
-    for (ElementType type : cases) {
-        const auto expected = reference_node_coords(type);
-        ASSERT_FALSE(expected.empty());
-        ASSERT_EQ(ReferenceNodeLayout::num_nodes(type), expected.size());
-        for (std::size_t i = 0; i < expected.size(); ++i) {
-            const auto actual = ReferenceNodeLayout::get_node_coords(type, i);
-            EXPECT_TRUE(points_close(actual, expected[i]))
-                << "Element type " << static_cast<int>(type)
-                << ", node " << i;
-        }
-    }
-}
+using Point = math::Vector<Real, 3>;
 
-TEST(LagrangeBasis, GeneratedLowOrderOrderingMatchesPublicAliasPaths) {
-    const struct Case {
-        ElementType type;
-        int order;
-        ElementType public_alias;
-    } cases[] = {
-        {ElementType::Line2, 1, ElementType::Line2},
-        {ElementType::Line2, 2, ElementType::Line3},
-        {ElementType::Triangle3, 1, ElementType::Triangle3},
-        {ElementType::Triangle3, 2, ElementType::Triangle6},
-        {ElementType::Quad4, 1, ElementType::Quad4},
-        {ElementType::Quad4, 2, ElementType::Quad9},
-        {ElementType::Tetra4, 1, ElementType::Tetra4},
-        {ElementType::Tetra4, 2, ElementType::Tetra10},
-        {ElementType::Hex8, 1, ElementType::Hex8},
-        {ElementType::Hex8, 2, ElementType::Hex27},
-        {ElementType::Wedge6, 1, ElementType::Wedge6},
-        {ElementType::Wedge6, 2, ElementType::Wedge18},
-        {ElementType::Pyramid5, 1, ElementType::Pyramid5},
-        {ElementType::Pyramid5, 2, ElementType::Pyramid14},
-    };
+struct CanonicalCase {
+    ElementType type;
+    int order;
+    std::size_t size;
+    int dimension;
+    std::vector<Point> points;
+    Real derivative_tol;
+};
 
-    for (const auto& c : cases) {
-        const auto generated = ReferenceNodeLayout::get_lagrange_node_coords(c.type, c.order);
-        ASSERT_EQ(generated.size(), ReferenceNodeLayout::num_nodes(c.public_alias));
-        for (std::size_t i = 0; i < generated.size(); ++i) {
-            const auto public_alias = ReferenceNodeLayout::get_node_coords(c.public_alias, i);
-            EXPECT_TRUE(points_close(generated[i], public_alias));
-        }
-    }
+const std::vector<CanonicalCase>& canonical_cases() {
+    static const std::vector<CanonicalCase> cases = {
+        {ElementType::Line2, 3, 4u, 1,
+         {{Real(-0.35), Real(0), Real(0)}, {Real(0.2), Real(0), Real(0)}},
+         Real(1e-11)},
+        {ElementType::Triangle3, 3, 10u, 2,
+         {{Real(0.15), Real(0.2), Real(0)}, {Real(0.25), Real(0.1), Real(0)}},
+         Real(1e-9)},
+        {ElementType::Quad4, 3, 16u, 2,
+         {{Real(0.2), Real(-0.3), Real(0)}, {Real(-0.45), Real(0.25), Real(0)}},
+         Real(1e-11)},
+        {ElementType::Tetra4, 2, 10u, 3,
+         {{Real(0.12), Real(0.18), Real(0.16)}, {Real(0.2), Real(0.1), Real(0.18)}},
+         Real(1e-9)},
+        {ElementType::Hex8, 2, 27u, 3,
+         {{Real(0.1), Real(-0.2), Real(0.3)}, {Real(-0.35), Real(0.25), Real(-0.15)}},
+         Real(1e-10)},
+        {ElementType::Wedge6, 2, 18u, 3,
+         {{Real(0.18), Real(0.22), Real(-0.2)}, {Real(0.12), Real(0.16), Real(0.1)}},
+         Real(1e-9)},
+    };
+    return cases;
 }
 
-TEST(LagrangeBasis, KroneckerDeltaAcrossCanonicalTopologiesAndOrders) {
-    const struct Case {
-        ElementType type;
-        int max_order;
-    } cases[] = {
-        {ElementType::Line2, 8},
-        {ElementType::Triangle3, 6},
-        {ElementType::Quad4, 6},
-        {ElementType::Tetra4, 5},
-        {ElementType::Hex8, 5},
-        {ElementType::Wedge6, 5},
-        {ElementType::Pyramid5, 5},
-    };
-
-    for (const auto& c : cases) {
-        for (int order = 0; order <= c.max_order; ++order) {
-            LagrangeBasis basis(c.type, order);
-            ASSERT_EQ(basis.size(), expected_lagrange_size(c.type, order));
-
-            std::vector<Real> values;
-            for (std::size_t node_i = 0; node_i < basis.size(); ++node_i) {
-                basis.evaluate_values(basis.nodes()[node_i], values);
-                ASSERT_EQ(values.size(), basis.size());
-                for (std::size_t basis_i = 0; basis_i < basis.size(); ++basis_i) {
-                    EXPECT_NEAR(values[basis_i], basis_i == node_i ? Real(1) : Real(0), Real(2e-10))
-                        << "Element type " << static_cast<int>(c.type)
-                        << ", order " << order
-                        << ", node " << node_i
-                        << ", basis " << basis_i;
-                }
-            }
+std::vector<Point> sample_points_for(ElementType type) {
+    for (const auto& c : canonical_cases()) {
+        if (c.type == type) {
+            return c.points;
         }
     }
+    return {};
 }
 
-TEST(LagrangeBasis, PartitionGradientAndHessianSumsAcrossCanonicalTopologiesAndOrders) {
-    const struct Case {
-        ElementType type;
-        int max_order;
-        Real tol;
-    } cases[] = {
-        {ElementType::Line2, 8, Real(1e-11)},
-        {ElementType::Triangle3, 6, Real(1e-10)},
-        {ElementType::Quad4, 6, Real(1e-10)},
-        {ElementType::Tetra4, 5, Real(2e-10)},
-        {ElementType::Hex8, 5, Real(2e-10)},
-        {ElementType::Wedge6, 5, Real(5e-10)},
-        {ElementType::Pyramid5, 5, Real(5e-7)},
-    };
+void expect_kronecker_at_nodes(const LagrangeBasis& basis, Real tol)
+{
+    const auto& nodes = basis.nodes();
+    ASSERT_EQ(nodes.size(), basis.size());
 
-    for (const auto& c : cases) {
-        for (int order = 0; order <= c.max_order; ++order) {
-            LagrangeBasis basis(c.type, order);
-            expect_partition_gradient_hessian_sums(basis, dense_sample_points_for(c.type), c.tol, c.tol);
+    std::vector<Real> values;
+    for (std::size_t node = 0; node < nodes.size(); ++node) {
+        basis.evaluate_values(nodes[node], values);
+        ASSERT_EQ(values.size(), basis.size());
+        for (std::size_t i = 0; i < values.size(); ++i) {
+            EXPECT_NEAR(values[i], i == node ? Real(1) : Real(0), tol)
+                << "node=" << node << " basis=" << i;
         }
     }
 }
 
-TEST(LagrangeBasis, SimplexAxisScratchDynamicFallbackForHighOrder) {
-    const struct Case {
-        ElementType type;
-        int order;
-        Point point;
-        Real tolerance;
-    } cases[] = {
-        {ElementType::Triangle3, 13, Point{Real(0.19), Real(0.31), Real(0)}, Real(1e-8)},
-        {ElementType::Tetra4, 13, Point{Real(0.13), Real(0.17), Real(0.19)}, Real(1e-7)},
-    };
-
-    for (const auto& c : cases) {
-        LagrangeBasis basis(c.type, c.order);
+void expect_partition_gradient_hessian_sums(const LagrangeBasis& basis,
+                                            const std::vector<Point>& points,
+                                            Real derivative_tol)
+{
+    for (const auto& xi : points) {
         std::vector<Real> values;
         std::vector<Gradient> gradients;
         std::vector<Hessian> hessians;
-        basis.evaluate_all(c.point, values, gradients, hessians);
-
-        ASSERT_EQ(values.size(), basis.size());
-        ASSERT_EQ(gradients.size(), basis.size());
-        ASSERT_EQ(hessians.size(), basis.size());
+        basis.evaluate_all(xi, values, gradients, hessians);
 
         Real value_sum = Real(0);
         Gradient gradient_sum{};
         Hessian hessian_sum{};
-        for (std::size_t i = 0; i < basis.size(); ++i) {
+        for (std::size_t i = 0; i < values.size(); ++i) {
             value_sum += values[i];
             for (std::size_t d = 0; d < 3u; ++d) {
                 gradient_sum[d] += gradients[i][d];
@@ -2253,776 +102,297 @@ TEST(LagrangeBasis, SimplexAxisScratchDynamicFallbackForHighOrder) {
             }
         }
 
-        EXPECT_NEAR(value_sum, Real(1), c.tolerance);
-        for (std::size_t d = 0; d < 3u; ++d) {
-            EXPECT_NEAR(gradient_sum[d], Real(0), c.tolerance);
-            for (std::size_t e = 0; e < 3u; ++e) {
-                EXPECT_NEAR(hessian_sum(d, e), Real(0), Real(10) * c.tolerance);
+        EXPECT_NEAR(value_sum, Real(1), Real(1e-12));
+        for (int d = 0; d < basis.dimension(); ++d) {
+            EXPECT_NEAR(gradient_sum[static_cast<std::size_t>(d)], Real(0), derivative_tol);
+            for (int e = 0; e < basis.dimension(); ++e) {
+                EXPECT_NEAR(hessian_sum(static_cast<std::size_t>(d),
+                                        static_cast<std::size_t>(e)),
+                            Real(0),
+                            derivative_tol);
             }
         }
     }
 }
 
-TEST(LagrangeBasis, HighOrderAxisNearNodeMaintainsPartitionAndDerivativeSums) {
-    const int order = 16;
-    const LagrangeBasis basis(ElementType::Line2, order);
-    const Real node = Real(-1) + Real(2 * 5) / static_cast<Real>(order);
-    const Point point{node + Real(1e-7), Real(0), Real(0)};
-
+void expect_raw_sinks_match_vector_evaluation(const LagrangeBasis& basis,
+                                              const Point& xi)
+{
     std::vector<Real> values;
     std::vector<Gradient> gradients;
     std::vector<Hessian> hessians;
-    basis.evaluate_all(point, values, gradients, hessians);
-    ASSERT_EQ(values.size(), basis.size());
-
-    Real value_sum = Real(0);
-    Real gradient_sum = Real(0);
-    Real hessian_sum = Real(0);
-    for (std::size_t i = 0; i < basis.size(); ++i) {
-        value_sum += values[i];
-        gradient_sum += gradients[i][0];
-        hessian_sum += hessians[i](0, 0);
-    }
-
-    EXPECT_NEAR(value_sum, Real(1), Real(1e-12));
-    EXPECT_NEAR(gradient_sum, Real(0), Real(1e-8));
-    EXPECT_NEAR(hessian_sum, Real(0), Real(1e-5));
-}
-
-TEST(LagrangeBasis, PyramidFaceTracesMatchLowerDimensionalLagrangeBases) {
-    const PyramidFace faces[] = {
-        PyramidFace::Base,
-        PyramidFace::South,
-        PyramidFace::East,
-        PyramidFace::North,
-        PyramidFace::West,
-    };
-
-    for (int order = 1; order <= 5; ++order) {
-        for (const auto face : faces) {
-            expect_pyramid_face_trace_matches_lower_basis(
-                order, face, face == PyramidFace::Base ? Real(2e-10) : Real(5e-10));
-        }
-    }
-}
-
-TEST(LagrangeBasis, PyramidEdgeTracesMatchLineLagrangeBasis) {
-    const PyramidEdge edges[] = {
-        PyramidEdge::BaseSouth,
-        PyramidEdge::BaseEast,
-        PyramidEdge::BaseNorth,
-        PyramidEdge::BaseWest,
-        PyramidEdge::VerticalSW,
-        PyramidEdge::VerticalSE,
-        PyramidEdge::VerticalNE,
-        PyramidEdge::VerticalNW,
-    };
-
-    for (int order = 1; order <= 5; ++order) {
-        for (const auto edge : edges) {
-            expect_pyramid_edge_trace_matches_line_basis(order, edge, Real(5e-10));
-        }
-    }
-}
+    basis.evaluate_all(xi, values, gradients, hessians);
 
-TEST(LagrangeBasis, Pyramid14RationalNodalAndPartition) {
-    using svmp::FE::basis::ReferenceNodeLayout;
+    std::vector<Real> flat_values(basis.size());
+    std::vector<Real> flat_gradients(basis.size() * 3u);
+    std::vector<Real> flat_hessians(basis.size() * 9u);
+    basis.evaluate_values_to(xi, flat_values.data());
+    basis.evaluate_gradients_to(xi, flat_gradients.data());
+    basis.evaluate_hessians_to(xi, flat_hessians.data());
 
-    LagrangeBasis basis(ElementType::Pyramid14, 2);
-    EXPECT_EQ(basis.dimension(), 3);
-    EXPECT_EQ(basis.size(), 14u);
-
-    // Kronecker nodal property at all Pyramid14 nodes
     for (std::size_t i = 0; i < basis.size(); ++i) {
-        auto xi = ReferenceNodeLayout::get_node_coords(ElementType::Pyramid14, i);
-        std::vector<Real> vals;
-        basis.evaluate_values(xi, vals);
-        ASSERT_EQ(vals.size(), basis.size());
-        for (std::size_t j = 0; j < basis.size(); ++j) {
-            const double expected = (i == j) ? 1.0 : 0.0;
-            EXPECT_NEAR(vals[j], expected, 1e-12);
+        EXPECT_NEAR(flat_values[i], values[i], Real(1e-14));
+        for (std::size_t d = 0; d < 3u; ++d) {
+            EXPECT_NEAR(flat_gradients[i * 3u + d], gradients[i][d], Real(1e-14));
+            for (std::size_t e = 0; e < 3u; ++e) {
+                EXPECT_NEAR(flat_hessians[i * 9u + d * 3u + e],
+                            hessians[i](d, e),
+                            Real(1e-14));
+            }
         }
     }
-
-    // Partition of unity at an interior point
-    svmp::FE::math::Vector<Real, 3> xi{Real(0.1), Real(-0.2), Real(0.3)};
-    std::vector<Real> vals;
-    basis.evaluate_values(xi, vals);
-    const double sum = std::accumulate(vals.begin(), vals.end(), 0.0);
-    EXPECT_NEAR(sum, 1.0, 1e-12);
 }
 
-TEST(LagrangeBasis, Pyramid14GradientSumZero) {
-    LagrangeBasis basis(ElementType::Pyramid14, 2);
-    svmp::FE::math::Vector<Real, 3> xi{Real(0.15), Real(-0.1), Real(0.3)};
-
-    std::vector<Gradient> grads;
-    basis.evaluate_gradients(xi, grads);
-    ASSERT_EQ(grads.size(), basis.size());
-
-    Gradient sum{};
-    for (const auto& g : grads) {
-        sum[0] += g[0];
-        sum[1] += g[1];
-        sum[2] += g[2];
+void expect_nodes_close(const std::vector<Point>& lhs,
+                        const std::vector<Point>& rhs,
+                        Real tol)
+{
+    ASSERT_EQ(lhs.size(), rhs.size());
+    for (std::size_t i = 0; i < lhs.size(); ++i) {
+        EXPECT_NEAR(lhs[i][0], rhs[i][0], tol) << "node=" << i;
+        EXPECT_NEAR(lhs[i][1], rhs[i][1], tol) << "node=" << i;
+        EXPECT_NEAR(lhs[i][2], rhs[i][2], tol) << "node=" << i;
     }
-    EXPECT_NEAR(sum[0], 0.0, 1e-8);
-    EXPECT_NEAR(sum[1], 0.0, 1e-8);
-    EXPECT_NEAR(sum[2], 0.0, 1e-8);
 }
 
-TEST(LagrangeBasis, HigherOrderP4KroneckerAndPartition) {
-    struct Case {
-        ElementType type;
-        int order;
-        svmp::FE::math::Vector<Real, 3> xi;
-    };
-
-    const std::vector<Case> cases = {
-        {ElementType::Line2, 4, {Real(0.11), Real(0), Real(0)}},
-        {ElementType::Quad4, 4, {Real(0.2), Real(-0.3), Real(0)}},
-        {ElementType::Triangle3, 4, {Real(0.2), Real(0.3), Real(0)}},
-        {ElementType::Hex8, 4, {Real(0.2), Real(-0.3), Real(0.4)}},
-    };
-
-    for (const auto& c : cases) {
-        LagrangeBasis basis(c.type, c.order);
-
-        // Partition of unity at an interior point
-        std::vector<Real> values;
-        basis.evaluate_values(c.xi, values);
-        const double sum = std::accumulate(values.begin(), values.end(), 0.0);
-        EXPECT_NEAR(sum, 1.0, 1e-12);
+void expect_evaluations_match(const LagrangeBasis& lhs,
+                              const LagrangeBasis& rhs,
+                              const std::vector<Point>& points,
+                              Real tol)
+{
+    ASSERT_EQ(lhs.size(), rhs.size());
 
-        // Kronecker delta property at all nodes
-        const auto& nodes = basis.nodes();
-        ASSERT_EQ(nodes.size(), basis.size());
-        for (std::size_t i = 0; i < nodes.size(); ++i) {
-            basis.evaluate_values(nodes[i], values);
-            ASSERT_EQ(values.size(), nodes.size());
-            for (std::size_t j = 0; j < nodes.size(); ++j) {
-                const double expected = (i == j) ? 1.0 : 0.0;
-                EXPECT_NEAR(values[j], expected, 1e-12);
+    for (const auto& xi : points) {
+        std::vector<Real> lhs_values;
+        std::vector<Real> rhs_values;
+        std::vector<Gradient> lhs_gradients;
+        std::vector<Gradient> rhs_gradients;
+        std::vector<Hessian> lhs_hessians;
+        std::vector<Hessian> rhs_hessians;
+
+        lhs.evaluate_all(xi, lhs_values, lhs_gradients, lhs_hessians);
+        rhs.evaluate_all(xi, rhs_values, rhs_gradients, rhs_hessians);
+
+        for (std::size_t i = 0; i < lhs.size(); ++i) {
+            EXPECT_NEAR(lhs_values[i], rhs_values[i], tol);
+            for (std::size_t d = 0; d < 3u; ++d) {
+                EXPECT_NEAR(lhs_gradients[i][d], rhs_gradients[i][d], tol);
+                for (std::size_t e = 0; e < 3u; ++e) {
+                    EXPECT_NEAR(lhs_hessians[i](d, e), rhs_hessians[i](d, e), tol);
+                }
             }
         }
     }
 }
 
-TEST(LagrangeBasis, Pyramid14InterpolatesQuadraticPolynomials) {
-    using svmp::FE::basis::ReferenceNodeLayout;
-
-    LagrangeBasis basis(ElementType::Pyramid14, 2);
-    const std::size_t n = basis.size();
-
-    // Precompute nodal coordinates
-    std::vector<svmp::FE::math::Vector<Real,3>> nodes;
-    nodes.reserve(n);
-    for (std::size_t i = 0; i < n; ++i) {
-        nodes.push_back(ReferenceNodeLayout::get_node_coords(ElementType::Pyramid14, i));
-    }
-
-    auto interpolate_and_check = [&](auto f, Real tol) {
-        // Nodal coefficients
-        std::vector<Real> coeffs(n);
-        for (std::size_t i = 0; i < n; ++i) {
-            const auto& x = nodes[i];
-            coeffs[i] = f(x[0], x[1], x[2]);
-        }
-
-        // Test at a few interior points
-        const svmp::FE::math::Vector<Real,3> test_pts[] = {
-            {Real(0.1), Real(-0.2), Real(0.2)},
-            {Real(-0.2), Real(0.15), Real(0.4)},
-            {Real(0.05), Real(0.05), Real(0.3)}
-        };
-
-        for (const auto& xi : test_pts) {
-            std::vector<Real> vals;
-            basis.evaluate_values(xi, vals);
-            ASSERT_EQ(vals.size(), n);
-
-            Real u_interp = Real(0);
-            for (std::size_t i = 0; i < n; ++i) {
-                u_interp += coeffs[i] * vals[i];
-            }
-
-            const Real u_exact = f(xi[0], xi[1], xi[2]);
-            EXPECT_NEAR(u_interp, u_exact, tol);
-        }
-    };
-
-    // Constant, linear and quadratic monomials
-    interpolate_and_check([](Real, Real, Real) { return Real(1); }, Real(1e-12));
-    interpolate_and_check([](Real x, Real, Real) { return x; }, Real(1e-11));
-    interpolate_and_check([](Real, Real y, Real) { return y; }, Real(1e-11));
-    interpolate_and_check([](Real, Real, Real z) { return z; }, Real(1e-11));
-    interpolate_and_check([](Real x, Real y, Real) { return x * y; }, Real(1e-10));
-    interpolate_and_check([](Real x, Real, Real z) { return x * z; }, Real(1e-10));
-    interpolate_and_check([](Real, Real y, Real z) { return y * z; }, Real(1e-10));
-    interpolate_and_check([](Real x, Real, Real) { return x * x; }, Real(1e-10));
-    interpolate_and_check([](Real, Real y, Real) { return y * y; }, Real(1e-10));
-    interpolate_and_check([](Real, Real, Real z) { return z * z; }, Real(1e-10));
+Real linear_function(const Point& p) {
+    return Real(2) + Real(3) * p[0] - Real(4) * p[1] + Real(5) * p[2];
 }
 
-TEST(LagrangeBasis, Pyramid14GradientMatchesLinearFunctionGradient) {
-    using svmp::FE::basis::ReferenceNodeLayout;
-
-    LagrangeBasis basis(ElementType::Pyramid14, 2);
-    const std::size_t n = basis.size();
-
-    // Nodal coordinates and coefficients for f(x,y,z) = ax + by + cz
-    const Real a = Real(1.2);
-    const Real b = Real(-0.7);
-    const Real c = Real(0.5);
-
-    std::vector<Real> coeffs(n);
-    for (std::size_t i = 0; i < n; ++i) {
-        const auto x = ReferenceNodeLayout::get_node_coords(ElementType::Pyramid14, i);
-        coeffs[i] = a * x[0] + b * x[1] + c * x[2];
-    }
-
-    const svmp::FE::math::Vector<Real,3> xi{Real(0.1), Real(-0.15), Real(0.35)};
-
-    std::vector<Gradient> grads;
-    basis.evaluate_gradients(xi, grads);
-    ASSERT_EQ(grads.size(), n);
-
-    Gradient g_interp{};
-    for (std::size_t i = 0; i < n; ++i) {
-        g_interp[0] += coeffs[i] * grads[i][0];
-        g_interp[1] += coeffs[i] * grads[i][1];
-        g_interp[2] += coeffs[i] * grads[i][2];
-    }
-
-    EXPECT_NEAR(g_interp[0], a, 1e-6);
-    EXPECT_NEAR(g_interp[1], b, 1e-6);
-    EXPECT_NEAR(g_interp[2], c, 1e-6);
+Gradient linear_gradient() {
+    Gradient g{};
+    g[0] = Real(3);
+    g[1] = Real(-4);
+    g[2] = Real(5);
+    return g;
 }
 
-TEST(LagrangeBasis, PyramidApexValuesRemainExactAcrossRepresentativeOrders) {
-    const struct Case {
-        ElementType type;
-        int order;
-    } cases[] = {
-        {ElementType::Pyramid5, 1},
-        {ElementType::Pyramid14, 2},
-        {ElementType::Pyramid5, 4},
-    };
-
-    const svmp::FE::math::Vector<Real, 3> apex{Real(0), Real(0), Real(1)};
-    for (const auto& c : cases) {
-        LagrangeBasis basis(c.type, c.order);
-        std::vector<Real> values;
-        basis.evaluate_values(apex, values);
-        ASSERT_EQ(values.size(), basis.size());
-
-        const auto& nodes = basis.nodes();
-        auto apex_it = std::find_if(
-            nodes.begin(), nodes.end(),
-            [](const auto& node) {
-                return std::abs(node[0]) <= Real(1e-14) &&
-                       std::abs(node[1]) <= Real(1e-14) &&
-                       std::abs(node[2] - Real(1)) <= Real(1e-14);
-            });
-        ASSERT_NE(apex_it, nodes.end());
-        const std::size_t apex_index = static_cast<std::size_t>(
-            std::distance(nodes.begin(), apex_it));
+Real quadratic_function(const Point& p) {
+    return Real(1) + Real(2) * p[0] - p[1] + Real(0.5) * p[2] +
+           p[0] * p[0] + Real(0.75) * p[1] * p[1] - Real(0.25) * p[2] * p[2] +
+           Real(0.2) * p[0] * p[1] - Real(0.3) * p[0] * p[2] +
+           Real(0.4) * p[1] * p[2];
+}
 
-        Real sum = Real(0);
-        for (std::size_t i = 0; i < values.size(); ++i) {
-            EXPECT_TRUE(std::isfinite(static_cast<double>(values[i])));
-            sum += values[i];
-            const Real expected = (i == apex_index) ? Real(1) : Real(0);
-            EXPECT_NEAR(values[i], expected, 1e-12)
-                << "order " << c.order << ", basis " << i;
-        }
-        EXPECT_NEAR(sum, Real(1), 1e-12);
+template<typename Function>
+Real interpolate_value(const LagrangeBasis& basis,
+                       const std::vector<Real>& values,
+                       Function&& nodal_function)
+{
+    Real result = Real(0);
+    const auto& nodes = basis.nodes();
+    for (std::size_t i = 0; i < values.size(); ++i) {
+        result += values[i] * nodal_function(nodes[i]);
     }
+    return result;
 }
 
-TEST(LagrangeBasis, PyramidGradientAtExactApexThrowsWhenLimitIsNotUnique) {
-    const struct Case {
-        ElementType type;
-        int order;
-    } cases[] = {
-        {ElementType::Pyramid5, 1},
-        {ElementType::Pyramid14, 2},
-        {ElementType::Pyramid5, 4},
-    };
+} // namespace
 
-    const svmp::FE::math::Vector<Real, 3> apex{Real(0), Real(0), Real(1)};
-    for (const auto& c : cases) {
+TEST(LagrangeBasis, CanonicalTopologiesHaveExpectedSizesAndDimensions) {
+    for (const auto& c : canonical_cases()) {
         LagrangeBasis basis(c.type, c.order);
-        std::vector<Gradient> gradients;
-        EXPECT_THROW(basis.evaluate_gradients(apex, gradients), svmp::FE::basis::BasisEvaluationException)
-            << "order " << c.order;
+        EXPECT_EQ(basis.basis_type(), BasisType::Lagrange);
+        EXPECT_EQ(basis.element_type(), c.type);
+        EXPECT_EQ(basis.order(), c.order);
+        EXPECT_EQ(basis.size(), c.size);
+        EXPECT_EQ(basis.dimension(), c.dimension);
     }
 }
 
-TEST(LagrangeBasis, PyramidApexValuesMatchDirectionalNearApexLimits) {
-    const struct Case {
-        ElementType type;
-        int order;
-        Real tol;
-    } cases[] = {
-        {ElementType::Pyramid5, 1, Real(3e-6)},
-        {ElementType::Pyramid14, 2, Real(4e-6)},
-        {ElementType::Pyramid5, 4, Real(1e-5)},
-    };
-
-    const std::array<std::array<Real, 2>, 4> directions = {{
-        {Real(0), Real(0)},
-        {Real(0.35), Real(-0.25)},
-        {Real(-0.50), Real(0.45)},
-        {Real(0.20), Real(0.60)},
-    }};
-    const Real t = Real(1e-6);
-    const svmp::FE::math::Vector<Real, 3> apex{Real(0), Real(0), Real(1)};
-
-    for (const auto& c : cases) {
+TEST(LagrangeBasis, CanonicalTopologiesAreNodalAndPartitionUnity) {
+    for (const auto& c : canonical_cases()) {
         LagrangeBasis basis(c.type, c.order);
-        std::vector<Real> apex_values;
-        basis.evaluate_values(apex, apex_values);
-
-        for (const auto& direction : directions) {
-            const svmp::FE::math::Vector<Real, 3> xi{
-                t * direction[0],
-                t * direction[1],
-                Real(1) - t
-            };
-
-            std::vector<Real> values;
-            basis.evaluate_values(xi, values);
-            ASSERT_EQ(values.size(), apex_values.size());
-
-            for (std::size_t i = 0; i < values.size(); ++i) {
-                EXPECT_NEAR(values[i], apex_values[i], c.tol)
-                    << "order " << c.order
-                    << ", basis " << i
-                    << ", direction (" << direction[0] << ", " << direction[1] << ")";
-            }
-        }
+        expect_kronecker_at_nodes(basis, Real(2e-10));
+        expect_partition_gradient_hessian_sums(basis, c.points, c.derivative_tol);
     }
 }
 
-TEST(LagrangeBasis, PyramidNearApexGradientShowsDirectionalSpread) {
-    const struct Case {
-        ElementType type;
-        int order;
-        Real min_spread;
-    } cases[] = {
-        {ElementType::Pyramid5, 1, Real(5e-2)},
-        {ElementType::Pyramid14, 2, Real(5e-2)},
-    };
-
-    const std::array<std::array<Real, 2>, 4> directions = {{
-        {Real(0), Real(0)},
-        {Real(0.45), Real(-0.30)},
-        {Real(-0.35), Real(0.40)},
-        {Real(0.25), Real(0.55)},
-    }};
-    const Real t = Real(1e-6);
-
-    for (const auto& c : cases) {
+TEST(LagrangeBasis, RawOutputSinksMatchVectorEvaluationAcrossTopologies) {
+    for (const auto& c : canonical_cases()) {
         LagrangeBasis basis(c.type, c.order);
-        double max_spread = 0.0;
-
-        std::vector<std::vector<Gradient>> directional_gradients;
-        directional_gradients.reserve(directions.size());
-        for (const auto& direction : directions) {
-            const svmp::FE::math::Vector<Real, 3> xi{
-                t * direction[0],
-                t * direction[1],
-                Real(1) - t
-            };
-
-            std::vector<Gradient> gradients;
-            basis.evaluate_gradients(xi, gradients);
-            directional_gradients.push_back(std::move(gradients));
-        }
-
-        for (std::size_t i = 0; i < basis.size(); ++i) {
-            for (int d = 0; d < 3; ++d) {
-                double min_value = std::numeric_limits<double>::infinity();
-                double max_value = -std::numeric_limits<double>::infinity();
-                for (const auto& gradients : directional_gradients) {
-                    const double value = static_cast<double>(gradients[i][static_cast<std::size_t>(d)]);
-                    min_value = std::min(min_value, value);
-                    max_value = std::max(max_value, value);
-                }
-                max_spread = std::max(max_spread, max_value - min_value);
-            }
-        }
-
-        EXPECT_GT(max_spread, static_cast<double>(c.min_spread))
-            << "order " << c.order;
+        expect_raw_sinks_match_vector_evaluation(basis, c.points.front());
     }
 }
 
-TEST(LagrangeBasis, GradientSumZeroQuadAndTet) {
-    const std::vector<std::pair<ElementType, svmp::FE::math::Vector<Real, 3>>> cases = {
-        {ElementType::Quad4, svmp::FE::math::Vector<Real, 3>{Real(0.2), Real(-0.1), Real(0)}},
-        {ElementType::Tetra4, svmp::FE::math::Vector<Real, 3>{Real(0.1), Real(0.2), Real(0.1)}}
+TEST(LagrangeBasis, CompleteAliasesNormalizeToCanonicalBases) {
+    const std::vector<std::tuple<ElementType, ElementType, int>> aliases = {
+        {ElementType::Line3, ElementType::Line2, 2},
+        {ElementType::Triangle6, ElementType::Triangle3, 2},
+        {ElementType::Quad9, ElementType::Quad4, 2},
+        {ElementType::Tetra10, ElementType::Tetra4, 2},
+        {ElementType::Hex27, ElementType::Hex8, 2},
+        {ElementType::Wedge18, ElementType::Wedge6, 2},
     };
 
-    for (const auto& c : cases) {
-        LagrangeBasis basis(c.first, 1);
-        std::vector<Gradient> grads;
-        basis.evaluate_gradients(c.second, grads);
-
-        ASSERT_EQ(grads.size(), basis.size());
-        Gradient sum{};
-        for (const auto& g : grads) {
-            sum[0] += g[0];
-            sum[1] += g[1];
-            sum[2] += g[2];
-        }
-        EXPECT_NEAR(sum[0], 0.0, 1e-12);
-        EXPECT_NEAR(sum[1], 0.0, 1e-12);
-        EXPECT_NEAR(sum[2], 0.0, 1e-12);
-    }
-}
-
-TEST(LagrangeBasis, HexPartitionAndGradientSumZeroOrderThree) {
-    LagrangeBasis basis(ElementType::Hex8, 3);
-    svmp::FE::math::Vector<Real, 3> xi{Real(0.1), Real(-0.2), Real(0.25)};
-
-    std::vector<Real> values;
-    basis.evaluate_values(xi, values);
-    const double sum = std::accumulate(values.begin(), values.end(), 0.0);
-    EXPECT_NEAR(sum, 1.0, 1e-12);
-
-    std::vector<Gradient> grads;
-    basis.evaluate_gradients(xi, grads);
-    Gradient gsum{};
-    for (const auto& g : grads) {
-        gsum[0] += g[0];
-        gsum[1] += g[1];
-        gsum[2] += g[2];
-    }
-    EXPECT_NEAR(gsum[0], 0.0, 1e-10);
-    EXPECT_NEAR(gsum[1], 0.0, 1e-10);
-    EXPECT_NEAR(gsum[2], 0.0, 1e-10);
-}
-
-TEST(LagrangeBasis, OracleLine3ValuesGradientsAndHessians) {
-    LagrangeBasis basis(ElementType::Line3, 2);
-    const Point xi{Real(0.2), Real(0), Real(0)};
-
-    std::vector<Real> values;
-    std::vector<Gradient> gradients;
-    std::vector<Hessian> hessians;
-    basis.evaluate_values(xi, values);
-    basis.evaluate_gradients(xi, gradients);
-    basis.evaluate_hessians(xi, hessians);
-
-    ASSERT_EQ(values.size(), 3u);
-    ASSERT_EQ(gradients.size(), 3u);
-    ASSERT_EQ(hessians.size(), 3u);
-
-    const Real expected_values[] = {Real(-2) / Real(25), Real(3) / Real(25), Real(24) / Real(25)};
-    const Real expected_gradients[] = {Real(-3) / Real(10), Real(7) / Real(10), Real(-2) / Real(5)};
-    const Real expected_hessians[] = {Real(1), Real(1), Real(-2)};
+    for (const auto& [alias, canonical, order] : aliases) {
+        LagrangeBasis alias_basis(alias, 1);
+        LagrangeBasis canonical_basis(canonical, order);
+        const auto generated = ReferenceNodeLayout::get_lagrange_node_coords(canonical, order);
 
-    for (std::size_t i = 0; i < 3; ++i) {
-        EXPECT_NEAR(values[i], expected_values[i], 1e-14);
-        EXPECT_NEAR(gradients[i][0], expected_gradients[i], 1e-14);
-        EXPECT_NEAR(hessians[i](0, 0), expected_hessians[i], 1e-14);
+        EXPECT_EQ(alias_basis.element_type(), canonical);
+        EXPECT_EQ(alias_basis.order(), order);
+        expect_nodes_close(alias_basis.nodes(), generated, Real(1e-14));
+        expect_nodes_close(alias_basis.nodes(), canonical_basis.nodes(), Real(1e-14));
+        expect_evaluations_match(alias_basis,
+                                 canonical_basis,
+                                 sample_points_for(canonical),
+                                 Real(1e-12));
     }
 }
 
-TEST(LagrangeBasis, OracleTriangle3ValuesGradientsAndHessians) {
-    LagrangeBasis basis(ElementType::Triangle3, 1);
-    const Point xi{Real(0.2), Real(0.3), Real(0)};
-
-    std::vector<Real> values;
-    std::vector<Gradient> gradients;
-    std::vector<Hessian> hessians;
-    basis.evaluate_values(xi, values);
-    basis.evaluate_gradients(xi, gradients);
-    basis.evaluate_hessians(xi, hessians);
-
-    ASSERT_EQ(values.size(), 3u);
-    const Point expected_gradients[] = {
-        Point{Real(-1), Real(-1), Real(0)},
-        Point{Real(1), Real(0), Real(0)},
-        Point{Real(0), Real(1), Real(0)}
+TEST(LagrangeBasis, NodeOrderingMatchesPublicAliasLayouts) {
+    const std::vector<std::tuple<ElementType, ElementType, int>> aliases = {
+        {ElementType::Line2, ElementType::Line2, 1},
+        {ElementType::Line3, ElementType::Line2, 2},
+        {ElementType::Triangle3, ElementType::Triangle3, 1},
+        {ElementType::Triangle6, ElementType::Triangle3, 2},
+        {ElementType::Quad4, ElementType::Quad4, 1},
+        {ElementType::Quad9, ElementType::Quad4, 2},
+        {ElementType::Tetra4, ElementType::Tetra4, 1},
+        {ElementType::Tetra10, ElementType::Tetra4, 2},
+        {ElementType::Hex8, ElementType::Hex8, 1},
+        {ElementType::Hex27, ElementType::Hex8, 2},
+        {ElementType::Wedge6, ElementType::Wedge6, 1},
+        {ElementType::Wedge18, ElementType::Wedge6, 2},
     };
-    const Real expected_values[] = {Real(0.5), Real(0.2), Real(0.3)};
-
-    for (std::size_t i = 0; i < 3; ++i) {
-        EXPECT_NEAR(values[i], expected_values[i], 1e-14);
-        EXPECT_NEAR(gradients[i][0], expected_gradients[i][0], 1e-14);
-        EXPECT_NEAR(gradients[i][1], expected_gradients[i][1], 1e-14);
-        for (int a = 0; a < 2; ++a) {
-            for (int b = 0; b < 2; ++b) {
-                EXPECT_NEAR(hessians[i](static_cast<std::size_t>(a), static_cast<std::size_t>(b)),
-                            Real(0), 1e-14);
-            }
-        }
-    }
-}
-
-TEST(LagrangeBasis, OracleQuad4ValuesGradientsAndHessians) {
-    LagrangeBasis basis(ElementType::Quad4, 1);
-    const Point xi{Real(0.2), Real(-0.4), Real(0)};
-
-    std::vector<Real> values;
-    std::vector<Gradient> gradients;
-    std::vector<Hessian> hessians;
-    basis.evaluate_values(xi, values);
-    basis.evaluate_gradients(xi, gradients);
-    basis.evaluate_hessians(xi, hessians);
 
-    ASSERT_EQ(values.size(), 4u);
-    const Real expected_values[] = {Real(7) / Real(25), Real(21) / Real(50),
-                                    Real(9) / Real(50), Real(3) / Real(25)};
-    const Point expected_gradients[] = {
-        Point{Real(-7) / Real(20), Real(-1) / Real(5), Real(0)},
-        Point{Real(7) / Real(20), Real(-3) / Real(10), Real(0)},
-        Point{Real(3) / Real(20), Real(3) / Real(10), Real(0)},
-        Point{Real(-3) / Real(20), Real(1) / Real(5), Real(0)}
-    };
-    const Real expected_hxy[] = {Real(1) / Real(4), Real(-1) / Real(4),
-                                 Real(1) / Real(4), Real(-1) / Real(4)};
+    for (const auto& [alias, canonical, order] : aliases) {
+        const auto generated = ReferenceNodeLayout::get_lagrange_node_coords(canonical, order);
+        ASSERT_EQ(generated.size(), ReferenceNodeLayout::num_nodes(alias));
 
-    for (std::size_t i = 0; i < 4; ++i) {
-        EXPECT_NEAR(values[i], expected_values[i], 1e-14);
-        EXPECT_NEAR(gradients[i][0], expected_gradients[i][0], 1e-14);
-        EXPECT_NEAR(gradients[i][1], expected_gradients[i][1], 1e-14);
-        EXPECT_NEAR(hessians[i](0, 0), Real(0), 1e-14);
-        EXPECT_NEAR(hessians[i](1, 1), Real(0), 1e-14);
-        EXPECT_NEAR(hessians[i](0, 1), expected_hxy[i], 1e-14);
-        EXPECT_NEAR(hessians[i](1, 0), expected_hxy[i], 1e-14);
+        for (std::size_t i = 0; i < generated.size(); ++i) {
+            const auto public_node = ReferenceNodeLayout::get_node_coords(alias, i);
+            EXPECT_NEAR(public_node[0], generated[i][0], Real(1e-14)) << "node=" << i;
+            EXPECT_NEAR(public_node[1], generated[i][1], Real(1e-14)) << "node=" << i;
+            EXPECT_NEAR(public_node[2], generated[i][2], Real(1e-14)) << "node=" << i;
+        }
     }
 }
 
-TEST(LagrangeBasis, OracleWedge6ValuesGradientsAndHessians) {
-    LagrangeBasis basis(ElementType::Wedge6, 1);
-    const Point xi{Real(0.2), Real(0.25), Real(-0.3)};
-
-    std::vector<Real> values;
-    std::vector<Gradient> gradients;
-    std::vector<Hessian> hessians;
-    basis.evaluate_values(xi, values);
-    basis.evaluate_gradients(xi, gradients);
-    basis.evaluate_hessians(xi, hessians);
-
-    ASSERT_EQ(values.size(), 6u);
-    const Real expected_values[] = {
-        Real(143) / Real(400), Real(13) / Real(100), Real(13) / Real(80),
-        Real(77) / Real(400), Real(7) / Real(100), Real(7) / Real(80)
-    };
-    const Point expected_gradients[] = {
-        Point{Real(-13) / Real(20), Real(-13) / Real(20), Real(-11) / Real(40)},
-        Point{Real(13) / Real(20), Real(0), Real(-1) / Real(10)},
-        Point{Real(0), Real(13) / Real(20), Real(-1) / Real(8)},
-        Point{Real(-7) / Real(20), Real(-7) / Real(20), Real(11) / Real(40)},
-        Point{Real(7) / Real(20), Real(0), Real(1) / Real(10)},
-        Point{Real(0), Real(7) / Real(20), Real(1) / Real(8)}
-    };
-    const Point expected_hxz[] = {
-        Point{Real(1) / Real(2), Real(1) / Real(2), Real(0)},
-        Point{Real(-1) / Real(2), Real(0), Real(0)},
-        Point{Real(0), Real(-1) / Real(2), Real(0)},
-        Point{Real(-1) / Real(2), Real(-1) / Real(2), Real(0)},
-        Point{Real(1) / Real(2), Real(0), Real(0)},
-        Point{Real(0), Real(1) / Real(2), Real(0)}
+TEST(LagrangeBasis, RemovedOrSerendipityFamiliesAreRejected) {
+    const std::array<ElementType, 6> unsupported = {
+        ElementType::Quad8,
+        ElementType::Hex20,
+        ElementType::Wedge15,
+        ElementType::Pyramid5,
+        ElementType::Pyramid13,
+        ElementType::Pyramid14,
     };
 
-    for (std::size_t i = 0; i < 6; ++i) {
-        EXPECT_NEAR(values[i], expected_values[i], 1e-14);
-        EXPECT_NEAR(gradients[i][0], expected_gradients[i][0], 1e-14);
-        EXPECT_NEAR(gradients[i][1], expected_gradients[i][1], 1e-14);
-        EXPECT_NEAR(gradients[i][2], expected_gradients[i][2], 1e-14);
-        EXPECT_NEAR(hessians[i](0, 0), Real(0), 1e-14);
-        EXPECT_NEAR(hessians[i](1, 1), Real(0), 1e-14);
-        EXPECT_NEAR(hessians[i](2, 2), Real(0), 1e-14);
-        EXPECT_NEAR(hessians[i](0, 1), Real(0), 1e-14);
-        EXPECT_NEAR(hessians[i](1, 0), Real(0), 1e-14);
-        EXPECT_NEAR(hessians[i](0, 2), expected_hxz[i][0], 1e-14);
-        EXPECT_NEAR(hessians[i](2, 0), expected_hxz[i][0], 1e-14);
-        EXPECT_NEAR(hessians[i](1, 2), expected_hxz[i][1], 1e-14);
-        EXPECT_NEAR(hessians[i](2, 1), expected_hxz[i][1], 1e-14);
+    for (const auto type : unsupported) {
+        EXPECT_THROW((void)LagrangeBasis(type, 2), BasisElementCompatibilityException)
+            << "element=" << static_cast<int>(type);
     }
 }
 
-TEST(LagrangeBasis, DeterministicBoundarySweepMaintainsPartitionAndFiniteDerivatives) {
-    const std::vector<std::pair<ElementType, int>> cases = {
-        {ElementType::Line2, 1},
-        {ElementType::Line3, 2},
-        {ElementType::Triangle3, 1},
-        {ElementType::Triangle6, 2},
-        {ElementType::Quad4, 1},
-        {ElementType::Quad9, 2},
-        {ElementType::Tetra4, 1},
-        {ElementType::Tetra10, 2},
-        {ElementType::Hex8, 1},
-        {ElementType::Hex27, 2},
-        {ElementType::Wedge6, 1},
-        {ElementType::Wedge18, 2},
-        {ElementType::Pyramid5, 1},
-        {ElementType::Pyramid14, 2},
+TEST(LagrangeBasis, LinearPolynomialReproductionAcrossLinearTopologies) {
+    const std::vector<std::pair<ElementType, Point>> cases = {
+        {ElementType::Line2, {Real(-0.2), Real(0), Real(0)}},
+        {ElementType::Triangle3, {Real(0.2), Real(0.3), Real(0)}},
+        {ElementType::Quad4, {Real(0.25), Real(-0.4), Real(0)}},
+        {ElementType::Tetra4, {Real(0.1), Real(0.2), Real(0.3)}},
+        {ElementType::Hex8, {Real(0.15), Real(-0.2), Real(0.25)}},
+        {ElementType::Wedge6, {Real(0.2), Real(0.15), Real(-0.3)}},
     };
+    const Gradient expected_gradient = linear_gradient();
 
-    for (const auto& [type, order] : cases) {
-        LagrangeBasis basis(type, order);
-        for (const auto& xi : boundary_stress_points_for(type)) {
-            std::vector<Real> values;
-            std::vector<Gradient> gradients;
-            std::vector<Hessian> hessians;
-            basis.evaluate_values(xi, values);
-            basis.evaluate_gradients(xi, gradients);
-            basis.evaluate_hessians(xi, hessians);
+    for (const auto& [type, point] : cases) {
+        LagrangeBasis basis(type, 1);
+        std::vector<Real> values;
+        std::vector<Gradient> gradients;
+        basis.evaluate_values(point, values);
+        basis.evaluate_gradients(point, gradients);
 
-            ASSERT_EQ(values.size(), basis.size());
-            ASSERT_EQ(gradients.size(), basis.size());
-            ASSERT_EQ(hessians.size(), basis.size());
+        const Real interpolated =
+            interpolate_value(basis, values, linear_function);
+        EXPECT_NEAR(interpolated, linear_function(point), Real(1e-12));
 
-            Real sum = Real(0);
-            for (Real value : values) {
-                EXPECT_TRUE(std::isfinite(value));
-                sum += value;
+        Gradient interpolated_gradient{};
+        for (std::size_t i = 0; i < gradients.size(); ++i) {
+            const Real nodal_value = linear_function(basis.nodes()[i]);
+            for (int d = 0; d < basis.dimension(); ++d) {
+                interpolated_gradient[static_cast<std::size_t>(d)] +=
+                    nodal_value * gradients[i][static_cast<std::size_t>(d)];
             }
-            expect_all_finite(gradients);
-            expect_hessians_finite(hessians, basis.dimension());
-            EXPECT_NEAR(sum, Real(1), type == ElementType::Pyramid5 || type == ElementType::Pyramid14
-                                       ? Real(1e-8)
-                                       : Real(1e-12))
-                << "type=" << static_cast<int>(type)
-                << ", order=" << order
-                << ", xi=(" << xi[0] << "," << xi[1] << "," << xi[2] << ")";
         }
-    }
-}
-
-TEST(LagrangeBasis, FiniteDifferenceGradientsAcrossSupportedLinearShapes) {
-    const std::vector<LagrangeAccuracyCase> cases = {
-        {ElementType::Line2, 1, sample_points_for(ElementType::Line2)},
-        {ElementType::Triangle3, 1, sample_points_for(ElementType::Triangle3)},
-        {ElementType::Quad4, 1, sample_points_for(ElementType::Quad4)},
-        {ElementType::Tetra4, 1, sample_points_for(ElementType::Tetra4)},
-        {ElementType::Hex8, 1, sample_points_for(ElementType::Hex8)},
-        {ElementType::Wedge6, 1, sample_points_for(ElementType::Wedge6)},
-        {ElementType::Pyramid5, 1, sample_points_for(ElementType::Pyramid5)},
-    };
-
-    for (const auto& c : cases) {
-        expect_gradients_match_finite_difference(c, Real(1e-6), Real(1e-6));
-    }
-}
-
-TEST(LagrangeBasis, FiniteDifferenceGradientsAcrossSupportedQuadraticShapes) {
-    const std::vector<LagrangeAccuracyCase> cases = {
-        {ElementType::Line3, 2, sample_points_for(ElementType::Line3)},
-        {ElementType::Triangle6, 2, sample_points_for(ElementType::Triangle6)},
-        {ElementType::Quad9, 2, sample_points_for(ElementType::Quad9)},
-        {ElementType::Tetra10, 2, sample_points_for(ElementType::Tetra10)},
-        {ElementType::Hex27, 2, sample_points_for(ElementType::Hex27)},
-        {ElementType::Wedge18, 2, sample_points_for(ElementType::Wedge18)},
-        {ElementType::Pyramid14, 2, sample_points_for(ElementType::Pyramid14)},
-    };
-
-    for (const auto& c : cases) {
-        expect_gradients_match_finite_difference(c, Real(1e-6), Real(2e-6));
-    }
-}
-
-TEST(LagrangeBasis, LinearPolynomialReproductionAcrossSupportedLinearShapes) {
-    const std::vector<LagrangeAccuracyCase> cases = {
-        {ElementType::Line2, 1, sample_points_for(ElementType::Line2)},
-        {ElementType::Triangle3, 1, sample_points_for(ElementType::Triangle3)},
-        {ElementType::Quad4, 1, sample_points_for(ElementType::Quad4)},
-        {ElementType::Tetra4, 1, sample_points_for(ElementType::Tetra4)},
-        {ElementType::Hex8, 1, sample_points_for(ElementType::Hex8)},
-        {ElementType::Wedge6, 1, sample_points_for(ElementType::Wedge6)},
-        {ElementType::Pyramid5, 1, sample_points_for(ElementType::Pyramid5)},
-    };
-
-    const std::vector<std::array<int, 3>> exponents = {
-        {0, 0, 0},
-        {1, 0, 0},
-        {0, 1, 0},
-        {0, 0, 1},
-    };
-
-    for (const auto& c : cases) {
-        const std::vector<std::array<int, 3>> relevant(
-            exponents.begin(),
-            exponents.begin() + static_cast<std::ptrdiff_t>(c.type == ElementType::Line2 ? 2 :
-                                                            (c.type == ElementType::Triangle3 ||
-                                                             c.type == ElementType::Quad4) ? 3 : 4));
-        expect_polynomial_reproduction(c, relevant, Real(1e-12));
-    }
-}
-
-TEST(LagrangeBasis, QuadraticPolynomialReproductionAcrossSupportedQuadraticShapes) {
-    const std::vector<LagrangeAccuracyCase> cases = {
-        {ElementType::Line3, 2, sample_points_for(ElementType::Line3)},
-        {ElementType::Triangle6, 2, sample_points_for(ElementType::Triangle6)},
-        {ElementType::Quad9, 2, sample_points_for(ElementType::Quad9)},
-        {ElementType::Tetra10, 2, sample_points_for(ElementType::Tetra10)},
-        {ElementType::Hex27, 2, sample_points_for(ElementType::Hex27)},
-        {ElementType::Wedge18, 2, sample_points_for(ElementType::Wedge18)},
-        {ElementType::Pyramid14, 2, sample_points_for(ElementType::Pyramid14)},
-    };
-
-    const std::vector<std::array<int, 3>> line_exponents = {
-        {0, 0, 0}, {1, 0, 0}, {2, 0, 0}
-    };
-    const std::vector<std::array<int, 3>> surface_exponents = {
-        {0, 0, 0}, {1, 0, 0}, {0, 1, 0},
-        {2, 0, 0}, {1, 1, 0}, {0, 2, 0}
-    };
-    const std::vector<std::array<int, 3>> volume_exponents = {
-        {0, 0, 0}, {1, 0, 0}, {0, 1, 0}, {0, 0, 1},
-        {2, 0, 0}, {1, 1, 0}, {0, 2, 0},
-        {1, 0, 1}, {0, 1, 1}, {0, 0, 2}
-    };
-
-    for (const auto& c : cases) {
-        if (c.type == ElementType::Line3) {
-            expect_polynomial_reproduction(c, line_exponents, Real(1e-12));
-        } else if (c.type == ElementType::Triangle6 || c.type == ElementType::Quad9) {
-            expect_polynomial_reproduction(c, surface_exponents, Real(1e-11));
-        } else {
-            expect_polynomial_reproduction(c, volume_exponents, Real(2e-10));
+        for (int d = 0; d < basis.dimension(); ++d) {
+            EXPECT_NEAR(interpolated_gradient[static_cast<std::size_t>(d)],
+                        expected_gradient[static_cast<std::size_t>(d)],
+                        Real(1e-12));
         }
     }
 }
 
-TEST(LagrangeBasis, HighOrderTensorLagrangeMaintainsPartitionAndDerivativeSums) {
-    const std::vector<LagrangeAccuracyCase> cases = {
-        {ElementType::Line2, 8, {Point{-0.875, 0, 0}, Point{0.125, 0, 0}, Point{1, 0, 0}}},
-        {ElementType::Quad4, 7, {Point{0.2, -0.35, 0}, Point{-1, 0.5, 0}, Point{0.5, 1, 0}}},
-        {ElementType::Hex8, 6, {Point{0.1, -0.2, 0.3}, Point{-1, 0.5, 1}, Point{0.75, -1, -0.5}}},
+TEST(LagrangeBasis, QuadraticPolynomialReproductionAcrossQuadraticAliases) {
+    const std::vector<std::pair<ElementType, Point>> cases = {
+        {ElementType::Line3, {Real(-0.2), Real(0), Real(0)}},
+        {ElementType::Triangle6, {Real(0.2), Real(0.3), Real(0)}},
+        {ElementType::Quad9, {Real(0.25), Real(-0.4), Real(0)}},
+        {ElementType::Tetra10, {Real(0.1), Real(0.2), Real(0.3)}},
+        {ElementType::Hex27, {Real(0.15), Real(-0.2), Real(0.25)}},
+        {ElementType::Wedge18, {Real(0.2), Real(0.15), Real(-0.3)}},
     };
 
-    for (const auto& c : cases) {
-        LagrangeBasis basis(c.type, c.order);
-        expect_partition_gradient_hessian_sums(basis, c.points, Real(2e-12), Real(2e-8));
-    }
-}
-
-TEST(LagrangeBasis, HighOrderTensorLagrangeReproducesTensorPolynomials) {
-    const LagrangeAccuracyCase line{ElementType::Line2,
-                                    8,
-                                    {Point{-0.73, 0, 0}, Point{-0.1, 0, 0}, Point{0.64, 0, 0}}};
-    expect_polynomial_reproduction(line,
-                                   {{0, 0, 0}, {1, 0, 0}, {4, 0, 0}, {8, 0, 0}},
-                                   Real(1e-11));
-
-    const LagrangeAccuracyCase quad{ElementType::Quad4,
-                                    7,
-                                    {Point{-0.6, -0.2, 0}, Point{0.15, 0.45, 0}, Point{0.8, -0.55, 0}}};
-    expect_polynomial_reproduction(quad,
-                                   {{0, 0, 0}, {7, 0, 0}, {0, 7, 0}, {4, 3, 0}},
-                                   Real(5e-10));
-
-    const LagrangeAccuracyCase hex{ElementType::Hex8,
-                                   6,
-                                   {Point{-0.4, 0.2, -0.3}, Point{0.35, -0.55, 0.25}, Point{0.75, 0.4, -0.65}}};
-    expect_polynomial_reproduction(hex,
-                                   {{0, 0, 0}, {6, 0, 0}, {0, 6, 0}, {0, 0, 6}, {3, 2, 4}},
-                                   Real(2e-9));
+    for (const auto& [type, point] : cases) {
+        LagrangeBasis basis(type, 1);
+        std::vector<Real> values;
+        basis.evaluate_values(point, values);
+
+        const Real interpolated =
+            interpolate_value(basis, values, quadratic_function);
+        EXPECT_NEAR(interpolated, quadratic_function(point), Real(5e-12))
+            << "element=" << static_cast<int>(type);
+    }
+}
+
+TEST(LagrangeBasis, FactoryCreatesReducedScalarBasisFamilies) {
+    auto lagrange =
+        basis_factory::create(BasisRequest{ElementType::Hex27, BasisType::Lagrange, 1});
+    ASSERT_NE(lagrange, nullptr);
+    EXPECT_EQ(lagrange->basis_type(), BasisType::Lagrange);
+    EXPECT_EQ(lagrange->element_type(), ElementType::Hex8);
+    EXPECT_EQ(lagrange->order(), 2);
+
+    auto serendipity =
+        basis_factory::create(BasisRequest{ElementType::Quad8, BasisType::Serendipity, 2});
+    ASSERT_NE(serendipity, nullptr);
+    EXPECT_EQ(serendipity->basis_type(), BasisType::Serendipity);
+
+    EXPECT_THROW((void)basis_factory::create(
+                     BasisRequest{ElementType::Pyramid5, BasisType::Lagrange, 1}),
+                 BasisElementCompatibilityException);
+    EXPECT_THROW((void)basis_factory::create(
+                     BasisRequest{ElementType::Pyramid13, BasisType::Serendipity, 2}),
+                 BasisElementCompatibilityException);
 }
diff --git a/tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp b/tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp
index 9f2bf8be5..30f876420 100644
--- a/tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp
+++ b/tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp
@@ -98,19 +98,9 @@ TEST(SerendipityBasis, Wedge15IsNodalAndPartitionsUnity) {
     expect_partition_of_unity(basis, {Real(0.2), Real(0.3), Real(0.1)});
 }
 
-TEST(SerendipityBasis, Pyramid13IsNodalAndPartitionsUnity) {
-    SerendipityBasis basis(ElementType::Pyramid13, 2);
-
-    EXPECT_EQ(basis.size(), 13u);
-    expect_nodal_delta(basis,
-                       reference_nodes(ElementType::Pyramid13, basis.size()),
-                       Real(1e-8));
-    expect_partition_of_unity(basis, {Real(0.1), Real(-0.2), Real(0.4)});
-}
-
 TEST(SerendipityBasis, RejectsUnsupportedSerendipityAliases) {
     EXPECT_THROW(SerendipityBasis(ElementType::Quad9, 2), FEException);
+    EXPECT_THROW(SerendipityBasis(ElementType::Pyramid13, 2), FEException);
     EXPECT_THROW(SerendipityBasis(ElementType::Pyramid14, 2), FEException);
     EXPECT_THROW(SerendipityBasis(ElementType::Quad8, 3), FEException);
 }
-
diff --git a/tests/unitTests/test_common.h b/tests/unitTests/test_common.h
index 98709f600..7227b2beb 100644
--- a/tests/unitTests/test_common.h
+++ b/tests/unitTests/test_common.h
@@ -33,7 +33,6 @@
 #include <stdlib.h>
 #include <iostream>
 #include <random>
-#include <chrono>
 #include "CepMod.h"
 #include "ComMod.h"
 #include "gtest/gtest.h"
@@ -96,4 +95,4 @@ class TestBase {
 };
 
 
-#endif
\ No newline at end of file
+#endif

From 3876ee1fb1c0cd3231a8a2fdf4ea79b10c1dac24 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Mon, 8 Jun 2026 13:11:04 -0700
Subject: [PATCH 07/91] removing prewarmed evaluations and switch to std
 library constants. removed associated unit tests for these changes

---
 Code/Source/solver/CMakeLists.txt             |   6 -
 Code/Source/solver/FE/Basis/BasisFunction.cpp |  10 -
 Code/Source/solver/FE/Basis/BasisFunction.h   |   2 -
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp |   5 -
 Code/Source/solver/FE/Basis/LagrangeBasis.h   |   2 -
 Code/Source/solver/FE/Math/MathConstants.h    | 388 ------------------
 Code/Source/solver/FE/Math/Matrix.h           |   1 -
 Code/Source/solver/FE/Math/Vector.h           |  19 +-
 Code/Source/solver/nn.cpp                     | 144 +++----
 .../FE/Basis/test_BasisErrorPaths.cpp         |   1 -
 .../unitTests/FE/Math/test_ExpressionOps.cpp  |   1 -
 .../unitTests/FE/Math/test_MathConstants.cpp  | 341 ---------------
 tests/unitTests/FE/Math/test_Matrix.cpp       |   1 -
 tests/unitTests/FE/Math/test_MatrixExpr.cpp   |   1 -
 tests/unitTests/FE/Math/test_Vector.cpp       |   1 -
 tests/unitTests/FE/Math/test_VectorExpr.cpp   |   1 -
 16 files changed, 77 insertions(+), 847 deletions(-)
 delete mode 100644 Code/Source/solver/FE/Math/MathConstants.h
 delete mode 100644 tests/unitTests/FE/Math/test_MathConstants.cpp

diff --git a/Code/Source/solver/CMakeLists.txt b/Code/Source/solver/CMakeLists.txt
index bdebc4a52..eace4d0b2 100644
--- a/Code/Source/solver/CMakeLists.txt
+++ b/Code/Source/solver/CMakeLists.txt
@@ -258,17 +258,11 @@ file(GLOB SOLVER_FE_MATH_SRCS CONFIGURE_DEPENDS
   FE/Math/*.h
 )
 
-file(GLOB SOLVER_FE_QUADRATURE_SRCS CONFIGURE_DEPENDS
-  FE/Quadrature/*.cpp
-  FE/Quadrature/*.h
-)
-
 list(APPEND CSRCS
   ${SOLVER_CORE_SRCS}
   ${SOLVER_FE_COMMON_SRCS}
   ${SOLVER_FE_BASIS_SRCS}
   ${SOLVER_FE_MATH_SRCS}
-  ${SOLVER_FE_QUADRATURE_SRCS}
 )
 
   # Set PETSc interace code.
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.cpp b/Code/Source/solver/FE/Basis/BasisFunction.cpp
index 2a1d4f6b0..578c46c88 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFunction.cpp
@@ -19,12 +19,6 @@ struct BasisFunctionScratch {
     std::vector<Real> values;
     std::vector<Gradient> gradients;
     std::vector<Hessian> hessians;
-
-    void prewarm(std::size_t max_size) {
-        values.reserve(max_size);
-        gradients.reserve(max_size);
-        hessians.reserve(max_size);
-    }
 };
 
 BasisFunctionScratch& scratch() {
@@ -34,10 +28,6 @@ BasisFunctionScratch& scratch() {
 
 } // namespace
 
-void prewarm_basis_function_scratch(std::size_t max_size) {
-    scratch().prewarm(max_size);
-}
-
 void BasisFunction::evaluate_gradients(const math::Vector<Real, 3>& xi,
                                        std::vector<Gradient>& gradients) const {
     (void)xi;
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index dbabf7061..bf6ac5de7 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -23,8 +23,6 @@ namespace basis {
 using Gradient = math::Vector<Real, 3>;
 using Hessian  = math::Matrix<Real, 3, 3>;
 
-void prewarm_basis_function_scratch(std::size_t max_size);
-
 [[nodiscard]] inline Hessian make_symmetric_hessian(Real xx,
                                                     Real yy,
                                                     Real zz,
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index 7516d514a..372209722 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -303,11 +303,6 @@ void store_gradient(const Gradient& gradient, Real* dst) {
 
 } // namespace
 
-void prewarm_lagrange_basis_scratch(int max_order, std::size_t max_qpts) {
-    const auto n = static_cast<std::size_t>(std::max(0, max_order) + 1);
-    prewarm_basis_function_scratch(std::max(n * n * n, max_qpts));
-}
-
 LagrangeBasis::LagrangeBasis(ElementType type, int order)
     : element_type_(type), order_(order) {
     const auto normalized = normalize_lagrange_request(element_type_, order_);
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.h b/Code/Source/solver/FE/Basis/LagrangeBasis.h
index a5fe8e0fa..dae149872 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.h
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.h
@@ -18,8 +18,6 @@ namespace svmp {
 namespace FE {
 namespace basis {
 
-void prewarm_lagrange_basis_scratch(int max_order, std::size_t max_qpts = 0);
-
 class LagrangeBasis : public BasisFunction {
 public:
     using TensorNodeIndex = std::array<std::size_t, 3>;
diff --git a/Code/Source/solver/FE/Math/MathConstants.h b/Code/Source/solver/FE/Math/MathConstants.h
deleted file mode 100644
index 145520ab2..000000000
--- a/Code/Source/solver/FE/Math/MathConstants.h
+++ /dev/null
@@ -1,388 +0,0 @@
-#ifndef SVMP_FE_MATH_CONSTANTS_H
-#define SVMP_FE_MATH_CONSTANTS_H
-
-/**
- * @file MathConstants.h
- * @brief Mathematical constants and numerical tolerances for FE computations
- *
- * This header provides mathematical constants (π, e, √2, etc.) and numerical
- * tolerances used throughout the FE library. All constants are templated
- * to support different precision types.
- */
-
-#include <cmath>
-#include <limits>
-#include <type_traits>
-#include <algorithm>
-
-namespace svmp {
-namespace FE {
-namespace math {
-
-/**
- * @brief Mathematical constants templated by type
- * @tparam T The numeric type (float, double, long double)
- */
-template<typename T>
-struct Constants {
-    static_assert(std::is_floating_point_v<T>,
-                  "Constants only defined for floating-point types");
-
-    // Mathematical constants
-    static constexpr T pi           = T(3.14159265358979323846264338327950288419716939937510L);
-    static constexpr T two_pi       = T(6.28318530717958647692528676655900576839433879875021L);
-    static constexpr T half_pi      = T(1.57079632679489661923132169163975144209858469968755L);
-    static constexpr T quarter_pi   = T(0.78539816339744830961566084581987572104929234984378L);
-    static constexpr T inv_pi       = T(0.31830988618379067153776752674502872406891929148091L);
-    static constexpr T inv_two_pi   = T(0.15915494309189533576888376337251436203445964574046L);
-
-    static constexpr T e            = T(2.71828182845904523536028747135266249775724709369995L);
-    static constexpr T log2e        = T(1.44269504088896340735992468100189213742664595415299L);
-    static constexpr T log10e       = T(0.43429448190325182765112891891660508229439700580367L);
-    static constexpr T ln2          = T(0.69314718055994530941723212145817656807550013436026L);
-    static constexpr T ln10         = T(2.30258509299404568401799145468436420760110148862877L);
-
-    static constexpr T sqrt2        = T(1.41421356237309504880168872420969807856967187537694L);
-    static constexpr T sqrt3        = T(1.73205080756887729352744634150587236694280525381038L);
-    static constexpr T inv_sqrt2    = T(0.70710678118654752440084436210484903928483593768847L);
-    static constexpr T inv_sqrt3    = T(0.57735026918962576450914878050195745564760175127013L);
-
-    // Golden ratio
-    static constexpr T phi          = T(1.61803398874989484820458683436563811772030917980576L);
-
-    // Degrees to radians conversion
-    static constexpr T deg_to_rad   = pi / T(180);
-    static constexpr T rad_to_deg   = T(180) / pi;
-};
-
-/**
- * @brief Numerical tolerances and machine epsilon
- * @tparam T The numeric type
- */
-template<typename T>
-struct Tolerances {
-    static_assert(std::is_floating_point_v<T>,
-                  "Tolerances only defined for floating-point types");
-
-    // Machine epsilon
-    static constexpr T epsilon      = std::numeric_limits<T>::epsilon();
-
-    // Default tolerance (1000 * machine epsilon)
-    static constexpr T tolerance    = T(1000) * epsilon;
-
-    // Strict tolerance (10 * machine epsilon)
-    static constexpr T strict       = T(10) * epsilon;
-
-    // Loose tolerance (10000 * machine epsilon)
-    static constexpr T loose        = T(10000) * epsilon;
-
-    // Square root of epsilon (useful for finite differences)
-    static inline const T sqrt_epsilon = std::sqrt(epsilon);
-
-    // Cube root of epsilon (useful for numerical derivatives)
-    static inline const T cbrt_epsilon = std::cbrt(epsilon);
-
-    // Smallest positive normalized value
-    static constexpr T min_positive = std::numeric_limits<T>::min();
-
-    // Largest representable value
-    static constexpr T max_value    = std::numeric_limits<T>::max();
-
-    // Infinity
-    static constexpr T infinity     = std::numeric_limits<T>::infinity();
-
-    // Not-a-Number
-    static constexpr T nan          = std::numeric_limits<T>::quiet_NaN();
-};
-
-/**
- * @brief Convenient aliases for common types
- */
-template<typename T> inline constexpr T pi           = Constants<T>::pi;
-template<typename T> inline constexpr T two_pi       = Constants<T>::two_pi;
-template<typename T> inline constexpr T half_pi      = Constants<T>::half_pi;
-template<typename T> inline constexpr T quarter_pi   = Constants<T>::quarter_pi;
-template<typename T> inline constexpr T inv_pi       = Constants<T>::inv_pi;
-template<typename T> inline constexpr T inv_two_pi   = Constants<T>::inv_two_pi;
-
-template<typename T> inline constexpr T e            = Constants<T>::e;
-template<typename T> inline constexpr T log2e        = Constants<T>::log2e;
-template<typename T> inline constexpr T log10e       = Constants<T>::log10e;
-template<typename T> inline constexpr T ln2          = Constants<T>::ln2;
-template<typename T> inline constexpr T ln10         = Constants<T>::ln10;
-
-template<typename T> inline constexpr T sqrt2        = Constants<T>::sqrt2;
-template<typename T> inline constexpr T sqrt3        = Constants<T>::sqrt3;
-template<typename T> inline constexpr T inv_sqrt2    = Constants<T>::inv_sqrt2;
-template<typename T> inline constexpr T inv_sqrt3    = Constants<T>::inv_sqrt3;
-
-template<typename T> inline constexpr T phi          = Constants<T>::phi;
-
-template<typename T> inline constexpr T deg_to_rad   = Constants<T>::deg_to_rad;
-template<typename T> inline constexpr T rad_to_deg   = Constants<T>::rad_to_deg;
-
-template<typename T> inline constexpr T epsilon      = Tolerances<T>::epsilon;
-template<typename T> inline constexpr T tolerance    = Tolerances<T>::tolerance;
-template<typename T> inline constexpr T strict_tol   = Tolerances<T>::strict;
-template<typename T> inline constexpr T loose_tol    = Tolerances<T>::loose;
-template<typename T> inline const T sqrt_epsilon = Tolerances<T>::sqrt_epsilon;
-template<typename T> inline const T cbrt_epsilon = Tolerances<T>::cbrt_epsilon;
-template<typename T> inline constexpr T min_positive = Tolerances<T>::min_positive;
-template<typename T> inline constexpr T max_value    = Tolerances<T>::max_value;
-template<typename T> inline constexpr T infinity     = Tolerances<T>::infinity;
-
-/**
- * @brief Comparison functions with tolerance
- */
-
-/**
- * @brief Check if two values are approximately equal
- * @param a First value
- * @param b Second value
- * @param tol Tolerance (default: 1000 * epsilon)
- * @return true if |a - b| <= tol * max(|a|, |b|, 1)
- */
-template<typename T>
-inline constexpr bool approx_equal(T a, T b, T tol = tolerance<T>) {
-    static_assert(std::is_floating_point_v<T>,
-                  "approx_equal only defined for floating-point types");
-    const T scale = std::max({std::abs(a), std::abs(b), T(1)});
-    return std::abs(a - b) <= tol * scale;
-}
-
-/**
- * @brief Check if a value is approximately zero
- * @param a Value to check
- * @param tol Tolerance (default: 1000 * epsilon)
- * @return true if |a| <= tol
- */
-template<typename T>
-inline constexpr bool approx_zero(T a, T tol = tolerance<T>) {
-    static_assert(std::is_floating_point_v<T>,
-                  "approx_zero only defined for floating-point types");
-    return std::abs(a) <= tol;
-}
-
-/**
- * @brief Check if a value is positive (greater than tolerance)
- * @param a Value to check
- * @param tol Tolerance (default: 1000 * epsilon)
- * @return true if a > tol
- */
-template<typename T>
-inline constexpr bool is_positive(T a, T tol = tolerance<T>) {
-    static_assert(std::is_floating_point_v<T>,
-                  "is_positive only defined for floating-point types");
-    return a > tol;
-}
-
-/**
- * @brief Check if a value is negative (less than -tolerance)
- * @param a Value to check
- * @param tol Tolerance (default: 1000 * epsilon)
- * @return true if a < -tol
- */
-template<typename T>
-inline constexpr bool is_negative(T a, T tol = tolerance<T>) {
-    static_assert(std::is_floating_point_v<T>,
-                  "is_negative only defined for floating-point types");
-    return a < -tol;
-}
-
-/**
- * @brief Check if a value is finite (not infinite or NaN)
- * @param a Value to check
- * @return true if value is finite
- */
-template<typename T>
-inline constexpr bool is_finite(T a) {
-    static_assert(std::is_floating_point_v<T>,
-                  "is_finite only defined for floating-point types");
-    return std::isfinite(a);
-}
-
-/**
- * @brief Degrees to radians conversion
- * @param degrees Angle in degrees
- * @return Angle in radians
- */
-template<typename T>
-inline constexpr T to_radians(T degrees) {
-    static_assert(std::is_floating_point_v<T>,
-                  "to_radians only defined for floating-point types");
-    return degrees * deg_to_rad<T>;
-}
-
-/**
- * @brief Radians to degrees conversion
- * @param radians Angle in radians
- * @return Angle in degrees
- */
-template<typename T>
-inline constexpr T to_degrees(T radians) {
-    static_assert(std::is_floating_point_v<T>,
-                  "to_degrees only defined for floating-point types");
-    return radians * rad_to_deg<T>;
-}
-
-// =============================================================================
-// Constants namespace for compatibility with test expectations
-// =============================================================================
-namespace constants {
-
-// Mathematical constants (double precision defaults)
-inline constexpr double PI         = Constants<double>::pi;
-inline constexpr double PI_2       = Constants<double>::half_pi;
-inline constexpr double PI_4       = Constants<double>::quarter_pi;
-inline constexpr double TWO_PI     = Constants<double>::two_pi;
-inline constexpr double INV_PI     = Constants<double>::inv_pi;
-
-inline constexpr double E          = Constants<double>::e;
-inline constexpr double LN_2       = Constants<double>::ln2;
-inline constexpr double LN_10      = Constants<double>::ln10;
-inline constexpr double LOG10_E    = Constants<double>::log10e;
-inline constexpr double LOG2_E     = Constants<double>::log2e;
-
-inline constexpr double SQRT_2     = Constants<double>::sqrt2;
-inline constexpr double SQRT_3     = Constants<double>::sqrt3;
-inline constexpr double SQRT_5     = 2.2360679774997896964091736687312L;
-inline constexpr double INV_SQRT_2  = Constants<double>::inv_sqrt2;
-inline constexpr double INV_SQRT_3  = Constants<double>::inv_sqrt3;
-
-inline constexpr double PHI        = Constants<double>::phi;
-
-// Angle conversion functions
-template<typename T>
-inline constexpr T deg_to_rad(T degrees) {
-    return degrees * Constants<T>::deg_to_rad;
-}
-
-template<typename T>
-inline constexpr T rad_to_deg(T radians) {
-    return radians * Constants<T>::rad_to_deg;
-}
-
-// Templated tolerances
-template<typename T>
-inline constexpr T tolerance() {
-    return Tolerances<T>::tolerance;
-}
-
-template<typename T>
-inline constexpr T machine_epsilon() {
-    return Tolerances<T>::epsilon;
-}
-
-// Additional constants and utility functions for tests
-inline constexpr double DEFAULT_TOLERANCE = Tolerances<double>::tolerance;
-inline constexpr double DEFAULT_REL_TOLERANCE = 1e-12;
-inline constexpr double GEOMETRY_TOLERANCE = 1e-10;
-inline constexpr double SOLVER_TOLERANCE = Tolerances<double>::strict;
-inline constexpr double EPSILON = Tolerances<double>::epsilon;
-inline constexpr double INF_VALUE = Tolerances<double>::infinity;  // Renamed from INFINITY
-inline constexpr double NOT_A_NUMBER = Tolerances<double>::nan;  // Renamed from NAN
-inline constexpr double MAX_DOUBLE = Tolerances<double>::max_value;
-inline constexpr double MIN_DOUBLE = Tolerances<double>::min_positive;
-inline constexpr double LOWEST_DOUBLE = -Tolerances<double>::max_value;
-
-// Physical constants
-inline constexpr double SPEED_OF_LIGHT = 299792458.0;         // m/s
-inline constexpr double GRAVITATIONAL_CONSTANT = 6.67430e-11;  // m³/(kg·s²)
-inline constexpr double PLANCK_CONSTANT = 6.62607015e-34;      // J·s
-inline constexpr double AVOGADRO_NUMBER = 6.02214076e23;       // mol⁻¹
-inline constexpr double BOLTZMANN_CONSTANT = 1.380649e-23;     // J/K
-inline constexpr double STANDARD_GRAVITY = 9.80665;            // m/s²
-
-// Float and long double versions
-inline constexpr float PI_F = static_cast<float>(PI);
-inline constexpr float E_F = static_cast<float>(E);
-inline constexpr float SQRT_2_F = static_cast<float>(SQRT_2);
-inline constexpr float EPSILON_F = Tolerances<float>::epsilon;
-
-inline constexpr long double PI_L = static_cast<long double>(PI);
-inline constexpr long double E_L = static_cast<long double>(E);
-inline constexpr long double SQRT_2_L = static_cast<long double>(SQRT_2);
-inline constexpr long double EPSILON_L = Tolerances<long double>::epsilon;
-
-// Additional mathematical constants
-inline constexpr double SQRT_PI = 1.7724538509055160272981674833411L;
-
-// Utility functions
-template<typename T>
-inline constexpr int sign(T value) {
-    return (T(0) < value) - (value < T(0));
-}
-
-template<typename T>
-inline constexpr bool is_zero(T value, T tol = DEFAULT_TOLERANCE) {
-    return std::abs(value) <= tol;
-}
-
-template<typename T>
-inline bool near(T a, T b, T tol = DEFAULT_TOLERANCE) {
-    return std::abs(a - b) <= tol;
-}
-
-template<typename T>
-inline bool near_relative(T a, T b, T rel_tol = DEFAULT_REL_TOLERANCE) {
-    T scale = std::max(std::abs(a), std::abs(b));
-    return std::abs(a - b) <= rel_tol * scale;
-}
-
-template<typename T>
-inline constexpr T clamp(T value, T min_val, T max_val) {
-    return value < min_val ? min_val : (value > max_val ? max_val : value);
-}
-
-template<typename T>
-inline constexpr T lerp(T a, T b, T t) {
-    return a + t * (b - a);
-}
-
-template<typename T>
-inline T safe_divide(T numerator, T denominator, T default_val = T(0)) {
-    return is_zero(denominator) ? default_val : numerator / denominator;
-}
-
-template<typename T>
-inline bool isinf(T value) {
-    return std::isinf(value);
-}
-
-template<typename T>
-inline bool isnan(T value) {
-    return std::isnan(value);
-}
-
-} // namespace constants
-
-// Physical constants for FE analysis
-namespace physical_constants {
-
-// Material properties (SI units)
-inline constexpr double water_density = 1000.0;         // kg/m³
-inline constexpr double steel_density = 7850.0;         // kg/m³
-inline constexpr double aluminum_density = 2700.0;      // kg/m³
-
-inline constexpr double water_viscosity = 0.001;        // Pa·s at 20°C
-inline constexpr double air_viscosity = 1.81e-5;        // Pa·s at 20°C
-
-inline constexpr double steel_youngs_modulus = 200e9;   // Pa
-inline constexpr double aluminum_youngs_modulus = 70e9; // Pa
-
-inline constexpr double steel_poisson_ratio = 0.3;      // dimensionless
-inline constexpr double aluminum_poisson_ratio = 0.33;  // dimensionless
-
-// Physical constants
-inline constexpr double gravity = 9.80665;              // m/s²
-inline constexpr double gas_constant = 8.314462618;     // J/(mol·K)
-inline constexpr double boltzmann = 1.380649e-23;       // J/K
-inline constexpr double avogadro = 6.02214076e23;       // mol⁻¹
-
-} // namespace physical_constants
-
-} // namespace math
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_MATH_CONSTANTS_H
diff --git a/Code/Source/solver/FE/Math/Matrix.h b/Code/Source/solver/FE/Math/Matrix.h
index 6058ab943..8cb28e5d5 100644
--- a/Code/Source/solver/FE/Math/Matrix.h
+++ b/Code/Source/solver/FE/Math/Matrix.h
@@ -13,7 +13,6 @@
 
 #include "MatrixExpr.h"
 #include "Vector.h"
-#include "MathConstants.h"
 #include "../Common/Types.h"
 #include <algorithm>
 #include <array>
diff --git a/Code/Source/solver/FE/Math/Vector.h b/Code/Source/solver/FE/Math/Vector.h
index 76c7be152..777f9945b 100644
--- a/Code/Source/solver/FE/Math/Vector.h
+++ b/Code/Source/solver/FE/Math/Vector.h
@@ -11,12 +11,12 @@
  */
 
 #include "VectorExpr.h"
-#include "MathConstants.h"
 #include "../Common/Types.h"
 #include <algorithm>
 #include <array>
 #include <cmath>
 #include <initializer_list>
+#include <limits>
 #include <ostream>
 #include <stdexcept>
 #include <type_traits>
@@ -25,6 +25,23 @@ namespace svmp {
 namespace FE {
 namespace math {
 
+template<typename T>
+inline constexpr T tolerance =
+    std::is_floating_point_v<T> ? T(1000) * std::numeric_limits<T>::epsilon() : T(0);
+
+template<typename T>
+inline bool approx_zero(T value, T tol = tolerance<T>) {
+    using std::abs;
+    return abs(value) <= tol;
+}
+
+template<typename T>
+inline bool approx_equal(T a, T b, T tol = tolerance<T>) {
+    using std::abs;
+    const T scale = std::max({abs(a), abs(b), T(1)});
+    return abs(a - b) <= tol * scale;
+}
+
 /**
  * @brief Fixed-size vector for element-level computations
  * @tparam T Scalar type (float, double)
diff --git a/Code/Source/solver/nn.cpp b/Code/Source/solver/nn.cpp
index 51c126708..a9e0aebc3 100644
--- a/Code/Source/solver/nn.cpp
+++ b/Code/Source/solver/nn.cpp
@@ -35,6 +35,7 @@
 #include <math.h> 
 #include <memory>
 #include <optional>
+#include <span>
 #include <string>
 #include <vector>
 
@@ -152,36 +153,27 @@ std::string solver_element_name(consts::ElementType eType)
 
 std::optional<BasisSelection> to_basis_selection(consts::ElementType eType)
 {
-  switch (eType) {
-    case consts::ElementType::LIN1:
-      return BasisSelection{fe::ElementType::Line2, fe::BasisType::Lagrange, 1};
-    case consts::ElementType::LIN2:
-      return BasisSelection{fe::ElementType::Line3, fe::BasisType::Lagrange, 2};
-    case consts::ElementType::TRI3:
-      return BasisSelection{fe::ElementType::Triangle3, fe::BasisType::Lagrange, 1};
-    case consts::ElementType::TRI6:
-      return BasisSelection{fe::ElementType::Triangle6, fe::BasisType::Lagrange, 2};
-    case consts::ElementType::QUD4:
-      return BasisSelection{fe::ElementType::Quad4, fe::BasisType::Lagrange, 1};
-    case consts::ElementType::QUD8:
-      return BasisSelection{fe::ElementType::Quad8, fe::BasisType::Serendipity, 2};
-    case consts::ElementType::QUD9:
-      return BasisSelection{fe::ElementType::Quad9, fe::BasisType::Lagrange, 2};
-    case consts::ElementType::TET4:
-      return BasisSelection{fe::ElementType::Tetra4, fe::BasisType::Lagrange, 1};
-    case consts::ElementType::TET10:
-      return BasisSelection{fe::ElementType::Tetra10, fe::BasisType::Lagrange, 2};
-    case consts::ElementType::HEX8:
-      return BasisSelection{fe::ElementType::Hex8, fe::BasisType::Lagrange, 1};
-    case consts::ElementType::HEX20:
-      return BasisSelection{fe::ElementType::Hex20, fe::BasisType::Serendipity, 2};
-    case consts::ElementType::HEX27:
-      return BasisSelection{fe::ElementType::Hex27, fe::BasisType::Lagrange, 2};
-    case consts::ElementType::WDG:
-      return BasisSelection{fe::ElementType::Wedge6, fe::BasisType::Lagrange, 1};
-    default:
-      return std::nullopt;
-  }
+  static constexpr std::array supported{
+      BasisSelection{fe::ElementType::Line2,     fe::BasisType::Lagrange,    1},
+      BasisSelection{fe::ElementType::Line3,     fe::BasisType::Lagrange,    2},
+      BasisSelection{fe::ElementType::Triangle3, fe::BasisType::Lagrange,    1},
+      BasisSelection{fe::ElementType::Triangle6, fe::BasisType::Lagrange,    2},
+      BasisSelection{fe::ElementType::Quad4,     fe::BasisType::Lagrange,    1},
+      BasisSelection{fe::ElementType::Quad8,     fe::BasisType::Serendipity, 2},
+      BasisSelection{fe::ElementType::Quad9,     fe::BasisType::Lagrange,    2},
+      BasisSelection{fe::ElementType::Tetra4,    fe::BasisType::Lagrange,    1},
+      BasisSelection{fe::ElementType::Tetra10,   fe::BasisType::Lagrange,    2},
+      BasisSelection{fe::ElementType::Hex8,      fe::BasisType::Lagrange,    1},
+      BasisSelection{fe::ElementType::Hex20,     fe::BasisType::Serendipity, 2},
+      BasisSelection{fe::ElementType::Hex27,     fe::BasisType::Lagrange,    2},
+      BasisSelection{fe::ElementType::Wedge6,    fe::BasisType::Lagrange,    1},
+  };
+
+  const int index = static_cast<int>(eType) - static_cast<int>(consts::ElementType::LIN1);
+  if (index >= 0 && static_cast<std::size_t>(index) < supported.size()) {
+    return supported[static_cast<std::size_t>(index)];
+  }
+  return std::nullopt;
 }
 
 bool use_basis_adapter_for(consts::ElementType eType)
@@ -189,11 +181,6 @@ bool use_basis_adapter_for(consts::ElementType eType)
   return basis_mode_allows_fe_adapter() && to_basis_selection(eType).has_value();
 }
 
-bool supports_basis_hessian_adapter_for(consts::ElementType eType)
-{
-  return basis_mode_allows_fe_adapter() && to_basis_selection(eType).has_value();
-}
-
 bool supports_face_basis_adapter_for(consts::ElementType eType)
 {
   if (!basis_mode_allows_fe_adapter()) {
@@ -223,26 +210,36 @@ std::shared_ptr<febasis::BasisFunction> make_basis_for_solver_element(consts::El
         __FILE__, __LINE__, __func__);
   }
 
-  febasis::BasisRequest request;
-  request.element_type = selection->element;
-  request.basis_type = selection->basis;
-  request.order = selection->order;
-  return febasis::basis_factory::create(request);
+  return febasis::basis_factory::create(
+      {selection->element, selection->basis, selection->order});
 }
 
-template <std::size_t NumNodes>
-std::size_t mapped_basis_index(const std::array<std::size_t, NumNodes>& map,
-                               consts::ElementType eType,
-                               const int solver_node)
+std::span<const std::size_t> solver_to_basis_node_map(consts::ElementType eType)
 {
-  if (solver_node < 0 || static_cast<std::size_t>(solver_node) >= map.size()) {
-    throw febasis::BasisNodeOrderingException(
-        "Solver node " + std::to_string(solver_node) +
-            " is outside node map for " + solver_element_name(eType),
-        __FILE__, __LINE__, __func__);
-  }
+  static constexpr std::array<std::size_t, 3> tri3{1, 2, 0};
+  static constexpr std::array<std::size_t, 6> tri6{1, 2, 0, 4, 5, 3};
+  static constexpr std::array<std::size_t, 4> tet4{1, 2, 3, 0};
+  static constexpr std::array<std::size_t, 10> tet10{1, 2, 3, 0, 5, 9, 8, 4, 6, 7};
+  static constexpr std::array<std::size_t, 27> hex27{
+      0, 1, 2, 3, 4, 5, 6, 7,
+      8, 9, 10, 11, 12, 13, 14, 15,
+      16, 17, 18, 19, 25, 23, 22, 24, 20, 21, 26};
 
-  return map[static_cast<std::size_t>(solver_node)];
+  switch (eType) {
+    case consts::ElementType::TRI3:
+      return tri3;
+    case consts::ElementType::TRI6:
+    case consts::ElementType::WDG:
+      return tri6;
+    case consts::ElementType::TET4:
+      return tet4;
+    case consts::ElementType::TET10:
+      return tet10;
+    case consts::ElementType::HEX27:
+      return hex27;
+    default:
+      return {};
+  }
 }
 
 std::size_t basis_index_for_solver_node(consts::ElementType eType, const int solver_node)
@@ -255,40 +252,17 @@ std::size_t basis_index_for_solver_node(consts::ElementType eType, const int sol
   }
 
   const auto node = static_cast<std::size_t>(solver_node);
-
-  switch (eType) {
-    case consts::ElementType::TRI3: {
-      static constexpr std::array<std::size_t, 3> map{1, 2, 0};
-      return mapped_basis_index(map, eType, solver_node);
-    }
-    case consts::ElementType::TRI6: {
-      static constexpr std::array<std::size_t, 6> map{1, 2, 0, 4, 5, 3};
-      return mapped_basis_index(map, eType, solver_node);
-    }
-    case consts::ElementType::TET4: {
-      static constexpr std::array<std::size_t, 4> map{1, 2, 3, 0};
-      return mapped_basis_index(map, eType, solver_node);
-    }
-    case consts::ElementType::TET10: {
-      static constexpr std::array<std::size_t, 10> map{1, 2, 3, 0, 5, 9, 8, 4, 6, 7};
-      return mapped_basis_index(map, eType, solver_node);
-    }
-    case consts::ElementType::WDG: {
-      static constexpr std::array<std::size_t, 6> map{1, 2, 0, 4, 5, 3};
-      return mapped_basis_index(map, eType, solver_node);
-    }
-    case consts::ElementType::HEX27: {
-      static constexpr std::array<std::size_t, 27> map{
-        0, 1, 2, 3, 4, 5, 6, 7,
-        8, 9, 10, 11, 12, 13, 14, 15,
-        16, 17, 18, 19,
-        25, 23, 22, 24, 20, 21, 26
-      };
-      return mapped_basis_index(map, eType, solver_node);
-    }
-    default:
-      return node;
+  const auto map = solver_to_basis_node_map(eType);
+  if (map.empty()) {
+    return node;
+  }
+  if (node < map.size()) {
+    return map[node];
   }
+  throw febasis::BasisNodeOrderingException(
+      "Solver node " + std::to_string(solver_node) +
+          " is outside node map for " + solver_element_name(eType),
+      __FILE__, __LINE__, __func__);
 }
 
 fe::math::Vector<fe::Real, 3> make_basis_point(const febasis::BasisFunction& basis,
@@ -710,7 +684,7 @@ void get_gn_nxx(const int insd, const int ind2, consts::ElementType eType, const
     return;
   }
 
-  if (supports_basis_hessian_adapter_for(eType)) {
+  if (use_basis_adapter_for(eType)) {
     try {
       evaluate_basis_hessians(insd, ind2, eType, eNoN, gaus_pt, xi, Nxx);
       return;
diff --git a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
index 7838702b0..430390e54 100644
--- a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
@@ -156,7 +156,6 @@ TEST(BasisErrorPaths, BasisFunctionDefaultsThrowForMissingDerivatives) {
 TEST(BasisErrorPaths, BasisFunctionFallbackWritesRawLayouts) {
     CompleteFallbackBasis basis;
     const math::Vector<Real, 3> point{Real(0.25), Real(0.5), Real(-0.25)};
-    prewarm_basis_function_scratch(basis.size());
 
     std::vector<Real> flat_values(basis.size());
     std::vector<Real> flat_gradients(basis.size() * 3u);
diff --git a/tests/unitTests/FE/Math/test_ExpressionOps.cpp b/tests/unitTests/FE/Math/test_ExpressionOps.cpp
index 307b308a1..a368e345e 100644
--- a/tests/unitTests/FE/Math/test_ExpressionOps.cpp
+++ b/tests/unitTests/FE/Math/test_ExpressionOps.cpp
@@ -7,7 +7,6 @@
 #include "FE/Math/ExpressionOps.h"
 #include "FE/Math/Vector.h"
 #include "FE/Math/Matrix.h"
-#include "FE/Math/MathConstants.h"
 #include <limits>
 #include <cmath>
 #include <complex>
diff --git a/tests/unitTests/FE/Math/test_MathConstants.cpp b/tests/unitTests/FE/Math/test_MathConstants.cpp
deleted file mode 100644
index 5619690ed..000000000
--- a/tests/unitTests/FE/Math/test_MathConstants.cpp
+++ /dev/null
@@ -1,341 +0,0 @@
-/**
- * @file test_MathConstants.cpp
- * @brief Unit tests for MathConstants.h - mathematical constants and tolerances
- */
-
-#include <gtest/gtest.h>
-#include "FE/Math/MathConstants.h"
-#include <cmath>
-#include <limits>
-#include <type_traits>
-
-using namespace svmp::FE::math;
-
-// Test fixture for MathConstants tests
-class MathConstantsTest : public ::testing::Test {
-protected:
-    void SetUp() override {}
-    void TearDown() override {}
-};
-
-// =============================================================================
-// Mathematical Constants Tests
-// =============================================================================
-
-TEST_F(MathConstantsTest, PiConstants) {
-    // Test PI value
-    EXPECT_NEAR(constants::PI, 3.14159265358979323846, 1e-15);
-
-    // Test PI/2
-    EXPECT_NEAR(constants::PI_2, constants::PI / 2.0, 1e-15);
-
-    // Test PI/4
-    EXPECT_NEAR(constants::PI_4, constants::PI / 4.0, 1e-15);
-
-    // Test 2*PI
-    EXPECT_NEAR(constants::TWO_PI, 2.0 * constants::PI, 1e-15);
-
-    // Test 1/PI
-    EXPECT_NEAR(constants::INV_PI, 1.0 / constants::PI, 1e-15);
-
-    // Test sqrt(PI)
-    EXPECT_NEAR(constants::SQRT_PI, std::sqrt(constants::PI), 1e-15);
-}
-
-TEST_F(MathConstantsTest, EulerConstant) {
-    // Test e (Euler's number)
-    EXPECT_NEAR(constants::E, std::exp(1.0), 1e-15);
-
-    // Test ln(2)
-    EXPECT_NEAR(constants::LN_2, std::log(2.0), 1e-15);
-
-    // Test ln(10)
-    EXPECT_NEAR(constants::LN_10, std::log(10.0), 1e-15);
-
-    // Test log10(e)
-    EXPECT_NEAR(constants::LOG10_E, std::log10(constants::E), 1e-15);
-
-    // Test log2(e)
-    EXPECT_NEAR(constants::LOG2_E, std::log2(constants::E), 1e-15);
-}
-
-TEST_F(MathConstantsTest, SquareRootConstants) {
-    // Test sqrt(2)
-    EXPECT_NEAR(constants::SQRT_2, std::sqrt(2.0), 1e-15);
-
-    // Test sqrt(3)
-    EXPECT_NEAR(constants::SQRT_3, std::sqrt(3.0), 1e-15);
-
-    // Test sqrt(5)
-    EXPECT_NEAR(constants::SQRT_5, std::sqrt(5.0), 1e-15);
-
-    // Test 1/sqrt(2)
-    EXPECT_NEAR(constants::INV_SQRT_2, 1.0 / std::sqrt(2.0), 1e-15);
-
-    // Test 1/sqrt(3)
-    EXPECT_NEAR(constants::INV_SQRT_3, 1.0 / std::sqrt(3.0), 1e-15);
-}
-
-TEST_F(MathConstantsTest, GoldenRatio) {
-    // Test golden ratio φ = (1 + sqrt(5))/2
-    EXPECT_NEAR(constants::PHI, (1.0 + std::sqrt(5.0)) / 2.0, 1e-15);
-
-    // Property: φ² = φ + 1
-    EXPECT_NEAR(constants::PHI * constants::PHI, constants::PHI + 1.0, 1e-14);
-
-    // Property: 1/φ = φ - 1
-    EXPECT_NEAR(1.0 / constants::PHI, constants::PHI - 1.0, 1e-14);
-}
-
-// =============================================================================
-// Angle Conversion Tests
-// =============================================================================
-
-TEST_F(MathConstantsTest, DegreesToRadians) {
-    // Test common conversions
-    EXPECT_NEAR(constants::deg_to_rad(0.0), 0.0, 1e-15);
-    EXPECT_NEAR(constants::deg_to_rad(90.0), constants::PI_2, 1e-15);
-    EXPECT_NEAR(constants::deg_to_rad(180.0), constants::PI, 1e-15);
-    EXPECT_NEAR(constants::deg_to_rad(270.0), 3.0 * constants::PI_2, 1e-15);
-    EXPECT_NEAR(constants::deg_to_rad(360.0), constants::TWO_PI, 1e-15);
-
-    // Test negative angles
-    EXPECT_NEAR(constants::deg_to_rad(-90.0), -constants::PI_2, 1e-15);
-    EXPECT_NEAR(constants::deg_to_rad(-180.0), -constants::PI, 1e-15);
-
-    // Test arbitrary angle
-    EXPECT_NEAR(constants::deg_to_rad(45.0), constants::PI_4, 1e-15);
-    EXPECT_NEAR(constants::deg_to_rad(30.0), constants::PI / 6.0, 1e-15);
-    EXPECT_NEAR(constants::deg_to_rad(60.0), constants::PI / 3.0, 1e-15);
-}
-
-TEST_F(MathConstantsTest, RadiansToDegrees) {
-    // Test common conversions
-    EXPECT_NEAR(constants::rad_to_deg(0.0), 0.0, 1e-13);
-    EXPECT_NEAR(constants::rad_to_deg(constants::PI_2), 90.0, 1e-13);
-    EXPECT_NEAR(constants::rad_to_deg(constants::PI), 180.0, 1e-13);
-    EXPECT_NEAR(constants::rad_to_deg(constants::TWO_PI), 360.0, 1e-13);
-
-    // Test negative angles
-    EXPECT_NEAR(constants::rad_to_deg(-constants::PI), -180.0, 1e-13);
-
-    // Test round-trip conversion
-    double angle_deg = 123.456;
-    double angle_rad = constants::deg_to_rad(angle_deg);
-    double back_to_deg = constants::rad_to_deg(angle_rad);
-    EXPECT_NEAR(back_to_deg, angle_deg, 1e-13);
-}
-
-// =============================================================================
-// Machine Precision Tests
-// =============================================================================
-
-TEST_F(MathConstantsTest, MachineEpsilon) {
-    // Test double precision epsilon
-    EXPECT_EQ(constants::EPSILON, std::numeric_limits<double>::epsilon());
-
-    // Test float precision epsilon
-    EXPECT_EQ(constants::EPSILON_F, std::numeric_limits<float>::epsilon());
-
-    // Verify epsilon is the smallest value such that 1.0 + epsilon != 1.0
-    double one_plus_eps = 1.0 + constants::EPSILON;
-    double one_plus_half_eps = 1.0 + constants::EPSILON / 2.0;
-
-    EXPECT_NE(one_plus_eps, 1.0);
-    EXPECT_EQ(one_plus_half_eps, 1.0);
-}
-
-TEST_F(MathConstantsTest, NumericalLimits) {
-    // Test infinity
-    EXPECT_TRUE(std::isinf(constants::INF_VALUE));
-    EXPECT_GT(constants::INF_VALUE, std::numeric_limits<double>::max());
-
-    // Test NaN
-    EXPECT_TRUE(std::isnan(constants::NOT_A_NUMBER));
-    EXPECT_NE(constants::NOT_A_NUMBER, constants::NOT_A_NUMBER);  // NaN != NaN
-
-    // Test max/min values
-    EXPECT_EQ(constants::MAX_DOUBLE, std::numeric_limits<double>::max());
-    EXPECT_EQ(constants::MIN_DOUBLE, std::numeric_limits<double>::min());
-    EXPECT_EQ(constants::LOWEST_DOUBLE, std::numeric_limits<double>::lowest());
-}
-
-// =============================================================================
-// Tolerance Tests
-// =============================================================================
-
-TEST_F(MathConstantsTest, DefaultTolerances) {
-    // Test default absolute tolerance
-    EXPECT_GT(constants::DEFAULT_TOLERANCE, 0.0);
-    EXPECT_LT(constants::DEFAULT_TOLERANCE, 1e-10);
-
-    // Test default relative tolerance
-    EXPECT_GT(constants::DEFAULT_REL_TOLERANCE, 0.0);
-    EXPECT_LT(constants::DEFAULT_REL_TOLERANCE, 1e-10);
-
-    // Test solver tolerance
-    EXPECT_GT(constants::SOLVER_TOLERANCE, 0.0);
-    EXPECT_LE(constants::SOLVER_TOLERANCE, constants::DEFAULT_TOLERANCE);
-
-    // Test geometry tolerance (typically larger)
-    EXPECT_GT(constants::GEOMETRY_TOLERANCE, 0.0);
-    EXPECT_GE(constants::GEOMETRY_TOLERANCE, constants::DEFAULT_TOLERANCE);
-}
-
-TEST_F(MathConstantsTest, ToleranceComparison) {
-    double a = 1.0;
-    double b = 1.0 + constants::DEFAULT_TOLERANCE / 2.0;
-    double c = 1.0 + constants::DEFAULT_TOLERANCE * 2.0;
-
-    // Values within tolerance should be considered equal
-    EXPECT_TRUE(constants::near(a, b, constants::DEFAULT_TOLERANCE));
-
-    // Values outside tolerance should not be equal
-    EXPECT_FALSE(constants::near(a, c, constants::DEFAULT_TOLERANCE));
-
-    // Test relative tolerance
-    double large_a = 1e10;
-    double large_b = large_a * (1.0 + constants::DEFAULT_REL_TOLERANCE / 2.0);
-    double large_c = large_a * (1.0 + constants::DEFAULT_REL_TOLERANCE * 2.0);
-
-    EXPECT_TRUE(constants::near_relative(large_a, large_b, constants::DEFAULT_REL_TOLERANCE));
-    EXPECT_FALSE(constants::near_relative(large_a, large_c, constants::DEFAULT_REL_TOLERANCE));
-}
-
-TEST_F(MathConstantsTest, ZeroComparison) {
-    // Test near zero detection
-    EXPECT_TRUE(constants::is_zero(0.0));
-    EXPECT_TRUE(constants::is_zero(constants::DEFAULT_TOLERANCE / 2.0));
-    EXPECT_FALSE(constants::is_zero(constants::DEFAULT_TOLERANCE * 2.0));
-
-    // Test with negative values
-    EXPECT_TRUE(constants::is_zero(-constants::DEFAULT_TOLERANCE / 2.0));
-    EXPECT_FALSE(constants::is_zero(-constants::DEFAULT_TOLERANCE * 2.0));
-}
-
-// =============================================================================
-// Physical Constants Tests
-// =============================================================================
-
-TEST_F(MathConstantsTest, PhysicalConstants) {
-    // Test speed of light (m/s)
-    EXPECT_NEAR(constants::SPEED_OF_LIGHT, 299792458.0, 1.0);
-
-    // Test gravitational constant (m³/kg/s²)
-    EXPECT_NEAR(constants::GRAVITATIONAL_CONSTANT, 6.67430e-11, 1e-16);
-
-    // Test standard gravity (m/s²)
-    EXPECT_NEAR(constants::STANDARD_GRAVITY, 9.80665, 1e-10);
-
-    // Test Planck constant (J⋅s)
-    EXPECT_NEAR(constants::PLANCK_CONSTANT, 6.62607015e-34, 1e-42);
-
-    // Test Boltzmann constant (J/K)
-    EXPECT_NEAR(constants::BOLTZMANN_CONSTANT, 1.380649e-23, 1e-29);
-
-    // Test Avogadro's number (1/mol)
-    EXPECT_NEAR(constants::AVOGADRO_NUMBER, 6.02214076e23, 1e15);
-}
-
-// =============================================================================
-// Compile-Time Constants Tests
-// =============================================================================
-
-TEST_F(MathConstantsTest, CompileTimeConstants) {
-    // Test that constants are constexpr (compile-time)
-    constexpr double pi = constants::PI;
-    constexpr double e = constants::E;
-    constexpr double sqrt2 = constants::SQRT_2;
-
-    EXPECT_EQ(pi, constants::PI);
-    EXPECT_EQ(e, constants::E);
-    EXPECT_EQ(sqrt2, constants::SQRT_2);
-
-    // Test compile-time functions
-    constexpr double angle_rad = constants::deg_to_rad(90.0);
-    EXPECT_NEAR(angle_rad, constants::PI_2, 1e-15);
-
-    constexpr double angle_deg = constants::rad_to_deg(constants::PI);
-    EXPECT_NEAR(angle_deg, 180.0, 1e-13);
-}
-
-// =============================================================================
-// Type Traits Tests
-// =============================================================================
-
-TEST_F(MathConstantsTest, TypedConstants) {
-    // Test float versions
-    EXPECT_NEAR(constants::PI_F, static_cast<float>(constants::PI), 1e-7f);
-    EXPECT_NEAR(constants::E_F, static_cast<float>(constants::E), 1e-7f);
-    EXPECT_NEAR(constants::SQRT_2_F, static_cast<float>(constants::SQRT_2), 1e-7f);
-
-    // Test long double versions
-    EXPECT_NEAR(constants::PI_L, static_cast<long double>(constants::PI), 1e-18L);
-    EXPECT_NEAR(constants::E_L, static_cast<long double>(constants::E), 1e-18L);
-}
-
-// =============================================================================
-// Special Functions Tests
-// =============================================================================
-
-TEST_F(MathConstantsTest, SignFunction) {
-    // Test sign function
-    EXPECT_EQ(constants::sign(5.0), 1);
-    EXPECT_EQ(constants::sign(-5.0), -1);
-    EXPECT_EQ(constants::sign(0.0), 0);
-
-    // Test with very small values
-    EXPECT_EQ(constants::sign(constants::EPSILON), 1);
-    EXPECT_EQ(constants::sign(-constants::EPSILON), -1);
-
-    // Test with infinity
-    EXPECT_EQ(constants::sign(constants::INF_VALUE), 1);
-    EXPECT_EQ(constants::sign(-constants::INF_VALUE), -1);
-}
-
-TEST_F(MathConstantsTest, SafeDivision) {
-    // Test safe division
-    EXPECT_NEAR(constants::safe_divide(10.0, 2.0), 5.0, 1e-15);
-    EXPECT_NEAR(constants::safe_divide(1.0, 3.0), 1.0/3.0, 1e-15);
-
-    // Test division by zero returns default
-    EXPECT_EQ(constants::safe_divide(1.0, 0.0, 999.0), 999.0);
-    EXPECT_EQ(constants::safe_divide(1.0, constants::EPSILON/2.0, -1.0), -1.0);
-
-    // Test division by near-zero
-    double tiny = constants::DEFAULT_TOLERANCE / 10.0;
-    EXPECT_EQ(constants::safe_divide(1.0, tiny, 0.0), 0.0);
-}
-
-// =============================================================================
-// Utility Functions Tests
-// =============================================================================
-
-TEST_F(MathConstantsTest, ClampFunction) {
-    // Test clamping
-    EXPECT_EQ(constants::clamp(5.0, 0.0, 10.0), 5.0);
-    EXPECT_EQ(constants::clamp(-5.0, 0.0, 10.0), 0.0);
-    EXPECT_EQ(constants::clamp(15.0, 0.0, 10.0), 10.0);
-
-    // Test with same min/max
-    EXPECT_EQ(constants::clamp(5.0, 3.0, 3.0), 3.0);
-
-    // Test with infinity
-    EXPECT_EQ(constants::clamp(constants::INF_VALUE, 0.0, 10.0), 10.0);
-    EXPECT_EQ(constants::clamp(-constants::INF_VALUE, 0.0, 10.0), 0.0);
-}
-
-TEST_F(MathConstantsTest, LerpFunction) {
-    // Test linear interpolation
-    EXPECT_NEAR(constants::lerp(0.0, 10.0, 0.0), 0.0, 1e-15);
-    EXPECT_NEAR(constants::lerp(0.0, 10.0, 1.0), 10.0, 1e-15);
-    EXPECT_NEAR(constants::lerp(0.0, 10.0, 0.5), 5.0, 1e-15);
-    EXPECT_NEAR(constants::lerp(0.0, 10.0, 0.25), 2.5, 1e-15);
-
-    // Test extrapolation
-    EXPECT_NEAR(constants::lerp(0.0, 10.0, -0.5), -5.0, 1e-15);
-    EXPECT_NEAR(constants::lerp(0.0, 10.0, 1.5), 15.0, 1e-15);
-
-    // Test with negative range
-    EXPECT_NEAR(constants::lerp(-10.0, -5.0, 0.5), -7.5, 1e-15);
-}
diff --git a/tests/unitTests/FE/Math/test_Matrix.cpp b/tests/unitTests/FE/Math/test_Matrix.cpp
index c186c26ee..3b2fe664a 100644
--- a/tests/unitTests/FE/Math/test_Matrix.cpp
+++ b/tests/unitTests/FE/Math/test_Matrix.cpp
@@ -7,7 +7,6 @@
 #include "FE/Math/Matrix.h"
 #include "FE/Math/Vector.h"
 #include "FE/Math/MatrixExpr.h"
-#include "FE/Math/MathConstants.h"
 #include <limits>
 #include <cmath>
 #include <thread>
diff --git a/tests/unitTests/FE/Math/test_MatrixExpr.cpp b/tests/unitTests/FE/Math/test_MatrixExpr.cpp
index 9486f409c..b17bce928 100644
--- a/tests/unitTests/FE/Math/test_MatrixExpr.cpp
+++ b/tests/unitTests/FE/Math/test_MatrixExpr.cpp
@@ -7,7 +7,6 @@
 #include "FE/Math/Matrix.h"
 #include "FE/Math/MatrixExpr.h"
 #include "FE/Math/Vector.h"
-#include "FE/Math/MathConstants.h"
 #include <limits>
 #include <cmath>
 #include <memory>
diff --git a/tests/unitTests/FE/Math/test_Vector.cpp b/tests/unitTests/FE/Math/test_Vector.cpp
index a38a71727..754ad819d 100644
--- a/tests/unitTests/FE/Math/test_Vector.cpp
+++ b/tests/unitTests/FE/Math/test_Vector.cpp
@@ -6,7 +6,6 @@
 #include <gtest/gtest.h>
 #include "FE/Math/Vector.h"
 #include "FE/Math/VectorExpr.h"
-#include "FE/Math/MathConstants.h"
 #include <limits>
 #include <cmath>
 #include <sstream>
diff --git a/tests/unitTests/FE/Math/test_VectorExpr.cpp b/tests/unitTests/FE/Math/test_VectorExpr.cpp
index bd6d85d51..0e7363c64 100644
--- a/tests/unitTests/FE/Math/test_VectorExpr.cpp
+++ b/tests/unitTests/FE/Math/test_VectorExpr.cpp
@@ -6,7 +6,6 @@
 #include <gtest/gtest.h>
 #include "FE/Math/Vector.h"
 #include "FE/Math/VectorExpr.h"
-#include "FE/Math/MathConstants.h"
 #include <limits>
 #include <cmath>
 #include <memory>

From 2a97fa0466796913614d22a7e9f4c089e1a1d257 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Mon, 8 Jun 2026 13:43:26 -0700
Subject: [PATCH 08/91] consolidating math support for integer functions and
 expression operations

---
 .../solver/FE/Basis/SerendipityBasis.cpp      |  25 +-
 Code/Source/solver/FE/Math/ExpressionOps.h    |  99 ----
 Code/Source/solver/FE/Math/IntegerMath.h      |  98 ----
 Code/Source/solver/FE/Math/MatrixExpr.h       |   5 +-
 Code/Source/solver/FE/Math/VectorExpr.h       |  59 +-
 .../unitTests/FE/Math/test_ExpressionOps.cpp  | 508 ------------------
 6 files changed, 75 insertions(+), 719 deletions(-)
 delete mode 100644 Code/Source/solver/FE/Math/ExpressionOps.h
 delete mode 100644 Code/Source/solver/FE/Math/IntegerMath.h
 delete mode 100644 tests/unitTests/FE/Math/test_ExpressionOps.cpp

diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index e6395cee4..237f8c2ce 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -9,7 +9,6 @@
 #include "LagrangeBasis.h"
 #include "NodeOrderingConventions.h"
 #include "Math/DenseLinearAlgebra.h"
-#include "Math/IntegerMath.h"
 
 #include <algorithm>
 #include <array>
@@ -21,8 +20,6 @@ namespace svmp {
 namespace FE {
 namespace basis {
 
-using math::pow_int;
-
 namespace {
 using Vec3 = math::Vector<Real, 3>;
 
@@ -150,7 +147,7 @@ std::vector<Real> quad_serendipity_inverse_vandermonde(
         const Real y = nodes[static_cast<std::size_t>(row)][1];
         for (int col = 0; col < n; ++col) {
             const auto [ax, ay] = exponents[static_cast<std::size_t>(col)];
-            vandermonde[idx(row, col)] = pow_int(x, ax) * pow_int(y, ay);
+            vandermonde[idx(row, col)] = std::pow(x, ax) * std::pow(y, ay);
         }
     }
 
@@ -521,7 +518,7 @@ void SerendipityBasis::evaluate_values(const math::Vector<Real, 3>& xi,
         std::vector<Real> monomials(size_, Real(0));
         for (std::size_t j = 0; j < size_; ++j) {
             const auto [ax, ay] = quad_monomial_exponents_[j];
-            monomials[j] = pow_int(x, ax) * pow_int(y, ay);
+            monomials[j] = std::pow(x, ax) * std::pow(y, ay);
         }
 
         for (std::size_t i = 0; i < size_; ++i) {
@@ -609,8 +606,10 @@ void SerendipityBasis::evaluate_gradients(const math::Vector<Real, 3>& xi,
         std::vector<Real> dmon_dy(size_, Real(0));
         for (std::size_t j = 0; j < size_; ++j) {
             const auto [ax, ay] = quad_monomial_exponents_[j];
-            dmon_dx[j] = (ax > 0) ? Real(ax) * pow_int(x, ax - 1) * pow_int(y, ay) : Real(0);
-            dmon_dy[j] = (ay > 0) ? pow_int(x, ax) * Real(ay) * pow_int(y, ay - 1) : Real(0);
+            dmon_dx[j] =
+                (ax > 0) ? Real(ax) * std::pow(x, ax - 1) * std::pow(y, ay) : Real(0);
+            dmon_dy[j] =
+                (ay > 0) ? std::pow(x, ax) * Real(ay) * std::pow(y, ay - 1) : Real(0);
         }
 
         for (std::size_t i = 0; i < size_; ++i) {
@@ -747,9 +746,15 @@ void SerendipityBasis::evaluate_hessians(const math::Vector<Real, 3>& xi,
         std::vector<Real> dyy(size_, Real(0));
         for (std::size_t j = 0; j < size_; ++j) {
             const auto [ax, ay] = quad_monomial_exponents_[j];
-            dxx[j] = (ax > 1) ? Real(ax * (ax - 1)) * pow_int(x, ax - 2) * pow_int(y, ay) : Real(0);
-            dxy[j] = (ax > 0 && ay > 0) ? Real(ax * ay) * pow_int(x, ax - 1) * pow_int(y, ay - 1) : Real(0);
-            dyy[j] = (ay > 1) ? Real(ay * (ay - 1)) * pow_int(x, ax) * pow_int(y, ay - 2) : Real(0);
+            dxx[j] = (ax > 1)
+                         ? Real(ax * (ax - 1)) * std::pow(x, ax - 2) * std::pow(y, ay)
+                         : Real(0);
+            dxy[j] = (ax > 0 && ay > 0)
+                         ? Real(ax * ay) * std::pow(x, ax - 1) * std::pow(y, ay - 1)
+                         : Real(0);
+            dyy[j] = (ay > 1)
+                         ? Real(ay * (ay - 1)) * std::pow(x, ax) * std::pow(y, ay - 2)
+                         : Real(0);
         }
 
         for (std::size_t i = 0; i < size_; ++i) {
diff --git a/Code/Source/solver/FE/Math/ExpressionOps.h b/Code/Source/solver/FE/Math/ExpressionOps.h
deleted file mode 100644
index 96cea1037..000000000
--- a/Code/Source/solver/FE/Math/ExpressionOps.h
+++ /dev/null
@@ -1,99 +0,0 @@
-#ifndef SVMP_FE_MATH_EXPRESSION_OPS_H
-#define SVMP_FE_MATH_EXPRESSION_OPS_H
-
-/**
- * @file ExpressionOps.h
- * @brief Common expression template operators for vector and matrix expressions
- *
- * This header provides shared operator functors used by both VectorExpr.h and
- * MatrixExpr.h to avoid code duplication and namespace conflicts. All operators
- * are defined in the detail::ops namespace for internal use by expression templates.
- */
-
-#include <cmath>
-
-namespace svmp {
-namespace FE {
-namespace math {
-namespace detail {
-namespace ops {
-
-/**
- * @brief Addition operator functor
- */
-struct Add {
-    template<typename T1, typename T2>
-    constexpr auto operator()(const T1& a, const T2& b) const {
-        return a + b;
-    }
-};
-
-/**
- * @brief Subtraction operator functor
- */
-struct Sub {
-    template<typename T1, typename T2>
-    constexpr auto operator()(const T1& a, const T2& b) const {
-        return a - b;
-    }
-};
-
-/**
- * @brief Multiplication operator functor
- */
-struct Mul {
-    template<typename T1, typename T2>
-    constexpr auto operator()(const T1& a, const T2& b) const {
-        return a * b;
-    }
-};
-
-/**
- * @brief Division operator functor
- */
-struct Div {
-    template<typename T1, typename T2>
-    constexpr auto operator()(const T1& a, const T2& b) const {
-        return a / b;
-    }
-};
-
-/**
- * @brief Negation operator functor
- */
-struct Negate {
-    template<typename T>
-    constexpr auto operator()(const T& a) const {
-        return -a;
-    }
-};
-
-/**
- * @brief Absolute value operator functor
- */
-struct Abs {
-    template<typename T>
-    constexpr auto operator()(const T& a) const {
-        using std::abs;
-        return abs(a);
-    }
-};
-
-/**
- * @brief Square root operator functor
- */
-struct Sqrt {
-    template<typename T>
-    constexpr auto operator()(const T& a) const {
-        using std::sqrt;
-        return sqrt(a);
-    }
-};
-
-} // namespace ops
-} // namespace detail
-} // namespace math
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_MATH_EXPRESSION_OPS_H
diff --git a/Code/Source/solver/FE/Math/IntegerMath.h b/Code/Source/solver/FE/Math/IntegerMath.h
deleted file mode 100644
index 52a50117f..000000000
--- a/Code/Source/solver/FE/Math/IntegerMath.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
-
-#ifndef SVMP_FE_MATH_INTEGERMATH_H
-#define SVMP_FE_MATH_INTEGERMATH_H
-
-#include "Types.h"
-
-#include <cstddef>
-#include <limits>
-#include <numeric>
-#include <stdexcept>
-
-namespace svmp {
-namespace FE {
-namespace math {
-
-[[nodiscard]] constexpr Real pow_int_nonnegative(Real base, int exponent) noexcept {
-    Real result = Real(1);
-    Real factor = base;
-    int power = exponent;
-    while (power > 0) {
-        if ((power & 1) != 0) {
-            result *= factor;
-        }
-        power >>= 1;
-        if (power > 0) {
-            factor *= factor;
-        }
-    }
-    return result;
-}
-
-[[nodiscard]] constexpr Real pow_int(Real base, int exponent) noexcept {
-    if (exponent < 0) {
-        return Real(1) / pow_int_nonnegative(base, -exponent);
-    }
-    return pow_int_nonnegative(base, exponent);
-}
-
-[[nodiscard]] constexpr std::size_t binomial_size(int n, int k) {
-    if (n < 0 || k < 0 || k > n) {
-        return 0u;
-    }
-    if (k > n - k) {
-        k = n - k;
-    }
-
-    std::size_t result = 1u;
-    for (int i = 1; i <= k; ++i) {
-        auto numerator = static_cast<std::size_t>(n - (k - i));
-        auto denominator = static_cast<std::size_t>(i);
-
-        const auto numerator_gcd = std::gcd(numerator, denominator);
-        numerator /= numerator_gcd;
-        denominator /= numerator_gcd;
-
-        const auto result_gcd = std::gcd(result, denominator);
-        result /= result_gcd;
-        denominator /= result_gcd;
-        if (denominator != 1u) {
-            throw std::overflow_error(
-                "binomial_size: failed to reduce exact binomial factor");
-        }
-        if (numerator != 0u &&
-            result > std::numeric_limits<std::size_t>::max() / numerator) {
-            throw std::overflow_error("binomial_size: result does not fit in size_t");
-        }
-        result *= numerator;
-    }
-    return result;
-}
-
-[[nodiscard]] constexpr Real binomial_real(int n, int k) noexcept {
-    if (k < 0 || k > n) {
-        return Real(0);
-    }
-    if (k > n - k) {
-        k = n - k;
-    }
-
-    Real result = Real(1);
-    for (int i = 1; i <= k; ++i) {
-        result *= static_cast<Real>(n - (k - i));
-        result /= static_cast<Real>(i);
-    }
-    return result;
-}
-
-} // namespace math
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_MATH_INTEGERMATH_H
diff --git a/Code/Source/solver/FE/Math/MatrixExpr.h b/Code/Source/solver/FE/Math/MatrixExpr.h
index 097f35361..13010bddf 100644
--- a/Code/Source/solver/FE/Math/MatrixExpr.h
+++ b/Code/Source/solver/FE/Math/MatrixExpr.h
@@ -11,10 +11,11 @@
  */
 
 #include <algorithm>
+#include <cmath>
 #include <cstddef>
 #include <type_traits>
-#include <cmath>
-#include "ExpressionOps.h"
+
+#include "VectorExpr.h"
 
 namespace svmp {
 namespace FE {
diff --git a/Code/Source/solver/FE/Math/VectorExpr.h b/Code/Source/solver/FE/Math/VectorExpr.h
index 627d2fd88..178b66b8a 100644
--- a/Code/Source/solver/FE/Math/VectorExpr.h
+++ b/Code/Source/solver/FE/Math/VectorExpr.h
@@ -10,14 +10,69 @@
  * of assignment, eliminating intermediate allocations and improving performance.
  */
 
+#include <cmath>
 #include <cstddef>
 #include <type_traits>
-#include <cmath>
-#include "ExpressionOps.h"
 
 namespace svmp {
 namespace FE {
 namespace math {
+namespace detail {
+namespace ops {
+
+struct Add {
+    template<typename T1, typename T2>
+    constexpr auto operator()(const T1& a, const T2& b) const {
+        return a + b;
+    }
+};
+
+struct Sub {
+    template<typename T1, typename T2>
+    constexpr auto operator()(const T1& a, const T2& b) const {
+        return a - b;
+    }
+};
+
+struct Mul {
+    template<typename T1, typename T2>
+    constexpr auto operator()(const T1& a, const T2& b) const {
+        return a * b;
+    }
+};
+
+struct Div {
+    template<typename T1, typename T2>
+    constexpr auto operator()(const T1& a, const T2& b) const {
+        return a / b;
+    }
+};
+
+struct Negate {
+    template<typename T>
+    constexpr auto operator()(const T& a) const {
+        return -a;
+    }
+};
+
+struct Abs {
+    template<typename T>
+    constexpr auto operator()(const T& a) const {
+        using std::abs;
+        return abs(a);
+    }
+};
+
+struct Sqrt {
+    template<typename T>
+    constexpr auto operator()(const T& a) const {
+        using std::sqrt;
+        return sqrt(a);
+    }
+};
+
+} // namespace ops
+} // namespace detail
 
 /**
  * @brief Base class for all vector expressions using CRTP
diff --git a/tests/unitTests/FE/Math/test_ExpressionOps.cpp b/tests/unitTests/FE/Math/test_ExpressionOps.cpp
deleted file mode 100644
index a368e345e..000000000
--- a/tests/unitTests/FE/Math/test_ExpressionOps.cpp
+++ /dev/null
@@ -1,508 +0,0 @@
-/**
- * @file test_ExpressionOps.cpp
- * @brief Unit tests for ExpressionOps.h - expression template operators
- */
-
-#include <gtest/gtest.h>
-#include "FE/Math/ExpressionOps.h"
-#include "FE/Math/Vector.h"
-#include "FE/Math/Matrix.h"
-#include <limits>
-#include <cmath>
-#include <complex>
-#include <type_traits>
-
-using namespace svmp::FE::math;
-using namespace svmp::FE::math::detail::ops;
-
-// Test fixture for ExpressionOps tests
-class ExpressionOpsTest : public ::testing::Test {
-protected:
-    static constexpr double tolerance = 1e-14;
-
-    void SetUp() override {}
-    void TearDown() override {}
-
-    template<typename T>
-    bool approx_equal(T a, T b, T tol = tolerance) {
-        return std::abs(a - b) <= tol;
-    }
-};
-
-// =============================================================================
-// Binary Operation Tests
-// =============================================================================
-
-TEST_F(ExpressionOpsTest, AddOperator) {
-    Add op;
-
-    // Integer addition
-    EXPECT_EQ(op(5, 3), 8);
-    EXPECT_EQ(op(-5, 3), -2);
-    EXPECT_EQ(op(-5, -3), -8);
-
-    // Floating point addition
-    EXPECT_DOUBLE_EQ(op(3.14, 2.86), 6.0);
-    EXPECT_DOUBLE_EQ(op(-1.5, 2.5), 1.0);
-
-    // Mixed types
-    auto result = op(3, 2.5);
-    EXPECT_TRUE((std::is_same_v<decltype(result), double>));
-    EXPECT_DOUBLE_EQ(result, 5.5);
-}
-
-TEST_F(ExpressionOpsTest, SubOperator) {
-    Sub op;
-
-    // Integer subtraction
-    EXPECT_EQ(op(5, 3), 2);
-    EXPECT_EQ(op(3, 5), -2);
-    EXPECT_EQ(op(-5, -3), -2);
-
-    // Floating point subtraction
-    EXPECT_DOUBLE_EQ(op(5.5, 2.5), 3.0);
-    EXPECT_DOUBLE_EQ(op(2.5, 5.5), -3.0);
-
-    // Mixed types
-    auto result = op(5.5, 2);
-    EXPECT_TRUE((std::is_same_v<decltype(result), double>));
-    EXPECT_DOUBLE_EQ(result, 3.5);
-}
-
-TEST_F(ExpressionOpsTest, MulOperator) {
-    Mul op;
-
-    // Integer multiplication
-    EXPECT_EQ(op(5, 3), 15);
-    EXPECT_EQ(op(-5, 3), -15);
-    EXPECT_EQ(op(-5, -3), 15);
-
-    // Floating point multiplication
-    EXPECT_DOUBLE_EQ(op(2.5, 4.0), 10.0);
-    EXPECT_DOUBLE_EQ(op(-2.5, 4.0), -10.0);
-
-    // Zero multiplication
-    EXPECT_EQ(op(0, 100), 0);
-    EXPECT_DOUBLE_EQ(op(0.0, 3.14), 0.0);
-
-    // Mixed types
-    auto result = op(3, 2.5);
-    EXPECT_TRUE((std::is_same_v<decltype(result), double>));
-    EXPECT_DOUBLE_EQ(result, 7.5);
-}
-
-TEST_F(ExpressionOpsTest, DivOperator) {
-    Div op;
-
-    // Integer division
-    EXPECT_EQ(op(10, 2), 5);
-    EXPECT_EQ(op(10, 3), 3);  // Integer division truncates
-    EXPECT_EQ(op(-10, 2), -5);
-
-    // Floating point division
-    EXPECT_DOUBLE_EQ(op(10.0, 2.0), 5.0);
-    EXPECT_DOUBLE_EQ(op(10.0, 3.0), 10.0/3.0);
-    EXPECT_DOUBLE_EQ(op(-10.0, 2.0), -5.0);
-
-    // Mixed types
-    auto result = op(10.0, 3);
-    EXPECT_TRUE((std::is_same_v<decltype(result), double>));
-    EXPECT_DOUBLE_EQ(result, 10.0/3.0);
-}
-
-// =============================================================================
-// Unary Operation Tests
-// =============================================================================
-
-TEST_F(ExpressionOpsTest, NegateOperator) {
-    Negate op;
-
-    // Integer negation
-    EXPECT_EQ(op(5), -5);
-    EXPECT_EQ(op(-5), 5);
-    EXPECT_EQ(op(0), 0);
-
-    // Floating point negation
-    EXPECT_DOUBLE_EQ(op(3.14), -3.14);
-    EXPECT_DOUBLE_EQ(op(-2.71), 2.71);
-    EXPECT_DOUBLE_EQ(op(0.0), 0.0);
-
-    // Type preservation
-    auto int_result = op(5);
-    EXPECT_TRUE((std::is_same_v<decltype(int_result), int>));
-
-    auto double_result = op(5.0);
-    EXPECT_TRUE((std::is_same_v<decltype(double_result), double>));
-}
-
-TEST_F(ExpressionOpsTest, AbsOperator) {
-    Abs op;
-
-    // Integer absolute value
-    EXPECT_EQ(op(5), 5);
-    EXPECT_EQ(op(-5), 5);
-    EXPECT_EQ(op(0), 0);
-
-    // Floating point absolute value
-    EXPECT_DOUBLE_EQ(op(3.14), 3.14);
-    EXPECT_DOUBLE_EQ(op(-3.14), 3.14);
-    EXPECT_DOUBLE_EQ(op(0.0), 0.0);
-
-    // Special cases
-    EXPECT_DOUBLE_EQ(op(-0.0), 0.0);
-
-    // Type preservation
-    auto int_result = op(-5);
-    EXPECT_TRUE((std::is_same_v<decltype(int_result), int>));
-
-    auto double_result = op(-5.0);
-    EXPECT_TRUE((std::is_same_v<decltype(double_result), double>));
-}
-
-TEST_F(ExpressionOpsTest, SqrtOperator) {
-    Sqrt op;
-
-    // Perfect squares
-    EXPECT_DOUBLE_EQ(op(4.0), 2.0);
-    EXPECT_DOUBLE_EQ(op(9.0), 3.0);
-    EXPECT_DOUBLE_EQ(op(16.0), 4.0);
-    EXPECT_DOUBLE_EQ(op(25.0), 5.0);
-
-    // Non-perfect squares
-    EXPECT_DOUBLE_EQ(op(2.0), std::sqrt(2.0));
-    EXPECT_DOUBLE_EQ(op(3.0), std::sqrt(3.0));
-
-    // Special cases
-    EXPECT_DOUBLE_EQ(op(0.0), 0.0);
-    EXPECT_DOUBLE_EQ(op(1.0), 1.0);
-
-    // Type conversion
-    auto result = op(4);  // Integer input
-    EXPECT_DOUBLE_EQ(result, 2.0);
-}
-
-// =============================================================================
-// Constexpr Tests
-// =============================================================================
-
-TEST_F(ExpressionOpsTest, ConstexprOperators) {
-    // Test that operators can be used in constexpr contexts
-    constexpr Add add_op;
-    constexpr Sub sub_op;
-    constexpr Mul mul_op;
-    constexpr Div div_op;
-    constexpr Negate neg_op;
-
-    // Compile-time evaluation
-    constexpr auto sum = add_op(3, 4);
-    constexpr auto diff = sub_op(7, 3);
-    constexpr auto prod = mul_op(3, 4);
-    constexpr auto quot = div_op(12, 3);
-    constexpr auto neg = neg_op(5);
-
-    EXPECT_EQ(sum, 7);
-    EXPECT_EQ(diff, 4);
-    EXPECT_EQ(prod, 12);
-    EXPECT_EQ(quot, 4);
-    EXPECT_EQ(neg, -5);
-
-    // Static assertions to verify compile-time evaluation
-    static_assert(add_op(2, 3) == 5);
-    static_assert(sub_op(5, 2) == 3);
-    static_assert(mul_op(3, 4) == 12);
-    static_assert(div_op(10, 2) == 5);
-    static_assert(neg_op(3) == -3);
-}
-
-// =============================================================================
-// Type Deduction Tests
-// =============================================================================
-
-TEST_F(ExpressionOpsTest, TypeDeduction) {
-    Add add_op;
-    Sub sub_op;
-    Mul mul_op;
-    Div div_op;
-
-    // int + int -> int
-    auto int_result = add_op(3, 4);
-    EXPECT_TRUE((std::is_same_v<decltype(int_result), int>));
-
-    // double + double -> double
-    auto double_result = add_op(3.0, 4.0);
-    EXPECT_TRUE((std::is_same_v<decltype(double_result), double>));
-
-    // int + double -> double
-    auto mixed_result1 = add_op(3, 4.0);
-    EXPECT_TRUE((std::is_same_v<decltype(mixed_result1), double>));
-
-    // double + int -> double
-    auto mixed_result2 = add_op(3.0, 4);
-    EXPECT_TRUE((std::is_same_v<decltype(mixed_result2), double>));
-
-    // float + double -> double
-    auto float_double_result = add_op(3.0f, 4.0);
-    EXPECT_TRUE((std::is_same_v<decltype(float_double_result), double>));
-}
-
-// =============================================================================
-// Complex Expression Tests
-// =============================================================================
-
-TEST_F(ExpressionOpsTest, ChainedOperations) {
-    Add add_op;
-    Sub sub_op;
-    Mul mul_op;
-    Div div_op;
-    Negate neg_op;
-
-    // Simulate complex expression: -(a + b) * c / d
-    double a = 2.0, b = 3.0, c = 4.0, d = 2.0;
-
-    auto sum = add_op(a, b);       // 5.0
-    auto negated = neg_op(sum);    // -5.0
-    auto product = mul_op(negated, c);  // -20.0
-    auto result = div_op(product, d);   // -10.0
-
-    EXPECT_DOUBLE_EQ(result, -10.0);
-}
-
-TEST_F(ExpressionOpsTest, MixedPrecisionChain) {
-    Add add_op;
-    Mul mul_op;
-
-    // Mixed precision chain
-    int a = 2;
-    float b = 3.5f;
-    double c = 1.5;
-
-    auto step1 = add_op(a, b);    // int + float -> float (5.5f)
-    auto step2 = mul_op(step1, c); // float + double -> double (8.25)
-
-    EXPECT_TRUE((std::is_same_v<decltype(step2), double>));
-    EXPECT_DOUBLE_EQ(step2, 8.25);
-}
-
-// =============================================================================
-// Operator Integration with Vector/Matrix Tests
-// =============================================================================
-
-TEST_F(ExpressionOpsTest, VectorIntegration) {
-    Vector<double, 3> v1{1.0, 2.0, 3.0};
-    Vector<double, 3> v2{4.0, 5.0, 6.0};
-
-    // Test that operators work correctly in vector expressions
-    Vector<double, 3> sum = v1 + v2;
-    Vector<double, 3> diff = v1 - v2;
-    Vector<double, 3> neg = -v1;
-    Vector<double, 3> scaled = v1 * 2.0;
-
-    EXPECT_DOUBLE_EQ(sum[0], 5.0);
-    EXPECT_DOUBLE_EQ(sum[1], 7.0);
-    EXPECT_DOUBLE_EQ(sum[2], 9.0);
-
-    EXPECT_DOUBLE_EQ(diff[0], -3.0);
-    EXPECT_DOUBLE_EQ(diff[1], -3.0);
-    EXPECT_DOUBLE_EQ(diff[2], -3.0);
-
-    EXPECT_DOUBLE_EQ(neg[0], -1.0);
-    EXPECT_DOUBLE_EQ(neg[1], -2.0);
-    EXPECT_DOUBLE_EQ(neg[2], -3.0);
-
-    EXPECT_DOUBLE_EQ(scaled[0], 2.0);
-    EXPECT_DOUBLE_EQ(scaled[1], 4.0);
-    EXPECT_DOUBLE_EQ(scaled[2], 6.0);
-}
-
-TEST_F(ExpressionOpsTest, MatrixIntegration) {
-    Matrix<double, 2, 2> m1{{1.0, 2.0}, {3.0, 4.0}};
-    Matrix<double, 2, 2> m2{{5.0, 6.0}, {7.0, 8.0}};
-
-    // Test that operators work correctly in matrix expressions
-    Matrix<double, 2, 2> sum = m1 + m2;
-    Matrix<double, 2, 2> diff = m1 - m2;
-    Matrix<double, 2, 2> neg = -m1;
-    Matrix<double, 2, 2> scaled = m1 * 2.0;
-
-    EXPECT_DOUBLE_EQ(sum(0, 0), 6.0);
-    EXPECT_DOUBLE_EQ(sum(0, 1), 8.0);
-    EXPECT_DOUBLE_EQ(sum(1, 0), 10.0);
-    EXPECT_DOUBLE_EQ(sum(1, 1), 12.0);
-
-    EXPECT_DOUBLE_EQ(diff(0, 0), -4.0);
-    EXPECT_DOUBLE_EQ(diff(0, 1), -4.0);
-    EXPECT_DOUBLE_EQ(diff(1, 0), -4.0);
-    EXPECT_DOUBLE_EQ(diff(1, 1), -4.0);
-
-    EXPECT_DOUBLE_EQ(neg(0, 0), -1.0);
-    EXPECT_DOUBLE_EQ(neg(0, 1), -2.0);
-    EXPECT_DOUBLE_EQ(neg(1, 0), -3.0);
-    EXPECT_DOUBLE_EQ(neg(1, 1), -4.0);
-
-    EXPECT_DOUBLE_EQ(scaled(0, 0), 2.0);
-    EXPECT_DOUBLE_EQ(scaled(0, 1), 4.0);
-    EXPECT_DOUBLE_EQ(scaled(1, 0), 6.0);
-    EXPECT_DOUBLE_EQ(scaled(1, 1), 8.0);
-}
-
-// =============================================================================
-// Edge Cases and Special Values Tests
-// =============================================================================
-
-TEST_F(ExpressionOpsTest, SpecialFloatingPointValues) {
-    Add add_op;
-    Sub sub_op;
-    Mul mul_op;
-    Div div_op;
-    Abs abs_op;
-    Negate neg_op;
-
-    // Infinity handling
-    double inf = std::numeric_limits<double>::infinity();
-    EXPECT_DOUBLE_EQ(add_op(inf, 1.0), inf);
-    EXPECT_DOUBLE_EQ(sub_op(inf, 1.0), inf);
-    EXPECT_DOUBLE_EQ(mul_op(inf, 2.0), inf);
-    EXPECT_DOUBLE_EQ(div_op(inf, 2.0), inf);
-    EXPECT_DOUBLE_EQ(abs_op(inf), inf);
-    EXPECT_DOUBLE_EQ(neg_op(inf), -inf);
-
-    // NaN handling
-    double nan = std::numeric_limits<double>::quiet_NaN();
-    EXPECT_TRUE(std::isnan(add_op(nan, 1.0)));
-    EXPECT_TRUE(std::isnan(sub_op(nan, 1.0)));
-    EXPECT_TRUE(std::isnan(mul_op(nan, 2.0)));
-    EXPECT_TRUE(std::isnan(div_op(nan, 2.0)));
-    EXPECT_TRUE(std::isnan(abs_op(nan)));
-    EXPECT_TRUE(std::isnan(neg_op(nan)));
-
-    // Division by zero
-    EXPECT_DOUBLE_EQ(div_op(1.0, 0.0), inf);
-    EXPECT_DOUBLE_EQ(div_op(-1.0, 0.0), -inf);
-    EXPECT_TRUE(std::isnan(div_op(0.0, 0.0)));
-}
-
-TEST_F(ExpressionOpsTest, LargeAndSmallValues) {
-    Add add_op;
-    Mul mul_op;
-
-    // Large values
-    double large = 1e308;
-    double result = add_op(large, large);
-    EXPECT_TRUE(std::isinf(result));  // Overflow to infinity
-
-    // Small values
-    double tiny = std::numeric_limits<double>::min();
-    double tiny_result = mul_op(tiny, 0.5);
-    EXPECT_GT(tiny_result, 0.0);  // Should still be positive
-    EXPECT_LT(tiny_result, tiny);  // But smaller
-
-    // Denormalized numbers
-    double denorm = std::numeric_limits<double>::denorm_min();
-    double denorm_result = add_op(denorm, denorm);
-    EXPECT_EQ(denorm_result, 2.0 * denorm);
-}
-
-// =============================================================================
-// SFINAE and Compile-time Constraint Tests
-// =============================================================================
-
-TEST_F(ExpressionOpsTest, SFINAECompatibility) {
-    // Test that operators work with any arithmetic types
-    Add add_op;
-
-    // Various integer types
-    EXPECT_EQ(add_op(int8_t(3), int8_t(4)), 7);
-    EXPECT_EQ(add_op(int16_t(100), int16_t(200)), 300);
-    EXPECT_EQ(add_op(int32_t(1000), int32_t(2000)), 3000);
-    EXPECT_EQ(add_op(int64_t(10000), int64_t(20000)), 30000);
-
-    // Unsigned types
-    EXPECT_EQ(add_op(uint8_t(3), uint8_t(4)), 7u);
-    EXPECT_EQ(add_op(uint16_t(100), uint16_t(200)), 300u);
-    EXPECT_EQ(add_op(uint32_t(1000), uint32_t(2000)), 3000u);
-
-    // Floating point types
-    EXPECT_FLOAT_EQ(add_op(3.0f, 4.0f), 7.0f);
-    EXPECT_DOUBLE_EQ(add_op(3.0, 4.0), 7.0);
-
-    // Long double
-    long double ld1 = 3.0L;
-    long double ld2 = 4.0L;
-    EXPECT_DOUBLE_EQ(add_op(ld1, ld2), 7.0L);
-}
-
-// =============================================================================
-// Template Instantiation Tests
-// =============================================================================
-
-TEST_F(ExpressionOpsTest, TemplateInstantiations) {
-    // Test that operators can be instantiated with various types
-    Add add_op;
-    Sub sub_op;
-    Mul mul_op;
-    Div div_op;
-    Abs abs_op;
-    Sqrt sqrt_op;
-    Negate neg_op;
-
-    // Custom types that support arithmetic operations
-    struct CustomNumber {
-        double value;
-        CustomNumber(double v) : value(v) {}
-        CustomNumber operator+(const CustomNumber& other) const { return CustomNumber(value + other.value); }
-        CustomNumber operator-(const CustomNumber& other) const { return CustomNumber(value - other.value); }
-        CustomNumber operator*(const CustomNumber& other) const { return CustomNumber(value * other.value); }
-        CustomNumber operator/(const CustomNumber& other) const { return CustomNumber(value / other.value); }
-        CustomNumber operator-() const { return CustomNumber(-value); }
-        bool operator==(const CustomNumber& other) const { return value == other.value; }
-    };
-
-    CustomNumber cn1(3.0);
-    CustomNumber cn2(4.0);
-
-    auto cn_sum = add_op(cn1, cn2);
-    EXPECT_EQ(cn_sum.value, 7.0);
-
-    auto cn_diff = sub_op(cn1, cn2);
-    EXPECT_EQ(cn_diff.value, -1.0);
-
-    auto cn_prod = mul_op(cn1, cn2);
-    EXPECT_EQ(cn_prod.value, 12.0);
-
-    auto cn_quot = div_op(cn1, cn2);
-    EXPECT_EQ(cn_quot.value, 0.75);
-
-    auto cn_neg = neg_op(cn1);
-    EXPECT_EQ(cn_neg.value, -3.0);
-}
-
-// =============================================================================
-// Complex Number Support Tests
-// =============================================================================
-
-TEST_F(ExpressionOpsTest, ComplexNumberSupport) {
-    Add add_op;
-    Sub sub_op;
-    Mul mul_op;
-    Div div_op;
-    Negate neg_op;
-
-    std::complex<double> c1(3.0, 4.0);
-    std::complex<double> c2(1.0, 2.0);
-
-    auto c_sum = add_op(c1, c2);
-    EXPECT_DOUBLE_EQ(c_sum.real(), 4.0);
-    EXPECT_DOUBLE_EQ(c_sum.imag(), 6.0);
-
-    auto c_diff = sub_op(c1, c2);
-    EXPECT_DOUBLE_EQ(c_diff.real(), 2.0);
-    EXPECT_DOUBLE_EQ(c_diff.imag(), 2.0);
-
-    auto c_prod = mul_op(c1, c2);
-    EXPECT_DOUBLE_EQ(c_prod.real(), -5.0);  // (3+4i)(1+2i) = 3+6i+4i+8i² = 3+10i-8 = -5+10i
-    EXPECT_DOUBLE_EQ(c_prod.imag(), 10.0);
-
-    auto c_neg = neg_op(c1);
-    EXPECT_DOUBLE_EQ(c_neg.real(), -3.0);
-    EXPECT_DOUBLE_EQ(c_neg.imag(), -4.0);
-}

From 7f2e0202de0896246f4a88b4d42ec38e60b72b3a Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Mon, 8 Jun 2026 14:16:50 -0700
Subject: [PATCH 09/91] removing the previous basis functions so that we are
 not maintaining two basis function infrastructures

---
 Code/Source/solver/README.md       |    2 +-
 Code/Source/solver/nn.cpp          |  256 +----
 Code/Source/solver/nn_elem_gnn.h   | 1586 ----------------------------
 Code/Source/solver/nn_elem_gnnxx.h |  139 ---
 4 files changed, 32 insertions(+), 1951 deletions(-)
 delete mode 100644 Code/Source/solver/nn_elem_gnn.h
 delete mode 100644 Code/Source/solver/nn_elem_gnnxx.h

diff --git a/Code/Source/solver/README.md b/Code/Source/solver/README.md
index 252999e8f..d11378e35 100644
--- a/Code/Source/solver/README.md
+++ b/Code/Source/solver/README.md
@@ -601,7 +601,7 @@ A map type used to set element properties.
 
 Computes shape functions and derivatives at given natural coords.
 
-- `set_face_shape_data[face.eType](gaus_pt, face)`
+- FE Basis face evaluation for supported mapped face elements.
 
 
 <!-- ============= -->
diff --git a/Code/Source/solver/nn.cpp b/Code/Source/solver/nn.cpp
index a9e0aebc3..1ec9984b6 100644
--- a/Code/Source/solver/nn.cpp
+++ b/Code/Source/solver/nn.cpp
@@ -1,7 +1,8 @@
 // SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
 // SPDX-License-Identifier: BSD-3-Clause
 
-// The functions defined here replicate the Fortran functions defined in NN.f.
+// Solver-facing element setup, Gauss integration, FE Basis evaluation, and
+// shape-function bounds.
 //
 // The functions are used to 
 //
@@ -25,13 +26,8 @@
 
 #include "lapack_defs.h"
 
-#include <algorithm>
 #include <array>
-#include <cstdlib>
-#include <cctype>
-#include <exception>
 #include <functional>
-#include <iostream> 
 #include <math.h> 
 #include <memory>
 #include <optional>
@@ -51,12 +47,6 @@ using namespace consts;
 // Define maps used to set element Gauss integration data. 
 #include "nn_elem_gip.h"
 
-// Define maps used to set element shape function data. 
-#include "nn_elem_gnn.h"
-
-// Define maps used to get element shape function 2nd derivative data. 
-#include "nn_elem_gnnxx.h"
-
 // Define a map type used to set the bounds of element shape functions.
 #include "nn_elem_nn_bnds.h"
 
@@ -71,77 +61,6 @@ struct BasisSelection {
   int order;
 };
 
-enum class BasisMode {
-  Auto,
-  Legacy,
-  Fe
-};
-
-std::string normalize_basis_mode_name(std::string value)
-{
-  std::transform(value.begin(), value.end(), value.begin(),
-      [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
-  return value;
-}
-
-BasisMode parse_basis_mode()
-{
-  const char* mode_env = std::getenv("SVMP_BASIS_MODE");
-  if (mode_env == nullptr || *mode_env == '\0') {
-    return BasisMode::Auto;
-  }
-
-  const std::string mode = normalize_basis_mode_name(mode_env);
-  if (mode == "auto") {
-    return BasisMode::Auto;
-  }
-  if (mode == "legacy") {
-    return BasisMode::Legacy;
-  }
-  if (mode == "fe") {
-    return BasisMode::Fe;
-  }
-
-  throw febasis::BasisConfigurationException(
-      "Invalid SVMP_BASIS_MODE='" + std::string(mode_env) +
-          "'. Expected one of: auto, legacy, fe",
-      __FILE__, __LINE__, __func__);
-}
-
-BasisMode active_basis_mode()
-{
-  static const BasisMode mode = parse_basis_mode();
-  return mode;
-}
-
-const char* basis_mode_name(BasisMode mode)
-{
-  switch (mode) {
-    case BasisMode::Auto:
-      return "auto";
-    case BasisMode::Legacy:
-      return "legacy";
-    case BasisMode::Fe:
-      return "fe";
-  }
-  return "unknown";
-}
-
-void log_basis_mode_once()
-{
-  static const bool logged = []() {
-    std::cout << "[svMultiPhysics] SVMP_BASIS_MODE="
-              << basis_mode_name(active_basis_mode()) << std::endl;
-    return true;
-  }();
-  (void)logged;
-}
-
-bool basis_mode_allows_fe_adapter()
-{
-  return active_basis_mode() != BasisMode::Legacy;
-}
-
 std::string solver_element_name(consts::ElementType eType)
 {
   auto it = consts::element_type_to_string.find(eType);
@@ -178,15 +97,11 @@ std::optional<BasisSelection> to_basis_selection(consts::ElementType eType)
 
 bool use_basis_adapter_for(consts::ElementType eType)
 {
-  return basis_mode_allows_fe_adapter() && to_basis_selection(eType).has_value();
+  return to_basis_selection(eType).has_value();
 }
 
 bool supports_face_basis_adapter_for(consts::ElementType eType)
 {
-  if (!basis_mode_allows_fe_adapter()) {
-    return false;
-  }
-
   switch (eType) {
     case consts::ElementType::LIN1:
     case consts::ElementType::LIN2:
@@ -464,73 +379,20 @@ void evaluate_basis_hessians(const int insd,
   copy_basis_hessians_to_solver_nxx(eType, eNoN, gaus_pt, basis->dimension(), hessians, Nxx);
 }
 
-void call_legacy_get_gnn(const int insd,
-                         consts::ElementType eType,
-                         const int eNoN,
-                         const int g,
-                         Array<double>& xi,
-                         Array<double>& N,
-                         Array3<double>& Nx,
-                         const std::string& basis_failure = "")
+void set_point_face_shape_data(const int gaus_pt, faceType& face)
 {
-  try {
-    get_element_shape_data[eType](insd, eNoN, g, xi, N, Nx);
-  } catch (const std::bad_function_call&) {
-    std::string message = "[get_gnn] No FE Basis or legacy shape support for element " +
-        solver_element_name(eType) + "; legacy fallback was attempted";
-    if (!basis_failure.empty()) {
-      message += " after FE Basis failure: " + basis_failure;
+  face.N(0, gaus_pt) = 1.0;
+  for (int row = 0; row < face.Nx.nrows(); ++row) {
+    for (int col = 0; col < face.Nx.ncols(); ++col) {
+      face.Nx(row, col, gaus_pt) = 0.0;
     }
-    throw fe::InvalidElementException(message, solver_element_name(eType),
-        __FILE__, __LINE__, __func__);
   }
 }
 
-void call_legacy_get_gn_nxx(const int insd,
-                            const int ind2,
-                            consts::ElementType eType,
-                            const int eNoN,
-                            const int gaus_pt,
-                            const Array<double>& xi,
-                            Array3<double>& Nxx,
-                            const std::string& basis_failure = "",
-                            const bool allow_missing_legacy_table = false)
-{
-  try {
-    get_element_2nd_derivs[eType](insd, ind2, eNoN, gaus_pt, xi, Nxx);
-  } catch (const std::bad_function_call&) {
-    if (allow_missing_legacy_table) {
-      return;
-    }
-
-    std::string message = "[get_gn_nxx] No FE Basis or legacy second-derivative support for element " +
-        solver_element_name(eType) + "; legacy fallback was attempted";
-    if (!basis_failure.empty()) {
-      message += " after FE Basis failure: " + basis_failure;
-    }
-    throw fe::InvalidElementException(message, solver_element_name(eType),
-        __FILE__, __LINE__, __func__);
-  }
-}
-
-void call_legacy_face_shape_data(const int gaus_pt, faceType& face)
-{
-  auto legacy_shape = set_face_shape_data.find(face.eType);
-  if (legacy_shape == set_face_shape_data.end()) {
-    throw fe::InvalidElementException(
-        "[get_gnn(face)] No FE Basis or legacy face shape support",
-        solver_element_name(face.eType), __FILE__, __LINE__, __func__);
-  }
-
-  legacy_shape->second(gaus_pt, face);
-}
-
 } // namespace
 
 void get_gip(const int insd, consts::ElementType eType, const int nG, Vector<double>& w, Array<double>& xi) 
 {
-  log_basis_mode_once();
-
   try {
     get_element_gauss_int_data[eType](insd, nG, w, xi);
   } catch (const std::bad_function_call& exception) {
@@ -546,8 +408,6 @@ void get_gip(const int insd, consts::ElementType eType, const int nG, Vector<dou
 //
 void get_gip(mshType& mesh)
 {
-  log_basis_mode_once();
-
   try {
     set_element_gauss_int_data[mesh.eType](mesh);
   } catch (const std::bad_function_call& exception) {
@@ -559,8 +419,6 @@ void get_gip(mshType& mesh)
 
 void get_gip(Simulation* simulation, faceType& face)
 {
-  log_basis_mode_once();
-
   try {
     set_face_gauss_int_data[face.eType](face);
   } catch (const std::bad_function_call& exception) {
@@ -575,30 +433,16 @@ void get_gip(Simulation* simulation, faceType& face)
 void get_gnn(const int insd, consts::ElementType eType, const int eNoN, const int g, Array<double>& xi, 
     Array<double>& N, Array3<double>& Nx)
 {
-  log_basis_mode_once();
-
-  if (use_basis_adapter_for(eType)) {
-    try {
-      evaluate_basis_values_and_gradients(insd, eType, eNoN, g, xi, N, Nx);
-      return;
-    } catch (const fe::NotImplementedException& exception) {
-      call_legacy_get_gnn(insd, eType, eNoN, g, xi, N, Nx, exception.what());
-      return;
-    } catch (const std::exception& exception) {
-      throw febasis::BasisEvaluationException(
-          "[get_gnn] FE Basis adapter failed for element " +
-              solver_element_name(eType) +
-              "; legacy fallback was not attempted for this approved element: " +
-              exception.what(),
-          __FILE__, __LINE__, __func__);
-    }
+  if (!use_basis_adapter_for(eType)) {
+    throw febasis::BasisElementCompatibilityException(
+        "[get_gnn] FE Basis does not support solver element " + solver_element_name(eType),
+        __FILE__, __LINE__, __func__);
   }
 
-  call_legacy_get_gnn(insd, eType, eNoN, g, xi, N, Nx);
+  evaluate_basis_values_and_gradients(insd, eType, eNoN, g, xi, N, Nx);
 }
 
-/// @brief A big fat hack because the Fortran GETNN() operates on primitive types but
-/// the C++ version does not, uses Array and Vector objects.
+/// @brief Adapter overload for vector-style callers.
 //
 void get_gnn(const int nsd, consts::ElementType eType, const int eNoN, Vector<double>& xi, 
     Vector<double>& N, Array<double>& Nx)
@@ -625,86 +469,48 @@ void get_gnn(Simulation* simulation, int gaus_pt, faceType& face)
 {
   using consts::ElementType;
 
-  log_basis_mode_once();
-
-  if (active_basis_mode() == BasisMode::Legacy) {
-    call_legacy_face_shape_data(gaus_pt, face);
-    return;
-  }
-
   if (face.eType == ElementType::NRB) {
     throw fe::NotImplementedException(
-        "[get_gnn(face)] NRB face shape functions remain unsupported by FE Basis and the legacy face table",
+        "[get_gnn(face)] NRB face shape functions are unsupported by FE Basis",
         __FILE__, __LINE__, __func__);
   }
 
-  if (supports_face_basis_adapter_for(face.eType)) {
-    try {
-      // FE Basis owns mapped face N/Nx formulas; faceType remains the solver-facing storage contract.
-      evaluate_face_basis_values_and_gradients(gaus_pt, face);
-      return;
-    } catch (const std::exception& exception) {
-      throw febasis::BasisEvaluationException(
-          "[get_gnn(face)] FE Basis face adapter failed for mapped face element " +
-              solver_element_name(face.eType) + "; legacy fallback was not attempted: " +
-              exception.what(),
-          __FILE__, __LINE__, __func__);
-    }
+  if (face.eType == ElementType::PNT) {
+    set_point_face_shape_data(gaus_pt, face);
+    return;
   }
 
-  if (face.eType == ElementType::PNT) {
-    // Point faces have no mapped FE Basis representation in this pass; keep the legacy scalar value path.
-    call_legacy_face_shape_data(gaus_pt, face);
+  if (supports_face_basis_adapter_for(face.eType)) {
+    // FE Basis owns mapped face N/Nx formulas; faceType remains the solver-facing storage contract.
+    evaluate_face_basis_values_and_gradients(gaus_pt, face);
     return;
   }
 
-  // The legacy face table is retained only for explicitly unsupported paths and future cleanup.
-  call_legacy_face_shape_data(gaus_pt, face);
+  throw febasis::BasisElementCompatibilityException(
+      "[get_gnn(face)] FE Basis does not support face element " + solver_element_name(face.eType),
+      __FILE__, __LINE__, __func__);
 }
 
-/// @brief Returns second order derivatives at given natural coords
-///
-/// Replicates 'SUBROUTINE GETGNNxx(insd, ind2, eType, eNoN, xi, Nxx)'.
+/// @brief Returns second order derivatives at given natural coords.
 //
 void get_gn_nxx(const int insd, const int ind2, consts::ElementType eType, const int eNoN, const int gaus_pt, 
     const Array<double>& xi, Array3<double>& Nxx)
 {
   using namespace consts;
 
-  log_basis_mode_once();
-
   // NRB/PNT and face-only Hessian paths remain intentionally unsupported here.
   if (eType == ElementType::NRB || eType == ElementType::PNT) {
     return;
   }
 
-  if (active_basis_mode() == BasisMode::Legacy) {
-    call_legacy_get_gn_nxx(
-        insd, ind2, eType, eNoN, gaus_pt, xi, Nxx, "", true);
-    return;
-  }
-
-  if (use_basis_adapter_for(eType)) {
-    try {
-      evaluate_basis_hessians(insd, ind2, eType, eNoN, gaus_pt, xi, Nxx);
-      return;
-    } catch (const fe::NotImplementedException& exception) {
-      throw fe::NotImplementedException(
-          "[get_gn_nxx] FE Basis Hessian support is required for mapped volume element " +
-              solver_element_name(eType) + " but is not implemented: " + exception.what(),
-          __FILE__, __LINE__, __func__);
-    } catch (const std::exception& exception) {
-      throw febasis::BasisEvaluationException(
-          "[get_gn_nxx] FE Basis Hessian adapter failed for element " +
-              solver_element_name(eType) +
-              "; legacy fallback was not attempted for this approved element: " +
-              exception.what(),
-          __FILE__, __LINE__, __func__);
-    }
+  if (!use_basis_adapter_for(eType)) {
+    throw febasis::BasisElementCompatibilityException(
+        "[get_gn_nxx] FE Basis Hessian evaluation does not support solver element " +
+            solver_element_name(eType),
+        __FILE__, __LINE__, __func__);
   }
 
-  // Legacy Hessian tables are reserved for intentionally unsupported families.
-  call_legacy_get_gn_nxx(insd, ind2, eType, eNoN, gaus_pt, xi, Nxx);
+  evaluate_basis_hessians(insd, ind2, eType, eNoN, gaus_pt, xi, Nxx);
 }
 
 /// @brief Sets bounds on Gauss integration points in parametric space and
diff --git a/Code/Source/solver/nn_elem_gnn.h b/Code/Source/solver/nn_elem_gnn.h
deleted file mode 100644
index 33564d45b..000000000
--- a/Code/Source/solver/nn_elem_gnn.h
+++ /dev/null
@@ -1,1586 +0,0 @@
-// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
-// SPDX-License-Identifier: BSD-3-Clause
-
-/// @brief Define a map type used to set element shape function data.
-///
-/// Reproduces the Fortran 'GETGNN' subroutine.
-//
-using GetElementShapeMapType = std::map<ElementType, std::function<void(const int, const int, const int, 
-    Array<double>&, Array<double>&, Array3<double>&)>>;
-
-GetElementShapeMapType get_element_shape_data = {
-
-  {ElementType::HEX8, [](const int insd, const int eNoN, const int g, Array<double>& xi, Array<double>& N, 
-      Array3<double>& Nx) -> void 
-    { 
-    double lx = 1.0 - xi(0,g);
-    double ly = 1.0 - xi(1,g);
-    double lz = 1.0 - xi(2,g);
-    double ux = 1.0 + xi(0,g);
-    double uy = 1.0 + xi(1,g);
-    double uz = 1.0 + xi(2,g);
-
-    N(0,g) = lx*ly*lz/8.0;
-    N(1,g) = ux*ly*lz/8.0;
-    N(2,g) = ux*uy*lz/8.0;
-    N(3,g) = lx*uy*lz/8.0;
-    N(4,g) = lx*ly*uz/8.0;
-    N(5,g) = ux*ly*uz/8.0;
-    N(6,g) = ux*uy*uz/8.0;
-    N(7,g) = lx*uy*uz/8.0;
-
-    Nx(0,0,g) = -ly*lz/8.0;
-    Nx(1,0,g) = -lx*lz/8.0;
-    Nx(2,0,g) = -lx*ly/8.0;
-
-    Nx(0,1,g) =  ly*lz/8.0;
-    Nx(1,1,g) = -ux*lz/8.0;
-    Nx(2,1,g) = -ux*ly/8.0;
-
-    Nx(0,2,g) =  uy*lz/8.0;
-    Nx(1,2,g) =  ux*lz/8.0;
-    Nx(2,2,g) = -ux*uy/8.0;
-
-    Nx(0,3,g) = -uy*lz/8.0;
-    Nx(1,3,g) =  lx*lz/8.0;
-    Nx(2,3,g) = -lx*uy/8.0;
-
-    Nx(0,4,g) = -ly*uz/8.0;
-    Nx(1,4,g) = -lx*uz/8.0;
-    Nx(2,4,g) =  lx*ly/8.0;
-
-    Nx(0,5,g) =  ly*uz/8.0;
-    Nx(1,5,g) = -ux*uz/8.0;
-    Nx(2,5,g) =  ux*ly/8.0;
-
-    Nx(0,6,g) =  uy*uz/8.0;
-    Nx(1,6,g) =  ux*uz/8.0;
-    Nx(2,6,g) =  ux*uy/8.0;
-
-    Nx(0,7,g) = -uy*uz/8.0;
-    Nx(1,7,g) =  lx*uz/8.0;
-    Nx(2,7,g) =  lx*uy/8.0;
-    }
-  },
-
-  {ElementType::HEX20, [](const int insd, const int eNoN, const int g, Array<double>& xi, Array<double>& N,
-      Array3<double>& Nx) -> void
-    {
-    double lx = 1.0 - xi(0,g);
-    double ly = 1.0 - xi(1,g);
-    double lz = 1.0 - xi(2,g);
-    double ux = 1.0 + xi(0,g);
-    double uy = 1.0 + xi(1,g);
-    double uz = 1.0 + xi(2,g);
-
-    double mx = lx*ux;
-    double my = ly*uy;
-    double mz = lz*uz;
-
-    N(0, g) = lx*ly*lz*(lx+ly+lz-5.0)/8.0;
-    N(1, g) = ux*ly*lz*(ux+ly+lz-5.0)/8.0;
-    N(2, g) = ux*uy*lz*(ux+uy+lz-5.0)/8.0;
-    N(3, g) = lx*uy*lz*(lx+uy+lz-5.0)/8.0;
-    N(4, g) = lx*ly*uz*(lx+ly+uz-5.0)/8.0;
-    N(5, g) = ux*ly*uz*(ux+ly+uz-5.0)/8.0;
-    N(6, g) = ux*uy*uz*(ux+uy+uz-5.0)/8.0;
-    N(7, g) = lx*uy*uz*(lx+uy+uz-5.0)/8.0;
-    N(8, g) = mx*ly*lz/4.0;
-    N(9, g) = ux*my*lz/4.0;
-    N(10, g) = mx*uy*lz/4.0;
-    N(11, g) = lx*my*lz/4.0;
-    N(12, g) = mx*ly*uz/4.0;
-    N(13, g) = ux*my*uz/4.0;
-    N(14, g) = mx*uy*uz/4.0;
-    N(15, g) = lx*my*uz/4.0;
-    N(16, g) = lx*ly*mz/4.0;
-    N(17, g) = ux*ly*mz/4.0;
-    N(18, g) = ux*uy*mz/4.0;
-    N(19, g) = lx*uy*mz/4.0;
-
-    // N(1)  = lx*ly*lz*(lx+ly+lz-5.0)/8.0;
-    int n = 0;
-    Nx(0,n,g) = -ly*lz*(lx+ly+lz-5.0+lx)/8.0;
-    Nx(1,n,g) = -lx*lz*(lx+ly+lz-5.0+ly)/8.0;
-    Nx(2,n,g) = -lx*ly*(lx+ly+lz-5.0+lz)/8.0;
-
-//c   N(n,g) = ux*ly*lz*(ux+ly+lz-5.0)/8.0;
-    n += 1;
-    Nx(0,n,g) =  ly*lz*(ux+ly+lz-5.0+ux)/8.0;
-    Nx(1,n,g) = -ux*lz*(ux+ly+lz-5.0+ly)/8.0;
-    Nx(2,n,g) = -ux*ly*(ux+ly+lz-5.0+lz)/8.0;
-
-//c   N(n,g) = ux*uy*lz*(ux+uy+lz-5.0)/8.0
-    n += 1;
-    Nx(0,n,g) =  uy*lz*(ux+uy+lz-5.0+ux)/8.0;
-    Nx(1,n,g) =  ux*lz*(ux+uy+lz-5.0+uy)/8.0;
-    Nx(2,n,g) = -ux*uy*(ux+uy+lz-5.0+lz)/8.0;
-
-//c   N(n,g) = lx*uy*lz*(lx+uy+lz-5.0)/8.0
-    n += 1;
-    Nx(0,n,g) = -uy*lz*(lx+uy+lz-5.0+lx)/8.0;
-    Nx(1,n,g) =  lx*lz*(lx+uy+lz-5.0+uy)/8.0;
-    Nx(2,n,g) = -lx*uy*(lx+uy+lz-5.0+lz)/8.0;
-
-//c   N(n,g) = lx*ly*uz*(lx+ly+uz-5.0)/8.0
-    n += 1;
-    Nx(0,n,g) = -ly*uz*(lx+ly+uz-5.0+lx)/8.0;
-    Nx(1,n,g) = -lx*uz*(lx+ly+uz-5.0+ly)/8.0;
-    Nx(2,n,g) =  lx*ly*(lx+ly+uz-5.0+uz)/8.0;
-
-//c   N(n,g) = ux*ly*uz*(ux+ly+uz-5.0)/8.0
-    n += 1;
-    Nx(0,n,g) =  ly*uz*(ux+ly+uz-5.0+ux)/8.0;
-    Nx(1,n,g) = -ux*uz*(ux+ly+uz-5.0+ly)/8.0;
-    Nx(2,n,g) =  ux*ly*(ux+ly+uz-5.0+uz)/8.0;
-
-//c   N(n,g) = ux*uy*uz*(ux+uy+uz-5.0)/8.0
-    n += 1;
-    Nx(0,n,g) =  uy*uz*(ux+uy+uz-5.0+ux)/8.0;
-    Nx(1,n,g) =  ux*uz*(ux+uy+uz-5.0+uy)/8.0;
-    Nx(2,n,g) =  ux*uy*(ux+uy+uz-5.0+uz)/8.0;
-
-//c   N(n,g) = lx*uy*uz*(lx+uy+uz-5.0)/8.0
-    n += 1;
-    Nx(0,n,g) = -uy*uz*(lx+uy+uz-5.0+lx)/8.0;
-    Nx(1,n,g) =  lx*uz*(lx+uy+uz-5.0+uy)/8.0;
-    Nx(2,n,g) =  lx*uy*(lx+uy+uz-5.0+uz)/8.0;
-
-//c   N(n,g) = mx*ly*lz/4.0
-    n += 1;
-    Nx(0,n,g) =  (lx - ux)*ly*lz/4.0;
-    Nx(1,n,g) = -mx*lz/4.0;
-    Nx(2,n,g) = -mx*ly/4.0;
-
-//c   N(0n,g) = ux*my*lz/4.0
-    n += 1;
-    Nx(0,n,g) =  my*lz/4.0;
-    Nx(1,n,g) =  (ly - uy)*ux*lz/4.0;
-    Nx(2,n,g) = -ux*my/4.0;
-
-//c   N(0n,g) = mx*uy*lz/4.0
-    n += 1;
-    Nx(0,n,g) =  (lx - ux)*uy*lz/4.0;
-    Nx(1,n,g) =  mx*lz/4.0;
-    Nx(2,n,g) = -mx*uy/4.0;
-
-//c   N(0n,g) = lx*my*lz/4.0
-    n += 1;
-    Nx(0,n,g) = -my*lz/4.0;
-    Nx(1,n,g) =  (ly - uy)*lx*lz/4.0;
-    Nx(2,n,g) = -lx*my/4.0;
-
-//c   N(0n,g) = mx*ly*uz/4.0
-    n += 1;
-    Nx(0,n,g) =  (lx - ux)*ly*uz/4.0;
-    Nx(1,n,g) = -mx*uz/4.0;
-    Nx(2,n,g) =  mx*ly/4.0;
-
-//c   N(0n,g) = ux*my*uz/4.0
-    n += 1;
-    Nx(0,n,g) =  my*uz/4.0;
-    Nx(1,n,g) =  (ly - uy)*ux*uz/4.0;
-    Nx(2,n,g) =  ux*my/4.0;
-
-//c   N(0n,g) = mx*uy*uz/4.0
-    n += 1;
-    Nx(0,n,g) =  (lx - ux)*uy*uz/4.0;
-    Nx(1,n,g) =  mx*uz/4.0;
-    Nx(2,n,g) =  mx*uy/4.0;
-
-//c   N(0n,g) = lx*my*uz/4.0
-    n += 1;
-    Nx(0,n,g) = -my*uz/4.0;
-    Nx(1,n,g) =  (ly - uy)*lx*uz/4.0;
-    Nx(2,n,g) =  lx*my/4.0;
-
-//c   N(0n,g) = lx*ly*mz/4.0
-    n += 1;
-    Nx(0,n,g) = -ly*mz/4.0;
-    Nx(1,n,g) = -lx*mz/4.0;
-    Nx(2,n,g) =  (lz - uz)*lx*ly/4.0;
-
-//c   N(0n,g) = ux*ly*mz/4.0
-    n += 1;
-    Nx(0,n,g) =  ly*mz/4.0;
-    Nx(1,n,g) = -ux*mz/4.0;
-    Nx(2,n,g) =  (lz - uz)*ux*ly/4.0;
-
-//c   N(0n,g) = ux*uy*mz/4.0
-    n += 1;
-    Nx(0,n,g) =  uy*mz/4.0;
-    Nx(1,n,g) =  ux*mz/4.0;
-    Nx(2,n,g) =  (lz - uz)*ux*uy/4.0;
-
-//c   N(n,g) = lx*uy*mz/4.0
-    n += 1;
-    Nx(0,n,g) = -uy*mz/4.0;
-    Nx(1,n,g) =  lx*mz/4.0;
-    Nx(2,n,g) =  (lz - uz)*lx*uy/4.0;
-    }
-  },
-
-  {ElementType::HEX27, [](const int insd, const int eNoN, const int g, Array<double>& xi, Array<double>& N,
-      Array3<double>& Nx) -> void
-    {
-    double lx = 1.0 - xi(0,g);
-    double ly = 1.0 - xi(1,g);
-    double lz = 1.0 - xi(2,g);
-    double ux = 1.0 + xi(0,g);
-    double uy = 1.0 + xi(1,g);
-    double uz = 1.0 + xi(2,g);
-
-    double mx = xi(0,g);
-    double my = xi(1,g);
-    double mz = xi(2,g);
-
-    N(0,g)  = -mx*lx*my*ly*mz*lz/8.0;
-    N(1,g)  =  mx*ux*my*ly*mz*lz/8.0;
-    N(2,g)  = -mx*ux*my*uy*mz*lz/8.0;
-    N(3,g)  =  mx*lx*my*uy*mz*lz/8.0;
-    N(4,g)  =  mx*lx*my*ly*mz*uz/8.0;
-    N(5,g)  = -mx*ux*my*ly*mz*uz/8.0;
-    N(6,g)  =  mx*ux*my*uy*mz*uz/8.0;
-    N(7,g)  = -mx*lx*my*uy*mz*uz/8.0;
-    N(8,g)  =  lx*ux*my*ly*mz*lz/4.0;
-    N(9,g)  = -mx*ux*ly*uy*mz*lz/4.0;
-    N(10,g) = -lx*ux*my*uy*mz*lz/4.0;
-    N(11,g) =  mx*lx*ly*uy*mz*lz/4.0;
-    N(12,g) = -lx*ux*my*ly*mz*uz/4.0;
-    N(13,g) =  mx*ux*ly*uy*mz*uz/4.0;
-    N(14,g) =  lx*ux*my*uy*mz*uz/4.0;
-    N(15,g) = -mx*lx*ly*uy*mz*uz/4.0;
-    N(16,g) =  mx*lx*my*ly*lz*uz/4.0;
-    N(17,g) = -mx*ux*my*ly*lz*uz/4.0;
-    N(18,g) =  mx*ux*my*uy*lz*uz/4.0;
-    N(19,g) = -mx*lx*my*uy*lz*uz/4.0;
-
-    N(20,g) = -mx*lx*ly*uy*lz*uz/2.0;
-    N(21,g) =  mx*ux*ly*uy*lz*uz/2.0;
-    N(22,g) = -lx*ux*my*ly*lz*uz/2.0;
-    N(23,g) =  lx*ux*my*uy*lz*uz/2.0;
-    N(24,g) = -lx*ux*ly*uy*mz*lz/2.0;
-    N(25,g) =  lx*ux*ly*uy*mz*uz/2.0;
-
-    N(26,g) =  lx*ux*ly*uy*lz*uz;
-
-    // N(0)  = -mx*lx*my*ly*mz*lz/8.0
-    int n = 0;
-    Nx(0,n,g)  = -(lx - mx)*my*ly*mz*lz/8.0;
-    Nx(1,n,g)  = -(ly - my)*mx*lx*mz*lz/8.0;
-    Nx(2,n,g)  = -(lz - mz)*mx*lx*my*ly/8.0;
-
-    // N(n,g)  =  mx*ux*my*ly*mz*lz/8.0
-    n += 1;
-    Nx(0,n,g)  =  (mx + ux)*my*ly*mz*lz/8.0;
-    Nx(1,n,g)  =  (ly - my)*mx*ux*mz*lz/8.0;
-    Nx(2,n,g)  =  (lz - mz)*mx*ux*my*ly/8.0;
-
-    // N(n,g)  = -mx*ux*my*uy*mz*lz/8.0
-    n += 1;
-    Nx(0,n,g)  = -(mx + ux)*my*uy*mz*lz/8.0;
-    Nx(1,n,g)  = -(my + uy)*mx*ux*mz*lz/8.0;
-    Nx(2,n,g)  = -(lz - mz)*mx*ux*my*uy/8.0;
-
-    // N(n,g)  =  mx*lx*my*uy*mz*lz/8.0
-    n += 1;
-    Nx(0,n,g)  =  (lx - mx)*my*uy*mz*lz/8.0;
-    Nx(1,n,g)  =  (my + uy)*mx*lx*mz*lz/8.0;
-    Nx(2,n,g)  =  (lz - mz)*mx*lx*my*uy/8.0;
-
-    // N(n,g)  =  mx*lx*my*ly*mz*uz/8.0
-    n += 1;
-    Nx(0,n,g)  =  (lx - mx)*my*ly*mz*uz/8.0;
-    Nx(1,n,g)  =  (ly - my)*mx*lx*mz*uz/8.0;
-    Nx(2,n,g)  =  (mz + uz)*mx*lx*my*ly/8.0;
-
-    // N(n,g)  = -mx*ux*my*ly*mz*uz/8.0
-    n += 1;
-    Nx(0,n,g)  = -(mx + ux)*my*ly*mz*uz/8.0;
-    Nx(1,n,g)  = -(ly - my)*mx*ux*mz*uz/8.0;
-    Nx(2,n,g)  = -(mz + uz)*mx*ux*my*ly/8.0;
-
-    // N(n,g)  =  mx*ux*my*uy*mz*uz/8.0
-    n += 1;
-    Nx(0,n,g)  =  (mx + ux)*my*uy*mz*uz/8.0;
-    Nx(1,n,g)  =  (my + uy)*mx*ux*mz*uz/8.0;
-    Nx(2,n,g)  =  (mz + uz)*mx*ux*my*uy/8.0;
-
-    // N(n,g)  = -mx*lx*my*uy*mz*uz/8.0
-    n += 1;
-    Nx(0,n,g)  = -(lx - mx)*my*uy*mz*uz/8.0;
-    Nx(1,n,g)  = -(my + uy)*mx*lx*mz*uz/8.0;
-    Nx(2,n,g)  = -(mz + uz)*mx*lx*my*uy/8.0;
-
-    // N(n,g)  =  lx*ux*my*ly*mz*lz/4.0
-    n += 1;
-    Nx(0,n,g)  =  (lx - ux)*my*ly*mz*lz/4.0;
-    Nx(1,n,g)  =  (ly - my)*lx*ux*mz*lz/4.0;
-    Nx(2,n,g)  =  (lz - mz)*lx*ux*my*ly/4.0;
-
-    // N(n,g) = -mx*ux*ly*uy*mz*lz/4.0
-    n += 1;
-    Nx(0,n,g) = -(mx + ux)*ly*uy*mz*lz/4.0;
-    Nx(1,n,g) = -(ly - uy)*mx*ux*mz*lz/4.0;
-    Nx(2,n,g) = -(lz - mz)*mx*ux*ly*uy/4.0;
-
-    //   N(n,g) = -lx*ux*my*uy*mz*lz/4.0
-    n += 1;
-    Nx(0,n,g) = -(lx - ux)*my*uy*mz*lz/4.0;
-    Nx(1,n,g) = -(my + uy)*lx*ux*mz*lz/4.0;
-    Nx(2,n,g) = -(lz - mz)*lx*ux*my*uy/4.0;
-
-    //   N(n,g) =  mx*lx*ly*uy*mz*lz/4.0
-    n += 1;
-    Nx(0,n,g) =  (lx - mx)*ly*uy*mz*lz/4.0;
-    Nx(1,n,g) =  (ly - uy)*mx*lx*mz*lz/4.0;
-    Nx(2,n,g) =  (lz - mz)*mx*lx*ly*uy/4.0;
-
-    //   N(n,g) = -lx*ux*my*ly*mz*uz/4.0
-    n += 1;
-    Nx(0,n,g) = -(lx - ux)*my*ly*mz*uz/4.0;
-    Nx(1,n,g) = -(ly - my)*lx*ux*mz*uz/4.0;
-    Nx(2,n,g) = -(mz + uz)*lx*ux*my*ly/4.0;
-
-    //   N(n,g) =  mx*ux*ly*uy*mz*uz/4.0
-    n += 1;
-    Nx(0,n,g) =  (mx + ux)*ly*uy*mz*uz/4.0;
-    Nx(1,n,g) =  (ly - uy)*mx*ux*mz*uz/4.0;
-    Nx(2,n,g) =  (mz + uz)*mx*ux*ly*uy/4.0;
-
-    //   N(n,g) =  lx*ux*my*uy*mz*uz/4.0
-    n += 1;
-    Nx(0,n,g) =  (lx - ux)*my*uy*mz*uz/4.0;
-    Nx(1,n,g) =  (my + uy)*lx*ux*mz*uz/4.0;
-    Nx(2,n,g) =  (mz + uz)*lx*ux*my*uy/4.0;
-
-    //   N(n,g) = -mx*lx*ly*uy*mz*uz/4.0
-    n += 1;
-    Nx(0,n,g) = -(lx - mx)*ly*uy*mz*uz/4.0;
-    Nx(1,n,g) = -(ly - uy)*mx*lx*mz*uz/4.0;
-    Nx(2,n,g) = -(mz + uz)*mx*lx*ly*uy/4.0;
-
-    //   N(n,g) =  mx*lx*my*ly*lz*uz/4.0
-    n += 1;
-    Nx(0,n,g) =  (lx - mx)*my*ly*lz*uz/4.0;
-    Nx(1,n,g) =  (ly - my)*mx*lx*lz*uz/4.0;
-    Nx(2,n,g) =  (lz - uz)*mx*lx*my*ly/4.0;
-
-    //   N(n,g) = -mx*ux*my*ly*lz*uz/4.0
-    n += 1;
-    Nx(0,n,g) = -(mx + ux)*my*ly*lz*uz/4.0;
-    Nx(1,n,g) = -(ly - my)*mx*ux*lz*uz/4.0;
-    Nx(2,n,g) = -(lz - uz)*mx*ux*my*ly/4.0;
-
-    //   N(n,g) =  mx*ux*my*uy*lz*uz/4.0
-    n += 1;
-    Nx(0,n,g) =  (mx + ux)*my*uy*lz*uz/4.0;
-    Nx(1,n,g) =  (my + uy)*mx*ux*lz*uz/4.0;
-    Nx(2,n,g) =  (lz - uz)*mx*ux*my*uy/4.0;
-
-    //   N(n,g) = -mx*lx*my*uy*lz*uz/4.0
-    n += 1;
-    Nx(0,n,g) = -(lx - mx)*my*uy*lz*uz/4.0;
-    Nx(1,n,g) = -(my + uy)*mx*lx*lz*uz/4.0;
-    Nx(2,n,g) = -(lz - uz)*mx*lx*my*uy/4.0;
-
-    //   N(n,g) = -mx*lx*ly*uy*lz*uz/2.0
-    n += 1;
-    Nx(0,n,g) = -(lx - mx)*ly*uy*lz*uz/2.0;
-    Nx(1,n,g) = -(ly - uy)*mx*lx*lz*uz/2.0;
-    Nx(2,n,g) = -(lz - uz)*mx*lx*ly*uy/2.0;
-
-    //   N(n,g) =  mx*ux*ly*uy*lz*uz/2.0
-    n += 1;
-    Nx(0,n,g) =  (mx + ux)*ly*uy*lz*uz/2.0;
-    Nx(1,n,g) =  (ly - uy)*mx*ux*lz*uz/2.0;
-    Nx(2,n,g) =  (lz - uz)*mx*ux*ly*uy/2.0;
-
-    //   N(n,g) = -lx*ux*my*ly*lz*uz/2.0
-    n += 1;
-    Nx(0,n,g) = -(lx - ux)*my*ly*lz*uz/2.0;
-    Nx(1,n,g) = -(ly - my)*lx*ux*lz*uz/2.0;
-    Nx(2,n,g) = -(lz - uz)*lx*ux*my*ly/2.0;
-
-    //   N(n,g) =  lx*ux*my*uy*lz*uz/2.0
-    n += 1;
-    Nx(0,n,g) =  (lx - ux)*my*uy*lz*uz/2.0;
-    Nx(1,n,g) =  (my + uy)*lx*ux*lz*uz/2.0;
-    Nx(2,n,g) =  (lz - uz)*lx*ux*my*uy/2.0;
-
-    //   N(n,g) = -lx*ux*ly*uy*mz*lz/2.0
-    n += 1;
-    Nx(0,n,g) = -(lx - ux)*ly*uy*mz*lz/2.0;
-    Nx(1,n,g) = -(ly - uy)*lx*ux*mz*lz/2.0;
-    Nx(2,n,g) = -(lz - mz)*lx*ux*ly*uy/2.0;
-
-    //   N(n,g) =  lx*ux*ly*uy*mz*uz/2.0
-    n += 1;
-    Nx(0,n,g) =  (lx - ux)*ly*uy*mz*uz/2.0;
-    Nx(1,n,g) =  (ly - uy)*lx*ux*mz*uz/2.0;
-    Nx(2,n,g) =  (mz + uz)*lx*ux*ly*uy/2.0;
-
-    //   N(n,g) =  lx*ux*ly*uy*lz*uz
-    n += 1;
-    Nx(0,n,g) =  (lx - ux)*ly*uy*lz*uz;
-    Nx(1,n,g) =  (ly - uy)*lx*ux*lz*uz;
-    Nx(2,n,g) =  (lz - uz)*lx*ux*ly*uy;
-    }
-  },
-
-  {ElementType::LIN1, [](const int insd, const int eNoN, const int g, Array<double>& xi, Array<double>& N, 
-      Array3<double>& Nx) -> void 
-    { 
-    N(0,g) = (1.0 - xi(0,g))*0.5;
-    N(1,g) = (1.0 + xi(0,g))*0.5;
-
-    Nx(0,0,g) = -0.5;
-    Nx(0,1,g) =  0.5;
-    }
-  },
-
-  {ElementType::LIN2, [](const int insd, const int eNoN, const int g, Array<double>& xi, Array<double>& N, 
-      Array3<double>& Nx) -> void 
-    { 
-    N(0,g) = -xi(0,g)*(1.0 - xi(0,g))*0.50;
-    N(1,g) =  xi(0,g)*(1.0 + xi(0,g))*0.50;
-    N(2,g) = (1.0 - xi(0,g))*(1.0 + xi(0,g));
-
-    Nx(0,0,g) = -0.50 + xi(0,g);
-    Nx(0,1,g) =  0.50 + xi(0,g);
-    Nx(0,2,g) = -2.0*xi(0,g);
-    }
-  },
-
-  {ElementType::QUD4, [](const int insd, const int eNoN, const int g, Array<double>& xi, Array<double>& N, 
-      Array3<double>& Nx) -> void 
-    { 
-    double lx = 1.0 - xi(0,g);
-    double ly = 1.0 - xi(1,g);
-    double ux = 1.0 + xi(0,g);
-    double uy = 1.0 + xi(1,g);
-
-    N(0,g) = lx*ly / 4.0;
-    N(1,g) = ux*ly / 4.0;
-    N(2,g) = ux*uy / 4.0;
-    N(3,g) = lx*uy / 4.0;
-
-    Nx(0,0,g) = -ly / 4.0;
-    Nx(1,0,g) = -lx / 4.0;
-    Nx(0,1,g) =  ly / 4.0;
-    Nx(1,1,g) = -ux / 4.0;
-    Nx(0,2,g) =  uy / 4.0;
-    Nx(1,2,g) =  ux / 4.0;
-    Nx(0,3,g) = -uy / 4.0;
-    Nx(1,3,g) =  lx / 4.0;
-    }
-  },
-
-  {ElementType::QUD9, [](const int insd, const int eNoN, const int g, Array<double>& xi, Array<double>& N, 
-      Array3<double>& Nx) -> void 
-    { 
-    double lx = 1.0 - xi(0,g);
-    double ly = 1.0 - xi(1,g);
-    double ux = 1.0 + xi(0,g);
-    double uy = 1.0 + xi(1,g);
-    double mx = xi(0,g);
-    double my = xi(1,g);
-
-    N(0,g) =  mx*lx*my*ly/4.0;
-    N(1,g) = -mx*ux*my*ly/4.0;
-    N(2,g) =  mx*ux*my*uy/4.0;
-    N(3,g) = -mx*lx*my*uy/4.0;
-    N(4,g) = -lx*ux*my*ly*0.50;
-    N(5,g) =  mx*ux*ly*uy*0.50;
-    N(6,g) =  lx*ux*my*uy*0.50;
-    N(7,g) = -mx*lx*ly*uy*0.50;
-    N(8,g) =  lx*ux*ly*uy;
-
-    Nx(0,0,g) =  (lx - mx)*my*ly/4.0;
-    Nx(1,0,g) =  (ly - my)*mx*lx/4.0;
-    Nx(0,1,g) = -(ux + mx)*my*ly/4.0;
-    Nx(1,1,g) = -(ly - my)*mx*ux/4.0;
-    Nx(0,2,g) =  (ux + mx)*my*uy/4.0;
-    Nx(1,2,g) =  (uy + my)*mx*ux/4.0;
-    Nx(0,3,g) = -(lx - mx)*my*uy/4.0;
-    Nx(1,3,g) = -(uy + my)*mx*lx/4.0;
-    Nx(0,4,g) = -(lx - ux)*my*ly*0.50;
-    Nx(1,4,g) = -(ly - my)*lx*ux*0.50;
-    Nx(0,5,g) =  (ux + mx)*ly*uy*0.50;
-    Nx(1,5,g) =  (ly - uy)*mx*ux*0.50;
-    Nx(0,6,g) =  (lx - ux)*my*uy*0.50;
-    Nx(1,6,g) =  (uy + my)*lx*ux*0.50;
-    Nx(0,7,g) = -(lx - mx)*ly*uy*0.50;
-    Nx(1,7,g) = -(ly - uy)*mx*lx*0.50;
-    Nx(0,8,g) =  (lx - ux)*ly*uy;
-    Nx(1,8,g) =  (ly - uy)*lx*ux;
-    }
-  },
-
-  {ElementType::TET4, [](const int insd, const int eNoN, const int g, Array<double>& xi, Array<double>& N, 
-      Array3<double>& Nx) -> void 
-    { 
-    //std::cout << "[get_element_shape_data] TET4 " << std::endl;
-
-    N(0,g) = xi(0,g);
-    N(1,g) = xi(1,g);
-    N(2,g) = xi(2,g);
-    N(3,g) = 1.0 - xi(0,g) - xi(1,g) - xi(2,g);
-
-    Nx(0,0,g) =  1.0;
-    Nx(1,0,g) =  0.0;
-    Nx(2,0,g) =  0.0;
-    Nx(0,1,g) =  0.0;
-    Nx(1,1,g) =  1.0;
-    Nx(2,1,g) =  0.0;
-    Nx(0,2,g) =  0.0;
-    Nx(1,2,g) =  0.0;
-    Nx(2,2,g) =  1.0;
-    Nx(0,3,g) = -1.0;
-    Nx(1,3,g) = -1.0;
-    Nx(2,3,g) = -1.0;
-    }
-  },
-
-  {ElementType::TET10, [](const int insd, const int eNoN, const int g, Array<double>& xi, Array<double>& N, 
-      Array3<double>& Nx) -> void 
-    { 
-    double s = 1.0 - xi(0,g) - xi(1,g) - xi(2,g);
-    N(0,g) = xi(0,g)*(2.0*xi(0,g) - 1.0);
-    N(1,g) = xi(1,g)*(2.0*xi(1,g) - 1.0);
-    N(2,g) = xi(2,g)*(2.0*xi(2,g) - 1.0);
-    N(3,g) = s * (2.0*s - 1.0);
-    N(4,g) = 4.0*xi(0,g)*xi(1,g);
-    N(5,g) = 4.0*xi(1,g)*xi(2,g);
-    N(6,g) = 4.0*xi(0,g)*xi(2,g);
-    N(7,g) = 4.0*xi(0,g)*s;
-    N(8,g) = 4.0*xi(1,g)*s;
-    N(9,g) = 4.0*xi(2,g)*s;
-
-    Nx(0,0,g)  =  4.0*xi(0,g) - 1.0;
-    Nx(1,0,g)  =  0.0;
-    Nx(2,0,g)  =  0.0;
-
-    Nx(0,1,g)  =  0.0;
-    Nx(1,1,g)  =  4.0*xi(1,g) - 1.0;
-    Nx(2,1,g)  =  0.0;
-
-    Nx(0,2,g)  =  0.0;
-    Nx(1,2,g)  =  0.0;
-    Nx(2,2,g)  =  4.0*xi(2,g) - 1.0;
-
-    Nx(0,3,g)  =  1.0 - 4.0*s;
-    Nx(1,3,g)  =  1.0 - 4.0*s;
-    Nx(2,3,g)  =  1.0 - 4.0*s;
-
-    Nx(0,4,g)  =  4.0*xi(1,g);
-    Nx(1,4,g)  =  4.0*xi(0,g);
-    Nx(2,4,g)  =  0.0;
-
-    Nx(0,5,g)  =  0.0;
-    Nx(1,5,g)  =  4.0*xi(2,g);
-    Nx(2,5,g)  =  4.0*xi(1,g);
-
-    Nx(0,6,g)  =  4.0*xi(2,g);
-    Nx(1,6,g)  =  0.0;
-    Nx(2,6,g)  =  4.0*xi(0,g);
-
-    Nx(0,7,g)  =  4.0*( s - xi(0,g));
-    Nx(1,7,g)  = -4.0*xi(0,g);
-    Nx(2,7,g)  = -4.0*xi(0,g);
-
-    Nx(0,8,g)  = -4.0*xi(1,g);
-    Nx(1,8,g)  =  4.0*( s - xi(1,g));
-    Nx(2,8,g)  = -4.0*xi(1,g);
-
-    Nx(0,9,g) = -4.0*xi(2,g);
-    Nx(1,9,g) = -4.0*xi(2,g);
-    Nx(2,9,g) =  4.0*( s - xi(2,g));
-    }
-  },
-
-  {ElementType::TRI3, [](const int insd, const int eNoN, const int g, Array<double>& xi, Array<double>& N, 
-      Array3<double>& Nx) -> void 
-    { 
-    //std::cout << "[get_element_shape_data] TRI3 " << std::endl;
-    N(0,g) = xi(0,g);
-    N(1,g) = xi(1,g);
-    N(2,g) = 1.0 - xi(0,g) - xi(1,g);
-
-    Nx(0,0,g) =  1.0;
-    Nx(1,0,g) =  0.0;
-    Nx(0,1,g) =  0.0;
-    Nx(1,1,g) =  1.0;
-    Nx(0,2,g) = -1.0;
-    Nx(1,2,g) = -1.0;
-    }
-  },
-
-  {ElementType::TRI6, [](const int insd, const int eNoN, const int g, Array<double>& xi, Array<double>& N, 
-      Array3<double>& Nx) -> void 
-    { 
-    double s = 1.0 - xi(0,g) - xi(1,g);
-    N(0,g) = xi(0,g) * (2.0*xi(0,g) - 1.0);
-    N(1,g) = xi(1,g) * (2.0*xi(1,g) - 1.0);
-    N(2,g) = s * (2.0*s - 1.0);
-    N(3,g) = 4.0*xi(0,g)*xi(1,g);
-    N(4,g) = 4.0*xi(1,g)*s;
-    N(5,g) = 4.0*xi(0,g)*s;
-
-    Nx(0,0,g) =  4.0*xi(0,g) - 1.0;
-    Nx(1,0,g) =  0.0;
-
-    Nx(0,1,g) =  0.0;
-    Nx(1,1,g) =  4.0*xi(1,g) - 1.0;
-
-    Nx(0,2,g) =  1.0 - 4.0*s;
-    Nx(1,2,g) =  1.0 - 4.0*s;
-
-    Nx(0,3,g) =  4.0*xi(1,g);
-    Nx(1,3,g) =  4.0*xi(0,g);
-
-    Nx(0,4,g) = -4.0*xi(1,g);
-    Nx(1,4,g) =  4.0*( s - xi(1,g) );
-
-    Nx(0,5,g) =  4.0*( s - xi(0,g) );
-    Nx(1,5,g) = -4.0*xi(0,g);
-    }
-  },
-
-  {ElementType::WDG, [](const int insd, const int eNoN, const int g, Array<double>& xi, Array<double>& N, 
-      Array3<double>& Nx) -> void 
-    { 
-    double ux = xi(0,g);
-    double uy = xi(1,g);
-    double uz = 1.0 - ux - uy;
-    double s = (1.0 + xi(2,g))*0.5;
-    double t = (1.0 - xi(2,g))*0.5;
-    N(0,g) = ux*t;
-    N(1,g) = uy*t;
-    N(2,g) = uz*t;
-    N(3,g) = ux*s;
-    N(4,g) = uy*s;
-    N(5,g) = uz*s;
-
-    Nx(0,0,g) =  t;
-    Nx(1,0,g) =  0.0;
-    Nx(2,0,g) = -ux*0.50;
-
-    Nx(0,1,g) =  0.0;
-    Nx(1,1,g) =  t;
-    Nx(2,1,g) = -uy*0.50;
-
-    Nx(0,2,g) = -t;
-    Nx(1,2,g) = -t;
-    Nx(2,2,g) = -uz*0.50;
-
-    Nx(0,3,g) =  s;
-    Nx(1,3,g) =  0.0;
-    Nx(2,3,g) =  ux*0.50;
-
-    Nx(0,4,g) =  0.0;
-    Nx(1,4,g) =  s;
-    Nx(2,4,g) =  uy*0.50;
-
-    Nx(0,5,g) = -s;
-    Nx(1,5,g) = -s;
-    Nx(2,5,g) =  uz*0.50;
-    }
-  },
-
-
-
-};
-
-
-//------------------------
-// set_element_shape_data 
-//------------------------
-// Replicates 'SUBROUTINE GETGNN(insd, eType, eNoN, xi, N, Nxi)' defined in NN.f.
-//
-using SetElementShapeMapType = std::map<ElementType, std::function<void(int, mshType&)>>;
-
-SetElementShapeMapType set_element_shape_data = {
-
-  {ElementType::HEX8, [](int g, mshType& mesh) -> void { 
-    auto& xi = mesh.xi;
-    double lx = 1.0 - xi(0,g);
-    double ly = 1.0 - xi(1,g);
-    double lz = 1.0 - xi(2,g);
-    double ux = 1.0 + xi(0,g);
-    double uy = 1.0 + xi(1,g);
-    double uz = 1.0 + xi(2,g);
-
-    auto& N = mesh.N;
-    N(0,g) = lx*ly*lz/8.0;
-    N(1,g) = ux*ly*lz/8.0;
-    N(2,g) = ux*uy*lz/8.0;
-    N(3,g) = lx*uy*lz/8.0;
-    N(4,g) = lx*ly*uz/8.0;
-    N(5,g) = ux*ly*uz/8.0;
-    N(6,g) = ux*uy*uz/8.0;
-    N(7,g) = lx*uy*uz/8.0;
-
-    auto& Nx = mesh.Nx;
-    Nx(0,0,g) = -ly*lz/8.0;
-    Nx(1,0,g) = -lx*lz/8.0;
-    Nx(2,0,g) = -lx*ly/8.0;
-
-    Nx(0,1,g) =  ly*lz/8.0;
-    Nx(1,1,g) = -ux*lz/8.0;
-    Nx(2,1,g) = -ux*ly/8.0;
-
-    Nx(0,2,g) =  uy*lz/8.0;
-    Nx(1,2,g) =  ux*lz/8.0;
-    Nx(2,2,g) = -ux*uy/8.0;
-
-    Nx(0,3,g) = -uy*lz/8.0;
-    Nx(1,3,g) =  lx*lz/8.0;
-    Nx(2,3,g) = -lx*uy/8.0;
-
-    Nx(0,4,g) = -ly*uz/8.0;
-    Nx(1,4,g) = -lx*uz/8.0;
-    Nx(2,4,g) =  lx*ly/8.0;
-
-    Nx(0,5,g) =  ly*uz/8.0;
-    Nx(1,5,g) = -ux*uz/8.0;
-    Nx(2,5,g) =  ux*ly/8.0;
-
-    Nx(0,6,g) =  uy*uz/8.0;
-    Nx(1,6,g) =  ux*uz/8.0;
-    Nx(2,6,g) =  ux*uy/8.0;
-
-    Nx(0,7,g) = -uy*uz/8.0;
-    Nx(1,7,g) =  lx*uz/8.0;
-    Nx(2,7,g) =  lx*uy/8.0;
-    }
-  },
-
-  {ElementType::HEX20, [](int g, mshType& mesh) -> void {
-
-    auto& xi = mesh.xi;
-    double lx = 1.0 - xi(0,g);
-    double ly = 1.0 - xi(1,g);
-    double lz = 1.0 - xi(2,g);
-    double ux = 1.0 + xi(0,g);
-    double uy = 1.0 + xi(1,g);
-    double uz = 1.0 + xi(2,g);
-
-    double mx = lx*ux;
-    double my = ly*uy;
-    double mz = lz*uz;
-
-    auto& N = mesh.N;
-    N(0, g) = lx*ly*lz*(lx+ly+lz-5.0)/8.0;
-    N(1, g) = ux*ly*lz*(ux+ly+lz-5.0)/8.0;
-    N(2, g) = ux*uy*lz*(ux+uy+lz-5.0)/8.0;
-    N(3, g) = lx*uy*lz*(lx+uy+lz-5.0)/8.0;
-    N(4, g) = lx*ly*uz*(lx+ly+uz-5.0)/8.0;
-    N(5, g) = ux*ly*uz*(ux+ly+uz-5.0)/8.0;
-    N(6, g) = ux*uy*uz*(ux+uy+uz-5.0)/8.0;
-    N(7, g) = lx*uy*uz*(lx+uy+uz-5.0)/8.0;
-    N(8, g) = mx*ly*lz/4.0;
-    N(9, g) = ux*my*lz/4.0;
-    N(10, g) = mx*uy*lz/4.0;
-    N(11, g) = lx*my*lz/4.0;
-    N(12, g) = mx*ly*uz/4.0;
-    N(13, g) = ux*my*uz/4.0;
-    N(14, g) = mx*uy*uz/4.0;
-    N(15, g) = lx*my*uz/4.0;
-    N(16, g) = lx*ly*mz/4.0;
-    N(17, g) = ux*ly*mz/4.0;
-    N(18, g) = ux*uy*mz/4.0;
-    N(19, g) = lx*uy*mz/4.0;
-
-    // N(1)  = lx*ly*lz*(lx+ly+lz-5.0)/8.0;
-    auto& Nx = mesh.Nx;
-    int n = 0;
-    Nx(0,n,g) = -ly*lz*(lx+ly+lz-5.0+lx)/8.0;
-    Nx(1,n,g) = -lx*lz*(lx+ly+lz-5.0+ly)/8.0;
-    Nx(2,n,g) = -lx*ly*(lx+ly+lz-5.0+lz)/8.0;
-
-//c   N(n,g) = ux*ly*lz*(ux+ly+lz-5.0)/8.0;
-    n += 1;
-    Nx(0,n,g) =  ly*lz*(ux+ly+lz-5.0+ux)/8.0;
-    Nx(1,n,g) = -ux*lz*(ux+ly+lz-5.0+ly)/8.0;
-    Nx(2,n,g) = -ux*ly*(ux+ly+lz-5.0+lz)/8.0;
-
-//c   N(n,g) = ux*uy*lz*(ux+uy+lz-5.0)/8.0
-    n += 1;
-    Nx(0,n,g) =  uy*lz*(ux+uy+lz-5.0+ux)/8.0;
-    Nx(1,n,g) =  ux*lz*(ux+uy+lz-5.0+uy)/8.0;
-    Nx(2,n,g) = -ux*uy*(ux+uy+lz-5.0+lz)/8.0;
-
-//c   N(n,g) = lx*uy*lz*(lx+uy+lz-5.0)/8.0
-    n += 1;
-    Nx(0,n,g) = -uy*lz*(lx+uy+lz-5.0+lx)/8.0;
-    Nx(1,n,g) =  lx*lz*(lx+uy+lz-5.0+uy)/8.0;
-    Nx(2,n,g) = -lx*uy*(lx+uy+lz-5.0+lz)/8.0;
-
-//c   N(n,g) = lx*ly*uz*(lx+ly+uz-5.0)/8.0
-    n += 1;
-    Nx(0,n,g) = -ly*uz*(lx+ly+uz-5.0+lx)/8.0;
-    Nx(1,n,g) = -lx*uz*(lx+ly+uz-5.0+ly)/8.0;
-    Nx(2,n,g) =  lx*ly*(lx+ly+uz-5.0+uz)/8.0;
-
-//c   N(n,g) = ux*ly*uz*(ux+ly+uz-5.0)/8.0
-    n += 1;
-    Nx(0,n,g) =  ly*uz*(ux+ly+uz-5.0+ux)/8.0;
-    Nx(1,n,g) = -ux*uz*(ux+ly+uz-5.0+ly)/8.0;
-    Nx(2,n,g) =  ux*ly*(ux+ly+uz-5.0+uz)/8.0;
-
-//c   N(n,g) = ux*uy*uz*(ux+uy+uz-5.0)/8.0
-    n += 1;
-    Nx(0,n,g) =  uy*uz*(ux+uy+uz-5.0+ux)/8.0;
-    Nx(1,n,g) =  ux*uz*(ux+uy+uz-5.0+uy)/8.0;
-    Nx(2,n,g) =  ux*uy*(ux+uy+uz-5.0+uz)/8.0;
-
-//c   N(n,g) = lx*uy*uz*(lx+uy+uz-5.0)/8.0
-    n += 1;
-    Nx(0,n,g) = -uy*uz*(lx+uy+uz-5.0+lx)/8.0;
-    Nx(1,n,g) =  lx*uz*(lx+uy+uz-5.0+uy)/8.0;
-    Nx(2,n,g) =  lx*uy*(lx+uy+uz-5.0+uz)/8.0;
-
-//c   N(n,g) = mx*ly*lz/4.0
-    n += 1;
-    Nx(0,n,g) =  (lx - ux)*ly*lz/4.0;
-    Nx(1,n,g) = -mx*lz/4.0;
-    Nx(2,n,g) = -mx*ly/4.0;
-
-//c   N(0n,g) = ux*my*lz/4.0
-    n += 1;
-    Nx(0,n,g) =  my*lz/4.0;
-    Nx(1,n,g) =  (ly - uy)*ux*lz/4.0;
-    Nx(2,n,g) = -ux*my/4.0;
-
-//c   N(0n,g) = mx*uy*lz/4.0
-    n += 1;
-    Nx(0,n,g) =  (lx - ux)*uy*lz/4.0;
-    Nx(1,n,g) =  mx*lz/4.0;
-    Nx(2,n,g) = -mx*uy/4.0;
-
-//c   N(0n,g) = lx*my*lz/4.0
-    n += 1;
-    Nx(0,n,g) = -my*lz/4.0;
-    Nx(1,n,g) =  (ly - uy)*lx*lz/4.0;
-    Nx(2,n,g) = -lx*my/4.0;
-
-//c   N(0n,g) = mx*ly*uz/4.0
-    n += 1;
-    Nx(0,n,g) =  (lx - ux)*ly*uz/4.0;
-    Nx(1,n,g) = -mx*uz/4.0;
-    Nx(2,n,g) =  mx*ly/4.0;
-
-//c   N(0n,g) = ux*my*uz/4.0
-    n += 1;
-    Nx(0,n,g) =  my*uz/4.0;
-    Nx(1,n,g) =  (ly - uy)*ux*uz/4.0;
-    Nx(2,n,g) =  ux*my/4.0;
-
-//c   N(0n,g) = mx*uy*uz/4.0
-    n += 1;
-    Nx(0,n,g) =  (lx - ux)*uy*uz/4.0;
-    Nx(1,n,g) =  mx*uz/4.0;
-    Nx(2,n,g) =  mx*uy/4.0;
-
-//c   N(0n,g) = lx*my*uz/4.0
-    n += 1;
-    Nx(0,n,g) = -my*uz/4.0;
-    Nx(1,n,g) =  (ly - uy)*lx*uz/4.0;
-    Nx(2,n,g) =  lx*my/4.0;
-
-//c   N(0n,g) = lx*ly*mz/4.0
-    n += 1;
-    Nx(0,n,g) = -ly*mz/4.0;
-    Nx(1,n,g) = -lx*mz/4.0;
-    Nx(2,n,g) =  (lz - uz)*lx*ly/4.0;
-
-//c   N(0n,g) = ux*ly*mz/4.0
-    n += 1;
-    Nx(0,n,g) =  ly*mz/4.0;
-    Nx(1,n,g) = -ux*mz/4.0;
-    Nx(2,n,g) =  (lz - uz)*ux*ly/4.0;
-
-//c   N(0n,g) = ux*uy*mz/4.0
-    n += 1;
-    Nx(0,n,g) =  uy*mz/4.0;
-    Nx(1,n,g) =  ux*mz/4.0;
-    Nx(2,n,g) =  (lz - uz)*ux*uy/4.0;
-
-//c   N(n,g) = lx*uy*mz/4.0
-    n += 1;
-    Nx(0,n,g) = -uy*mz/4.0;
-    Nx(1,n,g) =  lx*mz/4.0;
-    Nx(2,n,g) =  (lz - uz)*lx*uy/4.0;
-    }
-  },
-
-  {ElementType::HEX27, [](int g, mshType& mesh) -> void {
-
-    auto& xi = mesh.xi;
-    double lx = 1.0 - xi(0,g);
-    double ly = 1.0 - xi(1,g);
-    double lz = 1.0 - xi(2,g);
-    double ux = 1.0 + xi(0,g);
-    double uy = 1.0 + xi(1,g);
-    double uz = 1.0 + xi(2,g);
-
-    double mx = xi(0,g);
-    double my = xi(1,g);
-    double mz = xi(2,g);
-
-    auto& N = mesh.N;
-    N(0,g)  = -mx*lx*my*ly*mz*lz/8.0;
-    N(1,g)  =  mx*ux*my*ly*mz*lz/8.0;
-    N(2,g)  = -mx*ux*my*uy*mz*lz/8.0;
-    N(3,g)  =  mx*lx*my*uy*mz*lz/8.0;
-    N(4,g)  =  mx*lx*my*ly*mz*uz/8.0;
-    N(5,g)  = -mx*ux*my*ly*mz*uz/8.0;
-    N(6,g)  =  mx*ux*my*uy*mz*uz/8.0;
-    N(7,g)  = -mx*lx*my*uy*mz*uz/8.0;
-    N(8,g)  =  lx*ux*my*ly*mz*lz/4.0;
-    N(9,g) = -mx*ux*ly*uy*mz*lz/4.0;
-    N(10,g) = -lx*ux*my*uy*mz*lz/4.0;
-    N(11,g) =  mx*lx*ly*uy*mz*lz/4.0;
-    N(12,g) = -lx*ux*my*ly*mz*uz/4.0;
-    N(13,g) =  mx*ux*ly*uy*mz*uz/4.0;
-    N(14,g) =  lx*ux*my*uy*mz*uz/4.0;
-    N(15,g) = -mx*lx*ly*uy*mz*uz/4.0;
-    N(16,g) =  mx*lx*my*ly*lz*uz/4.0;
-    N(17,g) = -mx*ux*my*ly*lz*uz/4.0;
-    N(18,g) =  mx*ux*my*uy*lz*uz/4.0;
-    N(19,g) = -mx*lx*my*uy*lz*uz/4.0;
-
-    N(20,g) = -mx*lx*ly*uy*lz*uz/2.0;
-    N(21,g) =  mx*ux*ly*uy*lz*uz/2.0;
-    N(22,g) = -lx*ux*my*ly*lz*uz/2.0;
-    N(23,g) =  lx*ux*my*uy*lz*uz/2.0;
-    N(24,g) = -lx*ux*ly*uy*mz*lz/2.0;
-    N(25,g) =  lx*ux*ly*uy*mz*uz/2.0;
-
-    N(26,g) =  lx*ux*ly*uy*lz*uz;
-
-    auto& Nxi = mesh.Nx;
-    int n = 0;
-    Nxi(0,n,g)  = -(lx - mx)*my*ly*mz*lz/8.0;
-    Nxi(1,n,g)  = -(ly - my)*mx*lx*mz*lz/8.0;
-    Nxi(2,n,g)  = -(lz - mz)*mx*lx*my*ly/8.0;
-
-    n += 1;
-    Nxi(0,n,g)  =  (mx + ux)*my*ly*mz*lz/8.0;
-    Nxi(1,n,g)  =  (ly - my)*mx*ux*mz*lz/8.0;
-    Nxi(2,n,g)  =  (lz - mz)*mx*ux*my*ly/8.0;
-
-    n += 1;
-    Nxi(0,n,g)  = -(mx + ux)*my*uy*mz*lz/8.0;
-    Nxi(1,n,g)  = -(my + uy)*mx*ux*mz*lz/8.0;
-    Nxi(2,n,g)  = -(lz - mz)*mx*ux*my*uy/8.0;
-
-    n += 1;
-    Nxi(0,n,g)  =  (lx - mx)*my*uy*mz*lz/8.0;
-    Nxi(1,n,g)  =  (my + uy)*mx*lx*mz*lz/8.0;
-    Nxi(2,n,g)  =  (lz - mz)*mx*lx*my*uy/8.0;
-
-    n += 1;
-    Nxi(0,n,g)  =  (lx - mx)*my*ly*mz*uz/8.0;
-    Nxi(1,n,g)  =  (ly - my)*mx*lx*mz*uz/8.0;
-    Nxi(2,n,g)  =  (mz + uz)*mx*lx*my*ly/8.0;
-
-    n += 1;
-    Nxi(0,n,g)  = -(mx + ux)*my*ly*mz*uz/8.0;
-    Nxi(1,n,g)  = -(ly - my)*mx*ux*mz*uz/8.0;
-    Nxi(2,n,g)  = -(mz + uz)*mx*ux*my*ly/8.0;
-
-    n += 1;
-    Nxi(0,n,g)  =  (mx + ux)*my*uy*mz*uz/8.0;
-    Nxi(1,n,g)  =  (my + uy)*mx*ux*mz*uz/8.0;
-    Nxi(2,n,g)  =  (mz + uz)*mx*ux*my*uy/8.0;
-
-    n += 1;
-    Nxi(0,n,g)  = -(lx - mx)*my*uy*mz*uz/8.0;
-    Nxi(1,n,g)  = -(my + uy)*mx*lx*mz*uz/8.0;
-    Nxi(2,n,g)  = -(mz + uz)*mx*lx*my*uy/8.0;
-
-    n += 1;
-    Nxi(0,n,g)  =  (lx - ux)*my*ly*mz*lz/4.0;
-    Nxi(1,n,g)  =  (ly - my)*lx*ux*mz*lz/4.0;
-    Nxi(2,n,g)  =  (lz - mz)*lx*ux*my*ly/4.0;
-
-    n += 1;
-    Nxi(0,n,g) = -(mx + ux)*ly*uy*mz*lz/4.0;
-    Nxi(1,n,g) = -(ly - uy)*mx*ux*mz*lz/4.0;
-    Nxi(2,n,g) = -(lz - mz)*mx*ux*ly*uy/4.0;
-
-    n += 1;
-    Nxi(0,n,g) = -(lx - ux)*my*uy*mz*lz/4.0;
-    Nxi(1,n,g) = -(my + uy)*lx*ux*mz*lz/4.0;
-    Nxi(2,n,g) = -(lz - mz)*lx*ux*my*uy/4.0;
-
-    n += 1;
-    Nxi(0,n,g) =  (lx - mx)*ly*uy*mz*lz/4.0;
-    Nxi(1,n,g) =  (ly - uy)*mx*lx*mz*lz/4.0;
-    Nxi(2,n,g) =  (lz - mz)*mx*lx*ly*uy/4.0;
-
-    n += 1;
-    Nxi(0,n,g) = -(lx - ux)*my*ly*mz*uz/4.0;
-    Nxi(1,n,g) = -(ly - my)*lx*ux*mz*uz/4.0;
-    Nxi(2,n,g) = -(mz + uz)*lx*ux*my*ly/4.0;
-
-    n += 1;
-    Nxi(0,n,g) =  (mx + ux)*ly*uy*mz*uz/4.0;
-    Nxi(1,n,g) =  (ly - uy)*mx*ux*mz*uz/4.0;
-    Nxi(2,n,g) =  (mz + uz)*mx*ux*ly*uy/4.0;
-
-    n += 1;
-    Nxi(0,n,g) =  (lx - ux)*my*uy*mz*uz/4.0;
-    Nxi(1,n,g) =  (my + uy)*lx*ux*mz*uz/4.0;
-    Nxi(2,n,g) =  (mz + uz)*lx*ux*my*uy/4.0;
-
-    n += 1;
-    Nxi(0,n,g) = -(lx - mx)*ly*uy*mz*uz/4.0;
-    Nxi(1,n,g) = -(ly - uy)*mx*lx*mz*uz/4.0;
-    Nxi(2,n,g) = -(mz + uz)*mx*lx*ly*uy/4.0;
-
-    n += 1;
-    Nxi(0,n,g) =  (lx - mx)*my*ly*lz*uz/4.0;
-    Nxi(1,n,g) =  (ly - my)*mx*lx*lz*uz/4.0;
-    Nxi(2,n,g) =  (lz - uz)*mx*lx*my*ly/4.0;
-
-    n += 1;
-    Nxi(0,n,g) = -(mx + ux)*my*ly*lz*uz/4.0;
-    Nxi(1,n,g) = -(ly - my)*mx*ux*lz*uz/4.0;
-    Nxi(2,n,g) = -(lz - uz)*mx*ux*my*ly/4.0;
-
-    n += 1;
-    Nxi(0,n,g) =  (mx + ux)*my*uy*lz*uz/4.0;
-    Nxi(1,n,g) =  (my + uy)*mx*ux*lz*uz/4.0;
-    Nxi(2,n,g) =  (lz - uz)*mx*ux*my*uy/4.0;
-
-    n += 1;
-    Nxi(0,n,g) = -(lx - mx)*my*uy*lz*uz/4.0;
-    Nxi(1,n,g) = -(my + uy)*mx*lx*lz*uz/4.0;
-    Nxi(2,n,g) = -(lz - uz)*mx*lx*my*uy/4.0;
-
-    n += 1;
-    Nxi(0,n,g) = -(lx - mx)*ly*uy*lz*uz/2.0;
-    Nxi(1,n,g) = -(ly - uy)*mx*lx*lz*uz/2.0;
-    Nxi(2,n,g) = -(lz - uz)*mx*lx*ly*uy/2.0;
-
-    n += 1;
-    Nxi(0,n,g) =  (mx + ux)*ly*uy*lz*uz/2.0;
-    Nxi(1,n,g) =  (ly - uy)*mx*ux*lz*uz/2.0;
-    Nxi(2,n,g) =  (lz - uz)*mx*ux*ly*uy/2.0;
-
-    n += 1;
-    Nxi(0,n,g) = -(lx - ux)*my*ly*lz*uz/2.0;
-    Nxi(1,n,g) = -(ly - my)*lx*ux*lz*uz/2.0;
-    Nxi(2,n,g) = -(lz - uz)*lx*ux*my*ly/2.0;
-
-    n += 1;
-    Nxi(0,n,g) =  (lx - ux)*my*uy*lz*uz/2.0;
-    Nxi(1,n,g) =  (my + uy)*lx*ux*lz*uz/2.0;
-    Nxi(2,n,g) =  (lz - uz)*lx*ux*my*uy/2.0;
-
-    n += 1;
-    Nxi(0,n,g) = -(lx - ux)*ly*uy*mz*lz/2.0;
-    Nxi(1,n,g) = -(ly - uy)*lx*ux*mz*lz/2.0;
-    Nxi(2,n,g) = -(lz - mz)*lx*ux*ly*uy/2.0;
-
-    n += 1;
-    Nxi(0,n,g) =  (lx - ux)*ly*uy*mz*uz/2.0;
-    Nxi(1,n,g) =  (ly - uy)*lx*ux*mz*uz/2.0;
-    Nxi(2,n,g) =  (mz + uz)*lx*ux*ly*uy/2.0;
-
-    n += 1;
-    Nxi(0,n,g) =  (lx - ux)*ly*uy*lz*uz;
-    Nxi(1,n,g) =  (ly - uy)*lx*ux*lz*uz;
-    Nxi(2,n,g) =  (lz - uz)*lx*ux*ly*uy;
-    }
-  },
-
-  {ElementType::LIN1, [](int g, mshType& mesh) -> void { 
-    //std::cout << "[set_element_shape_data] **************************" << std::endl;
-    //std::cout << "[set_element_shape_data] ERROR: LIN1 not supported." << std::endl;
-    //std::cout << "[set_element_shape_data] **************************" << std::endl;
-    auto& xi = mesh.xi;
-    auto& N = mesh.N;
-    N(0,g) = (1.0 - xi(0,g))*0.5;
-    N(1,g) = (1.0 + xi(0,g))*0.5;
-
-    auto& Nx = mesh.Nx;
-    Nx(0,0,g) = -0.5;
-    Nx(0,1,g) =  0.5;
-    }
-  },
-
-  {ElementType::LIN2, [](int g, mshType& mesh) -> void {
-    auto& xi = mesh.xi;
-    auto& N = mesh.N;
-    N(0,g) = -xi(0,g)*(1.0 - xi(0,g))*0.50;
-    N(1,g) =  xi(0,g)*(1.0 + xi(0,g))*0.50;
-    N(2,g) = (1.0 - xi(0,g))*(1.0 + xi(0,g));
-
-    auto& Nx = mesh.Nx;
-    Nx(0,0,g) = -0.50 + xi(0,g);
-    Nx(0,1,g) =  0.50 + xi(0,g);
-    Nx(0,2,g) = -2.0*xi(0,g);
-    }
-  },
-
-  {ElementType::QUD4, [](int g, mshType& mesh) -> void {
-    auto& xi = mesh.xi;
-    double lx = 1.0 - xi(0,g);
-    double ly = 1.0 - xi(1,g);
-    double ux = 1.0 + xi(0,g);
-    double uy = 1.0 + xi(1,g);
-
-    auto& N = mesh.N;
-    N(0,g) = lx*ly / 4.0;    
-    N(1,g) = ux*ly / 4.0;    
-    N(2,g) = ux*uy / 4.0;    
-    N(3,g) = lx*uy / 4.0;    
-
-    auto& Nx = mesh.Nx;
-    Nx(0,0,g) = -ly / 4.0;
-    Nx(1,0,g) = -lx / 4.0;
-    Nx(0,1,g) =  ly / 4.0;
-    Nx(1,1,g) = -ux / 4.0;
-    Nx(0,2,g) =  uy / 4.0;
-    Nx(1,2,g) =  ux / 4.0;
-    Nx(0,3,g) = -uy / 4.0;
-    Nx(1,3,g) =  lx / 4.0;
-    }
-  },
-
- {ElementType::QUD9, [](int g, mshType& mesh) -> void {
-    auto& xi = mesh.xi;
-    double lx = 1.0 - xi(0,g);
-    double ly = 1.0 - xi(1,g);
-    double ux = 1.0 + xi(0,g);
-    double uy = 1.0 + xi(1,g);
-    double mx = xi(0,g);
-    double my = xi(1,g);
-
-    auto& N = mesh.N;
-    N(0,g) =  mx*lx*my*ly/4.0;
-    N(1,g) = -mx*ux*my*ly/4.0;
-    N(2,g) =  mx*ux*my*uy/4.0;
-    N(3,g) = -mx*lx*my*uy/4.0;
-    N(4,g) = -lx*ux*my*ly*0.50;
-    N(5,g) =  mx*ux*ly*uy*0.50;
-    N(6,g) =  lx*ux*my*uy*0.50;
-    N(7,g) = -mx*lx*ly*uy*0.50;
-    N(8,g) =  lx*ux*ly*uy;
-
-    auto& Nx = mesh.Nx;
-    Nx(0,0,g) =  (lx - mx)*my*ly/4.0;
-    Nx(1,0,g) =  (ly - my)*mx*lx/4.0;
-
-    Nx(0,1,g) = -(ux + mx)*my*ly/4.0;
-    Nx(1,1,g) = -(ly - my)*mx*ux/4.0;
-
-    Nx(0,2,g) =  (ux + mx)*my*uy/4.0;
-    Nx(1,2,g) =  (uy + my)*mx*ux/4.0;
-
-    Nx(0,3,g) = -(lx - mx)*my*uy/4.0;
-    Nx(1,3,g) = -(uy + my)*mx*lx/4.0;
-
-    Nx(0,4,g) = -(lx - ux)*my*ly*0.50;
-    Nx(1,4,g) = -(ly - my)*lx*ux*0.50;
-
-    Nx(0,5,g) =  (ux + mx)*ly*uy*0.50;
-    Nx(1,5,g) =  (ly - uy)*mx*ux*0.50;
-
-    Nx(0,6,g) =  (lx - ux)*my*uy*0.50;
-    Nx(1,6,g) =  (uy + my)*lx*ux*0.50;
-
-    Nx(0,7,g) = -(lx - mx)*ly*uy*0.50;
-    Nx(1,7,g) = -(ly - uy)*mx*lx*0.50;
-
-    Nx(0,8,g) =  (lx - ux)*ly*uy;
-    Nx(1,8,g) =  (ly - uy)*lx*ux;
-    }
-  },
-
-  {ElementType::TET4, [](int g, mshType& mesh) -> void { 
-    auto& xi = mesh.xi;
-    auto& N = mesh.N;
-    N(0,g) = xi(0,g);
-    N(1,g) = xi(1,g);
-    N(2,g) = xi(2,g);
-    N(3,g) = 1.0 - xi(0,g) - xi(1,g) - xi(2,g);
-
-    auto& Nx = mesh.Nx;
-    Nx(0,0,g) =  1.0;
-    Nx(1,0,g) =  0.0;
-    Nx(2,0,g) =  0.0;
-    Nx(0,1,g) =  0.0;
-    Nx(1,1,g) =  1.0;
-    Nx(2,1,g) =  0.0;
-    Nx(0,2,g) =  0.0;
-    Nx(1,2,g) =  0.0;
-    Nx(2,2,g) =  1.0;
-    Nx(0,3,g) = -1.0;
-    Nx(1,3,g) = -1.0;
-    Nx(2,3,g) = -1.0;
-    }
-  },
-
-  {ElementType::TET10, [](int g, mshType& mesh) -> void {
-    auto& xi = mesh.xi;
-    auto& N = mesh.N;
-    double s = 1.0 - xi(0,g) - xi(1,g) - xi(2,g);
-    N(0,g)  = xi(0,g)*(2.0*xi(0,g) - 1.0);
-    N(1,g)  = xi(1,g)*(2.0*xi(1,g) - 1.0);
-    N(2,g)  = xi(2,g)*(2.0*xi(2,g) - 1.0);
-    N(3,g)  = s    *(2.0*s     - 1.0);
-    N(4,g)  = 4.0*xi(0,g)*xi(1,g);
-    N(5,g)  = 4.0*xi(1,g)*xi(2,g);
-    N(6,g)  = 4.0*xi(0,g)*xi(2,g);
-    N(7,g)  = 4.0*xi(0,g)*s;
-    N(8,g)  = 4.0*xi(1,g)*s;
-    N(9,g) = 4.0*xi(2,g)*s;
-
-    auto& Nx = mesh.Nx;
-    Nx(0,0,g)  =  4.0*xi(0,g) - 1.0;
-    Nx(1,0,g)  =  0.0;
-    Nx(2,0,g)  =  0.0;
-
-    Nx(0,1,g)  =  0.0;
-    Nx(1,1,g)  =  4.0*xi(1,g) - 1.0;
-    Nx(2,1,g)  =  0.0;
-
-    Nx(0,2,g)  =  0.0;
-    Nx(1,2,g)  =  0.0;
-    Nx(2,2,g)  =  4.0*xi(2,g) - 1.0;
-
-    Nx(0,3,g)  =  1.0 - 4.0*s;
-    Nx(1,3,g)  =  1.0 - 4.0*s;
-    Nx(2,3,g)  =  1.0 - 4.0*s;
-
-    Nx(0,4,g)  =  4.0*xi(1,g);
-    Nx(1,4,g)  =  4.0*xi(0,g);
-    Nx(2,4,g)  =  0.0;
-
-    Nx(0,5,g)  =  0.0;
-    Nx(1,5,g)  =  4.0*xi(2,g);
-    Nx(2,5,g)  =  4.0*xi(1,g);
-
-    Nx(0,6,g)  =  4.0*xi(2,g);
-    Nx(1,6,g)  =  0.0;
-    Nx(2,6,g)  =  4.0*xi(0,g);
-
-    Nx(0,7,g)  =  4.0*( s - xi(0,g));
-    Nx(1,7,g)  = -4.0*xi(0,g);
-    Nx(2,7,g)  = -4.0*xi(0,g);
-
-    Nx(0,8,g)  = -4.0*xi(1,g);
-    Nx(1,8,g)  =  4.0*( s - xi(1,g));
-    Nx(2,8,g)  = -4.0*xi(1,g);
-
-    Nx(0,9,g) = -4.0*xi(2,g);
-    Nx(1,9,g) = -4.0*xi(2,g);
-    Nx(2,9,g) =  4.0*( s - xi(2,g));
-    }
-  },
-
-  {ElementType::TRI3, [](int g, mshType& mesh) -> void { 
-    auto& xi = mesh.xi;
-    auto& N = mesh.N;
-    N(0,g) = xi(0,g);
-    N(1,g) = xi(1,g);
-    N(2,g) = 1.0 - xi(0,g) - xi(1,g);
-
-    auto& Nxi = mesh.Nx;
-    Nxi(0,0,g) =  1.0;
-    Nxi(1,0,g) =  0.0;
-    Nxi(0,1,g) =  0.0;
-    Nxi(1,1,g) =  1.0;
-    Nxi(0,2,g) = -1.0;
-    Nxi(1,2,g) = -1.0;
-    }
-  },
-
-  {ElementType::TRI6, [](int g, mshType& mesh) -> void {
-    auto& xi = mesh.xi;
-    auto& N = mesh.N;
-
-    double s = 1.0 - xi(0,g) - xi(1,g);
-    N(0,g) = xi(0,g)*( 2.0*xi(0,g) - 1.0 );
-    N(1,g) = xi(1,g)*( 2.0*xi(1,g) - 1.0 );
-    N(2,g) = s    *( 2.0*s     - 1.0 );
-    N(3,g) = 4.0*xi(0,g)*xi(1,g);
-    N(4,g) = 4.0*xi(1,g)*s;
-    N(5,g) = 4.0*xi(0,g)*s;
-
-    auto& Nxi = mesh.Nx;
-    Nxi(0,0,g) =  4.0*xi(0,g) - 1.0;
-    Nxi(1,0,g) =  0.0;
-    Nxi(0,1,g) =  0.0;
-    Nxi(1,1,g) =  4.0*xi(1,g) - 1.0;
-    Nxi(0,2,g) =  1.0 - 4.0*s;
-    Nxi(1,2,g) =  1.0 - 4.0*s;
-    Nxi(0,3,g) =  4.0*xi(1,g);
-    Nxi(1,3,g) =  4.0*xi(0,g);
-    Nxi(0,4,g) = -4.0*xi(1,g);
-    Nxi(1,4,g) =  4.0*( s - xi(1,g) );
-    Nxi(0,5,g) =  4.0*( s - xi(0,g) );
-    Nxi(1,5,g) = -4.0*xi(0,g);
-    }
-  },
-
-  {ElementType::WDG, [](int g, mshType& mesh) -> void 
-    { 
-    auto& xi = mesh.xi;
-    auto& N = mesh.N;
-    double ux = xi(0,g);
-    double uy = xi(1,g);
-    double uz = 1.0 - ux - uy;
-    double s = (1.0 + xi(2,g))*0.5;
-    double t = (1.0 - xi(2,g))*0.5;
-    N(0,g) = ux*t;
-    N(1,g) = uy*t;
-    N(2,g) = uz*t;
-    N(3,g) = ux*s;
-    N(4,g) = uy*s;
-    N(5,g) = uz*s;
-
-    auto& Nxi = mesh.Nx;
-    Nxi(0,0,g) =  t;
-    Nxi(1,0,g) =  0.0;
-    Nxi(2,0,g) = -ux*0.50;
-
-    Nxi(0,1,g) =  0.0;
-    Nxi(1,1,g) =  t;
-    Nxi(2,1,g) = -uy*0.50;
-
-    Nxi(0,2,g) = -t;
-    Nxi(1,2,g) = -t;
-    Nxi(2,2,g) = -uz*0.50;
-
-    Nxi(0,3,g) =  s;
-    Nxi(1,3,g) =  0.0;
-    Nxi(2,3,g) =  ux*0.50;
-
-    Nxi(0,4,g) =  0.0;
-    Nxi(1,4,g) =  s;
-    Nxi(2,4,g) =  uy*0.50;
-
-    Nxi(0,5,g) = -s;
-    Nxi(1,5,g) = -s;
-    Nxi(2,5,g) =  uz*0.50;
-    }
-  },
-
-};
-
-//---------------------
-// set_face_shape_data
-//---------------------
-// Define a map type used to face element shape function data.
-//
-// This reproduces 'SUBROUTINE GETGNN(insd, eType, eNoN, xi, N, Nxi)' in NN.f.
-//
-using SetFaceShapeMapType = std::map<ElementType, std::function<void(int, faceType&)>>;
-
-SetFaceShapeMapType set_face_shape_data = {
-
-  {ElementType::PNT, [](int g, faceType& face) -> void 
-    {
-    face.N(0,g) = 1.0;
-    }
-  },
-
-  {ElementType::QUD8, [](int g, faceType& face) -> void 
-    {
-    auto& xi = face.xi;
-    double lx = 1.0 - xi(0,g);
-    double ly = 1.0 - xi(1,g);
-    double ux = 1.0 + xi(0,g);
-    double uy = 1.0 + xi(1,g);
-    double mx = lx*ux;
-    double my = ly*uy;
-
-    auto& N = face.N;
-    N(0,g) = lx*ly*(lx+ly-3.0)/4.0;
-    N(1,g) = ux*ly*(ux+ly-3.0)/4.0;
-    N(2,g) = ux*uy*(ux+uy-3.0)/4.0;
-    N(3,g) = lx*uy*(lx+uy-3.0)/4.0;
-    N(4,g) = mx*ly*0.50;
-    N(5,g) = ux*my*0.50;
-    N(6,g) = mx*uy*0.50;
-    N(7,g) = lx*my*0.50;
-
-    auto& Nxi = face.Nx;
-    Nxi(0,0,g) = -ly*(lx+ly-3.0+lx)/4.0;
-    Nxi(1,0,g) = -lx*(lx+ly-3.0+ly)/4.0;
-
-    Nxi(0,1,g) =  ly*(ux+ly-3.0+ux)/4.0;
-    Nxi(1,1,g) = -ux*(ux+ly-3.0+ly)/4.0;
-
-    Nxi(0,2,g) =  uy*(ux+uy-3.0+ux)/4.0;
-    Nxi(1,2,g) =  ux*(ux+uy-3.0+uy)/4.0;
-
-    Nxi(0,3,g) = -uy*(lx+uy-3.0+lx)/4.0;
-    Nxi(1,3,g) =  lx*(lx+uy-3.0+uy)/4.0;
-
-    Nxi(0,4,g) =  (lx - ux)*ly*0.50;
-    Nxi(1,4,g) = -mx*0.50;
-
-    Nxi(0,5,g) =  my*0.50;
-    Nxi(1,5,g) =  (ly - uy)*ux*0.50;
-
-    Nxi(0,6,g) =  (lx - ux)*uy*0.50;
-    Nxi(1,6,g) =  mx*0.50;
-
-    Nxi(0,7,g) = -my*0.50;
-    Nxi(1,7,g) =  (ly - uy)*lx*0.50;
-    }
-  },
-
-  {ElementType::QUD9, [](int g, faceType& face) -> void 
-    {
-    auto& xi = face.xi;
-    double lx = 1.0 - xi(0,g);
-    double ly = 1.0 - xi(1,g);
-    double ux = 1.0 + xi(0,g);
-    double uy = 1.0 + xi(1,g);
-    double mx = xi(0,g);
-    double my = xi(1,g);
-
-    auto& N = face.N;
-    N(0,g) =  mx*lx*my*ly/4.0;
-    N(1,g) = -mx*ux*my*ly/4.0;
-    N(2,g) =  mx*ux*my*uy/4.0;
-    N(3,g) = -mx*lx*my*uy/4.0;
-    N(4,g) = -lx*ux*my*ly*0.50;
-    N(5,g) =  mx*ux*ly*uy*0.50;
-    N(6,g) =  lx*ux*my*uy*0.50;
-    N(7,g) = -mx*lx*ly*uy*0.50;
-    N(8,g) =  lx*ux*ly*uy;
-
-    auto& Nx = face.Nx;
-    Nx(0,0,g) =  (lx - mx)*my*ly/4.0;
-    Nx(1,0,g) =  (ly - my)*mx*lx/4.0;
-    Nx(0,1,g) = -(ux + mx)*my*ly/4.0;
-    Nx(1,1,g) = -(ly - my)*mx*ux/4.0;
-    Nx(0,2,g) =  (ux + mx)*my*uy/4.0;
-    Nx(1,2,g) =  (uy + my)*mx*ux/4.0;
-    Nx(0,3,g) = -(lx - mx)*my*uy/4.0;
-    Nx(1,3,g) = -(uy + my)*mx*lx/4.0;
-    Nx(0,4,g) = -(lx - ux)*my*ly*0.50;
-    Nx(1,4,g) = -(ly - my)*lx*ux*0.50;
-    Nx(0,5,g) =  (ux + mx)*ly*uy*0.50;
-    Nx(1,5,g) =  (ly - uy)*mx*ux*0.50;
-    Nx(0,6,g) =  (lx - ux)*my*uy*0.50;
-    Nx(1,6,g) =  (uy + my)*lx*ux*0.50;
-    Nx(0,7,g) = -(lx - mx)*ly*uy*0.50;
-    Nx(1,7,g) = -(ly - uy)*mx*lx*0.50;
-    Nx(0,8,g) =  (lx - ux)*ly*uy;
-    Nx(1,8,g) =  (ly - uy)*lx*ux;
-    }
-  },
-
-  {ElementType::LIN1, [](int g, faceType& face) -> void 
-    {
-    face.N(0,g) = 0.5 * (1.0 - face.xi(0,g));
-    face.N(1,g) = 0.5 * (1.0 + face.xi(0,g));
-
-    face.Nx(0,0,g) = -0.5;
-    face.Nx(0,1,g) =  0.5;
-    }
-  },
-
-  {ElementType::LIN2, [](int g, faceType& face) -> void
-    {
-    auto& xi = face.xi;
-    auto& N = face.N;
-    N(0,g) = -xi(0,g)*(1.0 - xi(0,g))*0.50;
-    N(1,g) =  xi(0,g)*(1.0 + xi(0,g))*0.50;
-    N(2,g) = (1.0 - xi(0,g))*(1.0 + xi(0,g));
-
-    auto& Nx = face.Nx;
-    Nx(0,0,g) = -0.50 + xi(0,g);
-    Nx(0,1,g) =  0.50 + xi(0,g);
-    Nx(0,2,g) = -2.0*xi(0,g);
-    }
-  },
-
-  {ElementType::QUD4, [](int g, faceType& face) -> void {
-    auto& xi = face.xi;
-    double lx = 1.0 - xi(0,g);
-    double ly = 1.0 - xi(1,g);
-    double ux = 1.0 + xi(0,g);
-    double uy = 1.0 + xi(1,g);
-
-    auto& N =face.N;
-    N(0,g) = lx*ly / 4.0;
-    N(1,g) = ux*ly / 4.0;
-    N(2,g) = ux*uy / 4.0;
-    N(3,g) = lx*uy / 4.0;
-
-    auto& Nx = face.Nx;
-    Nx(0,0,g) = -ly / 4.0;
-    Nx(1,0,g) = -lx / 4.0;
-    Nx(0,1,g) =  ly / 4.0;
-    Nx(1,1,g) = -ux / 4.0;
-    Nx(0,2,g) =  uy / 4.0;
-    Nx(1,2,g) =  ux / 4.0;
-    Nx(0,3,g) = -uy / 4.0;
-    Nx(1,3,g) =  lx / 4.0;
-    }
-  },
-
-  {ElementType::TRI3, [](int g, faceType& face) -> void 
-    {
-    face.N(0,g) = face.xi(0,g);
-    face.N(1,g) = face.xi(1,g);
-    face.N(2,g) = 1.0 - face.xi(0,g) - face.xi(1,g);
-
-    face.Nx(0,0,g) = 1.0;
-    face.Nx(1,0,g) = 0.0;
-
-    face.Nx(0,1,g) = 0.0;
-    face.Nx(1,1,g) = 1.0;
-
-    face.Nx(0,2,g) = -1.0;
-    face.Nx(1,2,g) = -1.0;
-    }
-  },
-
-  {ElementType::TRI6, [](int g, faceType& face) -> void
-    {
-    auto& xi = face.xi;
-    auto& N = face.N;
-
-    double s = 1.0 - xi(0,g) - xi(1,g);
-    N(0,g) = xi(0,g)*( 2.0*xi(0,g) - 1.0 );
-    N(1,g) = xi(1,g)*( 2.0*xi(1,g) - 1.0 );
-    N(2,g) = s    *( 2.0*s     - 1.0 );
-    N(3,g) = 4.0*xi(0,g)*xi(1,g);
-    N(4,g) = 4.0*xi(1,g)*s;
-    N(5,g) = 4.0*xi(0,g)*s;
-
-    auto& Nxi = face.Nx;
-    Nxi(0,0,g) =  4.0*xi(0,g) - 1.0;
-    Nxi(1,0,g) =  0.0;
-
-    Nxi(0,1,g) =  0.0;
-    Nxi(1,1,g) =  4.0*xi(1,g) - 1.0;
-
-    Nxi(0,2,g) =  1.0 - 4.0*s;
-    Nxi(1,2,g) =  1.0 - 4.0*s;
-
-    Nxi(0,3,g) =  4.0*xi(1,g);
-    Nxi(1,3,g) =  4.0*xi(0,g);
-
-    Nxi(0,4,g) = -4.0*xi(1,g);
-    Nxi(1,4,g) =  4.0*( s - xi(1,g) );
-
-    Nxi(0,5,g) =  4.0*( s - xi(0,g) );
-    Nxi(1,5,g) = -4.0*xi(0,g);
-    }
-  },
-
-
-};
diff --git a/Code/Source/solver/nn_elem_gnnxx.h b/Code/Source/solver/nn_elem_gnnxx.h
deleted file mode 100644
index 7b40a783b..000000000
--- a/Code/Source/solver/nn_elem_gnnxx.h
+++ /dev/null
@@ -1,139 +0,0 @@
-// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
-// SPDX-License-Identifier: BSD-3-Clause
-
-/// @brief Define a map type used to compute 2nd direivatives of element shape function data.
-///
-/// Replicates 'SUBROUTINE GETGNNxx(insd, ind2, eType, eNoN, xi, Nxx)'
-//
-static double fp = 4.0;
-static double fn = -4.0;
-static double en = -8.0;
-static double ze =  0.0;
-
-using GetElement2ndDerivMapType = std::map<ElementType, std::function<void(const int, const int, const int, 
-    const int, const Array<double>&, Array3<double>&)>>;
-
-GetElement2ndDerivMapType get_element_2nd_derivs = {
-
-  {ElementType::QUD8, [](const int insd, const int ind2, const int eNoN, const int g, const Array<double>& xi, 
-       Array3<double>& Nxx) -> void {
-
-    double lx = 1.0 - xi(0);
-    double ly = 1.0 - xi(1);
-    double ux = 1.0 + xi(0);
-    double uy = 1.0 + xi(1);
-    double mx = xi(0);
-    double my = xi(1);
-
-    Nxx(0,0,g) =  ly*0.50;
-    Nxx(1,0,g) =  lx*0.50;
-    Nxx(2,0,g) =  (lx+lx+ly+ly-3.0)/4.0;
-
-    Nxx(0,1,g) =  ly*0.50;
-    Nxx(1,1,g) =  ux*0.50;
-    Nxx(2,1,g) = -(ux+ux+ly+ly-3.0)/4.0;
-
-    Nxx(0,2,g) =  uy*0.50;
-    Nxx(1,2,g) =  ux*0.50;
-    Nxx(2,3,g) =  (ux+ux+uy+uy-3.0)/4.0;
-
-    Nxx(0,3,g) =  uy*0.50;
-    Nxx(1,3,g) =  lx*0.50;
-    Nxx(2,3,g) = -(lx+lx+uy+uy-3.0)/4.0;
-
-    Nxx(0,4,g) = -ly;
-    Nxx(1,4,g) =  0.0;
-    Nxx(2,4,g) =  mx;
-
-    Nxx(0,5,g) =  0.0;
-    Nxx(1,5,g) = -ux;
-    Nxx(2,5,g) = -my;
-
-    Nxx(0,6,g) = -uy;
-    Nxx(1,6,g) =  0.0;
-    Nxx(2,6,g) = -mx;
-
-    Nxx(0,7,g) =  0.0;
-    Nxx(1,7,g) = -lx;
-    Nxx(2,7,g) =  my;
-    }
-  },
-
-  {ElementType::QUD9, [](const int insd, const int ind2, const int eNoN, const int g, const Array<double>& xi, 
-       Array3<double>& Nxx) -> void { 
-
-    double lx = 1.0 - xi(0,g);
-    double ly = 1.0 - xi(1,g);
-    double ux = 1.0 + xi(0,g);
-    double uy = 1.0 + xi(1,g);
-    double mx = xi(0,g);
-    double my = xi(1,g);
-
-    Nxx(0,0,g) = -ly*my*0.5;
-    Nxx(1,0,g) = -lx*mx*0.5;
-    Nxx(2,0,g) =  (lx-mx)*(ly-my)/4.0;
-
-    Nxx(0,1,g) = -ly*my*0.5;
-    Nxx(1,1,g) =  ux*mx*0.5;
-    Nxx(2,1,g) = -(ux+mx)*(ly-my)/4.0;
-
-    Nxx(0,2,g) =  uy*my*0.5;
-    Nxx(1,2,g) =  ux*mx*0.5;
-    Nxx(2,2,g) =  (ux+mx)*(uy+my)/4.0;
-
-    Nxx(0,3,g) =  uy*my*0.5;
-    Nxx(1,3,g) = -lx*mx*0.5;
-    Nxx(2,3,g) = -(lx-mx)*(uy+my)/4.0;
-
-    Nxx(0,4,g) =  ly*my;
-    Nxx(1,4,g) =  lx*ux;
-    Nxx(2,4,g) =  mx*(ly-my);
-
-    Nxx(0,5,g) =  ly*uy;
-    Nxx(1,5,g) = -ux*mx;
-    Nxx(2,5,g) = -(ux+mx)*my;
-
-    Nxx(0,6,g) = -uy*my;
-    Nxx(1,6,g) =  lx*ux;
-    Nxx(2,6,g) = -mx*(uy+my);
-
-    Nxx(0,7,g) =  ly*uy;
-    Nxx(1,7,g) =  lx*mx;
-    Nxx(2,7,g) =  (lx-mx)*my;
-
-    Nxx(0,8,g) = -ly*uy*2.0;
-    Nxx(1,8,g) = -lx*ux*2.0;
-    Nxx(2,8,g) =  mx*my*4.0;
-    }
-  },
-
-  {ElementType::TET10, [](const int insd, const int ind2, const int eNoN, const int g, const Array<double>& xi, 
-       Array3<double>& Nxx) -> void { 
-    Nxx.set_row(0, g, {fp, ze, ze, ze, ze, ze});
-    Nxx.set_row(1, g, {ze, fp, ze, ze, ze, ze});
-    Nxx.set_row(2, g, {ze, ze, fp, ze, ze, ze});
-    Nxx.set_row(3, g, {fp, fp, fp, fp, fp, fp});
-    Nxx.set_row(4, g, {ze, ze, ze, fp, ze, ze});
-    Nxx.set_row(5, g, {ze, ze, ze, ze, fp, ze});
-    Nxx.set_row(6, g, {ze, ze, ze, ze, ze, fp});
-    Nxx.set_row(7, g, {en, ze, ze, fn, ze, fn});
-    Nxx.set_row(8, g, {ze, en, ze, fn, fn, ze});
-    Nxx.set_row(9, g, {ze, ze, en, ze, fn, fn});
-    }
-  },
-
-  {ElementType::TRI6, [](const int insd, const int ind2, const int eNoN, const int g, const Array<double>& xi,
-       Array3<double>& Nxx) -> void {
-
-    Nxx.set_row(0, g, {fp, ze, ze});
-    Nxx.set_row(1, g, {ze, fp, ze});
-    Nxx.set_row(2, g, {fp, fp, fp});
-    Nxx.set_row(3, g, {ze, ze, fp});
-    Nxx.set_row(4, g, {ze, en, fn});
-    Nxx.set_row(5, g, {en, ze, fn});
-    }
-  },
-
-};
-
-

From 36046f8a2f0baa006dcd2fa896bd66498c7032b0 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Mon, 8 Jun 2026 14:25:44 -0700
Subject: [PATCH 10/91] fixing the licensing and copyright comments

---
 Code/Source/solver/FE/Basis/BasisExceptions.h |  8 ++---
 Code/Source/solver/FE/Basis/BasisFactory.cpp  |  8 ++---
 Code/Source/solver/FE/Basis/BasisFactory.h    |  8 ++---
 Code/Source/solver/FE/Basis/BasisFunction.cpp |  8 ++---
 Code/Source/solver/FE/Basis/BasisFunction.h   |  8 ++---
 Code/Source/solver/FE/Basis/BasisTraits.h     |  8 ++---
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp |  8 ++---
 Code/Source/solver/FE/Basis/LagrangeBasis.h   |  8 ++---
 .../FE/Basis/NodeOrderingConventions.cpp      |  8 ++---
 .../solver/FE/Basis/NodeOrderingConventions.h |  8 ++---
 .../solver/FE/Basis/SerendipityBasis.cpp      |  8 ++---
 .../Source/solver/FE/Basis/SerendipityBasis.h |  8 ++---
 Code/Source/solver/FE/Common/Types.h          | 31 ++-----------------
 .../solver/FE/Math/DenseLinearAlgebra.cpp     |  8 ++---
 .../solver/FE/Math/DenseLinearAlgebra.h       |  8 ++---
 .../solver/FE/Math/DenseTransformKernels.h    |  8 ++---
 Code/Source/solver/FE/Math/Matrix.h           |  3 ++
 Code/Source/solver/FE/Math/MatrixExpr.h       |  3 ++
 Code/Source/solver/FE/Math/Vector.h           |  3 ++
 Code/Source/solver/FE/Math/VectorExpr.h       |  3 ++
 20 files changed, 44 insertions(+), 119 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisExceptions.h b/Code/Source/solver/FE/Basis/BasisExceptions.h
index 8ee92a3dd..c1af17049 100644
--- a/Code/Source/solver/FE/Basis/BasisExceptions.h
+++ b/Code/Source/solver/FE/Basis/BasisExceptions.h
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #ifndef SVMP_FE_BASIS_BASISEXCEPTIONS_H
 #define SVMP_FE_BASIS_BASISEXCEPTIONS_H
diff --git a/Code/Source/solver/FE/Basis/BasisFactory.cpp b/Code/Source/solver/FE/Basis/BasisFactory.cpp
index 9f0867959..bc01be0ed 100644
--- a/Code/Source/solver/FE/Basis/BasisFactory.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFactory.cpp
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #include "BasisFactory.h"
 
diff --git a/Code/Source/solver/FE/Basis/BasisFactory.h b/Code/Source/solver/FE/Basis/BasisFactory.h
index c937dd4a0..b188b3aa2 100644
--- a/Code/Source/solver/FE/Basis/BasisFactory.h
+++ b/Code/Source/solver/FE/Basis/BasisFactory.h
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #ifndef SVMP_FE_BASIS_BASISFACTORY_H
 #define SVMP_FE_BASIS_BASISFACTORY_H
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.cpp b/Code/Source/solver/FE/Basis/BasisFunction.cpp
index 578c46c88..3d95671f4 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFunction.cpp
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #include "BasisFunction.h"
 
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index bf6ac5de7..5ad65f35d 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #ifndef SVMP_FE_BASIS_BASISFUNCTION_H
 #define SVMP_FE_BASIS_BASISFUNCTION_H
diff --git a/Code/Source/solver/FE/Basis/BasisTraits.h b/Code/Source/solver/FE/Basis/BasisTraits.h
index d97b59f1f..eca5c1c69 100644
--- a/Code/Source/solver/FE/Basis/BasisTraits.h
+++ b/Code/Source/solver/FE/Basis/BasisTraits.h
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #ifndef SVMP_FE_BASIS_BASISTRAITS_H
 #define SVMP_FE_BASIS_BASISTRAITS_H
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index 372209722..ece2d9cb5 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #include "LagrangeBasis.h"
 #include "NodeOrderingConventions.h"
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.h b/Code/Source/solver/FE/Basis/LagrangeBasis.h
index dae149872..43304a263 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.h
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.h
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #ifndef SVMP_FE_BASIS_LAGRANGEBASIS_H
 #define SVMP_FE_BASIS_LAGRANGEBASIS_H
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
index ae3ea8ed3..76662abe1 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #include "NodeOrderingConventions.h"
 #include "BasisExceptions.h"
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
index 8a43cc4e3..4b11cca32 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #ifndef SVMP_FE_BASIS_NODEORDERINGCONVENTIONS_H
 #define SVMP_FE_BASIS_NODEORDERINGCONVENTIONS_H
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index 237f8c2ce..d551419a8 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #include "SerendipityBasis.h"
 #include "LagrangeBasis.h"
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.h b/Code/Source/solver/FE/Basis/SerendipityBasis.h
index 10e426164..e0289f82d 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.h
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.h
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #ifndef SVMP_FE_BASIS_SERENDIPITYBASIS_H
 #define SVMP_FE_BASIS_SERENDIPITYBASIS_H
diff --git a/Code/Source/solver/FE/Common/Types.h b/Code/Source/solver/FE/Common/Types.h
index bb3f23bca..e3d5a46e9 100644
--- a/Code/Source/solver/FE/Common/Types.h
+++ b/Code/Source/solver/FE/Common/Types.h
@@ -1,32 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See Copyright-SimVascular.txt for additional details.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject
- * to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
- * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
- * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
- * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #ifndef SVMP_FE_TYPES_H
 #define SVMP_FE_TYPES_H
diff --git a/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp b/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
index 7d909fa0c..8be9a7560 100644
--- a/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
+++ b/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #include "DenseLinearAlgebra.h"
 
diff --git a/Code/Source/solver/FE/Math/DenseLinearAlgebra.h b/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
index 7684439b5..6c81755f4 100644
--- a/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
+++ b/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #ifndef SVMP_FE_MATH_DENSELINEARALGEBRA_H
 #define SVMP_FE_MATH_DENSELINEARALGEBRA_H
diff --git a/Code/Source/solver/FE/Math/DenseTransformKernels.h b/Code/Source/solver/FE/Math/DenseTransformKernels.h
index 8bf83ec0b..50f1002de 100644
--- a/Code/Source/solver/FE/Math/DenseTransformKernels.h
+++ b/Code/Source/solver/FE/Math/DenseTransformKernels.h
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #ifndef SVMP_FE_MATH_DENSETRANSFORMKERNELS_H
 #define SVMP_FE_MATH_DENSETRANSFORMKERNELS_H
diff --git a/Code/Source/solver/FE/Math/Matrix.h b/Code/Source/solver/FE/Math/Matrix.h
index 8cb28e5d5..3f3a9d9b6 100644
--- a/Code/Source/solver/FE/Math/Matrix.h
+++ b/Code/Source/solver/FE/Math/Matrix.h
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
+
 #ifndef SVMP_FE_MATH_MATRIX_H
 #define SVMP_FE_MATH_MATRIX_H
 
diff --git a/Code/Source/solver/FE/Math/MatrixExpr.h b/Code/Source/solver/FE/Math/MatrixExpr.h
index 13010bddf..288bbc5ca 100644
--- a/Code/Source/solver/FE/Math/MatrixExpr.h
+++ b/Code/Source/solver/FE/Math/MatrixExpr.h
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
+
 #ifndef SVMP_FE_MATH_MATRIX_EXPR_H
 #define SVMP_FE_MATH_MATRIX_EXPR_H
 
diff --git a/Code/Source/solver/FE/Math/Vector.h b/Code/Source/solver/FE/Math/Vector.h
index 777f9945b..a1214f9aa 100644
--- a/Code/Source/solver/FE/Math/Vector.h
+++ b/Code/Source/solver/FE/Math/Vector.h
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
+
 #ifndef SVMP_FE_MATH_VECTOR_H
 #define SVMP_FE_MATH_VECTOR_H
 
diff --git a/Code/Source/solver/FE/Math/VectorExpr.h b/Code/Source/solver/FE/Math/VectorExpr.h
index 178b66b8a..aa712dd63 100644
--- a/Code/Source/solver/FE/Math/VectorExpr.h
+++ b/Code/Source/solver/FE/Math/VectorExpr.h
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
+
 #ifndef SVMP_FE_MATH_VECTOR_EXPR_H
 #define SVMP_FE_MATH_VECTOR_EXPR_H
 

From 3691503eed8da410083633ed831ec7c350bd433f Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Tue, 9 Jun 2026 10:00:33 -0700
Subject: [PATCH 11/91] including doxygen documentation for Basis and Math
 submodules

---
 .github/workflows/documentation.yml           |   2 +-
 Code/Source/solver/FE/Basis/BasisFunction.h   |  86 ++++++++++
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp |  19 ++-
 Code/Source/solver/FE/Basis/LagrangeBasis.h   | 158 ++++++++++++++++++
 .../Source/solver/FE/Basis/SerendipityBasis.h | 116 +++++++++++++
 Code/Source/solver/FE/Math/Matrix.h           |  27 ++-
 Code/Source/solver/FE/Math/Vector.h           |  14 ++
 Documentation/Doxyfile                        |   8 +-
 .../FE/Basis/test_BasisErrorPaths.cpp         |   2 +-
 9 files changed, 423 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml
index db67bbbdb..c1f8a3b5d 100644
--- a/.github/workflows/documentation.yml
+++ b/.github/workflows/documentation.yml
@@ -10,7 +10,7 @@ jobs:
       - uses: actions/checkout@v4
       - name: Build doxygen documentation
         run: |
-          sudo apt install -y doxygen
+          sudo apt install -y doxygen graphviz
           doxygen Documentation/Doxyfile
       - name: Save documentation
         uses: actions/upload-artifact@v4
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index 5ad65f35d..f8f78d7b6 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -12,11 +12,25 @@
 #include <cstddef>
 #include <vector>
 
+/// \defgroup FE FE Library
+/// \brief Finite-element interfaces and utilities used by the solver.
+///
+/// The FE library groups basis functions, math utilities, assembly interfaces,
+/// and related support code that can be built and consumed as a coherent
+/// finite-element component.
+
+/// \defgroup FE_Basis Basis
+/// \ingroup FE
+/// \brief Basis-function interfaces, concrete basis families, and reference-node conventions.
+
 namespace svmp {
 namespace FE {
 namespace basis {
 
+/// \brief Gradient vector type used by basis evaluators.
 using Gradient = math::Vector<Real, 3>;
+
+/// \brief Hessian matrix type used by basis evaluators.
 using Hessian  = math::Matrix<Real, 3, 3>;
 
 [[nodiscard]] inline Hessian make_symmetric_hessian(Real xx,
@@ -71,38 +85,110 @@ inline void add_scaled_hessian(Hessian& target,
     }
 }
 
+/// \brief Abstract interface for finite-element basis-function families.
+/// \ingroup FE_Basis
+///
+/// BasisFunction defines the common query and evaluation API used by solver
+/// code that does not need to know the concrete basis implementation. Derived
+/// classes provide values at minimum and can override analytical gradients,
+/// Hessians, combined evaluation, and flat-buffer output paths.
 class BasisFunction {
 public:
+    /// \brief Destroy a basis function through the abstract interface.
     virtual ~BasisFunction() = default;
 
+    /// \brief Return the concrete basis family.
+    /// \return Basis family identifier.
     virtual BasisType basis_type() const noexcept = 0;
+
+    /// \brief Return the canonical element type represented by this basis.
+    /// \return Element type used for node layout and evaluation.
     virtual ElementType element_type() const noexcept = 0;
+
+    /// \brief Return the reference-space dimension of the basis.
+    /// \return Reference dimension, from zero for points through three for volume elements.
     virtual int dimension() const noexcept = 0;
+
+    /// \brief Return the polynomial order represented by this basis.
+    /// \return Effective polynomial order after any element-family normalization.
     virtual int order() const noexcept = 0;
+
+    /// \brief Return the number of basis functions and reference nodes.
+    /// \return Basis function count.
     virtual std::size_t size() const noexcept = 0;
 
+    /// \brief Evaluate basis function values at a reference coordinate.
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param values Receives one value per basis function.
     virtual void evaluate_values(const math::Vector<Real, 3>& xi,
                                  std::vector<Real>& values) const = 0;
+
+    /// \brief Evaluate basis gradients at a reference coordinate.
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param gradients Receives one three-component gradient per basis function.
+    /// \throws BasisEvaluationException If gradients are not available for the basis.
     virtual void evaluate_gradients(const math::Vector<Real, 3>& xi,
                                     std::vector<Gradient>& gradients) const;
+
+    /// \brief Evaluate basis Hessians at a reference coordinate.
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param hessians Receives one 3-by-3 Hessian per basis function.
+    /// \throws BasisEvaluationException If Hessians are not available for the basis.
     virtual void evaluate_hessians(const math::Vector<Real, 3>& xi,
                                    std::vector<Hessian>& hessians) const;
+
+    /// \brief Evaluate values, gradients, and Hessians together.
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param values Receives one value per basis function.
+    /// \param gradients Receives one three-component gradient per basis function.
+    /// \param hessians Receives one 3-by-3 Hessian per basis function.
     virtual void evaluate_all(const math::Vector<Real, 3>& xi,
                               std::vector<Real>& values,
                               std::vector<Gradient>& gradients,
                               std::vector<Hessian>& hessians) const;
 
+    /// \brief Evaluate basis values into a flat caller-provided buffer.
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param values_out Output buffer with at least size() entries.
     virtual void evaluate_values_to(const math::Vector<Real, 3>& xi,
                                     Real* SVMP_RESTRICT values_out) const;
+
+    /// \brief Evaluate basis gradients into a flat caller-provided buffer.
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param gradients_out Output buffer with node-major layout: node * 3 + component.
     virtual void evaluate_gradients_to(const math::Vector<Real, 3>& xi,
                                        Real* SVMP_RESTRICT gradients_out) const;
+
+    /// \brief Evaluate basis Hessians into a flat caller-provided buffer.
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param hessians_out Output buffer with node-major row-major layout: node * 9 + row * 3 + col.
     virtual void evaluate_hessians_to(const math::Vector<Real, 3>& xi,
                                       Real* SVMP_RESTRICT hessians_out) const;
 
 protected:
+    /// \brief Approximate gradients by centered finite differences of values.
+    ///
+    /// \details This helper exists as a development and fallback utility for
+    /// basis implementations that do not yet provide analytical gradients. It
+    /// is useful for prototyping new basis families and for checking analytical
+    /// derivative formulas in tests. Production element assembly should prefer
+    /// analytical gradients when available because finite differences introduce
+    /// truncation/roundoff sensitivity and require multiple value evaluations
+    /// per reference coordinate.
     void numerical_gradient(const math::Vector<Real, 3>& xi,
                             std::vector<Gradient>& gradients,
                             Real eps = Real(1e-6)) const;
+
+    /// \brief Approximate Hessians by centered finite differences of gradients.
+    ///
+    /// \details This helper exists for the same reason as numerical_gradient:
+    /// it provides a simple reference implementation for prototyping and
+    /// derivative verification when analytical second derivatives are not yet
+    /// implemented. It depends on evaluate_gradients(), so it is only available
+    /// for basis implementations that can already provide gradients. Analytical
+    /// Hessians should be used in performance-sensitive solver paths because
+    /// finite-difference Hessians amplify numerical error and require repeated
+    /// gradient evaluations.
     void numerical_hessian(const math::Vector<Real, 3>& xi,
                            std::vector<Hessian>& hessians,
                            Real eps = Real(1e-5)) const;
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index ece2d9cb5..d777447cb 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -16,6 +16,7 @@ namespace {
 
 using Vec3 = math::Vector<Real, 3>;
 
+// Return the equispaced 1D reference coordinate in [-1, 1].
 inline constexpr Real equispaced_pm_one_coord(int i, int order) {
     if (order <= 0) {
         return Real(0);
@@ -40,6 +41,7 @@ struct NormalizedLagrangeRequest {
     int order;
 };
 
+// Validate and return the supported basis topology for a Lagrange element type.
 BasisTopology supported_lagrange_topology(ElementType type) {
     const BasisTopology top = topology(type);
     if (top == BasisTopology::Unknown) {
@@ -49,6 +51,7 @@ BasisTopology supported_lagrange_topology(ElementType type) {
     return top;
 }
 
+// Normalize named higher-order element requests to base Lagrange topologies.
 NormalizedLagrangeRequest normalize_lagrange_request(ElementType element_type, int order) {
     switch (element_type) {
         case ElementType::Line3:
@@ -79,13 +82,14 @@ NormalizedLagrangeRequest normalize_lagrange_request(ElementType element_type, i
         case ElementType::Pyramid13:
         case ElementType::Pyramid14:
             throw BasisElementCompatibilityException(
-                "LagrangeBasis: pyramid support has been removed from the current solver basis scope",
+                "LagrangeBasis: pyramid support is not within the current solver basis scope",
                 __FILE__, __LINE__, __func__);
         default:
             return {element_type, order};
     }
 }
 
+// Convert a coordinate on [-1, 1] to an equispaced axis node index.
 std::size_t axis_index_pm_one(Real coord, int order) {
     if (order <= 0) {
         return 0u;
@@ -94,6 +98,7 @@ std::size_t axis_index_pm_one(Real coord, int order) {
     return static_cast<std::size_t>(std::llround(scaled));
 }
 
+// Convert a simplex barycentric coordinate to a lattice index.
 int simplex_lattice_index(Real value, int order) {
     if (order <= 0) {
         return 0;
@@ -101,6 +106,7 @@ int simplex_lattice_index(Real value, int order) {
     return static_cast<int>(std::llround(value * Real(order)));
 }
 
+// Compute simplex interpolation exponents from a reference node.
 LagrangeBasis::SimplexExponent simplex_exponent_from_point(const Vec3& p,
                                                            BasisTopology top,
                                                            int order) {
@@ -121,6 +127,7 @@ LagrangeBasis::SimplexExponent simplex_exponent_from_point(const Vec3& p,
     return e;
 }
 
+// Evaluate 1D Lagrange polynomials and derivatives at a point.
 void evaluate_1d_lagrange(Real x, const std::vector<Real>& nodes, AxisEval& out) {
     const std::size_t n = nodes.size();
     out.value.assign(n, Real(0));
@@ -185,6 +192,7 @@ void evaluate_1d_lagrange(Real x, const std::vector<Real>& nodes, AxisEval& out)
     }
 }
 
+// Evaluate one barycentric polynomial factor and derivatives.
 std::array<Real, 3> simplex_factor(int alpha, Real lambda, int order) {
     Real value = Real(1);
     Real first = Real(0);
@@ -204,6 +212,7 @@ std::array<Real, 3> simplex_factor(int alpha, Real lambda, int order) {
     return {value, first, second};
 }
 
+// Evaluate simplex Lagrange basis functions and derivatives.
 void evaluate_simplex(const Vec3& xi,
                       BasisTopology top,
                       int order,
@@ -291,6 +300,7 @@ void evaluate_simplex(const Vec3& xi,
     }
 }
 
+// Store a gradient in the flat buffer layout used by fast evaluators.
 void store_gradient(const Gradient& gradient, Real* dst) {
     dst[0] = gradient[0];
     dst[1] = gradient[1];
@@ -314,6 +324,7 @@ LagrangeBasis::LagrangeBasis(ElementType type, int order)
     init_nodes();
 }
 
+// Initialize equispaced 1D interpolation nodes for tensor-product axes.
 void LagrangeBasis::init_equispaced_1d_nodes() {
     nodes_1d_.resize(static_cast<std::size_t>(order_ + 1));
     for (int i = 0; i <= order_; ++i) {
@@ -322,6 +333,7 @@ void LagrangeBasis::init_equispaced_1d_nodes() {
     }
 }
 
+// Initialize reference nodes and topology-specific lookup data.
 void LagrangeBasis::init_nodes() {
     nodes_.clear();
     nodes_1d_.clear();
@@ -357,10 +369,12 @@ void LagrangeBasis::init_nodes() {
                                              __FILE__, __LINE__, __func__);
 }
 
+// Build the single reference node for a point basis.
 void LagrangeBasis::build_point_nodes() {
     nodes_.push_back(Vec3{Real(0), Real(0), Real(0)});
 }
 
+// Build nodes and axis indices for tensor-product elements.
 void LagrangeBasis::build_tensor_product_nodes(int dimensions) {
     init_equispaced_1d_nodes();
     nodes_ = ReferenceNodeLayout::get_lagrange_node_coords(element_type_, order_);
@@ -378,6 +392,7 @@ void LagrangeBasis::build_tensor_product_nodes(int dimensions) {
     }
 }
 
+// Build nodes and barycentric exponents for simplex elements.
 void LagrangeBasis::build_simplex_nodes() {
     nodes_ = ReferenceNodeLayout::get_lagrange_node_coords(element_type_, order_);
     simplex_exponents_.reserve(nodes_.size());
@@ -386,6 +401,7 @@ void LagrangeBasis::build_simplex_nodes() {
     }
 }
 
+// Build nodes and mixed triangle-axis lookup data for wedge elements.
 void LagrangeBasis::build_wedge_nodes() {
     init_equispaced_1d_nodes();
     nodes_ = ReferenceNodeLayout::get_lagrange_node_coords(element_type_, order_);
@@ -412,6 +428,7 @@ void LagrangeBasis::build_wedge_nodes() {
     }
 }
 
+// Evaluate requested basis quantities into caller-provided flat buffers.
 void LagrangeBasis::evaluate_all_to(const Vec3& xi,
                                     Real* SVMP_RESTRICT values_out,
                                     Real* SVMP_RESTRICT gradients_out,
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.h b/Code/Source/solver/FE/Basis/LagrangeBasis.h
index 43304a263..3bb1a5e74 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.h
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.h
@@ -14,37 +14,193 @@ namespace svmp {
 namespace FE {
 namespace basis {
 
+/// \defgroup FE_LagrangeBasis LagrangeBasis
+/// \ingroup FE_Basis
+/// \brief Construction and evaluation API for nodal Lagrange finite-element bases.
+///
+/// \details This group documents the complete nodal Lagrange basis evaluator
+/// used by the FE library. The implementation covers tensor-product,
+/// simplex, and wedge reference topologies with exact analytical first and
+/// second derivatives in reference coordinates.
+/// @{
+
+/// \brief Nodal Lagrange basis on supported reference finite elements.
+///
+/// \details LagrangeBasis represents the nodal interpolation basis associated
+/// with an equispaced reference-node lattice. It supports point, line,
+/// quadrilateral, hexahedron, triangle, tetrahedron, and wedge reference
+/// elements. Named complete quadratic elements such as Line3, Triangle6,
+/// Quad9, Tetra10, Hex27, and Wedge18 are normalized to their canonical
+/// linear topology plus effective order 2.
+///
+/// Tensor-product elements use the one-dimensional nodal polynomials
+/// \f[
+///   l_i(x) = \prod_{j \ne i} \frac{x - x_j}{x_i - x_j}
+/// \f]
+/// on equispaced coordinates in \f$[-1, 1]\f$. Multi-dimensional basis
+/// functions are products of the active axis polynomials, for example
+/// \f$N_{ijk}(r,s,t) = l_i(r)l_j(s)l_k(t)\f$ on a hexahedron.
+///
+/// Simplex elements use barycentric coordinates and integer lattice
+/// exponents. For a node with exponent tuple \f$\alpha\f$, where
+/// \f$\sum_a \alpha_a = p\f$, the basis is assembled from scaled
+/// falling-factorial factors,
+/// \f[
+///   N_\alpha(\lambda) =
+///   \prod_a \prod_{m=0}^{\alpha_a-1}
+///   \frac{p\lambda_a - m}{m + 1}.
+/// \f]
+/// Gradients and Hessians are evaluated analytically by differentiating these
+/// factors and applying the barycentric-coordinate chain rule.
+///
+/// Wedge elements are treated as a tensor product between a triangle simplex
+/// basis and a one-dimensional through-axis basis:
+/// \f$N_{a k}(r,s,t) = T_a(r,s)l_k(t)\f$.
+///
+/// The vector-returning evaluators are convenient API wrappers. The `*_to`
+/// methods write to caller-provided flat buffers and are intended for assembly
+/// paths that avoid temporary allocations.
 class LagrangeBasis : public BasisFunction {
 public:
+    /// \brief Axis-index tuple for tensor-product reference nodes.
     using TensorNodeIndex = std::array<std::size_t, 3>;
+
+    /// \brief Barycentric exponent tuple for simplex reference nodes.
     using SimplexExponent = std::array<int, 4>;
+
+    /// \brief Triangle-node and axis-node tuple for wedge reference nodes.
     using WedgeNodeIndex = std::array<std::size_t, 2>;
 
+    /// \brief Construct a Lagrange basis for an element type and polynomial order.
+    ///
+    /// \details The constructor normalizes complete higher-order aliases to the
+    /// canonical topology and effective polynomial order, builds the reference
+    /// node coordinates, and precomputes topology-specific lookup data used by
+    /// evaluation. Tensor-product bases store per-axis node indices, simplex
+    /// bases store barycentric exponent tuples, and wedge bases store the
+    /// triangle-node/axis-node decomposition.
+    ///
+    /// \param type Element type used to determine topology and reference-node layout.
+    /// \param order Requested polynomial order.
+    /// \throws BasisConfigurationException If the effective order is negative.
+    /// \throws BasisElementCompatibilityException If the element type is unsupported.
     LagrangeBasis(ElementType type, int order);
 
+    /// \copydoc BasisFunction::basis_type()
     BasisType basis_type() const noexcept override { return BasisType::Lagrange; }
+
+    /// \copydoc BasisFunction::element_type()
     ElementType element_type() const noexcept override { return element_type_; }
+
+    /// \copydoc BasisFunction::dimension()
     int dimension() const noexcept override { return dimension_; }
+
+    /// \copydoc BasisFunction::order()
     int order() const noexcept override { return order_; }
+
+    /// \copydoc BasisFunction::size()
     std::size_t size() const noexcept override { return nodes_.size(); }
 
+    /// \brief Return the reference interpolation nodes in basis ordering.
+    ///
+    /// \details The returned node order matches the basis-function order used
+    /// by all evaluators. Coordinates are reference-element coordinates:
+    /// tensor-product axes use \f$[-1,1]\f$, triangles and tetrahedra use the
+    /// repository's simplex reference coordinates, and wedges combine triangle
+    /// reference coordinates with a \f$[-1,1]\f$ through-axis coordinate.
+    ///
+    /// \return Reference node coordinates, one per basis function.
     const std::vector<math::Vector<Real, 3>>& nodes() const noexcept { return nodes_; }
 
+    /// \brief Evaluate Lagrange basis function values at a reference coordinate.
+    ///
+    /// \details Values satisfy the nodal interpolation property
+    /// \f$N_i(x_j)=\delta_{ij}\f$ at the basis nodes. Tensor-product values are
+    /// products of one-dimensional Lagrange polynomials. Simplex values are
+    /// products of barycentric falling-factorial factors. Wedge values are
+    /// products of triangle simplex values and through-axis Lagrange values.
+    ///
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param values Receives one value per basis function.
     void evaluate_values(const math::Vector<Real, 3>& xi,
                          std::vector<Real>& values) const final;
+
+    /// \brief Evaluate analytical Lagrange basis gradients at a reference coordinate.
+    ///
+    /// \details Gradients are derivatives with respect to reference
+    /// coordinates, not physical coordinates. Tensor-product gradients apply
+    /// the product rule to the active axis polynomials. Simplex gradients
+    /// differentiate the barycentric factors and multiply by the constant
+    /// gradients of the barycentric coordinates. Wedge gradients combine the
+    /// triangle gradient in the first two components with the through-axis
+    /// derivative in the third component.
+    ///
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param gradients Receives one three-component gradient per basis function.
     void evaluate_gradients(const math::Vector<Real, 3>& xi,
                             std::vector<Gradient>& gradients) const final;
+
+    /// \brief Evaluate analytical Lagrange basis Hessians at a reference coordinate.
+    ///
+    /// \details Hessians are second derivatives in reference coordinates and
+    /// are stored as 3-by-3 matrices. Tensor-product Hessians contain pure
+    /// second axis derivatives on the diagonal and mixed product-rule terms
+    /// off diagonal. Simplex Hessians are assembled from first and second
+    /// derivatives of the barycentric factors. Wedge Hessians contain triangle
+    /// Hessian terms, through-axis second derivatives, and mixed
+    /// triangle/through-axis derivative products.
+    ///
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param hessians Receives one 3-by-3 Hessian per basis function.
     void evaluate_hessians(const math::Vector<Real, 3>& xi,
                            std::vector<Hessian>& hessians) const final;
+
+    /// \brief Evaluate Lagrange values, gradients, and Hessians together.
+    ///
+    /// \details This is the allocation-friendly vector API for callers that
+    /// need all basis quantities at the same quadrature point. The underlying
+    /// evaluator computes only topology-local polynomial data once and then
+    /// fills all requested outputs.
+    ///
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param values Receives one value per basis function.
+    /// \param gradients Receives one three-component gradient per basis function.
+    /// \param hessians Receives one 3-by-3 Hessian per basis function.
     void evaluate_all(const math::Vector<Real, 3>& xi,
                       std::vector<Real>& values,
                       std::vector<Gradient>& gradients,
                       std::vector<Hessian>& hessians) const final;
 
+    /// \brief Evaluate Lagrange basis values into a flat caller-provided buffer.
+    ///
+    /// \details This is the low-allocation API intended for element assembly
+    /// loops. The buffer is filled in basis-node order and no vector resizing
+    /// is performed.
+    ///
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param values_out Output buffer with at least size() entries.
     void evaluate_values_to(const math::Vector<Real, 3>& xi,
                             Real* SVMP_RESTRICT values_out) const final;
+
+    /// \brief Evaluate Lagrange basis gradients into a flat caller-provided buffer.
+    ///
+    /// \details Gradients are written in node-major order with three
+    /// reference-coordinate components per node. For node \f$i\f$ and component
+    /// \f$c\f$, the entry is `gradients_out[i * 3 + c]`.
+    ///
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param gradients_out Output buffer with node-major layout: node * 3 + component.
     void evaluate_gradients_to(const math::Vector<Real, 3>& xi,
                                Real* SVMP_RESTRICT gradients_out) const final;
+
+    /// \brief Evaluate Lagrange basis Hessians into a flat caller-provided buffer.
+    ///
+    /// \details Hessians are written in node-major row-major order. For node
+    /// \f$i\f$ and Hessian component \f$(r,c)\f$, the entry is
+    /// `hessians_out[i * 9 + r * 3 + c]`.
+    ///
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param hessians_out Output buffer with node-major row-major layout: node * 9 + row * 3 + col.
     void evaluate_hessians_to(const math::Vector<Real, 3>& xi,
                               Real* SVMP_RESTRICT hessians_out) const final;
 
@@ -73,6 +229,8 @@ class LagrangeBasis : public BasisFunction {
                          Real* SVMP_RESTRICT hessians_out) const;
 };
 
+/// @}
+
 } // namespace basis
 } // namespace FE
 } // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.h b/Code/Source/solver/FE/Basis/SerendipityBasis.h
index e0289f82d..fc0b897cf 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.h
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.h
@@ -17,23 +17,137 @@ namespace svmp {
 namespace FE {
 namespace basis {
 
+/// \defgroup FE_SerendipityBasis SerendipityBasis
+/// \ingroup FE_Basis
+/// \brief Construction and evaluation API for reduced serendipity finite-element bases.
+///
+/// \details This group documents reduced degree-of-freedom basis families that
+/// preserve nodal interpolation on supported element boundaries while omitting
+/// selected interior tensor-product modes. These bases are used for standard
+/// serendipity elements and geometry-mode mappings that intentionally use a
+/// lower-order interpolation space.
+/// @{
+
+/// \brief Reduced-degree-of-freedom serendipity basis on supported reference elements.
+///
+/// \details SerendipityBasis implements nodal bases for Quad4/Quad8,
+/// Hex8/Hex20, and Wedge15. Compared with a complete tensor-product Lagrange
+/// basis of the same nominal order, a serendipity basis removes selected
+/// interior modes while retaining nodal interpolation on the supported node
+/// layout.
+///
+/// Quadrilateral serendipity bases are built from monomials
+/// \f$x^{a_x}y^{a_y}\f$ whose superlinear degree is at most the requested
+/// order. In this implementation the superlinear degree is
+/// \f[
+///   sldeg(x^{a_x}y^{a_y}) =
+///   \begin{cases} a_x, & a_x > 1 \\ 0, & a_x \le 1 \end{cases}
+///   +
+///   \begin{cases} a_y, & a_y > 1 \\ 0, & a_y \le 1 \end{cases}.
+/// \f]
+/// The nodal basis is recovered by inverting the Vandermonde interpolation
+/// matrix at the selected reference nodes. Values, gradients, and Hessians are
+/// then evaluated by differentiating the monomial vector and applying the
+/// inverse Vandermonde coefficients.
+///
+/// Hex8 uses the standard trilinear corner basis
+/// \f$(1 \pm r)(1 \pm s)(1 \pm t)/8\f$. Hex20 and Wedge15 use tabulated
+/// polynomial coefficient tables over monomial bases; analytical gradients and
+/// Hessians are obtained by differentiating those monomials. Hex20 evaluation
+/// is reordered through ReferenceNodeLayout so the output matches the public
+/// basis ordering.
+///
+/// When `geometry_mode` is enabled for Hex20, the basis uses the trilinear
+/// Hex8 corner functions for geometry mapping and assigns zero contribution to
+/// the quadratic edge nodes. This preserves the public Hex20 node count while
+/// intentionally reducing the geometry interpolation order.
 class SerendipityBasis : public BasisFunction {
 public:
+    /// \brief Construct a serendipity basis for an element type and polynomial order.
+    ///
+    /// \details The constructor selects the topology-specific interpolation
+    /// space, computes the reference node coordinates, and initializes any
+    /// coefficient tables needed for evaluation. Quadrilateral bases build and
+    /// invert a Vandermonde matrix for the selected serendipity monomials.
+    /// Hex20 and Wedge15 use fixed coefficient tables. For hexahedra, only
+    /// linear Hex8 and quadratic Hex20 serendipity spaces are supported. For
+    /// wedges, only quadratic Wedge15 is supported.
+    ///
+    /// \param type Element type used to determine topology and reference-node layout.
+    /// \param order Requested polynomial order.
+    /// \param geometry_mode When true, allow reduced geometry-mapping behavior for supported elements.
+    /// \throws BasisConfigurationException If the requested order or mode is invalid.
+    /// \throws BasisElementCompatibilityException If the element type is unsupported.
     SerendipityBasis(ElementType type, int order, bool geometry_mode = false);
 
+    /// \copydoc BasisFunction::basis_type()
     BasisType basis_type() const noexcept override { return BasisType::Serendipity; }
+
+    /// \copydoc BasisFunction::element_type()
     ElementType element_type() const noexcept override { return element_type_; }
+
+    /// \copydoc BasisFunction::dimension()
     int dimension() const noexcept override { return dimension_; }
+
+    /// \copydoc BasisFunction::order()
     int order() const noexcept override { return order_; }
+
+    /// \copydoc BasisFunction::size()
     std::size_t size() const noexcept override { return size_; }
+
+    /// \brief Return the reference interpolation nodes in basis ordering.
+    ///
+    /// \details Node coordinates are the points at which the serendipity basis
+    /// satisfies the nodal interpolation property. Quadrilateral nodes are
+    /// placed first on the boundary and then, for higher order requests, at the
+    /// selected interior points needed to make the reduced monomial space
+    /// unisolvent. Hexahedral and wedge nodes are taken from
+    /// ReferenceNodeLayout.
+    ///
+    /// \return Reference node coordinates, one per basis function.
     const std::vector<math::Vector<Real, 3>>& nodes() const noexcept { return nodes_; }
 
+    /// \brief Evaluate serendipity basis function values at a reference coordinate.
+    ///
+    /// \details For quadrilateral bases, this evaluates the serendipity
+    /// monomial vector and multiplies by the inverse Vandermonde matrix to
+    /// obtain nodal shape-function values. For Hex8, values are the standard
+    /// trilinear corner products. For Hex20 and Wedge15, values are evaluated
+    /// from the stored polynomial coefficient tables. In Hex20 geometry mode,
+    /// only the first eight corner values are nonzero and they match Hex8.
+    ///
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param values Receives one value per basis function.
     void evaluate_values(const math::Vector<Real, 3>& xi,
                          std::vector<Real>& values) const override;
 
+    /// \brief Evaluate analytical serendipity basis gradients at a reference coordinate.
+    ///
+    /// \details Gradients are derivatives with respect to reference
+    /// coordinates. Quadrilateral gradients differentiate the monomial vector
+    /// before applying the inverse Vandermonde coefficients. Hex8 gradients are
+    /// direct derivatives of the trilinear corner products. Hex20 and Wedge15
+    /// gradients are computed by differentiating the tabulated monomial
+    /// expansions. In Hex20 geometry mode, edge-node gradients are zero and the
+    /// corner gradients match Hex8.
+    ///
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param gradients Receives one three-component gradient per basis function.
     void evaluate_gradients(const math::Vector<Real, 3>& xi,
                             std::vector<Gradient>& gradients) const override;
 
+    /// \brief Evaluate analytical serendipity basis Hessians at a reference coordinate.
+    ///
+    /// \details Hessians are second derivatives in reference coordinates and
+    /// are stored as 3-by-3 matrices. Quadrilateral Hessians use second
+    /// derivatives of the monomial vector and inverse Vandermonde coefficients.
+    /// Hex8 Hessians are delegated to the linear Lagrange Hex8 basis. Hex20 and
+    /// Wedge15 Hessians are computed by differentiating their polynomial
+    /// coefficient tables twice. In Hex20 geometry mode, only the corner
+    /// Hessians from the Hex8 geometry mapping are populated.
+    ///
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param hessians Receives one 3-by-3 Hessian per basis function.
     void evaluate_hessians(const math::Vector<Real, 3>& xi,
                            std::vector<Hessian>& hessians) const override;
 
@@ -52,6 +166,8 @@ class SerendipityBasis : public BasisFunction {
     bool geometry_mode_;
 };
 
+/// @}
+
 } // namespace basis
 } // namespace FE
 } // namespace svmp
diff --git a/Code/Source/solver/FE/Math/Matrix.h b/Code/Source/solver/FE/Math/Matrix.h
index 3f3a9d9b6..f7432f38c 100644
--- a/Code/Source/solver/FE/Math/Matrix.h
+++ b/Code/Source/solver/FE/Math/Matrix.h
@@ -25,12 +25,21 @@
 #include <stdexcept>
 #include <type_traits>
 
+/// \defgroup FE_MatrixMath Matrix
+/// \ingroup FE_Math
+/// \brief Fixed-size matrix types, matrix expressions, and small-matrix operations.
+///
+/// \details The Matrix submodule contains row-major fixed-size matrices used
+/// by FE kernels, expression-template support for matrix algebra, and direct
+/// determinant/inverse implementations for common element-level sizes.
+
 namespace svmp {
 namespace FE {
 namespace math {
 
 /**
  * @brief Fixed-size matrix for element-level computations
+ * @ingroup FE_MatrixMath
  * @tparam T Scalar type (float, double)
  * @tparam M Number of rows
  * @tparam N Number of columns
@@ -770,7 +779,14 @@ inline Matrix<T, 3, 3> inverse_3x3(const Matrix<T, 3, 3>& m) {
     return adj * inv_det;
 }
 
-// Template specializations for 2x2 Matrix determinant and inverse
+/**
+ * @brief Specialized fixed-size 2-by-2 matrix for element-level computations.
+ * @ingroup FE_MatrixMath
+ * @tparam T Scalar type.
+ *
+ * This specialization preserves the Matrix API while using direct formulas for
+ * 2-by-2 determinant and inverse operations.
+ */
 template<typename T>
 class Matrix<T, 2, 2> : public MatrixExpr<Matrix<T, 2, 2>> {
     static constexpr std::size_t M = 2;
@@ -1006,7 +1022,14 @@ class Matrix<T, 2, 2> : public MatrixExpr<Matrix<T, 2, 2>> {
     const T* end() const { return data_ + 4; }
 };
 
-// Template specialization for 3x3 Matrix
+/**
+ * @brief Specialized fixed-size 3-by-3 matrix for element-level computations.
+ * @ingroup FE_MatrixMath
+ * @tparam T Scalar type.
+ *
+ * This specialization preserves the Matrix API while using direct formulas for
+ * 3-by-3 determinant and inverse operations.
+ */
 template<typename T>
 class Matrix<T, 3, 3> : public MatrixExpr<Matrix<T, 3, 3>> {
     static constexpr std::size_t M = 3;
diff --git a/Code/Source/solver/FE/Math/Vector.h b/Code/Source/solver/FE/Math/Vector.h
index a1214f9aa..0ec99c81f 100644
--- a/Code/Source/solver/FE/Math/Vector.h
+++ b/Code/Source/solver/FE/Math/Vector.h
@@ -24,6 +24,19 @@
 #include <stdexcept>
 #include <type_traits>
 
+/// \defgroup FE_Math Math
+/// \ingroup FE
+/// \brief Fixed-size and dense linear algebra utilities for finite-element computations.
+///
+/// \details The Math module provides small fixed-size vector and matrix types
+/// used in element-level kernels, expression-template infrastructure for
+/// allocation-free algebraic expressions, and dense linear algebra utilities
+/// used by basis construction and local transforms.
+///
+/// \defgroup FE_VectorMath Vector
+/// \ingroup FE_Math
+/// \brief Fixed-size vector types and vector expression utilities.
+
 namespace svmp {
 namespace FE {
 namespace math {
@@ -47,6 +60,7 @@ inline bool approx_equal(T a, T b, T tol = tolerance<T>) {
 
 /**
  * @brief Fixed-size vector for element-level computations
+ * @ingroup FE_VectorMath
  * @tparam T Scalar type (float, double)
  * @tparam N Vector dimension
  *
diff --git a/Documentation/Doxyfile b/Documentation/Doxyfile
index acd5ba21c..3c29a08f1 100644
--- a/Documentation/Doxyfile
+++ b/Documentation/Doxyfile
@@ -191,10 +191,10 @@ TREEVIEW_WIDTH         = 250
 EXT_LINKS_IN_WINDOW    = NO
 FORMULA_FONTSIZE       = 10
 USE_MATHJAX            = YES
-MATHJAX_VERSION        = MathJax_3
-MATHJAX_FORMAT         = chtml
-MATHJAX_RELPATH        = https://cdn.jsdelivr.net/npm/mathjax@3
-MATHJAX_EXTENSIONS     = ams
+MATHJAX_VERSION        = MathJax_2
+MATHJAX_FORMAT         = HTML-CSS
+MATHJAX_RELPATH        = https://cdn.jsdelivr.net/npm/mathjax@2
+MATHJAX_EXTENSIONS     = TeX/AMSmath TeX/AMSsymbols
 MATHJAX_CODEFILE       =
 SEARCHENGINE           = YES
 SERVER_BASED_SEARCH    = NO
diff --git a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
index 430390e54..d4bf1d6e5 100644
--- a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
@@ -1,6 +1,6 @@
 /**
  * @file test_BasisErrorPaths.cpp
- * @brief Error-path coverage for the migrated Lagrange-focused Basis subset.
+ * @brief Error-path coverage for the Lagrange-focused Basis subset.
  */
 
 #include <gtest/gtest.h>

From c53e0e06ef4165b3b8b4069f2fbec246bbd4ab54 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Tue, 9 Jun 2026 11:34:34 -0700
Subject: [PATCH 12/91] updating serendipity basis to be concrete terminal
 classes with `final`

---
 .../solver/FE/Basis/SerendipityBasis.cpp      | 442 ++++++++----------
 .../Source/solver/FE/Basis/SerendipityBasis.h |  56 ++-
 2 files changed, 238 insertions(+), 260 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index d551419a8..358e76123 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -2,7 +2,6 @@
 // SPDX-License-Identifier: BSD-3-Clause
 
 #include "SerendipityBasis.h"
-#include "LagrangeBasis.h"
 #include "NodeOrderingConventions.h"
 #include "Math/DenseLinearAlgebra.h"
 
@@ -19,6 +18,61 @@ namespace basis {
 namespace {
 using Vec3 = math::Vector<Real, 3>;
 
+void store_gradient(const Gradient& gradient, Real* dst) {
+    dst[0] = gradient[0];
+    dst[1] = gradient[1];
+    dst[2] = gradient[2];
+}
+
+void evaluate_hex8_reference(Real r,
+                             Real s,
+                             Real t,
+                             Real* values,
+                             Real* gradients,
+                             Real* hessians) {
+    static constexpr int signs[8][3] = {
+        {-1, -1, -1},
+        { 1, -1, -1},
+        { 1,  1, -1},
+        {-1,  1, -1},
+        {-1, -1,  1},
+        { 1, -1,  1},
+        { 1,  1,  1},
+        {-1,  1,  1},
+    };
+
+    for (std::size_t i = 0; i < 8u; ++i) {
+        const Real a = Real(signs[i][0]);
+        const Real b = Real(signs[i][1]);
+        const Real c = Real(signs[i][2]);
+        const Real ar = Real(1) + a * r;
+        const Real bs = Real(1) + b * s;
+        const Real ct = Real(1) + c * t;
+
+        if (values) {
+            values[i] = Real(0.125) * ar * bs * ct;
+        }
+        if (gradients) {
+            Real* g = gradients + i * 3u;
+            g[0] = Real(0.125) * a * bs * ct;
+            g[1] = Real(0.125) * b * ar * ct;
+            g[2] = Real(0.125) * c * ar * bs;
+        }
+        if (hessians) {
+            Real* h = hessians + i * 9u;
+            h[0] = Real(0);
+            h[1] = Real(0.125) * a * b * ct;
+            h[2] = Real(0.125) * a * c * bs;
+            h[3] = h[1];
+            h[4] = Real(0);
+            h[5] = Real(0.125) * b * c * ar;
+            h[6] = h[2];
+            h[7] = h[5];
+            h[8] = Real(0);
+        }
+    }
+}
+
 int quad_serendipity_superlinear_degree(int ax, int ay) {
     return (ax > 1 ? ax : 0) + (ay > 1 ? ay : 0);
 }
@@ -496,96 +550,24 @@ SerendipityBasis::SerendipityBasis(ElementType type, int order, bool geometry_mo
     }
 }
 
-void SerendipityBasis::evaluate_values(const math::Vector<Real, 3>& xi,
-                                       std::vector<Real>& values) const {
-    values.assign(size_, Real(0));
-    const Real x = xi[0];
-    const Real y = xi[1];
-    const Real z = xi[2];
-
-    if (dimension_ == 2) {
-        if (quad_monomial_exponents_.size() != size_ ||
-            quad_inv_vandermonde_.size() != size_ * size_) {
-            throw BasisEvaluationException(
-                "SerendipityBasis: quadrilateral interpolation tables are not initialized for value evaluation",
-                __FILE__, __LINE__, __func__);
-        }
-
-        std::vector<Real> monomials(size_, Real(0));
-        for (std::size_t j = 0; j < size_; ++j) {
-            const auto [ax, ay] = quad_monomial_exponents_[j];
-            monomials[j] = std::pow(x, ax) * std::pow(y, ay);
-        }
-
-        for (std::size_t i = 0; i < size_; ++i) {
-            Real value = Real(0);
-            for (std::size_t j = 0; j < size_; ++j) {
-                value += monomials[j] * quad_inv_vandermonde_[j * size_ + i];
-            }
-            values[i] = value;
-        }
+void SerendipityBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,
+                                       Real* SVMP_RESTRICT values_out,
+                                       Real* SVMP_RESTRICT gradients_out,
+                                       Real* SVMP_RESTRICT hessians_out) const {
+    if (!values_out && !gradients_out && !hessians_out) {
         return;
     }
 
-    if (dimension_ == 3 && order_ == 1) {
-        // Hex8 trilinear shape functions
-        const Real r = x;
-        const Real s = y;
-        const Real t = z;
-        values[0] = Real(0.125) * (Real(1) - r) * (Real(1) - s) * (Real(1) - t);
-        values[1] = Real(0.125) * (Real(1) + r) * (Real(1) - s) * (Real(1) - t);
-        values[2] = Real(0.125) * (Real(1) + r) * (Real(1) + s) * (Real(1) - t);
-        values[3] = Real(0.125) * (Real(1) - r) * (Real(1) + s) * (Real(1) - t);
-        values[4] = Real(0.125) * (Real(1) - r) * (Real(1) - s) * (Real(1) + t);
-        values[5] = Real(0.125) * (Real(1) + r) * (Real(1) - s) * (Real(1) + t);
-        values[6] = Real(0.125) * (Real(1) + r) * (Real(1) + s) * (Real(1) + t);
-        values[7] = Real(0.125) * (Real(1) - r) * (Real(1) + s) * (Real(1) + t);
-        return;
+    if (values_out) {
+        std::fill_n(values_out, size_, Real(0));
     }
-
-    const Real r = x;
-    const Real s = y;
-    const Real t = z;
-
-    if (geometry_mode_ && element_type_ == ElementType::Hex20) {
-        // Hex20 geometry mode: use trilinear Hex8 shape functions on corners, edges zero.
-        values[0] = Real(0.125) * (Real(1) - r) * (Real(1) - s) * (Real(1) - t);
-        values[1] = Real(0.125) * (Real(1) + r) * (Real(1) - s) * (Real(1) - t);
-        values[2] = Real(0.125) * (Real(1) + r) * (Real(1) + s) * (Real(1) - t);
-        values[3] = Real(0.125) * (Real(1) - r) * (Real(1) + s) * (Real(1) - t);
-        values[4] = Real(0.125) * (Real(1) - r) * (Real(1) - s) * (Real(1) + t);
-        values[5] = Real(0.125) * (Real(1) + r) * (Real(1) - s) * (Real(1) + t);
-        values[6] = Real(0.125) * (Real(1) + r) * (Real(1) + s) * (Real(1) + t);
-        values[7] = Real(0.125) * (Real(1) - r) * (Real(1) + s) * (Real(1) + t);
-        for (std::size_t i = 8; i < 20; ++i) {
-            values[i] = Real(0);
-        }
-        return;
+    if (gradients_out) {
+        std::fill_n(gradients_out, size_ * 3u, Real(0));
     }
-
-    if (element_type_ == ElementType::Hex20) {
-        Real internal_vals[20];
-        eval_hex20_internal(r, s, t, internal_vals);
-        const auto mesh_to_basis = ReferenceNodeLayout::mesh_to_basis_ordering(element_type_);
-        BASIS_CHECK_EVAL(mesh_to_basis.size() == size_,
-                         "Hex20 mesh-to-basis ordering is not registered");
-        for (std::size_t i = 0; i < 20; ++i) {
-            values[i] = internal_vals[mesh_to_basis[i]];
-        }
-        return;
+    if (hessians_out) {
+        std::fill_n(hessians_out, size_ * 9u, Real(0));
     }
 
-    if (element_type_ == ElementType::Wedge15) {
-        eval_wedge15_polynomial(r, s, t, values.data(), nullptr, nullptr);
-        return;
-    }
-
-}
-
-void SerendipityBasis::evaluate_gradients(const math::Vector<Real, 3>& xi,
-                                          std::vector<Gradient>& gradients) const {
-    gradients.assign(size_, Gradient{});
-
     const Real x = xi[0];
     const Real y = xi[1];
     const Real z = xi[2];
@@ -594,216 +576,174 @@ void SerendipityBasis::evaluate_gradients(const math::Vector<Real, 3>& xi,
         if (quad_monomial_exponents_.size() != size_ ||
             quad_inv_vandermonde_.size() != size_ * size_) {
             throw BasisEvaluationException(
-                "SerendipityBasis: quadrilateral interpolation tables are not initialized for gradient evaluation",
+                "SerendipityBasis: quadrilateral interpolation tables are not initialized for value evaluation",
                 __FILE__, __LINE__, __func__);
         }
 
-        std::vector<Real> dmon_dx(size_, Real(0));
-        std::vector<Real> dmon_dy(size_, Real(0));
         for (std::size_t j = 0; j < size_; ++j) {
             const auto [ax, ay] = quad_monomial_exponents_[j];
-            dmon_dx[j] =
+            const Real value = std::pow(x, ax) * std::pow(y, ay);
+            const Real dx =
                 (ax > 0) ? Real(ax) * std::pow(x, ax - 1) * std::pow(y, ay) : Real(0);
-            dmon_dy[j] =
+            const Real dy =
                 (ay > 0) ? std::pow(x, ax) * Real(ay) * std::pow(y, ay - 1) : Real(0);
-        }
+            const Real dxx =
+                (ax > 1) ? Real(ax * (ax - 1)) * std::pow(x, ax - 2) * std::pow(y, ay)
+                         : Real(0);
+            const Real dxy =
+                (ax > 0 && ay > 0)
+                    ? Real(ax * ay) * std::pow(x, ax - 1) * std::pow(y, ay - 1)
+                    : Real(0);
+            const Real dyy =
+                (ay > 1) ? Real(ay * (ay - 1)) * std::pow(x, ax) * std::pow(y, ay - 2)
+                         : Real(0);
 
-        for (std::size_t i = 0; i < size_; ++i) {
-            Real gx = Real(0);
-            Real gy = Real(0);
-            for (std::size_t j = 0; j < size_; ++j) {
+            for (std::size_t i = 0; i < size_; ++i) {
                 const Real coeff = quad_inv_vandermonde_[j * size_ + i];
-                gx += dmon_dx[j] * coeff;
-                gy += dmon_dy[j] * coeff;
+                if (values_out) {
+                    values_out[i] += value * coeff;
+                }
+                if (gradients_out) {
+                    Real* g = gradients_out + i * 3u;
+                    g[0] += dx * coeff;
+                    g[1] += dy * coeff;
+                }
+                if (hessians_out) {
+                    Real* h = hessians_out + i * 9u;
+                    h[0] += dxx * coeff;
+                    h[1] += dxy * coeff;
+                    h[3] += dxy * coeff;
+                    h[4] += dyy * coeff;
+                }
             }
-            gradients[i][0] = gx;
-            gradients[i][1] = gy;
         }
         return;
     }
 
-    // 3D linear hex (Hex8)
     if (dimension_ == 3 && order_ == 1) {
-        const Real r = x, s = y, t = z;
-        gradients[0][0] = -Real(0.125) * (Real(1) - s) * (Real(1) - t);
-        gradients[0][1] = -Real(0.125) * (Real(1) - r) * (Real(1) - t);
-        gradients[0][2] = -Real(0.125) * (Real(1) - r) * (Real(1) - s);
-
-        gradients[1][0] =  Real(0.125) * (Real(1) - s) * (Real(1) - t);
-        gradients[1][1] = -Real(0.125) * (Real(1) + r) * (Real(1) - t);
-        gradients[1][2] = -Real(0.125) * (Real(1) + r) * (Real(1) - s);
-
-        gradients[2][0] =  Real(0.125) * (Real(1) + s) * (Real(1) - t);
-        gradients[2][1] =  Real(0.125) * (Real(1) + r) * (Real(1) - t);
-        gradients[2][2] = -Real(0.125) * (Real(1) + r) * (Real(1) + s);
-
-        gradients[3][0] = -Real(0.125) * (Real(1) + s) * (Real(1) - t);
-        gradients[3][1] =  Real(0.125) * (Real(1) - r) * (Real(1) - t);
-        gradients[3][2] = -Real(0.125) * (Real(1) - r) * (Real(1) + s);
-
-        gradients[4][0] = -Real(0.125) * (Real(1) - s) * (Real(1) + t);
-        gradients[4][1] = -Real(0.125) * (Real(1) - r) * (Real(1) + t);
-        gradients[4][2] =  Real(0.125) * (Real(1) - r) * (Real(1) - s);
-
-        gradients[5][0] =  Real(0.125) * (Real(1) - s) * (Real(1) + t);
-        gradients[5][1] = -Real(0.125) * (Real(1) + r) * (Real(1) + t);
-        gradients[5][2] =  Real(0.125) * (Real(1) + r) * (Real(1) - s);
-
-        gradients[6][0] =  Real(0.125) * (Real(1) + s) * (Real(1) + t);
-        gradients[6][1] =  Real(0.125) * (Real(1) + r) * (Real(1) + t);
-        gradients[6][2] =  Real(0.125) * (Real(1) + r) * (Real(1) + s);
-
-        gradients[7][0] = -Real(0.125) * (Real(1) + s) * (Real(1) + t);
-        gradients[7][1] =  Real(0.125) * (Real(1) - r) * (Real(1) + t);
-        gradients[7][2] =  Real(0.125) * (Real(1) - r) * (Real(1) + s);
+        evaluate_hex8_reference(x, y, z, values_out, gradients_out, hessians_out);
         return;
     }
 
-    // Hex20 geometry mode: use Hex8 gradients
-    if (dimension_ == 3 && order_ == 2 && geometry_mode_ &&
-        (element_type_ == ElementType::Hex20 || element_type_ == ElementType::Quad8)) {
-        const Real r = x, s = y, t = z;
-        gradients[0][0] = -Real(0.125) * (Real(1) - s) * (Real(1) - t);
-        gradients[0][1] = -Real(0.125) * (Real(1) - r) * (Real(1) - t);
-        gradients[0][2] = -Real(0.125) * (Real(1) - r) * (Real(1) - s);
-
-        gradients[1][0] =  Real(0.125) * (Real(1) - s) * (Real(1) - t);
-        gradients[1][1] = -Real(0.125) * (Real(1) + r) * (Real(1) - t);
-        gradients[1][2] = -Real(0.125) * (Real(1) + r) * (Real(1) - s);
-
-        gradients[2][0] =  Real(0.125) * (Real(1) + s) * (Real(1) - t);
-        gradients[2][1] =  Real(0.125) * (Real(1) + r) * (Real(1) - t);
-        gradients[2][2] = -Real(0.125) * (Real(1) + r) * (Real(1) + s);
-
-        gradients[3][0] = -Real(0.125) * (Real(1) + s) * (Real(1) - t);
-        gradients[3][1] =  Real(0.125) * (Real(1) - r) * (Real(1) - t);
-        gradients[3][2] = -Real(0.125) * (Real(1) - r) * (Real(1) + s);
-
-        gradients[4][0] = -Real(0.125) * (Real(1) - s) * (Real(1) + t);
-        gradients[4][1] = -Real(0.125) * (Real(1) - r) * (Real(1) + t);
-        gradients[4][2] =  Real(0.125) * (Real(1) - r) * (Real(1) - s);
-
-        gradients[5][0] =  Real(0.125) * (Real(1) - s) * (Real(1) + t);
-        gradients[5][1] = -Real(0.125) * (Real(1) + r) * (Real(1) + t);
-        gradients[5][2] =  Real(0.125) * (Real(1) + r) * (Real(1) - s);
-
-        gradients[6][0] =  Real(0.125) * (Real(1) + s) * (Real(1) + t);
-        gradients[6][1] =  Real(0.125) * (Real(1) + r) * (Real(1) + t);
-        gradients[6][2] =  Real(0.125) * (Real(1) + r) * (Real(1) + s);
-
-        gradients[7][0] = -Real(0.125) * (Real(1) + s) * (Real(1) + t);
-        gradients[7][1] =  Real(0.125) * (Real(1) - r) * (Real(1) + t);
-        gradients[7][2] =  Real(0.125) * (Real(1) - r) * (Real(1) + s);
-        // Edge-node gradients remain zero
+    if (geometry_mode_ && element_type_ == ElementType::Hex20) {
+        evaluate_hex8_reference(x, y, z, values_out, gradients_out, hessians_out);
         return;
     }
 
-    // Hex20 analytical gradients using monomial differentiation
-    if (element_type_ == ElementType::Hex20 && order_ == 2) {
-        const Real r = x, s = y, t = z;
-        Gradient internal_grads[20];
-        eval_hex20_grad_internal(r, s, t, internal_grads);
+    if (element_type_ == ElementType::Hex20) {
         const auto mesh_to_basis = ReferenceNodeLayout::mesh_to_basis_ordering(element_type_);
         BASIS_CHECK_EVAL(mesh_to_basis.size() == size_,
                          "Hex20 mesh-to-basis ordering is not registered");
-        for (std::size_t i = 0; i < 20; ++i) {
-            gradients[i] = internal_grads[mesh_to_basis[i]];
+
+        if (values_out) {
+            Real internal_vals[20];
+            eval_hex20_internal(x, y, z, internal_vals);
+            for (std::size_t i = 0; i < 20u; ++i) {
+                values_out[i] = internal_vals[mesh_to_basis[i]];
+            }
+        }
+        if (gradients_out) {
+            Gradient internal_grads[20];
+            eval_hex20_grad_internal(x, y, z, internal_grads);
+            for (std::size_t i = 0; i < 20u; ++i) {
+                store_gradient(internal_grads[mesh_to_basis[i]], gradients_out + i * 3u);
+            }
+        }
+        if (hessians_out) {
+            Hessian internal_hessians[20];
+            eval_hex20_hess_internal(x, y, z, internal_hessians);
+            for (std::size_t i = 0; i < 20u; ++i) {
+                store_hessian(internal_hessians[mesh_to_basis[i]], hessians_out + i * 9u);
+            }
         }
         return;
     }
 
-    // Wedge15 analytical gradients using monomial differentiation
-    if (element_type_ == ElementType::Wedge15 && order_ == 2) {
-        eval_wedge15_polynomial(x, y, z, nullptr, gradients.data(), nullptr);
+    if (element_type_ == ElementType::Wedge15) {
+        std::array<Gradient, 15u> wedge_gradients{};
+        std::array<Hessian, 15u> wedge_hessians{};
+        eval_wedge15_polynomial(x,
+                                 y,
+                                 z,
+                                 values_out,
+                                 gradients_out ? wedge_gradients.data() : nullptr,
+                                 hessians_out ? wedge_hessians.data() : nullptr);
+        if (gradients_out) {
+            for (std::size_t i = 0; i < 15u; ++i) {
+                store_gradient(wedge_gradients[i], gradients_out + i * 3u);
+            }
+        }
+        if (hessians_out) {
+            for (std::size_t i = 0; i < 15u; ++i) {
+                store_hessian(wedge_hessians[i], hessians_out + i * 9u);
+            }
+        }
         return;
     }
 
-    throw BasisEvaluationException("SerendipityBasis::evaluate_gradients: unsupported serendipity configuration",
+    throw BasisEvaluationException("SerendipityBasis::evaluate_all_to: unsupported serendipity configuration",
                                    __FILE__, __LINE__, __func__);
 }
 
-void SerendipityBasis::evaluate_hessians(const math::Vector<Real, 3>& xi,
-                                         std::vector<Hessian>& hessians) const {
-    hessians.assign(size_, Hessian{});
-    const Real x = xi[0];
-    const Real y = xi[1];
-    const Real z = xi[2];
-
-    if (dimension_ == 2) {
-        if (quad_monomial_exponents_.size() != size_ ||
-            quad_inv_vandermonde_.size() != size_ * size_) {
-            throw BasisEvaluationException(
-                "SerendipityBasis: quadrilateral interpolation tables are not initialized for Hessian evaluation",
-                __FILE__, __LINE__, __func__);
-        }
-
-        std::vector<Real> dxx(size_, Real(0));
-        std::vector<Real> dxy(size_, Real(0));
-        std::vector<Real> dyy(size_, Real(0));
-        for (std::size_t j = 0; j < size_; ++j) {
-            const auto [ax, ay] = quad_monomial_exponents_[j];
-            dxx[j] = (ax > 1)
-                         ? Real(ax * (ax - 1)) * std::pow(x, ax - 2) * std::pow(y, ay)
-                         : Real(0);
-            dxy[j] = (ax > 0 && ay > 0)
-                         ? Real(ax * ay) * std::pow(x, ax - 1) * std::pow(y, ay - 1)
-                         : Real(0);
-            dyy[j] = (ay > 1)
-                         ? Real(ay * (ay - 1)) * std::pow(x, ax) * std::pow(y, ay - 2)
-                         : Real(0);
-        }
+void SerendipityBasis::evaluate_values(const math::Vector<Real, 3>& xi,
+                                       std::vector<Real>& values) const {
+    values.resize(size_);
+    evaluate_values_to(xi, values.data());
+}
 
-        for (std::size_t i = 0; i < size_; ++i) {
-            for (std::size_t j = 0; j < size_; ++j) {
-                const Real coeff = quad_inv_vandermonde_[j * size_ + i];
-                hessians[i](0, 0) += dxx[j] * coeff;
-                hessians[i](0, 1) += dxy[j] * coeff;
-                hessians[i](1, 1) += dyy[j] * coeff;
-            }
-            hessians[i](1, 0) = hessians[i](0, 1);
-        }
-        return;
+void SerendipityBasis::evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                          std::vector<Gradient>& gradients) const {
+    gradients.resize(size_);
+    std::vector<Real> flat(size_ * 3u, Real(0));
+    evaluate_gradients_to(xi, flat.data());
+    for (std::size_t i = 0; i < size_; ++i) {
+        gradients[i][0] = flat[i * 3u + 0u];
+        gradients[i][1] = flat[i * 3u + 1u];
+        gradients[i][2] = flat[i * 3u + 2u];
     }
+}
 
-    if (element_type_ == ElementType::Hex8 && order_ == 1) {
-        static const LagrangeBasis parent(ElementType::Hex8, 1);
-        parent.evaluate_hessians(xi, hessians);
-        return;
+void SerendipityBasis::evaluate_hessians(const math::Vector<Real, 3>& xi,
+                                         std::vector<Hessian>& hessians) const {
+    hessians.resize(size_);
+    std::vector<Real> flat(size_ * 9u, Real(0));
+    evaluate_hessians_to(xi, flat.data());
+    for (std::size_t i = 0; i < size_; ++i) {
+        hessians[i] = load_hessian(flat.data() + i * 9u);
     }
+}
 
-    if (geometry_mode_ && element_type_ == ElementType::Hex20) {
-        static const LagrangeBasis parent(ElementType::Hex8, 1);
-        std::array<Real, 8u * 9u> parent_hessians{};
-        parent.evaluate_hessians_to(xi, parent_hessians.data());
-        for (std::size_t i = 0; i < 8; ++i) {
-            for (std::size_t r = 0; r < 3; ++r) {
-                for (std::size_t c = 0; c < 3; ++c) {
-                    hessians[i](r, c) = parent_hessians[i * 9u + r * 3u + c];
-                }
-            }
-        }
-        return;
+void SerendipityBasis::evaluate_all(const math::Vector<Real, 3>& xi,
+                                    std::vector<Real>& values,
+                                    std::vector<Gradient>& gradients,
+                                    std::vector<Hessian>& hessians) const {
+    values.resize(size_);
+    gradients.resize(size_);
+    hessians.resize(size_);
+    std::vector<Real> flat_gradients(size_ * 3u, Real(0));
+    std::vector<Real> flat_hessians(size_ * 9u, Real(0));
+    evaluate_all_to(xi, values.data(), flat_gradients.data(), flat_hessians.data());
+    for (std::size_t i = 0; i < size_; ++i) {
+        gradients[i][0] = flat_gradients[i * 3u + 0u];
+        gradients[i][1] = flat_gradients[i * 3u + 1u];
+        gradients[i][2] = flat_gradients[i * 3u + 2u];
+        hessians[i] = load_hessian(flat_hessians.data() + i * 9u);
     }
+}
 
-    if (element_type_ == ElementType::Hex20 && order_ == 2) {
-        Hessian internal_hessians[20];
-        eval_hex20_hess_internal(x, y, z, internal_hessians);
-        const auto mesh_to_basis = ReferenceNodeLayout::mesh_to_basis_ordering(element_type_);
-        BASIS_CHECK_EVAL(mesh_to_basis.size() == size_,
-                         "Hex20 mesh-to-basis ordering is not registered");
-        for (std::size_t i = 0; i < 20; ++i) {
-            hessians[i] = internal_hessians[mesh_to_basis[i]];
-        }
-        return;
-    }
+void SerendipityBasis::evaluate_values_to(const math::Vector<Real, 3>& xi,
+                                          Real* SVMP_RESTRICT values_out) const {
+    evaluate_all_to(xi, values_out, nullptr, nullptr);
+}
 
-    if (element_type_ == ElementType::Wedge15 && order_ == 2) {
-        eval_wedge15_polynomial(x, y, z, nullptr, nullptr, hessians.data());
-        return;
-    }
+void SerendipityBasis::evaluate_gradients_to(const math::Vector<Real, 3>& xi,
+                                             Real* SVMP_RESTRICT gradients_out) const {
+    evaluate_all_to(xi, nullptr, gradients_out, nullptr);
+}
 
-    throw BasisEvaluationException("SerendipityBasis::evaluate_hessians: unsupported serendipity configuration",
-                                   __FILE__, __LINE__, __func__);
+void SerendipityBasis::evaluate_hessians_to(const math::Vector<Real, 3>& xi,
+                                            Real* SVMP_RESTRICT hessians_out) const {
+    evaluate_all_to(xi, nullptr, nullptr, hessians_out);
 }
 
 } // namespace basis
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.h b/Code/Source/solver/FE/Basis/SerendipityBasis.h
index fc0b897cf..9c55c8eec 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.h
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.h
@@ -61,7 +61,7 @@ namespace basis {
 /// Hex8 corner functions for geometry mapping and assigns zero contribution to
 /// the quadratic edge nodes. This preserves the public Hex20 node count while
 /// intentionally reducing the geometry interpolation order.
-class SerendipityBasis : public BasisFunction {
+class SerendipityBasis final : public BasisFunction {
 public:
     /// \brief Construct a serendipity basis for an element type and polynomial order.
     ///
@@ -81,19 +81,19 @@ class SerendipityBasis : public BasisFunction {
     SerendipityBasis(ElementType type, int order, bool geometry_mode = false);
 
     /// \copydoc BasisFunction::basis_type()
-    BasisType basis_type() const noexcept override { return BasisType::Serendipity; }
+    BasisType basis_type() const noexcept final { return BasisType::Serendipity; }
 
     /// \copydoc BasisFunction::element_type()
-    ElementType element_type() const noexcept override { return element_type_; }
+    ElementType element_type() const noexcept final { return element_type_; }
 
     /// \copydoc BasisFunction::dimension()
-    int dimension() const noexcept override { return dimension_; }
+    int dimension() const noexcept final { return dimension_; }
 
     /// \copydoc BasisFunction::order()
-    int order() const noexcept override { return order_; }
+    int order() const noexcept final { return order_; }
 
     /// \copydoc BasisFunction::size()
-    std::size_t size() const noexcept override { return size_; }
+    std::size_t size() const noexcept final { return size_; }
 
     /// \brief Return the reference interpolation nodes in basis ordering.
     ///
@@ -119,7 +119,7 @@ class SerendipityBasis : public BasisFunction {
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
     /// \param values Receives one value per basis function.
     void evaluate_values(const math::Vector<Real, 3>& xi,
-                         std::vector<Real>& values) const override;
+                         std::vector<Real>& values) const final;
 
     /// \brief Evaluate analytical serendipity basis gradients at a reference coordinate.
     ///
@@ -134,7 +134,7 @@ class SerendipityBasis : public BasisFunction {
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
     /// \param gradients Receives one three-component gradient per basis function.
     void evaluate_gradients(const math::Vector<Real, 3>& xi,
-                            std::vector<Gradient>& gradients) const override;
+                            std::vector<Gradient>& gradients) const final;
 
     /// \brief Evaluate analytical serendipity basis Hessians at a reference coordinate.
     ///
@@ -149,7 +149,40 @@ class SerendipityBasis : public BasisFunction {
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
     /// \param hessians Receives one 3-by-3 Hessian per basis function.
     void evaluate_hessians(const math::Vector<Real, 3>& xi,
-                           std::vector<Hessian>& hessians) const override;
+                           std::vector<Hessian>& hessians) const final;
+
+    /// \brief Evaluate serendipity values, gradients, and Hessians together.
+    ///
+    /// \details This vector API is backed by the same flat-buffer evaluator as
+    /// the assembly-oriented `*_to` methods, so topology-specific polynomial
+    /// setup can be shared for a quadrature point.
+    ///
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param values Receives one value per basis function.
+    /// \param gradients Receives one three-component gradient per basis function.
+    /// \param hessians Receives one 3-by-3 Hessian per basis function.
+    void evaluate_all(const math::Vector<Real, 3>& xi,
+                      std::vector<Real>& values,
+                      std::vector<Gradient>& gradients,
+                      std::vector<Hessian>& hessians) const final;
+
+    /// \brief Evaluate serendipity basis values into a flat caller-provided buffer.
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param values_out Output buffer with at least size() entries.
+    void evaluate_values_to(const math::Vector<Real, 3>& xi,
+                            Real* SVMP_RESTRICT values_out) const final;
+
+    /// \brief Evaluate serendipity basis gradients into a flat caller-provided buffer.
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param gradients_out Output buffer with node-major layout: node * 3 + component.
+    void evaluate_gradients_to(const math::Vector<Real, 3>& xi,
+                               Real* SVMP_RESTRICT gradients_out) const final;
+
+    /// \brief Evaluate serendipity basis Hessians into a flat caller-provided buffer.
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param hessians_out Output buffer with node-major row-major layout: node * 9 + row * 3 + col.
+    void evaluate_hessians_to(const math::Vector<Real, 3>& xi,
+                              Real* SVMP_RESTRICT hessians_out) const final;
 
 private:
     ElementType element_type_;
@@ -164,6 +197,11 @@ class SerendipityBasis : public BasisFunction {
     // When true, this basis is used purely for geometry mapping and may use
     // reduced polynomial order (e.g., Hex20 geometry as Hex8).
     bool geometry_mode_;
+
+    void evaluate_all_to(const math::Vector<Real, 3>& xi,
+                         Real* SVMP_RESTRICT values_out,
+                         Real* SVMP_RESTRICT gradients_out,
+                         Real* SVMP_RESTRICT hessians_out) const;
 };
 
 /// @}

From 1289c086f637cdc1544aff0bfe99eb78ad3b9f1c Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Tue, 9 Jun 2026 16:33:06 -0700
Subject: [PATCH 13/91] adding switch cases for converting consts element types
 to fe element types. replaced custom math vector/matrix implementations for
 Eigen-backed implementations

---
 Code/Source/solver/FE/Basis/BasisFactory.cpp  |   27 +
 Code/Source/solver/FE/Basis/BasisFactory.h    |   24 +
 Code/Source/solver/FE/Basis/BasisFunction.cpp |    4 +-
 Code/Source/solver/FE/Basis/BasisFunction.h   |    4 +-
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp |    7 +-
 .../solver/FE/Basis/SerendipityBasis.cpp      |    4 +-
 .../solver/FE/Math/DenseLinearAlgebra.cpp     |  278 +---
 .../solver/FE/Math/DenseLinearAlgebra.h       |    9 +-
 .../solver/FE/Math/DenseTransformKernels.h    |   70 +-
 Code/Source/solver/FE/Math/Matrix.h           | 1472 +----------------
 Code/Source/solver/FE/Math/MatrixExpr.h       |  630 -------
 Code/Source/solver/FE/Math/Vector.h           |  826 +--------
 Code/Source/solver/FE/Math/VectorExpr.h       |  476 ------
 Code/Source/solver/nn.cpp                     |  115 +-
 .../FE/Basis/test_BasisErrorPaths.cpp         |  106 +-
 .../unitTests/FE/Basis/test_BasisHessians.cpp |  141 +-
 .../FE/Basis/test_HigherOrderWedge.cpp        |   22 +-
 .../unitTests/FE/Basis/test_LagrangeBasis.cpp |  207 ++-
 .../FE/Basis/test_SerendipityTensorModal.cpp  |  185 ++-
 .../FE/Math/test_DenseLinearAlgebra.cpp       |  143 +-
 tests/unitTests/FE/Math/test_Matrix.cpp       |  593 -------
 tests/unitTests/FE/Math/test_MatrixExpr.cpp   |  527 ------
 tests/unitTests/FE/Math/test_Vector.cpp       |  588 -------
 tests/unitTests/FE/Math/test_VectorExpr.cpp   |  408 -----
 24 files changed, 1038 insertions(+), 5828 deletions(-)
 delete mode 100644 Code/Source/solver/FE/Math/MatrixExpr.h
 delete mode 100644 Code/Source/solver/FE/Math/VectorExpr.h
 delete mode 100644 tests/unitTests/FE/Math/test_Matrix.cpp
 delete mode 100644 tests/unitTests/FE/Math/test_MatrixExpr.cpp
 delete mode 100644 tests/unitTests/FE/Math/test_Vector.cpp
 delete mode 100644 tests/unitTests/FE/Math/test_VectorExpr.cpp

diff --git a/Code/Source/solver/FE/Basis/BasisFactory.cpp b/Code/Source/solver/FE/Basis/BasisFactory.cpp
index bc01be0ed..b48e25536 100644
--- a/Code/Source/solver/FE/Basis/BasisFactory.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFactory.cpp
@@ -3,6 +3,7 @@
 
 #include "BasisFactory.h"
 
+#include "BasisTraits.h"
 #include "LagrangeBasis.h"
 #include "SerendipityBasis.h"
 
@@ -74,6 +75,32 @@ std::shared_ptr<BasisFunction> create(const BasisRequest& req) {
     }
 }
 
+BasisRequest default_basis_request(ElementType element_type) {
+    switch (element_type) {
+        // Reduced serendipity node layouts have no complete Lagrange basis at
+        // their node count; they always use the quadratic serendipity space.
+        case ElementType::Quad8:
+        case ElementType::Hex20:
+        case ElementType::Wedge15:
+            return BasisRequest{element_type, BasisType::Serendipity, 2};
+        case ElementType::Point1:
+            return BasisRequest{element_type, BasisType::Lagrange, 0};
+        default: {
+            const int order = complete_lagrange_alias_order(element_type);
+            if (order >= 0) {
+                return BasisRequest{element_type, BasisType::Lagrange, order};
+            }
+            throw BasisElementCompatibilityException(
+                "BasisFactory: no default basis is defined for the requested element type",
+                __FILE__, __LINE__, __func__);
+        }
+    }
+}
+
+std::shared_ptr<BasisFunction> create_default_for(ElementType element_type) {
+    return create(default_basis_request(element_type));
+}
+
 } // namespace basis_factory
 
 } // namespace basis
diff --git a/Code/Source/solver/FE/Basis/BasisFactory.h b/Code/Source/solver/FE/Basis/BasisFactory.h
index b188b3aa2..3922d5ced 100644
--- a/Code/Source/solver/FE/Basis/BasisFactory.h
+++ b/Code/Source/solver/FE/Basis/BasisFactory.h
@@ -38,6 +38,30 @@ namespace basis_factory {
 
 [[nodiscard]] std::shared_ptr<BasisFunction> create(const BasisRequest& req);
 
+/// \brief Return the default basis request (family and order) for an element type.
+///
+/// \details This is the single source of truth for which basis family and
+/// polynomial order a given element type uses by default: serendipity node
+/// layouts (Quad8, Hex20, Wedge15) select the quadratic serendipity family,
+/// and every complete Lagrange element selects the Lagrange family at the
+/// order given by its node layout. Solver-facing adapters should translate
+/// their element names to ElementType and delegate the basis choice here
+/// rather than tabulating family/order themselves.
+///
+/// \param element_type Element type to select a default basis for.
+/// \return Basis request suitable for create().
+/// \throws BasisElementCompatibilityException If no default basis is defined
+///         for the element type.
+[[nodiscard]] BasisRequest default_basis_request(ElementType element_type);
+
+/// \brief Create the default basis for an element type.
+///
+/// \details Equivalent to create(default_basis_request(element_type)).
+///
+/// \param element_type Element type to create a default basis for.
+/// \return Shared basis instance.
+[[nodiscard]] std::shared_ptr<BasisFunction> create_default_for(ElementType element_type);
+
 } // namespace basis_factory
 
 } // namespace basis
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.cpp b/Code/Source/solver/FE/Basis/BasisFunction.cpp
index 3d95671f4..b98a36292 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFunction.cpp
@@ -84,7 +84,7 @@ void BasisFunction::numerical_gradient(const math::Vector<Real, 3>& xi,
                                        Real eps) const {
     std::vector<Real> base;
     evaluate_values(xi, base);
-    gradients.assign(base.size(), Gradient{});
+    gradients.assign(base.size(), Gradient::Zero());
 
     for (int d = 0; d < dimension(); ++d) {
         math::Vector<Real, 3> forward = xi;
@@ -109,7 +109,7 @@ void BasisFunction::numerical_hessian(const math::Vector<Real, 3>& xi,
                                       Real eps) const {
     std::vector<Gradient> base_grad;
     evaluate_gradients(xi, base_grad);
-    hessians.assign(base_grad.size(), Hessian{});
+    hessians.assign(base_grad.size(), Hessian::Zero());
 
     for (int d = 0; d < dimension(); ++d) {
         math::Vector<Real, 3> forward = xi;
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index f8f78d7b6..e7de2bf01 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -39,7 +39,7 @@ using Hessian  = math::Matrix<Real, 3, 3>;
                                                     Real xy,
                                                     Real xz,
                                                     Real yz) {
-    Hessian hessian{};
+    Hessian hessian = Hessian::Zero();
     hessian(0, 0) = xx;
     hessian(1, 1) = yy;
     hessian(2, 2) = zz;
@@ -62,7 +62,7 @@ inline void store_hessian(const Hessian& hessian, Real* dst) noexcept {
 }
 
 [[nodiscard]] inline Hessian load_hessian(const Real* src) noexcept {
-    Hessian hessian{};
+    Hessian hessian = Hessian::Zero();
     hessian(0, 0) = src[0];
     hessian(0, 1) = src[1];
     hessian(0, 2) = src[2];
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index d777447cb..4f8c15bb1 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -220,8 +220,8 @@ void evaluate_simplex(const Vec3& xi,
                       SimplexEval& out) {
     const std::size_t n = exponents.size();
     out.value.assign(n, Real(0));
-    out.gradient.assign(n, Gradient{});
-    out.hessian.assign(n, Hessian{});
+    out.gradient.assign(n, Gradient::Zero());
+    out.hessian.assign(n, Hessian::Zero());
 
     if (n == 1u && order == 0) {
         out.value[0] = Real(1);
@@ -230,7 +230,8 @@ void evaluate_simplex(const Vec3& xi,
 
     const int bary_count = top == BasisTopology::Triangle ? 3 : 4;
     std::array<Real, 4> lambda{Real(0), Real(0), Real(0), Real(0)};
-    std::array<Gradient, 4> lambda_grad{};
+    std::array<Gradient, 4> lambda_grad;
+    lambda_grad.fill(Gradient::Zero());
 
     lambda[1] = xi[0];
     lambda[2] = xi[1];
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index 358e76123..30eac9c38 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -377,7 +377,7 @@ void eval_hex20_hess_internal(Real r, Real s, Real t, Hessian* internal_hessians
     }
 
     for (int i = 0; i < 20; ++i) {
-        Hessian H{};
+        Hessian H = Hessian::Zero();
         for (int j = 0; j < 20; ++j) {
             H(0, 0) += hex20_coeffs[j][i] * d2phi_drr[j];
             H(1, 1) += hex20_coeffs[j][i] * d2phi_dss[j];
@@ -450,7 +450,7 @@ void eval_wedge15_polynomial(Real r,
         Real gr = Real(0);
         Real gs = Real(0);
         Real gt = Real(0);
-        Hessian H{};
+        Hessian H = Hessian::Zero();
         for (int j = 0; j < 15; ++j) {
             const Real coefficient =
                 kWedge15Coefficients[static_cast<std::size_t>(j)][static_cast<std::size_t>(i)];
diff --git a/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp b/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
index 8be9a7560..fb27ad7bf 100644
--- a/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
+++ b/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
@@ -5,9 +5,7 @@
 
 #include "FEException.h"
 
-#if defined(FE_HAS_EIGEN) && FE_HAS_EIGEN
 #include <Eigen/Dense>
-#endif
 
 #include <algorithm>
 #include <cmath>
@@ -24,16 +22,24 @@ namespace math {
 
 namespace {
 
-constexpr std::size_t kDenseSolveRhsBlock = 32u;
+using DenseMatrix = DenseLUSolver::DenseMatrix;
+using RowMajorMatrix =
+    Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+using ConstRowMajorMap = Eigen::Map<const RowMajorMatrix>;
+
+ConstRowMajorMap map_row_major(std::span<const Real> matrix,
+                               std::size_t rows,
+                               std::size_t cols) {
+    return ConstRowMajorMap(matrix.data(),
+                            static_cast<Eigen::Index>(rows),
+                            static_cast<Eigen::Index>(cols));
+}
 
-void materialize_inverse_from_solver(const DenseLUSolver& solver,
-                                     std::vector<Real>& inverse) {
-    const std::size_t n = solver.n;
-    inverse.assign(n * n, Real(0));
-    for (std::size_t diag = 0; diag < n; ++diag) {
-        inverse[diag * n + diag] = Real(1);
-    }
-    solver.solve_in_place(std::span<Real>(inverse.data(), inverse.size()), n);
+void copy_to_row_major(const DenseMatrix& source, std::vector<Real>& dest) {
+    const auto rows = static_cast<std::size_t>(source.rows());
+    const auto cols = static_cast<std::size_t>(source.cols());
+    dest.resize(rows * cols);
+    Eigen::Map<RowMajorMatrix>(dest.data(), source.rows(), source.cols()) = source;
 }
 
 } // namespace
@@ -84,59 +90,18 @@ void DenseLUSolver::solve_in_place(std::span<Real> rhs,
                              label + ": dense solve requires at least one right-hand side");
     DENSE_LINALG_CHECK(rhs.size() == n * rhs_count,
                              label + ": dense multi-RHS solve size mismatch");
-    DENSE_LINALG_CHECK(lu.size() == n * n && pivots.size() == n,
+    DENSE_LINALG_CHECK(lu.rows() == static_cast<Eigen::Index>(n),
                              label + ": dense solver is not factorized");
-
-    for (std::size_t k = 0; k < n; ++k) {
-        if (pivots[k] != k) {
-            for (std::size_t block = 0; block < rhs_count; block += kDenseSolveRhsBlock) {
-                const std::size_t end =
-                    std::min(rhs_count, block + kDenseSolveRhsBlock);
-                for (std::size_t r = block; r < end; ++r) {
-                    std::swap(rhs[k * rhs_count + r],
-                              rhs[pivots[k] * rhs_count + r]);
-                }
-            }
-        }
-    }
-
-    for (std::size_t row = 0; row < n; ++row) {
-        for (std::size_t col = 0; col < row; ++col) {
-            const Real factor = lu[row * n + col];
-            for (std::size_t block = 0; block < rhs_count; block += kDenseSolveRhsBlock) {
-                const std::size_t end =
-                    std::min(rhs_count, block + kDenseSolveRhsBlock);
-                for (std::size_t r = block; r < end; ++r) {
-                    rhs[row * rhs_count + r] -= factor * rhs[col * rhs_count + r];
-                }
-            }
-        }
+    if (n == 0) {
+        return;
     }
 
-    for (std::size_t rev = 0; rev < n; ++rev) {
-        const std::size_t row = n - 1u - rev;
-        for (std::size_t col = row + 1u; col < n; ++col) {
-            const Real factor = lu[row * n + col];
-            for (std::size_t block = 0; block < rhs_count; block += kDenseSolveRhsBlock) {
-                const std::size_t end =
-                    std::min(rhs_count, block + kDenseSolveRhsBlock);
-                for (std::size_t r = block; r < end; ++r) {
-                    rhs[row * rhs_count + r] -= factor * rhs[col * rhs_count + r];
-                }
-            }
-        }
-        const Real pivot = lu[row * n + row];
-        DENSE_LINALG_CHECK(
-            std::abs(pivot) > pivot_tolerance,
-            label + ": zero pivot during dense solve");
-        for (std::size_t block = 0; block < rhs_count; block += kDenseSolveRhsBlock) {
-            const std::size_t end =
-                std::min(rhs_count, block + kDenseSolveRhsBlock);
-            for (std::size_t r = block; r < end; ++r) {
-                rhs[row * rhs_count + r] /= pivot;
-            }
-        }
-    }
+    Eigen::Map<RowMajorMatrix> rhs_map(rhs.data(),
+                                       static_cast<Eigen::Index>(n),
+                                       static_cast<Eigen::Index>(rhs_count));
+    // Evaluate into a temporary: lu.solve cannot alias its argument.
+    const DenseMatrix solution = lu.solve(rhs_map);
+    rhs_map = solution;
 }
 
 std::vector<Real> DenseLUSolver::solve(std::span<const Real> rhs) const {
@@ -155,14 +120,8 @@ DenseMatrixDiagnostics dense_matrix_diagnostics(
     DENSE_LINALG_CHECK(rows > 0 && cols > 0,
                              std::string(label) + ": diagnostics require a nonempty matrix");
 
-#if defined(FE_HAS_EIGEN) && FE_HAS_EIGEN
-    using RowMajorMatrix = Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
-    using Matrix = Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic>;
-    const Eigen::Map<const RowMajorMatrix> A(matrix.data(),
-                                             static_cast<Eigen::Index>(rows),
-                                             static_cast<Eigen::Index>(cols));
-    const Matrix dense = A;
-    Eigen::JacobiSVD<Matrix> svd(dense);
+    const DenseMatrix dense = map_row_major(matrix, rows, cols);
+    Eigen::JacobiSVD<DenseMatrix> svd(dense);
 
     DenseMatrixDiagnostics diagnostics;
     const auto& singular_values = svd.singularValues();
@@ -189,22 +148,6 @@ DenseMatrixDiagnostics dense_matrix_diagnostics(
             diagnostics.smallest_retained_singular_value;
     }
     return diagnostics;
-#else
-    DenseMatrixDiagnostics diagnostics;
-    diagnostics.largest_singular_value = dense_matrix_max_abs(matrix);
-    diagnostics.tolerance =
-        dense_matrix_pivot_tolerance(rows, cols, diagnostics.largest_singular_value);
-    diagnostics.rank =
-        dense_matrix_rank(std::vector<Real>(matrix.begin(), matrix.end()), rows, cols);
-    const std::size_t full_rank = std::min(rows, cols);
-    if (diagnostics.rank == full_rank) {
-        diagnostics.smallest_retained_singular_value = diagnostics.tolerance;
-    }
-    // Exact condition estimates require SVD diagnostics. In Eigen-disabled
-    // builds this stays explicit instead of relying on a misleading estimate.
-    diagnostics.condition_estimate = std::numeric_limits<Real>::infinity();
-    return diagnostics;
-#endif
 }
 
 DenseLUSolver factor_dense_matrix(std::vector<Real> matrix,
@@ -215,55 +158,28 @@ DenseLUSolver factor_dense_matrix(std::vector<Real> matrix,
 
     DenseLUSolver solver;
     solver.n = n;
-    solver.lu = std::move(matrix);
-    solver.pivots.resize(n);
-    const Real max_abs = dense_matrix_max_abs(solver.lu);
-    solver.pivot_tolerance =
-        dense_matrix_pivot_tolerance(n, n, max_abs);
     solver.label = std::string(label);
+    const Real max_abs =
+        dense_matrix_max_abs(std::span<const Real>(matrix.data(), matrix.size()));
+    solver.pivot_tolerance = dense_matrix_pivot_tolerance(n, n, max_abs);
 
+    solver.lu.compute(map_row_major(matrix, n, n));
+
+    // Partial pivoting leaves the pivots on the diagonal of the packed LU
+    // factor; a pivot below the scale-aware tolerance marks rank deficiency.
     Real max_pivot_abs = Real(0);
     Real min_pivot_abs = std::numeric_limits<Real>::infinity();
-    for (std::size_t col = 0; col < n; ++col) {
-        std::size_t pivot_row = col;
-        Real pivot_abs = std::abs(solver.lu[col * n + col]);
-        for (std::size_t row = col + 1; row < n; ++row) {
-            const Real candidate = std::abs(solver.lu[row * n + col]);
-            if (candidate > pivot_abs) {
-                pivot_abs = candidate;
-                pivot_row = row;
-            }
-        }
-
+    const auto diagonal = solver.lu.matrixLU().diagonal();
+    for (Eigen::Index col = 0; col < diagonal.size(); ++col) {
+        const Real pivot_magnitude = std::abs(diagonal[col]);
         DENSE_LINALG_CHECK(
-            pivot_abs > solver.pivot_tolerance,
+            pivot_magnitude > solver.pivot_tolerance,
             solver.label + ": rank-deficient matrix (rank " +
                 std::to_string(col) + " of " + std::to_string(n) +
                 ", pivot below scale-aware tolerance " +
                 std::to_string(solver.pivot_tolerance) + ")");
-
-        solver.pivots[col] = pivot_row;
-        if (pivot_row != col) {
-            for (std::size_t j = 0; j < n; ++j) {
-                std::swap(solver.lu[col * n + j], solver.lu[pivot_row * n + j]);
-            }
-        }
-
-        const Real pivot = solver.lu[col * n + col];
-        DENSE_LINALG_CHECK(
-            std::abs(pivot) > solver.pivot_tolerance,
-            solver.label + ": zero pivot after row exchange");
-        const Real pivot_magnitude = std::abs(pivot);
         max_pivot_abs = std::max(max_pivot_abs, pivot_magnitude);
         min_pivot_abs = std::min(min_pivot_abs, pivot_magnitude);
-
-        for (std::size_t row = col + 1; row < n; ++row) {
-            const Real factor = solver.lu[row * n + col] / pivot;
-            solver.lu[row * n + col] = factor;
-            for (std::size_t j = col + 1; j < n; ++j) {
-                solver.lu[row * n + j] -= factor * solver.lu[col * n + j];
-            }
-        }
     }
 
     solver.diagnostics.rank = n;
@@ -293,20 +209,14 @@ DenseInverseResult invert_dense_matrix_with_diagnostics(
         dense_matrix_diagnostics(std::span<const Real>(matrix.data(), matrix.size()),
                                  n, n, label);
 
-#if defined(FE_HAS_EIGEN) && FE_HAS_EIGEN
     if (std::isfinite(solver.diagnostics.condition_estimate) &&
         std::isfinite(result.diagnostics.condition_estimate) &&
         result.diagnostics.condition_estimate > dense_matrix_condition_fallback_threshold()) {
-        using RowMajorMatrix = Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
-        using Matrix = Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic>;
-        const Eigen::Map<const RowMajorMatrix> A(matrix.data(),
-                                                 static_cast<Eigen::Index>(n),
-                                                 static_cast<Eigen::Index>(n));
-        const Matrix dense = A;
-        Eigen::JacobiSVD<Matrix> svd(dense,
-                                     Eigen::ComputeFullU | Eigen::ComputeFullV);
-        Matrix sigma_inverse = Matrix::Zero(static_cast<Eigen::Index>(n),
-                                            static_cast<Eigen::Index>(n));
+        const DenseMatrix dense = map_row_major(matrix, n, n);
+        Eigen::JacobiSVD<DenseMatrix> svd(dense,
+                                          Eigen::ComputeFullU | Eigen::ComputeFullV);
+        DenseMatrix sigma_inverse = DenseMatrix::Zero(static_cast<Eigen::Index>(n),
+                                                      static_cast<Eigen::Index>(n));
         const auto& singular_values = svd.singularValues();
         for (Eigen::Index i = 0; i < singular_values.size(); ++i) {
             DENSE_LINALG_CHECK(
@@ -314,20 +224,14 @@ DenseInverseResult invert_dense_matrix_with_diagnostics(
                 std::string(label) + ": high-condition SVD fallback encountered a dropped singular value");
             sigma_inverse(i, i) = Real(1) / singular_values[i];
         }
-        const Matrix inverse = svd.matrixV() * sigma_inverse * svd.matrixU().transpose();
-        result.inverse.assign(n * n, Real(0));
-        for (std::size_t row = 0; row < n; ++row) {
-            for (std::size_t col = 0; col < n; ++col) {
-                result.inverse[row * n + col] =
-                    inverse(static_cast<Eigen::Index>(row), static_cast<Eigen::Index>(col));
-            }
-        }
+        const DenseMatrix inverse = svd.matrixV() * sigma_inverse * svd.matrixU().transpose();
+        copy_to_row_major(inverse, result.inverse);
         result.used_svd_fallback = true;
         return result;
     }
-#endif
 
-    materialize_inverse_from_solver(solver, result.inverse);
+    const DenseMatrix inverse = solver.lu.inverse();
+    copy_to_row_major(inverse, result.inverse);
     return result;
 }
 
@@ -357,9 +261,10 @@ std::vector<Real> invert_dense_matrix(std::vector<Real> matrix,
                                       std::size_t n,
                                       std::string_view label) {
     const DenseLUSolver solver = factor_dense_matrix(std::move(matrix), n, label);
-    std::vector<Real> inverse;
-    materialize_inverse_from_solver(solver, inverse);
-    return inverse;
+    const DenseMatrix inverse = solver.lu.inverse();
+    std::vector<Real> result;
+    copy_to_row_major(inverse, result);
+    return result;
 }
 
 std::size_t dense_matrix_rank(std::vector<Real> matrix,
@@ -367,46 +272,22 @@ std::size_t dense_matrix_rank(std::vector<Real> matrix,
                               std::size_t cols) {
     DENSE_LINALG_CHECK(matrix.size() == rows * cols,
                              "dense_matrix_rank: size mismatch");
-    const Real tolerance =
-        dense_matrix_pivot_tolerance(rows, cols, dense_matrix_max_abs(matrix));
 
-    std::size_t rank = 0;
-    std::size_t pivot_row = 0;
-    for (std::size_t col = 0; col < cols && pivot_row < rows; ++col) {
-        std::size_t best_row = pivot_row;
-        Real best_abs = std::abs(matrix[pivot_row * cols + col]);
-        for (std::size_t row = pivot_row + 1; row < rows; ++row) {
-            const Real candidate = std::abs(matrix[row * cols + col]);
-            if (candidate > best_abs) {
-                best_abs = candidate;
-                best_row = row;
-            }
-        }
-        if (best_abs <= tolerance) {
-            continue;
-        }
+    const DenseMatrix dense =
+        map_row_major(std::span<const Real>(matrix.data(), matrix.size()), rows, cols);
+    Eigen::JacobiSVD<DenseMatrix> svd(dense);
 
-        if (best_row != pivot_row) {
-            for (std::size_t c = col; c < cols; ++c) {
-                std::swap(matrix[pivot_row * cols + c], matrix[best_row * cols + c]);
-            }
-        }
+    const auto& singular_values = svd.singularValues();
+    const Real largest =
+        (singular_values.size() > 0) ? singular_values[0] : Real(0);
+    const Real tolerance =
+        dense_matrix_singular_value_tolerance(rows, cols, largest);
 
-        const Real pivot = matrix[pivot_row * cols + col];
-        for (std::size_t row = pivot_row + 1; row < rows; ++row) {
-            const Real factor = matrix[row * cols + col] / pivot;
-            if (std::abs(factor) <= tolerance) {
-                matrix[row * cols + col] = Real(0);
-                continue;
-            }
-            matrix[row * cols + col] = Real(0);
-            for (std::size_t c = col + 1; c < cols; ++c) {
-                matrix[row * cols + c] -= factor * matrix[pivot_row * cols + c];
-            }
+    std::size_t rank = 0;
+    for (Eigen::Index i = 0; i < singular_values.size(); ++i) {
+        if (singular_values[i] > tolerance) {
+            ++rank;
         }
-
-        ++rank;
-        ++pivot_row;
     }
     return rank;
 }
@@ -421,17 +302,10 @@ DensePseudoInverseResult rank_revealing_pseudo_inverse(
     DENSE_LINALG_CHECK(rows > 0 && cols > 0,
                              std::string(label) + ": pseudo-inverse requires a nonempty matrix");
 
-#if defined(FE_HAS_EIGEN) && FE_HAS_EIGEN
-    using RowMajorMatrix = Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
-    using Matrix = Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic>;
-    const Eigen::Map<const RowMajorMatrix> A(matrix.data(),
-                                             static_cast<Eigen::Index>(rows),
-                                             static_cast<Eigen::Index>(cols));
-    const Matrix dense = A;
-    Eigen::JacobiSVD<Matrix> svd(dense, Eigen::ComputeFullU | Eigen::ComputeFullV);
+    const DenseMatrix dense = map_row_major(matrix, rows, cols);
+    Eigen::JacobiSVD<DenseMatrix> svd(dense, Eigen::ComputeFullU | Eigen::ComputeFullV);
 
     DensePseudoInverseResult result;
-    result.inverse.assign(cols * rows, Real(0));
 
     const auto& singular_values = svd.singularValues();
     result.largest_singular_value =
@@ -439,8 +313,8 @@ DensePseudoInverseResult rank_revealing_pseudo_inverse(
     result.tolerance =
         dense_matrix_singular_value_tolerance(rows, cols, result.largest_singular_value);
 
-    Matrix sigma_inverse = Matrix::Zero(static_cast<Eigen::Index>(cols),
-                                        static_cast<Eigen::Index>(rows));
+    DenseMatrix sigma_inverse = DenseMatrix::Zero(static_cast<Eigen::Index>(cols),
+                                                  static_cast<Eigen::Index>(rows));
     for (Eigen::Index i = 0; i < singular_values.size(); ++i) {
         const Real sigma = singular_values[i];
         if (sigma <= result.tolerance) {
@@ -451,22 +325,10 @@ DensePseudoInverseResult rank_revealing_pseudo_inverse(
         result.smallest_retained_singular_value = sigma;
     }
 
-    const Matrix pseudo_inverse =
+    const DenseMatrix pseudo_inverse =
         svd.matrixV() * sigma_inverse * svd.matrixU().transpose();
-    for (std::size_t r = 0; r < cols; ++r) {
-        for (std::size_t c = 0; c < rows; ++c) {
-            result.inverse[r * rows + c] =
-                pseudo_inverse(static_cast<Eigen::Index>(r), static_cast<Eigen::Index>(c));
-        }
-    }
+    copy_to_row_major(pseudo_inverse, result.inverse);
     return result;
-#else
-    DENSE_LINALG_CHECK(
-        false,
-        std::string(label) +
-            ": rank-revealing pseudo-inverse requires FE_ENABLE_EIGEN");
-    return {};
-#endif
 }
 
 } // namespace math
diff --git a/Code/Source/solver/FE/Math/DenseLinearAlgebra.h b/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
index 6c81755f4..d322ef958 100644
--- a/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
+++ b/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
@@ -6,6 +6,8 @@
 
 #include "Types.h"
 
+#include <Eigen/Dense>
+
 #include <cstddef>
 #include <limits>
 #include <span>
@@ -18,7 +20,7 @@ namespace FE {
 namespace math {
 
 // Dense solve, inverse, rank, and pseudo-inverse support for FE construction
-// utilities. Matrices are row-major: matrix[row * cols + col].
+// utilities, backed by Eigen. Matrices are row-major: matrix[row * cols + col].
 [[nodiscard]] Real dense_matrix_max_abs(std::span<const Real> matrix) noexcept;
 
 [[nodiscard]] Real dense_matrix_pivot_tolerance(std::size_t rows,
@@ -57,9 +59,10 @@ struct DenseInverseResult {
 [[nodiscard]] Real dense_matrix_condition_error_threshold() noexcept;
 
 struct DenseLUSolver {
+    using DenseMatrix = Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic>;
+
     std::size_t n{0};
-    std::vector<Real> lu;
-    std::vector<std::size_t> pivots;
+    Eigen::PartialPivLU<DenseMatrix> lu;
     DenseMatrixDiagnostics diagnostics;
     Real pivot_tolerance{0};
     std::string label;
diff --git a/Code/Source/solver/FE/Math/DenseTransformKernels.h b/Code/Source/solver/FE/Math/DenseTransformKernels.h
index 50f1002de..2ddb9cefa 100644
--- a/Code/Source/solver/FE/Math/DenseTransformKernels.h
+++ b/Code/Source/solver/FE/Math/DenseTransformKernels.h
@@ -6,17 +6,21 @@
 
 #include "Types.h"
 
-#include <algorithm>
-#include <array>
+#include <Eigen/Core>
+
 #include <cstddef>
 
 namespace svmp {
 namespace FE {
 namespace math {
 
-constexpr std::size_t dense_transform_blocked_min_rows() noexcept { return 32u; }
-constexpr std::size_t dense_transform_blocked_min_rhs() noexcept { return 4u; }
-
+/// \brief Apply a row-major dense matrix to a batch of right-hand sides.
+///
+/// Computes output = matrix * input where matrix is rows-by-cols (row-major),
+/// input holds cols rows of rhs_count values each (row stride
+/// input_row_stride), and output holds rows rows of rhs_count values each
+/// (row stride output_row_stride). Strides may exceed rhs_count for padded
+/// layouts; padding entries are left untouched.
 inline void dense_transform_batched_row_major(
     const Real* SVMP_RESTRICT matrix,
     std::size_t rows,
@@ -30,41 +34,29 @@ inline void dense_transform_batched_row_major(
         return;
     }
 
-    if (rows < dense_transform_blocked_min_rows() ||
-        rhs_count < dense_transform_blocked_min_rhs()) {
-        for (std::size_t row = 0; row < rows; ++row) {
-            const Real* matrix_row = matrix + row * cols;
-            Real* output_row = output + row * output_row_stride;
-            for (std::size_t rhs = 0; rhs < rhs_count; ++rhs) {
-                Real value = Real(0);
-                for (std::size_t col = 0; col < cols; ++col) {
-                    value += matrix_row[col] * input[col * input_row_stride + rhs];
-                }
-                output_row[rhs] = value;
-            }
-        }
-        return;
-    }
+    using RowMajorMatrix =
+        Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+    using ConstMap = Eigen::Map<const RowMajorMatrix>;
+    using ConstStridedMap =
+        Eigen::Map<const RowMajorMatrix, Eigen::Unaligned, Eigen::OuterStride<>>;
+    using StridedMap =
+        Eigen::Map<RowMajorMatrix, Eigen::Unaligned, Eigen::OuterStride<>>;
 
-    constexpr std::size_t kRhsBlock = 32u;
-    for (std::size_t row = 0; row < rows; ++row) {
-        const Real* matrix_row = matrix + row * cols;
-        Real* output_row = output + row * output_row_stride;
-        for (std::size_t rhs_base = 0; rhs_base < rhs_count; rhs_base += kRhsBlock) {
-            const std::size_t block_size = std::min(kRhsBlock, rhs_count - rhs_base);
-            std::array<Real, kRhsBlock> accum{};
-            for (std::size_t col = 0; col < cols; ++col) {
-                const Real coeff = matrix_row[col];
-                const Real* input_row = input + col * input_row_stride + rhs_base;
-                for (std::size_t rhs = 0; rhs < block_size; ++rhs) {
-                    accum[rhs] += coeff * input_row[rhs];
-                }
-            }
-            for (std::size_t rhs = 0; rhs < block_size; ++rhs) {
-                output_row[rhs_base + rhs] = accum[rhs];
-            }
-        }
-    }
+    const ConstMap matrix_map(matrix,
+                              static_cast<Eigen::Index>(rows),
+                              static_cast<Eigen::Index>(cols));
+    const ConstStridedMap input_map(
+        input,
+        static_cast<Eigen::Index>(cols),
+        static_cast<Eigen::Index>(rhs_count),
+        Eigen::OuterStride<>(static_cast<Eigen::Index>(input_row_stride)));
+    StridedMap output_map(
+        output,
+        static_cast<Eigen::Index>(rows),
+        static_cast<Eigen::Index>(rhs_count),
+        Eigen::OuterStride<>(static_cast<Eigen::Index>(output_row_stride)));
+
+    output_map.noalias() = matrix_map * input_map;
 }
 
 } // namespace math
diff --git a/Code/Source/solver/FE/Math/Matrix.h b/Code/Source/solver/FE/Math/Matrix.h
index f7432f38c..ce1d4a612 100644
--- a/Code/Source/solver/FE/Math/Matrix.h
+++ b/Code/Source/solver/FE/Math/Matrix.h
@@ -6,32 +6,25 @@
 
 /**
  * @file Matrix.h
- * @brief Fixed-size matrices with expression templates and specializations for FE computations
+ * @brief Fixed-size matrix types for FE computations, backed by Eigen.
  *
- * This header provides optimized fixed-size matrix operations for element-level
- * computations. Includes specialized analytical formulas for 2x2 and 3x3 matrices
- * (determinant, inverse using Cramer's rule) and Gauss elimination for larger matrices.
- * All operations use expression templates to eliminate temporaries.
+ * The FE library standardizes on Eigen for linear algebra. These aliases give
+ * element-level code a stable vocabulary type without re-exporting all of
+ * Eigen. Storage is Eigen's default (column-major); element access through
+ * operator()(row, col) is unchanged. Note that, unlike the previous in-house
+ * implementation, Eigen types are NOT zero-initialized by default
+ * construction; use Matrix::Zero() where a zeroed value is required.
  */
 
-#include "MatrixExpr.h"
 #include "Vector.h"
-#include "../Common/Types.h"
-#include <algorithm>
-#include <array>
-#include <cmath>
-#include <initializer_list>
-#include <ostream>
-#include <stdexcept>
-#include <type_traits>
+
+#include <Eigen/Core>
+
+#include <cstddef>
 
 /// \defgroup FE_MatrixMath Matrix
 /// \ingroup FE_Math
-/// \brief Fixed-size matrix types, matrix expressions, and small-matrix operations.
-///
-/// \details The Matrix submodule contains row-major fixed-size matrices used
-/// by FE kernels, expression-template support for matrix algebra, and direct
-/// determinant/inverse implementations for common element-level sizes.
+/// \brief Fixed-size matrix type aliases.
 
 namespace svmp {
 namespace FE {
@@ -43,1266 +36,9 @@ namespace math {
  * @tparam T Scalar type (float, double)
  * @tparam M Number of rows
  * @tparam N Number of columns
- *
- * Storage is row-major for cache efficiency. Memory is aligned for SIMD operations.
- * Specializations exist for 2x2, 3x3, 4x4 matrices with analytical algorithms.
  */
 template<typename T, std::size_t M, std::size_t N>
-class Matrix : public MatrixExpr<Matrix<T, M, N>> {
-    static_assert(std::is_arithmetic_v<T>, "T must be an arithmetic type");
-    static_assert(M > 0 && N > 0, "Matrix dimensions must be positive");
-
-private:
-    alignas(kFEFixedObjectAlignmentBytes) T data_[M * N];  // Row-major, SIMD-friendly storage
-
-    // Helper to compute linear index from (i,j)
-    static constexpr std::size_t index(std::size_t i, std::size_t j) {
-        return i * N + j;
-    }
-
-public:
-    // Type definitions
-    using value_type = T;
-    using size_type = std::size_t;
-    using reference = T&;
-    using const_reference = const T&;
-    using pointer = T*;
-    using const_pointer = const T*;
-
-    /**
-     * @brief Default constructor - zero initializes all elements
-     */
-    constexpr Matrix() : data_{} {}
-
-    /**
-     * @brief Fill constructor - initializes all elements with same value
-     * @param value Value to fill matrix with
-     */
-    constexpr explicit Matrix(T value) {
-        for (size_type i = 0; i < M * N; ++i) {
-            data_[i] = value;
-        }
-    }
-
-    /**
-     * @brief Initializer list constructor for row-wise initialization
-     * @param init Nested initializer lists {{row0}, {row1}, ...}
-     */
-    constexpr Matrix(std::initializer_list<std::initializer_list<T>> init) : data_{} {
-        size_type row = 0;
-        for (auto row_init : init) {
-            if (row >= M) break;
-            size_type col = 0;
-            for (auto val : row_init) {
-                if (col >= N) break;
-                (*this)(row, col) = val;
-                ++col;
-            }
-            ++row;
-        }
-    }
-
-    /**
-     * @brief Constructor from expression template
-     * @tparam Expr Expression type
-     * @param expr Matrix expression to evaluate
-     */
-    template<typename Expr>
-    Matrix(const MatrixExpr<Expr>& expr) {
-        const auto& e = expr.derived();
-        for (size_type i = 0; i < M; ++i) {
-            for (size_type j = 0; j < N; ++j) {
-                (*this)(i, j) = e(i, j);
-            }
-        }
-    }
-
-    /**
-     * @brief Copy constructor
-     */
-    constexpr Matrix(const Matrix&) = default;
-
-    /**
-     * @brief Move constructor
-     */
-    constexpr Matrix(Matrix&&) noexcept = default;
-
-    /**
-     * @brief Copy assignment
-     */
-    Matrix& operator=(const Matrix&) = default;
-
-    /**
-     * @brief Move assignment
-     */
-    Matrix& operator=(Matrix&&) noexcept = default;
-
-    /**
-     * @brief Assignment from expression template
-     * @tparam Expr Expression type
-     * @param expr Matrix expression to evaluate
-     * @return Reference to this
-     */
-    template<typename Expr>
-    Matrix& operator=(const MatrixExpr<Expr>& expr) {
-        const auto& e = expr.derived();
-        for (size_type i = 0; i < M; ++i) {
-            for (size_type j = 0; j < N; ++j) {
-                (*this)(i, j) = e(i, j);
-            }
-        }
-        return *this;
-    }
-
-    /**
-     * @brief Get number of rows (compile-time constant)
-     * @return Number of rows
-     */
-    static constexpr size_type rows() { return M; }
-
-    /**
-     * @brief Get number of columns (compile-time constant)
-     * @return Number of columns
-     */
-    static constexpr size_type cols() { return N; }
-
-    /**
-     * @brief Get total number of elements
-     * @return M * N
-     */
-    static constexpr size_type size() { return M * N; }
-
-    /**
-     * @brief Element access (no bounds checking)
-     * @param i Row index
-     * @param j Column index
-     * @return Reference to element
-     */
-    constexpr T& operator()(size_type i, size_type j) {
-        return data_[index(i, j)];
-    }
-
-    /**
-     * @brief Element access (no bounds checking) - const version
-     * @param i Row index
-     * @param j Column index
-     * @return Const reference to element
-     */
-    constexpr const T& operator()(size_type i, size_type j) const {
-        return data_[index(i, j)];
-    }
-
-    /**
-     * @brief Element access with bounds checking
-     * @param i Row index
-     * @param j Column index
-     * @return Reference to element
-     * @throws std::out_of_range if indices are out of bounds
-     */
-    T& at(size_type i, size_type j) {
-        if (i >= M || j >= N) {
-            throw std::out_of_range("Matrix::at: index out of range");
-        }
-        return (*this)(i, j);
-    }
-
-    /**
-     * @brief Element access with bounds checking - const version
-     * @param i Row index
-     * @param j Column index
-     * @return Const reference to element
-     * @throws std::out_of_range if indices are out of bounds
-     */
-    const T& at(size_type i, size_type j) const {
-        if (i >= M || j >= N) {
-            throw std::out_of_range("Matrix::at: index out of range");
-        }
-        return (*this)(i, j);
-    }
-
-    /**
-     * @brief Get row as vector
-     * @param i Row index
-     * @return Vector containing row elements
-     */
-    Vector<T, N> row(size_type i) const {
-        Vector<T, N> result;
-        for (size_type j = 0; j < N; ++j) {
-            result[j] = (*this)(i, j);
-        }
-        return result;
-    }
-
-    /**
-     * @brief Get column as vector
-     * @param j Column index
-     * @return Vector containing column elements
-     */
-    Vector<T, M> column(size_type j) const {
-        Vector<T, M> result;
-        for (size_type i = 0; i < M; ++i) {
-            result[i] = (*this)(i, j);
-        }
-        return result;
-    }
-
-    /**
-     * @brief Get column as vector (alias for column)
-     * @param j Column index
-     * @return Vector containing column elements
-     */
-    Vector<T, M> col(size_type j) const {
-        return column(j);
-    }
-
-    /**
-     * @brief Set row from vector
-     * @param i Row index
-     * @param v Vector of values
-     */
-    void set_row(size_type i, const Vector<T, N>& v) {
-        for (size_type j = 0; j < N; ++j) {
-            (*this)(i, j) = v[j];
-        }
-    }
-
-    /**
-     * @brief Set column from vector
-     * @param j Column index
-     * @param v Vector of values
-     */
-    void set_column(size_type j, const Vector<T, M>& v) {
-        for (size_type i = 0; i < M; ++i) {
-            (*this)(i, j) = v[i];
-        }
-    }
-
-    /**
-     * @brief Set column from vector (alias for set_column)
-     * @param j Column index
-     * @param v Vector of values
-     */
-    void set_col(size_type j, const Vector<T, M>& v) {
-        set_column(j, v);
-    }
-
-    /**
-     * @brief Get pointer to underlying data
-     * @return Pointer to first element
-     */
-    T* data() { return data_; }
-    const T* data() const { return data_; }
-
-    /**
-     * @brief Fill matrix with value
-     * @param value Value to fill with
-     */
-    void fill(T value) {
-        for (size_type i = 0; i < M * N; ++i) {
-            data_[i] = value;
-        }
-    }
-
-    /**
-     * @brief Set all elements to zero
-     */
-    void set_zero() {
-        fill(T{0});
-    }
-
-    // Arithmetic operators
-
-    /**
-     * @brief In-place addition
-     * @param other Matrix to add
-     * @return Reference to this
-     */
-    Matrix& operator+=(const Matrix& other) {
-        for (size_type i = 0; i < M * N; ++i) {
-            data_[i] += other.data_[i];
-        }
-        return *this;
-    }
-
-    /**
-     * @brief In-place subtraction
-     * @param other Matrix to subtract
-     * @return Reference to this
-     */
-    Matrix& operator-=(const Matrix& other) {
-        for (size_type i = 0; i < M * N; ++i) {
-            data_[i] -= other.data_[i];
-        }
-        return *this;
-    }
-
-    /**
-     * @brief In-place scalar multiplication
-     * @param scalar Scalar to multiply by
-     * @return Reference to this
-     */
-    Matrix& operator*=(T scalar) {
-        for (size_type i = 0; i < M * N; ++i) {
-            data_[i] *= scalar;
-        }
-        return *this;
-    }
-
-    /**
-     * @brief In-place scalar division
-     * @param scalar Scalar to divide by
-     * @return Reference to this
-     */
-    Matrix& operator/=(T scalar) {
-        const T inv = T(1) / scalar;
-        return (*this) *= inv;
-    }
-
-    // Matrix operations
-
-    /**
-     * @brief Compute transpose
-     * @return Transposed matrix
-     */
-    Matrix<T, N, M> transpose() const {
-        Matrix<T, N, M> result;
-        for (size_type i = 0; i < M; ++i) {
-            for (size_type j = 0; j < N; ++j) {
-                result(j, i) = (*this)(i, j);
-            }
-        }
-        return result;
-    }
-
-    /**
-     * @brief Compute trace (sum of diagonal elements)
-     * @return Trace (only valid for square matrices)
-     */
-    template<std::size_t M2 = M, std::size_t N2 = N>
-    std::enable_if_t<M2 == N2, T> trace() const {
-        T result = T(0);
-        for (size_type i = 0; i < M; ++i) {
-            result += (*this)(i, i);
-        }
-        return result;
-    }
-
-    /**
-     * @brief Compute Frobenius norm squared
-     * @return Sum of squares of all elements
-     */
-    T frobenius_norm_squared() const {
-        T result = T(0);
-        for (size_type i = 0; i < M * N; ++i) {
-            result += data_[i] * data_[i];
-        }
-        return result;
-    }
-
-    /**
-     * @brief Compute Frobenius norm
-     * @return Square root of sum of squares
-     */
-    T frobenius_norm() const {
-        using std::sqrt;
-        return sqrt(frobenius_norm_squared());
-    }
-
-    /**
-     * @brief Compute infinity norm (maximum absolute row sum)
-     * @return Infinity norm
-     */
-    T infinity_norm() const {
-        T max_row_sum = T(0);
-        for (size_type i = 0; i < M; ++i) {
-            T row_sum = T(0);
-            for (size_type j = 0; j < N; ++j) {
-                using std::abs;
-                row_sum += abs((*this)(i, j));
-            }
-            max_row_sum = std::max(max_row_sum, row_sum);
-        }
-        return max_row_sum;
-    }
-
-    /**
-     * @brief Compute one norm (maximum absolute column sum)
-     * @return One norm
-     */
-    T one_norm() const {
-        T max_col_sum = T(0);
-        for (size_type j = 0; j < N; ++j) {
-            T col_sum = T(0);
-            for (size_type i = 0; i < M; ++i) {
-                using std::abs;
-                col_sum += abs((*this)(i, j));
-            }
-            max_col_sum = std::max(max_col_sum, col_sum);
-        }
-        return max_col_sum;
-    }
-
-    /**
-     * @brief Get minimum element
-     * @return Minimum value
-     */
-    T min() const {
-        return *std::min_element(data_, data_ + M * N);
-    }
-
-    /**
-     * @brief Get maximum element
-     * @return Maximum value
-     */
-    T max() const {
-        return *std::max_element(data_, data_ + M * N);
-    }
-
-    /**
-     * @brief Get sum of all elements
-     * @return Sum of elements
-     */
-    T sum() const {
-        T result = T(0);
-        for (size_type i = 0; i < M * N; ++i) {
-            result += data_[i];
-        }
-        return result;
-    }
-
-    // Static factory functions
-
-    /**
-     * @brief Create zero matrix
-     * @return Matrix with all elements zero
-     */
-    static constexpr Matrix zeros() {
-        return Matrix();
-    }
-
-    /**
-     * @brief Create matrix with all elements one
-     * @return Matrix with all elements one
-     */
-    static constexpr Matrix ones() {
-        return Matrix(T(1));
-    }
-
-    /**
-     * @brief Create identity matrix (only for square matrices)
-     * @return Identity matrix
-     */
-    template<std::size_t M2 = M, std::size_t N2 = N>
-    static std::enable_if_t<M2 == N2, Matrix> identity() {
-        Matrix result;
-        for (size_type i = 0; i < M; ++i) {
-            result(i, i) = T(1);
-        }
-        return result;
-    }
-
-    /**
-     * @brief Create diagonal matrix from vector (only for square matrices)
-     * @param diag Vector of diagonal elements
-     * @return Diagonal matrix
-     */
-    template<std::size_t M2 = M, std::size_t N2 = N>
-    static std::enable_if_t<M2 == N2, Matrix> diagonal(const Vector<T, M>& diag) {
-        Matrix result;
-        for (size_type i = 0; i < M; ++i) {
-            result(i, i) = diag[i];
-        }
-        return result;
-    }
-
-    /**
-     * @brief Create zero matrix (static factory)
-     * @return Zero matrix
-     */
-    static Matrix zero() {
-        return zeros();
-    }
-
-    // Property checking methods
-
-    /**
-     * @brief Check if matrix is symmetric (only for square matrices)
-     * @param tol Tolerance for comparison
-     * @return true if symmetric
-     */
-    template<std::size_t M2 = M, std::size_t N2 = N>
-    std::enable_if_t<M2 == N2, bool> is_symmetric(T tol = tolerance<T>) const {
-        for (size_type i = 0; i < M; ++i) {
-            for (size_type j = i + 1; j < N; ++j) {
-                using std::abs;
-                if (abs((*this)(i, j) - (*this)(j, i)) > tol) {
-                    return false;
-                }
-            }
-        }
-        return true;
-    }
-
-    /**
-     * @brief Check if matrix is skew-symmetric (only for square matrices)
-     * @param tol Tolerance for comparison
-     * @return true if skew-symmetric
-     */
-    template<std::size_t M2 = M, std::size_t N2 = N>
-    std::enable_if_t<M2 == N2, bool> is_skew_symmetric(T tol = tolerance<T>) const {
-        for (size_type i = 0; i < M; ++i) {
-            // Diagonal must be zero
-            using std::abs;
-            if (abs((*this)(i, i)) > tol) {
-                return false;
-            }
-            for (size_type j = i + 1; j < N; ++j) {
-                if (abs((*this)(i, j) + (*this)(j, i)) > tol) {
-                    return false;
-                }
-            }
-        }
-        return true;
-    }
-
-    /**
-     * @brief Check if matrix is diagonal (only for square matrices)
-     * @param tol Tolerance for comparison
-     * @return true if diagonal
-     */
-    template<std::size_t M2 = M, std::size_t N2 = N>
-    std::enable_if_t<M2 == N2, bool> is_diagonal(T tol = tolerance<T>) const {
-        for (size_type i = 0; i < M; ++i) {
-            for (size_type j = 0; j < N; ++j) {
-                if (i != j) {
-                    using std::abs;
-                    if (abs((*this)(i, j)) > tol) {
-                        return false;
-                    }
-                }
-            }
-        }
-        return true;
-    }
-
-    // Determinant (general template, specialized for 2x2, 3x3)
-    /**
-     * @brief Compute determinant (only for square matrices)
-     * @return Determinant value
-     */
-    template<std::size_t M2 = M, std::size_t N2 = N>
-    std::enable_if_t<M2 == N2 && M2 != 2 && M2 != 3, T> determinant() const {
-        // For 4x4 and larger, use LU decomposition
-        return determinant_lu();
-    }
-
-    // Inverse (general template, specialized for 2x2, 3x3)
-    /**
-     * @brief Compute matrix inverse (only for square matrices)
-     * @return Inverse matrix
-     */
-    template<std::size_t M2 = M, std::size_t N2 = N>
-    std::enable_if_t<M2 == N2 && M2 != 2 && M2 != 3, Matrix> inverse() const {
-        // For 4x4 and larger, use Gauss-Jordan elimination
-        return inverse_gauss_jordan();
-    }
-
-private:
-    // LU decomposition for determinant (4x4 and larger)
-    T determinant_lu() const {
-        Matrix<T, M, M> lu = *this;
-        T det = T(1);
-
-        for (size_type k = 0; k < M - 1; ++k) {
-            // Find pivot
-            size_type pivot = k;
-            T max_val = std::abs(lu(k, k));
-            for (size_type i = k + 1; i < M; ++i) {
-                T val = std::abs(lu(i, k));
-                if (val > max_val) {
-                    max_val = val;
-                    pivot = i;
-                }
-            }
-
-            // Swap rows if needed
-            if (pivot != k) {
-                for (size_type j = 0; j < M; ++j) {
-                    std::swap(lu(k, j), lu(pivot, j));
-                }
-                det = -det;  // Row swap changes sign
-            }
-
-            // Check for singularity
-            if (approx_zero(lu(k, k))) {
-                return T(0);
-            }
-
-            // Eliminate column
-            for (size_type i = k + 1; i < M; ++i) {
-                T factor = lu(i, k) / lu(k, k);
-                for (size_type j = k + 1; j < M; ++j) {
-                    lu(i, j) -= factor * lu(k, j);
-                }
-            }
-
-            det *= lu(k, k);
-        }
-        det *= lu(M - 1, M - 1);
-
-        return det;
-    }
-
-    // Gauss-Jordan elimination for inverse (4x4 and larger)
-    Matrix inverse_gauss_jordan() const {
-        Matrix<T, M, M> aug;  // Augmented matrix [A | I]
-        Matrix<T, M, M> result = Matrix::identity();
-
-        // Copy this matrix to augmented matrix
-        for (size_type i = 0; i < M; ++i) {
-            for (size_type j = 0; j < M; ++j) {
-                aug(i, j) = (*this)(i, j);
-            }
-        }
-
-        // Forward elimination with partial pivoting
-        for (size_type k = 0; k < M; ++k) {
-            // Find pivot
-            size_type pivot = k;
-            T max_val = std::abs(aug(k, k));
-            for (size_type i = k + 1; i < M; ++i) {
-                T val = std::abs(aug(i, k));
-                if (val > max_val) {
-                    max_val = val;
-                    pivot = i;
-                }
-            }
-
-            // Swap rows
-            if (pivot != k) {
-                for (size_type j = 0; j < M; ++j) {
-                    std::swap(aug(k, j), aug(pivot, j));
-                    std::swap(result(k, j), result(pivot, j));
-                }
-            }
-
-            // Check for singularity
-            if (approx_zero(aug(k, k))) {
-                throw std::runtime_error("Matrix is singular");
-            }
-
-            // Scale pivot row
-            T pivot_val = aug(k, k);
-            for (size_type j = 0; j < M; ++j) {
-                aug(k, j) /= pivot_val;
-                result(k, j) /= pivot_val;
-            }
-
-            // Eliminate column
-            for (size_type i = 0; i < M; ++i) {
-                if (i != k) {
-                    T factor = aug(i, k);
-                    for (size_type j = 0; j < M; ++j) {
-                        aug(i, j) -= factor * aug(k, j);
-                        result(i, j) -= factor * result(k, j);
-                    }
-                }
-            }
-        }
-
-        return result;
-    }
-
-    // Iterators
-public:
-    T* begin() { return data_; }
-    T* end() { return data_ + M * N; }
-    const T* begin() const { return data_; }
-    const T* end() const { return data_ + M * N; }
-    const T* cbegin() const { return data_; }
-    const T* cend() const { return data_ + M * N; }
-};
-
-// Specialization for 2x2 determinant (analytical formula)
-template<typename T>
-inline T determinant_2x2(const Matrix<T, 2, 2>& m) {
-    return m(0, 0) * m(1, 1) - m(0, 1) * m(1, 0);
-}
-
-// Specialization for 2x2 inverse (Cramer's rule)
-template<typename T>
-inline Matrix<T, 2, 2> inverse_2x2(const Matrix<T, 2, 2>& m) {
-    T det = determinant_2x2(m);
-    if (approx_zero(det)) {
-        throw std::runtime_error("Matrix is singular");
-    }
-
-    T inv_det = T(1) / det;
-    return Matrix<T, 2, 2>{
-        { m(1, 1) * inv_det, -m(0, 1) * inv_det},
-        {-m(1, 0) * inv_det,  m(0, 0) * inv_det}
-    };
-}
-
-// Specialization for 3x3 determinant (Sarrus rule)
-template<typename T>
-inline T determinant_3x3(const Matrix<T, 3, 3>& m) {
-    return m(0, 0) * (m(1, 1) * m(2, 2) - m(1, 2) * m(2, 1))
-         - m(0, 1) * (m(1, 0) * m(2, 2) - m(1, 2) * m(2, 0))
-         + m(0, 2) * (m(1, 0) * m(2, 1) - m(1, 1) * m(2, 0));
-}
-
-// Specialization for 3x3 inverse (Cramer's rule / adjugate method)
-template<typename T>
-inline Matrix<T, 3, 3> inverse_3x3(const Matrix<T, 3, 3>& m) {
-    T det = determinant_3x3(m);
-    if (approx_zero(det)) {
-        throw std::runtime_error("Matrix is singular");
-    }
-
-    T inv_det = T(1) / det;
-
-    // Compute adjugate matrix (transpose of cofactor matrix)
-    Matrix<T, 3, 3> adj;
-    adj(0, 0) =  (m(1, 1) * m(2, 2) - m(1, 2) * m(2, 1));
-    adj(0, 1) = -(m(0, 1) * m(2, 2) - m(0, 2) * m(2, 1));
-    adj(0, 2) =  (m(0, 1) * m(1, 2) - m(0, 2) * m(1, 1));
-
-    adj(1, 0) = -(m(1, 0) * m(2, 2) - m(1, 2) * m(2, 0));
-    adj(1, 1) =  (m(0, 0) * m(2, 2) - m(0, 2) * m(2, 0));
-    adj(1, 2) = -(m(0, 0) * m(1, 2) - m(0, 2) * m(1, 0));
-
-    adj(2, 0) =  (m(1, 0) * m(2, 1) - m(1, 1) * m(2, 0));
-    adj(2, 1) = -(m(0, 0) * m(2, 1) - m(0, 1) * m(2, 0));
-    adj(2, 2) =  (m(0, 0) * m(1, 1) - m(0, 1) * m(1, 0));
-
-    return adj * inv_det;
-}
-
-/**
- * @brief Specialized fixed-size 2-by-2 matrix for element-level computations.
- * @ingroup FE_MatrixMath
- * @tparam T Scalar type.
- *
- * This specialization preserves the Matrix API while using direct formulas for
- * 2-by-2 determinant and inverse operations.
- */
-template<typename T>
-class Matrix<T, 2, 2> : public MatrixExpr<Matrix<T, 2, 2>> {
-    static constexpr std::size_t M = 2;
-    static constexpr std::size_t N = 2;
-
-private:
-    alignas(kFEFixedObjectAlignmentBytes) T data_[4];
-
-    static constexpr std::size_t index(std::size_t i, std::size_t j) {
-        return i * 2 + j;
-    }
-
-public:
-    using value_type = T;
-    using size_type = std::size_t;
-
-    // Include all the same constructors and methods as the general template
-    constexpr Matrix() : data_{} {}
-    constexpr explicit Matrix(T value) {
-        for (size_type i = 0; i < 4; ++i) {
-            data_[i] = value;
-        }
-    }
-    constexpr Matrix(std::initializer_list<std::initializer_list<T>> init) : data_{} {
-        size_type row = 0;
-        for (auto row_init : init) {
-            if (row >= 2) break;
-            size_type col = 0;
-            for (auto val : row_init) {
-                if (col >= 2) break;
-                (*this)(row, col) = val;
-                ++col;
-            }
-            ++row;
-        }
-    }
-
-    template<typename Expr>
-    Matrix(const MatrixExpr<Expr>& expr) {
-        const auto& e = expr.derived();
-        for (size_type i = 0; i < 2; ++i) {
-            for (size_type j = 0; j < 2; ++j) {
-                (*this)(i, j) = e(i, j);
-            }
-        }
-    }
-
-    constexpr Matrix(const Matrix&) = default;
-    constexpr Matrix(Matrix&&) noexcept = default;
-    Matrix& operator=(const Matrix&) = default;
-    Matrix& operator=(Matrix&&) noexcept = default;
-
-    template<typename Expr>
-    Matrix& operator=(const MatrixExpr<Expr>& expr) {
-        const auto& e = expr.derived();
-        for (size_type i = 0; i < 2; ++i) {
-            for (size_type j = 0; j < 2; ++j) {
-                (*this)(i, j) = e(i, j);
-            }
-        }
-        return *this;
-    }
-
-    static constexpr size_type rows() { return 2; }
-    static constexpr size_type cols() { return 2; }
-    static constexpr size_type size() { return 4; }
-
-    constexpr T& operator()(size_type i, size_type j) {
-        return data_[index(i, j)];
-    }
-    constexpr const T& operator()(size_type i, size_type j) const {
-        return data_[index(i, j)];
-    }
-
-    T* data() { return data_; }
-    const T* data() const { return data_; }
-
-    void fill(T value) {
-        for (size_type i = 0; i < 4; ++i) {
-            data_[i] = value;
-        }
-    }
-
-    void set_zero() { fill(T{0}); }
-
-    void set_row(size_type i, const Vector<T, 2>& v) {
-        for (size_type j = 0; j < 2; ++j) {
-            (*this)(i, j) = v[j];
-        }
-    }
-
-    void set_column(size_type j, const Vector<T, 2>& v) {
-        for (size_type i = 0; i < 2; ++i) {
-            (*this)(i, j) = v[i];
-        }
-    }
-
-    void set_col(size_type j, const Vector<T, 2>& v) {
-        set_column(j, v);
-    }
-
-    Vector<T, 2> col(size_type j) const {
-        return column(j);
-    }
-
-    static Matrix zero() {
-        return zeros();
-    }
-
-    static Matrix diagonal(const Vector<T, 2>& diag) {
-        Matrix result;
-        result(0, 0) = diag[0];
-        result(1, 1) = diag[1];
-        return result;
-    }
-
-    bool is_symmetric(T tol = tolerance<T>) const {
-        using std::abs;
-        return abs((*this)(0, 1) - (*this)(1, 0)) <= tol;
-    }
-
-    bool is_skew_symmetric(T tol = tolerance<T>) const {
-        using std::abs;
-        // Diagonal must be zero
-        if (abs((*this)(0, 0)) > tol || abs((*this)(1, 1)) > tol) {
-            return false;
-        }
-        // Off-diagonal must be opposite
-        return abs((*this)(0, 1) + (*this)(1, 0)) <= tol;
-    }
-
-    bool is_diagonal(T tol = tolerance<T>) const {
-        using std::abs;
-        return abs((*this)(0, 1)) <= tol && abs((*this)(1, 0)) <= tol;
-    }
-
-    T frobenius_norm() const {
-        using std::sqrt;
-        T sum = T(0);
-        for (size_type i = 0; i < 4; ++i) {
-            sum += data_[i] * data_[i];
-        }
-        return sqrt(sum);
-    }
-
-    T infinity_norm() const {
-        using std::abs;
-        T row0 = abs((*this)(0, 0)) + abs((*this)(0, 1));
-        T row1 = abs((*this)(1, 0)) + abs((*this)(1, 1));
-        return std::max(row0, row1);
-    }
-
-    T one_norm() const {
-        using std::abs;
-        T col0 = abs((*this)(0, 0)) + abs((*this)(1, 0));
-        T col1 = abs((*this)(0, 1)) + abs((*this)(1, 1));
-        return std::max(col0, col1);
-    }
-
-    Matrix& operator+=(const Matrix& other) {
-        for (size_type i = 0; i < 4; ++i) {
-            data_[i] += other.data_[i];
-        }
-        return *this;
-    }
-
-    Matrix& operator-=(const Matrix& other) {
-        for (size_type i = 0; i < 4; ++i) {
-            data_[i] -= other.data_[i];
-        }
-        return *this;
-    }
-
-    Matrix& operator*=(T scalar) {
-        for (size_type i = 0; i < 4; ++i) {
-            data_[i] *= scalar;
-        }
-        return *this;
-    }
-
-    Matrix& operator/=(T scalar) {
-        const T inv = T(1) / scalar;
-        return (*this) *= inv;
-    }
-
-    Matrix<T, 2, 2> transpose() const {
-        return Matrix<T, 2, 2>{
-            {(*this)(0, 0), (*this)(1, 0)},
-            {(*this)(0, 1), (*this)(1, 1)}
-        };
-    }
-
-    T trace() const {
-        return (*this)(0, 0) + (*this)(1, 1);
-    }
-
-    static Matrix identity() {
-        Matrix result;
-        result(0, 0) = T(1);
-        result(1, 1) = T(1);
-        return result;
-    }
-
-    static Matrix zeros() {
-        return Matrix();
-    }
-
-    static Matrix ones() {
-        return Matrix(T(1));
-    }
-
-    // Specialized 2x2 determinant
-    T determinant() const {
-        return determinant_2x2(*this);
-    }
-
-    // Specialized 2x2 inverse
-    Matrix inverse() const {
-        return inverse_2x2(*this);
-    }
-
-    Vector<T, 2> row(size_type i) const {
-        return Vector<T, 2>{(*this)(i, 0), (*this)(i, 1)};
-    }
-
-    Vector<T, 2> column(size_type j) const {
-        return Vector<T, 2>{(*this)(0, j), (*this)(1, j)};
-    }
-
-    T* begin() { return data_; }
-    T* end() { return data_ + 4; }
-    const T* begin() const { return data_; }
-    const T* end() const { return data_ + 4; }
-};
-
-/**
- * @brief Specialized fixed-size 3-by-3 matrix for element-level computations.
- * @ingroup FE_MatrixMath
- * @tparam T Scalar type.
- *
- * This specialization preserves the Matrix API while using direct formulas for
- * 3-by-3 determinant and inverse operations.
- */
-template<typename T>
-class Matrix<T, 3, 3> : public MatrixExpr<Matrix<T, 3, 3>> {
-    static constexpr std::size_t M = 3;
-    static constexpr std::size_t N = 3;
-
-private:
-    alignas(kFEFixedObjectAlignmentBytes) T data_[9];
-
-    static constexpr std::size_t index(std::size_t i, std::size_t j) {
-        return i * 3 + j;
-    }
-
-public:
-    using value_type = T;
-    using size_type = std::size_t;
-
-    constexpr Matrix() : data_{} {}
-    constexpr explicit Matrix(T value) {
-        for (size_type i = 0; i < 9; ++i) {
-            data_[i] = value;
-        }
-    }
-    constexpr Matrix(std::initializer_list<std::initializer_list<T>> init) : data_{} {
-        size_type row = 0;
-        for (auto row_init : init) {
-            if (row >= 3) break;
-            size_type col = 0;
-            for (auto val : row_init) {
-                if (col >= 3) break;
-                (*this)(row, col) = val;
-                ++col;
-            }
-            ++row;
-        }
-    }
-
-    template<typename Expr>
-    Matrix(const MatrixExpr<Expr>& expr) {
-        const auto& e = expr.derived();
-        for (size_type i = 0; i < 3; ++i) {
-            for (size_type j = 0; j < 3; ++j) {
-                (*this)(i, j) = e(i, j);
-            }
-        }
-    }
-
-    constexpr Matrix(const Matrix&) = default;
-    constexpr Matrix(Matrix&&) noexcept = default;
-    Matrix& operator=(const Matrix&) = default;
-    Matrix& operator=(Matrix&&) noexcept = default;
-
-    template<typename Expr>
-    Matrix& operator=(const MatrixExpr<Expr>& expr) {
-        const auto& e = expr.derived();
-        for (size_type i = 0; i < 3; ++i) {
-            for (size_type j = 0; j < 3; ++j) {
-                (*this)(i, j) = e(i, j);
-            }
-        }
-        return *this;
-    }
-
-    static constexpr size_type rows() { return 3; }
-    static constexpr size_type cols() { return 3; }
-    static constexpr size_type size() { return 9; }
-
-    constexpr T& operator()(size_type i, size_type j) {
-        return data_[index(i, j)];
-    }
-    constexpr const T& operator()(size_type i, size_type j) const {
-        return data_[index(i, j)];
-    }
-
-    T* data() { return data_; }
-    const T* data() const { return data_; }
-
-    void fill(T value) {
-        for (size_type i = 0; i < 9; ++i) {
-            data_[i] = value;
-        }
-    }
-
-    void set_zero() { fill(T{0}); }
-
-    void set_row(size_type i, const Vector<T, 3>& v) {
-        for (size_type j = 0; j < 3; ++j) {
-            (*this)(i, j) = v[j];
-        }
-    }
-
-    void set_column(size_type j, const Vector<T, 3>& v) {
-        for (size_type i = 0; i < 3; ++i) {
-            (*this)(i, j) = v[i];
-        }
-    }
-
-    void set_col(size_type j, const Vector<T, 3>& v) {
-        set_column(j, v);
-    }
-
-    Vector<T, 3> col(size_type j) const {
-        return column(j);
-    }
-
-    static Matrix zero() {
-        return zeros();
-    }
-
-    static Matrix diagonal(const Vector<T, 3>& diag) {
-        Matrix result;
-        result(0, 0) = diag[0];
-        result(1, 1) = diag[1];
-        result(2, 2) = diag[2];
-        return result;
-    }
-
-    bool is_symmetric(T tol = tolerance<T>) const {
-        using std::abs;
-        for (size_type i = 0; i < 3; ++i) {
-            for (size_type j = i + 1; j < 3; ++j) {
-                if (abs((*this)(i, j) - (*this)(j, i)) > tol) {
-                    return false;
-                }
-            }
-        }
-        return true;
-    }
-
-    bool is_skew_symmetric(T tol = tolerance<T>) const {
-        using std::abs;
-        // Diagonal must be zero
-        for (size_type i = 0; i < 3; ++i) {
-            if (abs((*this)(i, i)) > tol) {
-                return false;
-            }
-        }
-        // Off-diagonal must be opposite
-        for (size_type i = 0; i < 3; ++i) {
-            for (size_type j = i + 1; j < 3; ++j) {
-                if (abs((*this)(i, j) + (*this)(j, i)) > tol) {
-                    return false;
-                }
-            }
-        }
-        return true;
-    }
-
-    bool is_diagonal(T tol = tolerance<T>) const {
-        using std::abs;
-        for (size_type i = 0; i < 3; ++i) {
-            for (size_type j = 0; j < 3; ++j) {
-                if (i != j && abs((*this)(i, j)) > tol) {
-                    return false;
-                }
-            }
-        }
-        return true;
-    }
-
-    T frobenius_norm() const {
-        using std::sqrt;
-        T sum = T(0);
-        for (size_type i = 0; i < 9; ++i) {
-            sum += data_[i] * data_[i];
-        }
-        return sqrt(sum);
-    }
-
-    T infinity_norm() const {
-        using std::abs;
-        T max_row_sum = T(0);
-        for (size_type i = 0; i < 3; ++i) {
-            T row_sum = T(0);
-            for (size_type j = 0; j < 3; ++j) {
-                row_sum += abs((*this)(i, j));
-            }
-            max_row_sum = std::max(max_row_sum, row_sum);
-        }
-        return max_row_sum;
-    }
-
-    T one_norm() const {
-        using std::abs;
-        T max_col_sum = T(0);
-        for (size_type j = 0; j < 3; ++j) {
-            T col_sum = T(0);
-            for (size_type i = 0; i < 3; ++i) {
-                col_sum += abs((*this)(i, j));
-            }
-            max_col_sum = std::max(max_col_sum, col_sum);
-        }
-        return max_col_sum;
-    }
-
-    Matrix& operator+=(const Matrix& other) {
-        for (size_type i = 0; i < 9; ++i) {
-            data_[i] += other.data_[i];
-        }
-        return *this;
-    }
-
-    Matrix& operator-=(const Matrix& other) {
-        for (size_type i = 0; i < 9; ++i) {
-            data_[i] -= other.data_[i];
-        }
-        return *this;
-    }
-
-    Matrix& operator*=(T scalar) {
-        for (size_type i = 0; i < 9; ++i) {
-            data_[i] *= scalar;
-        }
-        return *this;
-    }
-
-    Matrix& operator/=(T scalar) {
-        const T inv = T(1) / scalar;
-        return (*this) *= inv;
-    }
-
-    Matrix<T, 3, 3> transpose() const {
-        Matrix<T, 3, 3> result;
-        for (size_type i = 0; i < 3; ++i) {
-            for (size_type j = 0; j < 3; ++j) {
-                result(j, i) = (*this)(i, j);
-            }
-        }
-        return result;
-    }
-
-    T trace() const {
-        return (*this)(0, 0) + (*this)(1, 1) + (*this)(2, 2);
-    }
-
-    static Matrix identity() {
-        Matrix result;
-        result(0, 0) = T(1);
-        result(1, 1) = T(1);
-        result(2, 2) = T(1);
-        return result;
-    }
-
-    static Matrix zeros() {
-        return Matrix();
-    }
-
-    static Matrix ones() {
-        return Matrix(T(1));
-    }
-
-    // Specialized 3x3 determinant
-    T determinant() const {
-        return determinant_3x3(*this);
-    }
-
-    // Specialized 3x3 inverse
-    Matrix inverse() const {
-        return inverse_3x3(*this);
-    }
-
-    Vector<T, 3> row(size_type i) const {
-        return Vector<T, 3>{(*this)(i, 0), (*this)(i, 1), (*this)(i, 2)};
-    }
-
-    Vector<T, 3> column(size_type j) const {
-        return Vector<T, 3>{(*this)(0, j), (*this)(1, j), (*this)(2, j)};
-    }
-
-    T* begin() { return data_; }
-    T* end() { return data_ + 9; }
-    const T* begin() const { return data_; }
-    const T* end() const { return data_ + 9; }
-};
+using Matrix = Eigen::Matrix<T, static_cast<int>(M), static_cast<int>(N)>;
 
 // Type aliases for common matrix types
 template<typename T> using Matrix2x2 = Matrix<T, 2, 2>;
@@ -1323,188 +59,6 @@ using Matrix2x2f = Matrix2x2<float>;
 using Matrix3x3f = Matrix3x3<float>;
 using Matrix4x4f = Matrix4x4<float>;
 
-// Matrix-vector multiplication
-template<typename T, std::size_t M, std::size_t N>
-inline Vector<T, M> operator*(const Matrix<T, M, N>& A, const Vector<T, N>& x) {
-    Vector<T, M> result;
-    for (std::size_t i = 0; i < M; ++i) {
-        T sum = T(0);
-        for (std::size_t j = 0; j < N; ++j) {
-            sum += A(i, j) * x[j];
-        }
-        result[i] = sum;
-    }
-    return result;
-}
-
-// Vector-matrix multiplication (row vector * matrix)
-template<typename T, std::size_t M, std::size_t N>
-inline Vector<T, N> operator*(const Vector<T, M>& x, const Matrix<T, M, N>& A) {
-    Vector<T, N> result;
-    for (std::size_t j = 0; j < N; ++j) {
-        T sum = T(0);
-        for (std::size_t i = 0; i < M; ++i) {
-            sum += x[i] * A(i, j);
-        }
-        result[j] = sum;
-    }
-    return result;
-}
-
-// Matrix-matrix multiplication
-template<typename T, std::size_t M, std::size_t N, std::size_t P>
-inline Matrix<T, M, P> operator*(const Matrix<T, M, N>& A, const Matrix<T, N, P>& B) {
-    Matrix<T, M, P> result;
-    for (std::size_t i = 0; i < M; ++i) {
-        for (std::size_t k = 0; k < N; ++k) {
-            T a_ik = A(i, k);
-            for (std::size_t j = 0; j < P; ++j) {
-                result(i, j) += a_ik * B(k, j);
-            }
-        }
-    }
-    return result;
-}
-
-// Free functions
-
-/**
- * @brief Compute matrix transpose
- */
-template<typename T, std::size_t M, std::size_t N>
-inline Matrix<T, N, M> transpose(const Matrix<T, M, N>& m) {
-    return m.transpose();
-}
-
-/**
- * @brief Compute matrix trace
- */
-template<typename T, std::size_t N>
-inline T trace(const Matrix<T, N, N>& m) {
-    return m.trace();
-}
-
-/**
- * @brief Compute matrix determinant
- */
-template<typename T, std::size_t N>
-inline T determinant(const Matrix<T, N, N>& m) {
-    return m.determinant();
-}
-
-/**
- * @brief Compute matrix inverse
- */
-template<typename T, std::size_t N>
-inline Matrix<T, N, N> inverse(const Matrix<T, N, N>& m) {
-    return m.inverse();
-}
-
-/**
- * @brief Compute Frobenius norm
- */
-template<typename T, std::size_t M, std::size_t N>
-inline T frobenius_norm(const Matrix<T, M, N>& m) {
-    return m.frobenius_norm();
-}
-
-/**
- * @brief Component-wise absolute value
- */
-template<typename T, std::size_t M, std::size_t N>
-inline Matrix<T, M, N> abs(const Matrix<T, M, N>& m) {
-    Matrix<T, M, N> result;
-    for (std::size_t i = 0; i < M; ++i) {
-        for (std::size_t j = 0; j < N; ++j) {
-            using std::abs;
-            result(i, j) = abs(m(i, j));
-        }
-    }
-    return result;
-}
-
-/**
- * @brief Component-wise minimum
- */
-template<typename T, std::size_t M, std::size_t N>
-inline Matrix<T, M, N> min(const Matrix<T, M, N>& a, const Matrix<T, M, N>& b) {
-    Matrix<T, M, N> result;
-    for (std::size_t i = 0; i < M; ++i) {
-        for (std::size_t j = 0; j < N; ++j) {
-            result(i, j) = std::min(a(i, j), b(i, j));
-        }
-    }
-    return result;
-}
-
-/**
- * @brief Component-wise maximum
- */
-template<typename T, std::size_t M, std::size_t N>
-inline Matrix<T, M, N> max(const Matrix<T, M, N>& a, const Matrix<T, M, N>& b) {
-    Matrix<T, M, N> result;
-    for (std::size_t i = 0; i < M; ++i) {
-        for (std::size_t j = 0; j < N; ++j) {
-            result(i, j) = std::max(a(i, j), b(i, j));
-        }
-    }
-    return result;
-}
-
-/**
- * @brief Outer product of two vectors
- */
-template<typename T, std::size_t M, std::size_t N>
-inline Matrix<T, M, N> outer_product(const Vector<T, M>& u, const Vector<T, N>& v) {
-    Matrix<T, M, N> result;
-    for (std::size_t i = 0; i < M; ++i) {
-        for (std::size_t j = 0; j < N; ++j) {
-            result(i, j) = u[i] * v[j];
-        }
-    }
-    return result;
-}
-
-/**
- * @brief Check if two matrices are approximately equal
- */
-template<typename T, std::size_t M, std::size_t N>
-inline bool approx_equal(const Matrix<T, M, N>& a, const Matrix<T, M, N>& b, T tol = tolerance<T>) {
-    for (std::size_t i = 0; i < M; ++i) {
-        for (std::size_t j = 0; j < N; ++j) {
-            if (!approx_equal(a(i, j), b(i, j), tol)) {
-                return false;
-            }
-        }
-    }
-    return true;
-}
-
-/**
- * @brief Stream output operator for matrices
- * @tparam T Scalar type
- * @tparam M Number of rows
- * @tparam N Number of columns
- * @param os Output stream
- * @param m Matrix to output
- * @return Reference to output stream
- */
-template<typename T, std::size_t M, std::size_t N>
-inline std::ostream& operator<<(std::ostream& os, const Matrix<T, M, N>& m) {
-    os << "[";
-    for (std::size_t i = 0; i < M; ++i) {
-        if (i > 0) os << "\n ";
-        os << "[";
-        for (std::size_t j = 0; j < N; ++j) {
-            if (j > 0) os << ", ";
-            os << m(i, j);
-        }
-        os << "]";
-    }
-    os << "]";
-    return os;
-}
-
 } // namespace math
 } // namespace FE
 } // namespace svmp
diff --git a/Code/Source/solver/FE/Math/MatrixExpr.h b/Code/Source/solver/FE/Math/MatrixExpr.h
deleted file mode 100644
index 288bbc5ca..000000000
--- a/Code/Source/solver/FE/Math/MatrixExpr.h
+++ /dev/null
@@ -1,630 +0,0 @@
-// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef SVMP_FE_MATH_MATRIX_EXPR_H
-#define SVMP_FE_MATH_MATRIX_EXPR_H
-
-/**
- * @file MatrixExpr.h
- * @brief Expression template infrastructure for lazy evaluation of matrix operations
- *
- * This header provides expression templates that enable compound matrix operations
- * without creating temporary objects. Operations are evaluated lazily at the point
- * of assignment, eliminating intermediate allocations and improving performance.
- */
-
-#include <algorithm>
-#include <cmath>
-#include <cstddef>
-#include <type_traits>
-
-#include "VectorExpr.h"
-
-namespace svmp {
-namespace FE {
-namespace math {
-
-/**
- * @brief Base class for all matrix expressions using CRTP
- * @tparam Derived The derived expression type
- *
- * This uses the Curiously Recurring Template Pattern (CRTP) to provide
- * static polymorphism for expression templates.
- */
-template<typename Derived>
-class MatrixExpr {
-public:
-    /**
-     * @brief Get the derived expression
-     * @return Reference to the derived type
-     */
-    const Derived& derived() const {
-        return static_cast<const Derived&>(*this);
-    }
-
-    /**
-     * @brief Get the derived expression (non-const)
-     * @return Reference to the derived type
-     */
-    Derived& derived() {
-        return static_cast<Derived&>(*this);
-    }
-
-    /**
-     * @brief Access element by row and column indices
-     * @param i Row index
-     * @param j Column index
-     * @return Value at (i,j)
-     */
-    auto operator()(std::size_t i, std::size_t j) const {
-        return derived()(i, j);
-    }
-
-    /**
-     * @brief Get number of rows
-     * @return Number of rows
-     */
-    std::size_t rows() const {
-        return derived().rows();
-    }
-
-    /**
-     * @brief Get number of columns
-     * @return Number of columns
-     */
-    std::size_t cols() const {
-        return derived().cols();
-    }
-};
-
-/**
- * @brief Binary expression for element-wise operations between two matrix expressions
- * @tparam LHS Left-hand side expression type
- * @tparam RHS Right-hand side expression type
- * @tparam Op Binary operation functor
- */
-template<typename LHS, typename RHS, typename Op>
-class MatrixBinaryExpr : public MatrixExpr<MatrixBinaryExpr<LHS, RHS, Op>> {
-private:
-    LHS lhs_;
-    RHS rhs_;
-    Op op_;
-
-public:
-    /**
-     * @brief Construct binary expression
-     * @param lhs Left operand
-     * @param rhs Right operand
-     * @param op Operation to apply
-     */
-    constexpr MatrixBinaryExpr(const LHS& lhs, const RHS& rhs, Op op = Op{})
-        : lhs_(lhs), rhs_(rhs), op_(op) {}
-
-    /**
-     * @brief Access element at (i,j)
-     * @param i Row index
-     * @param j Column index
-     * @return Result of operation on elements at (i,j)
-     */
-    constexpr auto operator()(std::size_t i, std::size_t j) const {
-        return op_(lhs_(i, j), rhs_(i, j));
-    }
-
-    /**
-     * @brief Get number of rows
-     * @return Number of rows
-     */
-    constexpr std::size_t rows() const {
-        return lhs_.rows();
-    }
-
-    /**
-     * @brief Get number of columns
-     * @return Number of columns
-     */
-    constexpr std::size_t cols() const {
-        return lhs_.cols();
-    }
-};
-
-/**
- * @brief Unary expression for element-wise operations on a single matrix expression
- * @tparam Expr Expression type
- * @tparam Op Unary operation functor
- */
-template<typename Expr, typename Op>
-class MatrixUnaryExpr : public MatrixExpr<MatrixUnaryExpr<Expr, Op>> {
-private:
-    Expr expr_;
-    Op op_;
-
-public:
-    /**
-     * @brief Construct unary expression
-     * @param expr Operand expression
-     * @param op Operation to apply
-     */
-    constexpr MatrixUnaryExpr(const Expr& expr, Op op = Op{})
-        : expr_(expr), op_(op) {}
-
-    /**
-     * @brief Access element at (i,j)
-     * @param i Row index
-     * @param j Column index
-     * @return Result of operation on element at (i,j)
-     */
-    constexpr auto operator()(std::size_t i, std::size_t j) const {
-        return op_(expr_(i, j));
-    }
-
-    /**
-     * @brief Get number of rows
-     * @return Number of rows
-     */
-    constexpr std::size_t rows() const {
-        return expr_.rows();
-    }
-
-    /**
-     * @brief Get number of columns
-     * @return Number of columns
-     */
-    constexpr std::size_t cols() const {
-        return expr_.cols();
-    }
-};
-
-/**
- * @brief Scalar multiplication expression
- * @tparam Expr Matrix expression type
- * @tparam Scalar Scalar type
- */
-template<typename Expr, typename Scalar>
-class MatrixScalarExpr : public MatrixExpr<MatrixScalarExpr<Expr, Scalar>> {
-private:
-    Expr expr_;
-    Scalar scalar_;
-
-public:
-    /**
-     * @brief Construct scalar multiplication expression
-     * @param expr Matrix expression
-     * @param scalar Scalar value
-     */
-    constexpr MatrixScalarExpr(const Expr& expr, Scalar scalar)
-        : expr_(expr), scalar_(scalar) {}
-
-    /**
-     * @brief Access element at (i,j)
-     * @param i Row index
-     * @param j Column index
-     * @return Element multiplied by scalar
-     */
-    constexpr auto operator()(std::size_t i, std::size_t j) const {
-        return expr_(i, j) * scalar_;
-    }
-
-    /**
-     * @brief Get number of rows
-     * @return Number of rows
-     */
-    constexpr std::size_t rows() const {
-        return expr_.rows();
-    }
-
-    /**
-     * @brief Get number of columns
-     * @return Number of columns
-     */
-    constexpr std::size_t cols() const {
-        return expr_.cols();
-    }
-};
-
-/**
- * @brief Scalar division expression
- * @tparam Expr Matrix expression type
- * @tparam Scalar Scalar type
- */
-template<typename Expr, typename Scalar>
-class MatrixScalarDivExpr : public MatrixExpr<MatrixScalarDivExpr<Expr, Scalar>> {
-private:
-    Expr expr_;
-    Scalar scalar_;
-
-public:
-    /**
-     * @brief Construct scalar division expression
-     * @param expr Matrix expression
-     * @param scalar Scalar divisor
-     */
-    constexpr MatrixScalarDivExpr(const Expr& expr, Scalar scalar)
-        : expr_(expr), scalar_(scalar) {}
-
-    /**
-     * @brief Access element at (i,j)
-     * @param i Row index
-     * @param j Column index
-     * @return Element divided by scalar
-     */
-    constexpr auto operator()(std::size_t i, std::size_t j) const {
-        return expr_(i, j) / scalar_;
-    }
-
-    /**
-     * @brief Get number of rows
-     * @return Number of rows
-     */
-    constexpr std::size_t rows() const {
-        return expr_.rows();
-    }
-
-    /**
-     * @brief Get number of columns
-     * @return Number of columns
-     */
-    constexpr std::size_t cols() const {
-        return expr_.cols();
-    }
-};
-
-/**
- * @brief Matrix multiplication expression (lazy evaluation)
- * @tparam LHS Left matrix expression type
- * @tparam RHS Right matrix expression type
- *
- * Computes matrix multiplication A*B lazily
- */
-template<typename LHS, typename RHS>
-class MatrixMulExpr : public MatrixExpr<MatrixMulExpr<LHS, RHS>> {
-private:
-    LHS lhs_;
-    RHS rhs_;
-
-public:
-    /**
-     * @brief Construct matrix multiplication expression
-     * @param lhs Left matrix
-     * @param rhs Right matrix
-     */
-    constexpr MatrixMulExpr(const LHS& lhs, const RHS& rhs)
-        : lhs_(lhs), rhs_(rhs) {}
-
-    /**
-     * @brief Compute element at (i,j)
-     * @param i Row index
-     * @param j Column index
-     * @return Dot product of row i of lhs and column j of rhs
-     */
-    constexpr auto operator()(std::size_t i, std::size_t j) const {
-        using result_type = decltype(lhs_(0, 0) * rhs_(0, 0));
-        result_type sum = result_type{0};
-        const auto n = lhs_.cols();
-        for (std::size_t k = 0; k < n; ++k) {
-            sum += lhs_(i, k) * rhs_(k, j);
-        }
-        return sum;
-    }
-
-    /**
-     * @brief Get number of rows (from left matrix)
-     * @return Number of rows
-     */
-    constexpr std::size_t rows() const {
-        return lhs_.rows();
-    }
-
-    /**
-     * @brief Get number of columns (from right matrix)
-     * @return Number of columns
-     */
-    constexpr std::size_t cols() const {
-        return rhs_.cols();
-    }
-};
-
-/**
- * @brief Transpose expression (lazy evaluation)
- * @tparam Expr Matrix expression type
- */
-template<typename Expr>
-class TransposeExpr : public MatrixExpr<TransposeExpr<Expr>> {
-private:
-    Expr expr_;
-
-public:
-    /**
-     * @brief Construct transpose expression
-     * @param expr Matrix expression to transpose
-     */
-    constexpr explicit TransposeExpr(const Expr& expr)
-        : expr_(expr) {}
-
-    /**
-     * @brief Access transposed element
-     * @param i Row index (becomes column in original)
-     * @param j Column index (becomes row in original)
-     * @return Element at (j,i) of original matrix
-     */
-    constexpr auto operator()(std::size_t i, std::size_t j) const {
-        return expr_(j, i);
-    }
-
-    /**
-     * @brief Get number of rows (columns of original)
-     * @return Number of rows
-     */
-    constexpr std::size_t rows() const {
-        return expr_.cols();
-    }
-
-    /**
-     * @brief Get number of columns (rows of original)
-     * @return Number of columns
-     */
-    constexpr std::size_t cols() const {
-        return expr_.rows();
-    }
-};
-
-/**
- * @brief Diagonal matrix expression (creates diagonal matrix from vector)
- * @tparam VecExpr Vector expression type
- */
-template<typename VecExpr>
-class DiagonalExpr : public MatrixExpr<DiagonalExpr<VecExpr>> {
-private:
-    VecExpr vec_;
-    std::size_t n_;
-
-public:
-    /**
-     * @brief Construct diagonal matrix from vector
-     * @param vec Vector of diagonal elements
-     * @param n Matrix dimension (default: vector size)
-     */
-    constexpr explicit DiagonalExpr(const VecExpr& vec, std::size_t n = 0)
-        : vec_(vec), n_(n > 0 ? n : vec.size()) {}
-
-    /**
-     * @brief Access element
-     * @param i Row index
-     * @param j Column index
-     * @return Diagonal element if i==j, zero otherwise
-     */
-    constexpr auto operator()(std::size_t i, std::size_t j) const {
-        using result_type = decltype(vec_[0]);
-        return (i == j && i < vec_.size()) ? vec_[i] : result_type{0};
-    }
-
-    /**
-     * @brief Get number of rows
-     * @return Number of rows
-     */
-    constexpr std::size_t rows() const {
-        return n_;
-    }
-
-    /**
-     * @brief Get number of columns
-     * @return Number of columns
-     */
-    constexpr std::size_t cols() const {
-        return n_;
-    }
-};
-
-/**
- * @brief Addition operator for matrix expressions
- */
-template<typename LHS, typename RHS,
-         typename = std::enable_if_t<
-             std::is_base_of_v<MatrixExpr<LHS>, LHS> &&
-             std::is_base_of_v<MatrixExpr<RHS>, RHS>
-         >>
-constexpr auto operator+(const MatrixExpr<LHS>& lhs, const MatrixExpr<RHS>& rhs) {
-    return MatrixBinaryExpr<LHS, RHS, detail::ops::Add>(
-        lhs.derived(), rhs.derived(), detail::ops::Add{}
-    );
-}
-
-/**
- * @brief Subtraction operator for matrix expressions
- */
-template<typename LHS, typename RHS,
-         typename = std::enable_if_t<
-             std::is_base_of_v<MatrixExpr<LHS>, LHS> &&
-             std::is_base_of_v<MatrixExpr<RHS>, RHS>
-         >>
-constexpr auto operator-(const MatrixExpr<LHS>& lhs, const MatrixExpr<RHS>& rhs) {
-    return MatrixBinaryExpr<LHS, RHS, detail::ops::Sub>(
-        lhs.derived(), rhs.derived(), detail::ops::Sub{}
-    );
-}
-
-/**
- * @brief Matrix multiplication operator
- */
-template<typename LHS, typename RHS,
-         typename = std::enable_if_t<
-             std::is_base_of_v<MatrixExpr<LHS>, LHS> &&
-             std::is_base_of_v<MatrixExpr<RHS>, RHS>
-         >>
-constexpr auto operator*(const MatrixExpr<LHS>& lhs, const MatrixExpr<RHS>& rhs) {
-    return MatrixMulExpr<LHS, RHS>(lhs.derived(), rhs.derived());
-}
-
-/**
- * @brief Element-wise multiplication (Hadamard product)
- */
-template<typename LHS, typename RHS,
-         typename = std::enable_if_t<
-             std::is_base_of_v<MatrixExpr<LHS>, LHS> &&
-             std::is_base_of_v<MatrixExpr<RHS>, RHS>
-         >>
-constexpr auto hadamard(const MatrixExpr<LHS>& lhs, const MatrixExpr<RHS>& rhs) {
-    return MatrixBinaryExpr<LHS, RHS, detail::ops::Mul>(
-        lhs.derived(), rhs.derived(), detail::ops::Mul{}
-    );
-}
-
-/**
- * @brief Element-wise division
- */
-template<typename LHS, typename RHS,
-         typename = std::enable_if_t<
-             std::is_base_of_v<MatrixExpr<LHS>, LHS> &&
-             std::is_base_of_v<MatrixExpr<RHS>, RHS>
-         >>
-constexpr auto hadamard_div(const MatrixExpr<LHS>& lhs, const MatrixExpr<RHS>& rhs) {
-    return MatrixBinaryExpr<LHS, RHS, detail::ops::Div>(
-        lhs.derived(), rhs.derived(), detail::ops::Div{}
-    );
-}
-
-/**
- * @brief Negation operator for matrix expressions
- */
-template<typename Expr,
-         typename = std::enable_if_t<
-             std::is_base_of_v<MatrixExpr<Expr>, Expr>
-         >>
-constexpr auto operator-(const MatrixExpr<Expr>& expr) {
-    return MatrixUnaryExpr<Expr, detail::ops::Negate>(
-        expr.derived(), detail::ops::Negate{}
-    );
-}
-
-/**
- * @brief Scalar multiplication operator (matrix * scalar)
- */
-template<typename Expr, typename Scalar,
-         typename = std::enable_if_t<
-             std::is_base_of_v<MatrixExpr<Expr>, Expr> &&
-             std::is_arithmetic_v<Scalar>
-         >>
-constexpr auto operator*(const MatrixExpr<Expr>& expr, Scalar scalar) {
-    return MatrixScalarExpr<Expr, Scalar>(expr.derived(), scalar);
-}
-
-/**
- * @brief Scalar multiplication operator (scalar * matrix)
- */
-template<typename Scalar, typename Expr,
-         typename = std::enable_if_t<
-             std::is_arithmetic_v<Scalar> &&
-             std::is_base_of_v<MatrixExpr<Expr>, Expr>
-         >>
-constexpr auto operator*(Scalar scalar, const MatrixExpr<Expr>& expr) {
-    return MatrixScalarExpr<Expr, Scalar>(expr.derived(), scalar);
-}
-
-/**
- * @brief Scalar division operator (matrix / scalar)
- */
-template<typename Expr, typename Scalar,
-         typename = std::enable_if_t<
-             std::is_base_of_v<MatrixExpr<Expr>, Expr> &&
-             std::is_arithmetic_v<Scalar>
-         >>
-constexpr auto operator/(const MatrixExpr<Expr>& expr, Scalar scalar) {
-    return MatrixScalarDivExpr<Expr, Scalar>(expr.derived(), scalar);
-}
-
-/**
- * @brief Transpose function
- */
-template<typename Expr,
-         typename = std::enable_if_t<
-             std::is_base_of_v<MatrixExpr<Expr>, Expr>
-         >>
-constexpr auto transpose(const MatrixExpr<Expr>& expr) {
-    return TransposeExpr<Expr>(expr.derived());
-}
-
-/**
- * @brief Element-wise absolute value
- */
-template<typename Expr,
-         typename = std::enable_if_t<
-             std::is_base_of_v<MatrixExpr<Expr>, Expr>
-         >>
-constexpr auto abs(const MatrixExpr<Expr>& expr) {
-    return MatrixUnaryExpr<Expr, detail::ops::Abs>(expr.derived(), detail::ops::Abs{});
-}
-
-/**
- * @brief Element-wise square root
- */
-template<typename Expr,
-         typename = std::enable_if_t<
-             std::is_base_of_v<MatrixExpr<Expr>, Expr>
-         >>
-constexpr auto sqrt(const MatrixExpr<Expr>& expr) {
-    return MatrixUnaryExpr<Expr, detail::ops::Sqrt>(expr.derived(), detail::ops::Sqrt{});
-}
-
-/**
- * @brief Compute Frobenius norm squared of matrix expression
- * @tparam Expr Matrix expression type
- * @param expr Matrix expression
- * @return Square of the Frobenius norm
- */
-template<typename Expr,
-         typename = std::enable_if_t<
-             std::is_base_of_v<MatrixExpr<Expr>, Expr>
-         >>
-constexpr auto frobenius_norm_squared(const MatrixExpr<Expr>& expr) {
-    using result_type = decltype(expr.derived()(0, 0) * expr.derived()(0, 0));
-    result_type sum = result_type{0};
-    const auto m = expr.rows();
-    const auto n = expr.cols();
-    for (std::size_t i = 0; i < m; ++i) {
-        for (std::size_t j = 0; j < n; ++j) {
-            auto val = expr.derived()(i, j);
-            sum += val * val;
-        }
-    }
-    return sum;
-}
-
-/**
- * @brief Compute Frobenius norm of matrix expression
- * @tparam Expr Matrix expression type
- * @param expr Matrix expression
- * @return Frobenius norm
- */
-template<typename Expr,
-         typename = std::enable_if_t<
-             std::is_base_of_v<MatrixExpr<Expr>, Expr>
-         >>
-constexpr auto frobenius_norm(const MatrixExpr<Expr>& expr) {
-    using std::sqrt;
-    return sqrt(frobenius_norm_squared(expr));
-}
-
-/**
- * @brief Compute trace of square matrix expression
- * @tparam Expr Matrix expression type
- * @param expr Matrix expression
- * @return Sum of diagonal elements
- */
-template<typename Expr,
-         typename = std::enable_if_t<
-             std::is_base_of_v<MatrixExpr<Expr>, Expr>
-         >>
-constexpr auto trace(const MatrixExpr<Expr>& expr) {
-    using result_type = decltype(expr.derived()(0, 0));
-    result_type sum = result_type{0};
-    const auto n = std::min(expr.rows(), expr.cols());
-    for (std::size_t i = 0; i < n; ++i) {
-        sum += expr.derived()(i, i);
-    }
-    return sum;
-}
-
-} // namespace math
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_MATH_MATRIX_EXPR_H
diff --git a/Code/Source/solver/FE/Math/Vector.h b/Code/Source/solver/FE/Math/Vector.h
index 0ec99c81f..b234bac49 100644
--- a/Code/Source/solver/FE/Math/Vector.h
+++ b/Code/Source/solver/FE/Math/Vector.h
@@ -6,574 +6,43 @@
 
 /**
  * @file Vector.h
- * @brief Fixed-size vectors with expression templates for FE computations
+ * @brief Fixed-size vector types for FE computations, backed by Eigen.
  *
- * This header provides optimized fixed-size vector operations for element-level
- * computations. All operations use expression templates to eliminate temporaries
- * and are header-only for maximum inlining. Memory is aligned for SIMD operations.
+ * The FE library standardizes on Eigen for linear algebra. These aliases give
+ * element-level code a stable vocabulary type without re-exporting all of
+ * Eigen. Note that, unlike the previous in-house implementation, Eigen types
+ * are NOT zero-initialized by default construction; use Vector::Zero() where a
+ * zeroed value is required.
  */
 
-#include "VectorExpr.h"
-#include "../Common/Types.h"
-#include <algorithm>
-#include <array>
-#include <cmath>
-#include <initializer_list>
-#include <limits>
-#include <ostream>
-#include <stdexcept>
-#include <type_traits>
+#include <Eigen/Core>
+
+#include <cstddef>
 
 /// \defgroup FE_Math Math
 /// \ingroup FE
-/// \brief Fixed-size and dense linear algebra utilities for finite-element computations.
+/// \brief Linear algebra vocabulary types and dense utilities for finite-element computations.
 ///
-/// \details The Math module provides small fixed-size vector and matrix types
-/// used in element-level kernels, expression-template infrastructure for
-/// allocation-free algebraic expressions, and dense linear algebra utilities
-/// used by basis construction and local transforms.
+/// \details The Math module defines the fixed-size vector and matrix types
+/// used in element-level kernels (as aliases of Eigen types) and dense linear
+/// algebra utilities used by basis construction and local transforms.
 ///
 /// \defgroup FE_VectorMath Vector
 /// \ingroup FE_Math
-/// \brief Fixed-size vector types and vector expression utilities.
+/// \brief Fixed-size vector type aliases.
 
 namespace svmp {
 namespace FE {
 namespace math {
 
-template<typename T>
-inline constexpr T tolerance =
-    std::is_floating_point_v<T> ? T(1000) * std::numeric_limits<T>::epsilon() : T(0);
-
-template<typename T>
-inline bool approx_zero(T value, T tol = tolerance<T>) {
-    using std::abs;
-    return abs(value) <= tol;
-}
-
-template<typename T>
-inline bool approx_equal(T a, T b, T tol = tolerance<T>) {
-    using std::abs;
-    const T scale = std::max({abs(a), abs(b), T(1)});
-    return abs(a - b) <= tol * scale;
-}
-
 /**
- * @brief Fixed-size vector for element-level computations
+ * @brief Fixed-size column vector for element-level computations
  * @ingroup FE_VectorMath
  * @tparam T Scalar type (float, double)
  * @tparam N Vector dimension
- *
- * This class provides small vector operations optimized for
- * compile-time known dimensions. Memory is aligned for SIMD operations.
  */
 template<typename T, std::size_t N>
-class Vector : public VectorExpr<Vector<T, N>> {
-    static_assert(std::is_arithmetic_v<T>, "T must be an arithmetic type");
-    static_assert(N > 0, "Vector dimension must be positive");
-
-private:
-    alignas(kFEFixedObjectAlignmentBytes) T data_[N];  // SIMD-friendly alignment
-
-public:
-    // Type definitions
-    using value_type = T;
-    using size_type = std::size_t;
-    using reference = T&;
-    using const_reference = const T&;
-    using pointer = T*;
-    using const_pointer = const T*;
-
-    /**
-     * @brief Default constructor - zero initializes all components
-     */
-    constexpr Vector() : data_{} {}
-
-    /**
-     * @brief Fill constructor - initializes all components with same value
-     * @param value Value to fill vector with
-     */
-    constexpr explicit Vector(T value) {
-        for (size_type i = 0; i < N; ++i) {
-            data_[i] = value;
-        }
-    }
-
-    /**
-     * @brief Initializer list constructor
-     * @param init List of values
-     */
-    constexpr Vector(std::initializer_list<T> init) : data_{} {
-        auto it = init.begin();
-        for (size_type i = 0; i < N && it != init.end(); ++i, ++it) {
-            data_[i] = *it;
-        }
-    }
-
-    /**
-     * @brief Constructor from expression template
-     * @tparam Expr Expression type
-     * @param expr Vector expression to evaluate
-     */
-    template<typename Expr>
-    Vector(const VectorExpr<Expr>& expr) {
-        const auto& e = expr.derived();
-        for (size_type i = 0; i < N; ++i) {
-            data_[i] = e[i];
-        }
-    }
-
-    /**
-     * @brief Copy constructor
-     */
-    constexpr Vector(const Vector&) = default;
-
-    /**
-     * @brief Move constructor
-     */
-    constexpr Vector(Vector&&) noexcept = default;
-
-    /**
-     * @brief Copy assignment
-     */
-    Vector& operator=(const Vector&) = default;
-
-    /**
-     * @brief Move assignment
-     */
-    Vector& operator=(Vector&&) noexcept = default;
-
-    /**
-     * @brief Assignment from expression template
-     * @tparam Expr Expression type
-     * @param expr Vector expression to evaluate
-     * @return Reference to this
-     */
-    template<typename Expr>
-    Vector& operator=(const VectorExpr<Expr>& expr) {
-        const auto& e = expr.derived();
-        for (size_type i = 0; i < N; ++i) {
-            data_[i] = e[i];
-        }
-        return *this;
-    }
-
-    /**
-     * @brief Get vector size (compile-time constant)
-     * @return Number of elements
-     */
-    static constexpr size_type size() { return N; }
-
-    /**
-     * @brief Element access (no bounds checking)
-     * @param i Element index
-     * @return Reference to element
-     */
-    constexpr T& operator[](size_type i) {
-        return data_[i];
-    }
-
-    /**
-     * @brief Element access (no bounds checking) - const version
-     * @param i Element index
-     * @return Const reference to element
-     */
-    constexpr const T& operator[](size_type i) const {
-        return data_[i];
-    }
-
-    /**
-     * @brief Element access with bounds checking
-     * @param i Element index
-     * @return Reference to element
-     * @throws std::out_of_range if i >= N
-     */
-    T& at(size_type i) {
-        if (i >= N) {
-            throw std::out_of_range("Vector::at: index out of range");
-        }
-        return data_[i];
-    }
-
-    /**
-     * @brief Element access with bounds checking - const version
-     * @param i Element index
-     * @return Const reference to element
-     * @throws std::out_of_range if i >= N
-     */
-    const T& at(size_type i) const {
-        if (i >= N) {
-            throw std::out_of_range("Vector::at: index out of range");
-        }
-        return data_[i];
-    }
-
-    /**
-     * @brief Access first element
-     * @return Reference to first element
-     */
-    T& front() { return data_[0]; }
-    const T& front() const { return data_[0]; }
-
-    /**
-     * @brief Access last element
-     * @return Reference to last element
-     */
-    T& back() { return data_[N-1]; }
-    const T& back() const { return data_[N-1]; }
-
-    /**
-     * @brief Get pointer to underlying data
-     * @return Pointer to first element
-     */
-    T* data() { return data_; }
-    const T* data() const { return data_; }
-
-    /**
-     * @brief Fill vector with value
-     * @param value Value to fill with
-     */
-    void fill(T value) {
-        for (size_type i = 0; i < N; ++i) {
-            data_[i] = value;
-        }
-    }
-
-    /**
-     * @brief Set all components to zero
-     */
-    void set_zero() {
-        fill(T{0});
-    }
-
-    // Arithmetic operators
-
-    /**
-     * @brief In-place addition
-     * @param other Vector to add
-     * @return Reference to this
-     */
-    Vector& operator+=(const Vector& other) {
-        for (size_type i = 0; i < N; ++i) {
-            data_[i] += other.data_[i];
-        }
-        return *this;
-    }
-
-    /**
-     * @brief In-place subtraction
-     * @param other Vector to subtract
-     * @return Reference to this
-     */
-    Vector& operator-=(const Vector& other) {
-        for (size_type i = 0; i < N; ++i) {
-            data_[i] -= other.data_[i];
-        }
-        return *this;
-    }
-
-    /**
-     * @brief In-place scalar multiplication
-     * @param scalar Scalar to multiply by
-     * @return Reference to this
-     */
-    Vector& operator*=(T scalar) {
-        for (size_type i = 0; i < N; ++i) {
-            data_[i] *= scalar;
-        }
-        return *this;
-    }
-
-    /**
-     * @brief In-place scalar division
-     * @param scalar Scalar to divide by
-     * @return Reference to this
-     */
-    Vector& operator/=(T scalar) {
-        const T inv = T(1) / scalar;
-        return (*this) *= inv;
-    }
-
-    // Vector operations
-
-    /**
-     * @brief Compute dot product
-     * @param other Other vector
-     * @return Dot product
-     */
-    T dot(const Vector& other) const {
-        T result = T(0);
-        for (size_type i = 0; i < N; ++i) {
-            result += data_[i] * other.data_[i];
-        }
-        return result;
-    }
-
-    /**
-     * @brief Compute squared Euclidean norm
-     * @return Squared norm
-     */
-    T norm_squared() const {
-        return dot(*this);
-    }
-
-    /**
-     * @brief Compute Euclidean norm
-     * @return Norm
-     */
-    T norm() const {
-        using std::sqrt;
-        return sqrt(norm_squared());
-    }
-
-    /**
-     * @brief Get normalized vector
-     * @return Unit vector in same direction
-     */
-    Vector normalized() const {
-        const T n = norm();
-        if (approx_zero(n)) {
-            return Vector();  // Return zero vector
-        }
-        return (*this) / n;
-    }
-
-    /**
-     * @brief Normalize this vector in place
-     * @return Reference to this
-     */
-    Vector& normalize() {
-        const T n = norm();
-        if (!approx_zero(n)) {
-            (*this) /= n;
-        }
-        return *this;
-    }
-
-    /**
-     * @brief Compute L1 norm (Manhattan norm)
-     * @return Sum of absolute values
-     */
-    T norm_l1() const {
-        T result = T(0);
-        for (size_type i = 0; i < N; ++i) {
-            using std::abs;
-            result += abs(data_[i]);
-        }
-        return result;
-    }
-
-    /**
-     * @brief Compute L-infinity norm (maximum norm)
-     * @return Maximum absolute value
-     */
-    T norm_inf() const {
-        T result = T(0);
-        for (size_type i = 0; i < N; ++i) {
-            using std::abs;
-            result = std::max(result, abs(data_[i]));
-        }
-        return result;
-    }
-
-    /**
-     * @brief Get minimum component
-     * @return Minimum value
-     */
-    T min() const {
-        T result = data_[0];
-        for (size_type i = 1; i < N; ++i) {
-            result = std::min(result, data_[i]);
-        }
-        return result;
-    }
-
-    /**
-     * @brief Get maximum component
-     * @return Maximum value
-     */
-    T max() const {
-        T result = data_[0];
-        for (size_type i = 1; i < N; ++i) {
-            result = std::max(result, data_[i]);
-        }
-        return result;
-    }
-
-    /**
-     * @brief Get sum of all components
-     * @return Sum of components
-     */
-    T sum() const {
-        T result = T(0);
-        for (size_type i = 0; i < N; ++i) {
-            result += data_[i];
-        }
-        return result;
-    }
-
-    /**
-     * @brief Get product of all components
-     * @return Product of components
-     */
-    T product() const {
-        T result = data_[0];
-        for (size_type i = 1; i < N; ++i) {
-            result *= data_[i];
-        }
-        return result;
-    }
-
-    // Static factory functions
-
-    /**
-     * @brief Create zero vector
-     * @return Vector with all components zero
-     */
-    static constexpr Vector zeros() {
-        return Vector();
-    }
-
-    /**
-     * @brief Create vector with all components one
-     * @return Vector with all components one
-     */
-    static constexpr Vector ones() {
-        return Vector(T(1));
-    }
-
-    /**
-     * @brief Create unit vector along axis
-     * @param axis Axis index (0-based)
-     * @return Unit vector
-     */
-    static Vector unit(size_type axis) {
-        Vector v;
-        if (axis < N) {
-            v[axis] = T(1);
-        }
-        return v;
-    }
-
-    /**
-     * @brief Create basis vector (alias for unit)
-     * @param i Axis index (0-based)
-     * @return Basis vector
-     */
-    static Vector basis(size_type i) {
-        return unit(i);
-    }
-
-    /**
-     * @brief Create zero vector (alias for zeros)
-     * @return Zero vector
-     */
-    static constexpr Vector zero() {
-        return zeros();
-    }
-
-    /**
-     * @brief Get index of minimum element
-     * @return Index of minimum value
-     */
-    size_type min_index() const {
-        size_type idx = 0;
-        T min_val = data_[0];
-        for (size_type i = 1; i < N; ++i) {
-            if (data_[i] < min_val) {
-                min_val = data_[i];
-                idx = i;
-            }
-        }
-        return idx;
-    }
-
-    /**
-     * @brief Get index of maximum element
-     * @return Index of maximum value
-     */
-    size_type max_index() const {
-        size_type idx = 0;
-        T max_val = data_[0];
-        for (size_type i = 1; i < N; ++i) {
-            if (data_[i] > max_val) {
-                max_val = data_[i];
-                idx = i;
-            }
-        }
-        return idx;
-    }
-
-    /**
-     * @brief Compute mean of all components
-     * @return Average value
-     */
-    T mean() const {
-        return sum() / static_cast<T>(N);
-    }
-
-    /**
-     * @brief Cross product for 3D vectors
-     * @param other Other vector
-     * @return Cross product
-     * @note Only available for 3D vectors
-     */
-    template<typename U = T>
-    std::enable_if_t<N == 3, Vector<U, 3>> cross(const Vector<U, 3>& other) const {
-        return Vector<U, 3>{
-            data_[1] * other[2] - data_[2] * other[1],
-            data_[2] * other[0] - data_[0] * other[2],
-            data_[0] * other[1] - data_[1] * other[0]
-        };
-    }
-
-    /**
-     * @brief Check if vectors are approximately equal
-     * @param other Other vector
-     * @param tol Tolerance
-     * @return true if equal within tolerance
-     */
-    bool approx_equal(const Vector& other, T tol = tolerance<T>) const {
-        for (size_type i = 0; i < N; ++i) {
-            using std::abs;
-            if (abs(data_[i] - other.data_[i]) > tol) {
-                return false;
-            }
-        }
-        return true;
-    }
-
-    /**
-     * @brief Equality comparison
-     * @param other Other vector
-     * @return true if exactly equal
-     */
-    bool operator==(const Vector& other) const {
-        for (size_type i = 0; i < N; ++i) {
-            if (data_[i] != other.data_[i]) {
-                return false;
-            }
-        }
-        return true;
-    }
-
-    /**
-     * @brief Inequality comparison
-     * @param other Other vector
-     * @return true if not equal
-     */
-    bool operator!=(const Vector& other) const {
-        return !(*this == other);
-    }
-
-    // Iterators
-    T* begin() { return data_; }
-    T* end() { return data_ + N; }
-    const T* begin() const { return data_; }
-    const T* end() const { return data_ + N; }
-    const T* cbegin() const { return data_; }
-    const T* cend() const { return data_ + N; }
-};
+using Vector = Eigen::Matrix<T, static_cast<int>(N), 1>;
 
 // Type aliases for common vector types
 template<typename T> using Vector2 = Vector<T, 2>;
@@ -595,269 +64,6 @@ using Vector2i = Vector2<int>;
 using Vector3i = Vector3<int>;
 using Vector4i = Vector4<int>;
 
-/**
- * @brief 3D Cross product
- * @tparam T Scalar type
- * @param a First vector
- * @param b Second vector
- * @return Cross product a × b
- */
-template<typename T>
-inline Vector3<T> cross(const Vector3<T>& a, const Vector3<T>& b) {
-    return Vector3<T>{
-        a[1] * b[2] - a[2] * b[1],
-        a[2] * b[0] - a[0] * b[2],
-        a[0] * b[1] - a[1] * b[0]
-    };
-}
-
-/**
- * @brief 2D Cross product (returns scalar - z component of 3D cross)
- * @tparam T Scalar type
- * @param a First vector
- * @param b Second vector
- * @return Scalar cross product
- */
-template<typename T>
-inline T cross(const Vector2<T>& a, const Vector2<T>& b) {
-    return a[0] * b[1] - a[1] * b[0];
-}
-
-/**
- * @brief Triple scalar product (a · (b × c))
- * @tparam T Scalar type
- * @param a First vector
- * @param b Second vector
- * @param c Third vector
- * @return Scalar triple product
- */
-template<typename T>
-inline T triple_product(const Vector3<T>& a, const Vector3<T>& b, const Vector3<T>& c) {
-    return a.dot(cross(b, c));
-}
-
-// Free functions for common operations
-
-/**
- * @brief Compute dot product
- */
-template<typename T, std::size_t N>
-inline T dot(const Vector<T, N>& a, const Vector<T, N>& b) {
-    return a.dot(b);
-}
-
-/**
- * @brief Compute Euclidean norm
- */
-template<typename T, std::size_t N>
-inline T norm(const Vector<T, N>& v) {
-    return v.norm();
-}
-
-/**
- * @brief Compute squared Euclidean norm
- */
-template<typename T, std::size_t N>
-inline T norm_squared(const Vector<T, N>& v) {
-    return v.norm_squared();
-}
-
-/**
- * @brief Get normalized vector
- */
-template<typename T, std::size_t N>
-inline Vector<T, N> normalize(const Vector<T, N>& v) {
-    return v.normalized();
-}
-
-/**
- * @brief Component-wise absolute value
- */
-template<typename T, std::size_t N>
-inline Vector<T, N> abs(const Vector<T, N>& v) {
-    Vector<T, N> result;
-    for (std::size_t i = 0; i < N; ++i) {
-        using std::abs;
-        result[i] = abs(v[i]);
-    }
-    return result;
-}
-
-/**
- * @brief Component-wise minimum
- */
-template<typename T, std::size_t N>
-inline Vector<T, N> min(const Vector<T, N>& a, const Vector<T, N>& b) {
-    Vector<T, N> result;
-    for (std::size_t i = 0; i < N; ++i) {
-        result[i] = std::min(a[i], b[i]);
-    }
-    return result;
-}
-
-/**
- * @brief Component-wise maximum
- */
-template<typename T, std::size_t N>
-inline Vector<T, N> max(const Vector<T, N>& a, const Vector<T, N>& b) {
-    Vector<T, N> result;
-    for (std::size_t i = 0; i < N; ++i) {
-        result[i] = std::max(a[i], b[i]);
-    }
-    return result;
-}
-
-/**
- * @brief Component-wise clamp
- */
-template<typename T, std::size_t N>
-inline Vector<T, N> clamp(const Vector<T, N>& v, const Vector<T, N>& min_v, const Vector<T, N>& max_v) {
-    Vector<T, N> result;
-    for (std::size_t i = 0; i < N; ++i) {
-        result[i] = std::clamp(v[i], min_v[i], max_v[i]);
-    }
-    return result;
-}
-
-/**
- * @brief Linear interpolation between vectors
- * @tparam T Scalar type
- * @tparam N Vector dimension
- * @param t Interpolation parameter [0, 1]
- * @param a Start vector (at t=0)
- * @param b End vector (at t=1)
- * @return Interpolated vector
- */
-template<typename T, std::size_t N>
-inline Vector<T, N> lerp(T t, const Vector<T, N>& a, const Vector<T, N>& b) {
-    return a + t * (b - a);
-}
-
-/**
- * @brief Spherical linear interpolation (for unit vectors)
- * @tparam T Scalar type
- * @param t Interpolation parameter [0, 1]
- * @param a Start unit vector
- * @param b End unit vector
- * @return Interpolated unit vector
- */
-template<typename T>
-inline Vector3<T> slerp(T t, const Vector3<T>& a, const Vector3<T>& b) {
-    T cos_angle = a.dot(b);
-
-    // Handle numerical issues
-    cos_angle = std::clamp(cos_angle, T(-1), T(1));
-
-    // If vectors are nearly parallel, use linear interpolation
-    if (cos_angle > T(0.9995)) {
-        return normalize(lerp(t, a, b));
-    }
-
-    T angle = std::acos(cos_angle);
-    T sin_angle = std::sin(angle);
-
-    T t0 = std::sin((T(1) - t) * angle) / sin_angle;
-    T t1 = std::sin(t * angle) / sin_angle;
-
-    return t0 * a + t1 * b;
-}
-
-/**
- * @brief Reflect vector about normal
- * @tparam T Scalar type
- * @tparam N Vector dimension
- * @param v Incident vector
- * @param n Normal vector (should be unit)
- * @return Reflected vector
- */
-template<typename T, std::size_t N>
-inline Vector<T, N> reflect(const Vector<T, N>& v, const Vector<T, N>& n) {
-    return v - T(2) * dot(v, n) * n;
-}
-
-/**
- * @brief Project vector onto another vector
- * @tparam T Scalar type
- * @tparam N Vector dimension
- * @param v Vector to project
- * @param onto Vector to project onto
- * @return Projection of v onto 'onto'
- */
-template<typename T, std::size_t N>
-inline Vector<T, N> project(const Vector<T, N>& v, const Vector<T, N>& onto) {
-    T denom = onto.norm_squared();
-    if (approx_zero(denom)) {
-        return Vector<T, N>::zeros();
-    }
-    return (dot(v, onto) / denom) * onto;
-}
-
-/**
- * @brief Get perpendicular component of vector
- * @tparam T Scalar type
- * @tparam N Vector dimension
- * @param v Vector
- * @param direction Direction to remove
- * @return Component of v perpendicular to direction
- */
-template<typename T, std::size_t N>
-inline Vector<T, N> perpendicular(const Vector<T, N>& v, const Vector<T, N>& direction) {
-    return v - project(v, direction);
-}
-
-/**
- * @brief Compute angle between two vectors
- * @tparam T Scalar type
- * @tparam N Vector dimension
- * @param a First vector
- * @param b Second vector
- * @return Angle in radians [0, π]
- */
-template<typename T, std::size_t N>
-inline T angle(const Vector<T, N>& a, const Vector<T, N>& b) {
-    T cos_angle = dot(a, b) / (norm(a) * norm(b));
-    cos_angle = std::clamp(cos_angle, T(-1), T(1));
-    return std::acos(cos_angle);
-}
-
-/**
- * @brief Check if two vectors are approximately equal
- * @tparam T Scalar type
- * @tparam N Vector dimension
- * @param a First vector
- * @param b Second vector
- * @param tol Tolerance
- * @return true if vectors are equal within tolerance
- */
-template<typename T, std::size_t N>
-inline bool approx_equal(const Vector<T, N>& a, const Vector<T, N>& b, T tol = tolerance<T>) {
-    for (std::size_t i = 0; i < N; ++i) {
-        if (!approx_equal(a[i], b[i], tol)) {
-            return false;
-        }
-    }
-    return true;
-}
-
-/**
- * @brief Stream output operator
- * @tparam T Scalar type
- * @tparam N Vector dimension
- * @param os Output stream
- * @param v Vector to output
- * @return Reference to output stream
- */
-template<typename T, std::size_t N>
-inline std::ostream& operator<<(std::ostream& os, const Vector<T, N>& v) {
-    os << "[";
-    for (std::size_t i = 0; i < N; ++i) {
-        if (i > 0) os << ", ";
-        os << v[i];
-    }
-    os << "]";
-    return os;
-}
-
 } // namespace math
 } // namespace FE
 } // namespace svmp
diff --git a/Code/Source/solver/FE/Math/VectorExpr.h b/Code/Source/solver/FE/Math/VectorExpr.h
deleted file mode 100644
index aa712dd63..000000000
--- a/Code/Source/solver/FE/Math/VectorExpr.h
+++ /dev/null
@@ -1,476 +0,0 @@
-// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef SVMP_FE_MATH_VECTOR_EXPR_H
-#define SVMP_FE_MATH_VECTOR_EXPR_H
-
-/**
- * @file VectorExpr.h
- * @brief Expression template infrastructure for lazy evaluation of vector operations
- *
- * This header provides expression templates that enable compound vector operations
- * without creating temporary objects. Operations are evaluated lazily at the point
- * of assignment, eliminating intermediate allocations and improving performance.
- */
-
-#include <cmath>
-#include <cstddef>
-#include <type_traits>
-
-namespace svmp {
-namespace FE {
-namespace math {
-namespace detail {
-namespace ops {
-
-struct Add {
-    template<typename T1, typename T2>
-    constexpr auto operator()(const T1& a, const T2& b) const {
-        return a + b;
-    }
-};
-
-struct Sub {
-    template<typename T1, typename T2>
-    constexpr auto operator()(const T1& a, const T2& b) const {
-        return a - b;
-    }
-};
-
-struct Mul {
-    template<typename T1, typename T2>
-    constexpr auto operator()(const T1& a, const T2& b) const {
-        return a * b;
-    }
-};
-
-struct Div {
-    template<typename T1, typename T2>
-    constexpr auto operator()(const T1& a, const T2& b) const {
-        return a / b;
-    }
-};
-
-struct Negate {
-    template<typename T>
-    constexpr auto operator()(const T& a) const {
-        return -a;
-    }
-};
-
-struct Abs {
-    template<typename T>
-    constexpr auto operator()(const T& a) const {
-        using std::abs;
-        return abs(a);
-    }
-};
-
-struct Sqrt {
-    template<typename T>
-    constexpr auto operator()(const T& a) const {
-        using std::sqrt;
-        return sqrt(a);
-    }
-};
-
-} // namespace ops
-} // namespace detail
-
-/**
- * @brief Base class for all vector expressions using CRTP
- * @tparam Derived The derived expression type
- *
- * This uses the Curiously Recurring Template Pattern (CRTP) to provide
- * static polymorphism for expression templates.
- */
-template<typename Derived>
-class VectorExpr {
-public:
-    /**
-     * @brief Get the derived expression
-     * @return Reference to the derived type
-     */
-    const Derived& derived() const {
-        return static_cast<const Derived&>(*this);
-    }
-
-    /**
-     * @brief Get the derived expression (non-const)
-     * @return Reference to the derived type
-     */
-    Derived& derived() {
-        return static_cast<Derived&>(*this);
-    }
-
-    /**
-     * @brief Access element by index
-     * @param i Element index
-     * @return Value at index i
-     */
-    auto operator[](std::size_t i) const {
-        return derived()[i];
-    }
-
-    /**
-     * @brief Get the size of the vector expression
-     * @return Number of elements
-     */
-    std::size_t size() const {
-        return derived().size();
-    }
-};
-
-/**
- * @brief Binary expression for element-wise operations between two vector expressions
- * @tparam LHS Left-hand side expression type
- * @tparam RHS Right-hand side expression type
- * @tparam Op Binary operation functor
- */
-template<typename LHS, typename RHS, typename Op>
-class VectorBinaryExpr : public VectorExpr<VectorBinaryExpr<LHS, RHS, Op>> {
-private:
-    LHS lhs_;
-    RHS rhs_;
-    Op op_;
-
-public:
-    /**
-     * @brief Construct binary expression
-     * @param lhs Left operand
-     * @param rhs Right operand
-     * @param op Operation to apply
-     */
-    constexpr VectorBinaryExpr(const LHS& lhs, const RHS& rhs, Op op = Op{})
-        : lhs_(lhs), rhs_(rhs), op_(op) {}
-
-    /**
-     * @brief Access element at index
-     * @param i Element index
-     * @return Result of operation on elements at index i
-     */
-    constexpr auto operator[](std::size_t i) const {
-        return op_(lhs_[i], rhs_[i]);
-    }
-
-    /**
-     * @brief Get size of expression (from left operand)
-     * @return Number of elements
-     */
-    constexpr std::size_t size() const {
-        return lhs_.size();
-    }
-};
-
-/**
- * @brief Unary expression for element-wise operations on a single vector expression
- * @tparam Expr Expression type
- * @tparam Op Unary operation functor
- */
-template<typename Expr, typename Op>
-class VectorUnaryExpr : public VectorExpr<VectorUnaryExpr<Expr, Op>> {
-private:
-    Expr expr_;
-    Op op_;
-
-public:
-    /**
-     * @brief Construct unary expression
-     * @param expr Operand expression
-     * @param op Operation to apply
-     */
-    constexpr VectorUnaryExpr(const Expr& expr, Op op = Op{})
-        : expr_(expr), op_(op) {}
-
-    /**
-     * @brief Access element at index
-     * @param i Element index
-     * @return Result of operation on element at index i
-     */
-    constexpr auto operator[](std::size_t i) const {
-        return op_(expr_[i]);
-    }
-
-    /**
-     * @brief Get size of expression
-     * @return Number of elements
-     */
-    constexpr std::size_t size() const {
-        return expr_.size();
-    }
-};
-
-/**
- * @brief Scalar multiplication expression
- * @tparam Expr Vector expression type
- * @tparam Scalar Scalar type
- */
-template<typename Expr, typename Scalar>
-class VectorScalarExpr : public VectorExpr<VectorScalarExpr<Expr, Scalar>> {
-private:
-    Expr expr_;
-    Scalar scalar_;
-
-public:
-    /**
-     * @brief Construct scalar multiplication expression
-     * @param expr Vector expression
-     * @param scalar Scalar value
-     */
-    constexpr VectorScalarExpr(const Expr& expr, Scalar scalar)
-        : expr_(expr), scalar_(scalar) {}
-
-    /**
-     * @brief Access element at index
-     * @param i Element index
-     * @return Element multiplied by scalar
-     */
-    constexpr auto operator[](std::size_t i) const {
-        return expr_[i] * scalar_;
-    }
-
-    /**
-     * @brief Get size of expression
-     * @return Number of elements
-     */
-    constexpr std::size_t size() const {
-        return expr_.size();
-    }
-};
-
-/**
- * @brief Scalar division expression
- * @tparam Expr Vector expression type
- * @tparam Scalar Scalar type
- */
-template<typename Expr, typename Scalar>
-class VectorScalarDivExpr : public VectorExpr<VectorScalarDivExpr<Expr, Scalar>> {
-private:
-    Expr expr_;
-    Scalar scalar_;
-
-public:
-    /**
-     * @brief Construct scalar division expression
-     * @param expr Vector expression
-     * @param scalar Scalar divisor
-     */
-    constexpr VectorScalarDivExpr(const Expr& expr, Scalar scalar)
-        : expr_(expr), scalar_(scalar) {}
-
-    /**
-     * @brief Access element at index
-     * @param i Element index
-     * @return Element divided by scalar
-     */
-    constexpr auto operator[](std::size_t i) const {
-        return expr_[i] / scalar_;
-    }
-
-    /**
-     * @brief Get size of expression
-     * @return Number of elements
-     */
-    constexpr std::size_t size() const {
-        return expr_.size();
-    }
-};
-
-/**
- * @brief Addition operator for vector expressions
- */
-template<typename LHS, typename RHS,
-         typename = std::enable_if_t<
-             std::is_base_of_v<VectorExpr<LHS>, LHS> &&
-             std::is_base_of_v<VectorExpr<RHS>, RHS>
-         >>
-constexpr auto operator+(const VectorExpr<LHS>& lhs, const VectorExpr<RHS>& rhs) {
-    return VectorBinaryExpr<LHS, RHS, detail::ops::Add>(
-        lhs.derived(), rhs.derived(), detail::ops::Add{}
-    );
-}
-
-/**
- * @brief Subtraction operator for vector expressions
- */
-template<typename LHS, typename RHS,
-         typename = std::enable_if_t<
-             std::is_base_of_v<VectorExpr<LHS>, LHS> &&
-             std::is_base_of_v<VectorExpr<RHS>, RHS>
-         >>
-constexpr auto operator-(const VectorExpr<LHS>& lhs, const VectorExpr<RHS>& rhs) {
-    return VectorBinaryExpr<LHS, RHS, detail::ops::Sub>(
-        lhs.derived(), rhs.derived(), detail::ops::Sub{}
-    );
-}
-
-/**
- * @brief Element-wise multiplication operator for vector expressions
- */
-template<typename LHS, typename RHS,
-         typename = std::enable_if_t<
-             std::is_base_of_v<VectorExpr<LHS>, LHS> &&
-             std::is_base_of_v<VectorExpr<RHS>, RHS>
-         >>
-constexpr auto hadamard(const VectorExpr<LHS>& lhs, const VectorExpr<RHS>& rhs) {
-    return VectorBinaryExpr<LHS, RHS, detail::ops::Mul>(
-        lhs.derived(), rhs.derived(), detail::ops::Mul{}
-    );
-}
-
-/**
- * @brief Element-wise division operator for vector expressions
- */
-template<typename LHS, typename RHS,
-         typename = std::enable_if_t<
-             std::is_base_of_v<VectorExpr<LHS>, LHS> &&
-             std::is_base_of_v<VectorExpr<RHS>, RHS>
-         >>
-constexpr auto hadamard_div(const VectorExpr<LHS>& lhs, const VectorExpr<RHS>& rhs) {
-    return VectorBinaryExpr<LHS, RHS, detail::ops::Div>(
-        lhs.derived(), rhs.derived(), detail::ops::Div{}
-    );
-}
-
-/**
- * @brief Negation operator for vector expressions
- */
-template<typename Expr,
-         typename = std::enable_if_t<
-             std::is_base_of_v<VectorExpr<Expr>, Expr>
-         >>
-constexpr auto operator-(const VectorExpr<Expr>& expr) {
-    return VectorUnaryExpr<Expr, detail::ops::Negate>(
-        expr.derived(), detail::ops::Negate{}
-    );
-}
-
-/**
- * @brief Scalar multiplication operator (vector * scalar)
- */
-template<typename Expr, typename Scalar,
-         typename = std::enable_if_t<
-             std::is_base_of_v<VectorExpr<Expr>, Expr> &&
-             std::is_arithmetic_v<Scalar>
-         >>
-constexpr auto operator*(const VectorExpr<Expr>& expr, Scalar scalar) {
-    return VectorScalarExpr<Expr, Scalar>(expr.derived(), scalar);
-}
-
-/**
- * @brief Scalar multiplication operator (scalar * vector)
- */
-template<typename Scalar, typename Expr,
-         typename = std::enable_if_t<
-             std::is_arithmetic_v<Scalar> &&
-             std::is_base_of_v<VectorExpr<Expr>, Expr>
-         >>
-constexpr auto operator*(Scalar scalar, const VectorExpr<Expr>& expr) {
-    return VectorScalarExpr<Expr, Scalar>(expr.derived(), scalar);
-}
-
-/**
- * @brief Scalar division operator (vector / scalar)
- */
-template<typename Expr, typename Scalar,
-         typename = std::enable_if_t<
-             std::is_base_of_v<VectorExpr<Expr>, Expr> &&
-             std::is_arithmetic_v<Scalar>
-         >>
-constexpr auto operator/(const VectorExpr<Expr>& expr, Scalar scalar) {
-    return VectorScalarDivExpr<Expr, Scalar>(expr.derived(), scalar);
-}
-
-/**
- * @brief Element-wise absolute value
- */
-template<typename Expr,
-         typename = std::enable_if_t<
-             std::is_base_of_v<VectorExpr<Expr>, Expr>
-         >>
-constexpr auto abs(const VectorExpr<Expr>& expr) {
-    return VectorUnaryExpr<Expr, detail::ops::Abs>(expr.derived(), detail::ops::Abs{});
-}
-
-/**
- * @brief Element-wise square root
- */
-template<typename Expr,
-         typename = std::enable_if_t<
-             std::is_base_of_v<VectorExpr<Expr>, Expr>
-         >>
-constexpr auto sqrt(const VectorExpr<Expr>& expr) {
-    return VectorUnaryExpr<Expr, detail::ops::Sqrt>(expr.derived(), detail::ops::Sqrt{});
-}
-
-/**
- * @brief Dot product for vector expressions
- * @tparam LHS Left vector expression type
- * @tparam RHS Right vector expression type
- * @param lhs Left operand
- * @param rhs Right operand
- * @return Dot product result
- */
-template<typename LHS, typename RHS,
-         typename = std::enable_if_t<
-             std::is_base_of_v<VectorExpr<LHS>, LHS> &&
-             std::is_base_of_v<VectorExpr<RHS>, RHS>
-         >>
-constexpr auto dot(const VectorExpr<LHS>& lhs, const VectorExpr<RHS>& rhs) {
-    using result_type = decltype(lhs.derived()[0] * rhs.derived()[0]);
-    result_type sum = result_type{0};
-    const auto n = lhs.size();
-    for (std::size_t i = 0; i < n; ++i) {
-        sum += lhs.derived()[i] * rhs.derived()[i];
-    }
-    return sum;
-}
-
-/**
- * @brief Compute norm squared of vector expression
- * @tparam Expr Vector expression type
- * @param expr Vector expression
- * @return Square of the Euclidean norm
- */
-template<typename Expr,
-         typename = std::enable_if_t<
-             std::is_base_of_v<VectorExpr<Expr>, Expr>
-         >>
-constexpr auto norm_squared(const VectorExpr<Expr>& expr) {
-    return dot(expr, expr);
-}
-
-/**
- * @brief Compute norm of vector expression
- * @tparam Expr Vector expression type
- * @param expr Vector expression
- * @return Euclidean norm
- */
-template<typename Expr,
-         typename = std::enable_if_t<
-             std::is_base_of_v<VectorExpr<Expr>, Expr>
-         >>
-constexpr auto norm(const VectorExpr<Expr>& expr) {
-    using std::sqrt;
-    return sqrt(norm_squared(expr));
-}
-
-/**
- * @brief Normalize vector expression
- * @tparam Expr Vector expression type
- * @param expr Vector expression
- * @return Normalized vector expression
- */
-template<typename Expr,
-         typename = std::enable_if_t<
-             std::is_base_of_v<VectorExpr<Expr>, Expr>
-         >>
-constexpr auto normalize(const VectorExpr<Expr>& expr) {
-    return expr / norm(expr);
-}
-
-} // namespace math
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_MATH_VECTOR_EXPR_H
diff --git a/Code/Source/solver/nn.cpp b/Code/Source/solver/nn.cpp
index eb6d35106..60fcddf81 100644
--- a/Code/Source/solver/nn.cpp
+++ b/Code/Source/solver/nn.cpp
@@ -28,8 +28,10 @@
 
 #include <array>
 #include <functional>
-#include <math.h> 
+#include <map>
+#include <math.h>
 #include <memory>
+#include <mutex>
 #include <optional>
 #include <span>
 #include <string>
@@ -55,12 +57,6 @@ namespace {
 namespace fe = svmp::FE;
 namespace febasis = svmp::FE::basis;
 
-struct BasisSelection {
-  fe::ElementType element;
-  fe::BasisType basis;
-  int order;
-};
-
 std::string solver_element_name(consts::ElementType eType)
 {
   auto it = consts::element_type_to_string.find(eType);
@@ -70,34 +66,42 @@ std::string solver_element_name(consts::ElementType eType)
   return "unknown (" + std::to_string(static_cast<int>(eType)) + ")";
 }
 
-std::optional<BasisSelection> to_basis_selection(consts::ElementType eType)
+/// Translate a solver element type into its FE library counterpart. This is a
+/// pure renaming between the two enum vocabularies: the FE library owns the
+/// choice of basis family and polynomial order for each element type
+/// (basis_factory::default_basis_request). The switch deliberately has no
+/// default case so that compilers building with -Wswitch flag any newly added
+/// solver element type that is missing a mapping here.
+std::optional<fe::ElementType> to_fe_element_type(consts::ElementType eType)
 {
-  static constexpr std::array supported{
-      BasisSelection{fe::ElementType::Line2,     fe::BasisType::Lagrange,    1},
-      BasisSelection{fe::ElementType::Line3,     fe::BasisType::Lagrange,    2},
-      BasisSelection{fe::ElementType::Triangle3, fe::BasisType::Lagrange,    1},
-      BasisSelection{fe::ElementType::Triangle6, fe::BasisType::Lagrange,    2},
-      BasisSelection{fe::ElementType::Quad4,     fe::BasisType::Lagrange,    1},
-      BasisSelection{fe::ElementType::Quad8,     fe::BasisType::Serendipity, 2},
-      BasisSelection{fe::ElementType::Quad9,     fe::BasisType::Lagrange,    2},
-      BasisSelection{fe::ElementType::Tetra4,    fe::BasisType::Lagrange,    1},
-      BasisSelection{fe::ElementType::Tetra10,   fe::BasisType::Lagrange,    2},
-      BasisSelection{fe::ElementType::Hex8,      fe::BasisType::Lagrange,    1},
-      BasisSelection{fe::ElementType::Hex20,     fe::BasisType::Serendipity, 2},
-      BasisSelection{fe::ElementType::Hex27,     fe::BasisType::Lagrange,    2},
-      BasisSelection{fe::ElementType::Wedge6,    fe::BasisType::Lagrange,    1},
-  };
-
-  const int index = static_cast<int>(eType) - static_cast<int>(consts::ElementType::LIN1);
-  if (index >= 0 && static_cast<std::size_t>(index) < supported.size()) {
-    return supported[static_cast<std::size_t>(index)];
+  switch (eType) {
+    case consts::ElementType::LIN1:  return fe::ElementType::Line2;
+    case consts::ElementType::LIN2:  return fe::ElementType::Line3;
+    case consts::ElementType::TRI3:  return fe::ElementType::Triangle3;
+    case consts::ElementType::TRI6:  return fe::ElementType::Triangle6;
+    case consts::ElementType::QUD4:  return fe::ElementType::Quad4;
+    case consts::ElementType::QUD8:  return fe::ElementType::Quad8;
+    case consts::ElementType::QUD9:  return fe::ElementType::Quad9;
+    case consts::ElementType::TET4:  return fe::ElementType::Tetra4;
+    case consts::ElementType::TET10: return fe::ElementType::Tetra10;
+    case consts::ElementType::HEX8:  return fe::ElementType::Hex8;
+    case consts::ElementType::HEX20: return fe::ElementType::Hex20;
+    case consts::ElementType::HEX27: return fe::ElementType::Hex27;
+    case consts::ElementType::WDG:   return fe::ElementType::Wedge6;
+
+    // No FE basis mapping: points use dedicated shape data in get_gnn and
+    // NURBS are outside the current FE Basis scope.
+    case consts::ElementType::NA:
+    case consts::ElementType::PNT:
+    case consts::ElementType::NRB:
+      return std::nullopt;
   }
   return std::nullopt;
 }
 
 bool use_basis_adapter_for(consts::ElementType eType)
 {
-  return to_basis_selection(eType).has_value();
+  return to_fe_element_type(eType).has_value();
 }
 
 bool supports_face_basis_adapter_for(consts::ElementType eType)
@@ -110,23 +114,36 @@ bool supports_face_basis_adapter_for(consts::ElementType eType)
     case consts::ElementType::QUD4:
     case consts::ElementType::QUD8:
     case consts::ElementType::QUD9:
-      return to_basis_selection(eType).has_value();
+      return use_basis_adapter_for(eType);
     default:
       return false;
   }
 }
 
-std::shared_ptr<febasis::BasisFunction> make_basis_for_solver_element(consts::ElementType eType)
+/// Return the shared FE basis for a solver element type, constructing it on
+/// first use. Basis construction is not free (node-lattice generation, and a
+/// Vandermonde inversion for quadrilateral serendipity), while callers invoke
+/// this per Gauss point or per probe point, so instances are cached per
+/// element type. Sharing is safe: bases are immutable after construction,
+/// evaluation is const, and BasisFunction scratch state is thread_local.
+const febasis::BasisFunction& basis_for_solver_element(consts::ElementType eType)
 {
-  auto selection = to_basis_selection(eType);
-  if (!selection) {
+  static std::mutex cache_mutex;
+  static std::map<consts::ElementType, std::shared_ptr<febasis::BasisFunction>> cache;
+
+  const auto fe_type = to_fe_element_type(eType);
+  if (!fe_type) {
     throw febasis::BasisElementCompatibilityException(
         "No FE Basis selection for solver element " + solver_element_name(eType),
         __FILE__, __LINE__, __func__);
   }
 
-  return febasis::basis_factory::create(
-      {selection->element, selection->basis, selection->order});
+  const std::lock_guard<std::mutex> lock(cache_mutex);
+  auto it = cache.find(eType);
+  if (it == cache.end()) {
+    it = cache.emplace(eType, febasis::basis_factory::create_default_for(*fe_type)).first;
+  }
+  return *it->second;
 }
 
 std::span<const std::size_t> solver_to_basis_node_map(consts::ElementType eType)
@@ -192,7 +209,9 @@ fe::math::Vector<fe::Real, 3> make_basis_point(const febasis::BasisFunction& bas
         __FILE__, __LINE__, __func__);
   }
 
-  fe::math::Vector<fe::Real, 3> point{};
+  // Inactive trailing components must be zero for lower-dimensional elements;
+  // Eigen-backed vectors are not zero-initialized by default construction.
+  fe::math::Vector<fe::Real, 3> point = fe::math::Vector<fe::Real, 3>::Zero();
   for (int d = 0; d < basis.dimension(); ++d) {
     point[static_cast<std::size_t>(d)] = xi(d, g);
   }
@@ -250,19 +269,19 @@ void evaluate_basis_values_and_gradients(const int insd,
                                          Array<double>& N,
                                          Array3<double>& Nx)
 {
-  auto basis = make_basis_for_solver_element(eType);
-  if (insd < basis->dimension()) {
+  const auto& basis = basis_for_solver_element(eType);
+  if (insd < basis.dimension()) {
     throw febasis::BasisConfigurationException(
         "solver insd " + std::to_string(insd) +
-            " is smaller than FE Basis reference dimension " + std::to_string(basis->dimension()),
+            " is smaller than FE Basis reference dimension " + std::to_string(basis.dimension()),
         __FILE__, __LINE__, __func__);
   }
 
-  const auto point = make_basis_point(*basis, g, xi);
+  const auto point = make_basis_point(basis, g, xi);
   std::vector<fe::Real> values;
   std::vector<febasis::Gradient> gradients;
-  basis->evaluate_values(point, values);
-  basis->evaluate_gradients(point, gradients);
+  basis.evaluate_values(point, values);
+  basis.evaluate_gradients(point, gradients);
 
   // FE Basis owns the formulas; fsType and mshType remain the solver-facing storage contract.
   copy_basis_values_to_solver_arrays(eType, eNoN, g, values, gradients, N, Nx);
@@ -355,15 +374,15 @@ void evaluate_basis_hessians(const int insd,
                              const Array<double>& xi,
                              Array3<double>& Nxx)
 {
-  auto basis = make_basis_for_solver_element(eType);
-  if (insd < basis->dimension()) {
+  const auto& basis = basis_for_solver_element(eType);
+  if (insd < basis.dimension()) {
     throw febasis::BasisConfigurationException(
         "solver insd " + std::to_string(insd) +
-            " is smaller than FE Basis reference dimension " + std::to_string(basis->dimension()),
+            " is smaller than FE Basis reference dimension " + std::to_string(basis.dimension()),
         __FILE__, __LINE__, __func__);
   }
 
-  const int required_components = required_nxx_components_for_dimension(basis->dimension());
+  const int required_components = required_nxx_components_for_dimension(basis.dimension());
   if (ind2 < required_components) {
     throw febasis::BasisConfigurationException(
         "solver ind2 " + std::to_string(ind2) +
@@ -371,12 +390,12 @@ void evaluate_basis_hessians(const int insd,
         __FILE__, __LINE__, __func__);
   }
 
-  const auto point = make_basis_point(*basis, gaus_pt, xi);
+  const auto point = make_basis_point(basis, gaus_pt, xi);
   std::vector<febasis::Hessian> hessians;
-  basis->evaluate_hessians(point, hessians);
+  basis.evaluate_hessians(point, hessians);
 
   // Solver Nxx packing is dxx, dyy, dxy in 2D and dxx, dyy, dzz, dxy, dyz, dxz in 3D.
-  copy_basis_hessians_to_solver_nxx(eType, eNoN, gaus_pt, basis->dimension(), hessians, Nxx);
+  copy_basis_hessians_to_solver_nxx(eType, eNoN, gaus_pt, basis.dimension(), hessians, Nxx);
 }
 
 void set_point_face_shape_data(const int gaus_pt, faceType& face)
diff --git a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
index d4bf1d6e5..60ca72114 100644
--- a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
@@ -34,6 +34,61 @@ class MinimalScalarBasis : public BasisFunction {
     }
 };
 
+// Quadratic scalar basis with exact analytic derivatives, used to verify the
+// protected numerical_gradient/numerical_hessian development helpers. Centered
+// differences are exact (up to roundoff) on quadratics, so any mismatch is a
+// bug in the helpers themselves.
+class ExactQuadraticBasis : public BasisFunction {
+public:
+    using BasisFunction::numerical_gradient;
+    using BasisFunction::numerical_hessian;
+
+    BasisType basis_type() const noexcept override { return BasisType::Custom; }
+    ElementType element_type() const noexcept override { return ElementType::Hex8; }
+    int dimension() const noexcept override { return 3; }
+    int order() const noexcept override { return 2; }
+    std::size_t size() const noexcept override { return 2u; }
+
+    void evaluate_values(const math::Vector<Real, 3>& xi,
+                         std::vector<Real>& values) const override
+    {
+        const Real x = xi[0];
+        const Real y = xi[1];
+        const Real z = xi[2];
+        values.resize(size());
+        values[0] = Real(1) + Real(2) * x - y + Real(0.5) * z +
+                    x * x + Real(0.75) * y * y - Real(0.25) * z * z +
+                    Real(0.2) * x * y - Real(0.3) * x * z + Real(0.4) * y * z;
+        values[1] = Real(3) - x + Real(2) * y + z +
+                    Real(0.5) * x * x - y * y + z * z +
+                    x * y + x * z - y * z;
+    }
+
+    void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                            std::vector<Gradient>& gradients) const override
+    {
+        const Real x = xi[0];
+        const Real y = xi[1];
+        const Real z = xi[2];
+        gradients.assign(size(), Gradient::Zero());
+        gradients[0][0] = Real(2) + Real(2) * x + Real(0.2) * y - Real(0.3) * z;
+        gradients[0][1] = Real(-1) + Real(1.5) * y + Real(0.2) * x + Real(0.4) * z;
+        gradients[0][2] = Real(0.5) - Real(0.5) * z - Real(0.3) * x + Real(0.4) * y;
+        gradients[1][0] = Real(-1) + x + y + z;
+        gradients[1][1] = Real(2) - Real(2) * y + x - z;
+        gradients[1][2] = Real(1) + Real(2) * z + x - y;
+    }
+
+    void exact_hessians(std::vector<Hessian>& hessians) const
+    {
+        hessians.assign(size(), Hessian::Zero());
+        hessians[0] = make_symmetric_hessian(Real(2), Real(1.5), Real(-0.5),
+                                             Real(0.2), Real(-0.3), Real(0.4));
+        hessians[1] = make_symmetric_hessian(Real(1), Real(-2), Real(2),
+                                             Real(1), Real(1), Real(-1));
+    }
+};
+
 class CompleteFallbackBasis : public BasisFunction {
 public:
     BasisType basis_type() const noexcept override { return BasisType::Lagrange; }
@@ -53,7 +108,7 @@ class CompleteFallbackBasis : public BasisFunction {
     void evaluate_gradients(const math::Vector<Real, 3>&,
                             std::vector<Gradient>& gradients) const override
     {
-        gradients.assign(size(), Gradient{});
+        gradients.assign(size(), Gradient::Zero());
         gradients[0][0] = Real(1);
         gradients[1][1] = Real(1);
     }
@@ -61,7 +116,7 @@ class CompleteFallbackBasis : public BasisFunction {
     void evaluate_hessians(const math::Vector<Real, 3>& xi,
                            std::vector<Hessian>& hessians) const override
     {
-        hessians.assign(size(), Hessian{});
+        hessians.assign(size(), Hessian::Zero());
         for (std::size_t d = 0; d < hessians.size(); ++d) {
             for (std::size_t r = 0; r < 3u; ++r) {
                 for (std::size_t c = 0; c < 3u; ++c) {
@@ -96,6 +151,16 @@ TEST(BasisErrorPaths, SerendipityInvalidRequestsThrowBasisExceptions) {
                  BasisElementCompatibilityException);
 }
 
+TEST(BasisErrorPaths, BasisFactoryRejectsNonC0Continuity) {
+    BasisRequest c1_request{ElementType::Line2, BasisType::Lagrange, 1};
+    c1_request.continuity = Continuity::C1;
+    EXPECT_THROW((void)basis_factory::create(c1_request), BasisConfigurationException);
+
+    BasisRequest l2_request{ElementType::Quad8, BasisType::Serendipity, 2};
+    l2_request.continuity = Continuity::L2;
+    EXPECT_THROW((void)basis_factory::create(l2_request), BasisConfigurationException);
+}
+
 TEST(BasisErrorPaths, BasisFactoryInvalidRequestsThrowBasisExceptions) {
     EXPECT_THROW((void)basis_factory::create(
                      BasisRequest{ElementType::Line2, BasisType::Lagrange}),
@@ -153,6 +218,43 @@ TEST(BasisErrorPaths, BasisFunctionDefaultsThrowForMissingDerivatives) {
     EXPECT_THROW(basis.evaluate_hessians(xi, hessians), BasisEvaluationException);
 }
 
+TEST(BasisErrorPaths, NumericalDerivativeHelpersMatchAnalyticDerivatives) {
+    ExactQuadraticBasis basis;
+    const math::Vector<Real, 3> xi{Real(0.2), Real(-0.35), Real(0.4)};
+
+    std::vector<Gradient> exact_gradients;
+    basis.evaluate_gradients(xi, exact_gradients);
+
+    std::vector<Gradient> approx_gradients;
+    basis.numerical_gradient(xi, approx_gradients);
+    ASSERT_EQ(approx_gradients.size(), basis.size());
+    for (std::size_t n = 0; n < basis.size(); ++n) {
+        for (int d = 0; d < basis.dimension(); ++d) {
+            const std::size_t sd = static_cast<std::size_t>(d);
+            EXPECT_NEAR(approx_gradients[n][sd], exact_gradients[n][sd], Real(1e-8))
+                << "basis=" << n << " component=" << d;
+        }
+    }
+
+    std::vector<Hessian> exact_hessians;
+    basis.exact_hessians(exact_hessians);
+
+    std::vector<Hessian> approx_hessians;
+    basis.numerical_hessian(xi, approx_hessians);
+    ASSERT_EQ(approx_hessians.size(), basis.size());
+    for (std::size_t n = 0; n < basis.size(); ++n) {
+        for (int r = 0; r < basis.dimension(); ++r) {
+            for (int c = 0; c < basis.dimension(); ++c) {
+                const std::size_t sr = static_cast<std::size_t>(r);
+                const std::size_t sc = static_cast<std::size_t>(c);
+                EXPECT_NEAR(approx_hessians[n](sr, sc), exact_hessians[n](sr, sc),
+                            Real(1e-8))
+                    << "basis=" << n << " component=(" << r << "," << c << ")";
+            }
+        }
+    }
+}
+
 TEST(BasisErrorPaths, BasisFunctionFallbackWritesRawLayouts) {
     CompleteFallbackBasis basis;
     const math::Vector<Real, 3> point{Real(0.25), Real(0.5), Real(-0.25)};
diff --git a/tests/unitTests/FE/Basis/test_BasisHessians.cpp b/tests/unitTests/FE/Basis/test_BasisHessians.cpp
index f786b07cd..9ad458c0b 100644
--- a/tests/unitTests/FE/Basis/test_BasisHessians.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisHessians.cpp
@@ -18,12 +18,39 @@ using namespace svmp::FE::basis;
 
 namespace {
 
+void numerical_gradient_helper(const BasisFunction& basis,
+                               const math::Vector<Real, 3>& xi,
+                               std::vector<Gradient>& gradients,
+                               Real eps = Real(1e-6))
+{
+    std::vector<Real> base;
+    basis.evaluate_values(xi, base);
+    gradients.assign(base.size(), Gradient::Zero());
+
+    for (int d = 0; d < basis.dimension(); ++d) {
+        const std::size_t sd = static_cast<std::size_t>(d);
+        math::Vector<Real, 3> xi_p = xi;
+        math::Vector<Real, 3> xi_m = xi;
+        xi_p[sd] += eps;
+        xi_m[sd] -= eps;
+
+        std::vector<Real> v_p;
+        std::vector<Real> v_m;
+        basis.evaluate_values(xi_p, v_p);
+        basis.evaluate_values(xi_m, v_m);
+
+        for (std::size_t n = 0; n < base.size(); ++n) {
+            gradients[n][sd] = (v_p[n] - v_m[n]) / (Real(2) * eps);
+        }
+    }
+}
+
 void numerical_hessian_helper(const BasisFunction& basis,
                               const math::Vector<Real, 3>& xi,
                               std::vector<Hessian>& hessians,
                               Real eps = Real(1e-5))
 {
-    hessians.assign(basis.size(), Hessian{});
+    hessians.assign(basis.size(), Hessian::Zero());
     const int dim = basis.dimension();
 
     for (int i = 0; i < dim; ++i) {
@@ -66,7 +93,31 @@ std::vector<math::Vector<Real, 3>> sample_points_for(ElementType type) {
     }
 }
 
-void expect_hessians_match_numerical(const LagrangeBasis& basis,
+void expect_gradients_match_numerical(const BasisFunction& basis,
+                                      const std::vector<math::Vector<Real, 3>>& points,
+                                      Real tol,
+                                      Real eps = Real(1e-6))
+{
+    for (const auto& xi : points) {
+        std::vector<Gradient> analytical;
+        std::vector<Gradient> numerical;
+        basis.evaluate_gradients(xi, analytical);
+        numerical_gradient_helper(basis, xi, numerical, eps);
+
+        ASSERT_EQ(analytical.size(), numerical.size());
+        for (std::size_t n = 0; n < analytical.size(); ++n) {
+            for (int d = 0; d < basis.dimension(); ++d) {
+                const std::size_t sd = static_cast<std::size_t>(d);
+                EXPECT_NEAR(analytical[n][sd], numerical[n][sd], tol)
+                    << "basis " << n << ", component " << d
+                    << ", element " << static_cast<int>(basis.element_type())
+                    << ", order " << basis.order();
+            }
+        }
+    }
+}
+
+void expect_hessians_match_numerical(const BasisFunction& basis,
                                      const std::vector<math::Vector<Real, 3>>& points,
                                      Real tol,
                                      Real eps = Real(1e-5))
@@ -100,7 +151,7 @@ void expect_partition_hessian_sum_zero(const LagrangeBasis& basis,
     std::vector<Hessian> hessians;
     basis.evaluate_hessians(xi, hessians);
 
-    Hessian sum{};
+    Hessian sum = Hessian::Zero();
     for (const auto& hessian : hessians) {
         for (std::size_t r = 0; r < 3u; ++r) {
             for (std::size_t c = 0; c < 3u; ++c) {
@@ -145,7 +196,7 @@ void expect_partition_hessian_sum_zero(const BasisFunction& basis,
     std::vector<Hessian> hessians;
     basis.evaluate_hessians(xi, hessians);
 
-    Hessian sum{};
+    Hessian sum = Hessian::Zero();
     for (const auto& hessian : hessians) {
         for (std::size_t r = 0; r < 3u; ++r) {
             for (std::size_t c = 0; c < 3u; ++c) {
@@ -183,6 +234,16 @@ void expect_hessians_symmetric(const BasisFunction& basis,
     }
 }
 
+std::vector<math::Vector<Real, 3>> serendipity_sample_points(ElementType type) {
+    if (type == ElementType::Quad4 || type == ElementType::Quad8) {
+        return {{Real(0.17), Real(-0.31), Real(0)}, {Real(-0.45), Real(0.25), Real(0)}};
+    }
+    if (type == ElementType::Hex8 || type == ElementType::Hex20) {
+        return {{Real(0.2), Real(-0.1), Real(0.3)}, {Real(-0.35), Real(0.25), Real(-0.15)}};
+    }
+    return {{Real(0.2), Real(0.3), Real(0.1)}, {Real(0.12), Real(0.16), Real(-0.2)}};
+}
+
 } // namespace
 
 TEST(BasisHessians, LagrangeCanonicalTopologiesMatchNumericalHessians) {
@@ -280,3 +341,75 @@ TEST(BasisHessians, SolverMappedVolumeSelectionsSatisfyInvariants) {
 
     EXPECT_EQ(covered, 13);
 }
+
+// Gradients must match centered finite differences of values. This is the only
+// check that ties the gradient code path back to the value code path; partition
+// sums and Hessian-vs-FD(gradient) comparisons cannot catch a systematic error
+// shared by the first- and second-derivative recurrences.
+TEST(BasisGradients, LagrangeCanonicalTopologiesMatchNumericalGradients) {
+    const struct Case {
+        ElementType type;
+        int order;
+        Real tol;
+    } cases[] = {
+        {ElementType::Line2, 3, Real(1e-8)},
+        {ElementType::Triangle3, 3, Real(1e-7)},
+        {ElementType::Quad4, 3, Real(1e-7)},
+        {ElementType::Tetra4, 2, Real(1e-7)},
+        {ElementType::Hex8, 2, Real(1e-7)},
+        {ElementType::Wedge6, 2, Real(1e-7)},
+    };
+
+    for (const auto& c : cases) {
+        LagrangeBasis basis(c.type, c.order);
+        expect_gradients_match_numerical(basis, sample_points_for(c.type), c.tol);
+    }
+}
+
+// The serendipity coefficient tables (Hex20 20x20, Wedge15 15x15) and the quad
+// inverse-Vandermonde path each differentiate values through hand-written code
+// that is independent of the value evaluation. Partition sums only verify that
+// the constant function differentiates to zero, and symmetry is assigned
+// structurally, so neither can detect a wrong derivative formula. Finite
+// differences of values are the authoritative check.
+TEST(BasisGradients, SerendipityFamiliesMatchNumericalGradients) {
+    const struct Case {
+        ElementType type;
+        int order;
+        Real tol;
+    } cases[] = {
+        {ElementType::Quad4, 1, Real(1e-8)},
+        {ElementType::Quad8, 2, Real(1e-7)},
+        {ElementType::Quad4, 3, Real(1e-7)},
+        {ElementType::Quad4, 4, Real(5e-7)},
+        {ElementType::Hex8, 1, Real(1e-8)},
+        {ElementType::Hex20, 2, Real(1e-7)},
+        {ElementType::Wedge15, 2, Real(1e-7)},
+    };
+
+    for (const auto& c : cases) {
+        SerendipityBasis basis(c.type, c.order);
+        expect_gradients_match_numerical(basis, serendipity_sample_points(c.type), c.tol);
+    }
+}
+
+TEST(BasisHessians, SerendipityFamiliesMatchNumericalHessians) {
+    const struct Case {
+        ElementType type;
+        int order;
+        Real tol;
+    } cases[] = {
+        {ElementType::Quad4, 1, Real(1e-6)},
+        {ElementType::Quad8, 2, Real(1e-6)},
+        {ElementType::Quad4, 3, Real(1e-6)},
+        {ElementType::Quad4, 4, Real(5e-6)},
+        {ElementType::Hex8, 1, Real(1e-6)},
+        {ElementType::Hex20, 2, Real(1e-6)},
+        {ElementType::Wedge15, 2, Real(1e-6)},
+    };
+
+    for (const auto& c : cases) {
+        SerendipityBasis basis(c.type, c.order);
+        expect_hessians_match_numerical(basis, serendipity_sample_points(c.type), c.tol);
+    }
+}
diff --git a/tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp b/tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp
index 3faffd9e0..8827eebb0 100644
--- a/tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp
+++ b/tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp
@@ -57,8 +57,8 @@ void expect_partition_gradient_hessian_sums(const LagrangeBasis& basis,
         basis.evaluate_all(xi, values, gradients, hessians);
 
         Real value_sum = Real(0);
-        Gradient gradient_sum{};
-        Hessian hessian_sum{};
+        Gradient gradient_sum = Gradient::Zero();
+        Hessian hessian_sum = Hessian::Zero();
         for (std::size_t i = 0; i < values.size(); ++i) {
             value_sum += values[i];
             for (std::size_t d = 0; d < 3u; ++d) {
@@ -137,3 +137,21 @@ TEST(HigherOrderWedge, OrderFourEvaluationsRemainFinite) {
     expect_all_entries_finite(wedge, {Real(0.2), Real(0.1), Real(-0.6)});
     expect_all_entries_finite(wedge, {Real(0.05), Real(0.8), Real(0.3)});
 }
+
+// Finiteness alone cannot detect a wrong triangle-index or axis-index lookup;
+// the Kronecker property validates the order-four node lattice and its inverse
+// index mapping end to end.
+TEST(HigherOrderWedge, OrderFourIsNodalAndPartitionsUnity) {
+    LagrangeBasis wedge(ElementType::Wedge6, 4);
+
+    EXPECT_EQ(wedge.size(), 75u);
+    expect_kronecker_at_nodes(wedge, Real(1e-9));
+    expect_partition_gradient_hessian_sums(
+        wedge,
+        {
+            {Real(0.18), Real(0.22), Real(-0.2)},
+            {Real(0.25), Real(0.15), Real(0.45)},
+        },
+        Real(1e-12),
+        Real(1e-7));
+}
diff --git a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
index 9d93f8931..8a1f43c58 100644
--- a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
@@ -10,6 +10,7 @@
 #include "FE/Basis/LagrangeBasis.h"
 #include "FE/Basis/NodeOrderingConventions.h"
 
+#include <algorithm>
 #include <array>
 #include <tuple>
 #include <vector>
@@ -90,8 +91,8 @@ void expect_partition_gradient_hessian_sums(const LagrangeBasis& basis,
         basis.evaluate_all(xi, values, gradients, hessians);
 
         Real value_sum = Real(0);
-        Gradient gradient_sum{};
-        Hessian hessian_sum{};
+        Gradient gradient_sum = Gradient::Zero();
+        Hessian hessian_sum = Hessian::Zero();
         for (std::size_t i = 0; i < values.size(); ++i) {
             value_sum += values[i];
             for (std::size_t d = 0; d < 3u; ++d) {
@@ -190,7 +191,7 @@ Real linear_function(const Point& p) {
 }
 
 Gradient linear_gradient() {
-    Gradient g{};
+    Gradient g = Gradient::Zero();
     g[0] = Real(3);
     g[1] = Real(-4);
     g[2] = Real(5);
@@ -204,6 +205,18 @@ Real quadratic_function(const Point& p) {
            Real(0.4) * p[1] * p[2];
 }
 
+// Total degree three, so it lies in both the P3 simplex space and the Q3
+// tensor-product space.
+Real cubic_function(const Point& p) {
+    return quadratic_function(p) +
+           Real(0.1) * p[0] * p[0] * p[0] -
+           Real(0.2) * p[1] * p[1] * p[1] +
+           Real(0.3) * p[2] * p[2] * p[2] +
+           Real(0.15) * p[0] * p[0] * p[1] -
+           Real(0.12) * p[0] * p[2] * p[2] +
+           Real(0.08) * p[0] * p[1] * p[2];
+}
+
 template<typename Function>
 Real interpolate_value(const LagrangeBasis& basis,
                        const std::vector<Real>& values,
@@ -338,7 +351,7 @@ TEST(LagrangeBasis, LinearPolynomialReproductionAcrossLinearTopologies) {
             interpolate_value(basis, values, linear_function);
         EXPECT_NEAR(interpolated, linear_function(point), Real(1e-12));
 
-        Gradient interpolated_gradient{};
+        Gradient interpolated_gradient = Gradient::Zero();
         for (std::size_t i = 0; i < gradients.size(); ++i) {
             const Real nodal_value = linear_function(basis.nodes()[i]);
             for (int d = 0; d < basis.dimension(); ++d) {
@@ -376,6 +389,192 @@ TEST(LagrangeBasis, QuadraticPolynomialReproductionAcrossQuadraticAliases) {
     }
 }
 
+// Tetra order >= 3 activates the face-interior node loops, tetra order >= 4
+// activates the volume-interior lattice, and hex order >= 3 activates the six
+// orientation-specific face traversals in NodeOrderingConventions. None of
+// those generation paths run at the orders covered elsewhere; the Kronecker
+// test is what validates the node lattice together with its llround-based
+// inverse index mapping (a duplicated or missing node makes the basis
+// non-nodal here).
+TEST(LagrangeBasis, HigherOrderLatticesAreNodalAndPartitionUnity) {
+    const struct Case {
+        ElementType type;
+        int order;
+        std::size_t size;
+        Real kronecker_tol;
+        Real derivative_tol;
+        std::vector<Point> points;
+    } cases[] = {
+        {ElementType::Tetra4, 3, 20u, Real(5e-10), Real(1e-8),
+         {{Real(0.12), Real(0.18), Real(0.16)}, {Real(0.3), Real(0.2), Real(0.25)}}},
+        {ElementType::Tetra4, 4, 35u, Real(1e-9), Real(1e-7),
+         {{Real(0.12), Real(0.18), Real(0.16)}, {Real(0.2), Real(0.1), Real(0.18)}}},
+        {ElementType::Hex8, 3, 64u, Real(5e-10), Real(1e-8),
+         {{Real(0.1), Real(-0.2), Real(0.3)}, {Real(-0.35), Real(0.25), Real(-0.15)}}},
+    };
+
+    for (const auto& c : cases) {
+        LagrangeBasis basis(c.type, c.order);
+        EXPECT_EQ(basis.size(), c.size);
+        expect_kronecker_at_nodes(basis, c.kronecker_tol);
+        expect_partition_gradient_hessian_sums(basis, c.points, c.derivative_tol);
+    }
+}
+
+TEST(LagrangeBasis, CubicPolynomialReproductionAtOrderThree) {
+    const std::vector<std::pair<ElementType, Point>> cases = {
+        {ElementType::Tetra4, {Real(0.15), Real(0.2), Real(0.25)}},
+        {ElementType::Hex8, {Real(0.15), Real(-0.2), Real(0.25)}},
+    };
+
+    for (const auto& [type, point] : cases) {
+        LagrangeBasis basis(type, 3);
+        std::vector<Real> values;
+        basis.evaluate_values(point, values);
+
+        const Real interpolated = interpolate_value(basis, values, cubic_function);
+        EXPECT_NEAR(interpolated, cubic_function(point), Real(1e-10))
+            << "element=" << static_cast<int>(type);
+    }
+}
+
+TEST(LagrangeBasis, PointTopologyEvaluatesConstantUnity) {
+    LagrangeBasis basis(ElementType::Point1, 0);
+
+    EXPECT_EQ(basis.element_type(), ElementType::Point1);
+    EXPECT_EQ(basis.size(), 1u);
+    EXPECT_EQ(basis.dimension(), 0);
+    ASSERT_EQ(basis.nodes().size(), 1u);
+
+    const Point xi{Real(0.3), Real(-0.4), Real(0.1)};
+    std::vector<Real> values;
+    std::vector<Gradient> gradients;
+    std::vector<Hessian> hessians;
+    basis.evaluate_all(xi, values, gradients, hessians);
+
+    ASSERT_EQ(values.size(), 1u);
+    EXPECT_EQ(values[0], Real(1));
+    for (std::size_t d = 0; d < 3u; ++d) {
+        EXPECT_EQ(gradients[0][d], Real(0));
+        for (std::size_t e = 0; e < 3u; ++e) {
+            EXPECT_EQ(hessians[0](d, e), Real(0));
+        }
+    }
+
+    Real flat_value = Real(-1);
+    Real flat_gradient[3] = {Real(-1), Real(-1), Real(-1)};
+    Real flat_hessian[9];
+    std::fill_n(flat_hessian, 9u, Real(-1));
+    basis.evaluate_values_to(xi, &flat_value);
+    basis.evaluate_gradients_to(xi, flat_gradient);
+    basis.evaluate_hessians_to(xi, flat_hessian);
+    EXPECT_EQ(flat_value, Real(1));
+    for (std::size_t d = 0; d < 3u; ++d) {
+        EXPECT_EQ(flat_gradient[d], Real(0));
+    }
+    for (std::size_t e = 0; e < 9u; ++e) {
+        EXPECT_EQ(flat_hessian[e], Real(0));
+    }
+}
+
+// P0 bases back piecewise-constant fields (e.g. pressure in mixed elements);
+// the order-zero branches in node generation and the simplex/tensor/wedge
+// evaluators have no other coverage.
+TEST(LagrangeBasis, OrderZeroBasesAreConstantUnity) {
+    const std::array<ElementType, 6> types = {
+        ElementType::Line2,
+        ElementType::Triangle3,
+        ElementType::Quad4,
+        ElementType::Tetra4,
+        ElementType::Hex8,
+        ElementType::Wedge6,
+    };
+
+    for (const auto type : types) {
+        LagrangeBasis basis(type, 0);
+        EXPECT_EQ(basis.order(), 0) << "element=" << static_cast<int>(type);
+        EXPECT_EQ(basis.size(), 1u) << "element=" << static_cast<int>(type);
+
+        for (const auto& xi : sample_points_for(type)) {
+            std::vector<Real> values;
+            std::vector<Gradient> gradients;
+            std::vector<Hessian> hessians;
+            basis.evaluate_all(xi, values, gradients, hessians);
+
+            ASSERT_EQ(values.size(), 1u);
+            EXPECT_NEAR(values[0], Real(1), Real(1e-14))
+                << "element=" << static_cast<int>(type);
+            for (std::size_t d = 0; d < 3u; ++d) {
+                EXPECT_NEAR(gradients[0][d], Real(0), Real(1e-14));
+                for (std::size_t e = 0; e < 3u; ++e) {
+                    EXPECT_NEAR(hessians[0](d, e), Real(0), Real(1e-14));
+                }
+            }
+        }
+    }
+}
+
+// Pins the default basis selection for every supported element type. The
+// solver adapter (nn.cpp) translates solver element names to ElementType and
+// delegates the family/order choice to default_basis_request; a silent change
+// here would change the discretization of every simulation using that element.
+TEST(BasisFactoryDefaults, SelectionsArePinnedForAllSupportedElements) {
+    struct Expected {
+        ElementType type;
+        BasisType family;
+        int order;
+        std::size_t size;
+    };
+    const std::vector<Expected> cases = {
+        {ElementType::Point1,    BasisType::Lagrange,    0, 1u},
+        {ElementType::Line2,     BasisType::Lagrange,    1, 2u},
+        {ElementType::Line3,     BasisType::Lagrange,    2, 3u},
+        {ElementType::Triangle3, BasisType::Lagrange,    1, 3u},
+        {ElementType::Triangle6, BasisType::Lagrange,    2, 6u},
+        {ElementType::Quad4,     BasisType::Lagrange,    1, 4u},
+        {ElementType::Quad8,     BasisType::Serendipity, 2, 8u},
+        {ElementType::Quad9,     BasisType::Lagrange,    2, 9u},
+        {ElementType::Tetra4,    BasisType::Lagrange,    1, 4u},
+        {ElementType::Tetra10,   BasisType::Lagrange,    2, 10u},
+        {ElementType::Hex8,      BasisType::Lagrange,    1, 8u},
+        {ElementType::Hex20,     BasisType::Serendipity, 2, 20u},
+        {ElementType::Hex27,     BasisType::Lagrange,    2, 27u},
+        {ElementType::Wedge6,    BasisType::Lagrange,    1, 6u},
+        {ElementType::Wedge15,   BasisType::Serendipity, 2, 15u},
+        {ElementType::Wedge18,   BasisType::Lagrange,    2, 18u},
+    };
+
+    for (const auto& expected : cases) {
+        const auto request = basis_factory::default_basis_request(expected.type);
+        EXPECT_EQ(request.element_type, expected.type)
+            << "element=" << static_cast<int>(expected.type);
+        EXPECT_EQ(request.basis_type, expected.family)
+            << "element=" << static_cast<int>(expected.type);
+        ASSERT_TRUE(request.order.has_value())
+            << "element=" << static_cast<int>(expected.type);
+        EXPECT_EQ(*request.order, expected.order)
+            << "element=" << static_cast<int>(expected.type);
+
+        auto basis = basis_factory::create_default_for(expected.type);
+        ASSERT_NE(basis, nullptr);
+        EXPECT_EQ(basis->basis_type(), expected.family)
+            << "element=" << static_cast<int>(expected.type);
+        EXPECT_EQ(basis->order(), expected.order)
+            << "element=" << static_cast<int>(expected.type);
+        EXPECT_EQ(basis->size(), expected.size)
+            << "element=" << static_cast<int>(expected.type);
+    }
+}
+
+TEST(BasisFactoryDefaults, RejectsElementsWithoutDefaultBasis) {
+    EXPECT_THROW((void)basis_factory::default_basis_request(ElementType::Pyramid5),
+                 BasisElementCompatibilityException);
+    EXPECT_THROW((void)basis_factory::default_basis_request(ElementType::Pyramid13),
+                 BasisElementCompatibilityException);
+    EXPECT_THROW((void)basis_factory::create_default_for(ElementType::Unknown),
+                 BasisElementCompatibilityException);
+}
+
 TEST(LagrangeBasis, FactoryCreatesReducedScalarBasisFamilies) {
     auto lagrange =
         basis_factory::create(BasisRequest{ElementType::Hex27, BasisType::Lagrange, 1});
diff --git a/tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp b/tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp
index 30f876420..235dc8c40 100644
--- a/tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp
+++ b/tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp
@@ -5,9 +5,11 @@
 
 #include <gtest/gtest.h>
 
+#include "FE/Basis/LagrangeBasis.h"
 #include "FE/Basis/NodeOrderingConventions.h"
 #include "FE/Basis/SerendipityBasis.h"
 
+#include <cmath>
 #include <vector>
 
 using namespace svmp::FE;
@@ -25,7 +27,7 @@ void expect_partition_of_unity(const SerendipityBasis& basis,
     basis.evaluate_gradients(xi, gradients);
 
     Real value_sum = Real(0);
-    Gradient gradient_sum{};
+    Gradient gradient_sum = Gradient::Zero();
     for (std::size_t i = 0; i < values.size(); ++i) {
         value_sum += values[i];
         for (std::size_t component = 0; component < 3u; ++component) {
@@ -68,6 +70,36 @@ std::vector<math::Vector<Real, 3>> reference_nodes(ElementType type,
     return nodes;
 }
 
+template<typename Function>
+Real interpolate_nodal_function(const SerendipityBasis& basis,
+                                const math::Vector<Real, 3>& xi,
+                                Function&& nodal_function)
+{
+    std::vector<Real> values;
+    basis.evaluate_values(xi, values);
+
+    Real result = Real(0);
+    const auto& nodes = basis.nodes();
+    for (std::size_t i = 0; i < values.size(); ++i) {
+        result += values[i] * nodal_function(nodes[i]);
+    }
+    return result;
+}
+
+// Every monomial here has superlinear degree at most three, so it lies in the
+// order-three quadrilateral serendipity space.
+Real cubic_serendipity_function(const math::Vector<Real, 3>& p) {
+    const Real x = p[0];
+    const Real y = p[1];
+    return Real(1) + Real(2) * x - y + Real(3) * x * y +
+           x * x * x - Real(2) * y * y * y +
+           Real(0.5) * x * x * x * y - Real(0.25) * x * y * y * y;
+}
+
+Real bilinear_function(const math::Vector<Real, 3>& p) {
+    return Real(2) - Real(3) * p[0] + Real(4) * p[1] + Real(0.5) * p[0] * p[1];
+}
+
 } // namespace
 
 TEST(SerendipityBasis, Quad8IsNodalAndPartitionsUnity) {
@@ -104,3 +136,154 @@ TEST(SerendipityBasis, RejectsUnsupportedSerendipityAliases) {
     EXPECT_THROW(SerendipityBasis(ElementType::Pyramid14, 2), FEException);
     EXPECT_THROW(SerendipityBasis(ElementType::Quad8, 3), FEException);
 }
+
+// Orders other than two run the generic quadrilateral path: serendipity
+// monomial selection, boundary plus interior node placement, and a runtime
+// Vandermonde inversion whose unisolvence is assumed rather than tabulated.
+// Order four is the first order that selects an interior node.
+TEST(SerendipityBasis, QuadrilateralOrdersOneThreeFourAreNodalAndPartitionUnity) {
+    const struct Case {
+        int order;
+        std::size_t size;
+    } cases[] = {
+        {1, 4u},
+        {3, 12u},
+        {4, 17u},
+    };
+
+    for (const auto& c : cases) {
+        SerendipityBasis basis(ElementType::Quad4, c.order);
+        EXPECT_EQ(basis.size(), c.size) << "order=" << c.order;
+        EXPECT_EQ(basis.order(), c.order);
+        EXPECT_EQ(basis.dimension(), 2);
+        ASSERT_EQ(basis.nodes().size(), c.size);
+
+        for (const auto& node : basis.nodes()) {
+            EXPECT_LE(std::abs(node[0]), Real(1));
+            EXPECT_LE(std::abs(node[1]), Real(1));
+        }
+
+        expect_nodal_delta(basis, basis.nodes(), Real(1e-9));
+        expect_partition_of_unity(basis, {Real(0.17), Real(-0.31), Real(0)}, Real(1e-9));
+        expect_partition_of_unity(basis, {Real(-0.45), Real(0.25), Real(0)}, Real(1e-9));
+    }
+}
+
+TEST(SerendipityBasis, QuadrilateralOrderOneReproducesBilinearFunctions) {
+    SerendipityBasis basis(ElementType::Quad4, 1);
+
+    const std::vector<math::Vector<Real, 3>> points = {
+        {Real(0.25), Real(-0.4), Real(0)},
+        {Real(-0.7), Real(0.6), Real(0)},
+    };
+    for (const auto& xi : points) {
+        EXPECT_NEAR(interpolate_nodal_function(basis, xi, bilinear_function),
+                    bilinear_function(xi),
+                    Real(1e-12));
+    }
+}
+
+TEST(SerendipityBasis, QuadrilateralOrderThreeReproducesSerendipityCubics) {
+    SerendipityBasis basis(ElementType::Quad4, 3);
+
+    const std::vector<math::Vector<Real, 3>> points = {
+        {Real(0.25), Real(-0.4), Real(0)},
+        {Real(-0.7), Real(0.6), Real(0)},
+    };
+    for (const auto& xi : points) {
+        EXPECT_NEAR(interpolate_nodal_function(basis, xi, cubic_serendipity_function),
+                    cubic_serendipity_function(xi),
+                    Real(1e-11));
+    }
+}
+
+// SerendipityBasis(Hex8, 1) is the only route to the hand-written trilinear
+// corner evaluator (values, gradients, and Hessians); it must agree with the
+// trilinear Lagrange basis on the same element.
+TEST(SerendipityBasis, TrilinearHexMatchesLagrangeHex8) {
+    SerendipityBasis serendipity(ElementType::Hex8, 1);
+    LagrangeBasis lagrange(ElementType::Hex8, 1);
+
+    EXPECT_EQ(serendipity.size(), 8u);
+    EXPECT_EQ(serendipity.dimension(), 3);
+    expect_nodal_delta(serendipity,
+                       reference_nodes(ElementType::Hex8, serendipity.size()),
+                       Real(1e-12));
+
+    const std::vector<math::Vector<Real, 3>> points = {
+        {Real(0.2), Real(-0.1), Real(0.3)},
+        {Real(-0.35), Real(0.25), Real(-0.15)},
+    };
+    for (const auto& xi : points) {
+        std::vector<Real> s_values;
+        std::vector<Real> l_values;
+        std::vector<Gradient> s_gradients;
+        std::vector<Gradient> l_gradients;
+        std::vector<Hessian> s_hessians;
+        std::vector<Hessian> l_hessians;
+        serendipity.evaluate_all(xi, s_values, s_gradients, s_hessians);
+        lagrange.evaluate_all(xi, l_values, l_gradients, l_hessians);
+
+        ASSERT_EQ(s_values.size(), l_values.size());
+        for (std::size_t i = 0; i < s_values.size(); ++i) {
+            EXPECT_NEAR(s_values[i], l_values[i], Real(1e-13));
+            for (std::size_t d = 0; d < 3u; ++d) {
+                EXPECT_NEAR(s_gradients[i][d], l_gradients[i][d], Real(1e-13));
+                for (std::size_t e = 0; e < 3u; ++e) {
+                    EXPECT_NEAR(s_hessians[i](d, e), l_hessians[i](d, e), Real(1e-13));
+                }
+            }
+        }
+    }
+}
+
+// Geometry mode keeps the public Hex20 node count while mapping geometry with
+// the trilinear corner functions: corners must match the Hex8 basis exactly
+// and the quadratic edge nodes must contribute nothing.
+TEST(SerendipityBasis, Hex20GeometryModeUsesTrilinearCornersOnly) {
+    SerendipityBasis geometry(ElementType::Hex20, 2, true);
+    SerendipityBasis trilinear(ElementType::Hex8, 1);
+
+    EXPECT_EQ(geometry.size(), 20u);
+    EXPECT_EQ(geometry.order(), 2);
+
+    const std::vector<math::Vector<Real, 3>> points = {
+        {Real(0.2), Real(-0.1), Real(0.3)},
+        {Real(-0.35), Real(0.25), Real(-0.15)},
+    };
+    for (const auto& xi : points) {
+        std::vector<Real> g_values;
+        std::vector<Gradient> g_gradients;
+        std::vector<Hessian> g_hessians;
+        geometry.evaluate_all(xi, g_values, g_gradients, g_hessians);
+        ASSERT_EQ(g_values.size(), 20u);
+
+        std::vector<Real> t_values;
+        std::vector<Gradient> t_gradients;
+        std::vector<Hessian> t_hessians;
+        trilinear.evaluate_all(xi, t_values, t_gradients, t_hessians);
+
+        Real value_sum = Real(0);
+        for (std::size_t i = 0; i < 20u; ++i) {
+            value_sum += g_values[i];
+            if (i < 8u) {
+                EXPECT_NEAR(g_values[i], t_values[i], Real(1e-13)) << "corner=" << i;
+                for (std::size_t d = 0; d < 3u; ++d) {
+                    EXPECT_NEAR(g_gradients[i][d], t_gradients[i][d], Real(1e-13));
+                    for (std::size_t e = 0; e < 3u; ++e) {
+                        EXPECT_NEAR(g_hessians[i](d, e), t_hessians[i](d, e), Real(1e-13));
+                    }
+                }
+            } else {
+                EXPECT_EQ(g_values[i], Real(0)) << "edge node=" << i;
+                for (std::size_t d = 0; d < 3u; ++d) {
+                    EXPECT_EQ(g_gradients[i][d], Real(0));
+                    for (std::size_t e = 0; e < 3u; ++e) {
+                        EXPECT_EQ(g_hessians[i](d, e), Real(0));
+                    }
+                }
+            }
+        }
+        EXPECT_NEAR(value_sum, Real(1), Real(1e-13));
+    }
+}
diff --git a/tests/unitTests/FE/Math/test_DenseLinearAlgebra.cpp b/tests/unitTests/FE/Math/test_DenseLinearAlgebra.cpp
index 2b44ad2bf..9e9e08e95 100644
--- a/tests/unitTests/FE/Math/test_DenseLinearAlgebra.cpp
+++ b/tests/unitTests/FE/Math/test_DenseLinearAlgebra.cpp
@@ -108,6 +108,132 @@ TEST(DenseLinearAlgebra, FactorizationSolvesDenseRightHandSideBlock) {
     }
 }
 
+// Every other matrix in this file already has its largest pivot on the
+// diagonal, so without these cases the row-exchange branch in
+// factor_dense_matrix and the permutation replay in solve_in_place never
+// execute. SerendipityBasis inverts its Vandermonde matrices through this
+// code in production.
+TEST(DenseLinearAlgebra, FactorizationPivotsThroughZeroLeadingDiagonal) {
+    const std::vector<Real> swap_2x2{
+        Real(0), Real(1),
+        Real(1), Real(0)
+    };
+
+    const auto solver = factor_dense_matrix(swap_2x2, 2u, "swap 2x2");
+    const std::vector<Real> rhs{Real(3), Real(7)};
+    const auto x = solver.solve(std::span<const Real>(rhs.data(), rhs.size()));
+    ASSERT_EQ(x.size(), 2u);
+    EXPECT_NEAR(x[0], Real(7), Real(1.0e-14));
+    EXPECT_NEAR(x[1], Real(3), Real(1.0e-14));
+
+    const auto inv = invert_dense_matrix(swap_2x2, 2u, "swap 2x2");
+    for (std::size_t row = 0; row < 2u; ++row) {
+        for (std::size_t col = 0; col < 2u; ++col) {
+            EXPECT_NEAR(inv[row * 2u + col], swap_2x2[row * 2u + col], Real(1.0e-14));
+        }
+    }
+
+    // Every column requires a row exchange during elimination.
+    const std::vector<Real> permuted_scaled{
+        Real(0), Real(0), Real(1), Real(0),
+        Real(1), Real(0), Real(0), Real(0),
+        Real(0), Real(0), Real(0), Real(2),
+        Real(0), Real(3), Real(0), Real(0)
+    };
+
+    const auto inv4 = invert_dense_matrix(permuted_scaled, 4u, "permuted scaled 4x4");
+    for (std::size_t row = 0; row < 4u; ++row) {
+        for (std::size_t col = 0; col < 4u; ++col) {
+            const Real expected = (row == col) ? Real(1) : Real(0);
+            EXPECT_NEAR(multiply_entry(permuted_scaled, inv4, 4u, row, col),
+                        expected,
+                        Real(1.0e-14));
+        }
+    }
+}
+
+TEST(DenseLinearAlgebra, WideMultiRhsSolveWithPivoting) {
+    // Requires a row swap in column 0 and uses a wide right-hand-side block to
+    // exercise the row-interleaved multi-RHS layout end to end.
+    const std::vector<Real> A{
+        Real(0), Real(2), Real(1),
+        Real(4), Real(1), Real(0),
+        Real(1), Real(0), Real(3)
+    };
+    constexpr std::size_t kRhsCount = 33u;
+
+    const auto solver = factor_dense_matrix(A, 3u, "pivoting 3x3");
+
+    std::vector<Real> rhs(3u * kRhsCount, Real(0));
+    for (std::size_t row = 0; row < 3u; ++row) {
+        for (std::size_t r = 0; r < kRhsCount; ++r) {
+            rhs[row * kRhsCount + r] =
+                Real(1) + static_cast<Real>(row) - Real(0.25) * static_cast<Real>(r % 7u);
+        }
+    }
+    const auto original_rhs = rhs;
+
+    solver.solve_in_place(std::span<Real>(rhs.data(), rhs.size()), kRhsCount);
+
+    for (std::size_t r = 0; r < kRhsCount; ++r) {
+        for (std::size_t row = 0; row < 3u; ++row) {
+            Real ax = Real(0);
+            for (std::size_t col = 0; col < 3u; ++col) {
+                ax += A[row * 3u + col] * rhs[col * kRhsCount + r];
+            }
+            EXPECT_NEAR(ax, original_rhs[row * kRhsCount + r], Real(1.0e-12))
+                << "rhs column " << r << ", row " << row;
+        }
+    }
+}
+
+TEST(DenseLinearAlgebra, SolveInPlaceValidatesInputs) {
+    const std::vector<Real> identity{
+        Real(1), Real(0),
+        Real(0), Real(1)
+    };
+    const auto solver = factor_dense_matrix(identity, 2u, "identity 2x2");
+
+    std::vector<Real> rhs{Real(1), Real(2)};
+    EXPECT_THROW(solver.solve_in_place(std::span<Real>(rhs.data(), rhs.size()), 0u),
+                 FEException);
+
+    std::vector<Real> wrong_size{Real(1), Real(2), Real(3)};
+    EXPECT_THROW(
+        solver.solve_in_place(std::span<Real>(wrong_size.data(), wrong_size.size()), 1u),
+        FEException);
+
+    DenseLUSolver unfactored;
+    unfactored.n = 2u;
+    unfactored.label = "unfactored";
+    EXPECT_FALSE(unfactored.empty());
+    EXPECT_THROW(unfactored.solve_in_place(std::span<Real>(rhs.data(), rhs.size()), 1u),
+                 FEException);
+}
+
+TEST(DenseLinearAlgebra, DiagnosticValidationRejectsRankMismatch) {
+    DenseInverseResult result;
+    result.diagnostics.rank = 1u;
+
+    EXPECT_THROW(validate_dense_inverse_diagnostics(result, 2u, "rank mismatch"),
+                 FEException);
+}
+
+TEST(DenseLinearAlgebra, RankHandlesNonSquareMatrices) {
+    const std::vector<Real> wide_full{
+        Real(1), Real(0), Real(2),
+        Real(0), Real(1), Real(-1)
+    };
+    EXPECT_EQ(dense_matrix_rank(wide_full, 2u, 3u), 2u);
+
+    const std::vector<Real> tall_rank_one{
+        Real(1), Real(2),
+        Real(2), Real(4),
+        Real(3), Real(6)
+    };
+    EXPECT_EQ(dense_matrix_rank(tall_rank_one, 3u, 2u), 1u);
+}
+
 TEST(DenseLinearAlgebra, HighConditionInverseUsesSvdFallback) {
     const std::vector<Real> high_condition{
         Real(1), Real(0),
@@ -117,13 +243,9 @@ TEST(DenseLinearAlgebra, HighConditionInverseUsesSvdFallback) {
     const auto result =
         invert_dense_matrix_with_diagnostics(high_condition, 2u, "high-condition diagonal");
     EXPECT_EQ(result.diagnostics.rank, 2u);
-#if defined(FE_HAS_EIGEN) && FE_HAS_EIGEN
     EXPECT_GT(result.diagnostics.condition_estimate,
               dense_matrix_condition_fallback_threshold());
     EXPECT_TRUE(result.used_svd_fallback);
-#else
-    EXPECT_FALSE(result.used_svd_fallback);
-#endif
 
     for (std::size_t row = 0; row < 2u; ++row) {
         for (std::size_t col = 0; col < 2u; ++col) {
@@ -136,9 +258,6 @@ TEST(DenseLinearAlgebra, HighConditionInverseUsesSvdFallback) {
 }
 
 TEST(DenseLinearAlgebra, DiagnosticValidationRejectsUnsupportedCondition) {
-#if !(defined(FE_HAS_EIGEN) && FE_HAS_EIGEN)
-    GTEST_SKIP() << "condition rejection requires FE_ENABLE_EIGEN diagnostics";
-#endif
     DenseInverseResult result;
     result.diagnostics.rank = 2u;
     result.diagnostics.condition_estimate =
@@ -193,13 +312,9 @@ TEST(DenseLinearAlgebra, DiagnosticsReportRankAndConditionEstimate) {
     const auto full =
         dense_matrix_diagnostics(diagonal, 2u, 2u, "diagonal 2x2");
     EXPECT_EQ(full.rank, 2u);
-#if defined(FE_HAS_EIGEN) && FE_HAS_EIGEN
     EXPECT_NEAR(full.largest_singular_value, Real(4), Real(1.0e-14));
     EXPECT_NEAR(full.smallest_retained_singular_value, Real(0.5), Real(1.0e-14));
     EXPECT_NEAR(full.condition_estimate, Real(8), Real(1.0e-14));
-#else
-    EXPECT_TRUE(std::isinf(full.condition_estimate));
-#endif
 
     const std::vector<Real> rank_one{
         Real(1), Real(2),
@@ -212,9 +327,6 @@ TEST(DenseLinearAlgebra, DiagnosticsReportRankAndConditionEstimate) {
 }
 
 TEST(DenseLinearAlgebra, PseudoInverseHandlesSingularMatrixWithoutNormalEquations) {
-#if !(defined(FE_HAS_EIGEN) && FE_HAS_EIGEN)
-    GTEST_SKIP() << "rank-revealing pseudo-inverse requires FE_ENABLE_EIGEN";
-#endif
     const std::vector<Real> rank_one{
         Real(1), Real(2),
         Real(2), Real(4)
@@ -246,9 +358,6 @@ TEST(DenseLinearAlgebra, PseudoInverseHandlesSingularMatrixWithoutNormalEquation
 }
 
 TEST(DenseLinearAlgebra, PseudoInverseDropsNearZeroSingularValues) {
-#if !(defined(FE_HAS_EIGEN) && FE_HAS_EIGEN)
-    GTEST_SKIP() << "rank-revealing pseudo-inverse requires FE_ENABLE_EIGEN";
-#endif
     const std::vector<Real> near_singular{
         Real(1), Real(0),
         Real(0), Real(1.0e-18)
diff --git a/tests/unitTests/FE/Math/test_Matrix.cpp b/tests/unitTests/FE/Math/test_Matrix.cpp
deleted file mode 100644
index 3b2fe664a..000000000
--- a/tests/unitTests/FE/Math/test_Matrix.cpp
+++ /dev/null
@@ -1,593 +0,0 @@
-/**
- * @file test_Matrix.cpp
- * @brief Unit tests for Matrix.h - fixed-size matrices with expression templates
- */
-
-#include <gtest/gtest.h>
-#include "FE/Math/Matrix.h"
-#include "FE/Math/Vector.h"
-#include "FE/Math/MatrixExpr.h"
-#include <limits>
-#include <cmath>
-#include <thread>
-#include <vector>
-
-using namespace svmp::FE::math;
-
-// Test fixture for Matrix tests
-class MatrixTest : public ::testing::Test {
-protected:
-    static constexpr double tolerance = 1e-14;
-
-    void SetUp() override {}
-    void TearDown() override {}
-
-    // Helper function to check if two values are approximately equal
-    template<typename T>
-    bool approx_equal(T a, T b, T tol = tolerance) {
-        return std::abs(a - b) <= tol;
-    }
-};
-
-// =============================================================================
-// Construction and Initialization Tests
-// =============================================================================
-
-TEST_F(MatrixTest, DefaultConstruction) {
-    Matrix<double, 3, 3> m;
-    for (size_t i = 0; i < 3; ++i) {
-        for (size_t j = 0; j < 3; ++j) {
-            EXPECT_EQ(m(i, j), 0.0);
-        }
-    }
-}
-
-TEST_F(MatrixTest, FillConstruction) {
-    Matrix<double, 2, 3> m(5.0);
-    for (size_t i = 0; i < 2; ++i) {
-        for (size_t j = 0; j < 3; ++j) {
-            EXPECT_EQ(m(i, j), 5.0);
-        }
-    }
-}
-
-TEST_F(MatrixTest, InitializerListConstruction) {
-    Matrix<double, 2, 3> m{{1.0, 2.0, 3.0},
-                           {4.0, 5.0, 6.0}};
-
-    EXPECT_EQ(m(0, 0), 1.0);
-    EXPECT_EQ(m(0, 1), 2.0);
-    EXPECT_EQ(m(0, 2), 3.0);
-    EXPECT_EQ(m(1, 0), 4.0);
-    EXPECT_EQ(m(1, 1), 5.0);
-    EXPECT_EQ(m(1, 2), 6.0);
-}
-
-TEST_F(MatrixTest, CopyConstruction) {
-    Matrix<double, 2, 2> m1{{1.0, 2.0},
-                            {3.0, 4.0}};
-    Matrix<double, 2, 2> m2(m1);
-
-    EXPECT_EQ(m2(0, 0), 1.0);
-    EXPECT_EQ(m2(0, 1), 2.0);
-    EXPECT_EQ(m2(1, 0), 3.0);
-    EXPECT_EQ(m2(1, 1), 4.0);
-
-    // Ensure deep copy
-    m2(0, 0) = 10.0;
-    EXPECT_EQ(m1(0, 0), 1.0);
-    EXPECT_EQ(m2(0, 0), 10.0);
-}
-
-TEST_F(MatrixTest, MoveConstruction) {
-    Matrix<double, 2, 2> m1{{1.0, 2.0},
-                            {3.0, 4.0}};
-    Matrix<double, 2, 2> m2(std::move(m1));
-
-    EXPECT_EQ(m2(0, 0), 1.0);
-    EXPECT_EQ(m2(0, 1), 2.0);
-    EXPECT_EQ(m2(1, 0), 3.0);
-    EXPECT_EQ(m2(1, 1), 4.0);
-}
-
-// =============================================================================
-// Element Access Tests
-// =============================================================================
-
-TEST_F(MatrixTest, ElementAccess) {
-    Matrix<double, 2, 3> m{{1.0, 2.0, 3.0},
-                           {4.0, 5.0, 6.0}};
-
-    // Non-const access using operator()
-    EXPECT_EQ(m(0, 0), 1.0);
-    EXPECT_EQ(m(0, 2), 3.0);
-    EXPECT_EQ(m(1, 1), 5.0);
-
-    // Modification
-    m(1, 2) = 7.0;
-    EXPECT_EQ(m(1, 2), 7.0);
-
-    // Const access
-    const Matrix<double, 2, 3> cm{{1.0, 2.0, 3.0},
-                                  {4.0, 5.0, 6.0}};
-    EXPECT_EQ(cm(0, 1), 2.0);
-    EXPECT_EQ(cm(1, 0), 4.0);
-}
-
-TEST_F(MatrixTest, ElementAccessBounds) {
-    Matrix<double, 2, 3> m{{1.0, 2.0, 3.0},
-                           {4.0, 5.0, 6.0}};
-
-    // at() with bounds checking
-    EXPECT_EQ(m.at(0, 0), 1.0);
-    EXPECT_EQ(m.at(1, 2), 6.0);
-
-    // Test out of bounds throws
-    EXPECT_THROW(m.at(2, 0), std::out_of_range);
-    EXPECT_THROW(m.at(0, 3), std::out_of_range);
-    EXPECT_THROW(m.at(10, 10), std::out_of_range);
-}
-
-TEST_F(MatrixTest, RowColumnAccess) {
-    Matrix<double, 3, 3> m{{1.0, 2.0, 3.0},
-                           {4.0, 5.0, 6.0},
-                           {7.0, 8.0, 9.0}};
-
-    // Get row
-    auto row1 = m.row(1);
-    EXPECT_EQ(row1[0], 4.0);
-    EXPECT_EQ(row1[1], 5.0);
-    EXPECT_EQ(row1[2], 6.0);
-
-    // Get column
-    auto col2 = m.col(2);
-    EXPECT_EQ(col2[0], 3.0);
-    EXPECT_EQ(col2[1], 6.0);
-    EXPECT_EQ(col2[2], 9.0);
-
-    // Set row
-    Vector<double, 3> new_row{10.0, 11.0, 12.0};
-    m.set_row(0, new_row);
-    EXPECT_EQ(m(0, 0), 10.0);
-    EXPECT_EQ(m(0, 1), 11.0);
-    EXPECT_EQ(m(0, 2), 12.0);
-
-    // Set column
-    Vector<double, 3> new_col{20.0, 21.0, 22.0};
-    m.set_col(1, new_col);
-    EXPECT_EQ(m(0, 1), 20.0);
-    EXPECT_EQ(m(1, 1), 21.0);
-    EXPECT_EQ(m(2, 1), 22.0);
-}
-
-// =============================================================================
-// Arithmetic Operations Tests
-// =============================================================================
-
-TEST_F(MatrixTest, Addition) {
-    Matrix<double, 2, 3> a{{1.0, 2.0, 3.0},
-                           {4.0, 5.0, 6.0}};
-    Matrix<double, 2, 3> b{{7.0, 8.0, 9.0},
-                           {10.0, 11.0, 12.0}};
-
-    Matrix<double, 2, 3> c = a + b;
-    EXPECT_EQ(c(0, 0), 8.0);
-    EXPECT_EQ(c(0, 1), 10.0);
-    EXPECT_EQ(c(0, 2), 12.0);
-    EXPECT_EQ(c(1, 0), 14.0);
-    EXPECT_EQ(c(1, 1), 16.0);
-    EXPECT_EQ(c(1, 2), 18.0);
-}
-
-TEST_F(MatrixTest, Subtraction) {
-    Matrix<double, 2, 3> a{{8.0, 10.0, 12.0},
-                           {14.0, 16.0, 18.0}};
-    Matrix<double, 2, 3> b{{7.0, 8.0, 9.0},
-                           {10.0, 11.0, 12.0}};
-
-    Matrix<double, 2, 3> c = a - b;
-    EXPECT_EQ(c(0, 0), 1.0);
-    EXPECT_EQ(c(0, 1), 2.0);
-    EXPECT_EQ(c(0, 2), 3.0);
-    EXPECT_EQ(c(1, 0), 4.0);
-    EXPECT_EQ(c(1, 1), 5.0);
-    EXPECT_EQ(c(1, 2), 6.0);
-}
-
-TEST_F(MatrixTest, ScalarMultiplication) {
-    Matrix<double, 2, 2> a{{1.0, 2.0},
-                           {3.0, 4.0}};
-
-    Matrix<double, 2, 2> b = 2.0 * a;
-    EXPECT_EQ(b(0, 0), 2.0);
-    EXPECT_EQ(b(0, 1), 4.0);
-    EXPECT_EQ(b(1, 0), 6.0);
-    EXPECT_EQ(b(1, 1), 8.0);
-
-    Matrix<double, 2, 2> c = a * 3.0;
-    EXPECT_EQ(c(0, 0), 3.0);
-    EXPECT_EQ(c(0, 1), 6.0);
-    EXPECT_EQ(c(1, 0), 9.0);
-    EXPECT_EQ(c(1, 1), 12.0);
-}
-
-TEST_F(MatrixTest, ScalarDivision) {
-    Matrix<double, 2, 2> a{{2.0, 4.0},
-                           {6.0, 8.0}};
-
-    Matrix<double, 2, 2> b = a / 2.0;
-    EXPECT_EQ(b(0, 0), 1.0);
-    EXPECT_EQ(b(0, 1), 2.0);
-    EXPECT_EQ(b(1, 0), 3.0);
-    EXPECT_EQ(b(1, 1), 4.0);
-}
-
-TEST_F(MatrixTest, MatrixMultiplication) {
-    Matrix<double, 2, 3> a{{1.0, 2.0, 3.0},
-                           {4.0, 5.0, 6.0}};
-    Matrix<double, 3, 2> b{{7.0, 8.0},
-                           {9.0, 10.0},
-                           {11.0, 12.0}};
-
-    Matrix<double, 2, 2> c = a * b;
-    EXPECT_EQ(c(0, 0), 58.0);   // 1*7 + 2*9 + 3*11
-    EXPECT_EQ(c(0, 1), 64.0);   // 1*8 + 2*10 + 3*12
-    EXPECT_EQ(c(1, 0), 139.0);  // 4*7 + 5*9 + 6*11
-    EXPECT_EQ(c(1, 1), 154.0);  // 4*8 + 5*10 + 6*12
-}
-
-TEST_F(MatrixTest, MatrixVectorMultiplication) {
-    Matrix<double, 3, 3> m{{1.0, 2.0, 3.0},
-                           {4.0, 5.0, 6.0},
-                           {7.0, 8.0, 9.0}};
-    Vector<double, 3> v{1.0, 2.0, 3.0};
-
-    Vector<double, 3> result = m * v;
-    EXPECT_EQ(result[0], 14.0);  // 1*1 + 2*2 + 3*3
-    EXPECT_EQ(result[1], 32.0);  // 4*1 + 5*2 + 6*3
-    EXPECT_EQ(result[2], 50.0);  // 7*1 + 8*2 + 9*3
-}
-
-// =============================================================================
-// Special Matrix Operations Tests
-// =============================================================================
-
-TEST_F(MatrixTest, Transpose) {
-    Matrix<double, 2, 3> m{{1.0, 2.0, 3.0},
-                           {4.0, 5.0, 6.0}};
-
-    Matrix<double, 3, 2> mt = m.transpose();
-    EXPECT_EQ(mt(0, 0), 1.0);
-    EXPECT_EQ(mt(0, 1), 4.0);
-    EXPECT_EQ(mt(1, 0), 2.0);
-    EXPECT_EQ(mt(1, 1), 5.0);
-    EXPECT_EQ(mt(2, 0), 3.0);
-    EXPECT_EQ(mt(2, 1), 6.0);
-}
-
-TEST_F(MatrixTest, Determinant2x2) {
-    Matrix<double, 2, 2> m{{1.0, 2.0},
-                           {3.0, 4.0}};
-
-    double det = m.determinant();
-    EXPECT_EQ(det, -2.0);  // 1*4 - 2*3 = 4 - 6 = -2
-}
-
-TEST_F(MatrixTest, Determinant3x3) {
-    Matrix<double, 3, 3> m{{1.0, 2.0, 3.0},
-                           {0.0, 1.0, 4.0},
-                           {5.0, 6.0, 0.0}};
-
-    double det = m.determinant();
-    EXPECT_EQ(det, 1.0);  // Using Sarrus rule
-}
-
-TEST_F(MatrixTest, Determinant4x4) {
-    Matrix<double, 4, 4> m{{1, 0, 0, 0},
-                           {0, 2, 0, 0},
-                           {0, 0, 3, 0},
-                           {0, 0, 0, 4}};
-
-    double det = m.determinant();
-    EXPECT_EQ(det, 24.0);  // 1*2*3*4 = 24 (diagonal matrix)
-}
-
-TEST_F(MatrixTest, Inverse2x2) {
-    Matrix<double, 2, 2> m{{1.0, 2.0},
-                           {3.0, 4.0}};
-
-    Matrix<double, 2, 2> inv = m.inverse();
-
-    // Check inverse properties
-    EXPECT_NEAR(inv(0, 0), -2.0, tolerance);
-    EXPECT_NEAR(inv(0, 1), 1.0, tolerance);
-    EXPECT_NEAR(inv(1, 0), 1.5, tolerance);
-    EXPECT_NEAR(inv(1, 1), -0.5, tolerance);
-
-    // Verify M * M^-1 = I
-    Matrix<double, 2, 2> identity = m * inv;
-    EXPECT_NEAR(identity(0, 0), 1.0, tolerance);
-    EXPECT_NEAR(identity(0, 1), 0.0, tolerance);
-    EXPECT_NEAR(identity(1, 0), 0.0, tolerance);
-    EXPECT_NEAR(identity(1, 1), 1.0, tolerance);
-}
-
-TEST_F(MatrixTest, Inverse3x3) {
-    Matrix<double, 3, 3> m{{1.0, 2.0, 3.0},
-                           {0.0, 1.0, 4.0},
-                           {5.0, 6.0, 0.0}};
-
-    Matrix<double, 3, 3> inv = m.inverse();
-
-    // Verify M * M^-1 = I
-    Matrix<double, 3, 3> identity = m * inv;
-    for (size_t i = 0; i < 3; ++i) {
-        for (size_t j = 0; j < 3; ++j) {
-            double expected = (i == j) ? 1.0 : 0.0;
-            EXPECT_NEAR(identity(i, j), expected, tolerance);
-        }
-    }
-}
-
-TEST_F(MatrixTest, Trace) {
-    Matrix<double, 3, 3> m{{1.0, 2.0, 3.0},
-                           {4.0, 5.0, 6.0},
-                           {7.0, 8.0, 9.0}};
-
-    double trace = m.trace();
-    EXPECT_EQ(trace, 15.0);  // 1 + 5 + 9 = 15
-}
-
-// =============================================================================
-// Special Matrix Types Tests
-// =============================================================================
-
-TEST_F(MatrixTest, IdentityMatrix) {
-    Matrix<double, 3, 3> I = Matrix<double, 3, 3>::identity();
-
-    for (size_t i = 0; i < 3; ++i) {
-        for (size_t j = 0; j < 3; ++j) {
-            double expected = (i == j) ? 1.0 : 0.0;
-            EXPECT_EQ(I(i, j), expected);
-        }
-    }
-
-    // Test identity property
-    Matrix<double, 3, 3> m{{1.0, 2.0, 3.0},
-                           {4.0, 5.0, 6.0},
-                           {7.0, 8.0, 9.0}};
-    Matrix<double, 3, 3> result = m * I;
-
-    for (size_t i = 0; i < 3; ++i) {
-        for (size_t j = 0; j < 3; ++j) {
-            EXPECT_EQ(result(i, j), m(i, j));
-        }
-    }
-}
-
-TEST_F(MatrixTest, ZeroMatrix) {
-    Matrix<double, 2, 3> Z = Matrix<double, 2, 3>::zero();
-
-    for (size_t i = 0; i < 2; ++i) {
-        for (size_t j = 0; j < 3; ++j) {
-            EXPECT_EQ(Z(i, j), 0.0);
-        }
-    }
-}
-
-TEST_F(MatrixTest, DiagonalMatrix) {
-    Vector<double, 3> diag{1.0, 2.0, 3.0};
-    Matrix<double, 3, 3> D = Matrix<double, 3, 3>::diagonal(diag);
-
-    EXPECT_EQ(D(0, 0), 1.0);
-    EXPECT_EQ(D(1, 1), 2.0);
-    EXPECT_EQ(D(2, 2), 3.0);
-
-    // Off-diagonal elements should be zero
-    EXPECT_EQ(D(0, 1), 0.0);
-    EXPECT_EQ(D(0, 2), 0.0);
-    EXPECT_EQ(D(1, 0), 0.0);
-    EXPECT_EQ(D(1, 2), 0.0);
-    EXPECT_EQ(D(2, 0), 0.0);
-    EXPECT_EQ(D(2, 1), 0.0);
-}
-
-// =============================================================================
-// Expression Template Tests
-// =============================================================================
-
-TEST_F(MatrixTest, ExpressionTemplatesNoTemporaries) {
-    Matrix<double, 2, 2> a{{1, 2}, {3, 4}};
-    Matrix<double, 2, 2> b{{5, 6}, {7, 8}};
-    Matrix<double, 2, 2> c{{9, 10}, {11, 12}};
-
-    // Complex expression should create no temporaries
-    Matrix<double, 2, 2> result = a + b - c;
-
-    EXPECT_EQ(result(0, 0), -3.0);   // 1 + 5 - 9
-    EXPECT_EQ(result(0, 1), -2.0);   // 2 + 6 - 10
-    EXPECT_EQ(result(1, 0), -1.0);   // 3 + 7 - 11
-    EXPECT_EQ(result(1, 1), 0.0);    // 4 + 8 - 12
-}
-
-TEST_F(MatrixTest, LazyEvaluation) {
-    Matrix<double, 2, 2> a{{1, 2}, {3, 4}};
-    Matrix<double, 2, 2> b{{5, 6}, {7, 8}};
-
-    // Expression should not be evaluated until assignment
-    auto expr = a + b;  // No computation yet
-
-    Matrix<double, 2, 2> result = expr;  // Evaluation happens here
-    EXPECT_EQ(result(0, 0), 6.0);
-    EXPECT_EQ(result(0, 1), 8.0);
-}
-
-// =============================================================================
-// Edge Cases and Error Handling Tests
-// =============================================================================
-
-TEST_F(MatrixTest, SingularMatrixInverse) {
-    Matrix<double, 2, 2> singular{{1.0, 2.0},
-                                  {2.0, 4.0}};  // det = 0
-
-    EXPECT_THROW(singular.inverse(), std::runtime_error);
-}
-
-TEST_F(MatrixTest, DivisionByZero) {
-    Matrix<double, 2, 2> m{{1.0, 2.0},
-                           {3.0, 4.0}};
-
-    Matrix<double, 2, 2> result = m / 0.0;
-    EXPECT_TRUE(std::isinf(result(0, 0)));
-    EXPECT_TRUE(std::isinf(result(0, 1)));
-}
-
-TEST_F(MatrixTest, ExtremeLargeValues) {
-    double large = 1e308;
-    Matrix<double, 2, 2> m{{large, 0}, {0, large}};
-
-    Matrix<double, 2, 2> half = m / 2.0;
-    EXPECT_FALSE(std::isinf(half(0, 0)));
-    EXPECT_EQ(half(0, 0), large / 2.0);
-}
-
-// =============================================================================
-// Numerical Precision Tests
-// =============================================================================
-
-TEST_F(MatrixTest, NumericalStability) {
-    // Test near-singular matrix
-    double eps = 1e-15;
-    Matrix<double, 2, 2> m{{1.0, 1.0},
-                           {1.0, 1.0 + eps}};
-
-    double det = m.determinant();
-    // Relax tolerance due to floating-point arithmetic in determinant calculation
-    EXPECT_NEAR(det, eps, 1e-14);
-}
-
-TEST_F(MatrixTest, OrthogonalMatrixProperties) {
-    // Create rotation matrix (orthogonal)
-    double angle = M_PI / 4;
-    Matrix<double, 2, 2> R{{cos(angle), -sin(angle)},
-                           {sin(angle), cos(angle)}};
-
-    // Check orthogonality: R * R^T = I
-    Matrix<double, 2, 2> RRt = R * R.transpose();
-    EXPECT_NEAR(RRt(0, 0), 1.0, tolerance);
-    EXPECT_NEAR(RRt(0, 1), 0.0, tolerance);
-    EXPECT_NEAR(RRt(1, 0), 0.0, tolerance);
-    EXPECT_NEAR(RRt(1, 1), 1.0, tolerance);
-
-    // Check determinant = ±1
-    EXPECT_NEAR(std::abs(R.determinant()), 1.0, tolerance);
-}
-
-// =============================================================================
-// Matrix Properties Tests
-// =============================================================================
-
-TEST_F(MatrixTest, IsSymmetric) {
-    Matrix<double, 3, 3> sym{{1, 2, 3},
-                             {2, 4, 5},
-                             {3, 5, 6}};
-    EXPECT_TRUE(sym.is_symmetric(tolerance));
-
-    Matrix<double, 3, 3> nonsym{{1, 2, 3},
-                                {4, 5, 6},
-                                {7, 8, 9}};
-    EXPECT_FALSE(nonsym.is_symmetric(tolerance));
-}
-
-TEST_F(MatrixTest, IsSkewSymmetric) {
-    Matrix<double, 3, 3> skew{{0, -1, 2},
-                              {1, 0, -3},
-                              {-2, 3, 0}};
-    EXPECT_TRUE(skew.is_skew_symmetric(tolerance));
-
-    Matrix<double, 3, 3> nonskew{{1, 2, 3},
-                                 {4, 5, 6},
-                                 {7, 8, 9}};
-    EXPECT_FALSE(nonskew.is_skew_symmetric(tolerance));
-}
-
-TEST_F(MatrixTest, IsDiagonal) {
-    Matrix<double, 3, 3> diag{{1, 0, 0},
-                              {0, 2, 0},
-                              {0, 0, 3}};
-    EXPECT_TRUE(diag.is_diagonal(tolerance));
-
-    Matrix<double, 3, 3> nondiag{{1, 0.1, 0},
-                                 {0, 2, 0},
-                                 {0, 0, 3}};
-    EXPECT_FALSE(nondiag.is_diagonal(tolerance));
-}
-
-// =============================================================================
-// Thread Safety Tests
-// =============================================================================
-
-TEST_F(MatrixTest, ThreadSafetyReadOnly) {
-    Matrix<double, 3, 3> m{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
-
-    std::vector<std::thread> threads;
-    std::vector<double> results(10);
-
-    for (int i = 0; i < 10; ++i) {
-        threads.emplace_back([&m, &results, i]() {
-            results[static_cast<std::size_t>(i)] = m.trace();
-        });
-    }
-
-    for (auto& t : threads) {
-        t.join();
-    }
-
-    for (double r : results) {
-        EXPECT_EQ(r, 15.0);
-    }
-}
-
-// =============================================================================
-// Memory Alignment Tests
-// =============================================================================
-
-TEST_F(MatrixTest, MemoryAlignment) {
-    Matrix<double, 3, 3> m;
-
-    std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(m.data());
-    EXPECT_EQ(addr % 32, 0) << "Matrix data should be 32-byte aligned for AVX";
-}
-
-// =============================================================================
-// Utility Function Tests
-// =============================================================================
-
-TEST_F(MatrixTest, Norms) {
-    Matrix<double, 2, 2> m{{1, 2}, {3, 4}};
-
-    // Frobenius norm: sqrt(1^2 + 2^2 + 3^2 + 4^2) = sqrt(30)
-    EXPECT_NEAR(m.frobenius_norm(), std::sqrt(30.0), tolerance);
-
-    // Infinity norm (max row sum)
-    EXPECT_EQ(m.infinity_norm(), 7.0);  // max(|1|+|2|, |3|+|4|) = max(3, 7)
-
-    // One norm (max column sum)
-    EXPECT_EQ(m.one_norm(), 6.0);  // max(|1|+|3|, |2|+|4|) = max(4, 6)
-}
-
-TEST_F(MatrixTest, MinMaxElements) {
-    Matrix<double, 2, 3> m{{3, -1, 4}, {1, -2, 5}};
-
-    EXPECT_EQ(m.min(), -2.0);
-    EXPECT_EQ(m.max(), 5.0);
-}
-
-TEST_F(MatrixTest, ToString) {
-    Matrix<double, 2, 2> m{{1, 2}, {3, 4}};
-    std::stringstream ss;
-    ss << m;
-
-    std::string expected = "[[1, 2]\n [3, 4]]";
-    EXPECT_EQ(ss.str(), expected);
-}
diff --git a/tests/unitTests/FE/Math/test_MatrixExpr.cpp b/tests/unitTests/FE/Math/test_MatrixExpr.cpp
deleted file mode 100644
index b17bce928..000000000
--- a/tests/unitTests/FE/Math/test_MatrixExpr.cpp
+++ /dev/null
@@ -1,527 +0,0 @@
-/**
- * @file test_MatrixExpr.cpp
- * @brief Unit tests for MatrixExpr.h - matrix expression templates
- */
-
-#include <gtest/gtest.h>
-#include "FE/Math/Matrix.h"
-#include "FE/Math/MatrixExpr.h"
-#include "FE/Math/Vector.h"
-#include <limits>
-#include <cmath>
-#include <memory>
-#include <atomic>
-#include <type_traits>
-
-using namespace svmp::FE::math;
-
-// Test fixture for MatrixExpr tests
-class MatrixExprTest : public ::testing::Test {
-protected:
-    static constexpr double tolerance = 1e-14;
-
-    // Custom allocator to track memory allocations
-    template<typename T>
-    class TrackingAllocator {
-    public:
-        using value_type = T;
-
-        static std::atomic<size_t> allocations;
-        static std::atomic<size_t> deallocations;
-        static std::atomic<size_t> bytes_allocated;
-
-        TrackingAllocator() = default;
-
-        template<typename U>
-        TrackingAllocator(const TrackingAllocator<U>&) {}
-
-        T* allocate(size_t n) {
-            allocations.fetch_add(1);
-            bytes_allocated.fetch_add(n * sizeof(T));
-            return static_cast<T*>(::operator new(n * sizeof(T)));
-        }
-
-        void deallocate(T* p, size_t n) {
-            deallocations.fetch_add(1);
-            ::operator delete(p);
-        }
-
-        static void reset() {
-            allocations = 0;
-            deallocations = 0;
-            bytes_allocated = 0;
-        }
-    };
-
-    void SetUp() override {
-        TrackingAllocator<double>::reset();
-    }
-
-    void TearDown() override {}
-
-    template<typename T>
-    bool approx_equal(T a, T b, T tol = tolerance) {
-        return std::abs(a - b) <= tol;
-    }
-};
-
-template<typename T>
-std::atomic<size_t> MatrixExprTest::TrackingAllocator<T>::allocations{0};
-template<typename T>
-std::atomic<size_t> MatrixExprTest::TrackingAllocator<T>::deallocations{0};
-template<typename T>
-std::atomic<size_t> MatrixExprTest::TrackingAllocator<T>::bytes_allocated{0};
-
-// =============================================================================
-// Lazy Evaluation Tests
-// =============================================================================
-
-TEST_F(MatrixExprTest, LazyEvaluationNoTemporaries) {
-    // Expression templates should not create temporary matrices
-    Matrix<double, 2, 2> A{{1.0, 2.0}, {3.0, 4.0}};
-    Matrix<double, 2, 2> B{{5.0, 6.0}, {7.0, 8.0}};
-    Matrix<double, 2, 2> C{{9.0, 10.0}, {11.0, 12.0}};
-
-    // Build expression without evaluation
-    auto expr = A + B - C;
-
-    // Expression type should not be Matrix, but an expression type
-    using ExprType = decltype(expr);
-    EXPECT_FALSE((std::is_same_v<ExprType, Matrix<double, 2, 2>>));
-
-    // Now evaluate
-    Matrix<double, 2, 2> result = expr;
-    EXPECT_DOUBLE_EQ(result(0, 0), -3.0);
-    EXPECT_DOUBLE_EQ(result(0, 1), -2.0);
-    EXPECT_DOUBLE_EQ(result(1, 0), -1.0);
-    EXPECT_DOUBLE_EQ(result(1, 1), 0.0);
-}
-
-TEST_F(MatrixExprTest, LazyEvaluationAccessPattern) {
-    Matrix<double, 3, 3> A;
-    Matrix<double, 3, 3> B;
-    for (int i = 0; i < 3; ++i) {
-        for (int j = 0; j < 3; ++j) {
-            A(i, j) = i * 3 + j + 1;
-            B(i, j) = (i * 3 + j + 1) * 2;
-        }
-    }
-
-    auto expr = A + B;
-
-    // Access individual elements without full evaluation
-    EXPECT_DOUBLE_EQ(expr(0, 0), 3.0);
-    EXPECT_DOUBLE_EQ(expr(1, 1), 15.0);
-    EXPECT_DOUBLE_EQ(expr(2, 2), 27.0);
-
-    // Size should be accessible
-    EXPECT_EQ(expr.rows(), 3u);
-    EXPECT_EQ(expr.cols(), 3u);
-}
-
-// =============================================================================
-// Matrix Multiplication Tests
-// =============================================================================
-
-TEST_F(MatrixExprTest, MatrixMultiplicationExpression) {
-    Matrix<double, 2, 3> A{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}};
-    Matrix<double, 3, 2> B{{7.0, 8.0}, {9.0, 10.0}, {11.0, 12.0}};
-
-    // Matrix multiplication should produce 2x2 result
-    Matrix<double, 2, 2> C = A * B;
-
-    // Verify results
-    EXPECT_DOUBLE_EQ(C(0, 0), 58.0);   // 1*7 + 2*9 + 3*11
-    EXPECT_DOUBLE_EQ(C(0, 1), 64.0);   // 1*8 + 2*10 + 3*12
-    EXPECT_DOUBLE_EQ(C(1, 0), 139.0);  // 4*7 + 5*9 + 6*11
-    EXPECT_DOUBLE_EQ(C(1, 1), 154.0);  // 4*8 + 5*10 + 6*12
-}
-
-TEST_F(MatrixExprTest, ChainedMatrixMultiplication) {
-    Matrix<double, 2, 2> A{{1.0, 2.0}, {3.0, 4.0}};
-    Matrix<double, 2, 2> B{{5.0, 6.0}, {7.0, 8.0}};
-    Matrix<double, 2, 2> C{{9.0, 10.0}, {11.0, 12.0}};
-
-    // Chain matrix multiplications: (A * B) * C
-    Matrix<double, 2, 2> result = A * B * C;
-
-    // First compute A * B
-    Matrix<double, 2, 2> AB = A * B;
-    EXPECT_DOUBLE_EQ(AB(0, 0), 19.0);  // 1*5 + 2*7
-    EXPECT_DOUBLE_EQ(AB(0, 1), 22.0);  // 1*6 + 2*8
-    EXPECT_DOUBLE_EQ(AB(1, 0), 43.0);  // 3*5 + 4*7
-    EXPECT_DOUBLE_EQ(AB(1, 1), 50.0);  // 3*6 + 4*8
-
-    // Then (A * B) * C
-    EXPECT_DOUBLE_EQ(result(0, 0), 413.0);  // 19*9 + 22*11
-    EXPECT_DOUBLE_EQ(result(0, 1), 454.0);  // 19*10 + 22*12
-    EXPECT_DOUBLE_EQ(result(1, 0), 937.0);  // 43*9 + 50*11
-    EXPECT_DOUBLE_EQ(result(1, 1), 1030.0); // 43*10 + 50*12
-}
-
-// =============================================================================
-// Mixed Operations Tests
-// =============================================================================
-
-TEST_F(MatrixExprTest, MixedMatrixOperations) {
-    Matrix<double, 3, 3> A, B, C, D;
-
-    // Initialize matrices
-    for (int i = 0; i < 3; ++i) {
-        for (int j = 0; j < 3; ++j) {
-            A(i, j) = i + j + 1;
-            B(i, j) = (i + 1) * (j + 1);
-            C(i, j) = i * j + 1;
-            D(i, j) = 1.0;
-        }
-    }
-
-    // Complex expression: A * B + C * D
-    Matrix<double, 3, 3> result = A * B + C * D;
-
-    // Verify a few key elements
-    Matrix<double, 3, 3> AB = A * B;
-    Matrix<double, 3, 3> CD = C * D;
-
-    for (int i = 0; i < 3; ++i) {
-        for (int j = 0; j < 3; ++j) {
-            EXPECT_DOUBLE_EQ(result(i, j), AB(i, j) + CD(i, j));
-        }
-    }
-}
-
-TEST_F(MatrixExprTest, ScalarMultiplicationInExpression) {
-    Matrix<double, 2, 2> A{{1.0, 2.0}, {3.0, 4.0}};
-    Matrix<double, 2, 2> B{{5.0, 6.0}, {7.0, 8.0}};
-
-    Matrix<double, 2, 2> result = 2.0 * (A + B) / 3.0;
-
-    EXPECT_TRUE(approx_equal(result(0, 0), 4.0));
-    EXPECT_TRUE(approx_equal(result(0, 1), 16.0/3.0));
-    EXPECT_TRUE(approx_equal(result(1, 0), 20.0/3.0));
-    EXPECT_TRUE(approx_equal(result(1, 1), 8.0));
-}
-
-// =============================================================================
-// Transpose Tests
-// =============================================================================
-
-TEST_F(MatrixExprTest, TransposeExpression) {
-    Matrix<double, 2, 3> A{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}};
-
-    auto AT = transpose(A);
-
-    // Check dimensions
-    EXPECT_EQ(AT.rows(), 3u);
-    EXPECT_EQ(AT.cols(), 2u);
-
-    // Check values
-    EXPECT_DOUBLE_EQ(AT(0, 0), 1.0);
-    EXPECT_DOUBLE_EQ(AT(0, 1), 4.0);
-    EXPECT_DOUBLE_EQ(AT(1, 0), 2.0);
-    EXPECT_DOUBLE_EQ(AT(1, 1), 5.0);
-    EXPECT_DOUBLE_EQ(AT(2, 0), 3.0);
-    EXPECT_DOUBLE_EQ(AT(2, 1), 6.0);
-}
-
-TEST_F(MatrixExprTest, TransposeInExpression) {
-    Matrix<double, 3, 2> A{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}};
-    Matrix<double, 3, 2> B{{7.0, 8.0}, {9.0, 10.0}, {11.0, 12.0}};
-
-    // Compute A^T * B (should be 2x2)
-    Matrix<double, 2, 2> result = transpose(A) * B;
-
-    EXPECT_DOUBLE_EQ(result(0, 0), 89.0);   // 1*7 + 3*9 + 5*11
-    EXPECT_DOUBLE_EQ(result(0, 1), 98.0);   // 1*8 + 3*10 + 5*12
-    EXPECT_DOUBLE_EQ(result(1, 0), 116.0);  // 2*7 + 4*9 + 6*11
-    EXPECT_DOUBLE_EQ(result(1, 1), 128.0);  // 2*8 + 4*10 + 6*12
-}
-
-// =============================================================================
-// Unary Operations Tests
-// =============================================================================
-
-TEST_F(MatrixExprTest, NegationInExpression) {
-    Matrix<double, 2, 2> A{{1.0, -2.0}, {3.0, -4.0}};
-    Matrix<double, 2, 2> B{{5.0, 6.0}, {-7.0, 8.0}};
-
-    Matrix<double, 2, 2> result = -A + (-B);
-
-    EXPECT_DOUBLE_EQ(result(0, 0), -6.0);
-    EXPECT_DOUBLE_EQ(result(0, 1), -4.0);
-    EXPECT_DOUBLE_EQ(result(1, 0), 4.0);
-    EXPECT_DOUBLE_EQ(result(1, 1), -4.0);
-}
-
-TEST_F(MatrixExprTest, AbsoluteValueExpression) {
-    Matrix<double, 2, 3> M{{-1.5, 2.3, -4.7}, {0.0, -3.2, 5.1}};
-
-    Matrix<double, 2, 3> result = abs(M);
-
-    EXPECT_DOUBLE_EQ(result(0, 0), 1.5);
-    EXPECT_DOUBLE_EQ(result(0, 1), 2.3);
-    EXPECT_DOUBLE_EQ(result(0, 2), 4.7);
-    EXPECT_DOUBLE_EQ(result(1, 0), 0.0);
-    EXPECT_DOUBLE_EQ(result(1, 1), 3.2);
-    EXPECT_DOUBLE_EQ(result(1, 2), 5.1);
-}
-
-TEST_F(MatrixExprTest, SqrtExpression) {
-    Matrix<double, 2, 2> M{{4.0, 9.0}, {16.0, 25.0}};
-
-    Matrix<double, 2, 2> result = sqrt(M);
-
-    EXPECT_DOUBLE_EQ(result(0, 0), 2.0);
-    EXPECT_DOUBLE_EQ(result(0, 1), 3.0);
-    EXPECT_DOUBLE_EQ(result(1, 0), 4.0);
-    EXPECT_DOUBLE_EQ(result(1, 1), 5.0);
-}
-
-// =============================================================================
-// Element-wise Operations Tests
-// =============================================================================
-
-TEST_F(MatrixExprTest, HadamardProductExpression) {
-    Matrix<double, 2, 3> A{{2.0, 3.0, 4.0}, {5.0, 6.0, 7.0}};
-    Matrix<double, 2, 3> B{{8.0, 9.0, 10.0}, {11.0, 12.0, 13.0}};
-
-    Matrix<double, 2, 3> result = hadamard(A, B);
-
-    EXPECT_DOUBLE_EQ(result(0, 0), 16.0);
-    EXPECT_DOUBLE_EQ(result(0, 1), 27.0);
-    EXPECT_DOUBLE_EQ(result(0, 2), 40.0);
-    EXPECT_DOUBLE_EQ(result(1, 0), 55.0);
-    EXPECT_DOUBLE_EQ(result(1, 1), 72.0);
-    EXPECT_DOUBLE_EQ(result(1, 2), 91.0);
-}
-
-TEST_F(MatrixExprTest, HadamardDivisionExpression) {
-    Matrix<double, 2, 2> A{{10.0, 18.0}, {28.0, 36.0}};
-    Matrix<double, 2, 2> B{{2.0, 3.0}, {4.0, 6.0}};
-
-    Matrix<double, 2, 2> result = hadamard_div(A, B);
-
-    EXPECT_DOUBLE_EQ(result(0, 0), 5.0);
-    EXPECT_DOUBLE_EQ(result(0, 1), 6.0);
-    EXPECT_DOUBLE_EQ(result(1, 0), 7.0);
-    EXPECT_DOUBLE_EQ(result(1, 1), 6.0);
-}
-
-// =============================================================================
-// Norm and Trace Tests
-// =============================================================================
-
-TEST_F(MatrixExprTest, FrobeniusNormOfExpression) {
-    Matrix<double, 2, 2> A{{1.0, 2.0}, {3.0, 4.0}};
-    Matrix<double, 2, 2> B{{2.0, 2.0}, {2.0, 2.0}};
-
-    double norm_sq = frobenius_norm_squared(A - B);
-    double norm = frobenius_norm(A - B);
-
-    // (A - B) = [[-1, 0], [1, 2]]
-    // norm_squared = 1 + 0 + 1 + 4 = 6
-    EXPECT_DOUBLE_EQ(norm_sq, 6.0);
-    EXPECT_DOUBLE_EQ(norm, std::sqrt(6.0));
-}
-
-TEST_F(MatrixExprTest, TraceOfExpression) {
-    Matrix<double, 3, 3> A;
-    Matrix<double, 3, 3> B;
-
-    // Initialize as diagonal matrices
-    for (int i = 0; i < 3; ++i) {
-        for (int j = 0; j < 3; ++j) {
-            A(i, j) = (i == j) ? (i + 1) : 0.0;  // diag(1, 2, 3)
-            B(i, j) = (i == j) ? (i + 4) : 0.0;  // diag(4, 5, 6)
-        }
-    }
-
-    double tr = trace(A + B);
-
-    // trace(A + B) = trace(diag(5, 7, 9)) = 21
-    EXPECT_DOUBLE_EQ(tr, 21.0);
-}
-
-// =============================================================================
-// Type Deduction Tests
-// =============================================================================
-
-TEST_F(MatrixExprTest, TypeDeductionCorrectness) {
-    Matrix<float, 2, 2> Mf{{1.0f, 2.0f}, {3.0f, 4.0f}};
-    Matrix<double, 2, 2> Md{{5.0, 6.0}, {7.0, 8.0}};
-
-    // Float expression
-    auto expr = Mf + Mf;
-    using ExprType = decltype(expr(0, 0));
-    EXPECT_TRUE((std::is_same_v<ExprType, float>));
-
-    // Test that expression evaluates correctly
-    Matrix<float, 2, 2> result = expr;
-    EXPECT_FLOAT_EQ(result(0, 0), 2.0f);
-    EXPECT_FLOAT_EQ(result(1, 1), 8.0f);
-}
-
-// =============================================================================
-// SFINAE and Compile-time Tests
-// =============================================================================
-
-TEST_F(MatrixExprTest, SFINAEConstraints) {
-    // Test that MatrixExpr operators only work with MatrixExpr types
-    Matrix<double, 2, 2> M1{{1.0, 2.0}, {3.0, 4.0}};
-    Matrix<double, 2, 2> M2{{5.0, 6.0}, {7.0, 8.0}};
-
-    // This should compile
-    auto expr = M1 + M2;
-    Matrix<double, 2, 2> result = expr;
-
-    // Verify the constraint checking
-    EXPECT_TRUE((std::is_base_of_v<MatrixExpr<Matrix<double, 2, 2>>, Matrix<double, 2, 2>>));
-}
-
-// =============================================================================
-// Aliasing and Self-Assignment Tests
-// =============================================================================
-
-TEST_F(MatrixExprTest, SelfAssignmentWithExpression) {
-    Matrix<double, 2, 2> A{{1.0, 2.0}, {3.0, 4.0}};
-    Matrix<double, 2, 2> B{{5.0, 6.0}, {7.0, 8.0}};
-
-    // Self-assignment through expression
-    A = A + B;
-
-    EXPECT_DOUBLE_EQ(A(0, 0), 6.0);
-    EXPECT_DOUBLE_EQ(A(0, 1), 8.0);
-    EXPECT_DOUBLE_EQ(A(1, 0), 10.0);
-    EXPECT_DOUBLE_EQ(A(1, 1), 12.0);
-}
-
-TEST_F(MatrixExprTest, AliasingInExpression) {
-    Matrix<double, 2, 2> A{{2.0, 3.0}, {4.0, 5.0}};
-    Matrix<double, 2, 2> B{{1.0, 1.0}, {1.0, 1.0}};
-
-    // A appears on both sides
-    A = B + A;
-
-    EXPECT_DOUBLE_EQ(A(0, 0), 3.0);
-    EXPECT_DOUBLE_EQ(A(0, 1), 4.0);
-    EXPECT_DOUBLE_EQ(A(1, 0), 5.0);
-    EXPECT_DOUBLE_EQ(A(1, 1), 6.0);
-}
-
-// =============================================================================
-// Edge Cases Tests
-// =============================================================================
-
-TEST_F(MatrixExprTest, SingleElementMatrix) {
-    Matrix<double, 1, 1> A{{5.0}};
-    Matrix<double, 1, 1> B{{3.0}};
-
-    Matrix<double, 1, 1> result = A + B - A * 0.5;
-
-    EXPECT_DOUBLE_EQ(result(0, 0), 5.5);
-}
-
-TEST_F(MatrixExprTest, NonSquareMatrixOperations) {
-    Matrix<double, 2, 4> A;
-    Matrix<double, 2, 4> B;
-
-    for (int i = 0; i < 2; ++i) {
-        for (int j = 0; j < 4; ++j) {
-            A(i, j) = i * 4 + j + 1;
-            B(i, j) = (i * 4 + j + 1) * 2;
-        }
-    }
-
-    Matrix<double, 2, 4> result = A + B - A * 0.5;
-
-    for (int i = 0; i < 2; ++i) {
-        for (int j = 0; j < 4; ++j) {
-            double expected = A(i, j) + B(i, j) - A(i, j) * 0.5;
-            EXPECT_DOUBLE_EQ(result(i, j), expected);
-        }
-    }
-}
-
-// =============================================================================
-// Diagonal Matrix Tests
-// =============================================================================
-
-TEST_F(MatrixExprTest, DiagonalMatrixExpression) {
-    Vector<double, 3> v{1.0, 2.0, 3.0};
-
-    auto diag = DiagonalExpr<Vector<double, 3>>(v);
-
-    // Check dimensions
-    EXPECT_EQ(diag.rows(), 3u);
-    EXPECT_EQ(diag.cols(), 3u);
-
-    // Check values
-    EXPECT_DOUBLE_EQ(diag(0, 0), 1.0);
-    EXPECT_DOUBLE_EQ(diag(1, 1), 2.0);
-    EXPECT_DOUBLE_EQ(diag(2, 2), 3.0);
-
-    // Off-diagonal should be zero
-    EXPECT_DOUBLE_EQ(diag(0, 1), 0.0);
-    EXPECT_DOUBLE_EQ(diag(1, 0), 0.0);
-}
-
-TEST_F(MatrixExprTest, DiagonalMatrixInExpression) {
-    Vector<double, 2> v{2.0, 3.0};
-    Matrix<double, 2, 2> A{{1.0, 1.0}, {1.0, 1.0}};
-
-    auto diag = DiagonalExpr<Vector<double, 2>>(v);
-    Matrix<double, 2, 2> result = A + diag;
-
-    EXPECT_DOUBLE_EQ(result(0, 0), 3.0);
-    EXPECT_DOUBLE_EQ(result(0, 1), 1.0);
-    EXPECT_DOUBLE_EQ(result(1, 0), 1.0);
-    EXPECT_DOUBLE_EQ(result(1, 1), 4.0);
-}
-
-// =============================================================================
-// Complex Expression Pattern Tests
-// =============================================================================
-
-TEST_F(MatrixExprTest, ComplexNestedExpression) {
-    Matrix<double, 2, 2> A{{1.0, 2.0}, {3.0, 4.0}};
-    Matrix<double, 2, 2> B{{5.0, 6.0}, {7.0, 8.0}};
-    Matrix<double, 2, 2> C{{9.0, 10.0}, {11.0, 12.0}};
-
-    // Complex expression with multiple operation types
-    Matrix<double, 2, 2> result = 2.0 * abs(A - B) + sqrt(hadamard(C, C)) / 3.0;
-
-    // |A - B| = |[-4, -4], [-4, -4]| = [4, 4], [4, 4]
-    // 2 * [4, 4], [4, 4] = [8, 8], [8, 8]
-    // C * C (element-wise) = [81, 100], [121, 144]
-    // sqrt(C * C) = [9, 10], [11, 12]
-    // sqrt(C * C) / 3 = [3, 10/3], [11/3, 4]
-    // result = [11, 34/3], [35/3, 12]
-
-    EXPECT_DOUBLE_EQ(result(0, 0), 11.0);
-    EXPECT_TRUE(approx_equal(result(0, 1), 34.0/3.0));
-    EXPECT_TRUE(approx_equal(result(1, 0), 35.0/3.0));
-    EXPECT_DOUBLE_EQ(result(1, 1), 12.0);
-}
-
-TEST_F(MatrixExprTest, MatrixVectorMixedExpression) {
-    Matrix<double, 3, 3> A;
-    Vector<double, 3> v{1.0, 2.0, 3.0};
-
-    // Create identity matrix
-    for (int i = 0; i < 3; ++i) {
-        for (int j = 0; j < 3; ++j) {
-            A(i, j) = (i == j) ? 1.0 : 0.0;
-        }
-    }
-
-    // Create diagonal from vector and add to identity
-    auto diag = DiagonalExpr<Vector<double, 3>>(v);
-    Matrix<double, 3, 3> result = A + diag;
-
-    // Result should be diag(2, 3, 4)
-    EXPECT_DOUBLE_EQ(result(0, 0), 2.0);
-    EXPECT_DOUBLE_EQ(result(1, 1), 3.0);
-    EXPECT_DOUBLE_EQ(result(2, 2), 4.0);
-    EXPECT_DOUBLE_EQ(result(0, 1), 0.0);
-    EXPECT_DOUBLE_EQ(result(1, 0), 0.0);
-}
diff --git a/tests/unitTests/FE/Math/test_Vector.cpp b/tests/unitTests/FE/Math/test_Vector.cpp
deleted file mode 100644
index 754ad819d..000000000
--- a/tests/unitTests/FE/Math/test_Vector.cpp
+++ /dev/null
@@ -1,588 +0,0 @@
-/**
- * @file test_Vector.cpp
- * @brief Unit tests for Vector.h - fixed-size vectors with expression templates
- */
-
-#include <gtest/gtest.h>
-#include "FE/Math/Vector.h"
-#include "FE/Math/VectorExpr.h"
-#include <limits>
-#include <cmath>
-#include <sstream>
-#include <thread>
-#include <vector>
-
-using namespace svmp::FE::math;
-
-// Test fixture for Vector tests
-class VectorTest : public ::testing::Test {
-protected:
-    static constexpr double tolerance = 1e-14;
-
-    void SetUp() override {}
-    void TearDown() override {}
-
-    // Helper function to check if two values are approximately equal
-    template<typename T>
-    bool approx_equal(T a, T b, T tol = tolerance) {
-        return std::abs(a - b) <= tol;
-    }
-};
-
-// =============================================================================
-// Construction and Initialization Tests
-// =============================================================================
-
-TEST_F(VectorTest, DefaultConstruction) {
-    Vector<double, 3> v;
-    EXPECT_EQ(v[0], 0.0);
-    EXPECT_EQ(v[1], 0.0);
-    EXPECT_EQ(v[2], 0.0);
-
-    Vector<float, 4> vf;
-    for (size_t i = 0; i < 4; ++i) {
-        EXPECT_EQ(vf[i], 0.0f);
-    }
-}
-
-TEST_F(VectorTest, FillConstruction) {
-    Vector<double, 3> v(5.0);
-    EXPECT_EQ(v[0], 5.0);
-    EXPECT_EQ(v[1], 5.0);
-    EXPECT_EQ(v[2], 5.0);
-
-    Vector<int, 10> vi(-3);
-    for (size_t i = 0; i < 10; ++i) {
-        EXPECT_EQ(vi[i], -3);
-    }
-}
-
-TEST_F(VectorTest, InitializerListConstruction) {
-    Vector<double, 3> v{1.0, 2.0, 3.0};
-    EXPECT_EQ(v[0], 1.0);
-    EXPECT_EQ(v[1], 2.0);
-    EXPECT_EQ(v[2], 3.0);
-
-    // Partial initialization
-    Vector<double, 5> v2{1.0, 2.0};
-    EXPECT_EQ(v2[0], 1.0);
-    EXPECT_EQ(v2[1], 2.0);
-    EXPECT_EQ(v2[2], 0.0);
-    EXPECT_EQ(v2[3], 0.0);
-    EXPECT_EQ(v2[4], 0.0);
-}
-
-TEST_F(VectorTest, CopyConstruction) {
-    Vector<double, 3> v1{1.0, 2.0, 3.0};
-    Vector<double, 3> v2(v1);
-
-    EXPECT_EQ(v2[0], 1.0);
-    EXPECT_EQ(v2[1], 2.0);
-    EXPECT_EQ(v2[2], 3.0);
-
-    // Ensure deep copy
-    v2[0] = 10.0;
-    EXPECT_EQ(v1[0], 1.0);
-    EXPECT_EQ(v2[0], 10.0);
-}
-
-TEST_F(VectorTest, MoveConstruction) {
-    Vector<double, 3> v1{1.0, 2.0, 3.0};
-    Vector<double, 3> v2(std::move(v1));
-
-    EXPECT_EQ(v2[0], 1.0);
-    EXPECT_EQ(v2[1], 2.0);
-    EXPECT_EQ(v2[2], 3.0);
-}
-
-// =============================================================================
-// Element Access Tests
-// =============================================================================
-
-TEST_F(VectorTest, ElementAccess) {
-    Vector<double, 3> v{1.0, 2.0, 3.0};
-
-    // Non-const access
-    EXPECT_EQ(v[0], 1.0);
-    EXPECT_EQ(v[1], 2.0);
-    EXPECT_EQ(v[2], 3.0);
-
-    // Modification
-    v[1] = 5.0;
-    EXPECT_EQ(v[1], 5.0);
-
-    // Const access
-    const Vector<double, 3> cv{4.0, 5.0, 6.0};
-    EXPECT_EQ(cv[0], 4.0);
-    EXPECT_EQ(cv[1], 5.0);
-    EXPECT_EQ(cv[2], 6.0);
-}
-
-TEST_F(VectorTest, ElementAccessBounds) {
-    Vector<double, 3> v{1.0, 2.0, 3.0};
-
-    // at() with bounds checking
-    EXPECT_EQ(v.at(0), 1.0);
-    EXPECT_EQ(v.at(1), 2.0);
-    EXPECT_EQ(v.at(2), 3.0);
-
-    // Test out of bounds throws
-    EXPECT_THROW(v.at(3), std::out_of_range);
-    EXPECT_THROW(v.at(100), std::out_of_range);
-}
-
-TEST_F(VectorTest, DataPointerAccess) {
-    Vector<double, 3> v{1.0, 2.0, 3.0};
-
-    double* data = v.data();
-    EXPECT_EQ(data[0], 1.0);
-    EXPECT_EQ(data[1], 2.0);
-    EXPECT_EQ(data[2], 3.0);
-
-    // Const data access
-    const Vector<double, 3> cv{4.0, 5.0, 6.0};
-    const double* cdata = cv.data();
-    EXPECT_EQ(cdata[0], 4.0);
-    EXPECT_EQ(cdata[1], 5.0);
-    EXPECT_EQ(cdata[2], 6.0);
-}
-
-// =============================================================================
-// Arithmetic Operations Tests
-// =============================================================================
-
-TEST_F(VectorTest, Addition) {
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{4.0, 5.0, 6.0};
-
-    Vector<double, 3> c = a + b;
-    EXPECT_EQ(c[0], 5.0);
-    EXPECT_EQ(c[1], 7.0);
-    EXPECT_EQ(c[2], 9.0);
-}
-
-TEST_F(VectorTest, Subtraction) {
-    Vector<double, 3> a{5.0, 7.0, 9.0};
-    Vector<double, 3> b{4.0, 5.0, 6.0};
-
-    Vector<double, 3> c = a - b;
-    EXPECT_EQ(c[0], 1.0);
-    EXPECT_EQ(c[1], 2.0);
-    EXPECT_EQ(c[2], 3.0);
-}
-
-TEST_F(VectorTest, ScalarMultiplication) {
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-
-    // Scalar * Vector
-    Vector<double, 3> b = 2.0 * a;
-    EXPECT_EQ(b[0], 2.0);
-    EXPECT_EQ(b[1], 4.0);
-    EXPECT_EQ(b[2], 6.0);
-
-    // Vector * Scalar
-    Vector<double, 3> c = a * 3.0;
-    EXPECT_EQ(c[0], 3.0);
-    EXPECT_EQ(c[1], 6.0);
-    EXPECT_EQ(c[2], 9.0);
-}
-
-TEST_F(VectorTest, ScalarDivision) {
-    Vector<double, 3> a{2.0, 4.0, 6.0};
-
-    Vector<double, 3> b = a / 2.0;
-    EXPECT_EQ(b[0], 1.0);
-    EXPECT_EQ(b[1], 2.0);
-    EXPECT_EQ(b[2], 3.0);
-}
-
-TEST_F(VectorTest, UnaryNegation) {
-    Vector<double, 3> a{1.0, -2.0, 3.0};
-
-    Vector<double, 3> b = -a;
-    EXPECT_EQ(b[0], -1.0);
-    EXPECT_EQ(b[1], 2.0);
-    EXPECT_EQ(b[2], -3.0);
-}
-
-TEST_F(VectorTest, CompoundAssignment) {
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{4.0, 5.0, 6.0};
-
-    // +=
-    a += b;
-    EXPECT_EQ(a[0], 5.0);
-    EXPECT_EQ(a[1], 7.0);
-    EXPECT_EQ(a[2], 9.0);
-
-    // -=
-    a -= b;
-    EXPECT_EQ(a[0], 1.0);
-    EXPECT_EQ(a[1], 2.0);
-    EXPECT_EQ(a[2], 3.0);
-
-    // *=
-    a *= 2.0;
-    EXPECT_EQ(a[0], 2.0);
-    EXPECT_EQ(a[1], 4.0);
-    EXPECT_EQ(a[2], 6.0);
-
-    // /=
-    a /= 2.0;
-    EXPECT_EQ(a[0], 1.0);
-    EXPECT_EQ(a[1], 2.0);
-    EXPECT_EQ(a[2], 3.0);
-}
-
-// =============================================================================
-// Vector Operations Tests
-// =============================================================================
-
-TEST_F(VectorTest, DotProduct) {
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{4.0, 5.0, 6.0};
-
-    double dot = a.dot(b);
-    EXPECT_EQ(dot, 32.0);  // 1*4 + 2*5 + 3*6 = 4 + 10 + 18 = 32
-
-    // Test commutativity
-    EXPECT_EQ(b.dot(a), dot);
-
-    // Test orthogonal vectors
-    Vector<double, 3> x{1.0, 0.0, 0.0};
-    Vector<double, 3> y{0.0, 1.0, 0.0};
-    EXPECT_EQ(x.dot(y), 0.0);
-}
-
-TEST_F(VectorTest, CrossProduct3D) {
-    Vector<double, 3> x{1.0, 0.0, 0.0};
-    Vector<double, 3> y{0.0, 1.0, 0.0};
-    Vector<double, 3> z{0.0, 0.0, 1.0};
-
-    // Test basis vector cross products
-    Vector<double, 3> xy = x.cross(y);
-    EXPECT_EQ(xy[0], 0.0);
-    EXPECT_EQ(xy[1], 0.0);
-    EXPECT_EQ(xy[2], 1.0);
-
-    Vector<double, 3> yx = y.cross(x);
-    EXPECT_EQ(yx[0], 0.0);
-    EXPECT_EQ(yx[1], 0.0);
-    EXPECT_EQ(yx[2], -1.0);
-
-    // General cross product
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{4.0, 5.0, 6.0};
-    Vector<double, 3> c = a.cross(b);
-
-    EXPECT_EQ(c[0], -3.0);  // 2*6 - 3*5 = 12 - 15 = -3
-    EXPECT_EQ(c[1], 6.0);   // 3*4 - 1*6 = 12 - 6 = 6
-    EXPECT_EQ(c[2], -3.0);  // 1*5 - 2*4 = 5 - 8 = -3
-}
-
-TEST_F(VectorTest, Norm) {
-    Vector<double, 3> v{3.0, 4.0, 0.0};
-    EXPECT_EQ(v.norm(), 5.0);
-
-    Vector<double, 3> unit{1.0, 0.0, 0.0};
-    EXPECT_EQ(unit.norm(), 1.0);
-
-    Vector<double, 3> zero{0.0, 0.0, 0.0};
-    EXPECT_EQ(zero.norm(), 0.0);
-}
-
-TEST_F(VectorTest, NormSquared) {
-    Vector<double, 3> v{3.0, 4.0, 0.0};
-    EXPECT_EQ(v.norm_squared(), 25.0);
-
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    EXPECT_EQ(a.norm_squared(), 14.0);  // 1 + 4 + 9 = 14
-}
-
-TEST_F(VectorTest, Normalize) {
-    Vector<double, 3> v{3.0, 4.0, 0.0};
-    Vector<double, 3> n = v.normalized();
-
-    EXPECT_NEAR(n[0], 0.6, tolerance);
-    EXPECT_NEAR(n[1], 0.8, tolerance);
-    EXPECT_NEAR(n[2], 0.0, tolerance);
-    EXPECT_NEAR(n.norm(), 1.0, tolerance);
-
-    // In-place normalization
-    v.normalize();
-    EXPECT_NEAR(v[0], 0.6, tolerance);
-    EXPECT_NEAR(v[1], 0.8, tolerance);
-    EXPECT_NEAR(v.norm(), 1.0, tolerance);
-}
-
-// =============================================================================
-// Expression Template Tests
-// =============================================================================
-
-TEST_F(VectorTest, ExpressionTemplatesNoTemporaries) {
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{4.0, 5.0, 6.0};
-    Vector<double, 3> c{7.0, 8.0, 9.0};
-    Vector<double, 3> d{10.0, 11.0, 12.0};
-
-    // Complex expression should create no temporaries
-    Vector<double, 3> result = a + b - c + d;
-
-    EXPECT_EQ(result[0], 8.0);   // 1 + 4 - 7 + 10
-    EXPECT_EQ(result[1], 10.0);  // 2 + 5 - 8 + 11
-    EXPECT_EQ(result[2], 12.0);  // 3 + 6 - 9 + 12
-}
-
-TEST_F(VectorTest, LazyEvaluation) {
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{4.0, 5.0, 6.0};
-
-    // Expression should not be evaluated until assignment
-    auto expr = a + b;  // No computation yet
-
-    Vector<double, 3> result = expr;  // Evaluation happens here
-    EXPECT_EQ(result[0], 5.0);
-    EXPECT_EQ(result[1], 7.0);
-    EXPECT_EQ(result[2], 9.0);
-}
-
-TEST_F(VectorTest, MixedExpressions) {
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{4.0, 5.0, 6.0};
-    double scalar = 2.0;
-
-    // Complex mixed expression
-    Vector<double, 3> result = scalar * (a + b) - a / scalar;
-
-    EXPECT_NEAR(result[0], 9.5, tolerance);   // 2*(1+4) - 1/2
-    EXPECT_NEAR(result[1], 13.0, tolerance);  // 2*(2+5) - 2/2
-    EXPECT_NEAR(result[2], 16.5, tolerance);  // 2*(3+6) - 3/2
-}
-
-// =============================================================================
-// Special Values Tests
-// =============================================================================
-
-TEST_F(VectorTest, ZeroVector) {
-    Vector<double, 3> zero = Vector<double, 3>::zero();
-    EXPECT_EQ(zero[0], 0.0);
-    EXPECT_EQ(zero[1], 0.0);
-    EXPECT_EQ(zero[2], 0.0);
-    EXPECT_EQ(zero.norm(), 0.0);
-}
-
-TEST_F(VectorTest, OnesVector) {
-    Vector<double, 3> ones = Vector<double, 3>::ones();
-    EXPECT_EQ(ones[0], 1.0);
-    EXPECT_EQ(ones[1], 1.0);
-    EXPECT_EQ(ones[2], 1.0);
-}
-
-TEST_F(VectorTest, BasisVectors) {
-    auto e0 = Vector<double, 3>::basis(0);
-    EXPECT_EQ(e0[0], 1.0);
-    EXPECT_EQ(e0[1], 0.0);
-    EXPECT_EQ(e0[2], 0.0);
-
-    auto e1 = Vector<double, 3>::basis(1);
-    EXPECT_EQ(e1[0], 0.0);
-    EXPECT_EQ(e1[1], 1.0);
-    EXPECT_EQ(e1[2], 0.0);
-
-    auto e2 = Vector<double, 3>::basis(2);
-    EXPECT_EQ(e2[0], 0.0);
-    EXPECT_EQ(e2[1], 0.0);
-    EXPECT_EQ(e2[2], 1.0);
-}
-
-// =============================================================================
-// Edge Cases and Error Handling Tests
-// =============================================================================
-
-TEST_F(VectorTest, DivisionByZero) {
-    Vector<double, 3> v{1.0, 2.0, 3.0};
-
-    // Division by zero should produce inf
-    Vector<double, 3> result = v / 0.0;
-    EXPECT_TRUE(std::isinf(result[0]));
-    EXPECT_TRUE(std::isinf(result[1]));
-    EXPECT_TRUE(std::isinf(result[2]));
-}
-
-TEST_F(VectorTest, NormalizeZeroVector) {
-    Vector<double, 3> zero{0.0, 0.0, 0.0};
-
-    // Normalizing zero vector should handle gracefully
-    Vector<double, 3> n = zero.normalized();
-    EXPECT_TRUE(std::isnan(n[0]) || n[0] == 0.0);
-}
-
-TEST_F(VectorTest, ExtremeLargeValues) {
-    double large = 1e308;  // Near double max
-    Vector<double, 3> v{large, large, large};
-
-    // Operations should not overflow
-    Vector<double, 3> half = v / 2.0;
-    EXPECT_FALSE(std::isinf(half[0]));
-    EXPECT_EQ(half[0], large / 2.0);
-}
-
-TEST_F(VectorTest, ExtremeSmallValues) {
-    double tiny = 1e-308;  // Near double min
-    Vector<double, 3> v{tiny, tiny, tiny};
-
-    // Operations should maintain precision
-    Vector<double, 3> doubled = v * 2.0;
-    EXPECT_EQ(doubled[0], tiny * 2.0);
-}
-
-// =============================================================================
-// Numerical Precision Tests
-// =============================================================================
-
-TEST_F(VectorTest, NumericalStability) {
-    // Test Kahan summation for better precision
-    Vector<double, 4> v{1e16, 1.0, -1e16, 1.0};
-    // Computed for future validation - demonstrates numerical precision issues
-    [[maybe_unused]] double sum = v[0] + v[1] + v[2] + v[3];
-
-    // Direct summation might lose precision
-    // But vector operations should maintain it
-    Vector<double, 4> a{1e16, 0.0, -1e16, 0.0};
-    Vector<double, 4> b{0.0, 1.0, 0.0, 1.0};
-    Vector<double, 4> c = a + b;
-
-    EXPECT_EQ(c[0], 1e16);
-    EXPECT_EQ(c[1], 1.0);
-    EXPECT_EQ(c[2], -1e16);
-    EXPECT_EQ(c[3], 1.0);
-}
-
-TEST_F(VectorTest, OrthogonalityPreservation) {
-    // Create nearly orthogonal vectors
-    Vector<double, 3> a{1.0, 1e-15, 0.0};
-    Vector<double, 3> b{0.0, 1.0, 0.0};
-
-    double dot = a.dot(b);
-    EXPECT_NEAR(dot, 1e-15, 1e-16);
-}
-
-// =============================================================================
-// Comparison Operations Tests
-// =============================================================================
-
-TEST_F(VectorTest, Equality) {
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{1.0, 2.0, 3.0};
-    Vector<double, 3> c{1.0, 2.0, 3.1};
-
-    EXPECT_TRUE(a == b);
-    EXPECT_FALSE(a == c);
-    EXPECT_FALSE(a != b);
-    EXPECT_TRUE(a != c);
-}
-
-TEST_F(VectorTest, ApproximateEquality) {
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{1.0 + 1e-15, 2.0 - 1e-15, 3.0 + 1e-15};
-
-    EXPECT_TRUE(a.approx_equal(b, 1e-14));
-    EXPECT_FALSE(a.approx_equal(b, 1e-16));
-}
-
-// =============================================================================
-// Thread Safety Tests
-// =============================================================================
-
-TEST_F(VectorTest, ThreadSafetyReadOnly) {
-    Vector<double, 3> v{1.0, 2.0, 3.0};
-
-    // Multiple threads reading should be safe
-    std::vector<std::thread> threads;
-    std::vector<double> results(10);
-
-    for (int i = 0; i < 10; ++i) {
-        threads.emplace_back([&v, &results, i]() {
-            results[static_cast<std::size_t>(i)] = v.norm();
-        });
-    }
-
-    for (auto& t : threads) {
-        t.join();
-    }
-
-    // All threads should get same result
-    double expected = v.norm();
-    for (double r : results) {
-        EXPECT_EQ(r, expected);
-    }
-}
-
-TEST_F(VectorTest, ThreadSafetyIsolated) {
-    // Each thread works on its own vector
-    std::vector<std::thread> threads;
-    std::vector<Vector<double, 3>> results(10);
-
-    for (int i = 0; i < 10; ++i) {
-        threads.emplace_back([&results, i]() {
-            Vector<double, 3> local{static_cast<double>(i), 0.0, 0.0};
-            results[static_cast<std::size_t>(i)] = local * 2.0;
-        });
-    }
-
-    for (auto& t : threads) {
-        t.join();
-    }
-
-    // Check each thread computed correctly
-    for (int i = 0; i < 10; ++i) {
-        EXPECT_EQ(results[static_cast<std::size_t>(i)][0], 2.0 * i);
-    }
-}
-
-// =============================================================================
-// Memory Alignment Tests
-// =============================================================================
-
-TEST_F(VectorTest, MemoryAlignment) {
-    Vector<double, 3> v;
-
-    // Check that data is properly aligned for SIMD
-    std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(v.data());
-    EXPECT_EQ(addr % 32, 0) << "Vector data should be 32-byte aligned for AVX";
-}
-
-// =============================================================================
-// Utility Function Tests
-// =============================================================================
-
-TEST_F(VectorTest, MinMaxElements) {
-    Vector<double, 5> v{3.0, -1.0, 4.0, 1.0, -2.0};
-
-    EXPECT_EQ(v.min(), -2.0);
-    EXPECT_EQ(v.max(), 4.0);
-    EXPECT_EQ(v.min_index(), 4);
-    EXPECT_EQ(v.max_index(), 2);
-}
-
-TEST_F(VectorTest, Sum) {
-    Vector<double, 4> v{1.0, 2.0, 3.0, 4.0};
-    EXPECT_EQ(v.sum(), 10.0);
-
-    Vector<double, 3> zero{0.0, 0.0, 0.0};
-    EXPECT_EQ(zero.sum(), 0.0);
-}
-
-TEST_F(VectorTest, Mean) {
-    Vector<double, 4> v{1.0, 2.0, 3.0, 4.0};
-    EXPECT_EQ(v.mean(), 2.5);
-}
-
-TEST_F(VectorTest, ToString) {
-    Vector<double, 3> v{1.0, 2.0, 3.0};
-    std::stringstream ss;
-    ss << v;
-
-    std::string expected = "[1, 2, 3]";
-    EXPECT_EQ(ss.str(), expected);
-}
diff --git a/tests/unitTests/FE/Math/test_VectorExpr.cpp b/tests/unitTests/FE/Math/test_VectorExpr.cpp
deleted file mode 100644
index 0e7363c64..000000000
--- a/tests/unitTests/FE/Math/test_VectorExpr.cpp
+++ /dev/null
@@ -1,408 +0,0 @@
-/**
- * @file test_VectorExpr.cpp
- * @brief Unit tests for VectorExpr.h - vector expression templates
- */
-
-#include <gtest/gtest.h>
-#include "FE/Math/Vector.h"
-#include "FE/Math/VectorExpr.h"
-#include <limits>
-#include <cmath>
-#include <memory>
-#include <atomic>
-#include <type_traits>
-
-using namespace svmp::FE::math;
-
-// Test fixture for VectorExpr tests
-class VectorExprTest : public ::testing::Test {
-protected:
-    static constexpr double tolerance = 1e-14;
-
-    // Custom allocator to track memory allocations
-    template<typename T>
-    class TrackingAllocator {
-    public:
-        using value_type = T;
-
-        static std::atomic<size_t> allocations;
-        static std::atomic<size_t> deallocations;
-        static std::atomic<size_t> bytes_allocated;
-
-        TrackingAllocator() = default;
-
-        template<typename U>
-        TrackingAllocator(const TrackingAllocator<U>&) {}
-
-        T* allocate(size_t n) {
-            allocations.fetch_add(1);
-            bytes_allocated.fetch_add(n * sizeof(T));
-            return static_cast<T*>(::operator new(n * sizeof(T)));
-        }
-
-        void deallocate(T* p, size_t n) {
-            deallocations.fetch_add(1);
-            ::operator delete(p);
-        }
-
-        static void reset() {
-            allocations = 0;
-            deallocations = 0;
-            bytes_allocated = 0;
-        }
-    };
-
-    void SetUp() override {
-        TrackingAllocator<double>::reset();
-    }
-
-    void TearDown() override {}
-
-    template<typename T>
-    bool approx_equal(T a, T b, T tol = tolerance) {
-        return std::abs(a - b) <= tol;
-    }
-};
-
-template<typename T>
-std::atomic<size_t> VectorExprTest::TrackingAllocator<T>::allocations{0};
-template<typename T>
-std::atomic<size_t> VectorExprTest::TrackingAllocator<T>::deallocations{0};
-template<typename T>
-std::atomic<size_t> VectorExprTest::TrackingAllocator<T>::bytes_allocated{0};
-
-// =============================================================================
-// Lazy Evaluation Tests
-// =============================================================================
-
-TEST_F(VectorExprTest, LazyEvaluationNoTemporaries) {
-    // Expression templates should not create temporary vectors
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{4.0, 5.0, 6.0};
-    Vector<double, 3> c{7.0, 8.0, 9.0};
-
-    // Build expression without evaluation
-    auto expr = a + b - c;
-
-    // Expression type should not be Vector, but an expression type
-    using ExprType = decltype(expr);
-    EXPECT_FALSE((std::is_same_v<ExprType, Vector<double, 3>>));
-
-    // Now evaluate
-    Vector<double, 3> result = expr;
-    EXPECT_DOUBLE_EQ(result[0], -2.0);
-    EXPECT_DOUBLE_EQ(result[1], -1.0);
-    EXPECT_DOUBLE_EQ(result[2], 0.0);
-}
-
-TEST_F(VectorExprTest, LazyEvaluationAccessPattern) {
-    Vector<double, 4> a{1.0, 2.0, 3.0, 4.0};
-    Vector<double, 4> b{5.0, 6.0, 7.0, 8.0};
-
-    auto expr = a + b;
-
-    // Access individual elements without full evaluation
-    EXPECT_DOUBLE_EQ(expr[0], 6.0);
-    EXPECT_DOUBLE_EQ(expr[2], 10.0);
-
-    // Size should be accessible
-    EXPECT_EQ(expr.size(), 4u);
-}
-
-// =============================================================================
-// Expression Chaining Tests
-// =============================================================================
-
-TEST_F(VectorExprTest, ChainedAdditionSubtraction) {
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{4.0, 5.0, 6.0};
-    Vector<double, 3> c{2.0, 3.0, 4.0};
-    Vector<double, 3> d{1.0, 1.0, 1.0};
-
-    // Chain multiple operations
-    Vector<double, 3> result = a + b - c + d;
-
-    EXPECT_DOUBLE_EQ(result[0], 4.0);
-    EXPECT_DOUBLE_EQ(result[1], 5.0);
-    EXPECT_DOUBLE_EQ(result[2], 6.0);
-}
-
-TEST_F(VectorExprTest, DeepExpressionNesting) {
-    Vector<double, 2> v1{1.0, 2.0};
-    Vector<double, 2> v2{3.0, 4.0};
-    Vector<double, 2> v3{5.0, 6.0};
-    Vector<double, 2> v4{7.0, 8.0};
-    Vector<double, 2> v5{9.0, 10.0};
-
-    // Deep nesting
-    Vector<double, 2> result = ((v1 + v2) - (v3 - v4)) + v5;
-
-    EXPECT_DOUBLE_EQ(result[0], 15.0);
-    EXPECT_DOUBLE_EQ(result[1], 18.0);
-}
-
-// =============================================================================
-// Mixed Operations Tests
-// =============================================================================
-
-TEST_F(VectorExprTest, ScalarMultiplicationInExpression) {
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{4.0, 5.0, 6.0};
-
-    Vector<double, 3> result = 2.0 * (a + b) / 3.0;
-
-    EXPECT_TRUE(approx_equal(result[0], 10.0/3.0));
-    EXPECT_TRUE(approx_equal(result[1], 14.0/3.0));
-    EXPECT_TRUE(approx_equal(result[2], 6.0));
-}
-
-TEST_F(VectorExprTest, MixedScalarVectorOperations) {
-    Vector<double, 4> v{2.0, 4.0, 6.0, 8.0};
-
-    // Complex mixed expression
-    Vector<double, 4> result = 3.0 * v / 2.0 + v * 0.5 - 1.0 * v;
-
-    EXPECT_DOUBLE_EQ(result[0], 2.0);
-    EXPECT_DOUBLE_EQ(result[1], 4.0);
-    EXPECT_DOUBLE_EQ(result[2], 6.0);
-    EXPECT_DOUBLE_EQ(result[3], 8.0);
-}
-
-// =============================================================================
-// Unary Operations Tests
-// =============================================================================
-
-TEST_F(VectorExprTest, NegationInExpression) {
-    Vector<double, 3> a{1.0, -2.0, 3.0};
-    Vector<double, 3> b{4.0, 5.0, -6.0};
-
-    Vector<double, 3> result = -a + (-b);
-
-    EXPECT_DOUBLE_EQ(result[0], -5.0);
-    EXPECT_DOUBLE_EQ(result[1], -3.0);
-    EXPECT_DOUBLE_EQ(result[2], 3.0);
-}
-
-TEST_F(VectorExprTest, AbsoluteValueExpression) {
-    Vector<double, 4> v{-1.5, 2.3, -4.7, 0.0};
-
-    Vector<double, 4> result = abs(v);
-
-    EXPECT_DOUBLE_EQ(result[0], 1.5);
-    EXPECT_DOUBLE_EQ(result[1], 2.3);
-    EXPECT_DOUBLE_EQ(result[2], 4.7);
-    EXPECT_DOUBLE_EQ(result[3], 0.0);
-}
-
-TEST_F(VectorExprTest, SqrtExpression) {
-    Vector<double, 3> v{4.0, 9.0, 16.0};
-
-    Vector<double, 3> result = sqrt(v);
-
-    EXPECT_DOUBLE_EQ(result[0], 2.0);
-    EXPECT_DOUBLE_EQ(result[1], 3.0);
-    EXPECT_DOUBLE_EQ(result[2], 4.0);
-}
-
-// =============================================================================
-// Element-wise Operations Tests
-// =============================================================================
-
-TEST_F(VectorExprTest, HadamardProductExpression) {
-    Vector<double, 3> a{2.0, 3.0, 4.0};
-    Vector<double, 3> b{5.0, 6.0, 7.0};
-
-    Vector<double, 3> result = hadamard(a, b);
-
-    EXPECT_DOUBLE_EQ(result[0], 10.0);
-    EXPECT_DOUBLE_EQ(result[1], 18.0);
-    EXPECT_DOUBLE_EQ(result[2], 28.0);
-}
-
-TEST_F(VectorExprTest, HadamardDivisionExpression) {
-    Vector<double, 3> a{10.0, 18.0, 28.0};
-    Vector<double, 3> b{2.0, 3.0, 4.0};
-
-    Vector<double, 3> result = hadamard_div(a, b);
-
-    EXPECT_DOUBLE_EQ(result[0], 5.0);
-    EXPECT_DOUBLE_EQ(result[1], 6.0);
-    EXPECT_DOUBLE_EQ(result[2], 7.0);
-}
-
-// =============================================================================
-// Dot Product and Norm Tests
-// =============================================================================
-
-TEST_F(VectorExprTest, DotProductOfExpressions) {
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{4.0, 5.0, 6.0};
-    Vector<double, 3> c{2.0, 2.0, 2.0};
-
-    // Dot product of expressions
-    double result = dot(a + b, c);
-
-    EXPECT_DOUBLE_EQ(result, 42.0);
-}
-
-TEST_F(VectorExprTest, NormOfExpression) {
-    Vector<double, 2> a{3.0, 0.0};
-    Vector<double, 2> b{0.0, 4.0};
-
-    double result = norm(a + b);
-
-    EXPECT_DOUBLE_EQ(result, 5.0);  // norm of (3,4) = 5
-}
-
-TEST_F(VectorExprTest, NormalizeExpression) {
-    Vector<double, 3> v{3.0, 0.0, 4.0};
-
-    Vector<double, 3> result = normalize(v);
-
-    EXPECT_DOUBLE_EQ(result[0], 0.6);
-    EXPECT_DOUBLE_EQ(result[1], 0.0);
-    EXPECT_DOUBLE_EQ(result[2], 0.8);
-}
-
-// =============================================================================
-// Type Deduction Tests
-// =============================================================================
-
-TEST_F(VectorExprTest, TypeDeductionCorrectness) {
-    Vector<float, 3> vf{1.0f, 2.0f, 3.0f};
-    Vector<double, 3> vd{4.0, 5.0, 6.0};
-
-    // Mixed type operations should promote to higher precision
-    auto expr = vf + vf;  // float expression
-    using ExprType = decltype(expr[0]);
-    EXPECT_TRUE((std::is_same_v<ExprType, float>));
-
-    // Test that expression evaluates correctly
-    Vector<float, 3> result = expr;
-    EXPECT_FLOAT_EQ(result[0], 2.0f);
-    EXPECT_FLOAT_EQ(result[1], 4.0f);
-    EXPECT_FLOAT_EQ(result[2], 6.0f);
-}
-
-// =============================================================================
-// SFINAE and Compile-time Tests
-// =============================================================================
-
-TEST_F(VectorExprTest, SFINAEConstraints) {
-    // Test that VectorExpr operators only work with VectorExpr types
-    Vector<double, 3> v1{1.0, 2.0, 3.0};
-    Vector<double, 3> v2{4.0, 5.0, 6.0};
-
-    // This should compile
-    auto expr = v1 + v2;
-    Vector<double, 3> result = expr;
-
-    // Verify the constraint checking
-    EXPECT_TRUE((std::is_base_of_v<VectorExpr<Vector<double, 3>>, Vector<double, 3>>));
-}
-
-// =============================================================================
-// Aliasing and Self-Assignment Tests
-// =============================================================================
-
-TEST_F(VectorExprTest, SelfAssignmentWithExpression) {
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{4.0, 5.0, 6.0};
-
-    // Self-assignment through expression
-    a = a + b;
-
-    EXPECT_DOUBLE_EQ(a[0], 5.0);
-    EXPECT_DOUBLE_EQ(a[1], 7.0);
-    EXPECT_DOUBLE_EQ(a[2], 9.0);
-}
-
-TEST_F(VectorExprTest, AliasingInExpression) {
-    Vector<double, 3> a{2.0, 3.0, 4.0};
-    Vector<double, 3> b{1.0, 1.0, 1.0};
-
-    // a appears on both sides
-    a = b + a;
-
-    EXPECT_DOUBLE_EQ(a[0], 3.0);
-    EXPECT_DOUBLE_EQ(a[1], 4.0);
-    EXPECT_DOUBLE_EQ(a[2], 5.0);
-}
-
-// =============================================================================
-// Edge Cases Tests
-// =============================================================================
-
-TEST_F(VectorExprTest, SingleElementVector) {
-    Vector<double, 1> a{5.0};
-    Vector<double, 1> b{3.0};
-
-    Vector<double, 1> result = a + b - a * 0.5;
-
-    EXPECT_DOUBLE_EQ(result[0], 5.5);
-}
-
-TEST_F(VectorExprTest, EmptyExpression) {
-    Vector<double, 3> v{1.0, 2.0, 3.0};
-
-    // Expression that evaluates to identity
-    Vector<double, 3> result = v + v * 0.0;
-
-    EXPECT_DOUBLE_EQ(result[0], 1.0);
-    EXPECT_DOUBLE_EQ(result[1], 2.0);
-    EXPECT_DOUBLE_EQ(result[2], 3.0);
-}
-
-TEST_F(VectorExprTest, LargeVectorExpression) {
-    const size_t N = 100;
-    Vector<double, N> a, b, c;
-
-    for (size_t i = 0; i < N; ++i) {
-        a[i] = static_cast<double>(i);
-        b[i] = static_cast<double>(i * 2);
-        c[i] = static_cast<double>(i * 3);
-    }
-
-    Vector<double, N> result = a + b - c / 2.0;
-
-    for (size_t i = 0; i < N; ++i) {
-        EXPECT_DOUBLE_EQ(result[i], i + 2.0 * i - 1.5 * i);
-    }
-}
-
-// =============================================================================
-// Complex Expression Pattern Tests
-// =============================================================================
-
-TEST_F(VectorExprTest, ComplexNestedExpression) {
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{4.0, 5.0, 6.0};
-    Vector<double, 3> c{7.0, 8.0, 9.0};
-
-    // Complex expression with multiple operation types
-    Vector<double, 3> result = 2.0 * abs(a - b) + sqrt(hadamard(c, c)) / 3.0;
-
-    // Verify each component
-    // |a - b| = |(-3, -3, -3)| = (3, 3, 3)
-    // 2 * (3, 3, 3) = (6, 6, 6)
-    // c * c = (49, 64, 81)
-    // sqrt(c * c) = (7, 8, 9)
-    // sqrt(c * c) / 3 = (7/3, 8/3, 3)
-    // result = (6 + 7/3, 6 + 8/3, 6 + 3) = (25/3, 26/3, 9)
-
-    EXPECT_TRUE(approx_equal(result[0], 25.0/3.0));
-    EXPECT_TRUE(approx_equal(result[1], 26.0/3.0));
-    EXPECT_DOUBLE_EQ(result[2], 9.0);
-}
-
-TEST_F(VectorExprTest, ChainedUnaryOperations) {
-    Vector<double, 4> v{-4.0, -9.0, -16.0, -25.0};
-
-    // Chain of unary operations
-    Vector<double, 4> result = sqrt(abs(-v));
-
-    EXPECT_DOUBLE_EQ(result[0], 2.0);
-    EXPECT_DOUBLE_EQ(result[1], 3.0);
-    EXPECT_DOUBLE_EQ(result[2], 4.0);
-    EXPECT_DOUBLE_EQ(result[3], 5.0);
-}

From 82a1158eceeb4ad5c09591f9139bc29cea2e5e55 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Tue, 9 Jun 2026 18:21:37 -0700
Subject: [PATCH 14/91] adding doxygen to Common submodule

---
 Code/Source/solver/FE/Common/FEException.h | 189 +++++++++++++++
 Code/Source/solver/FE/Common/Types.h       | 253 ++++++++++++++-------
 2 files changed, 354 insertions(+), 88 deletions(-)

diff --git a/Code/Source/solver/FE/Common/FEException.h b/Code/Source/solver/FE/Common/FEException.h
index 67b7da234..033b85eb1 100644
--- a/Code/Source/solver/FE/Common/FEException.h
+++ b/Code/Source/solver/FE/Common/FEException.h
@@ -22,8 +22,34 @@
 namespace svmp {
 namespace FE {
 
+/// \defgroup FE_CommonExceptions Exceptions
+/// \ingroup FE_Common
+/// \brief FE exception hierarchy and throw/check helper functions.
+///
+/// \details All FE-specific exceptions derive from FEException, which itself
+/// derives from the shared solver ExceptionBase. Specialized subclasses carry
+/// structured context (element type, DOF index, backend name and error code,
+/// iteration counts, Jacobian determinants) so call sites can report
+/// actionable diagnostics. The free helper templates raise(), throw_if(),
+/// check_arg(), check_not_null(), and check_index() wrap common validation
+/// patterns with source-location capture.
+/// @{
+
+/**
+ * @brief Base exception type for errors originating in the FE library
+ *
+ * Carries a status code and source location alongside the message. Derived
+ * classes select an appropriate StatusCode and may attach additional
+ * structured context.
+ */
 class FEException : public ExceptionBase {
 public:
+    /// @brief Construct with a message and optional status code and source location.
+    /// @param message Human-readable error description.
+    /// @param status Status code classifying the failure.
+    /// @param file Source file where the error was raised.
+    /// @param line Source line where the error was raised.
+    /// @param function Function where the error was raised.
     FEException(const std::string& message,
                 StatusCode status = StatusCode::Unknown,
                 const char* file = "",
@@ -38,6 +64,11 @@ class FEException : public ExceptionBase {
     {
     }
 
+    /// @brief Construct with a message and source location, using an Unknown status.
+    /// @param message Human-readable error description.
+    /// @param file Source file where the error was raised.
+    /// @param line Source line where the error was raised.
+    /// @param function Function where the error was raised.
     FEException(const std::string& message,
                 const char* file,
                 int line,
@@ -46,11 +77,21 @@ class FEException : public ExceptionBase {
     {
     }
 
+    /// @brief Status code classifying the failure.
+    /// @return The status code recorded at construction.
     StatusCode status() const noexcept { return status_code(); }
 };
 
+/**
+ * @brief An argument failed validation
+ */
 class InvalidArgumentException : public FEException {
 public:
+    /// @brief Construct with a message and optional source location.
+    /// @param message Human-readable error description.
+    /// @param file Source file where the error was raised.
+    /// @param line Source line where the error was raised.
+    /// @param function Function where the error was raised.
     InvalidArgumentException(const std::string& message,
                              const char* file = "",
                              int line = 0,
@@ -61,8 +102,19 @@ class InvalidArgumentException : public FEException {
     }
 };
 
+/**
+ * @brief Unsupported or malformed element request
+ *
+ * Records the offending element type so error reports can name it.
+ */
 class InvalidElementException : public FEException {
 public:
+    /// @brief Construct with a message and optional element-type context.
+    /// @param message Human-readable error description.
+    /// @param element_type Name of the offending element type; appended to the message when non-empty.
+    /// @param file Source file where the error was raised.
+    /// @param line Source line where the error was raised.
+    /// @param function Function where the error was raised.
     InvalidElementException(const std::string& message,
                             std::string element_type = "",
                             const char* file = "",
@@ -77,6 +129,8 @@ class InvalidElementException : public FEException {
     {
     }
 
+    /// @brief Name of the offending element type.
+    /// @return Element-type name; empty when not provided.
     const std::string& element_type() const noexcept { return element_type_; }
 
 private:
@@ -93,8 +147,19 @@ class InvalidElementException : public FEException {
     std::string element_type_;
 };
 
+/**
+ * @brief Degree-of-freedom numbering or lookup failure
+ *
+ * Records the offending DOF index so error reports can name it.
+ */
 class DofException : public FEException {
 public:
+    /// @brief Construct with a message and optional DOF-index context.
+    /// @param message Human-readable error description.
+    /// @param dof_index Offending DOF index; appended to the message unless it equals invalid_dof_index().
+    /// @param file Source file where the error was raised.
+    /// @param line Source line where the error was raised.
+    /// @param function Function where the error was raised.
     DofException(const std::string& message,
                  long long dof_index = invalid_dof_index(),
                  const char* file = "",
@@ -109,7 +174,11 @@ class DofException : public FEException {
     {
     }
 
+    /// @brief Offending DOF index.
+    /// @return DOF index; invalid_dof_index() when not provided.
     long long dof_index() const noexcept { return dof_index_; }
+    /// @brief Sentinel meaning "no DOF index attached".
+    /// @return The sentinel value -1.
     static constexpr long long invalid_dof_index() noexcept { return -1; }
 
 private:
@@ -126,8 +195,16 @@ class DofException : public FEException {
     long long dof_index_ = invalid_dof_index();
 };
 
+/**
+ * @brief Global assembly failure
+ */
 class AssemblyException : public FEException {
 public:
+    /// @brief Construct with a message and optional source location.
+    /// @param message Human-readable error description.
+    /// @param file Source file where the error was raised.
+    /// @param line Source line where the error was raised.
+    /// @param function Function where the error was raised.
     AssemblyException(const std::string& message,
                       const char* file = "",
                       int line = 0,
@@ -137,8 +214,21 @@ class AssemblyException : public FEException {
     }
 };
 
+/**
+ * @brief Failure reported by a linear-algebra or solver backend
+ *
+ * Records the backend name and its native error code so error reports can
+ * identify the failing dependency.
+ */
 class BackendException : public FEException {
 public:
+    /// @brief Construct with a message and optional backend context.
+    /// @param message Human-readable error description.
+    /// @param backend_name Name of the failing backend; appended to the message when non-empty.
+    /// @param error_code Backend-native error code; appended to the message when nonzero.
+    /// @param file Source file where the error was raised.
+    /// @param line Source line where the error was raised.
+    /// @param function Function where the error was raised.
     BackendException(const std::string& message,
                      std::string backend_name = "",
                      int error_code = 0,
@@ -155,7 +245,11 @@ class BackendException : public FEException {
     {
     }
 
+    /// @brief Name of the failing backend.
+    /// @return Backend name; empty when not provided.
     const std::string& backend_name() const noexcept { return backend_name_; }
+    /// @brief Backend-native error code.
+    /// @return Error code; zero when not provided.
     int error_code() const noexcept { return error_code_; }
 
 private:
@@ -185,8 +279,16 @@ class BackendException : public FEException {
     int error_code_ = 0;
 };
 
+/**
+ * @brief Requested feature is not implemented
+ */
 class NotImplementedException : public FEException {
 public:
+    /// @brief Construct from the name of the missing feature.
+    /// @param feature Description of the unimplemented feature.
+    /// @param file Source file where the error was raised.
+    /// @param line Source line where the error was raised.
+    /// @param function Function where the error was raised.
     NotImplementedException(const std::string& feature,
                             const char* file = "",
                             int line = 0,
@@ -200,8 +302,16 @@ class NotImplementedException : public FEException {
     }
 };
 
+/**
+ * @brief Required initialization step has not been performed
+ */
 class NotInitializedException : public FEException {
 public:
+  /// @brief Construct from the name of the uninitialized feature.
+  /// @param feature Description of the missing initialization.
+  /// @param file Source file where the error was raised.
+  /// @param line Source line where the error was raised.
+  /// @param function Function where the error was raised.
   NotInitializedException(const std::string &feature,
                           const char *file,
                           int line = 0,
@@ -215,8 +325,21 @@ class NotInitializedException : public FEException {
   }
 };
 
+/**
+ * @brief Iterative process failed to converge
+ *
+ * Records the iteration count and final residual so error reports can show
+ * how far the iteration progressed.
+ */
 class ConvergenceException : public FEException {
 public:
+    /// @brief Construct with a message and optional iteration context.
+    /// @param message Human-readable error description.
+    /// @param iteration Iteration at which the failure was detected; appended to the message when non-negative.
+    /// @param residual Final residual; appended to the message when positive.
+    /// @param file Source file where the error was raised.
+    /// @param line Source line where the error was raised.
+    /// @param function Function where the error was raised.
     ConvergenceException(const std::string& message,
                          int iteration = -1,
                          double residual = 0.0,
@@ -233,7 +356,11 @@ class ConvergenceException : public FEException {
     {
     }
 
+    /// @brief Iteration at which the failure was detected.
+    /// @return Iteration count; -1 when not provided.
     int iteration() const noexcept { return iteration_; }
+    /// @brief Final residual value.
+    /// @return Residual; 0.0 when not provided.
     double residual() const noexcept { return residual_; }
 
 private:
@@ -257,8 +384,20 @@ class ConvergenceException : public FEException {
     double residual_ = 0.0;
 };
 
+/**
+ * @brief Element geometric mapping is singular or inverted
+ *
+ * Records the offending Jacobian determinant so error reports can show the
+ * degeneracy.
+ */
 class SingularMappingException : public FEException {
 public:
+    /// @brief Construct with a message and the offending Jacobian determinant.
+    /// @param message Human-readable error description.
+    /// @param jacobian_det Jacobian determinant at the failure point; appended to the message.
+    /// @param file Source file where the error was raised.
+    /// @param line Source line where the error was raised.
+    /// @param function Function where the error was raised.
     SingularMappingException(const std::string& message,
                              double jacobian_det = 0.0,
                              const char* file = "",
@@ -273,6 +412,8 @@ class SingularMappingException : public FEException {
     {
     }
 
+    /// @brief Jacobian determinant at the failure point.
+    /// @return The determinant recorded at construction.
     double jacobian_det() const noexcept { return jacobian_det_; }
 
 private:
@@ -285,12 +426,27 @@ class SingularMappingException : public FEException {
     double jacobian_det_ = 0.0;
 };
 
+/**
+ * @brief Throw an FE exception with source-location capture
+ * @tparam ExceptionT Exception type to throw.
+ * @tparam Args Constructor argument types forwarded to the exception.
+ * @param location Source location to record in the exception.
+ * @param args Arguments forwarded to the exception constructor.
+ */
 template <class ExceptionT, class... Args>
 [[noreturn]] inline void raise(SourceLocation location, Args&&... args)
 {
     ::svmp::raise<ExceptionT>(location, std::forward<Args>(args)...);
 }
 
+/**
+ * @brief Throw an FE exception when a condition holds
+ * @tparam ExceptionT Exception type to throw; defaults to FEException.
+ * @tparam Args Constructor argument types forwarded to the exception.
+ * @param condition Condition that triggers the throw when true.
+ * @param location Source location to record in the exception.
+ * @param args Arguments forwarded to the exception constructor.
+ */
 template <class ExceptionT = FEException, class... Args>
 inline void throw_if(bool condition, SourceLocation location, Args&&... args)
 {
@@ -299,6 +455,14 @@ inline void throw_if(bool condition, SourceLocation location, Args&&... args)
     }
 }
 
+/**
+ * @brief Validate an argument condition, throwing when it fails
+ * @tparam ExceptionT Exception type to throw; defaults to InvalidArgumentException.
+ * @tparam Args Constructor argument types forwarded to the exception.
+ * @param condition Condition that must hold for the argument to be valid.
+ * @param location Source location to record in the exception.
+ * @param args Arguments forwarded to the exception constructor.
+ */
 template <class ExceptionT = InvalidArgumentException, class... Args>
 inline void check_arg(bool condition, SourceLocation location, Args&&... args)
 {
@@ -306,6 +470,15 @@ inline void check_arg(bool condition, SourceLocation location, Args&&... args)
                                   std::forward<Args>(args)...);
 }
 
+/**
+ * @brief Validate that a pointer is non-null, throwing when it is null
+ * @tparam ExceptionT Exception type to throw; defaults to InvalidArgumentException.
+ * @tparam PointerT Pointer-like type being checked.
+ * @tparam Args Constructor argument types forwarded to the exception.
+ * @param ptr Pointer to validate.
+ * @param location Source location to record in the exception.
+ * @param args Arguments forwarded to the exception constructor.
+ */
 template <class ExceptionT = InvalidArgumentException, class PointerT,
           class... Args>
 inline void check_not_null(PointerT ptr, SourceLocation location,
@@ -314,6 +487,15 @@ inline void check_not_null(PointerT ptr, SourceLocation location,
     ::svmp::check_not_null<ExceptionT>(ptr, location, std::forward<Args>(args)...);
 }
 
+/**
+ * @brief Validate that an index lies in [0, size), throwing when out of bounds
+ * @tparam ExceptionT Exception type to throw; defaults to InvalidArgumentException.
+ * @tparam IndexT Integral index type.
+ * @tparam SizeT Integral size type.
+ * @param index Index to validate.
+ * @param size Exclusive upper bound for the index.
+ * @param location Source location to record in the exception.
+ */
 template <class ExceptionT = InvalidArgumentException, class IndexT,
           class SizeT>
 inline void check_index(IndexT index, SizeT size, SourceLocation location)
@@ -329,12 +511,19 @@ inline void check_index(IndexT index, SizeT size, SourceLocation location)
             " out of bounds [0, " + std::to_string(fe_check_size_value) + ")");
 }
 
+/**
+ * @brief Throw NotImplementedException for a missing feature
+ * @param feature Description of the unimplemented feature.
+ * @param location Source location to record in the exception.
+ */
 [[noreturn]] inline void not_implemented(const std::string& feature,
                                          SourceLocation location)
 {
     ::svmp::FE::raise<NotImplementedException>(location, feature);
 }
 
+/// @}
+
 } // namespace FE
 } // namespace svmp
 
diff --git a/Code/Source/solver/FE/Common/Types.h b/Code/Source/solver/FE/Common/Types.h
index e3d5a46e9..1f57ffcc5 100644
--- a/Code/Source/solver/FE/Common/Types.h
+++ b/Code/Source/solver/FE/Common/Types.h
@@ -16,18 +16,26 @@
 
 #if defined(SVMP_FE_WITH_MESH) && SVMP_FE_WITH_MESH
 #  include "Mesh/Core/MeshTypes.h"
+/// Nonzero when FE shares scalar/index types with the Mesh library.
 #  define SVMP_FE_HAS_MESH_TYPES 1
 #else
 // Build FE without Mesh types unless explicitly enabled.
+/// Nonzero when FE shares scalar/index types with the Mesh library.
 #  define SVMP_FE_HAS_MESH_TYPES 0
 #endif
 
 #if !SVMP_FE_HAS_MESH_TYPES
 namespace svmp {
-// Minimal fallback when the Mesh library is not available.
-// Keeps FE compilation self-contained while preserving the same namespace.
 #ifndef SVMP_CELL_FAMILY_DEFINED
+/// Guard marking that svmp::CellFamily has been defined.
 #define SVMP_CELL_FAMILY_DEFINED 1
+/**
+ * @brief Minimal fallback for svmp::CellFamily when the Mesh library is unavailable
+ * @ingroup FE_CommonTypes
+ *
+ * Keeps FE compilation self-contained while preserving the same namespace
+ * and enumerator set as the Mesh library's cell-family classification.
+ */
 enum class CellFamily {
     Point,
     Line,
@@ -51,16 +59,40 @@ enum class CellFamily {
 #include <limits>
 
 #if defined(_MSC_VER)
+/// Portable restrict qualifier for aliasing-free pointer parameters.
 #  define SVMP_RESTRICT __restrict
 #elif defined(__clang__) || defined(__GNUC__)
+/// Portable restrict qualifier for aliasing-free pointer parameters.
 #  define SVMP_RESTRICT __restrict__
 #else
+/// Portable restrict qualifier for aliasing-free pointer parameters.
 #  define SVMP_RESTRICT
 #endif
 
+/// \defgroup FE_Common Common
+/// \ingroup FE
+/// \brief Shared vocabulary types, constants, and exception infrastructure used by every FE module.
+///
+/// \details The Common module collects the foundational definitions that the
+/// rest of the FE library builds on: index and scalar type aliases; element,
+/// basis, quadrature, and field enumerations; sentinel constants and strong
+/// type wrappers; and the FE exception hierarchy together with its
+/// argument-checking helpers.
+
 namespace svmp {
 namespace FE {
 
+/// \defgroup FE_CommonTypes Types
+/// \ingroup FE_Common
+/// \brief Core type aliases, enumerations, constants, geometric types, and compile-time traits.
+///
+/// \details This group documents the index and identifier types used for
+/// element-local and global numbering, the element/basis/quadrature/field
+/// enumerations shared across modules, sentinel constants, reference- and
+/// physical-space geometric aliases, and the strong-type utilities that
+/// prevent accidental mixing of conceptually distinct values.
+/// @{
+
 // ============================================================================
 // Index Types
 // ============================================================================
@@ -88,10 +120,16 @@ using GlobalIndex = std::int64_t;
  * Provides type safety at compile time.
  */
 struct DofIndex {
-    GlobalIndex value;
+    GlobalIndex value;  ///< Underlying global DOF index; negative values are invalid.
 
+    /// @brief Construct a DOF index, defaulting to the invalid sentinel.
+    /// @param v Global DOF index value.
     constexpr explicit DofIndex(GlobalIndex v = -1) noexcept : value(v) {}
+    /// @brief Convert to the underlying global index value.
+    /// @return The stored global index.
     constexpr operator GlobalIndex() const noexcept { return value; }
+    /// @brief Check whether this index refers to a valid DOF.
+    /// @return True when the stored value is non-negative.
     constexpr bool is_valid() const noexcept { return value >= 0; }
 };
 
@@ -109,28 +147,32 @@ using BlockId = std::uint16_t;
 
 // Import mesh library scalar/index types when available (optional dependency).
 #if SVMP_FE_HAS_MESH_TYPES
-using MeshIndex = svmp::index_t;
-using MeshOffset = svmp::offset_t;
-using MeshGlobalId = svmp::gid_t;
-using Real = svmp::real_t;  // Use same precision as Mesh library
+using MeshIndex = svmp::index_t;        ///< Local mesh entity index, shared with the Mesh library.
+using MeshOffset = svmp::offset_t;      ///< Offset type for mesh connectivity arrays.
+using MeshGlobalId = svmp::gid_t;       ///< Global mesh entity identifier.
+using Real = svmp::real_t;              ///< Floating-point scalar type; same precision as the Mesh library.
 #else
-using MeshIndex = std::int32_t;
-using MeshOffset = std::int64_t;
-using MeshGlobalId = std::int64_t;
-using Real = double;
+using MeshIndex = std::int32_t;         ///< Local mesh entity index, shared with the Mesh library.
+using MeshOffset = std::int64_t;        ///< Offset type for mesh connectivity arrays.
+using MeshGlobalId = std::int64_t;      ///< Global mesh entity identifier.
+using Real = double;                    ///< Floating-point scalar type; same precision as the Mesh library.
 #endif
 
 // ============================================================================
 // Constants
 // ============================================================================
 
+/// Sentinel for an unset or out-of-range local index.
 constexpr LocalIndex INVALID_LOCAL_INDEX = std::numeric_limits<LocalIndex>::max();
+/// Sentinel for an unset or out-of-range global index.
 constexpr GlobalIndex INVALID_GLOBAL_INDEX = -1;
+/// Sentinel FieldId meaning "uninitialized / no field".
 constexpr FieldId INVALID_FIELD_ID = std::numeric_limits<FieldId>::max();
 /// Sentinel FieldId for geometry-only quantities (no DOF dependence).
 /// Uses first registered field's space for quadrature, but logically decoupled
 /// from any specific field's DOFs.
 constexpr FieldId GEOMETRY_FIELD_ID = std::numeric_limits<FieldId>::max() - 1;
+/// Sentinel for an unset or out-of-range block identifier.
 constexpr BlockId INVALID_BLOCK_ID = std::numeric_limits<BlockId>::max();
 
 /**
@@ -169,9 +211,9 @@ constexpr int MAX_FIELD_VALUE_COMPONENTS = 9;
  * Node-scoped auxiliary models with Lagrange Kronecker delta).
  */
 struct FieldValueEntry {
-    FieldId field{INVALID_FIELD_ID};
-    int n_components{0};
-    Real components[MAX_FIELD_VALUE_COMPONENTS]{};
+    FieldId field{INVALID_FIELD_ID};                  ///< Field this value belongs to.
+    int n_components{0};                              ///< Number of valid entries in components.
+    Real components[MAX_FIELD_VALUE_COMPONENTS]{};    ///< Component values, row-major for tensors.
 };
 
 // ============================================================================
@@ -186,115 +228,115 @@ struct FieldValueEntry {
  */
 enum class ElementType : std::uint8_t {
     // Linear elements
-    Line2      = 0,   // 2-node line
-    Triangle3  = 1,   // 3-node triangle
-    Quad4      = 2,   // 4-node quadrilateral
-    Tetra4     = 3,   // 4-node tetrahedron
-    Hex8       = 4,   // 8-node hexahedron
-    Wedge6     = 5,   // 6-node wedge/prism
-    Pyramid5   = 6,   // 5-node pyramid
+    Line2      = 0,   ///< 2-node line
+    Triangle3  = 1,   ///< 3-node triangle
+    Quad4      = 2,   ///< 4-node quadrilateral
+    Tetra4     = 3,   ///< 4-node tetrahedron
+    Hex8       = 4,   ///< 8-node hexahedron
+    Wedge6     = 5,   ///< 6-node wedge/prism
+    Pyramid5   = 6,   ///< 5-node pyramid
 
     // Quadratic elements
-    Line3      = 10,  // 3-node line
-    Triangle6  = 11,  // 6-node triangle
-    Quad9      = 12,  // 9-node quadrilateral (bi-quadratic)
-    Quad8      = 13,  // 8-node quadrilateral (serendipity)
-    Tetra10    = 14,  // 10-node tetrahedron
-    Hex27      = 15,  // 27-node hexahedron (tri-quadratic)
-    Hex20      = 16,  // 20-node hexahedron (serendipity)
-    Wedge15    = 17,  // 15-node wedge
-    Wedge18    = 18,  // 18-node wedge (complete quadratic)
-    Pyramid13  = 19,  // 13-node pyramid
-    Pyramid14  = 20,  // 14-node pyramid
+    Line3      = 10,  ///< 3-node line
+    Triangle6  = 11,  ///< 6-node triangle
+    Quad9      = 12,  ///< 9-node quadrilateral (bi-quadratic)
+    Quad8      = 13,  ///< 8-node quadrilateral (serendipity)
+    Tetra10    = 14,  ///< 10-node tetrahedron
+    Hex27      = 15,  ///< 27-node hexahedron (tri-quadratic)
+    Hex20      = 16,  ///< 20-node hexahedron (serendipity)
+    Wedge15    = 17,  ///< 15-node wedge
+    Wedge18    = 18,  ///< 18-node wedge (complete quadratic)
+    Pyramid13  = 19,  ///< 13-node pyramid
+    Pyramid14  = 20,  ///< 14-node pyramid
 
     // Special elements
-    Point1     = 30,  // 1-node point element
+    Point1     = 30,  ///< 1-node point element
 
-    Unknown    = 255
+    Unknown    = 255  ///< Unrecognized or uninitialized element type
 };
 
 /**
  * @brief Quadrature rule types
  */
 enum class QuadratureType : std::uint8_t {
-    GaussLegendre,     // Standard Gaussian quadrature
-    GaussLobatto,      // Includes endpoints (for spectral elements)
-    Newton,            // Newton-Cotes rules
-    Reduced,           // Order-based reduced integration for locking
-    PositionBased,     // Position-based reduced integration (legacy compatible)
-    Composite,         // Composite rules for adaptivity
-    Custom             // User-defined quadrature points
+    GaussLegendre,     ///< Standard Gaussian quadrature
+    GaussLobatto,      ///< Includes endpoints (for spectral elements)
+    Newton,            ///< Newton-Cotes rules
+    Reduced,           ///< Order-based reduced integration for locking
+    PositionBased,     ///< Position-based reduced integration (legacy compatible)
+    Composite,         ///< Composite rules for adaptivity
+    Custom             ///< User-defined quadrature points
 };
 
 /**
  * @brief Basis function families
  */
 enum class BasisType : std::uint8_t {
-    Lagrange,          // Standard nodal Lagrange basis
-    Hierarchical,      // Hierarchical/modal basis
-    Bernstein,         // Bernstein polynomials
-    NURBS,             // Non-uniform rational B-splines
-    BSpline,           // Non-rational B-spline basis
-    Spectral,          // Spectral element basis
-    Serendipity,       // Serendipity elements
-    Hermite,           // Hermite C1 continuity basis
-    RaviartThomas,     // H(div) Raviart-Thomas family
-    Nedelec,           // H(curl) Nedelec edge elements
-    BDM,               // H(div) Brezzi-Douglas-Marini family
-    Bubble,            // Interior bubble functions for enrichment
-    Custom             // User-defined basis
+    Lagrange,          ///< Standard nodal Lagrange basis
+    Hierarchical,      ///< Hierarchical/modal basis
+    Bernstein,         ///< Bernstein polynomials
+    NURBS,             ///< Non-uniform rational B-splines
+    BSpline,           ///< Non-rational B-spline basis
+    Spectral,          ///< Spectral element basis
+    Serendipity,       ///< Serendipity elements
+    Hermite,           ///< Hermite C1 continuity basis
+    RaviartThomas,     ///< H(div) Raviart-Thomas family
+    Nedelec,           ///< H(curl) Nedelec edge elements
+    BDM,               ///< H(div) Brezzi-Douglas-Marini family
+    Bubble,            ///< Interior bubble functions for enrichment
+    Custom             ///< User-defined basis
 };
 
 /**
  * @brief Field types for function spaces
  */
 enum class FieldType : std::uint8_t {
-    Scalar,            // Scalar field (temperature, pressure)
-    Vector,            // Vector field (velocity, displacement)
-    Tensor,            // Tensor field (stress, strain)
-    SymmetricTensor,   // Symmetric tensor field
-    Mixed              // Mixed/composite field
+    Scalar,            ///< Scalar field (temperature, pressure)
+    Vector,            ///< Vector field (velocity, displacement)
+    Tensor,            ///< Tensor field (stress, strain)
+    SymmetricTensor,   ///< Symmetric tensor field
+    Mixed              ///< Mixed/composite field
 };
 
 /**
  * @brief Continuity requirements for function spaces
  */
 enum class Continuity : std::uint8_t {
-    C0,                // Continuous (standard FEM)
-    C1,                // C1 continuous (for plates/shells)
-    L2,                // L2 (discontinuous)
-    H_div,             // H(div) conforming
-    H_curl,            // H(curl) conforming
-    Custom
+    C0,                ///< Continuous (standard FEM)
+    C1,                ///< C1 continuous (for plates/shells)
+    L2,                ///< L2 (discontinuous)
+    H_div,             ///< H(div) conforming
+    H_curl,            ///< H(curl) conforming
+    Custom             ///< User-defined continuity requirement
 };
 
 /**
  * @brief Assembly strategies
  */
 enum class AssemblyStrategy : std::uint8_t {
-    ElementByElement,  // Traditional element loop
-    Vectorized,        // SIMD vectorized assembly
-    MatrixFree,        // Matrix-free operators
-    Hybrid             // Mixed strategy
+    ElementByElement,  ///< Traditional element loop
+    Vectorized,        ///< SIMD vectorized assembly
+    MatrixFree,        ///< Matrix-free operators
+    Hybrid             ///< Mixed strategy
 };
 
 /**
  * @brief Status codes for FE operations
  */
 enum class FEStatus : std::uint8_t {
-    Success           = 0,
-    InvalidArgument   = 1,
-    InvalidElement    = 2,
-    SingularMapping   = 3,
-    QuadratureError   = 4,
-    AssemblyError     = 5,
-    BackendError      = 6,
-    NotImplemented    = 7,
-    ConvergenceError  = 8,
-    AllocationError   = 9,
-    MPIError          = 10,
-    IOError           = 11,
-    Unknown           = 255
+    Success           = 0,    ///< Operation completed successfully
+    InvalidArgument   = 1,    ///< An argument failed validation
+    InvalidElement    = 2,    ///< Unsupported or malformed element
+    SingularMapping   = 3,    ///< Element mapping Jacobian is singular
+    QuadratureError   = 4,    ///< Quadrature rule construction or evaluation failed
+    AssemblyError     = 5,    ///< Global assembly failure
+    BackendError      = 6,    ///< Linear-algebra backend failure
+    NotImplemented    = 7,    ///< Requested feature is not implemented
+    ConvergenceError  = 8,    ///< Iterative process failed to converge
+    AllocationError   = 9,    ///< Memory allocation failure
+    MPIError          = 10,   ///< MPI communication failure
+    IOError           = 11,   ///< File or stream I/O failure
+    Unknown           = 255   ///< Unclassified error
 };
 
 // ============================================================================
@@ -303,6 +345,7 @@ enum class FEStatus : std::uint8_t {
 
 /**
  * @brief Point in reference element coordinates
+ * @tparam Dim Reference-space dimension
  */
 template<int Dim>
 using ReferencePoint = std::array<Real, static_cast<std::size_t>(Dim)>;
@@ -314,6 +357,8 @@ using PhysicalPoint = std::array<Real, 3>;
 
 /**
  * @brief Jacobian matrix type
+ * @tparam SpatialDim Physical-space dimension (rows)
+ * @tparam ReferenceDim Reference-space dimension (columns)
  */
 template<int SpatialDim, int ReferenceDim = SpatialDim>
 using Jacobian = std::array<std::array<Real, static_cast<std::size_t>(ReferenceDim)>, static_cast<std::size_t>(SpatialDim)>;
@@ -327,31 +372,51 @@ using Jacobian = std::array<std::array<Real, static_cast<std::size_t>(ReferenceD
  *
  * Prevents accidental mixing of conceptually different types that have
  * the same underlying representation.
+ *
+ * @tparam T Underlying value type
+ * @tparam Tag Empty tag type that distinguishes otherwise identical wrappers
  */
 template<typename T, typename Tag>
 class StrongType {
 public:
+    /// @brief Underlying value type.
     using ValueType = T;
 
+    /// @brief Value-initialize the wrapped value.
     constexpr StrongType() noexcept(std::is_nothrow_default_constructible_v<T>)
         : value_{} {}
 
+    /// @brief Wrap an explicit value.
+    /// @param value Value to store.
     constexpr explicit StrongType(T value) noexcept(std::is_nothrow_move_constructible_v<T>)
         : value_(std::move(value)) {}
 
+    /// @brief Access the wrapped value.
+    /// @return Reference to the wrapped value.
     constexpr T& get() noexcept { return value_; }
+    /// @brief Access the wrapped value.
+    /// @return Reference to the wrapped value.
     constexpr const T& get() const noexcept { return value_; }
 
-    // Explicit conversion
+    /// @brief Explicitly convert back to the underlying type.
+    /// @return Copy of the wrapped value.
     constexpr explicit operator T() const noexcept { return value_; }
 
-    // Comparison operators
+    /// @brief Compare wrapped values for equality.
+    /// @param other Wrapper to compare against.
+    /// @return True when the wrapped values are equal.
     constexpr bool operator==(const StrongType& other) const noexcept {
         return value_ == other.value_;
     }
+    /// @brief Compare wrapped values for inequality.
+    /// @param other Wrapper to compare against.
+    /// @return True when the wrapped values differ.
     constexpr bool operator!=(const StrongType& other) const noexcept {
         return value_ != other.value_;
     }
+    /// @brief Order by wrapped value.
+    /// @param other Wrapper to compare against.
+    /// @return True when this wrapped value orders before the other.
     constexpr bool operator<(const StrongType& other) const noexcept {
         return value_ < other.value_;
     }
@@ -361,12 +426,14 @@ class StrongType {
 };
 
 // Specific strong types for common use cases
-struct QuadraturePointTag {};
-struct QuadratureWeightTag {};
-struct BasisValueTag {};
-struct BasisGradientTag {};
+struct QuadraturePointTag {};   ///< Tag type for quadrature-point indices.
+struct QuadratureWeightTag {};  ///< Tag type for quadrature weights.
+struct BasisValueTag {};        ///< Tag type for basis-function values.
+struct BasisGradientTag {};     ///< Tag type for basis-function gradients.
 
+/// Type-safe index of a quadrature point within a rule.
 using QuadraturePointIndex = StrongType<LocalIndex, QuadraturePointTag>;
+/// Type-safe quadrature weight value.
 using QuadratureWeight = StrongType<Real, QuadratureWeightTag>;
 
 // ============================================================================
@@ -388,6 +455,7 @@ struct is_index_type<GlobalIndex> : std::true_type {};
 template<>
 struct is_index_type<DofIndex> : std::true_type {};
 
+/// Convenience variable template for is_index_type.
 template<typename T>
 inline constexpr bool is_index_type_v = is_index_type<T>::value;
 
@@ -400,6 +468,7 @@ struct is_field_type : std::false_type {};
 template<>
 struct is_field_type<FieldType> : std::true_type {};
 
+/// Convenience variable template for is_field_type.
 template<typename T>
 inline constexpr bool is_field_type_v = is_field_type<T>::value;
 
@@ -409,6 +478,8 @@ inline constexpr bool is_field_type_v = is_field_type<T>::value;
 
 /**
  * @brief Convert FE ElementType to Mesh CellFamily
+ * @param elem Element type to classify.
+ * @return Cell family of the element's linear topology; Point for unknown types.
  */
 constexpr svmp::CellFamily to_mesh_family(ElementType elem) noexcept {
     switch(elem) {
@@ -454,6 +525,8 @@ constexpr svmp::CellFamily to_mesh_family(ElementType elem) noexcept {
 
 /**
  * @brief Get spatial dimension of element type
+ * @param elem Element type to query.
+ * @return Reference dimension from 0 (point) to 3 (volume); -1 for unknown types.
  */
 constexpr int element_dimension(ElementType elem) noexcept {
     switch(elem) {
@@ -487,6 +560,8 @@ constexpr int element_dimension(ElementType elem) noexcept {
 
 /**
  * @brief Convert status code to string for error reporting
+ * @param status Status code to describe.
+ * @return Static human-readable description of the status.
  */
 inline const char* status_to_string(FEStatus status) noexcept {
     switch(status) {
@@ -506,6 +581,8 @@ inline const char* status_to_string(FEStatus status) noexcept {
     }
 }
 
+/// @}
+
 } // namespace FE
 } // namespace svmp
 

From 917c638668e816199f23018cb6920d0670fafb0a Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Tue, 9 Jun 2026 19:06:10 -0700
Subject: [PATCH 15/91] aligning exception throws and raises with the
 function-template calls and using `SVMP_HERE` for file, line, and function
 source location information

---
 Code/Source/solver/FE/Basis/BasisExceptions.h | 40 ----------
 Code/Source/solver/FE/Basis/BasisFactory.cpp  | 38 ++++------
 Code/Source/solver/FE/Basis/BasisFunction.cpp |  8 +-
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp | 46 +++++-------
 .../FE/Basis/NodeOrderingConventions.cpp      | 28 +++----
 .../solver/FE/Basis/SerendipityBasis.cpp      | 73 ++++++++-----------
 6 files changed, 78 insertions(+), 155 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisExceptions.h b/Code/Source/solver/FE/Basis/BasisExceptions.h
index c1af17049..8f8fd3c3c 100644
--- a/Code/Source/solver/FE/Basis/BasisExceptions.h
+++ b/Code/Source/solver/FE/Basis/BasisExceptions.h
@@ -83,46 +83,6 @@ class BasisConstructionException : public BasisException {
         : BasisException(message, file, line, function, StatusCode::InternalError) {}
 };
 
-#define BASIS_CHECK_CONFIG(condition, message)                                                 \
-    do {                                                                                       \
-        if (!(condition)) {                                                                    \
-            throw ::svmp::FE::basis::BasisConfigurationException((message),                    \
-                                                                  __FILE__, __LINE__, __func__); \
-        }                                                                                      \
-    } while (false)
-
-#define BASIS_CHECK_COMPAT(condition, message)                                                 \
-    do {                                                                                       \
-        if (!(condition)) {                                                                    \
-            throw ::svmp::FE::basis::BasisElementCompatibilityException((message),             \
-                                                                         __FILE__, __LINE__, __func__); \
-        }                                                                                      \
-    } while (false)
-
-#define BASIS_CHECK_EVAL(condition, message)                                                   \
-    do {                                                                                       \
-        if (!(condition)) {                                                                    \
-            throw ::svmp::FE::basis::BasisEvaluationException((message),                       \
-                                                               __FILE__, __LINE__, __func__);  \
-        }                                                                                      \
-    } while (false)
-
-#define BASIS_CHECK_NODE_ORDER(condition, message)                                             \
-    do {                                                                                       \
-        if (!(condition)) {                                                                    \
-            throw ::svmp::FE::basis::BasisNodeOrderingException((message),                     \
-                                                                 __FILE__, __LINE__, __func__); \
-        }                                                                                      \
-    } while (false)
-
-#define BASIS_CHECK_CONSTRUCTION(condition, message)                                           \
-    do {                                                                                       \
-        if (!(condition)) {                                                                    \
-            throw ::svmp::FE::basis::BasisConstructionException((message),                     \
-                                                                 __FILE__, __LINE__, __func__); \
-        }                                                                                      \
-    } while (false)
-
 } // namespace basis
 } // namespace FE
 } // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/BasisFactory.cpp b/Code/Source/solver/FE/Basis/BasisFactory.cpp
index b48e25536..c3130d16f 100644
--- a/Code/Source/solver/FE/Basis/BasisFactory.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFactory.cpp
@@ -16,28 +16,20 @@ namespace {
 int require_basis_order(const BasisRequest& req,
                         const char* missing_message,
                         const char* negative_message) {
-    if (!req.order.has_value()) {
-        throw BasisConfigurationException(missing_message,
-                                          __FILE__, __LINE__, __func__);
-    }
-    if (*req.order < 0) {
-        throw BasisConfigurationException(negative_message,
-                                          __FILE__, __LINE__, __func__);
-    }
+    FE::throw_if<BasisConfigurationException>(!req.order.has_value(), SVMP_HERE,
+                                              missing_message);
+    FE::throw_if<BasisConfigurationException>(*req.order < 0, SVMP_HERE,
+                                              negative_message);
     return *req.order;
 }
 
 void require_scalar_c0_request(const BasisRequest& req) {
-    if (req.field_type != FieldType::Scalar) {
-        throw BasisConfigurationException(
-            "BasisFactory: Lagrange/Serendipity bases support scalar fields only",
-            __FILE__, __LINE__, __func__);
-    }
-    if (req.continuity != Continuity::C0) {
-        throw BasisConfigurationException(
-            "BasisFactory: Lagrange/Serendipity bases support C0 continuity only",
-            __FILE__, __LINE__, __func__);
-    }
+    FE::throw_if<BasisConfigurationException>(
+        req.field_type != FieldType::Scalar, SVMP_HERE,
+        "BasisFactory: Lagrange/Serendipity bases support scalar fields only");
+    FE::throw_if<BasisConfigurationException>(
+        req.continuity != Continuity::C0, SVMP_HERE,
+        "BasisFactory: Lagrange/Serendipity bases support C0 continuity only");
 }
 
 std::shared_ptr<BasisFunction> create_lagrange(const BasisRequest& req) {
@@ -69,9 +61,8 @@ std::shared_ptr<BasisFunction> create(const BasisRequest& req) {
         case BasisType::Serendipity:
             return create_serendipity(req);
         default:
-            throw BasisConfigurationException(
-                "BasisFactory: requested basis family is outside the scalar Lagrange/Serendipity scope",
-                __FILE__, __LINE__, __func__);
+            FE::raise<BasisConfigurationException>(SVMP_HERE,
+                "BasisFactory: requested basis family is outside the scalar Lagrange/Serendipity scope");
     }
 }
 
@@ -90,9 +81,8 @@ BasisRequest default_basis_request(ElementType element_type) {
             if (order >= 0) {
                 return BasisRequest{element_type, BasisType::Lagrange, order};
             }
-            throw BasisElementCompatibilityException(
-                "BasisFactory: no default basis is defined for the requested element type",
-                __FILE__, __LINE__, __func__);
+            FE::raise<BasisElementCompatibilityException>(SVMP_HERE,
+                "BasisFactory: no default basis is defined for the requested element type");
         }
     }
 }
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.cpp b/Code/Source/solver/FE/Basis/BasisFunction.cpp
index b98a36292..591f6751a 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFunction.cpp
@@ -28,16 +28,16 @@ void BasisFunction::evaluate_gradients(const math::Vector<Real, 3>& xi,
                                        std::vector<Gradient>& gradients) const {
     (void)xi;
     (void)gradients;
-    throw BasisEvaluationException("Analytic gradient evaluation is not implemented for this basis",
-                                   __FILE__, __LINE__, __func__);
+    FE::raise<BasisEvaluationException>(SVMP_HERE,
+        "Analytic gradient evaluation is not implemented for this basis");
 }
 
 void BasisFunction::evaluate_hessians(const math::Vector<Real, 3>& xi,
                                       std::vector<Hessian>& hessians) const {
     (void)xi;
     (void)hessians;
-    throw BasisEvaluationException("Analytic Hessian evaluation is not implemented for this basis",
-                                   __FILE__, __LINE__, __func__);
+    FE::raise<BasisEvaluationException>(SVMP_HERE,
+        "Analytic Hessian evaluation is not implemented for this basis");
 }
 
 void BasisFunction::evaluate_all(const math::Vector<Real, 3>& xi,
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index 4f8c15bb1..b32199d03 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -44,10 +44,8 @@ struct NormalizedLagrangeRequest {
 // Validate and return the supported basis topology for a Lagrange element type.
 BasisTopology supported_lagrange_topology(ElementType type) {
     const BasisTopology top = topology(type);
-    if (top == BasisTopology::Unknown) {
-        throw BasisElementCompatibilityException("LagrangeBasis: unsupported element type",
-                                                __FILE__, __LINE__, __func__);
-    }
+    FE::throw_if<BasisElementCompatibilityException>(top == BasisTopology::Unknown, SVMP_HERE,
+                                                     "LagrangeBasis: unsupported element type");
     return top;
 }
 
@@ -67,23 +65,19 @@ NormalizedLagrangeRequest normalize_lagrange_request(ElementType element_type, i
         case ElementType::Wedge18:
             return {ElementType::Wedge6, std::max(order, 2)};
         case ElementType::Quad8:
-            throw BasisElementCompatibilityException(
-                "LagrangeBasis: Quad8 is serendipity; use SerendipityBasis",
-                __FILE__, __LINE__, __func__);
+            FE::raise<BasisElementCompatibilityException>(SVMP_HERE,
+                "LagrangeBasis: Quad8 is serendipity; use SerendipityBasis");
         case ElementType::Hex20:
-            throw BasisElementCompatibilityException(
-                "LagrangeBasis: Hex20 is serendipity; use SerendipityBasis",
-                __FILE__, __LINE__, __func__);
+            FE::raise<BasisElementCompatibilityException>(SVMP_HERE,
+                "LagrangeBasis: Hex20 is serendipity; use SerendipityBasis");
         case ElementType::Wedge15:
-            throw BasisElementCompatibilityException(
-                "LagrangeBasis: Wedge15 is serendipity; use SerendipityBasis",
-                __FILE__, __LINE__, __func__);
+            FE::raise<BasisElementCompatibilityException>(SVMP_HERE,
+                "LagrangeBasis: Wedge15 is serendipity; use SerendipityBasis");
         case ElementType::Pyramid5:
         case ElementType::Pyramid13:
         case ElementType::Pyramid14:
-            throw BasisElementCompatibilityException(
-                "LagrangeBasis: pyramid support is not within the current solver basis scope",
-                __FILE__, __LINE__, __func__);
+            FE::raise<BasisElementCompatibilityException>(SVMP_HERE,
+                "LagrangeBasis: pyramid support is not within the current solver basis scope");
         default:
             return {element_type, order};
     }
@@ -315,10 +309,8 @@ LagrangeBasis::LagrangeBasis(ElementType type, int order)
     const auto normalized = normalize_lagrange_request(element_type_, order_);
     element_type_ = normalized.element_type;
     order_ = normalized.order;
-    if (order_ < 0) {
-        throw BasisConfigurationException("LagrangeBasis requires non-negative polynomial order",
-                                          __FILE__, __LINE__, __func__);
-    }
+    FE::throw_if<BasisConfigurationException>(order_ < 0, SVMP_HERE,
+                                              "LagrangeBasis requires non-negative polynomial order");
 
     topology_ = supported_lagrange_topology(element_type_);
     dimension_ = reference_dimension(element_type_);
@@ -366,8 +358,8 @@ void LagrangeBasis::init_nodes() {
             break;
     }
 
-    throw BasisElementCompatibilityException("Unsupported element type in LagrangeBasis::init_nodes",
-                                             __FILE__, __LINE__, __func__);
+    FE::raise<BasisElementCompatibilityException>(SVMP_HERE,
+        "Unsupported element type in LagrangeBasis::init_nodes");
 }
 
 // Build the single reference node for a point basis.
@@ -419,10 +411,8 @@ void LagrangeBasis::build_wedge_nodes() {
         const auto tri_exp =
             simplex_exponent_from_point(node, BasisTopology::Triangle, order_);
         auto it = std::find(simplex_exponents_.begin(), simplex_exponents_.end(), tri_exp);
-        if (it == simplex_exponents_.end()) {
-            throw BasisConstructionException("LagrangeBasis: wedge node triangle index lookup failed",
-                                             __FILE__, __LINE__, __func__);
-        }
+        FE::throw_if<BasisConstructionException>(it == simplex_exponents_.end(), SVMP_HERE,
+                                                 "LagrangeBasis: wedge node triangle index lookup failed");
         const std::size_t tri_index =
             static_cast<std::size_t>(std::distance(simplex_exponents_.begin(), it));
         wedge_indices_.push_back({tri_index, axis_index_pm_one(node[2], order_)});
@@ -555,8 +545,8 @@ void LagrangeBasis::evaluate_all_to(const Vec3& xi,
         return;
     }
 
-    throw BasisEvaluationException("Unsupported element in LagrangeBasis evaluation",
-                                   __FILE__, __LINE__, __func__);
+    FE::raise<BasisEvaluationException>(SVMP_HERE,
+        "Unsupported element in LagrangeBasis evaluation");
 }
 
 void LagrangeBasis::evaluate_values(const Vec3& xi,
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
index 76662abe1..850f8cd0a 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
@@ -318,10 +318,8 @@ std::vector<Point> generate_wedge_nodes(int order) {
 }
 
 std::vector<Point> complete_lagrange_nodes(ElementType canonical_type, int order) {
-    if (order < 0) {
-        throw BasisNodeOrderingException("ReferenceNodeLayout requires non-negative Lagrange order",
-                                         __FILE__, __LINE__, __func__);
-    }
+    FE::throw_if<BasisNodeOrderingException>(order < 0, SVMP_HERE,
+                                             "ReferenceNodeLayout requires non-negative Lagrange order");
     const ElementType type = canonical_lagrange_type(canonical_type);
     switch (type) {
         case ElementType::Point1:
@@ -339,11 +337,11 @@ std::vector<Point> complete_lagrange_nodes(ElementType canonical_type, int order
         case ElementType::Wedge6:
             return generate_wedge_nodes(order);
         case ElementType::Pyramid5:
-            throw BasisNodeOrderingException("ReferenceNodeLayout: pyramid node ordering is disabled",
-                                             __FILE__, __LINE__, __func__);
+            FE::raise<BasisNodeOrderingException>(SVMP_HERE,
+                "ReferenceNodeLayout: pyramid node ordering is disabled");
         default:
-            throw BasisNodeOrderingException("ReferenceNodeLayout: unsupported Lagrange topology",
-                                             __FILE__, __LINE__, __func__);
+            FE::raise<BasisNodeOrderingException>(SVMP_HERE,
+                "ReferenceNodeLayout: unsupported Lagrange topology");
     }
 }
 
@@ -370,11 +368,11 @@ std::vector<Point> element_nodes(ElementType elem_type) {
             return nodes;
         }
         case ElementType::Pyramid13:
-            throw BasisNodeOrderingException("ReferenceNodeLayout: pyramid node ordering is disabled",
-                                             __FILE__, __LINE__, __func__);
+            FE::raise<BasisNodeOrderingException>(SVMP_HERE,
+                "ReferenceNodeLayout: pyramid node ordering is disabled");
         default:
-            throw BasisNodeOrderingException("ReferenceNodeLayout: unknown element type",
-                                             __FILE__, __LINE__, __func__);
+            FE::raise<BasisNodeOrderingException>(SVMP_HERE,
+                "ReferenceNodeLayout: unknown element type");
     }
 }
 
@@ -383,10 +381,8 @@ std::vector<Point> element_nodes(ElementType elem_type) {
 math::Vector<Real, 3> ReferenceNodeLayout::get_node_coords(ElementType elem_type,
                                                            std::size_t local_node) {
     const auto nodes = element_nodes(elem_type);
-    if (local_node >= nodes.size()) {
-        throw BasisNodeOrderingException("ReferenceNodeLayout::get_node_coords: node index out of range",
-                                         __FILE__, __LINE__, __func__);
-    }
+    FE::throw_if<BasisNodeOrderingException>(local_node >= nodes.size(), SVMP_HERE,
+                                             "ReferenceNodeLayout::get_node_coords: node index out of range");
     return nodes[local_node];
 }
 
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index 30eac9c38..006d43fdc 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -115,11 +115,9 @@ std::vector<Vec3> quad_serendipity_nodes(int order, std::size_t total_size) {
         nodes.push_back(Vec3{Real(-1), Real(1) - Real(2 * i) * inv_order, Real(0)});
     }
 
-    if (nodes.size() > total_size) {
-        throw BasisConstructionException(
-            "SerendipityBasis: quadrilateral serendipity boundary nodes exceed requested size",
-            __FILE__, __LINE__, __func__);
-    }
+    FE::throw_if<BasisConstructionException>(
+        nodes.size() > total_size, SVMP_HERE,
+        "SerendipityBasis: quadrilateral serendipity boundary nodes exceed requested size");
 
     const std::size_t interior_count = total_size - nodes.size();
     if (interior_count == 0u) {
@@ -157,11 +155,9 @@ std::vector<Vec3> quad_serendipity_nodes(int order, std::size_t total_size) {
                   return a[0] < b[0];
               });
 
-    if (interior_count > interior_candidates.size()) {
-        throw BasisConstructionException(
-            "SerendipityBasis: insufficient quadrilateral interior nodes for requested serendipity order",
-            __FILE__, __LINE__, __func__);
-    }
+    FE::throw_if<BasisConstructionException>(
+        interior_count > interior_candidates.size(), SVMP_HERE,
+        "SerendipityBasis: insufficient quadrilateral interior nodes for requested serendipity order");
 
     nodes.insert(nodes.end(),
                  interior_candidates.begin(),
@@ -181,11 +177,9 @@ std::vector<Real> quad_serendipity_inverse_vandermonde(
     std::span<const std::array<int, 2>> exponents,
     int order) {
     const int n = static_cast<int>(nodes.size());
-    if (n == 0 || exponents.size() != nodes.size()) {
-        throw BasisConstructionException(
-            "SerendipityBasis: invalid quadrilateral serendipity interpolation setup",
-            __FILE__, __LINE__, __func__);
-    }
+    FE::throw_if<BasisConstructionException>(
+        n == 0 || exponents.size() != nodes.size(), SVMP_HERE,
+        "SerendipityBasis: invalid quadrilateral serendipity interpolation setup");
 
     std::vector<Real> vandermonde(static_cast<std::size_t>(n * n), Real(0));
     auto idx = [n](int row, int col) -> std::size_t {
@@ -499,19 +493,15 @@ SerendipityBasis::SerendipityBasis(ElementType type, int order, bool geometry_mo
         if (order_ < 1) {
             order_ = 1;
         }
-        if (type == ElementType::Quad8 && order_ != 2) {
-            throw BasisConfigurationException(
-                "SerendipityBasis: Quad8 is only valid for quadratic order 2; use Quad4 for higher-order quadrilateral serendipity",
-                __FILE__, __LINE__, __func__);
-        }
+        FE::throw_if<BasisConfigurationException>(
+            type == ElementType::Quad8 && order_ != 2, SVMP_HERE,
+            "SerendipityBasis: Quad8 is only valid for quadratic order 2; use Quad4 for higher-order quadrilateral serendipity");
         quad_monomial_exponents_ = quad_serendipity_exponents(order_);
         size_ = quad_monomial_exponents_.size();
         nodes_ = quad_serendipity_nodes(order_, size_);
-        if (nodes_.size() != size_) {
-            throw BasisConstructionException(
-                "SerendipityBasis: quadrilateral serendipity setup produced inconsistent sizes",
-                __FILE__, __LINE__, __func__);
-        }
+        FE::throw_if<BasisConstructionException>(
+            nodes_.size() != size_, SVMP_HERE,
+            "SerendipityBasis: quadrilateral serendipity setup produced inconsistent sizes");
         quad_inv_vandermonde_ = quad_serendipity_inverse_vandermonde(nodes_, quad_monomial_exponents_, order_);
     } else if (type == ElementType::Hex8 || type == ElementType::Hex20) {
         dimension_ = 3;
@@ -521,9 +511,8 @@ SerendipityBasis::SerendipityBasis(ElementType type, int order, bool geometry_mo
         } else if (order_ == 2) {
             size_ = 20;
         } else {
-            throw BasisConfigurationException(
-                "SerendipityBasis supports up to quadratic on hexahedra",
-                __FILE__, __LINE__, __func__);
+            FE::raise<BasisConfigurationException>(SVMP_HERE,
+                "SerendipityBasis supports up to quadratic on hexahedra");
         }
     } else if (type == ElementType::Wedge15) {
         dimension_ = 3;
@@ -533,13 +522,12 @@ SerendipityBasis::SerendipityBasis(ElementType type, int order, bool geometry_mo
         if (order_ == 2) {
             size_ = 15;
         } else {
-            throw BasisConfigurationException(
-                "SerendipityBasis supports up to quadratic on wedge15",
-                __FILE__, __LINE__, __func__);
+            FE::raise<BasisConfigurationException>(SVMP_HERE,
+                "SerendipityBasis supports up to quadratic on wedge15");
         }
     } else {
-        throw BasisElementCompatibilityException("SerendipityBasis supports Quad4/Quad8, Hex8/Hex20, and Wedge15 elements",
-                                                 __FILE__, __LINE__, __func__);
+        FE::raise<BasisElementCompatibilityException>(SVMP_HERE,
+            "SerendipityBasis supports Quad4/Quad8, Hex8/Hex20, and Wedge15 elements");
     }
 
     if (nodes_.empty()) {
@@ -573,12 +561,11 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,
     const Real z = xi[2];
 
     if (dimension_ == 2) {
-        if (quad_monomial_exponents_.size() != size_ ||
-            quad_inv_vandermonde_.size() != size_ * size_) {
-            throw BasisEvaluationException(
-                "SerendipityBasis: quadrilateral interpolation tables are not initialized for value evaluation",
-                __FILE__, __LINE__, __func__);
-        }
+        FE::throw_if<BasisEvaluationException>(
+            quad_monomial_exponents_.size() != size_ ||
+                quad_inv_vandermonde_.size() != size_ * size_,
+            SVMP_HERE,
+            "SerendipityBasis: quadrilateral interpolation tables are not initialized for value evaluation");
 
         for (std::size_t j = 0; j < size_; ++j) {
             const auto [ax, ay] = quad_monomial_exponents_[j];
@@ -632,8 +619,8 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,
 
     if (element_type_ == ElementType::Hex20) {
         const auto mesh_to_basis = ReferenceNodeLayout::mesh_to_basis_ordering(element_type_);
-        BASIS_CHECK_EVAL(mesh_to_basis.size() == size_,
-                         "Hex20 mesh-to-basis ordering is not registered");
+        FE::throw_if<BasisEvaluationException>(mesh_to_basis.size() != size_, SVMP_HERE,
+                                               "Hex20 mesh-to-basis ordering is not registered");
 
         if (values_out) {
             Real internal_vals[20];
@@ -681,8 +668,8 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,
         return;
     }
 
-    throw BasisEvaluationException("SerendipityBasis::evaluate_all_to: unsupported serendipity configuration",
-                                   __FILE__, __LINE__, __func__);
+    FE::raise<BasisEvaluationException>(SVMP_HERE,
+        "SerendipityBasis::evaluate_all_to: unsupported serendipity configuration");
 }
 
 void SerendipityBasis::evaluate_values(const math::Vector<Real, 3>& xi,

From 4819f595e920e95a88be0c5895be3e1d5dc055d0 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Tue, 9 Jun 2026 19:35:11 -0700
Subject: [PATCH 16/91] fixing doxygen layout to allow for visible topic
 sections since modules in doxygen are now reserved for c++ modules

---
 Documentation/DoxygenLayout.xml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Documentation/DoxygenLayout.xml b/Documentation/DoxygenLayout.xml
index df0146828..f056df891 100644
--- a/Documentation/DoxygenLayout.xml
+++ b/Documentation/DoxygenLayout.xml
@@ -3,7 +3,11 @@
   <!-- Navigation index tabs for HTML output -->
   <navindex>
     <!-- <tab type="pages" visible="yes" title="" intro=""/> -->
+    <!-- Doxygen <= 1.9.7 renders grouping pages via the "modules" tab; 1.9.8+
+         renamed them to "topics" and reuses "modules" for C++20 modules.
+         Declare both so the tab appears regardless of the doxygen version. -->
     <tab type="modules" visible="yes" title="" intro=""/>
+    <tab type="topics" visible="yes" title="" intro=""/>
     <tab type="namespaces" visible="yes" title="">
       <tab type="namespacelist" visible="yes" title="" intro=""/>
       <tab type="namespacemembers" visible="yes" title="" intro=""/>

From dfd5aff359ee357771550c228e303f58d5f862b2 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Tue, 9 Jun 2026 20:46:58 -0700
Subject: [PATCH 17/91] added topology evaluation helpers and cleaned up static
 cast helpers

---
 Code/Source/solver/FE/Basis/BasisFunction.cpp |   4 +-
 Code/Source/solver/FE/Basis/BasisFunction.h   |  14 +
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp | 361 +++++++++---------
 Code/Source/solver/FE/Basis/LagrangeBasis.h   |  17 +-
 .../solver/FE/Basis/SerendipityBasis.cpp      |  14 +-
 5 files changed, 215 insertions(+), 195 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisFunction.cpp b/Code/Source/solver/FE/Basis/BasisFunction.cpp
index 591f6751a..d847a9cca 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFunction.cpp
@@ -63,9 +63,7 @@ void BasisFunction::evaluate_gradients_to(const math::Vector<Real, 3>& xi,
     tmp.resize(size());
     evaluate_gradients(xi, tmp);
     for (std::size_t i = 0; i < tmp.size(); ++i) {
-        gradients_out[i * 3u + 0u] = tmp[i][0];
-        gradients_out[i * 3u + 1u] = tmp[i][1];
-        gradients_out[i * 3u + 2u] = tmp[i][2];
+        store_gradient(tmp[i], gradients_out + i * 3u);
     }
 }
 
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index e7de2bf01..832926199 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -49,6 +49,20 @@ using Hessian  = math::Matrix<Real, 3, 3>;
     return hessian;
 }
 
+inline void store_gradient(const Gradient& gradient, Real* dst) noexcept {
+    dst[0] = gradient[0];
+    dst[1] = gradient[1];
+    dst[2] = gradient[2];
+}
+
+[[nodiscard]] inline Gradient load_gradient(const Real* src) noexcept {
+    Gradient gradient;
+    gradient[0] = src[0];
+    gradient[1] = src[1];
+    gradient[2] = src[2];
+    return gradient;
+}
+
 inline void store_hessian(const Hessian& hessian, Real* dst) noexcept {
     dst[0] = hessian(0, 0);
     dst[1] = hessian(0, 1);
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index b32199d03..4ec970b86 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -7,6 +7,7 @@
 #include <algorithm>
 #include <array>
 #include <cmath>
+#include <limits>
 
 namespace svmp {
 namespace FE {
@@ -121,6 +122,9 @@ LagrangeBasis::SimplexExponent simplex_exponent_from_point(const Vec3& p,
     return e;
 }
 
+// Sentinel node index meaning "skip nothing" in product_excluding below.
+constexpr std::size_t kNoSkip = std::numeric_limits<std::size_t>::max();
+
 // Evaluate 1D Lagrange polynomials and derivatives at a point.
 void evaluate_1d_lagrange(Real x, const std::vector<Real>& nodes, AxisEval& out) {
     const std::size_t n = nodes.size();
@@ -134,6 +138,19 @@ void evaluate_1d_lagrange(Real x, const std::vector<Real>& nodes, AxisEval& out)
     }
 
     for (std::size_t i = 0; i < n; ++i) {
+        // Product of (x - nodes[j]) over all j except i and the listed skips.
+        // Each derivative order drops one additional factor from the product.
+        const auto product_excluding = [&](std::size_t skip1 = kNoSkip,
+                                           std::size_t skip2 = kNoSkip) {
+            Real product = Real(1);
+            for (std::size_t j = 0; j < n; ++j) {
+                if (j != i && j != skip1 && j != skip2) {
+                    product *= x - nodes[j];
+                }
+            }
+            return product;
+        };
+
         Real denom = Real(1);
         for (std::size_t j = 0; j < n; ++j) {
             if (j != i) {
@@ -141,26 +158,13 @@ void evaluate_1d_lagrange(Real x, const std::vector<Real>& nodes, AxisEval& out)
             }
         }
 
-        Real value = Real(1);
-        for (std::size_t j = 0; j < n; ++j) {
-            if (j != i) {
-                value *= x - nodes[j];
-            }
-        }
-        out.value[i] = value / denom;
+        out.value[i] = product_excluding() / denom;
 
         Real first = Real(0);
         for (std::size_t m = 0; m < n; ++m) {
-            if (m == i) {
-                continue;
+            if (m != i) {
+                first += product_excluding(m);
             }
-            Real product = Real(1);
-            for (std::size_t j = 0; j < n; ++j) {
-                if (j != i && j != m) {
-                    product *= x - nodes[j];
-                }
-            }
-            first += product;
         }
         out.first[i] = first / denom;
 
@@ -170,16 +174,9 @@ void evaluate_1d_lagrange(Real x, const std::vector<Real>& nodes, AxisEval& out)
                 continue;
             }
             for (std::size_t l = 0; l < n; ++l) {
-                if (l == i || l == m) {
-                    continue;
-                }
-                Real product = Real(1);
-                for (std::size_t j = 0; j < n; ++j) {
-                    if (j != i && j != m && j != l) {
-                        product *= x - nodes[j];
-                    }
+                if (l != i && l != m) {
+                    second += product_excluding(m, l);
                 }
-                second += product;
             }
         }
         out.second[i] = second / denom;
@@ -222,7 +219,7 @@ void evaluate_simplex(const Vec3& xi,
         return;
     }
 
-    const int bary_count = top == BasisTopology::Triangle ? 3 : 4;
+    const std::size_t bary_count = top == BasisTopology::Triangle ? 3u : 4u;
     std::array<Real, 4> lambda{Real(0), Real(0), Real(0), Real(0)};
     std::array<Gradient, 4> lambda_grad;
     lambda_grad.fill(Gradient::Zero());
@@ -246,48 +243,40 @@ void evaluate_simplex(const Vec3& xi,
 
     for (std::size_t i = 0; i < n; ++i) {
         std::array<std::array<Real, 3>, 4> f{};
-        for (int a = 0; a < bary_count; ++a) {
-            f[static_cast<std::size_t>(a)] =
-                simplex_factor(exponents[i][static_cast<std::size_t>(a)],
-                               lambda[static_cast<std::size_t>(a)],
-                               order);
+        for (std::size_t a = 0; a < bary_count; ++a) {
+            f[a] = simplex_factor(exponents[i][a], lambda[a], order);
         }
 
         Real value = Real(1);
-        for (int a = 0; a < bary_count; ++a) {
-            value *= f[static_cast<std::size_t>(a)][0];
+        for (std::size_t a = 0; a < bary_count; ++a) {
+            value *= f[a][0];
         }
         out.value[i] = value;
 
-        for (int a = 0; a < bary_count; ++a) {
-            Real product = f[static_cast<std::size_t>(a)][1];
-            for (int b = 0; b < bary_count; ++b) {
+        for (std::size_t a = 0; a < bary_count; ++a) {
+            Real product = f[a][1];
+            for (std::size_t b = 0; b < bary_count; ++b) {
                 if (b != a) {
-                    product *= f[static_cast<std::size_t>(b)][0];
+                    product *= f[b][0];
                 }
             }
             for (std::size_t c = 0; c < 3u; ++c) {
-                out.gradient[i][c] += product * lambda_grad[static_cast<std::size_t>(a)][c];
+                out.gradient[i][c] += product * lambda_grad[a][c];
             }
         }
 
-        for (int a = 0; a < bary_count; ++a) {
-            for (int b = 0; b < bary_count; ++b) {
-                Real product = (a == b)
-                    ? f[static_cast<std::size_t>(a)][2]
-                    : f[static_cast<std::size_t>(a)][1] *
-                      f[static_cast<std::size_t>(b)][1];
-                for (int c = 0; c < bary_count; ++c) {
-                    if (c != a && c != b) {
-                        product *= f[static_cast<std::size_t>(c)][0];
+        for (std::size_t a = 0; a < bary_count; ++a) {
+            for (std::size_t b = 0; b < bary_count; ++b) {
+                Real product = (a == b) ? f[a][2] : f[a][1] * f[b][1];
+                for (std::size_t k = 0; k < bary_count; ++k) {
+                    if (k != a && k != b) {
+                        product *= f[k][0];
                     }
                 }
                 for (std::size_t r = 0; r < 3u; ++r) {
                     for (std::size_t c = 0; c < 3u; ++c) {
                         out.hessian[i](r, c) +=
-                            product *
-                            lambda_grad[static_cast<std::size_t>(a)][r] *
-                            lambda_grad[static_cast<std::size_t>(b)][c];
+                            product * lambda_grad[a][r] * lambda_grad[b][c];
                     }
                 }
             }
@@ -295,13 +284,6 @@ void evaluate_simplex(const Vec3& xi,
     }
 }
 
-// Store a gradient in the flat buffer layout used by fast evaluators.
-void store_gradient(const Gradient& gradient, Real* dst) {
-    dst[0] = gradient[0];
-    dst[1] = gradient[1];
-    dst[2] = gradient[2];
-}
-
 } // namespace
 
 LagrangeBasis::LagrangeBasis(ElementType type, int order)
@@ -339,13 +321,9 @@ void LagrangeBasis::init_nodes() {
             build_point_nodes();
             return;
         case BasisTopology::Line:
-            build_tensor_product_nodes(1);
-            return;
         case BasisTopology::Quadrilateral:
-            build_tensor_product_nodes(2);
-            return;
         case BasisTopology::Hexahedron:
-            build_tensor_product_nodes(3);
+            build_tensor_product_nodes();
             return;
         case BasisTopology::Triangle:
         case BasisTopology::Tetrahedron:
@@ -368,17 +346,17 @@ void LagrangeBasis::build_point_nodes() {
 }
 
 // Build nodes and axis indices for tensor-product elements.
-void LagrangeBasis::build_tensor_product_nodes(int dimensions) {
+void LagrangeBasis::build_tensor_product_nodes() {
     init_equispaced_1d_nodes();
     nodes_ = ReferenceNodeLayout::get_lagrange_node_coords(element_type_, order_);
     tensor_indices_.reserve(nodes_.size());
     for (const auto& node : nodes_) {
         TensorNodeIndex idx{0u, 0u, 0u};
         idx[0] = axis_index_pm_one(node[0], order_);
-        if (dimensions >= 2) {
+        if (dimension_ >= 2) {
             idx[1] = axis_index_pm_one(node[1], order_);
         }
-        if (dimensions >= 3) {
+        if (dimension_ >= 3) {
             idx[2] = axis_index_pm_one(node[2], order_);
         }
         tensor_indices_.push_back(idx);
@@ -419,130 +397,159 @@ void LagrangeBasis::build_wedge_nodes() {
     }
 }
 
-// Evaluate requested basis quantities into caller-provided flat buffers.
-void LagrangeBasis::evaluate_all_to(const Vec3& xi,
-                                    Real* SVMP_RESTRICT values_out,
-                                    Real* SVMP_RESTRICT gradients_out,
-                                    Real* SVMP_RESTRICT hessians_out) const {
-    if (topology_ == BasisTopology::Point) {
+// Evaluate the constant point basis.
+void LagrangeBasis::evaluate_point_to(Real* SVMP_RESTRICT values_out,
+                                      Real* SVMP_RESTRICT gradients_out,
+                                      Real* SVMP_RESTRICT hessians_out) const {
+    if (values_out) {
+        values_out[0] = Real(1);
+    }
+    if (gradients_out) {
+        gradients_out[0] = gradients_out[1] = gradients_out[2] = Real(0);
+    }
+    if (hessians_out) {
+        std::fill_n(hessians_out, 9u, Real(0));
+    }
+}
+
+// Evaluate line, quadrilateral, and hexahedron bases as axis-polynomial products.
+void LagrangeBasis::evaluate_tensor_product_to(const Vec3& xi,
+                                               Real* SVMP_RESTRICT values_out,
+                                               Real* SVMP_RESTRICT gradients_out,
+                                               Real* SVMP_RESTRICT hessians_out) const {
+    AxisEval ax;
+    AxisEval ay;
+    AxisEval az;
+    evaluate_1d_lagrange(xi[0], nodes_1d_, ax);
+    if (dimension_ >= 2) {
+        evaluate_1d_lagrange(xi[1], nodes_1d_, ay);
+    }
+    if (dimension_ >= 3) {
+        evaluate_1d_lagrange(xi[2], nodes_1d_, az);
+    }
+
+    for (std::size_t node = 0; node < tensor_indices_.size(); ++node) {
+        const auto& idx = tensor_indices_[node];
+        const Real vx = ax.value[idx[0]];
+        const Real dx = ax.first[idx[0]];
+        const Real d2x = ax.second[idx[0]];
+        const Real vy = dimension_ >= 2 ? ay.value[idx[1]] : Real(1);
+        const Real dy = dimension_ >= 2 ? ay.first[idx[1]] : Real(0);
+        const Real d2y = dimension_ >= 2 ? ay.second[idx[1]] : Real(0);
+        const Real vz = dimension_ >= 3 ? az.value[idx[2]] : Real(1);
+        const Real dz = dimension_ >= 3 ? az.first[idx[2]] : Real(0);
+        const Real d2z = dimension_ >= 3 ? az.second[idx[2]] : Real(0);
+
         if (values_out) {
-            values_out[0] = Real(1);
+            values_out[node] = vx * vy * vz;
         }
         if (gradients_out) {
-            gradients_out[0] = gradients_out[1] = gradients_out[2] = Real(0);
+            Real* g = gradients_out + node * 3u;
+            g[0] = dx * vy * vz;
+            g[1] = vx * dy * vz;
+            g[2] = vx * vy * dz;
         }
         if (hessians_out) {
-            std::fill_n(hessians_out, 9u, Real(0));
+            Real* h = hessians_out + node * 9u;
+            h[0] = d2x * vy * vz;
+            h[1] = dx * dy * vz;
+            h[2] = dx * vy * dz;
+            h[3] = h[1];
+            h[4] = vx * d2y * vz;
+            h[5] = vx * dy * dz;
+            h[6] = h[2];
+            h[7] = h[5];
+            h[8] = vx * vy * d2z;
         }
-        return;
     }
+}
 
-    if (topology_ == BasisTopology::Line ||
-        topology_ == BasisTopology::Quadrilateral ||
-        topology_ == BasisTopology::Hexahedron) {
-        AxisEval ax;
-        AxisEval ay;
-        AxisEval az;
-        evaluate_1d_lagrange(xi[0], nodes_1d_, ax);
-        if (dimension_ >= 2) {
-            evaluate_1d_lagrange(xi[1], nodes_1d_, ay);
+// Evaluate triangle and tetrahedron bases from barycentric factors.
+void LagrangeBasis::evaluate_simplex_to(const Vec3& xi,
+                                        Real* SVMP_RESTRICT values_out,
+                                        Real* SVMP_RESTRICT gradients_out,
+                                        Real* SVMP_RESTRICT hessians_out) const {
+    SimplexEval simplex;
+    evaluate_simplex(xi, topology_, order_, simplex_exponents_, simplex);
+    for (std::size_t i = 0; i < simplex.value.size(); ++i) {
+        if (values_out) {
+            values_out[i] = simplex.value[i];
         }
-        if (dimension_ >= 3) {
-            evaluate_1d_lagrange(xi[2], nodes_1d_, az);
+        if (gradients_out) {
+            store_gradient(simplex.gradient[i], gradients_out + i * 3u);
         }
-
-        for (std::size_t node = 0; node < tensor_indices_.size(); ++node) {
-            const auto& idx = tensor_indices_[node];
-            const Real vx = ax.value[idx[0]];
-            const Real dx = ax.first[idx[0]];
-            const Real d2x = ax.second[idx[0]];
-            const Real vy = dimension_ >= 2 ? ay.value[idx[1]] : Real(1);
-            const Real dy = dimension_ >= 2 ? ay.first[idx[1]] : Real(0);
-            const Real d2y = dimension_ >= 2 ? ay.second[idx[1]] : Real(0);
-            const Real vz = dimension_ >= 3 ? az.value[idx[2]] : Real(1);
-            const Real dz = dimension_ >= 3 ? az.first[idx[2]] : Real(0);
-            const Real d2z = dimension_ >= 3 ? az.second[idx[2]] : Real(0);
-
-            if (values_out) {
-                values_out[node] = vx * vy * vz;
-            }
-            if (gradients_out) {
-                Real* g = gradients_out + node * 3u;
-                g[0] = dx * vy * vz;
-                g[1] = vx * dy * vz;
-                g[2] = vx * vy * dz;
-            }
-            if (hessians_out) {
-                Real* h = hessians_out + node * 9u;
-                h[0] = d2x * vy * vz;
-                h[1] = dx * dy * vz;
-                h[2] = dx * vy * dz;
-                h[3] = h[1];
-                h[4] = vx * d2y * vz;
-                h[5] = vx * dy * dz;
-                h[6] = h[2];
-                h[7] = h[5];
-                h[8] = vx * vy * d2z;
-            }
+        if (hessians_out) {
+            store_hessian(simplex.hessian[i], hessians_out + i * 9u);
         }
-        return;
     }
+}
 
-    if (topology_ == BasisTopology::Triangle || topology_ == BasisTopology::Tetrahedron) {
-        SimplexEval simplex;
-        evaluate_simplex(xi, topology_, order_, simplex_exponents_, simplex);
-        for (std::size_t i = 0; i < simplex.value.size(); ++i) {
-            if (values_out) {
-                values_out[i] = simplex.value[i];
-            }
-            if (gradients_out) {
-                store_gradient(simplex.gradient[i], gradients_out + i * 3u);
-            }
-            if (hessians_out) {
-                store_hessian(simplex.hessian[i], hessians_out + i * 9u);
-            }
+// Evaluate wedge bases as triangle/through-axis products.
+void LagrangeBasis::evaluate_wedge_to(const Vec3& xi,
+                                      Real* SVMP_RESTRICT values_out,
+                                      Real* SVMP_RESTRICT gradients_out,
+                                      Real* SVMP_RESTRICT hessians_out) const {
+    SimplexEval tri;
+    AxisEval z_axis;
+    evaluate_simplex(xi, BasisTopology::Triangle, order_, simplex_exponents_, tri);
+    evaluate_1d_lagrange(xi[2], nodes_1d_, z_axis);
+
+    for (std::size_t node = 0; node < wedge_indices_.size(); ++node) {
+        const auto [tri_idx, z_idx] = wedge_indices_[node];
+        const Real tv = tri.value[tri_idx];
+        const Real zv = z_axis.value[z_idx];
+        const Real dz = z_axis.first[z_idx];
+        const Real d2z = z_axis.second[z_idx];
+
+        if (values_out) {
+            values_out[node] = tv * zv;
+        }
+        if (gradients_out) {
+            Real* g = gradients_out + node * 3u;
+            g[0] = tri.gradient[tri_idx][0] * zv;
+            g[1] = tri.gradient[tri_idx][1] * zv;
+            g[2] = tv * dz;
+        }
+        if (hessians_out) {
+            Real* h = hessians_out + node * 9u;
+            const Hessian& th = tri.hessian[tri_idx];
+            const Gradient& tg = tri.gradient[tri_idx];
+            h[0] = th(0, 0) * zv;
+            h[1] = th(0, 1) * zv;
+            h[2] = tg[0] * dz;
+            h[3] = h[1];
+            h[4] = th(1, 1) * zv;
+            h[5] = tg[1] * dz;
+            h[6] = h[2];
+            h[7] = h[5];
+            h[8] = tv * d2z;
         }
-        return;
     }
+}
 
-    if (topology_ == BasisTopology::Wedge) {
-        SimplexEval tri;
-        AxisEval z_axis;
-        evaluate_simplex(xi, BasisTopology::Triangle, order_, simplex_exponents_, tri);
-        evaluate_1d_lagrange(xi[2], nodes_1d_, z_axis);
-
-        for (std::size_t node = 0; node < wedge_indices_.size(); ++node) {
-            const auto [tri_idx, z_idx] = wedge_indices_[node];
-            const Real tv = tri.value[tri_idx];
-            const Real zv = z_axis.value[z_idx];
-            const Real dz = z_axis.first[z_idx];
-            const Real d2z = z_axis.second[z_idx];
-
-            if (values_out) {
-                values_out[node] = tv * zv;
-            }
-            if (gradients_out) {
-                Real* g = gradients_out + node * 3u;
-                g[0] = tri.gradient[tri_idx][0] * zv;
-                g[1] = tri.gradient[tri_idx][1] * zv;
-                g[2] = tv * dz;
-            }
-            if (hessians_out) {
-                Real* h = hessians_out + node * 9u;
-                const Hessian& th = tri.hessian[tri_idx];
-                const Gradient& tg = tri.gradient[tri_idx];
-                h[0] = th(0, 0) * zv;
-                h[1] = th(0, 1) * zv;
-                h[2] = tg[0] * dz;
-                h[3] = h[1];
-                h[4] = th(1, 1) * zv;
-                h[5] = tg[1] * dz;
-                h[6] = h[2];
-                h[7] = h[5];
-                h[8] = tv * d2z;
-            }
-        }
-        return;
+// Evaluate requested basis quantities into caller-provided flat buffers.
+void LagrangeBasis::evaluate_all_to(const Vec3& xi,
+                                    Real* SVMP_RESTRICT values_out,
+                                    Real* SVMP_RESTRICT gradients_out,
+                                    Real* SVMP_RESTRICT hessians_out) const {
+    switch (topology_) {
+        case BasisTopology::Point:
+            evaluate_point_to(values_out, gradients_out, hessians_out);
+            return;
+        case BasisTopology::Line:
+        case BasisTopology::Quadrilateral:
+        case BasisTopology::Hexahedron:
+            evaluate_tensor_product_to(xi, values_out, gradients_out, hessians_out);
+            return;
+        case BasisTopology::Triangle:
+        case BasisTopology::Tetrahedron:
+            evaluate_simplex_to(xi, values_out, gradients_out, hessians_out);
+            return;
+        case BasisTopology::Wedge:
+            evaluate_wedge_to(xi, values_out, gradients_out, hessians_out);
+            return;
+        default:
+            break;
     }
 
     FE::raise<BasisEvaluationException>(SVMP_HERE,
@@ -561,9 +568,7 @@ void LagrangeBasis::evaluate_gradients(const Vec3& xi,
     std::vector<Real> flat(size() * 3u, Real(0));
     evaluate_gradients_to(xi, flat.data());
     for (std::size_t i = 0; i < size(); ++i) {
-        gradients[i][0] = flat[i * 3u + 0u];
-        gradients[i][1] = flat[i * 3u + 1u];
-        gradients[i][2] = flat[i * 3u + 2u];
+        gradients[i] = load_gradient(flat.data() + i * 3u);
     }
 }
 
@@ -588,9 +593,7 @@ void LagrangeBasis::evaluate_all(const Vec3& xi,
     std::vector<Real> flat_h(size() * 9u, Real(0));
     evaluate_all_to(xi, values.data(), flat_g.data(), flat_h.data());
     for (std::size_t i = 0; i < size(); ++i) {
-        gradients[i][0] = flat_g[i * 3u + 0u];
-        gradients[i][1] = flat_g[i * 3u + 1u];
-        gradients[i][2] = flat_g[i * 3u + 2u];
+        gradients[i] = load_gradient(flat_g.data() + i * 3u);
         hessians[i] = load_hessian(flat_h.data() + i * 9u);
     }
 }
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.h b/Code/Source/solver/FE/Basis/LagrangeBasis.h
index 3bb1a5e74..cd0ca6058 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.h
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.h
@@ -218,7 +218,7 @@ class LagrangeBasis : public BasisFunction {
 
     void init_nodes();
     void build_point_nodes();
-    void build_tensor_product_nodes(int dimensions);
+    void build_tensor_product_nodes();
     void build_simplex_nodes();
     void build_wedge_nodes();
     void init_equispaced_1d_nodes();
@@ -227,6 +227,21 @@ class LagrangeBasis : public BasisFunction {
                          Real* SVMP_RESTRICT values_out,
                          Real* SVMP_RESTRICT gradients_out,
                          Real* SVMP_RESTRICT hessians_out) const;
+    void evaluate_point_to(Real* SVMP_RESTRICT values_out,
+                           Real* SVMP_RESTRICT gradients_out,
+                           Real* SVMP_RESTRICT hessians_out) const;
+    void evaluate_tensor_product_to(const math::Vector<Real, 3>& xi,
+                                    Real* SVMP_RESTRICT values_out,
+                                    Real* SVMP_RESTRICT gradients_out,
+                                    Real* SVMP_RESTRICT hessians_out) const;
+    void evaluate_simplex_to(const math::Vector<Real, 3>& xi,
+                             Real* SVMP_RESTRICT values_out,
+                             Real* SVMP_RESTRICT gradients_out,
+                             Real* SVMP_RESTRICT hessians_out) const;
+    void evaluate_wedge_to(const math::Vector<Real, 3>& xi,
+                           Real* SVMP_RESTRICT values_out,
+                           Real* SVMP_RESTRICT gradients_out,
+                           Real* SVMP_RESTRICT hessians_out) const;
 };
 
 /// @}
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index 006d43fdc..fd5f99cbc 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -18,12 +18,6 @@ namespace basis {
 namespace {
 using Vec3 = math::Vector<Real, 3>;
 
-void store_gradient(const Gradient& gradient, Real* dst) {
-    dst[0] = gradient[0];
-    dst[1] = gradient[1];
-    dst[2] = gradient[2];
-}
-
 void evaluate_hex8_reference(Real r,
                              Real s,
                              Real t,
@@ -684,9 +678,7 @@ void SerendipityBasis::evaluate_gradients(const math::Vector<Real, 3>& xi,
     std::vector<Real> flat(size_ * 3u, Real(0));
     evaluate_gradients_to(xi, flat.data());
     for (std::size_t i = 0; i < size_; ++i) {
-        gradients[i][0] = flat[i * 3u + 0u];
-        gradients[i][1] = flat[i * 3u + 1u];
-        gradients[i][2] = flat[i * 3u + 2u];
+        gradients[i] = load_gradient(flat.data() + i * 3u);
     }
 }
 
@@ -711,9 +703,7 @@ void SerendipityBasis::evaluate_all(const math::Vector<Real, 3>& xi,
     std::vector<Real> flat_hessians(size_ * 9u, Real(0));
     evaluate_all_to(xi, values.data(), flat_gradients.data(), flat_hessians.data());
     for (std::size_t i = 0; i < size_; ++i) {
-        gradients[i][0] = flat_gradients[i * 3u + 0u];
-        gradients[i][1] = flat_gradients[i * 3u + 1u];
-        gradients[i][2] = flat_gradients[i * 3u + 2u];
+        gradients[i] = load_gradient(flat_gradients.data() + i * 3u);
         hessians[i] = load_hessian(flat_hessians.data() + i * 9u);
     }
 }

From ddb509ac88fc6b28147e464f2333e99b0c305b61 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 10 Jun 2026 11:50:24 -0700
Subject: [PATCH 18/91] aligning throw and raise to use function-template
 helpers for svmp

---
 Code/Source/solver/nn.cpp | 141 +++++++++++++++-----------------------
 1 file changed, 56 insertions(+), 85 deletions(-)

diff --git a/Code/Source/solver/nn.cpp b/Code/Source/solver/nn.cpp
index 60fcddf81..547310703 100644
--- a/Code/Source/solver/nn.cpp
+++ b/Code/Source/solver/nn.cpp
@@ -133,9 +133,8 @@ const febasis::BasisFunction& basis_for_solver_element(consts::ElementType eType
 
   const auto fe_type = to_fe_element_type(eType);
   if (!fe_type) {
-    throw febasis::BasisElementCompatibilityException(
-        "No FE Basis selection for solver element " + solver_element_name(eType),
-        __FILE__, __LINE__, __func__);
+    fe::raise<febasis::BasisElementCompatibilityException>(SVMP_HERE,
+        "No FE Basis selection for solver element " + solver_element_name(eType));
   }
 
   const std::lock_guard<std::mutex> lock(cache_mutex);
@@ -177,10 +176,9 @@ std::span<const std::size_t> solver_to_basis_node_map(consts::ElementType eType)
 std::size_t basis_index_for_solver_node(consts::ElementType eType, const int solver_node)
 {
   if (solver_node < 0) {
-    throw febasis::BasisNodeOrderingException(
+    fe::raise<febasis::BasisNodeOrderingException>(SVMP_HERE,
         "Solver node " + std::to_string(solver_node) +
-            " is outside node map for " + solver_element_name(eType),
-        __FILE__, __LINE__, __func__);
+            " is outside node map for " + solver_element_name(eType));
   }
 
   const auto node = static_cast<std::size_t>(solver_node);
@@ -191,10 +189,9 @@ std::size_t basis_index_for_solver_node(consts::ElementType eType, const int sol
   if (node < map.size()) {
     return map[node];
   }
-  throw febasis::BasisNodeOrderingException(
+  fe::raise<febasis::BasisNodeOrderingException>(SVMP_HERE,
       "Solver node " + std::to_string(solver_node) +
-          " is outside node map for " + solver_element_name(eType),
-      __FILE__, __LINE__, __func__);
+          " is outside node map for " + solver_element_name(eType));
 }
 
 fe::math::Vector<fe::Real, 3> make_basis_point(const febasis::BasisFunction& basis,
@@ -202,11 +199,10 @@ fe::math::Vector<fe::Real, 3> make_basis_point(const febasis::BasisFunction& bas
                                                const Array<double>& xi)
 {
   if (xi.nrows() < basis.dimension()) {
-    throw febasis::BasisConfigurationException(
+    fe::raise<febasis::BasisConfigurationException>(SVMP_HERE,
         "xi has " + std::to_string(xi.nrows()) +
             " rows but FE Basis element requires " + std::to_string(basis.dimension()) +
-            " reference coordinates",
-        __FILE__, __LINE__, __func__);
+            " reference coordinates");
   }
 
   // Inactive trailing components must be zero for lower-dimensional elements;
@@ -227,26 +223,23 @@ void copy_basis_values_to_solver_arrays(consts::ElementType eType,
                                         Array3<double>& Nx)
 {
   if (values.size() != static_cast<std::size_t>(eNoN)) {
-    throw febasis::BasisEvaluationException(
+    fe::raise<febasis::BasisEvaluationException>(SVMP_HERE,
         "FE Basis value count " + std::to_string(values.size()) +
-            " does not match solver eNoN " + std::to_string(eNoN),
-        __FILE__, __LINE__, __func__);
+            " does not match solver eNoN " + std::to_string(eNoN));
   }
   if (gradients.size() != static_cast<std::size_t>(eNoN)) {
-    throw febasis::BasisEvaluationException(
+    fe::raise<febasis::BasisEvaluationException>(SVMP_HERE,
         "FE Basis gradient count " + std::to_string(gradients.size()) +
-            " does not match solver eNoN " + std::to_string(eNoN),
-        __FILE__, __LINE__, __func__);
+            " does not match solver eNoN " + std::to_string(eNoN));
   }
 
   for (int a = 0; a < eNoN; ++a) {
     const auto basis_index = basis_index_for_solver_node(eType, a);
     if (basis_index >= values.size() || basis_index >= gradients.size()) {
-      throw febasis::BasisNodeOrderingException(
+      fe::raise<febasis::BasisNodeOrderingException>(SVMP_HERE,
           "Solver node " + std::to_string(a) + " maps to FE Basis node " +
               std::to_string(basis_index) + " outside basis output for " +
-              solver_element_name(eType),
-          __FILE__, __LINE__, __func__);
+              solver_element_name(eType));
     }
 
     N(a, g) = values[basis_index];
@@ -271,10 +264,9 @@ void evaluate_basis_values_and_gradients(const int insd,
 {
   const auto& basis = basis_for_solver_element(eType);
   if (insd < basis.dimension()) {
-    throw febasis::BasisConfigurationException(
+    fe::raise<febasis::BasisConfigurationException>(SVMP_HERE,
         "solver insd " + std::to_string(insd) +
-            " is smaller than FE Basis reference dimension " + std::to_string(basis.dimension()),
-        __FILE__, __LINE__, __func__);
+            " is smaller than FE Basis reference dimension " + std::to_string(basis.dimension()));
   }
 
   const auto point = make_basis_point(basis, g, xi);
@@ -309,9 +301,8 @@ int required_nxx_components_for_dimension(const int dimension)
     case 3:
       return 6;
     default:
-      throw febasis::BasisConfigurationException(
-          "Unsupported FE Basis reference dimension " + std::to_string(dimension),
-          __FILE__, __LINE__, __func__);
+      fe::raise<febasis::BasisConfigurationException>(SVMP_HERE,
+          "Unsupported FE Basis reference dimension " + std::to_string(dimension));
   }
 }
 
@@ -323,18 +314,16 @@ void copy_basis_hessians_to_solver_nxx(consts::ElementType eType,
                                        Array3<double>& Nxx)
 {
   if (hessians.size() != static_cast<std::size_t>(eNoN)) {
-    throw febasis::BasisEvaluationException(
+    fe::raise<febasis::BasisEvaluationException>(SVMP_HERE,
         "FE Basis Hessian count " + std::to_string(hessians.size()) +
-            " does not match solver eNoN " + std::to_string(eNoN),
-        __FILE__, __LINE__, __func__);
+            " does not match solver eNoN " + std::to_string(eNoN));
   }
 
   const int required_components = required_nxx_components_for_dimension(dimension);
   if (Nxx.nrows() < required_components) {
-    throw febasis::BasisConfigurationException(
+    fe::raise<febasis::BasisConfigurationException>(SVMP_HERE,
         "solver Nxx has " + std::to_string(Nxx.nrows()) +
-            " rows but FE Basis Hessian packing requires " + std::to_string(required_components),
-        __FILE__, __LINE__, __func__);
+            " rows but FE Basis Hessian packing requires " + std::to_string(required_components));
   }
 
   for (int a = 0; a < eNoN; ++a) {
@@ -344,11 +333,10 @@ void copy_basis_hessians_to_solver_nxx(consts::ElementType eType,
 
     const auto basis_index = basis_index_for_solver_node(eType, a);
     if (basis_index >= hessians.size()) {
-      throw febasis::BasisNodeOrderingException(
+      fe::raise<febasis::BasisNodeOrderingException>(SVMP_HERE,
           "Solver node " + std::to_string(a) + " maps to FE Basis Hessian node " +
               std::to_string(basis_index) + " outside basis output for " +
-              solver_element_name(eType),
-          __FILE__, __LINE__, __func__);
+              solver_element_name(eType));
     }
 
     const auto& hessian = hessians[basis_index];
@@ -376,18 +364,16 @@ void evaluate_basis_hessians(const int insd,
 {
   const auto& basis = basis_for_solver_element(eType);
   if (insd < basis.dimension()) {
-    throw febasis::BasisConfigurationException(
+    fe::raise<febasis::BasisConfigurationException>(SVMP_HERE,
         "solver insd " + std::to_string(insd) +
-            " is smaller than FE Basis reference dimension " + std::to_string(basis.dimension()),
-        __FILE__, __LINE__, __func__);
+            " is smaller than FE Basis reference dimension " + std::to_string(basis.dimension()));
   }
 
   const int required_components = required_nxx_components_for_dimension(basis.dimension());
   if (ind2 < required_components) {
-    throw febasis::BasisConfigurationException(
+    fe::raise<febasis::BasisConfigurationException>(SVMP_HERE,
         "solver ind2 " + std::to_string(ind2) +
-            " is smaller than packed Hessian component count " + std::to_string(required_components),
-        __FILE__, __LINE__, __func__);
+            " is smaller than packed Hessian component count " + std::to_string(required_components));
   }
 
   const auto point = make_basis_point(basis, gaus_pt, xi);
@@ -415,9 +401,9 @@ void get_gip(const int insd, consts::ElementType eType, const int nG, Vector<dou
   try {
     get_element_gauss_int_data[eType](insd, nG, w, xi);
   } catch (const std::bad_function_call& exception) {
-    throw fe::InvalidElementException(
+    fe::raise<fe::InvalidElementException>(SVMP_HERE,
         "No support in 'get_element_gauss_int_data'",
-        solver_element_name(eType), __FILE__, __LINE__, __func__);
+        solver_element_name(eType));
   }
 }
 
@@ -430,9 +416,9 @@ void get_gip(mshType& mesh)
   try {
     set_element_gauss_int_data[mesh.eType](mesh);
   } catch (const std::bad_function_call& exception) {
-    throw fe::InvalidElementException(
+    fe::raise<fe::InvalidElementException>(SVMP_HERE,
         "No support in 'set_element_gauss_int_data'",
-        solver_element_name(mesh.eType), __FILE__, __LINE__, __func__);
+        solver_element_name(mesh.eType));
   }
 }
 
@@ -441,9 +427,9 @@ void get_gip(Simulation* simulation, faceType& face)
   try {
     set_face_gauss_int_data[face.eType](face);
   } catch (const std::bad_function_call& exception) {
-    throw fe::InvalidElementException(
+    fe::raise<fe::InvalidElementException>(SVMP_HERE,
         "No support in 'set_face_gauss_int_data'",
-        solver_element_name(face.eType), __FILE__, __LINE__, __func__);
+        solver_element_name(face.eType));
   }
 }
 
@@ -453,9 +439,8 @@ void get_gnn(const int insd, consts::ElementType eType, const int eNoN, const in
     Array<double>& N, Array3<double>& Nx)
 {
   if (!use_basis_adapter_for(eType)) {
-    throw febasis::BasisElementCompatibilityException(
-        "[get_gnn] FE Basis does not support solver element " + solver_element_name(eType),
-        __FILE__, __LINE__, __func__);
+    fe::raise<febasis::BasisElementCompatibilityException>(SVMP_HERE,
+        "[get_gnn] FE Basis does not support solver element " + solver_element_name(eType));
   }
 
   evaluate_basis_values_and_gradients(insd, eType, eNoN, g, xi, N, Nx);
@@ -488,11 +473,8 @@ void get_gnn(Simulation* simulation, int gaus_pt, faceType& face)
 {
   using consts::ElementType;
 
-  if (face.eType == ElementType::NRB) {
-    throw fe::NotImplementedException(
-        "[get_gnn(face)] NRB face shape functions are unsupported by FE Basis",
-        __FILE__, __LINE__, __func__);
-  }
+  fe::throw_if<fe::NotImplementedException>(face.eType == ElementType::NRB, SVMP_HERE,
+      "[get_gnn(face)] NRB face shape functions are unsupported by FE Basis");
 
   if (face.eType == ElementType::PNT) {
     set_point_face_shape_data(gaus_pt, face);
@@ -505,9 +487,8 @@ void get_gnn(Simulation* simulation, int gaus_pt, faceType& face)
     return;
   }
 
-  throw febasis::BasisElementCompatibilityException(
-      "[get_gnn(face)] FE Basis does not support face element " + solver_element_name(face.eType),
-      __FILE__, __LINE__, __func__);
+  fe::raise<febasis::BasisElementCompatibilityException>(SVMP_HERE,
+      "[get_gnn(face)] FE Basis does not support face element " + solver_element_name(face.eType));
 }
 
 /// @brief Returns second order derivatives at given natural coords.
@@ -523,10 +504,9 @@ void get_gn_nxx(const int insd, const int ind2, consts::ElementType eType, const
   }
 
   if (!use_basis_adapter_for(eType)) {
-    throw febasis::BasisElementCompatibilityException(
+    fe::raise<febasis::BasisElementCompatibilityException>(SVMP_HERE,
         "[get_gn_nxx] FE Basis Hessian evaluation does not support solver element " +
-            solver_element_name(eType),
-        __FILE__, __LINE__, __func__);
+            solver_element_name(eType));
   }
 
   evaluate_basis_hessians(insd, ind2, eType, eNoN, gaus_pt, xi, Nxx);
@@ -713,11 +693,8 @@ void get_nnx(const int nsd, const consts::ElementType eType, const int eNoN, con
 
   l1 = (l1 && l2 && l3 && l4);
 
-  if (!l1) {
-    throw fe::InvalidArgumentException(
-        "Error in computing shape functions",
-        __FILE__, __LINE__, __func__);
-  }
+  fe::throw_if<fe::InvalidArgumentException>(!l1, SVMP_HERE,
+      "Error in computing shape functions");
 }
 
 /// @brief Inverse maps {xp} to {$\xi$} in an element with coordinates {xl} using Newton's method
@@ -965,11 +942,10 @@ void gnnb(const ComMod& com_mod, const faceType& lFa, const int e, const int g,
     }
 
     if (!found_node) {
-      throw fe::InvalidArgumentException(
+      fe::raise<fe::InvalidArgumentException>(SVMP_HERE,
           "[svMultiPhysics::gnnb] ERROR: The '" + lFa.name + "' face node " +
               std::to_string(Ac) + " could not be matched to a node in the '" +
-              msh.name + "' volume mesh.",
-          __FILE__, __LINE__, __func__);
+              msh.name + "' volume mesh.");
     }
 
     ptr(a) = b;
@@ -1018,9 +994,8 @@ void gnnb(const ComMod& com_mod, const faceType& lFa, const int e, const int g,
           }
           break;
         default:
-          throw fe::InvalidArgumentException(
-              "gnnb: invalid MechanicalConfigurationType provided",
-              __FILE__, __LINE__, __func__);
+          fe::raise<fe::InvalidArgumentException>(SVMP_HERE,
+              "gnnb: invalid MechanicalConfigurationType provided");
       }
     }
   }
@@ -1208,10 +1183,8 @@ void gn_nxx(const int l, const int eNoN, const int nsd, const int insd, Array<do
 
     dgesv_(&l, &eNoN, K.data(), &l, IPIV.data(), B.data(), &l, &INFO);
 
-    if (INFO != 0) {
-      throw fe::BackendException("[gn_nxx] Error in Lapack", "LAPACK dgesv", INFO,
-          __FILE__, __LINE__, __func__);
-    }
+    fe::throw_if<fe::BackendException>(INFO != 0, SVMP_HERE,
+        "[gn_nxx] Error in Lapack", "LAPACK dgesv", INFO);
 
     Nxx = B;
 
@@ -1280,10 +1253,8 @@ void gn_nxx(const int l, const int eNoN, const int nsd, const int insd, Array<do
 
     dgesv_(&l, &eNoN, K.data(), &l, IPIV.data(), B.data(), &l, &INFO);
 
-    if (INFO != 0) {
-      throw fe::BackendException("[gn_nxx] Error in Lapack", "LAPACK dgesv", INFO,
-          __FILE__, __LINE__, __func__);
-    }
+    fe::throw_if<fe::BackendException>(INFO != 0, SVMP_HERE,
+        "[gn_nxx] Error in Lapack", "LAPACK dgesv", INFO);
 
     Nxx = B;
   }
@@ -1330,10 +1301,10 @@ void select_ele(const ComMod& com_mod, mshType& mesh)
       set_1d_element_props[mesh.eNoN](insd, mesh);
     }
   } catch (const std::bad_function_call& exception) {
-      throw fe::InvalidElementException(
+      fe::raise<fe::InvalidElementException>(SVMP_HERE,
           "[select_ele] No support for " + std::to_string(mesh.eNoN) +
               " noded " + std::to_string(insd) + "D elements.",
-          solver_element_name(mesh.eType), __FILE__, __LINE__, __func__);
+          solver_element_name(mesh.eType));
   }
 
   // Set mesh 'w' and 'xi' arrays used for Gauss integration.
@@ -1389,10 +1360,10 @@ void select_eleb(Simulation* simulation, mshType& mesh, faceType& face)
   try {
     set_face_element_props[face.eNoN](insd, face);
   } catch (const std::bad_function_call& exception) {
-    throw fe::InvalidElementException(
+    fe::raise<fe::InvalidElementException>(SVMP_HERE,
         "No support for " + std::to_string(face.eNoN) + " noded " +
             std::to_string(insd) + "D elements in 'set_face_element_props'.",
-        solver_element_name(face.eType), __FILE__, __LINE__, __func__);
+        solver_element_name(face.eType));
   }
 
   // Set face 'w' and 'xi' arrays used for Gauss integration.

From 9d6266b0a268569fca104b82d38dcf1b0230e4f2 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Thu, 11 Jun 2026 09:41:23 -0700
Subject: [PATCH 19/91] improving doxygen documentation for the basis topic

---
 Code/Source/solver/FE/Basis/BasisFunction.h | 118 ++++++++++++++++++--
 Code/Source/solver/FE/FE.h                  |  22 ++++
 2 files changed, 132 insertions(+), 8 deletions(-)
 create mode 100644 Code/Source/solver/FE/FE.h

diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index 832926199..9b8e29aaa 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -12,16 +12,115 @@
 #include <cstddef>
 #include <vector>
 
-/// \defgroup FE FE Library
-/// \brief Finite-element interfaces and utilities used by the solver.
-///
-/// The FE library groups basis functions, math utilities, assembly interfaces,
-/// and related support code that can be built and consumed as a coherent
-/// finite-element component.
-
 /// \defgroup FE_Basis Basis
 /// \ingroup FE
 /// \brief Basis-function interfaces, concrete basis families, and reference-node conventions.
+///
+/// \details
+/// ## Scope
+///
+/// The Basis module owns reference-element shape functions. It provides the
+/// number of basis functions and the values and derivatives,
+/// \f$N_i\f$, \f$\partial N_i / \partial \xi_j\f$, and
+/// \f$\partial^2 N_i / \partial \xi_j \partial \xi_k\f$ at reference
+/// points. It does not own mesh storage, quadrature selection, field
+/// formulation policy, or transformation of derivatives to physical
+/// coordinates. Those decisions stay with the solver layer that has the mesh,
+/// material model, and equation context.
+///
+/// The main pieces are:
+/// - BasisFunction (BasisFunction.h): the abstract query and evaluation
+///   contract for code that does not need to know the concrete family.
+/// - \ref FE_LagrangeBasis "LagrangeBasis" and
+///   \ref FE_SerendipityBasis "SerendipityBasis": the implemented nodal
+///   families, including analytical first and second derivatives in reference
+///   coordinates.
+/// - basis_factory (BasisFactory.h): runtime construction from a BasisRequest.
+///   basis_factory::default_basis_request() centralizes the family/order that
+///   matches each supported element's public node layout.
+/// - ReferenceNodeLayout (NodeOrderingConventions.h): canonical reference-node
+///   coordinates and the output ordering used by every basis evaluator.
+/// - BasisTraits.h and BasisExceptions.h: topology classification,
+///   compile-time helpers, and module-specific exception types.
+///
+/// ## Object and evaluation contract
+///
+/// A basis object is immutable after construction. It represents one reference
+/// topology, basis family, and effective polynomial order, and can be shared
+/// safely across evaluations. Construction may build node lattices or invert
+/// interpolation matrices, so callers should construct through basis_factory
+/// and cache one instance for each distinct basis request instead of rebuilding
+/// inside element loops.
+///
+/// Every evaluator takes a three-component reference coordinate. For
+/// lower-dimensional elements, only the first dimension() components are
+/// active. Returned gradients always have three components and Hessians are
+/// always 3-by-3 matrices; inactive reference directions are expected to be
+/// zero for conforming lower-dimensional bases. The std::vector overloads are
+/// convenient for setup, tests, and adapter code. The *_to overloads write to
+/// caller-owned flat buffers and are the allocation-free path for assembly.
+///
+/// Outputs are in ReferenceNodeLayout basis order, not necessarily the mesh or
+/// solver's native node order. A caller that stores elements in another local
+/// ordering must apply the appropriate permutation at the boundary between the
+/// basis module and that storage format.
+///
+/// ## Inputs and ownership
+///
+/// Constructing and evaluating a basis combines several independent choices:
+///
+/// - **Element topology comes from the mesh.** The mesh cell type is translated
+///   to ElementType, which defines the reference topology and public node
+///   layout. This is structural information, not a complete discretization
+///   policy.
+/// - **Geometry interpolation follows the mesh nodes.** The basis used for the
+///   reference-to-physical map must be compatible with the element's node
+///   count and ordering. For that case, callers normally use
+///   basis_factory::create_default_for(element_type), which selects the
+///   Lagrange or serendipity space associated with that element layout. A
+///   Tetra10 mesh therefore implies a quadratic geometry map; a Hex20 mesh
+///   implies the supported Hex20 serendipity geometry basis.
+/// - **Field approximation is chosen by the formulation.** Field bases do not
+///   have to match the geometry map. Mixed formulations, stabilized methods,
+///   enrichment, and convergence studies may use different families or orders
+///   for different fields on the same mesh topology. Those bases should be
+///   requested explicitly with basis_factory::create() and a BasisRequest
+///   naming the desired family and order.
+/// - **Evaluation points come from the caller.** Quadrature rules, probe
+///   points, interpolation targets, and error-sampling locations are outside
+///   this module. The basis only evaluates at the reference coordinates it is
+///   given.
+///
+/// \dot "Basis inputs and responsibilities"
+/// digraph fe_basis_information_flow {
+///   rankdir=LR;
+///   node [shape=box, fontname=Helvetica, fontsize=10];
+///   mesh     [label="Mesh element type"];
+///   request  [label="BasisRequest\nfamily + order"];
+///   topology [label="Reference topology\nand node layout"];
+///   basis    [label="Basis object", style=filled, fillcolor=lightgray];
+///   points   [label="Reference points"];
+///   outputs  [label="Reference values\nand derivatives"];
+///   mesh -> topology;
+///   request -> basis;
+///   topology -> basis;
+///   basis -> outputs;
+///   points -> outputs;
+/// }
+/// \enddot
+///
+/// ## Reference scope and the solver adapter
+///
+/// The solver-facing adapter in nn.cpp is the boundary between this reference
+/// basis contract and legacy solver storage. It translates solver element
+/// enums to ElementType, obtains cached default bases for mesh/face shape
+/// tables, permutes from ReferenceNodeLayout order into solver node order, and
+/// stores N, Nx, and, where needed, packed Nxx at Gauss points. At that stage
+/// Nx and Nxx are still derivatives with respect to reference coordinates.
+/// Physical-coordinate derivatives are formed later, for a particular
+/// configuration and element geometry, by composing the cached reference data
+/// with the mapping Jacobian (nn::gnn for first derivatives and nn::gn_nxx for
+/// second derivatives).
 
 namespace svmp {
 namespace FE {
@@ -105,7 +204,10 @@ inline void add_scaled_hessian(Hessian& target,
 /// BasisFunction defines the common query and evaluation API used by solver
 /// code that does not need to know the concrete basis implementation. Derived
 /// classes provide values at minimum and can override analytical gradients,
-/// Hessians, combined evaluation, and flat-buffer output paths.
+/// Hessians, combined evaluation, and flat-buffer output paths. The interface
+/// is deliberately limited to reference-space quantities; callers own node
+/// ordering translation, physical mapping, and any field-level discretization
+/// policy.
 class BasisFunction {
 public:
     /// \brief Destroy a basis function through the abstract interface.
diff --git a/Code/Source/solver/FE/FE.h b/Code/Source/solver/FE/FE.h
new file mode 100644
index 000000000..1d3bba72b
--- /dev/null
+++ b/Code/Source/solver/FE/FE.h
@@ -0,0 +1,22 @@
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef SVMP_FE_FE_H
+#define SVMP_FE_FE_H
+
+/// \file FE.h
+/// \brief Library-level Doxygen group for the finite-element support code.
+///
+/// This header intentionally contains no declarations. It gives Doxygen a
+/// header-based home for the top-level FE group; submodule groups attach to it
+/// from their own headers, including FE_Basis (Basis/BasisFunction.h),
+/// FE_Common (Common/Types.h), and FE_Math (Math/Vector.h).
+
+/// \defgroup FE FE Library
+/// \brief Finite-element interfaces and utilities used by the solver.
+///
+/// The FE library groups basis functions, math utilities, assembly interfaces,
+/// and related support code that can be built and consumed as a coherent
+/// finite-element component.
+
+#endif // SVMP_FE_FE_H

From bd7c2ad86d687606319d768d9abd4ab85c997d63 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Mon, 15 Jun 2026 10:10:14 -0700
Subject: [PATCH 20/91] removing chrono guard from Eigen

---
 .../eigen/include/eigen3/unsupported/Eigen/CXX11/Tensor         | 2 --
 1 file changed, 2 deletions(-)

diff --git a/Code/ThirdParty/eigen/include/eigen3/unsupported/Eigen/CXX11/Tensor b/Code/ThirdParty/eigen/include/eigen3/unsupported/Eigen/CXX11/Tensor
index 45b176fe7..0938bb554 100644
--- a/Code/ThirdParty/eigen/include/eigen3/unsupported/Eigen/CXX11/Tensor
+++ b/Code/ThirdParty/eigen/include/eigen3/unsupported/Eigen/CXX11/Tensor
@@ -34,9 +34,7 @@
   */
 
 #include <atomic>
-#ifdef EIGEN_USE_GPU
 #include <chrono>
-#endif
 #include <cmath>
 #include <cstddef>
 #include <cstring>

From 282626996ec2332a78c3789207e8ba7fccb67d6d Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Mon, 15 Jun 2026 11:06:34 -0700
Subject: [PATCH 21/91] reverting chrono replacement code changes

---
 Code/Source/solver/Timer.h      | 8 +++-----
 Code/Source/solver/load_msh.cpp | 1 +
 Code/Source/solver/utils.cpp    | 8 +++-----
 tests/unitTests/test_common.h   | 1 +
 4 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/Code/Source/solver/Timer.h b/Code/Source/solver/Timer.h
index b8ffa29df..1a55d7516 100644
--- a/Code/Source/solver/Timer.h
+++ b/Code/Source/solver/Timer.h
@@ -4,7 +4,7 @@
 #ifndef TIMER_H 
 #define TIMER_H 
 
-#include <sys/time.h>
+#include <chrono>
 
 /// @brief Keep track of time
 class Timer 
@@ -18,10 +18,8 @@ class Timer
 
     double get_time() const
     {
-      timeval now{};
-      gettimeofday(&now, nullptr);
-      return static_cast<double>(now.tv_sec) +
-             static_cast<double>(now.tv_usec) * 1.0e-6;
+      const auto now = std::chrono::steady_clock::now();
+      return std::chrono::duration<double>(now.time_since_epoch()).count();
     }
 
     void set_time()
diff --git a/Code/Source/solver/load_msh.cpp b/Code/Source/solver/load_msh.cpp
index 50d0ca858..05648b52d 100644
--- a/Code/Source/solver/load_msh.cpp
+++ b/Code/Source/solver/load_msh.cpp
@@ -13,6 +13,7 @@
 #include <iostream>
 #include <fstream>
 #include <sstream>
+#include <chrono>
 #include <unordered_map>
 #include <string>
 #include <iomanip>
diff --git a/Code/Source/solver/utils.cpp b/Code/Source/solver/utils.cpp
index 0fb062e8f..fb7874f95 100644
--- a/Code/Source/solver/utils.cpp
+++ b/Code/Source/solver/utils.cpp
@@ -4,6 +4,7 @@
 #include "utils.h"
 
 #include <bitset>
+#include <chrono>
 #include <cmath> 
 #include <limits>
 
@@ -12,7 +13,6 @@
 #include <iostream>
 #include <fstream>
 #include <sys/resource.h>
-#include <sys/time.h>
 
 #include "FE/Common/FEException.h"
 
@@ -37,10 +37,8 @@ int CountBits(int n)
 
 double cput()
 {
-  timeval now{};
-  gettimeofday(&now, nullptr);
-  return static_cast<double>(now.tv_sec) +
-         static_cast<double>(now.tv_usec) * 1.0e-6;
+  const auto now = std::chrono::system_clock::now();
+  return std::chrono::duration<double>(now.time_since_epoch()).count();
 }
 
 Vector<double> 
diff --git a/tests/unitTests/test_common.h b/tests/unitTests/test_common.h
index 7227b2beb..ce6ffed4b 100644
--- a/tests/unitTests/test_common.h
+++ b/tests/unitTests/test_common.h
@@ -33,6 +33,7 @@
 #include <stdlib.h>
 #include <iostream>
 #include <random>
+#include <chrono>
 #include "CepMod.h"
 #include "ComMod.h"
 #include "gtest/gtest.h"

From f734094a5d5a9f175904866f2c49e05fcde01f48 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Mon, 15 Jun 2026 15:18:46 -0700
Subject: [PATCH 22/91] swapping out raw pointers for span support in the
 non-owning buffer access

---
 Code/Source/solver/FE/Basis/BasisFunction.cpp |  27 ++-
 Code/Source/solver/FE/Basis/BasisFunction.h   |  63 ++----
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp | 174 +++++++++-------
 Code/Source/solver/FE/Basis/LagrangeBasis.h   |  67 +++---
 .../solver/FE/Basis/SerendipityBasis.cpp      | 194 +++++++++---------
 .../Source/solver/FE/Basis/SerendipityBasis.h |  27 +--
 Code/Source/solver/FE/Common/Types.h          |  11 -
 .../solver/FE/Math/DenseTransformKernels.h    |  27 ++-
 .../FE/Basis/test_BasisErrorPaths.cpp         |  20 +-
 .../unitTests/FE/Basis/test_LagrangeBasis.cpp |  56 ++---
 10 files changed, 332 insertions(+), 334 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisFunction.cpp b/Code/Source/solver/FE/Basis/BasisFunction.cpp
index d847a9cca..1c8c31e5d 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFunction.cpp
@@ -4,6 +4,7 @@
 #include "BasisFunction.h"
 
 #include <algorithm>
+#include <string>
 
 namespace svmp {
 namespace FE {
@@ -22,6 +23,13 @@ BasisFunctionScratch& scratch() {
     return data;
 }
 
+void require_span_size(std::size_t actual,
+                       std::size_t expected,
+                       const char* label) {
+    FE::throw_if<BasisEvaluationException>(actual < expected, SVMP_HERE,
+        std::string("BasisFunction::") + label + ": output span is smaller than basis size");
+}
+
 } // namespace
 
 void BasisFunction::evaluate_gradients(const math::Vector<Real, 3>& xi,
@@ -50,31 +58,30 @@ void BasisFunction::evaluate_all(const math::Vector<Real, 3>& xi,
 }
 
 void BasisFunction::evaluate_values_to(const math::Vector<Real, 3>& xi,
-                                       Real* SVMP_RESTRICT values_out) const {
+                                       std::span<Real> values_out) const {
+    require_span_size(values_out.size(), size(), "evaluate_values_to");
     auto& tmp = scratch().values;
     tmp.resize(size());
     evaluate_values(xi, tmp);
-    std::copy_n(tmp.data(), tmp.size(), values_out);
+    std::copy_n(tmp.begin(), tmp.size(), values_out.begin());
 }
 
 void BasisFunction::evaluate_gradients_to(const math::Vector<Real, 3>& xi,
-                                          Real* SVMP_RESTRICT gradients_out) const {
+                                          std::span<Gradient> gradients_out) const {
+    require_span_size(gradients_out.size(), size(), "evaluate_gradients_to");
     auto& tmp = scratch().gradients;
     tmp.resize(size());
     evaluate_gradients(xi, tmp);
-    for (std::size_t i = 0; i < tmp.size(); ++i) {
-        store_gradient(tmp[i], gradients_out + i * 3u);
-    }
+    std::copy_n(tmp.begin(), tmp.size(), gradients_out.begin());
 }
 
 void BasisFunction::evaluate_hessians_to(const math::Vector<Real, 3>& xi,
-                                         Real* SVMP_RESTRICT hessians_out) const {
+                                         std::span<Hessian> hessians_out) const {
+    require_span_size(hessians_out.size(), size(), "evaluate_hessians_to");
     auto& tmp = scratch().hessians;
     tmp.resize(size());
     evaluate_hessians(xi, tmp);
-    for (std::size_t i = 0; i < tmp.size(); ++i) {
-        store_hessian(tmp[i], hessians_out + i * 9u);
-    }
+    std::copy_n(tmp.begin(), tmp.size(), hessians_out.begin());
 }
 
 void BasisFunction::numerical_gradient(const math::Vector<Real, 3>& xi,
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index 9b8e29aaa..8327ffda9 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -10,6 +10,7 @@
 #include "Types.h"
 
 #include <cstddef>
+#include <span>
 #include <vector>
 
 /// \defgroup FE_Basis Basis
@@ -58,7 +59,7 @@
 /// always 3-by-3 matrices; inactive reference directions are expected to be
 /// zero for conforming lower-dimensional bases. The std::vector overloads are
 /// convenient for setup, tests, and adapter code. The *_to overloads write to
-/// caller-owned flat buffers and are the allocation-free path for assembly.
+/// caller-owned spans and are the allocation-free path for assembly.
 ///
 /// Outputs are in ReferenceNodeLayout basis order, not necessarily the mesh or
 /// solver's native node order. A caller that stores elements in another local
@@ -148,46 +149,6 @@ using Hessian  = math::Matrix<Real, 3, 3>;
     return hessian;
 }
 
-inline void store_gradient(const Gradient& gradient, Real* dst) noexcept {
-    dst[0] = gradient[0];
-    dst[1] = gradient[1];
-    dst[2] = gradient[2];
-}
-
-[[nodiscard]] inline Gradient load_gradient(const Real* src) noexcept {
-    Gradient gradient;
-    gradient[0] = src[0];
-    gradient[1] = src[1];
-    gradient[2] = src[2];
-    return gradient;
-}
-
-inline void store_hessian(const Hessian& hessian, Real* dst) noexcept {
-    dst[0] = hessian(0, 0);
-    dst[1] = hessian(0, 1);
-    dst[2] = hessian(0, 2);
-    dst[3] = hessian(1, 0);
-    dst[4] = hessian(1, 1);
-    dst[5] = hessian(1, 2);
-    dst[6] = hessian(2, 0);
-    dst[7] = hessian(2, 1);
-    dst[8] = hessian(2, 2);
-}
-
-[[nodiscard]] inline Hessian load_hessian(const Real* src) noexcept {
-    Hessian hessian = Hessian::Zero();
-    hessian(0, 0) = src[0];
-    hessian(0, 1) = src[1];
-    hessian(0, 2) = src[2];
-    hessian(1, 0) = src[3];
-    hessian(1, 1) = src[4];
-    hessian(1, 2) = src[5];
-    hessian(2, 0) = src[6];
-    hessian(2, 1) = src[7];
-    hessian(2, 2) = src[8];
-    return hessian;
-}
-
 inline void add_scaled_hessian(Hessian& target,
                                const Hessian& source,
                                Real scale) noexcept {
@@ -204,7 +165,7 @@ inline void add_scaled_hessian(Hessian& target,
 /// BasisFunction defines the common query and evaluation API used by solver
 /// code that does not need to know the concrete basis implementation. Derived
 /// classes provide values at minimum and can override analytical gradients,
-/// Hessians, combined evaluation, and flat-buffer output paths. The interface
+/// Hessians, combined evaluation, and span output paths. The interface
 /// is deliberately limited to reference-space quantities; callers own node
 /// ordering translation, physical mapping, and any field-level discretization
 /// policy.
@@ -263,23 +224,23 @@ class BasisFunction {
                               std::vector<Gradient>& gradients,
                               std::vector<Hessian>& hessians) const;
 
-    /// \brief Evaluate basis values into a flat caller-provided buffer.
+    /// \brief Evaluate basis values into caller-provided storage.
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param values_out Output buffer with at least size() entries.
+    /// \param values_out Output span with at least size() entries.
     virtual void evaluate_values_to(const math::Vector<Real, 3>& xi,
-                                    Real* SVMP_RESTRICT values_out) const;
+                                    std::span<Real> values_out) const;
 
-    /// \brief Evaluate basis gradients into a flat caller-provided buffer.
+    /// \brief Evaluate basis gradients into caller-provided storage.
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param gradients_out Output buffer with node-major layout: node * 3 + component.
+    /// \param gradients_out Output span with at least size() entries.
     virtual void evaluate_gradients_to(const math::Vector<Real, 3>& xi,
-                                       Real* SVMP_RESTRICT gradients_out) const;
+                                       std::span<Gradient> gradients_out) const;
 
-    /// \brief Evaluate basis Hessians into a flat caller-provided buffer.
+    /// \brief Evaluate basis Hessians into caller-provided storage.
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param hessians_out Output buffer with node-major row-major layout: node * 9 + row * 3 + col.
+    /// \param hessians_out Output span with at least size() entries.
     virtual void evaluate_hessians_to(const math::Vector<Real, 3>& xi,
-                                      Real* SVMP_RESTRICT hessians_out) const;
+                                      std::span<Hessian> hessians_out) const;
 
 protected:
     /// \brief Approximate gradients by centered finite differences of values.
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index 4ec970b86..ab5e73ac7 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -8,6 +8,8 @@
 #include <array>
 #include <cmath>
 #include <limits>
+#include <span>
+#include <string>
 
 namespace svmp {
 namespace FE {
@@ -284,6 +286,22 @@ void evaluate_simplex(const Vec3& xi,
     }
 }
 
+void require_output_span_size(std::size_t actual,
+                              std::size_t expected,
+                              const char* label) {
+    FE::throw_if<BasisEvaluationException>(actual < expected, SVMP_HERE,
+        std::string(label) + ": output span is smaller than basis size");
+}
+
+template<typename T>
+void require_requested_span_size(std::span<T> output,
+                                 std::size_t expected,
+                                 const char* label) {
+    if (!output.empty()) {
+        require_output_span_size(output.size(), expected, label);
+    }
+}
+
 } // namespace
 
 LagrangeBasis::LagrangeBasis(ElementType type, int order)
@@ -398,25 +416,25 @@ void LagrangeBasis::build_wedge_nodes() {
 }
 
 // Evaluate the constant point basis.
-void LagrangeBasis::evaluate_point_to(Real* SVMP_RESTRICT values_out,
-                                      Real* SVMP_RESTRICT gradients_out,
-                                      Real* SVMP_RESTRICT hessians_out) const {
-    if (values_out) {
+void LagrangeBasis::evaluate_point_to(std::span<Real> values_out,
+                                      std::span<Gradient> gradients_out,
+                                      std::span<Hessian> hessians_out) const {
+    if (!values_out.empty()) {
         values_out[0] = Real(1);
     }
-    if (gradients_out) {
-        gradients_out[0] = gradients_out[1] = gradients_out[2] = Real(0);
+    if (!gradients_out.empty()) {
+        gradients_out[0] = Gradient::Zero();
     }
-    if (hessians_out) {
-        std::fill_n(hessians_out, 9u, Real(0));
+    if (!hessians_out.empty()) {
+        hessians_out[0] = Hessian::Zero();
     }
 }
 
 // Evaluate line, quadrilateral, and hexahedron bases as axis-polynomial products.
 void LagrangeBasis::evaluate_tensor_product_to(const Vec3& xi,
-                                               Real* SVMP_RESTRICT values_out,
-                                               Real* SVMP_RESTRICT gradients_out,
-                                               Real* SVMP_RESTRICT hessians_out) const {
+                                               std::span<Real> values_out,
+                                               std::span<Gradient> gradients_out,
+                                               std::span<Hessian> hessians_out) const {
     AxisEval ax;
     AxisEval ay;
     AxisEval az;
@@ -440,55 +458,55 @@ void LagrangeBasis::evaluate_tensor_product_to(const Vec3& xi,
         const Real dz = dimension_ >= 3 ? az.first[idx[2]] : Real(0);
         const Real d2z = dimension_ >= 3 ? az.second[idx[2]] : Real(0);
 
-        if (values_out) {
+        if (!values_out.empty()) {
             values_out[node] = vx * vy * vz;
         }
-        if (gradients_out) {
-            Real* g = gradients_out + node * 3u;
+        if (!gradients_out.empty()) {
+            Gradient& g = gradients_out[node];
             g[0] = dx * vy * vz;
             g[1] = vx * dy * vz;
             g[2] = vx * vy * dz;
         }
-        if (hessians_out) {
-            Real* h = hessians_out + node * 9u;
-            h[0] = d2x * vy * vz;
-            h[1] = dx * dy * vz;
-            h[2] = dx * vy * dz;
-            h[3] = h[1];
-            h[4] = vx * d2y * vz;
-            h[5] = vx * dy * dz;
-            h[6] = h[2];
-            h[7] = h[5];
-            h[8] = vx * vy * d2z;
+        if (!hessians_out.empty()) {
+            Hessian& h = hessians_out[node];
+            h(0, 0) = d2x * vy * vz;
+            h(0, 1) = dx * dy * vz;
+            h(0, 2) = dx * vy * dz;
+            h(1, 0) = h(0, 1);
+            h(1, 1) = vx * d2y * vz;
+            h(1, 2) = vx * dy * dz;
+            h(2, 0) = h(0, 2);
+            h(2, 1) = h(1, 2);
+            h(2, 2) = vx * vy * d2z;
         }
     }
 }
 
 // Evaluate triangle and tetrahedron bases from barycentric factors.
 void LagrangeBasis::evaluate_simplex_to(const Vec3& xi,
-                                        Real* SVMP_RESTRICT values_out,
-                                        Real* SVMP_RESTRICT gradients_out,
-                                        Real* SVMP_RESTRICT hessians_out) const {
+                                        std::span<Real> values_out,
+                                        std::span<Gradient> gradients_out,
+                                        std::span<Hessian> hessians_out) const {
     SimplexEval simplex;
     evaluate_simplex(xi, topology_, order_, simplex_exponents_, simplex);
     for (std::size_t i = 0; i < simplex.value.size(); ++i) {
-        if (values_out) {
+        if (!values_out.empty()) {
             values_out[i] = simplex.value[i];
         }
-        if (gradients_out) {
-            store_gradient(simplex.gradient[i], gradients_out + i * 3u);
+        if (!gradients_out.empty()) {
+            gradients_out[i] = simplex.gradient[i];
         }
-        if (hessians_out) {
-            store_hessian(simplex.hessian[i], hessians_out + i * 9u);
+        if (!hessians_out.empty()) {
+            hessians_out[i] = simplex.hessian[i];
         }
     }
 }
 
 // Evaluate wedge bases as triangle/through-axis products.
 void LagrangeBasis::evaluate_wedge_to(const Vec3& xi,
-                                      Real* SVMP_RESTRICT values_out,
-                                      Real* SVMP_RESTRICT gradients_out,
-                                      Real* SVMP_RESTRICT hessians_out) const {
+                                      std::span<Real> values_out,
+                                      std::span<Gradient> gradients_out,
+                                      std::span<Hessian> hessians_out) const {
     SimplexEval tri;
     AxisEval z_axis;
     evaluate_simplex(xi, BasisTopology::Triangle, order_, simplex_exponents_, tri);
@@ -501,37 +519,45 @@ void LagrangeBasis::evaluate_wedge_to(const Vec3& xi,
         const Real dz = z_axis.first[z_idx];
         const Real d2z = z_axis.second[z_idx];
 
-        if (values_out) {
+        if (!values_out.empty()) {
             values_out[node] = tv * zv;
         }
-        if (gradients_out) {
-            Real* g = gradients_out + node * 3u;
+        if (!gradients_out.empty()) {
+            Gradient& g = gradients_out[node];
             g[0] = tri.gradient[tri_idx][0] * zv;
             g[1] = tri.gradient[tri_idx][1] * zv;
             g[2] = tv * dz;
         }
-        if (hessians_out) {
-            Real* h = hessians_out + node * 9u;
+        if (!hessians_out.empty()) {
+            Hessian& h = hessians_out[node];
             const Hessian& th = tri.hessian[tri_idx];
             const Gradient& tg = tri.gradient[tri_idx];
-            h[0] = th(0, 0) * zv;
-            h[1] = th(0, 1) * zv;
-            h[2] = tg[0] * dz;
-            h[3] = h[1];
-            h[4] = th(1, 1) * zv;
-            h[5] = tg[1] * dz;
-            h[6] = h[2];
-            h[7] = h[5];
-            h[8] = tv * d2z;
+            h(0, 0) = th(0, 0) * zv;
+            h(0, 1) = th(0, 1) * zv;
+            h(0, 2) = tg[0] * dz;
+            h(1, 0) = h(0, 1);
+            h(1, 1) = th(1, 1) * zv;
+            h(1, 2) = tg[1] * dz;
+            h(2, 0) = h(0, 2);
+            h(2, 1) = h(1, 2);
+            h(2, 2) = tv * d2z;
         }
     }
 }
 
-// Evaluate requested basis quantities into caller-provided flat buffers.
+// Evaluate requested basis quantities into caller-provided spans.
 void LagrangeBasis::evaluate_all_to(const Vec3& xi,
-                                    Real* SVMP_RESTRICT values_out,
-                                    Real* SVMP_RESTRICT gradients_out,
-                                    Real* SVMP_RESTRICT hessians_out) const {
+                                    std::span<Real> values_out,
+                                    std::span<Gradient> gradients_out,
+                                    std::span<Hessian> hessians_out) const {
+    require_requested_span_size(values_out, size(), "LagrangeBasis::evaluate_all_to values");
+    require_requested_span_size(gradients_out, size(), "LagrangeBasis::evaluate_all_to gradients");
+    require_requested_span_size(hessians_out, size(), "LagrangeBasis::evaluate_all_to hessians");
+
+    if (values_out.empty() && gradients_out.empty() && hessians_out.empty()) {
+        return;
+    }
+
     switch (topology_) {
         case BasisTopology::Point:
             evaluate_point_to(values_out, gradients_out, hessians_out);
@@ -559,27 +585,19 @@ void LagrangeBasis::evaluate_all_to(const Vec3& xi,
 void LagrangeBasis::evaluate_values(const Vec3& xi,
                                     std::vector<Real>& values) const {
     values.resize(size());
-    evaluate_values_to(xi, values.data());
+    evaluate_values_to(xi, std::span<Real>(values.data(), values.size()));
 }
 
 void LagrangeBasis::evaluate_gradients(const Vec3& xi,
                                        std::vector<Gradient>& gradients) const {
     gradients.resize(size());
-    std::vector<Real> flat(size() * 3u, Real(0));
-    evaluate_gradients_to(xi, flat.data());
-    for (std::size_t i = 0; i < size(); ++i) {
-        gradients[i] = load_gradient(flat.data() + i * 3u);
-    }
+    evaluate_gradients_to(xi, std::span<Gradient>(gradients.data(), gradients.size()));
 }
 
 void LagrangeBasis::evaluate_hessians(const Vec3& xi,
                                       std::vector<Hessian>& hessians) const {
     hessians.resize(size());
-    std::vector<Real> flat(size() * 9u, Real(0));
-    evaluate_hessians_to(xi, flat.data());
-    for (std::size_t i = 0; i < size(); ++i) {
-        hessians[i] = load_hessian(flat.data() + i * 9u);
-    }
+    evaluate_hessians_to(xi, std::span<Hessian>(hessians.data(), hessians.size()));
 }
 
 void LagrangeBasis::evaluate_all(const Vec3& xi,
@@ -589,28 +607,28 @@ void LagrangeBasis::evaluate_all(const Vec3& xi,
     values.resize(size());
     gradients.resize(size());
     hessians.resize(size());
-    std::vector<Real> flat_g(size() * 3u, Real(0));
-    std::vector<Real> flat_h(size() * 9u, Real(0));
-    evaluate_all_to(xi, values.data(), flat_g.data(), flat_h.data());
-    for (std::size_t i = 0; i < size(); ++i) {
-        gradients[i] = load_gradient(flat_g.data() + i * 3u);
-        hessians[i] = load_hessian(flat_h.data() + i * 9u);
-    }
+    evaluate_all_to(xi,
+                    std::span<Real>(values.data(), values.size()),
+                    std::span<Gradient>(gradients.data(), gradients.size()),
+                    std::span<Hessian>(hessians.data(), hessians.size()));
 }
 
 void LagrangeBasis::evaluate_values_to(const Vec3& xi,
-                                       Real* SVMP_RESTRICT values_out) const {
-    evaluate_all_to(xi, values_out, nullptr, nullptr);
+                                       std::span<Real> values_out) const {
+    require_output_span_size(values_out.size(), size(), "LagrangeBasis::evaluate_values_to");
+    evaluate_all_to(xi, values_out, std::span<Gradient>{}, std::span<Hessian>{});
 }
 
 void LagrangeBasis::evaluate_gradients_to(const Vec3& xi,
-                                          Real* SVMP_RESTRICT gradients_out) const {
-    evaluate_all_to(xi, nullptr, gradients_out, nullptr);
+                                          std::span<Gradient> gradients_out) const {
+    require_output_span_size(gradients_out.size(), size(), "LagrangeBasis::evaluate_gradients_to");
+    evaluate_all_to(xi, std::span<Real>{}, gradients_out, std::span<Hessian>{});
 }
 
 void LagrangeBasis::evaluate_hessians_to(const Vec3& xi,
-                                         Real* SVMP_RESTRICT hessians_out) const {
-    evaluate_all_to(xi, nullptr, nullptr, hessians_out);
+                                         std::span<Hessian> hessians_out) const {
+    require_output_span_size(hessians_out.size(), size(), "LagrangeBasis::evaluate_hessians_to");
+    evaluate_all_to(xi, std::span<Real>{}, std::span<Gradient>{}, hessians_out);
 }
 
 } // namespace basis
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.h b/Code/Source/solver/FE/Basis/LagrangeBasis.h
index cd0ca6058..6137a557a 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.h
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.h
@@ -9,6 +9,7 @@
 
 #include <array>
 #include <cstddef>
+#include <span>
 
 namespace svmp {
 namespace FE {
@@ -58,8 +59,8 @@ namespace basis {
 /// \f$N_{a k}(r,s,t) = T_a(r,s)l_k(t)\f$.
 ///
 /// The vector-returning evaluators are convenient API wrappers. The `*_to`
-/// methods write to caller-provided flat buffers and are intended for assembly
-/// paths that avoid temporary allocations.
+/// methods write to caller-provided spans and are intended for assembly paths
+/// that avoid temporary allocations.
 class LagrangeBasis : public BasisFunction {
 public:
     /// \brief Axis-index tuple for tensor-product reference nodes.
@@ -171,38 +172,36 @@ class LagrangeBasis : public BasisFunction {
                       std::vector<Gradient>& gradients,
                       std::vector<Hessian>& hessians) const final;
 
-    /// \brief Evaluate Lagrange basis values into a flat caller-provided buffer.
+    /// \brief Evaluate Lagrange basis values into caller-provided storage.
     ///
     /// \details This is the low-allocation API intended for element assembly
-    /// loops. The buffer is filled in basis-node order and no vector resizing
-    /// is performed.
+    /// loops. The span is filled in basis-node order and no vector resizing is
+    /// performed.
     ///
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param values_out Output buffer with at least size() entries.
+    /// \param values_out Output span with at least size() entries.
     void evaluate_values_to(const math::Vector<Real, 3>& xi,
-                            Real* SVMP_RESTRICT values_out) const final;
+                            std::span<Real> values_out) const final;
 
-    /// \brief Evaluate Lagrange basis gradients into a flat caller-provided buffer.
+    /// \brief Evaluate Lagrange basis gradients into caller-provided storage.
     ///
-    /// \details Gradients are written in node-major order with three
-    /// reference-coordinate components per node. For node \f$i\f$ and component
-    /// \f$c\f$, the entry is `gradients_out[i * 3 + c]`.
+    /// \details Gradients are written in basis-node order with one
+    /// three-component gradient per node.
     ///
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param gradients_out Output buffer with node-major layout: node * 3 + component.
+    /// \param gradients_out Output span with at least size() entries.
     void evaluate_gradients_to(const math::Vector<Real, 3>& xi,
-                               Real* SVMP_RESTRICT gradients_out) const final;
+                               std::span<Gradient> gradients_out) const final;
 
-    /// \brief Evaluate Lagrange basis Hessians into a flat caller-provided buffer.
+    /// \brief Evaluate Lagrange basis Hessians into caller-provided storage.
     ///
-    /// \details Hessians are written in node-major row-major order. For node
-    /// \f$i\f$ and Hessian component \f$(r,c)\f$, the entry is
-    /// `hessians_out[i * 9 + r * 3 + c]`.
+    /// \details Hessians are written in basis-node order with one 3-by-3
+    /// Hessian per node.
     ///
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param hessians_out Output buffer with node-major row-major layout: node * 9 + row * 3 + col.
+    /// \param hessians_out Output span with at least size() entries.
     void evaluate_hessians_to(const math::Vector<Real, 3>& xi,
-                              Real* SVMP_RESTRICT hessians_out) const final;
+                              std::span<Hessian> hessians_out) const final;
 
 private:
     ElementType element_type_;
@@ -224,24 +223,24 @@ class LagrangeBasis : public BasisFunction {
     void init_equispaced_1d_nodes();
 
     void evaluate_all_to(const math::Vector<Real, 3>& xi,
-                         Real* SVMP_RESTRICT values_out,
-                         Real* SVMP_RESTRICT gradients_out,
-                         Real* SVMP_RESTRICT hessians_out) const;
-    void evaluate_point_to(Real* SVMP_RESTRICT values_out,
-                           Real* SVMP_RESTRICT gradients_out,
-                           Real* SVMP_RESTRICT hessians_out) const;
+                         std::span<Real> values_out,
+                         std::span<Gradient> gradients_out,
+                         std::span<Hessian> hessians_out) const;
+    void evaluate_point_to(std::span<Real> values_out,
+                           std::span<Gradient> gradients_out,
+                           std::span<Hessian> hessians_out) const;
     void evaluate_tensor_product_to(const math::Vector<Real, 3>& xi,
-                                    Real* SVMP_RESTRICT values_out,
-                                    Real* SVMP_RESTRICT gradients_out,
-                                    Real* SVMP_RESTRICT hessians_out) const;
+                                    std::span<Real> values_out,
+                                    std::span<Gradient> gradients_out,
+                                    std::span<Hessian> hessians_out) const;
     void evaluate_simplex_to(const math::Vector<Real, 3>& xi,
-                             Real* SVMP_RESTRICT values_out,
-                             Real* SVMP_RESTRICT gradients_out,
-                             Real* SVMP_RESTRICT hessians_out) const;
+                             std::span<Real> values_out,
+                             std::span<Gradient> gradients_out,
+                             std::span<Hessian> hessians_out) const;
     void evaluate_wedge_to(const math::Vector<Real, 3>& xi,
-                           Real* SVMP_RESTRICT values_out,
-                           Real* SVMP_RESTRICT gradients_out,
-                           Real* SVMP_RESTRICT hessians_out) const;
+                           std::span<Real> values_out,
+                           std::span<Gradient> gradients_out,
+                           std::span<Hessian> hessians_out) const;
 };
 
 /// @}
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index fd5f99cbc..ae505c2cf 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -21,9 +21,9 @@ using Vec3 = math::Vector<Real, 3>;
 void evaluate_hex8_reference(Real r,
                              Real s,
                              Real t,
-                             Real* values,
-                             Real* gradients,
-                             Real* hessians) {
+                             std::span<Real> values,
+                             std::span<Gradient> gradients,
+                             std::span<Hessian> hessians) {
     static constexpr int signs[8][3] = {
         {-1, -1, -1},
         { 1, -1, -1},
@@ -43,26 +43,26 @@ void evaluate_hex8_reference(Real r,
         const Real bs = Real(1) + b * s;
         const Real ct = Real(1) + c * t;
 
-        if (values) {
+        if (!values.empty()) {
             values[i] = Real(0.125) * ar * bs * ct;
         }
-        if (gradients) {
-            Real* g = gradients + i * 3u;
+        if (!gradients.empty()) {
+            Gradient& g = gradients[i];
             g[0] = Real(0.125) * a * bs * ct;
             g[1] = Real(0.125) * b * ar * ct;
             g[2] = Real(0.125) * c * ar * bs;
         }
-        if (hessians) {
-            Real* h = hessians + i * 9u;
-            h[0] = Real(0);
-            h[1] = Real(0.125) * a * b * ct;
-            h[2] = Real(0.125) * a * c * bs;
-            h[3] = h[1];
-            h[4] = Real(0);
-            h[5] = Real(0.125) * b * c * ar;
-            h[6] = h[2];
-            h[7] = h[5];
-            h[8] = Real(0);
+        if (!hessians.empty()) {
+            Hessian& h = hessians[i];
+            h(0, 0) = Real(0);
+            h(0, 1) = Real(0.125) * a * b * ct;
+            h(0, 2) = Real(0.125) * a * c * bs;
+            h(1, 0) = h(0, 1);
+            h(1, 1) = Real(0);
+            h(1, 2) = Real(0.125) * b * c * ar;
+            h(2, 0) = h(0, 2);
+            h(2, 1) = h(1, 2);
+            h(2, 2) = Real(0);
         }
     }
 }
@@ -262,7 +262,7 @@ inline std::array<Real, 3> quadratic_powers(Real x) {
     return {Real(1), x, x * x};
 }
 
-void eval_hex20_internal(Real r, Real s, Real t, Real* internal_vals) {
+void eval_hex20_internal(Real r, Real s, Real t, std::span<Real> internal_vals) {
     const auto rp = quadratic_powers(r);
     const auto sp = quadratic_powers(s);
     const auto tp = quadratic_powers(t);
@@ -284,7 +284,7 @@ void eval_hex20_internal(Real r, Real s, Real t, Real* internal_vals) {
     }
 }
 
-void eval_hex20_grad_internal(Real r, Real s, Real t, Gradient* internal_grads) {
+void eval_hex20_grad_internal(Real r, Real s, Real t, std::span<Gradient> internal_grads) {
     const auto rp = quadratic_powers(r);
     const auto sp = quadratic_powers(s);
     const auto tp = quadratic_powers(t);
@@ -321,7 +321,7 @@ void eval_hex20_grad_internal(Real r, Real s, Real t, Gradient* internal_grads)
     }
 }
 
-void eval_hex20_hess_internal(Real r, Real s, Real t, Hessian* internal_hessians) {
+void eval_hex20_hess_internal(Real r, Real s, Real t, std::span<Hessian> internal_hessians) {
     const auto rp = quadratic_powers(r);
     const auto sp = quadratic_powers(s);
     const auto tp = quadratic_powers(t);
@@ -384,9 +384,9 @@ void eval_hex20_hess_internal(Real r, Real s, Real t, Hessian* internal_hessians
 void eval_wedge15_polynomial(Real r,
                              Real s,
                              Real t,
-                             Real* values,
-                             Gradient* gradients,
-                             Hessian* hessians) {
+                             std::span<Real> values,
+                             std::span<Gradient> gradients,
+                             std::span<Hessian> hessians) {
     Real phi[15]{};
     Real dr[15]{};
     Real ds[15]{};
@@ -415,15 +415,15 @@ void eval_wedge15_polynomial(Real r,
         const Real sb = sp[bs];
         const Real tc = tp[ct];
 
-        if (values) {
+        if (!values.empty()) {
             phi[j] = ra * sb * tc;
         }
-        if (gradients) {
+        if (!gradients.empty()) {
             dr[j] = (a > 0) ? Real(a) * rp[ar - 1u] * sb * tc : Real(0);
             ds[j] = (b > 0) ? ra * Real(b) * sp[bs - 1u] * tc : Real(0);
             dt[j] = (c > 0) ? ra * sb * Real(c) * tp[ct - 1u] : Real(0);
         }
-        if (hessians) {
+        if (!hessians.empty()) {
             drr[j] = (a > 1) ? Real(a * (a - 1)) * rp[ar - 2u] * sb * tc : Real(0);
             dss[j] = (b > 1) ? ra * Real(b * (b - 1)) * sp[bs - 2u] * tc : Real(0);
             dtt[j] = (c > 1) ? ra * sb * Real(c * (c - 1)) * tp[ct - 2u] : Real(0);
@@ -442,15 +442,15 @@ void eval_wedge15_polynomial(Real r,
         for (int j = 0; j < 15; ++j) {
             const Real coefficient =
                 kWedge15Coefficients[static_cast<std::size_t>(j)][static_cast<std::size_t>(i)];
-            if (values) {
+            if (!values.empty()) {
                 value += coefficient * phi[j];
             }
-            if (gradients) {
+            if (!gradients.empty()) {
                 gr += coefficient * dr[j];
                 gs += coefficient * ds[j];
                 gt += coefficient * dt[j];
             }
-            if (hessians) {
+            if (!hessians.empty()) {
                 H(0, 0) += coefficient * drr[j];
                 H(1, 1) += coefficient * dss[j];
                 H(2, 2) += coefficient * dtt[j];
@@ -461,15 +461,15 @@ void eval_wedge15_polynomial(Real r,
         }
 
         const std::size_t index = static_cast<std::size_t>(i);
-        if (values) {
+        if (!values.empty()) {
             values[index] = value;
         }
-        if (gradients) {
+        if (!gradients.empty()) {
             gradients[index][0] = gr;
             gradients[index][1] = gs;
             gradients[index][2] = gt;
         }
-        if (hessians) {
+        if (!hessians.empty()) {
             H(1, 0) = H(0, 1);
             H(2, 0) = H(0, 2);
             H(2, 1) = H(1, 2);
@@ -478,6 +478,22 @@ void eval_wedge15_polynomial(Real r,
     }
 }
 
+void require_output_span_size(std::size_t actual,
+                              std::size_t expected,
+                              const char* label) {
+    FE::throw_if<BasisEvaluationException>(actual < expected, SVMP_HERE,
+        std::string(label) + ": output span is smaller than basis size");
+}
+
+template<typename T>
+void require_requested_span_size(std::span<T> output,
+                                 std::size_t expected,
+                                 const char* label) {
+    if (!output.empty()) {
+        require_output_span_size(output.size(), expected, label);
+    }
+}
+
 } // namespace
 
 SerendipityBasis::SerendipityBasis(ElementType type, int order, bool geometry_mode)
@@ -533,21 +549,25 @@ SerendipityBasis::SerendipityBasis(ElementType type, int order, bool geometry_mo
 }
 
 void SerendipityBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,
-                                       Real* SVMP_RESTRICT values_out,
-                                       Real* SVMP_RESTRICT gradients_out,
-                                       Real* SVMP_RESTRICT hessians_out) const {
-    if (!values_out && !gradients_out && !hessians_out) {
+                                       std::span<Real> values_out,
+                                       std::span<Gradient> gradients_out,
+                                       std::span<Hessian> hessians_out) const {
+    require_requested_span_size(values_out, size_, "SerendipityBasis::evaluate_all_to values");
+    require_requested_span_size(gradients_out, size_, "SerendipityBasis::evaluate_all_to gradients");
+    require_requested_span_size(hessians_out, size_, "SerendipityBasis::evaluate_all_to hessians");
+
+    if (values_out.empty() && gradients_out.empty() && hessians_out.empty()) {
         return;
     }
 
-    if (values_out) {
-        std::fill_n(values_out, size_, Real(0));
+    if (!values_out.empty()) {
+        std::fill(values_out.begin(), values_out.end(), Real(0));
     }
-    if (gradients_out) {
-        std::fill_n(gradients_out, size_ * 3u, Real(0));
+    if (!gradients_out.empty()) {
+        std::fill(gradients_out.begin(), gradients_out.end(), Gradient::Zero());
     }
-    if (hessians_out) {
-        std::fill_n(hessians_out, size_ * 9u, Real(0));
+    if (!hessians_out.empty()) {
+        std::fill(hessians_out.begin(), hessians_out.end(), Hessian::Zero());
     }
 
     const Real x = xi[0];
@@ -581,20 +601,20 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,
 
             for (std::size_t i = 0; i < size_; ++i) {
                 const Real coeff = quad_inv_vandermonde_[j * size_ + i];
-                if (values_out) {
+                if (!values_out.empty()) {
                     values_out[i] += value * coeff;
                 }
-                if (gradients_out) {
-                    Real* g = gradients_out + i * 3u;
+                if (!gradients_out.empty()) {
+                    Gradient& g = gradients_out[i];
                     g[0] += dx * coeff;
                     g[1] += dy * coeff;
                 }
-                if (hessians_out) {
-                    Real* h = hessians_out + i * 9u;
-                    h[0] += dxx * coeff;
-                    h[1] += dxy * coeff;
-                    h[3] += dxy * coeff;
-                    h[4] += dyy * coeff;
+                if (!hessians_out.empty()) {
+                    Hessian& h = hessians_out[i];
+                    h(0, 0) += dxx * coeff;
+                    h(0, 1) += dxy * coeff;
+                    h(1, 0) += dxy * coeff;
+                    h(1, 1) += dyy * coeff;
                 }
             }
         }
@@ -616,49 +636,37 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,
         FE::throw_if<BasisEvaluationException>(mesh_to_basis.size() != size_, SVMP_HERE,
                                                "Hex20 mesh-to-basis ordering is not registered");
 
-        if (values_out) {
-            Real internal_vals[20];
+        if (!values_out.empty()) {
+            std::array<Real, 20u> internal_vals{};
             eval_hex20_internal(x, y, z, internal_vals);
             for (std::size_t i = 0; i < 20u; ++i) {
                 values_out[i] = internal_vals[mesh_to_basis[i]];
             }
         }
-        if (gradients_out) {
-            Gradient internal_grads[20];
+        if (!gradients_out.empty()) {
+            std::array<Gradient, 20u> internal_grads{};
             eval_hex20_grad_internal(x, y, z, internal_grads);
             for (std::size_t i = 0; i < 20u; ++i) {
-                store_gradient(internal_grads[mesh_to_basis[i]], gradients_out + i * 3u);
+                gradients_out[i] = internal_grads[mesh_to_basis[i]];
             }
         }
-        if (hessians_out) {
-            Hessian internal_hessians[20];
+        if (!hessians_out.empty()) {
+            std::array<Hessian, 20u> internal_hessians{};
             eval_hex20_hess_internal(x, y, z, internal_hessians);
             for (std::size_t i = 0; i < 20u; ++i) {
-                store_hessian(internal_hessians[mesh_to_basis[i]], hessians_out + i * 9u);
+                hessians_out[i] = internal_hessians[mesh_to_basis[i]];
             }
         }
         return;
     }
 
     if (element_type_ == ElementType::Wedge15) {
-        std::array<Gradient, 15u> wedge_gradients{};
-        std::array<Hessian, 15u> wedge_hessians{};
         eval_wedge15_polynomial(x,
                                  y,
                                  z,
                                  values_out,
-                                 gradients_out ? wedge_gradients.data() : nullptr,
-                                 hessians_out ? wedge_hessians.data() : nullptr);
-        if (gradients_out) {
-            for (std::size_t i = 0; i < 15u; ++i) {
-                store_gradient(wedge_gradients[i], gradients_out + i * 3u);
-            }
-        }
-        if (hessians_out) {
-            for (std::size_t i = 0; i < 15u; ++i) {
-                store_hessian(wedge_hessians[i], hessians_out + i * 9u);
-            }
-        }
+                                 gradients_out,
+                                 hessians_out);
         return;
     }
 
@@ -669,27 +677,19 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,
 void SerendipityBasis::evaluate_values(const math::Vector<Real, 3>& xi,
                                        std::vector<Real>& values) const {
     values.resize(size_);
-    evaluate_values_to(xi, values.data());
+    evaluate_values_to(xi, std::span<Real>(values.data(), values.size()));
 }
 
 void SerendipityBasis::evaluate_gradients(const math::Vector<Real, 3>& xi,
                                           std::vector<Gradient>& gradients) const {
     gradients.resize(size_);
-    std::vector<Real> flat(size_ * 3u, Real(0));
-    evaluate_gradients_to(xi, flat.data());
-    for (std::size_t i = 0; i < size_; ++i) {
-        gradients[i] = load_gradient(flat.data() + i * 3u);
-    }
+    evaluate_gradients_to(xi, std::span<Gradient>(gradients.data(), gradients.size()));
 }
 
 void SerendipityBasis::evaluate_hessians(const math::Vector<Real, 3>& xi,
                                          std::vector<Hessian>& hessians) const {
     hessians.resize(size_);
-    std::vector<Real> flat(size_ * 9u, Real(0));
-    evaluate_hessians_to(xi, flat.data());
-    for (std::size_t i = 0; i < size_; ++i) {
-        hessians[i] = load_hessian(flat.data() + i * 9u);
-    }
+    evaluate_hessians_to(xi, std::span<Hessian>(hessians.data(), hessians.size()));
 }
 
 void SerendipityBasis::evaluate_all(const math::Vector<Real, 3>& xi,
@@ -699,28 +699,28 @@ void SerendipityBasis::evaluate_all(const math::Vector<Real, 3>& xi,
     values.resize(size_);
     gradients.resize(size_);
     hessians.resize(size_);
-    std::vector<Real> flat_gradients(size_ * 3u, Real(0));
-    std::vector<Real> flat_hessians(size_ * 9u, Real(0));
-    evaluate_all_to(xi, values.data(), flat_gradients.data(), flat_hessians.data());
-    for (std::size_t i = 0; i < size_; ++i) {
-        gradients[i] = load_gradient(flat_gradients.data() + i * 3u);
-        hessians[i] = load_hessian(flat_hessians.data() + i * 9u);
-    }
+    evaluate_all_to(xi,
+                    std::span<Real>(values.data(), values.size()),
+                    std::span<Gradient>(gradients.data(), gradients.size()),
+                    std::span<Hessian>(hessians.data(), hessians.size()));
 }
 
 void SerendipityBasis::evaluate_values_to(const math::Vector<Real, 3>& xi,
-                                          Real* SVMP_RESTRICT values_out) const {
-    evaluate_all_to(xi, values_out, nullptr, nullptr);
+                                          std::span<Real> values_out) const {
+    require_output_span_size(values_out.size(), size_, "SerendipityBasis::evaluate_values_to");
+    evaluate_all_to(xi, values_out, std::span<Gradient>{}, std::span<Hessian>{});
 }
 
 void SerendipityBasis::evaluate_gradients_to(const math::Vector<Real, 3>& xi,
-                                             Real* SVMP_RESTRICT gradients_out) const {
-    evaluate_all_to(xi, nullptr, gradients_out, nullptr);
+                                             std::span<Gradient> gradients_out) const {
+    require_output_span_size(gradients_out.size(), size_, "SerendipityBasis::evaluate_gradients_to");
+    evaluate_all_to(xi, std::span<Real>{}, gradients_out, std::span<Hessian>{});
 }
 
 void SerendipityBasis::evaluate_hessians_to(const math::Vector<Real, 3>& xi,
-                                            Real* SVMP_RESTRICT hessians_out) const {
-    evaluate_all_to(xi, nullptr, nullptr, hessians_out);
+                                            std::span<Hessian> hessians_out) const {
+    require_output_span_size(hessians_out.size(), size_, "SerendipityBasis::evaluate_hessians_to");
+    evaluate_all_to(xi, std::span<Real>{}, std::span<Gradient>{}, hessians_out);
 }
 
 } // namespace basis
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.h b/Code/Source/solver/FE/Basis/SerendipityBasis.h
index 9c55c8eec..e231ed833 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.h
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.h
@@ -12,6 +12,7 @@
 #include "BasisFunction.h"
 
 #include <array>
+#include <span>
 
 namespace svmp {
 namespace FE {
@@ -153,7 +154,7 @@ class SerendipityBasis final : public BasisFunction {
 
     /// \brief Evaluate serendipity values, gradients, and Hessians together.
     ///
-    /// \details This vector API is backed by the same flat-buffer evaluator as
+    /// \details This vector API is backed by the same span-based evaluator as
     /// the assembly-oriented `*_to` methods, so topology-specific polynomial
     /// setup can be shared for a quadrature point.
     ///
@@ -166,23 +167,23 @@ class SerendipityBasis final : public BasisFunction {
                       std::vector<Gradient>& gradients,
                       std::vector<Hessian>& hessians) const final;
 
-    /// \brief Evaluate serendipity basis values into a flat caller-provided buffer.
+    /// \brief Evaluate serendipity basis values into caller-provided storage.
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param values_out Output buffer with at least size() entries.
+    /// \param values_out Output span with at least size() entries.
     void evaluate_values_to(const math::Vector<Real, 3>& xi,
-                            Real* SVMP_RESTRICT values_out) const final;
+                            std::span<Real> values_out) const final;
 
-    /// \brief Evaluate serendipity basis gradients into a flat caller-provided buffer.
+    /// \brief Evaluate serendipity basis gradients into caller-provided storage.
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param gradients_out Output buffer with node-major layout: node * 3 + component.
+    /// \param gradients_out Output span with at least size() entries.
     void evaluate_gradients_to(const math::Vector<Real, 3>& xi,
-                               Real* SVMP_RESTRICT gradients_out) const final;
+                               std::span<Gradient> gradients_out) const final;
 
-    /// \brief Evaluate serendipity basis Hessians into a flat caller-provided buffer.
+    /// \brief Evaluate serendipity basis Hessians into caller-provided storage.
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param hessians_out Output buffer with node-major row-major layout: node * 9 + row * 3 + col.
+    /// \param hessians_out Output span with at least size() entries.
     void evaluate_hessians_to(const math::Vector<Real, 3>& xi,
-                              Real* SVMP_RESTRICT hessians_out) const final;
+                              std::span<Hessian> hessians_out) const final;
 
 private:
     ElementType element_type_;
@@ -199,9 +200,9 @@ class SerendipityBasis final : public BasisFunction {
     bool geometry_mode_;
 
     void evaluate_all_to(const math::Vector<Real, 3>& xi,
-                         Real* SVMP_RESTRICT values_out,
-                         Real* SVMP_RESTRICT gradients_out,
-                         Real* SVMP_RESTRICT hessians_out) const;
+                         std::span<Real> values_out,
+                         std::span<Gradient> gradients_out,
+                         std::span<Hessian> hessians_out) const;
 };
 
 /// @}
diff --git a/Code/Source/solver/FE/Common/Types.h b/Code/Source/solver/FE/Common/Types.h
index 1f57ffcc5..462b7ca76 100644
--- a/Code/Source/solver/FE/Common/Types.h
+++ b/Code/Source/solver/FE/Common/Types.h
@@ -58,17 +58,6 @@ enum class CellFamily {
 #include <type_traits>
 #include <limits>
 
-#if defined(_MSC_VER)
-/// Portable restrict qualifier for aliasing-free pointer parameters.
-#  define SVMP_RESTRICT __restrict
-#elif defined(__clang__) || defined(__GNUC__)
-/// Portable restrict qualifier for aliasing-free pointer parameters.
-#  define SVMP_RESTRICT __restrict__
-#else
-/// Portable restrict qualifier for aliasing-free pointer parameters.
-#  define SVMP_RESTRICT
-#endif
-
 /// \defgroup FE_Common Common
 /// \ingroup FE
 /// \brief Shared vocabulary types, constants, and exception infrastructure used by every FE module.
diff --git a/Code/Source/solver/FE/Math/DenseTransformKernels.h b/Code/Source/solver/FE/Math/DenseTransformKernels.h
index 2ddb9cefa..f6639dcd3 100644
--- a/Code/Source/solver/FE/Math/DenseTransformKernels.h
+++ b/Code/Source/solver/FE/Math/DenseTransformKernels.h
@@ -4,11 +4,13 @@
 #ifndef SVMP_FE_MATH_DENSETRANSFORMKERNELS_H
 #define SVMP_FE_MATH_DENSETRANSFORMKERNELS_H
 
+#include "FEException.h"
 #include "Types.h"
 
 #include <Eigen/Core>
 
 #include <cstddef>
+#include <span>
 
 namespace svmp {
 namespace FE {
@@ -22,18 +24,31 @@ namespace math {
 /// (row stride output_row_stride). Strides may exceed rhs_count for padded
 /// layouts; padding entries are left untouched.
 inline void dense_transform_batched_row_major(
-    const Real* SVMP_RESTRICT matrix,
+    std::span<const Real> matrix,
     std::size_t rows,
     std::size_t cols,
-    const Real* SVMP_RESTRICT input,
+    std::span<const Real> input,
     std::size_t input_row_stride,
-    Real* SVMP_RESTRICT output,
+    std::span<Real> output,
     std::size_t output_row_stride,
     std::size_t rhs_count) {
     if (rows == 0u || cols == 0u || rhs_count == 0u) {
         return;
     }
 
+    FE::throw_if<FEException>(matrix.size() < rows * cols, SVMP_HERE,
+                              "dense_transform_batched_row_major: matrix span is too small");
+    FE::throw_if<FEException>(input_row_stride < rhs_count, SVMP_HERE,
+                              "dense_transform_batched_row_major: input stride is smaller than RHS count");
+    FE::throw_if<FEException>(output_row_stride < rhs_count, SVMP_HERE,
+                              "dense_transform_batched_row_major: output stride is smaller than RHS count");
+    FE::throw_if<FEException>(
+        input.size() < (cols - 1u) * input_row_stride + rhs_count, SVMP_HERE,
+        "dense_transform_batched_row_major: input span is too small");
+    FE::throw_if<FEException>(
+        output.size() < (rows - 1u) * output_row_stride + rhs_count, SVMP_HERE,
+        "dense_transform_batched_row_major: output span is too small");
+
     using RowMajorMatrix =
         Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
     using ConstMap = Eigen::Map<const RowMajorMatrix>;
@@ -42,16 +57,16 @@ inline void dense_transform_batched_row_major(
     using StridedMap =
         Eigen::Map<RowMajorMatrix, Eigen::Unaligned, Eigen::OuterStride<>>;
 
-    const ConstMap matrix_map(matrix,
+    const ConstMap matrix_map(matrix.data(),
                               static_cast<Eigen::Index>(rows),
                               static_cast<Eigen::Index>(cols));
     const ConstStridedMap input_map(
-        input,
+        input.data(),
         static_cast<Eigen::Index>(cols),
         static_cast<Eigen::Index>(rhs_count),
         Eigen::OuterStride<>(static_cast<Eigen::Index>(input_row_stride)));
     StridedMap output_map(
-        output,
+        output.data(),
         static_cast<Eigen::Index>(rows),
         static_cast<Eigen::Index>(rhs_count),
         Eigen::OuterStride<>(static_cast<Eigen::Index>(output_row_stride)));
diff --git a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
index 60ca72114..edeca5ac5 100644
--- a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
@@ -255,29 +255,29 @@ TEST(BasisErrorPaths, NumericalDerivativeHelpersMatchAnalyticDerivatives) {
     }
 }
 
-TEST(BasisErrorPaths, BasisFunctionFallbackWritesRawLayouts) {
+TEST(BasisErrorPaths, BasisFunctionFallbackWritesSpanOutputs) {
     CompleteFallbackBasis basis;
     const math::Vector<Real, 3> point{Real(0.25), Real(0.5), Real(-0.25)};
 
-    std::vector<Real> flat_values(basis.size());
-    std::vector<Real> flat_gradients(basis.size() * 3u);
-    std::vector<Real> flat_hessians(basis.size() * 9u);
-    basis.evaluate_values_to(point, flat_values.data());
-    basis.evaluate_gradients_to(point, flat_gradients.data());
-    basis.evaluate_hessians_to(point, flat_hessians.data());
+    std::vector<Real> span_values(basis.size());
+    std::vector<Gradient> span_gradients(basis.size());
+    std::vector<Hessian> span_hessians(basis.size());
+    basis.evaluate_values_to(point, span_values);
+    basis.evaluate_gradients_to(point, span_gradients);
+    basis.evaluate_hessians_to(point, span_hessians);
 
     std::vector<Real> expected_values;
     std::vector<Gradient> expected_gradients;
     std::vector<Hessian> expected_hessians;
     basis.evaluate_all(point, expected_values, expected_gradients, expected_hessians);
     for (std::size_t d = 0; d < basis.size(); ++d) {
-        EXPECT_EQ(flat_values[d], expected_values[d]);
+        EXPECT_EQ(span_values[d], expected_values[d]);
         for (std::size_t c = 0; c < 3u; ++c) {
-            EXPECT_EQ(flat_gradients[d * 3u + c], expected_gradients[d][c]);
+            EXPECT_EQ(span_gradients[d][c], expected_gradients[d][c]);
         }
         for (std::size_t r = 0; r < 3u; ++r) {
             for (std::size_t c = 0; c < 3u; ++c) {
-                EXPECT_EQ(flat_hessians[d * 9u + r * 3u + c], expected_hessians[d](r, c));
+                EXPECT_EQ(span_hessians[d](r, c), expected_hessians[d](r, c));
             }
         }
     }
diff --git a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
index 8a1f43c58..68232d216 100644
--- a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
@@ -12,6 +12,7 @@
 
 #include <algorithm>
 #include <array>
+#include <span>
 #include <tuple>
 #include <vector>
 
@@ -116,27 +117,27 @@ void expect_partition_gradient_hessian_sums(const LagrangeBasis& basis,
     }
 }
 
-void expect_raw_sinks_match_vector_evaluation(const LagrangeBasis& basis,
-                                              const Point& xi)
+void expect_span_sinks_match_vector_evaluation(const LagrangeBasis& basis,
+                                               const Point& xi)
 {
     std::vector<Real> values;
     std::vector<Gradient> gradients;
     std::vector<Hessian> hessians;
     basis.evaluate_all(xi, values, gradients, hessians);
 
-    std::vector<Real> flat_values(basis.size());
-    std::vector<Real> flat_gradients(basis.size() * 3u);
-    std::vector<Real> flat_hessians(basis.size() * 9u);
-    basis.evaluate_values_to(xi, flat_values.data());
-    basis.evaluate_gradients_to(xi, flat_gradients.data());
-    basis.evaluate_hessians_to(xi, flat_hessians.data());
+    std::vector<Real> span_values(basis.size());
+    std::vector<Gradient> span_gradients(basis.size());
+    std::vector<Hessian> span_hessians(basis.size());
+    basis.evaluate_values_to(xi, span_values);
+    basis.evaluate_gradients_to(xi, span_gradients);
+    basis.evaluate_hessians_to(xi, span_hessians);
 
     for (std::size_t i = 0; i < basis.size(); ++i) {
-        EXPECT_NEAR(flat_values[i], values[i], Real(1e-14));
+        EXPECT_NEAR(span_values[i], values[i], Real(1e-14));
         for (std::size_t d = 0; d < 3u; ++d) {
-            EXPECT_NEAR(flat_gradients[i * 3u + d], gradients[i][d], Real(1e-14));
+            EXPECT_NEAR(span_gradients[i][d], gradients[i][d], Real(1e-14));
             for (std::size_t e = 0; e < 3u; ++e) {
-                EXPECT_NEAR(flat_hessians[i * 9u + d * 3u + e],
+                EXPECT_NEAR(span_hessians[i](d, e),
                             hessians[i](d, e),
                             Real(1e-14));
             }
@@ -251,10 +252,10 @@ TEST(LagrangeBasis, CanonicalTopologiesAreNodalAndPartitionUnity) {
     }
 }
 
-TEST(LagrangeBasis, RawOutputSinksMatchVectorEvaluationAcrossTopologies) {
+TEST(LagrangeBasis, SpanOutputSinksMatchVectorEvaluationAcrossTopologies) {
     for (const auto& c : canonical_cases()) {
         LagrangeBasis basis(c.type, c.order);
-        expect_raw_sinks_match_vector_evaluation(basis, c.points.front());
+        expect_span_sinks_match_vector_evaluation(basis, c.points.front());
     }
 }
 
@@ -461,19 +462,26 @@ TEST(LagrangeBasis, PointTopologyEvaluatesConstantUnity) {
         }
     }
 
-    Real flat_value = Real(-1);
-    Real flat_gradient[3] = {Real(-1), Real(-1), Real(-1)};
-    Real flat_hessian[9];
-    std::fill_n(flat_hessian, 9u, Real(-1));
-    basis.evaluate_values_to(xi, &flat_value);
-    basis.evaluate_gradients_to(xi, flat_gradient);
-    basis.evaluate_hessians_to(xi, flat_hessian);
-    EXPECT_EQ(flat_value, Real(1));
+    Real span_value = Real(-1);
+    Gradient span_gradient;
+    span_gradient[0] = span_gradient[1] = span_gradient[2] = Real(-1);
+    Hessian span_hessian;
     for (std::size_t d = 0; d < 3u; ++d) {
-        EXPECT_EQ(flat_gradient[d], Real(0));
+        for (std::size_t e = 0; e < 3u; ++e) {
+            span_hessian(d, e) = Real(-1);
+        }
+    }
+    basis.evaluate_values_to(xi, std::span<Real>(&span_value, 1u));
+    basis.evaluate_gradients_to(xi, std::span<Gradient>(&span_gradient, 1u));
+    basis.evaluate_hessians_to(xi, std::span<Hessian>(&span_hessian, 1u));
+    EXPECT_EQ(span_value, Real(1));
+    for (std::size_t d = 0; d < 3u; ++d) {
+        EXPECT_EQ(span_gradient[d], Real(0));
     }
-    for (std::size_t e = 0; e < 9u; ++e) {
-        EXPECT_EQ(flat_hessian[e], Real(0));
+    for (std::size_t d = 0; d < 3u; ++d) {
+        for (std::size_t e = 0; e < 3u; ++e) {
+            EXPECT_EQ(span_hessian(d, e), Real(0));
+        }
     }
 }
 

From 2b25acd93c2d9b605c411a7dd87e679d14ec5d7e Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 17 Jun 2026 10:11:26 -0700
Subject: [PATCH 23/91] clarifying the populate_reference_hessians function for
 the element face basis and canonicalizing exception calls

---
 Code/Source/solver/FE/Basis/BasisExceptions.h | 36 +++++-----
 Code/Source/solver/FE/Common/FEException.h    | 24 ++++++-
 .../solver/FE/Math/DenseLinearAlgebra.cpp     | 71 ++++++++++---------
 Code/Source/solver/cep_ion.cpp                |  2 +-
 Code/Source/solver/fs.cpp                     | 47 ++++--------
 Code/Source/solver/ionic_model.cpp            | 10 +--
 Code/Source/solver/mat_fun.h                  |  4 +-
 Code/Source/solver/nn.cpp                     |  6 +-
 Code/Source/solver/post.cpp                   |  4 +-
 Code/Source/solver/read_files.cpp             |  3 +-
 .../FE/Basis/test_BasisErrorPaths.cpp         | 53 +++++++++++++-
 11 files changed, 157 insertions(+), 103 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisExceptions.h b/Code/Source/solver/FE/Basis/BasisExceptions.h
index 8f8fd3c3c..e4afc3153 100644
--- a/Code/Source/solver/FE/Basis/BasisExceptions.h
+++ b/Code/Source/solver/FE/Basis/BasisExceptions.h
@@ -16,9 +16,9 @@ namespace basis {
 class BasisException : public FEException {
 public:
     BasisException(const std::string& message,
-                   const char* file = "",
-                   int line = 0,
-                   const char* function = "",
+                   const char* file,
+                   int line,
+                   const char* function,
                    StatusCode status = StatusCode::Unknown)
         : FEException(message, status, file, line, function) {}
 };
@@ -29,9 +29,9 @@ class BasisException : public FEException {
 class BasisConfigurationException : public BasisException {
 public:
     BasisConfigurationException(const std::string& message,
-                                const char* file = "",
-                                int line = 0,
-                                const char* function = "")
+                                const char* file,
+                                int line,
+                                const char* function)
         : BasisException(message, file, line, function, StatusCode::InvalidArgument) {}
 };
 
@@ -41,9 +41,9 @@ class BasisConfigurationException : public BasisException {
 class BasisElementCompatibilityException : public BasisException {
 public:
     BasisElementCompatibilityException(const std::string& message,
-                                       const char* file = "",
-                                       int line = 0,
-                                       const char* function = "")
+                                       const char* file,
+                                       int line,
+                                       const char* function)
         : BasisException(message, file, line, function, StatusCode::InvalidArgument) {}
 };
 
@@ -53,9 +53,9 @@ class BasisElementCompatibilityException : public BasisException {
 class BasisEvaluationException : public BasisException {
 public:
     BasisEvaluationException(const std::string& message,
-                             const char* file = "",
-                             int line = 0,
-                             const char* function = "")
+                             const char* file,
+                             int line,
+                             const char* function)
         : BasisException(message, file, line, function, StatusCode::InvalidArgument) {}
 };
 
@@ -65,9 +65,9 @@ class BasisEvaluationException : public BasisException {
 class BasisNodeOrderingException : public BasisException {
 public:
     BasisNodeOrderingException(const std::string& message,
-                               const char* file = "",
-                               int line = 0,
-                               const char* function = "")
+                               const char* file,
+                               int line,
+                               const char* function)
         : BasisException(message, file, line, function, StatusCode::InvalidArgument) {}
 };
 
@@ -77,9 +77,9 @@ class BasisNodeOrderingException : public BasisException {
 class BasisConstructionException : public BasisException {
 public:
     BasisConstructionException(const std::string& message,
-                               const char* file = "",
-                               int line = 0,
-                               const char* function = "")
+                               const char* file,
+                               int line,
+                               const char* function)
         : BasisException(message, file, line, function, StatusCode::InternalError) {}
 };
 
diff --git a/Code/Source/solver/FE/Common/FEException.h b/Code/Source/solver/FE/Common/FEException.h
index 033b85eb1..84192e293 100644
--- a/Code/Source/solver/FE/Common/FEException.h
+++ b/Code/Source/solver/FE/Common/FEException.h
@@ -33,6 +33,21 @@ namespace FE {
 /// actionable diagnostics. The free helper templates raise(), throw_if(),
 /// check_arg(), check_not_null(), and check_index() wrap common validation
 /// patterns with source-location capture.
+///
+/// Canonical FE code should throw through this helper layer instead of calling
+/// the core ::svmp helpers or constructing exceptions directly:
+///
+/// \code
+/// FE::raise<ExceptionT>(SVMP_HERE, message);
+/// FE::throw_if<ExceptionT>(failure_condition, SVMP_HERE, message);
+/// FE::check_arg<ExceptionT>(valid_condition, SVMP_HERE, message);
+/// FE::check_not_null<ExceptionT>(ptr, SVMP_HERE, message);
+/// FE::check_index<ExceptionT>(index, size, SVMP_HERE);
+/// FE::not_implemented(feature, SVMP_HERE);
+/// \endcode
+///
+/// throw_if() is failure-condition based. check_arg() is
+/// success-condition based.
 /// @{
 
 /**
@@ -466,8 +481,9 @@ inline void throw_if(bool condition, SourceLocation location, Args&&... args)
 template <class ExceptionT = InvalidArgumentException, class... Args>
 inline void check_arg(bool condition, SourceLocation location, Args&&... args)
 {
-    ::svmp::check_arg<ExceptionT>(condition, location,
-                                  std::forward<Args>(args)...);
+    if (!condition) {
+        ::svmp::FE::raise<ExceptionT>(location, std::forward<Args>(args)...);
+    }
 }
 
 /**
@@ -484,7 +500,9 @@ template <class ExceptionT = InvalidArgumentException, class PointerT,
 inline void check_not_null(PointerT ptr, SourceLocation location,
                            Args&&... args)
 {
-    ::svmp::check_not_null<ExceptionT>(ptr, location, std::forward<Args>(args)...);
+    if (ptr == nullptr) {
+        ::svmp::FE::raise<ExceptionT>(location, std::forward<Args>(args)...);
+    }
 }
 
 /**
diff --git a/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp b/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
index fb27ad7bf..c5ed67bec 100644
--- a/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
+++ b/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
@@ -13,9 +13,6 @@
 #include <string>
 #include <utility>
 
-#define DENSE_LINALG_CHECK(condition, message) \
-    ::svmp::FE::throw_if<::svmp::FE::FEException>(!(condition), SVMP_HERE, (message))
-
 namespace svmp {
 namespace FE {
 namespace math {
@@ -86,12 +83,15 @@ void DenseLUSolver::solve_in_place(std::span<Real> rhs) const {
 
 void DenseLUSolver::solve_in_place(std::span<Real> rhs,
                                    std::size_t rhs_count) const {
-    DENSE_LINALG_CHECK(rhs_count > 0,
-                             label + ": dense solve requires at least one right-hand side");
-    DENSE_LINALG_CHECK(rhs.size() == n * rhs_count,
-                             label + ": dense multi-RHS solve size mismatch");
-    DENSE_LINALG_CHECK(lu.rows() == static_cast<Eigen::Index>(n),
-                             label + ": dense solver is not factorized");
+    ::svmp::FE::check_arg<FEException>(
+        rhs_count > 0, SVMP_HERE,
+        label + ": dense solve requires at least one right-hand side");
+    ::svmp::FE::check_arg<FEException>(
+        rhs.size() == n * rhs_count, SVMP_HERE,
+        label + ": dense multi-RHS solve size mismatch");
+    ::svmp::FE::check_arg<FEException>(
+        lu.rows() == static_cast<Eigen::Index>(n), SVMP_HERE,
+        label + ": dense solver is not factorized");
     if (n == 0) {
         return;
     }
@@ -115,10 +115,12 @@ DenseMatrixDiagnostics dense_matrix_diagnostics(
     std::size_t rows,
     std::size_t cols,
     std::string_view label) {
-    DENSE_LINALG_CHECK(matrix.size() == rows * cols,
-                             std::string(label) + ": diagnostic size mismatch");
-    DENSE_LINALG_CHECK(rows > 0 && cols > 0,
-                             std::string(label) + ": diagnostics require a nonempty matrix");
+    ::svmp::FE::check_arg<FEException>(
+        matrix.size() == rows * cols, SVMP_HERE,
+        std::string(label) + ": diagnostic size mismatch");
+    ::svmp::FE::check_arg<FEException>(
+        rows > 0 && cols > 0, SVMP_HERE,
+        std::string(label) + ": diagnostics require a nonempty matrix");
 
     const DenseMatrix dense = map_row_major(matrix, rows, cols);
     Eigen::JacobiSVD<DenseMatrix> svd(dense);
@@ -153,8 +155,9 @@ DenseMatrixDiagnostics dense_matrix_diagnostics(
 DenseLUSolver factor_dense_matrix(std::vector<Real> matrix,
                                   std::size_t n,
                                   std::string_view label) {
-    DENSE_LINALG_CHECK(matrix.size() == n * n,
-                             std::string(label) + ": dense factorization size mismatch");
+    ::svmp::FE::check_arg<FEException>(
+        matrix.size() == n * n, SVMP_HERE,
+        std::string(label) + ": dense factorization size mismatch");
 
     DenseLUSolver solver;
     solver.n = n;
@@ -172,8 +175,8 @@ DenseLUSolver factor_dense_matrix(std::vector<Real> matrix,
     const auto diagonal = solver.lu.matrixLU().diagonal();
     for (Eigen::Index col = 0; col < diagonal.size(); ++col) {
         const Real pivot_magnitude = std::abs(diagonal[col]);
-        DENSE_LINALG_CHECK(
-            pivot_magnitude > solver.pivot_tolerance,
+        ::svmp::FE::check_arg<FEException>(
+            pivot_magnitude > solver.pivot_tolerance, SVMP_HERE,
             solver.label + ": rank-deficient matrix (rank " +
                 std::to_string(col) + " of " + std::to_string(n) +
                 ", pivot below scale-aware tolerance " +
@@ -198,8 +201,9 @@ DenseInverseResult invert_dense_matrix_with_diagnostics(
     std::vector<Real> matrix,
     std::size_t n,
     std::string_view label) {
-    DENSE_LINALG_CHECK(matrix.size() == n * n,
-                             std::string(label) + ": dense inverse size mismatch");
+    ::svmp::FE::check_arg<FEException>(
+        matrix.size() == n * n, SVMP_HERE,
+        std::string(label) + ": dense inverse size mismatch");
     std::vector<Real> matrix_for_lu = matrix;
     const DenseLUSolver solver =
         factor_dense_matrix(std::move(matrix_for_lu), n, label);
@@ -219,8 +223,8 @@ DenseInverseResult invert_dense_matrix_with_diagnostics(
                                                       static_cast<Eigen::Index>(n));
         const auto& singular_values = svd.singularValues();
         for (Eigen::Index i = 0; i < singular_values.size(); ++i) {
-            DENSE_LINALG_CHECK(
-                singular_values[i] > solver.diagnostics.tolerance,
+            ::svmp::FE::check_arg<FEException>(
+                singular_values[i] > solver.diagnostics.tolerance, SVMP_HERE,
                 std::string(label) + ": high-condition SVD fallback encountered a dropped singular value");
             sigma_inverse(i, i) = Real(1) / singular_values[i];
         }
@@ -240,8 +244,8 @@ void validate_dense_inverse_diagnostics(
     std::size_t expected_rank,
     std::string_view label,
     Real max_condition) {
-    DENSE_LINALG_CHECK(
-        result.diagnostics.rank == expected_rank,
+    ::svmp::FE::check_arg<FEException>(
+        result.diagnostics.rank == expected_rank, SVMP_HERE,
         std::string(label) + ": rank-deficient matrix (rank " +
             std::to_string(result.diagnostics.rank) + " of " +
             std::to_string(expected_rank) + ")");
@@ -250,8 +254,8 @@ void validate_dense_inverse_diagnostics(
         return;
     }
 
-    DENSE_LINALG_CHECK(
-        result.diagnostics.condition_estimate <= max_condition,
+    ::svmp::FE::check_arg<FEException>(
+        result.diagnostics.condition_estimate <= max_condition, SVMP_HERE,
         std::string(label) + ": condition estimate " +
             std::to_string(result.diagnostics.condition_estimate) +
             " exceeds supported threshold " + std::to_string(max_condition));
@@ -270,8 +274,9 @@ std::vector<Real> invert_dense_matrix(std::vector<Real> matrix,
 std::size_t dense_matrix_rank(std::vector<Real> matrix,
                               std::size_t rows,
                               std::size_t cols) {
-    DENSE_LINALG_CHECK(matrix.size() == rows * cols,
-                             "dense_matrix_rank: size mismatch");
+    ::svmp::FE::check_arg<FEException>(
+        matrix.size() == rows * cols, SVMP_HERE,
+        "dense_matrix_rank: size mismatch");
 
     const DenseMatrix dense =
         map_row_major(std::span<const Real>(matrix.data(), matrix.size()), rows, cols);
@@ -297,10 +302,12 @@ DensePseudoInverseResult rank_revealing_pseudo_inverse(
     std::size_t rows,
     std::size_t cols,
     std::string_view label) {
-    DENSE_LINALG_CHECK(matrix.size() == rows * cols,
-                             std::string(label) + ": pseudo-inverse size mismatch");
-    DENSE_LINALG_CHECK(rows > 0 && cols > 0,
-                             std::string(label) + ": pseudo-inverse requires a nonempty matrix");
+    ::svmp::FE::check_arg<FEException>(
+        matrix.size() == rows * cols, SVMP_HERE,
+        std::string(label) + ": pseudo-inverse size mismatch");
+    ::svmp::FE::check_arg<FEException>(
+        rows > 0 && cols > 0, SVMP_HERE,
+        std::string(label) + ": pseudo-inverse requires a nonempty matrix");
 
     const DenseMatrix dense = map_row_major(matrix, rows, cols);
     Eigen::JacobiSVD<DenseMatrix> svd(dense, Eigen::ComputeFullU | Eigen::ComputeFullV);
@@ -334,5 +341,3 @@ DensePseudoInverseResult rank_revealing_pseudo_inverse(
 } // namespace math
 } // namespace FE
 } // namespace svmp
-
-#undef DENSE_LINALG_CHECK
diff --git a/Code/Source/solver/cep_ion.cpp b/Code/Source/solver/cep_ion.cpp
index 8c91a54fd..eb64d7ffd 100644
--- a/Code/Source/solver/cep_ion.cpp
+++ b/Code/Source/solver/cep_ion.cpp
@@ -330,7 +330,7 @@ void cep_integ_l(CepMod &cep_mod, cepModelType &cep, Vector<double> &X,
   dmsg << "cep.odes.tIntTyp: " << cep.odes.tIntType;
   #endif
 
-  svmp::check_not_null<svmp::FE::NotInitializedException>(
+  svmp::FE::check_not_null<svmp::FE::NotInitializedException>(
       cep.ionic_model, SVMP_HERE, "ionic model was not constructed.");
 
   const double eps = std::numeric_limits<double>::epsilon();
diff --git a/Code/Source/solver/fs.cpp b/Code/Source/solver/fs.cpp
index abe1992df..8ea252a04 100644
--- a/Code/Source/solver/fs.cpp
+++ b/Code/Source/solver/fs.cpp
@@ -27,33 +27,15 @@ std::string element_name(consts::ElementType eType)
   return "unknown (" + std::to_string(static_cast<int>(eType)) + ")";
 }
 
-bool supports_reference_hessians(consts::ElementType eType)
-{
-  using namespace consts;
-
-  switch (eType) {
-    case ElementType::LIN1:
-    case ElementType::LIN2:
-    case ElementType::TRI3:
-    case ElementType::TRI6:
-    case ElementType::QUD4:
-    case ElementType::QUD8:
-    case ElementType::QUD9:
-    case ElementType::TET4:
-    case ElementType::TET10:
-    case ElementType::HEX8:
-    case ElementType::HEX20:
-    case ElementType::HEX27:
-    case ElementType::WDG:
-      return true;
-    default:
-      return false;
-  }
-}
-
-void populate_reference_hessians_if_supported(fsType& fs, const int insd)
+/// @brief Populate reference-space Hessians (fs.Nxx) at every Gauss point.
+///
+/// Element-type support is owned by nn::get_gn_nxx: it evaluates analytic
+/// reference Hessians for every element the FE Basis supports.
+/// Families without FE Basis Hessian support include (NA/PNT/NRB),
+/// their zero-initialized Nxx remain untouched.
+void populate_reference_hessians(fsType& fs, const int insd)
 {
-  if (fs.Nxx.size() == 0 || !supports_reference_hessians(fs.eType)) {
+  if (fs.Nxx.size() == 0) {
     return;
   }
 
@@ -159,7 +141,7 @@ void get_thood_fs(ComMod& com_mod, std::array<fsType,2>& fs, const mshType& lM,
         nn::get_gnn(nsd, fs[1].eType, fs[1].eNoN, g, fs[1].xi, fs[1].N, fs[1].Nx);
       }
       nn::get_nn_bnds(nsd, fs[1].eType, fs[1].eNoN, fs[1].xib, fs[1].Nb);
-      populate_reference_hessians_if_supported(fs[1], nsd);
+      populate_reference_hessians(fs[1], nsd);
 
     } else if (iOpt == 2) {
       fs[1].nG    = lM.fs[1].nG;
@@ -190,7 +172,7 @@ void get_thood_fs(ComMod& com_mod, std::array<fsType,2>& fs, const mshType& lM,
         nn::get_gnn(nsd, fs[0].eType, fs[0].eNoN, g, fs[0].xi, fs[0].N, fs[0].Nx);
       }
       nn::get_nn_bnds(nsd, fs[0].eType, fs[0].eNoN, fs[0].xib, fs[0].Nb);
-      populate_reference_hessians_if_supported(fs[0], nsd);
+      populate_reference_hessians(fs[0], nsd);
     }
   }
 }
@@ -333,7 +315,7 @@ void init_fs_msh(const ComMod& com_mod, mshType& lM)
     lM.fs[0].Nb  = lM.Nb;
     lM.fs[0].Nx  = lM.Nx;
   }
-  populate_reference_hessians_if_supported(lM.fs[0], insd);
+  populate_reference_hessians(lM.fs[0], insd);
 
   // Sets Taylor-Hood basis [fluid, stokes, ustruct, FSI)
   if (lM.nFs == 2) {
@@ -342,7 +324,7 @@ void init_fs_msh(const ComMod& com_mod, mshType& lM)
 
     // Initialize the function space
     init_fs(lM.fs[1], nsd, insd);
-    populate_reference_hessians_if_supported(lM.fs[1], insd);
+    populate_reference_hessians(lM.fs[1], insd);
   }
 }
 
@@ -395,9 +377,8 @@ void set_thood_fs(fsType& fs, consts::ElementType eType)
     break;
 
     default:
-      throw fe::InvalidElementException("Cannot choose Taylor-Hood basis",
-          element_name(eType), __FILE__, __LINE__, __func__);
-    break;
+      fe::raise<fe::InvalidElementException>(
+          SVMP_HERE, "Cannot choose Taylor-Hood basis", element_name(eType));
   }
 }
 
diff --git a/Code/Source/solver/ionic_model.cpp b/Code/Source/solver/ionic_model.cpp
index 9bc846b5c..e1f3251ad 100644
--- a/Code/Source/solver/ionic_model.cpp
+++ b/Code/Source/solver/ionic_model.cpp
@@ -40,7 +40,7 @@ void IonicModel::distribute_parameters(const CmMod &cm_mod, const cmType &cm) {
 
 void IonicModel::init(Vector<double> &X, Vector<double> &Xg) const {
   if (initial_X.size() != X.size()) {
-    svmp::raise<svmp::FE::InvalidArgumentException>(
+    svmp::FE::raise<svmp::FE::InvalidArgumentException>(
         SVMP_HERE, "Initial conditions size for X does not match vector size.");
   }
 
@@ -48,7 +48,7 @@ void IonicModel::init(Vector<double> &X, Vector<double> &Xg) const {
     X[i] = initial_X[i].second;
 
   if (initial_Xg.size() != Xg.size()) {
-    svmp::raise<svmp::FE::InvalidArgumentException>(
+    svmp::FE::raise<svmp::FE::InvalidArgumentException>(
         SVMP_HERE,
         "Initial conditions size for Xg does not match vector size.");
   }
@@ -76,7 +76,7 @@ void IonicModel::integ(const odeType &ode_solver_params, const int zone_id,
     break;
 
   default:
-    svmp::raise<svmp::FE::InvalidArgumentException>(
+    svmp::FE::raise<svmp::FE::InvalidArgumentException>(
         SVMP_HERE,
         "Unknown time integration type: " +
             std::to_string(static_cast<int>(ode_solver_params.tIntType)));
@@ -263,7 +263,7 @@ IonicModelFactory::create_model(const std::string &name) {
 
   auto iter = factory_instance.children.find(name);
   if (iter == factory_instance.children.end()) {
-    svmp::raise<svmp::FE::InvalidArgumentException>(
+    svmp::FE::raise<svmp::FE::InvalidArgumentException>(
         SVMP_HERE, "No model with name '" + name +
                        "' was registered in the ionic model factory.");
   }
@@ -279,4 +279,4 @@ void IonicModelFactory::visit(
     std::unique_ptr<IonicModel> dummy = builder();
     f(name, *dummy);
   }
-}
\ No newline at end of file
+}
diff --git a/Code/Source/solver/mat_fun.h b/Code/Source/solver/mat_fun.h
index db50da6cb..d0c63d272 100644
--- a/Code/Source/solver/mat_fun.h
+++ b/Code/Source/solver/mat_fun.h
@@ -52,7 +52,8 @@ namespace mat_fun {
         if ((mat.rows() != dest.nrows()) || (mat.cols() != dest.ncols())) { 
           auto mat_dims = (std::stringstream() << "(" << mat.rows()  << "x" << mat.cols() << ")").str();
           auto dest_dims = (std::stringstream() << "(" << dest.nrows()  << "x" << dest.ncols() << ")").str();
-          svmp::raise<svmp::FE::InvalidArgumentException>( SVMP_HERE, "The 'mat" + mat_dims + "' and 'dest" + dest_dims + 
+          svmp::FE::raise<svmp::FE::InvalidArgumentException>(
+              SVMP_HERE, "The 'mat" + mat_dims + "' and 'dest" + dest_dims +
               "' arrays have incompatible sizes.");
         }
 
@@ -258,4 +259,3 @@ namespace mat_fun {
 };
 
 #endif
-
diff --git a/Code/Source/solver/nn.cpp b/Code/Source/solver/nn.cpp
index 547310703..b3b9dd2a8 100644
--- a/Code/Source/solver/nn.cpp
+++ b/Code/Source/solver/nn.cpp
@@ -498,8 +498,10 @@ void get_gn_nxx(const int insd, const int ind2, consts::ElementType eType, const
 {
   using namespace consts;
 
-  // NRB/PNT and face-only Hessian paths remain intentionally unsupported here.
-  if (eType == ElementType::NRB || eType == ElementType::PNT) {
+  // NA/NRB/PNT have no FE Basis Hessian support (NA is unassigned; NRB/PNT are
+  // outside the current FE Basis scope). Leave Nxx at its zero-initialized
+  // state so callers may invoke this for every element type unconditionally.
+  if (eType == ElementType::NA || eType == ElementType::NRB || eType == ElementType::PNT) {
     return;
   }
 
diff --git a/Code/Source/solver/post.cpp b/Code/Source/solver/post.cpp
index 84b2c23c8..50872304e 100644
--- a/Code/Source/solver/post.cpp
+++ b/Code/Source/solver/post.cpp
@@ -804,13 +804,13 @@ void fib_stretch_rate(const ComMod& com_mod, const int iEq, const mshType& lM, c
   int nNo = lM.nNo;
 
   if (dt <= 0.0) {
-    svmp::raise<svmp::FE::InvalidArgumentException>(
+    svmp::FE::raise<svmp::FE::InvalidArgumentException>(
         SVMP_HERE,
         "[fib_stretch_rate] Expected com_mod.dt > 0, but got " + std::to_string(dt) + ".");
   }
 
   if (res.size() != nNo) {
-    svmp::raise<svmp::FE::InvalidArgumentException>(
+    svmp::FE::raise<svmp::FE::InvalidArgumentException>(
         SVMP_HERE,
         "[fib_stretch_rate] Expected res size " + std::to_string(nNo) + ", but got " + std::to_string(res.size()) + ".");
   }
diff --git a/Code/Source/solver/read_files.cpp b/Code/Source/solver/read_files.cpp
index fdfbe36ec..f5812bf3d 100644
--- a/Code/Source/solver/read_files.cpp
+++ b/Code/Source/solver/read_files.cpp
@@ -2384,7 +2384,7 @@ void read_outputs(Simulation* simulation, EquationParameters* eq_params, eqType&
         if (dmn.phys != consts::EquationType::phys_CEP)
           continue;
 
-        svmp::check_not_null<svmp::FE::NotInitializedException>(
+        svmp::FE::check_not_null<svmp::FE::NotInitializedException>(
             dmn.cep.ionic_model, SVMP_HERE, "ionic model was not constructed.");
 
         const auto registered_outputs =
@@ -3327,4 +3327,3 @@ void set_equation_properties(Simulation* simulation, EquationParameters* eq_para
 }
 
 };
-
diff --git a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
index edeca5ac5..1b9e63329 100644
--- a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
@@ -12,6 +12,7 @@
 #include "FE/Basis/NodeOrderingConventions.h"
 #include "FE/Basis/SerendipityBasis.h"
 
+#include <string>
 #include <vector>
 
 using namespace svmp::FE;
@@ -129,6 +130,24 @@ class CompleteFallbackBasis : public BasisFunction {
     }
 };
 
+void expect_source_location(const FEException& e)
+{
+    EXPECT_NE(e.context().file().find("test_BasisErrorPaths.cpp"), std::string::npos);
+    EXPECT_GT(e.context().line(), 0);
+    EXPECT_FALSE(e.context().function().empty());
+}
+
+template <class Thrower>
+void expect_fe_helper_preserves_source_location(Thrower&& thrower)
+{
+    try {
+        thrower();
+        FAIL() << "Expected an FEException";
+    } catch (const FEException& e) {
+        expect_source_location(e);
+    }
+}
+
 } // namespace
 
 TEST(BasisErrorPaths, LagrangeInvalidRequestsThrowBasisExceptions) {
@@ -187,18 +206,48 @@ TEST(BasisErrorPaths, BasisFactoryInvalidRequestsThrowBasisExceptions) {
 
 TEST(BasisErrorPaths, BasisExceptionsUseCommonStatusCodes) {
     try {
-        throw BasisConfigurationException("invalid config", __FILE__, __LINE__, __func__);
+        svmp::FE::raise<BasisConfigurationException>(SVMP_HERE, "invalid config");
     } catch (const FEException& e) {
         EXPECT_EQ(e.status(), svmp::StatusCode::InvalidArgument);
     }
 
     try {
-        throw BasisConstructionException("construction failure", __FILE__, __LINE__, __func__);
+        svmp::FE::raise<BasisConstructionException>(SVMP_HERE, "construction failure");
     } catch (const FEException& e) {
         EXPECT_EQ(e.status(), svmp::StatusCode::InternalError);
     }
 }
 
+TEST(BasisErrorPaths, FEHelpersPreserveSourceLocation) {
+    expect_fe_helper_preserves_source_location([] {
+        svmp::FE::raise<BasisEvaluationException>(SVMP_HERE, "raise location");
+    });
+
+    expect_fe_helper_preserves_source_location([] {
+        svmp::FE::throw_if<BasisEvaluationException>(
+            true, SVMP_HERE, "throw_if location");
+    });
+
+    expect_fe_helper_preserves_source_location([] {
+        svmp::FE::check_arg<BasisEvaluationException>(
+            false, SVMP_HERE, "check_arg location");
+    });
+
+    expect_fe_helper_preserves_source_location([] {
+        const int* ptr = nullptr;
+        svmp::FE::check_not_null<BasisEvaluationException>(
+            ptr, SVMP_HERE, "check_not_null location");
+    });
+
+    expect_fe_helper_preserves_source_location([] {
+        svmp::FE::check_index<BasisEvaluationException>(1, 1, SVMP_HERE);
+    });
+
+    expect_fe_helper_preserves_source_location([] {
+        svmp::FE::not_implemented("test feature", SVMP_HERE);
+    });
+}
+
 TEST(BasisErrorPaths, NodeOrderingInvalidNodeThrows) {
     EXPECT_THROW((void)ReferenceNodeLayout::get_node_coords(ElementType::Quad8, 99u),
                  BasisNodeOrderingException);

From 1d0a817732bd68dd13dd4dd051dbe495078916df Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 17 Jun 2026 11:22:42 -0700
Subject: [PATCH 24/91] adding docstrings to nn source functions and inlining
 to_fe_element_type to remove the use_basis_adapter_for helper function

---
 Code/Source/solver/nn.cpp | 61 +++++++++++++++++++++++++++++++++------
 1 file changed, 52 insertions(+), 9 deletions(-)

diff --git a/Code/Source/solver/nn.cpp b/Code/Source/solver/nn.cpp
index b3b9dd2a8..332d21c89 100644
--- a/Code/Source/solver/nn.cpp
+++ b/Code/Source/solver/nn.cpp
@@ -71,7 +71,9 @@ std::string solver_element_name(consts::ElementType eType)
 /// choice of basis family and polynomial order for each element type
 /// (basis_factory::default_basis_request). The switch deliberately has no
 /// default case so that compilers building with -Wswitch flag any newly added
-/// solver element type that is missing a mapping here.
+/// solver element type that is missing a mapping here. Returns std::nullopt for
+/// element types the FE Basis does not implement (NA/PNT/NRB); callers test FE
+/// Basis support with has_value().
 std::optional<fe::ElementType> to_fe_element_type(consts::ElementType eType)
 {
   switch (eType) {
@@ -99,11 +101,12 @@ std::optional<fe::ElementType> to_fe_element_type(consts::ElementType eType)
   return std::nullopt;
 }
 
-bool use_basis_adapter_for(consts::ElementType eType)
-{
-  return to_fe_element_type(eType).has_value();
-}
-
+/// Whether the FE Basis face adapter can evaluate face shape functions for
+/// eType. An element face is always a point, line, or surface topology, so the
+/// switch restricts support to those types (a volume element never appears as a
+/// face); it then defers to to_fe_element_type to confirm the FE Basis library
+/// actually provides a mapping for that face type. The face get_gnn uses this
+/// to choose between the FE Basis path and the explicit paths.
 bool supports_face_basis_adapter_for(consts::ElementType eType)
 {
   switch (eType) {
@@ -114,7 +117,7 @@ bool supports_face_basis_adapter_for(consts::ElementType eType)
     case consts::ElementType::QUD4:
     case consts::ElementType::QUD8:
     case consts::ElementType::QUD9:
-      return use_basis_adapter_for(eType);
+      return to_fe_element_type(eType).has_value();
     default:
       return false;
   }
@@ -145,6 +148,17 @@ const febasis::BasisFunction& basis_for_solver_element(consts::ElementType eType
   return *it->second;
 }
 
+/// Permutation from a solver element's local node ordering to the FE Basis
+/// ReferenceNodeLayout ordering, indexed by the solver-local node number:
+/// map[solver_node] is the matching FE Basis node. The solver and the FE Basis
+/// library number element nodes with different conventions, so this table
+/// reconciles them at the adapter boundary. An empty span means the two
+/// orderings already coincide (identity) and no permutation is applied, which
+/// holds for every element type not listed below (lines, Quad4/8/9, Hex8/20).
+/// Wedge6 (WDG) reuses the Triangle6 table: its two triangular node triples are
+/// reordered exactly like a 6-node triangle.
+/// \note These tables must stay consistent with the FE Basis lattice ordering;
+/// a mismatch would silently assign shape functions to the wrong nodes.
 std::span<const std::size_t> solver_to_basis_node_map(consts::ElementType eType)
 {
   static constexpr std::array<std::size_t, 3> tri3{1, 2, 0};
@@ -173,6 +187,10 @@ std::span<const std::size_t> solver_to_basis_node_map(consts::ElementType eType)
   }
 }
 
+/// Map a single solver-local node index to its FE Basis node index for eType by
+/// applying solver_to_basis_node_map (identity when no permutation is
+/// registered). Throws BasisNodeOrderingException when solver_node is negative
+/// or falls outside the element's node map.
 std::size_t basis_index_for_solver_node(consts::ElementType eType, const int solver_node)
 {
   if (solver_node < 0) {
@@ -194,6 +212,10 @@ std::size_t basis_index_for_solver_node(consts::ElementType eType, const int sol
           " is outside node map for " + solver_element_name(eType));
 }
 
+/// Build a 3-component FE Basis reference coordinate from column g of the solver
+/// xi array, zero-filling the trailing components that are inactive for
+/// lower-dimensional elements. Throws BasisConfigurationException when xi has
+/// fewer rows than the basis reference dimension.
 fe::math::Vector<fe::Real, 3> make_basis_point(const febasis::BasisFunction& basis,
                                                const int g,
                                                const Array<double>& xi)
@@ -214,6 +236,10 @@ fe::math::Vector<fe::Real, 3> make_basis_point(const febasis::BasisFunction& bas
   return point;
 }
 
+/// Scatter FE Basis values and gradients (in ReferenceNodeLayout order) into the
+/// solver N and Nx arrays at Gauss point g, permuting into solver node order via
+/// basis_index_for_solver_node. Validates the value and gradient counts against
+/// eNoN and zeroes unused gradient rows.
 void copy_basis_values_to_solver_arrays(consts::ElementType eType,
                                         const int eNoN,
                                         const int g,
@@ -254,6 +280,9 @@ void copy_basis_values_to_solver_arrays(consts::ElementType eType,
   }
 }
 
+/// Evaluate the cached FE Basis for eType at Gauss point g and write the solver
+/// N and Nx arrays. Nx holds reference-space gradients only; physical-coordinate
+/// derivatives are formed later by the solver from the mapping Jacobian.
 void evaluate_basis_values_and_gradients(const int insd,
                                          consts::ElementType eType,
                                          const int eNoN,
@@ -279,6 +308,8 @@ void evaluate_basis_values_and_gradients(const int insd,
   copy_basis_values_to_solver_arrays(eType, eNoN, g, values, gradients, N, Nx);
 }
 
+/// evaluate_basis_values_and_gradients specialized to a faceType, using the
+/// face's own reference dimension (xi rows) and N/Nx storage.
 void evaluate_face_basis_values_and_gradients(const int gaus_pt, faceType& face)
 {
   evaluate_basis_values_and_gradients(
@@ -291,6 +322,9 @@ void evaluate_face_basis_values_and_gradients(const int gaus_pt, faceType& face)
       face.Nx);
 }
 
+/// Number of packed second-derivative components the solver Nxx stores for a
+/// given reference dimension: 1 in 1D, 3 in 2D, 6 in 3D. Throws
+/// BasisConfigurationException for any other dimension.
 int required_nxx_components_for_dimension(const int dimension)
 {
   switch (dimension) {
@@ -306,6 +340,10 @@ int required_nxx_components_for_dimension(const int dimension)
   }
 }
 
+/// Scatter FE Basis Hessians (in ReferenceNodeLayout order) into the packed
+/// solver Nxx array at Gauss point g, permuting into solver node order. Packing
+/// is [dxx, dyy, dxy] in 2D and [dxx, dyy, dzz, dxy, dyz, dxz] in 3D. Validates
+/// the Hessian count against eNoN and the Nxx row count against the dimension.
 void copy_basis_hessians_to_solver_nxx(consts::ElementType eType,
                                        const int eNoN,
                                        const int g,
@@ -354,6 +392,9 @@ void copy_basis_hessians_to_solver_nxx(consts::ElementType eType,
   }
 }
 
+/// Evaluate the cached FE Basis Hessians for eType at Gauss point gaus_pt and
+/// write the packed solver Nxx array. Validates insd and ind2 against the basis
+/// reference dimension and the required packed-component count.
 void evaluate_basis_hessians(const int insd,
                              const int ind2,
                              consts::ElementType eType,
@@ -384,6 +425,8 @@ void evaluate_basis_hessians(const int insd,
   copy_basis_hessians_to_solver_nxx(eType, eNoN, gaus_pt, basis.dimension(), hessians, Nxx);
 }
 
+/// Shape data for a point (0-D) face: a single unit basis value with zero
+/// derivatives. Used for the PNT face case, which has no FE Basis evaluator.
 void set_point_face_shape_data(const int gaus_pt, faceType& face)
 {
   face.N(0, gaus_pt) = 1.0;
@@ -438,7 +481,7 @@ void get_gip(Simulation* simulation, faceType& face)
 void get_gnn(const int insd, consts::ElementType eType, const int eNoN, const int g, Array<double>& xi, 
     Array<double>& N, Array3<double>& Nx)
 {
-  if (!use_basis_adapter_for(eType)) {
+  if (!to_fe_element_type(eType).has_value()) {
     fe::raise<febasis::BasisElementCompatibilityException>(SVMP_HERE,
         "[get_gnn] FE Basis does not support solver element " + solver_element_name(eType));
   }
@@ -505,7 +548,7 @@ void get_gn_nxx(const int insd, const int ind2, consts::ElementType eType, const
     return;
   }
 
-  if (!use_basis_adapter_for(eType)) {
+  if (!to_fe_element_type(eType).has_value()) {
     fe::raise<febasis::BasisElementCompatibilityException>(SVMP_HERE,
         "[get_gn_nxx] FE Basis Hessian evaluation does not support solver element " +
             solver_element_name(eType));

From 34d4e68ff6eed286ec6485a13a0280edd7061331 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 17 Jun 2026 12:03:16 -0700
Subject: [PATCH 25/91] conforming hex27 face node ordering to right-hand rule
 vtk style ordering in reference layout

---
 .../FE/Basis/NodeOrderingConventions.cpp      | 40 ++++++++++++-------
 Code/Source/solver/nn.cpp                     | 15 +++----
 2 files changed, 32 insertions(+), 23 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
index 850f8cd0a..946bc6f59 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
@@ -211,34 +211,46 @@ std::vector<Point> generate_hex_nodes(int order) {
         }
     }
 
-    for (int j = 1; j < order; ++j) {
-        for (int i = 1; i < order; ++i) {
-            nodes.push_back(Point{line_coord_pm_one(i, order), line_coord_pm_one(j, order), Real(-1)});
+    // Face-interior nodes, emitted in VTK face order so the layout matches the
+    // VTK cell node numbering the solver ingests from .vtu meshes:
+    //   -X, +X, -Y, +Y, -Z, +Z  (e.g. Hex27 face centers become nodes 20..25).
+    // For order >= 3 the within-face node sequence follows the loops below; only
+    // the face order is normalized to VTK, which is all the supported Hex8/20/27
+    // elements require.
+    // -X face (x = -1)
+    for (int k = 1; k < order; ++k) {
+        for (int j = order - 1; j >= 1; --j) {
+            nodes.push_back(Point{Real(-1), line_coord_pm_one(j, order), line_coord_pm_one(k, order)});
         }
     }
-    for (int j = 1; j < order; ++j) {
-        for (int i = 1; i < order; ++i) {
-            nodes.push_back(Point{line_coord_pm_one(i, order), line_coord_pm_one(j, order), Real(1)});
+    // +X face (x = +1)
+    for (int k = 1; k < order; ++k) {
+        for (int j = 1; j < order; ++j) {
+            nodes.push_back(Point{Real(1), line_coord_pm_one(j, order), line_coord_pm_one(k, order)});
         }
     }
+    // -Y face (y = -1)
     for (int k = 1; k < order; ++k) {
         for (int i = 1; i < order; ++i) {
             nodes.push_back(Point{line_coord_pm_one(i, order), Real(-1), line_coord_pm_one(k, order)});
         }
     }
-    for (int k = 1; k < order; ++k) {
-        for (int j = 1; j < order; ++j) {
-            nodes.push_back(Point{Real(1), line_coord_pm_one(j, order), line_coord_pm_one(k, order)});
-        }
-    }
+    // +Y face (y = +1)
     for (int k = 1; k < order; ++k) {
         for (int i = order - 1; i >= 1; --i) {
             nodes.push_back(Point{line_coord_pm_one(i, order), Real(1), line_coord_pm_one(k, order)});
         }
     }
-    for (int k = 1; k < order; ++k) {
-        for (int j = order - 1; j >= 1; --j) {
-            nodes.push_back(Point{Real(-1), line_coord_pm_one(j, order), line_coord_pm_one(k, order)});
+    // -Z face (z = -1)
+    for (int j = 1; j < order; ++j) {
+        for (int i = 1; i < order; ++i) {
+            nodes.push_back(Point{line_coord_pm_one(i, order), line_coord_pm_one(j, order), Real(-1)});
+        }
+    }
+    // +Z face (z = +1)
+    for (int j = 1; j < order; ++j) {
+        for (int i = 1; i < order; ++i) {
+            nodes.push_back(Point{line_coord_pm_one(i, order), line_coord_pm_one(j, order), Real(1)});
         }
     }
     for (int k = 1; k < order; ++k) {
diff --git a/Code/Source/solver/nn.cpp b/Code/Source/solver/nn.cpp
index 332d21c89..2e08768e4 100644
--- a/Code/Source/solver/nn.cpp
+++ b/Code/Source/solver/nn.cpp
@@ -154,9 +154,12 @@ const febasis::BasisFunction& basis_for_solver_element(consts::ElementType eType
 /// library number element nodes with different conventions, so this table
 /// reconciles them at the adapter boundary. An empty span means the two
 /// orderings already coincide (identity) and no permutation is applied, which
-/// holds for every element type not listed below (lines, Quad4/8/9, Hex8/20).
-/// Wedge6 (WDG) reuses the Triangle6 table: its two triangular node triples are
-/// reordered exactly like a 6-node triangle.
+/// holds for the line, Quad4/8/9, and the entire hex family (Hex8/20/27): the FE
+/// Basis exposes those in the same VTK-based ordering the solver ingests from
+/// .vtu meshes. Only the simplex families need a permutation, because the solver
+/// labels simplex corners origin-last while the FE Basis lattice is origin-first;
+/// Wedge6 (WDG) reuses the Triangle6 table, since its two triangular node triples
+/// are reordered exactly like a 6-node triangle.
 /// \note These tables must stay consistent with the FE Basis lattice ordering;
 /// a mismatch would silently assign shape functions to the wrong nodes.
 std::span<const std::size_t> solver_to_basis_node_map(consts::ElementType eType)
@@ -165,10 +168,6 @@ std::span<const std::size_t> solver_to_basis_node_map(consts::ElementType eType)
   static constexpr std::array<std::size_t, 6> tri6{1, 2, 0, 4, 5, 3};
   static constexpr std::array<std::size_t, 4> tet4{1, 2, 3, 0};
   static constexpr std::array<std::size_t, 10> tet10{1, 2, 3, 0, 5, 9, 8, 4, 6, 7};
-  static constexpr std::array<std::size_t, 27> hex27{
-      0, 1, 2, 3, 4, 5, 6, 7,
-      8, 9, 10, 11, 12, 13, 14, 15,
-      16, 17, 18, 19, 25, 23, 22, 24, 20, 21, 26};
 
   switch (eType) {
     case consts::ElementType::TRI3:
@@ -180,8 +179,6 @@ std::span<const std::size_t> solver_to_basis_node_map(consts::ElementType eType)
       return tet4;
     case consts::ElementType::TET10:
       return tet10;
-    case consts::ElementType::HEX27:
-      return hex27;
     default:
       return {};
   }

From a875bdbde64fae55aa5248cc867f6a429d855137 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 17 Jun 2026 12:52:38 -0700
Subject: [PATCH 26/91] removing geometry_mode bool from SerendipityBasis to
 clean up construction

---
 .../solver/FE/Basis/SerendipityBasis.cpp      |  9 +---
 .../Source/solver/FE/Basis/SerendipityBasis.h | 23 ++-------
 .../FE/Basis/test_SerendipityTensorModal.cpp  | 51 -------------------
 3 files changed, 7 insertions(+), 76 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index ae505c2cf..9767c7701 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -496,8 +496,8 @@ void require_requested_span_size(std::span<T> output,
 
 } // namespace
 
-SerendipityBasis::SerendipityBasis(ElementType type, int order, bool geometry_mode)
-    : element_type_(type), dimension_(0), order_(order), size_(0), geometry_mode_(geometry_mode) {
+SerendipityBasis::SerendipityBasis(ElementType type, int order)
+    : element_type_(type), dimension_(0), order_(order), size_(0) {
     if (type == ElementType::Quad4 || type == ElementType::Quad8) {
         dimension_ = 2;
         if (order_ < 1) {
@@ -626,11 +626,6 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,
         return;
     }
 
-    if (geometry_mode_ && element_type_ == ElementType::Hex20) {
-        evaluate_hex8_reference(x, y, z, values_out, gradients_out, hessians_out);
-        return;
-    }
-
     if (element_type_ == ElementType::Hex20) {
         const auto mesh_to_basis = ReferenceNodeLayout::mesh_to_basis_ordering(element_type_);
         FE::throw_if<BasisEvaluationException>(mesh_to_basis.size() != size_, SVMP_HERE,
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.h b/Code/Source/solver/FE/Basis/SerendipityBasis.h
index e231ed833..c395fcea1 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.h
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.h
@@ -57,11 +57,6 @@ namespace basis {
 /// Hessians are obtained by differentiating those monomials. Hex20 evaluation
 /// is reordered through ReferenceNodeLayout so the output matches the public
 /// basis ordering.
-///
-/// When `geometry_mode` is enabled for Hex20, the basis uses the trilinear
-/// Hex8 corner functions for geometry mapping and assigns zero contribution to
-/// the quadratic edge nodes. This preserves the public Hex20 node count while
-/// intentionally reducing the geometry interpolation order.
 class SerendipityBasis final : public BasisFunction {
 public:
     /// \brief Construct a serendipity basis for an element type and polynomial order.
@@ -76,10 +71,9 @@ class SerendipityBasis final : public BasisFunction {
     ///
     /// \param type Element type used to determine topology and reference-node layout.
     /// \param order Requested polynomial order.
-    /// \param geometry_mode When true, allow reduced geometry-mapping behavior for supported elements.
-    /// \throws BasisConfigurationException If the requested order or mode is invalid.
+    /// \throws BasisConfigurationException If the requested order is invalid.
     /// \throws BasisElementCompatibilityException If the element type is unsupported.
-    SerendipityBasis(ElementType type, int order, bool geometry_mode = false);
+    SerendipityBasis(ElementType type, int order);
 
     /// \copydoc BasisFunction::basis_type()
     BasisType basis_type() const noexcept final { return BasisType::Serendipity; }
@@ -114,8 +108,7 @@ class SerendipityBasis final : public BasisFunction {
     /// monomial vector and multiplies by the inverse Vandermonde matrix to
     /// obtain nodal shape-function values. For Hex8, values are the standard
     /// trilinear corner products. For Hex20 and Wedge15, values are evaluated
-    /// from the stored polynomial coefficient tables. In Hex20 geometry mode,
-    /// only the first eight corner values are nonzero and they match Hex8.
+    /// from the stored polynomial coefficient tables.
     ///
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
     /// \param values Receives one value per basis function.
@@ -129,8 +122,7 @@ class SerendipityBasis final : public BasisFunction {
     /// before applying the inverse Vandermonde coefficients. Hex8 gradients are
     /// direct derivatives of the trilinear corner products. Hex20 and Wedge15
     /// gradients are computed by differentiating the tabulated monomial
-    /// expansions. In Hex20 geometry mode, edge-node gradients are zero and the
-    /// corner gradients match Hex8.
+    /// expansions.
     ///
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
     /// \param gradients Receives one three-component gradient per basis function.
@@ -144,8 +136,7 @@ class SerendipityBasis final : public BasisFunction {
     /// derivatives of the monomial vector and inverse Vandermonde coefficients.
     /// Hex8 Hessians are delegated to the linear Lagrange Hex8 basis. Hex20 and
     /// Wedge15 Hessians are computed by differentiating their polynomial
-    /// coefficient tables twice. In Hex20 geometry mode, only the corner
-    /// Hessians from the Hex8 geometry mapping are populated.
+    /// coefficient tables twice.
     ///
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
     /// \param hessians Receives one 3-by-3 Hessian per basis function.
@@ -195,10 +186,6 @@ class SerendipityBasis final : public BasisFunction {
     // Row-major inverse Vandermonde, indexed as [monomial, basis].
     std::vector<Real> quad_inv_vandermonde_;
 
-    // When true, this basis is used purely for geometry mapping and may use
-    // reduced polynomial order (e.g., Hex20 geometry as Hex8).
-    bool geometry_mode_;
-
     void evaluate_all_to(const math::Vector<Real, 3>& xi,
                          std::span<Real> values_out,
                          std::span<Gradient> gradients_out,
diff --git a/tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp b/tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp
index 235dc8c40..12062e4f6 100644
--- a/tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp
+++ b/tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp
@@ -236,54 +236,3 @@ TEST(SerendipityBasis, TrilinearHexMatchesLagrangeHex8) {
         }
     }
 }
-
-// Geometry mode keeps the public Hex20 node count while mapping geometry with
-// the trilinear corner functions: corners must match the Hex8 basis exactly
-// and the quadratic edge nodes must contribute nothing.
-TEST(SerendipityBasis, Hex20GeometryModeUsesTrilinearCornersOnly) {
-    SerendipityBasis geometry(ElementType::Hex20, 2, true);
-    SerendipityBasis trilinear(ElementType::Hex8, 1);
-
-    EXPECT_EQ(geometry.size(), 20u);
-    EXPECT_EQ(geometry.order(), 2);
-
-    const std::vector<math::Vector<Real, 3>> points = {
-        {Real(0.2), Real(-0.1), Real(0.3)},
-        {Real(-0.35), Real(0.25), Real(-0.15)},
-    };
-    for (const auto& xi : points) {
-        std::vector<Real> g_values;
-        std::vector<Gradient> g_gradients;
-        std::vector<Hessian> g_hessians;
-        geometry.evaluate_all(xi, g_values, g_gradients, g_hessians);
-        ASSERT_EQ(g_values.size(), 20u);
-
-        std::vector<Real> t_values;
-        std::vector<Gradient> t_gradients;
-        std::vector<Hessian> t_hessians;
-        trilinear.evaluate_all(xi, t_values, t_gradients, t_hessians);
-
-        Real value_sum = Real(0);
-        for (std::size_t i = 0; i < 20u; ++i) {
-            value_sum += g_values[i];
-            if (i < 8u) {
-                EXPECT_NEAR(g_values[i], t_values[i], Real(1e-13)) << "corner=" << i;
-                for (std::size_t d = 0; d < 3u; ++d) {
-                    EXPECT_NEAR(g_gradients[i][d], t_gradients[i][d], Real(1e-13));
-                    for (std::size_t e = 0; e < 3u; ++e) {
-                        EXPECT_NEAR(g_hessians[i](d, e), t_hessians[i](d, e), Real(1e-13));
-                    }
-                }
-            } else {
-                EXPECT_EQ(g_values[i], Real(0)) << "edge node=" << i;
-                for (std::size_t d = 0; d < 3u; ++d) {
-                    EXPECT_EQ(g_gradients[i][d], Real(0));
-                    for (std::size_t e = 0; e < 3u; ++e) {
-                        EXPECT_EQ(g_hessians[i](d, e), Real(0));
-                    }
-                }
-            }
-        }
-        EXPECT_NEAR(value_sum, Real(1), Real(1e-13));
-    }
-}

From 35124f71c0fb2e3709a88c58c2d80753c4495fe1 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 17 Jun 2026 13:03:39 -0700
Subject: [PATCH 27/91] C-arrays to the same constexpr std::array for hex20
 table in SerendipityBasis

---
 .../solver/FE/Basis/SerendipityBasis.cpp      | 96 +++++++++----------
 1 file changed, 48 insertions(+), 48 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index 9767c7701..ec5b18bab 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -228,35 +228,35 @@ constexpr std::array<std::array<Real, 15>, 15> kWedge15Coefficients = {{
     {{-1, -1, -0, 1, 1, -0, 2, -0, -0, -2, -0, -0, -0, -0, -0}}
 }};
 
-static const int hex20_monomial_exponents[20][3] = {
-    {0, 0, 0}, {0, 0, 1}, {0, 0, 2}, {0, 1, 0}, {0, 1, 1},
-    {0, 1, 2}, {0, 2, 0}, {0, 2, 1}, {1, 0, 0}, {1, 0, 1},
-    {1, 0, 2}, {1, 1, 0}, {1, 1, 1}, {1, 1, 2}, {1, 2, 0},
-    {1, 2, 1}, {2, 0, 0}, {2, 0, 1}, {2, 1, 0}, {2, 1, 1}
-};
-
-static const Real hex20_coeffs[20][20] = {
-    {-0.25, -0.25, -0.25, -0.25, -0.25, -0.25, -0.25, -0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25},
-    {0.125, 0.125, 0.125, 0.125, -0.125, -0.125, -0.125, -0.125, -0.25, 0.25, -0.25, 0.25, -0.25, -0.25, 0.25, 0.25, 0, 0, 0, 0},
-    {0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0, 0, 0, 0, 0, 0, 0, 0, -0.25, -0.25, -0.25, -0.25},
-    {0.125, 0.125, -0.125, -0.125, 0.125, 0.125, -0.125, -0.125, -0.25, -0.25, 0.25, 0.25, 0, 0, 0, 0, -0.25, -0.25, 0.25, 0.25},
-    {0, 0, 0, 0, 0, 0, 0, 0, 0.25, -0.25, -0.25, 0.25, 0, 0, 0, 0, 0, 0, 0, 0},
-    {-0.125, -0.125, 0.125, 0.125, -0.125, -0.125, 0.125, 0.125, 0, 0, 0, 0, 0, 0, 0, 0, 0.25, 0.25, -0.25, -0.25},
-    {0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0, 0, 0, 0, -0.25, -0.25, -0.25, -0.25, 0, 0, 0, 0},
-    {-0.125, -0.125, -0.125, -0.125, 0.125, 0.125, 0.125, 0.125, 0, 0, 0, 0, 0.25, 0.25, -0.25, -0.25, 0, 0, 0, 0},
-    {0.125, -0.125, -0.125, 0.125, 0.125, -0.125, -0.125, 0.125, 0, 0, 0, 0, -0.25, 0.25, -0.25, 0.25, -0.25, 0.25, -0.25, 0.25},
-    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.25, -0.25, -0.25, 0.25, 0, 0, 0, 0},
-    {-0.125, 0.125, 0.125, -0.125, -0.125, 0.125, 0.125, -0.125, 0, 0, 0, 0, 0, 0, 0, 0, 0.25, -0.25, 0.25, -0.25},
-    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.25, -0.25, -0.25, 0.25},
-    {-0.125, 0.125, -0.125, 0.125, 0.125, -0.125, 0.125, -0.125, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
-    {0.125, -0.125, 0.125, -0.125, 0.125, -0.125, 0.125, -0.125, 0, 0, 0, 0, 0, 0, 0, 0, -0.25, 0.25, 0.25, -0.25},
-    {-0.125, 0.125, 0.125, -0.125, -0.125, 0.125, 0.125, -0.125, 0, 0, 0, 0, 0.25, -0.25, 0.25, -0.25, 0, 0, 0, 0},
-    {0.125, -0.125, -0.125, 0.125, -0.125, 0.125, 0.125, -0.125, 0, 0, 0, 0, -0.25, 0.25, 0.25, -0.25, 0, 0, 0, 0},
-    {0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, -0.25, -0.25, -0.25, -0.25, 0, 0, 0, 0, 0, 0, 0, 0},
-    {-0.125, -0.125, -0.125, -0.125, 0.125, 0.125, 0.125, 0.125, 0.25, -0.25, 0.25, -0.25, 0, 0, 0, 0, 0, 0, 0, 0},
-    {-0.125, -0.125, 0.125, 0.125, -0.125, -0.125, 0.125, 0.125, 0.25, 0.25, -0.25, -0.25, 0, 0, 0, 0, 0, 0, 0, 0},
-    {0.125, 0.125, -0.125, -0.125, -0.125, -0.125, 0.125, 0.125, -0.25, 0.25, 0.25, -0.25, 0, 0, 0, 0, 0, 0, 0, 0}
-};
+constexpr std::array<std::array<int, 3>, 20> kHex20MonomialExponents = {{
+    {{0, 0, 0}}, {{0, 0, 1}}, {{0, 0, 2}}, {{0, 1, 0}}, {{0, 1, 1}},
+    {{0, 1, 2}}, {{0, 2, 0}}, {{0, 2, 1}}, {{1, 0, 0}}, {{1, 0, 1}},
+    {{1, 0, 2}}, {{1, 1, 0}}, {{1, 1, 1}}, {{1, 1, 2}}, {{1, 2, 0}},
+    {{1, 2, 1}}, {{2, 0, 0}}, {{2, 0, 1}}, {{2, 1, 0}}, {{2, 1, 1}}
+}};
+
+constexpr std::array<std::array<Real, 20>, 20> kHex20Coefficients = {{
+    {{-0.25, -0.25, -0.25, -0.25, -0.25, -0.25, -0.25, -0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25}},
+    {{0.125, 0.125, 0.125, 0.125, -0.125, -0.125, -0.125, -0.125, -0.25, 0.25, -0.25, 0.25, -0.25, -0.25, 0.25, 0.25, 0, 0, 0, 0}},
+    {{0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0, 0, 0, 0, 0, 0, 0, 0, -0.25, -0.25, -0.25, -0.25}},
+    {{0.125, 0.125, -0.125, -0.125, 0.125, 0.125, -0.125, -0.125, -0.25, -0.25, 0.25, 0.25, 0, 0, 0, 0, -0.25, -0.25, 0.25, 0.25}},
+    {{0, 0, 0, 0, 0, 0, 0, 0, 0.25, -0.25, -0.25, 0.25, 0, 0, 0, 0, 0, 0, 0, 0}},
+    {{-0.125, -0.125, 0.125, 0.125, -0.125, -0.125, 0.125, 0.125, 0, 0, 0, 0, 0, 0, 0, 0, 0.25, 0.25, -0.25, -0.25}},
+    {{0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0, 0, 0, 0, -0.25, -0.25, -0.25, -0.25, 0, 0, 0, 0}},
+    {{-0.125, -0.125, -0.125, -0.125, 0.125, 0.125, 0.125, 0.125, 0, 0, 0, 0, 0.25, 0.25, -0.25, -0.25, 0, 0, 0, 0}},
+    {{0.125, -0.125, -0.125, 0.125, 0.125, -0.125, -0.125, 0.125, 0, 0, 0, 0, -0.25, 0.25, -0.25, 0.25, -0.25, 0.25, -0.25, 0.25}},
+    {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.25, -0.25, -0.25, 0.25, 0, 0, 0, 0}},
+    {{-0.125, 0.125, 0.125, -0.125, -0.125, 0.125, 0.125, -0.125, 0, 0, 0, 0, 0, 0, 0, 0, 0.25, -0.25, 0.25, -0.25}},
+    {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.25, -0.25, -0.25, 0.25}},
+    {{-0.125, 0.125, -0.125, 0.125, 0.125, -0.125, 0.125, -0.125, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}},
+    {{0.125, -0.125, 0.125, -0.125, 0.125, -0.125, 0.125, -0.125, 0, 0, 0, 0, 0, 0, 0, 0, -0.25, 0.25, 0.25, -0.25}},
+    {{-0.125, 0.125, 0.125, -0.125, -0.125, 0.125, 0.125, -0.125, 0, 0, 0, 0, 0.25, -0.25, 0.25, -0.25, 0, 0, 0, 0}},
+    {{0.125, -0.125, -0.125, 0.125, -0.125, 0.125, 0.125, -0.125, 0, 0, 0, 0, -0.25, 0.25, 0.25, -0.25, 0, 0, 0, 0}},
+    {{0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, -0.25, -0.25, -0.25, -0.25, 0, 0, 0, 0, 0, 0, 0, 0}},
+    {{-0.125, -0.125, -0.125, -0.125, 0.125, 0.125, 0.125, 0.125, 0.25, -0.25, 0.25, -0.25, 0, 0, 0, 0, 0, 0, 0, 0}},
+    {{-0.125, -0.125, 0.125, 0.125, -0.125, -0.125, 0.125, 0.125, 0.25, 0.25, -0.25, -0.25, 0, 0, 0, 0, 0, 0, 0, 0}},
+    {{0.125, 0.125, -0.125, -0.125, -0.125, -0.125, 0.125, 0.125, -0.25, 0.25, 0.25, -0.25, 0, 0, 0, 0, 0, 0, 0, 0}}
+}};
 
 inline std::array<Real, 3> quadratic_powers(Real x) {
     return {Real(1), x, x * x};
@@ -268,9 +268,9 @@ void eval_hex20_internal(Real r, Real s, Real t, std::span<Real> internal_vals)
     const auto tp = quadratic_powers(t);
     Real phi[20];
     for (int j = 0; j < 20; ++j) {
-        const int a = hex20_monomial_exponents[j][0];
-        const int b = hex20_monomial_exponents[j][1];
-        const int c = hex20_monomial_exponents[j][2];
+        const int a = kHex20MonomialExponents[j][0];
+        const int b = kHex20MonomialExponents[j][1];
+        const int c = kHex20MonomialExponents[j][2];
         phi[j] = rp[static_cast<std::size_t>(a)] *
                  sp[static_cast<std::size_t>(b)] *
                  tp[static_cast<std::size_t>(c)];
@@ -278,7 +278,7 @@ void eval_hex20_internal(Real r, Real s, Real t, std::span<Real> internal_vals)
     for (int i = 0; i < 20; ++i) {
         Real v = Real(0);
         for (int j = 0; j < 20; ++j) {
-            v += hex20_coeffs[j][i] * phi[j];
+            v += kHex20Coefficients[j][i] * phi[j];
         }
         internal_vals[i] = v;
     }
@@ -290,9 +290,9 @@ void eval_hex20_grad_internal(Real r, Real s, Real t, std::span<Gradient> intern
     const auto tp = quadratic_powers(t);
     Real dphi_dr[20], dphi_ds[20], dphi_dt[20];
     for (int j = 0; j < 20; ++j) {
-        const int a = hex20_monomial_exponents[j][0];
-        const int b = hex20_monomial_exponents[j][1];
-        const int c = hex20_monomial_exponents[j][2];
+        const int a = kHex20MonomialExponents[j][0];
+        const int b = kHex20MonomialExponents[j][1];
+        const int c = kHex20MonomialExponents[j][2];
 
         dphi_dr[j] = (a > 0) ? Real(a) * rp[static_cast<std::size_t>(a - 1)] *
                                     sp[static_cast<std::size_t>(b)] *
@@ -311,9 +311,9 @@ void eval_hex20_grad_internal(Real r, Real s, Real t, std::span<Gradient> intern
     for (int i = 0; i < 20; ++i) {
         Real gr = Real(0), gs = Real(0), gt = Real(0);
         for (int j = 0; j < 20; ++j) {
-            gr += hex20_coeffs[j][i] * dphi_dr[j];
-            gs += hex20_coeffs[j][i] * dphi_ds[j];
-            gt += hex20_coeffs[j][i] * dphi_dt[j];
+            gr += kHex20Coefficients[j][i] * dphi_dr[j];
+            gs += kHex20Coefficients[j][i] * dphi_ds[j];
+            gt += kHex20Coefficients[j][i] * dphi_dt[j];
         }
         internal_grads[i][0] = gr;
         internal_grads[i][1] = gs;
@@ -328,9 +328,9 @@ void eval_hex20_hess_internal(Real r, Real s, Real t, std::span<Hessian> interna
     Real d2phi_drr[20], d2phi_dss[20], d2phi_dtt[20];
     Real d2phi_drs[20], d2phi_drt[20], d2phi_dst[20];
     for (int j = 0; j < 20; ++j) {
-        const int a = hex20_monomial_exponents[j][0];
-        const int b = hex20_monomial_exponents[j][1];
-        const int c = hex20_monomial_exponents[j][2];
+        const int a = kHex20MonomialExponents[j][0];
+        const int b = kHex20MonomialExponents[j][1];
+        const int c = kHex20MonomialExponents[j][2];
 
         d2phi_drr[j] = (a > 1) ? Real(a * (a - 1)) *
                                       rp[static_cast<std::size_t>(a - 2)] *
@@ -367,12 +367,12 @@ void eval_hex20_hess_internal(Real r, Real s, Real t, std::span<Hessian> interna
     for (int i = 0; i < 20; ++i) {
         Hessian H = Hessian::Zero();
         for (int j = 0; j < 20; ++j) {
-            H(0, 0) += hex20_coeffs[j][i] * d2phi_drr[j];
-            H(1, 1) += hex20_coeffs[j][i] * d2phi_dss[j];
-            H(2, 2) += hex20_coeffs[j][i] * d2phi_dtt[j];
-            H(0, 1) += hex20_coeffs[j][i] * d2phi_drs[j];
-            H(0, 2) += hex20_coeffs[j][i] * d2phi_drt[j];
-            H(1, 2) += hex20_coeffs[j][i] * d2phi_dst[j];
+            H(0, 0) += kHex20Coefficients[j][i] * d2phi_drr[j];
+            H(1, 1) += kHex20Coefficients[j][i] * d2phi_dss[j];
+            H(2, 2) += kHex20Coefficients[j][i] * d2phi_dtt[j];
+            H(0, 1) += kHex20Coefficients[j][i] * d2phi_drs[j];
+            H(0, 2) += kHex20Coefficients[j][i] * d2phi_drt[j];
+            H(1, 2) += kHex20Coefficients[j][i] * d2phi_dst[j];
         }
         H(1, 0) = H(0, 1);
         H(2, 0) = H(0, 2);

From e73c13a8ecfa6876679e47787819124fc209535b Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 17 Jun 2026 13:29:30 -0700
Subject: [PATCH 28/91] implementing unified monomial evaluator to decrease
 duplicate code across different SerendipityBasis cases

---
 .../solver/FE/Basis/SerendipityBasis.cpp      | 389 ++++++------------
 1 file changed, 120 insertions(+), 269 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index ec5b18bab..3ad253696 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -258,223 +258,109 @@ constexpr std::array<std::array<Real, 20>, 20> kHex20Coefficients = {{
     {{0.125, 0.125, -0.125, -0.125, -0.125, -0.125, 0.125, 0.125, -0.25, 0.25, 0.25, -0.25, 0, 0, 0, 0, 0, 0, 0, 0}}
 }};
 
-inline std::array<Real, 3> quadratic_powers(Real x) {
-    return {Real(1), x, x * x};
-}
-
-void eval_hex20_internal(Real r, Real s, Real t, std::span<Real> internal_vals) {
-    const auto rp = quadratic_powers(r);
-    const auto sp = quadratic_powers(s);
-    const auto tp = quadratic_powers(t);
-    Real phi[20];
-    for (int j = 0; j < 20; ++j) {
-        const int a = kHex20MonomialExponents[j][0];
-        const int b = kHex20MonomialExponents[j][1];
-        const int c = kHex20MonomialExponents[j][2];
-        phi[j] = rp[static_cast<std::size_t>(a)] *
-                 sp[static_cast<std::size_t>(b)] *
-                 tp[static_cast<std::size_t>(c)];
-    }
-    for (int i = 0; i < 20; ++i) {
-        Real v = Real(0);
-        for (int j = 0; j < 20; ++j) {
-            v += kHex20Coefficients[j][i] * phi[j];
-        }
-        internal_vals[i] = v;
+// Value and first/second derivatives of the 1D monomial x^a. The derivative of
+// a constant or linear term collapses to zero, so negative powers never arise.
+struct MonomialAxis {
+    Real value;   ///< x^a
+    Real first;   ///< d/dx (x^a)     = a x^(a-1)
+    Real second;  ///< d^2/dx^2 (x^a) = a (a-1) x^(a-2)
+};
+
+inline Real integer_power(Real base, int exponent) {
+    Real result = Real(1);
+    for (int k = 0; k < exponent; ++k) {
+        result *= base;
     }
+    return result;
 }
 
-void eval_hex20_grad_internal(Real r, Real s, Real t, std::span<Gradient> internal_grads) {
-    const auto rp = quadratic_powers(r);
-    const auto sp = quadratic_powers(s);
-    const auto tp = quadratic_powers(t);
-    Real dphi_dr[20], dphi_ds[20], dphi_dt[20];
-    for (int j = 0; j < 20; ++j) {
-        const int a = kHex20MonomialExponents[j][0];
-        const int b = kHex20MonomialExponents[j][1];
-        const int c = kHex20MonomialExponents[j][2];
-
-        dphi_dr[j] = (a > 0) ? Real(a) * rp[static_cast<std::size_t>(a - 1)] *
-                                    sp[static_cast<std::size_t>(b)] *
-                                    tp[static_cast<std::size_t>(c)]
-                              : Real(0);
-        dphi_ds[j] = (b > 0) ? rp[static_cast<std::size_t>(a)] *
-                                    Real(b) * sp[static_cast<std::size_t>(b - 1)] *
-                                    tp[static_cast<std::size_t>(c)]
-                              : Real(0);
-        dphi_dt[j] = (c > 0) ? rp[static_cast<std::size_t>(a)] *
-                                    sp[static_cast<std::size_t>(b)] *
-                                    Real(c) * tp[static_cast<std::size_t>(c - 1)]
-                              : Real(0);
-    }
-
-    for (int i = 0; i < 20; ++i) {
-        Real gr = Real(0), gs = Real(0), gt = Real(0);
-        for (int j = 0; j < 20; ++j) {
-            gr += kHex20Coefficients[j][i] * dphi_dr[j];
-            gs += kHex20Coefficients[j][i] * dphi_ds[j];
-            gt += kHex20Coefficients[j][i] * dphi_dt[j];
-        }
-        internal_grads[i][0] = gr;
-        internal_grads[i][1] = gs;
-        internal_grads[i][2] = gt;
-    }
+inline MonomialAxis monomial_axis(Real x, int exponent) {
+    MonomialAxis axis;
+    axis.value = integer_power(x, exponent);
+    axis.first = (exponent > 0) ? Real(exponent) * integer_power(x, exponent - 1) : Real(0);
+    axis.second = (exponent > 1)
+                      ? Real(exponent * (exponent - 1)) * integer_power(x, exponent - 2)
+                      : Real(0);
+    return axis;
 }
 
-void eval_hex20_hess_internal(Real r, Real s, Real t, std::span<Hessian> internal_hessians) {
-    const auto rp = quadratic_powers(r);
-    const auto sp = quadratic_powers(s);
-    const auto tp = quadratic_powers(t);
-    Real d2phi_drr[20], d2phi_dss[20], d2phi_dtt[20];
-    Real d2phi_drs[20], d2phi_drt[20], d2phi_dst[20];
-    for (int j = 0; j < 20; ++j) {
-        const int a = kHex20MonomialExponents[j][0];
-        const int b = kHex20MonomialExponents[j][1];
-        const int c = kHex20MonomialExponents[j][2];
-
-        d2phi_drr[j] = (a > 1) ? Real(a * (a - 1)) *
-                                      rp[static_cast<std::size_t>(a - 2)] *
-                                      sp[static_cast<std::size_t>(b)] *
-                                      tp[static_cast<std::size_t>(c)]
-                                : Real(0);
-        d2phi_dss[j] = (b > 1) ? rp[static_cast<std::size_t>(a)] *
-                                      Real(b * (b - 1)) *
-                                      sp[static_cast<std::size_t>(b - 2)] *
-                                      tp[static_cast<std::size_t>(c)]
-                                : Real(0);
-        d2phi_dtt[j] = (c > 1) ? rp[static_cast<std::size_t>(a)] *
-                                      sp[static_cast<std::size_t>(b)] *
-                                      Real(c * (c - 1)) *
-                                      tp[static_cast<std::size_t>(c - 2)]
-                                : Real(0);
-        d2phi_drs[j] = (a > 0 && b > 0) ? Real(a * b) *
-                                              rp[static_cast<std::size_t>(a - 1)] *
-                                              sp[static_cast<std::size_t>(b - 1)] *
-                                              tp[static_cast<std::size_t>(c)]
-                                        : Real(0);
-        d2phi_drt[j] = (a > 0 && c > 0) ? Real(a * c) *
-                                              rp[static_cast<std::size_t>(a - 1)] *
-                                              sp[static_cast<std::size_t>(b)] *
-                                              tp[static_cast<std::size_t>(c - 1)]
-                                        : Real(0);
-        d2phi_dst[j] = (b > 0 && c > 0) ? rp[static_cast<std::size_t>(a)] *
-                                              Real(b * c) *
-                                              sp[static_cast<std::size_t>(b - 1)] *
-                                              tp[static_cast<std::size_t>(c - 1)]
-                                        : Real(0);
-    }
-
-    for (int i = 0; i < 20; ++i) {
-        Hessian H = Hessian::Zero();
-        for (int j = 0; j < 20; ++j) {
-            H(0, 0) += kHex20Coefficients[j][i] * d2phi_drr[j];
-            H(1, 1) += kHex20Coefficients[j][i] * d2phi_dss[j];
-            H(2, 2) += kHex20Coefficients[j][i] * d2phi_dtt[j];
-            H(0, 1) += kHex20Coefficients[j][i] * d2phi_drs[j];
-            H(0, 2) += kHex20Coefficients[j][i] * d2phi_drt[j];
-            H(1, 2) += kHex20Coefficients[j][i] * d2phi_dst[j];
+// Evaluate a nodal basis defined by a monomial coefficient table at one
+// reference point. For each monomial j the routine forms x^a y^b z^c and the
+// requested derivatives, then accumulates the coefficient-weighted sum into the
+// output slots. `count` is both the number of monomials and the number of basis
+// functions (the coefficient table is square). Outputs are assumed pre-zeroed by
+// the caller; an empty span skips that quantity.
+//
+// `table_to_output_order` maps each output slot to the basis column of the
+// coefficient table. An empty span means the table is already in output (public
+// node) order, i.e. the identity permutation: Hex20 supplies a real permutation
+// because its table is authored in an internal node order, while Wedge15 and the
+// quadrilateral serendipity tables are authored directly in public order.
+template <typename ExponentFn, typename CoeffFn>
+void eval_monomial_basis(Real r, Real s, Real t,
+                         std::size_t count,
+                         ExponentFn&& exponent,
+                         CoeffFn&& coeff,
+                         std::span<const std::size_t> table_to_output_order,
+                         std::span<Real> values,
+                         std::span<Gradient> gradients,
+                         std::span<Hessian> hessians) {
+    const bool want_values = !values.empty();
+    const bool want_gradients = !gradients.empty();
+    const bool want_hessians = !hessians.empty();
+
+    for (std::size_t j = 0; j < count; ++j) {
+        const std::array<int, 3> e = exponent(j);
+        const MonomialAxis ax = monomial_axis(r, e[0]);
+        const MonomialAxis ay = monomial_axis(s, e[1]);
+        const MonomialAxis az = monomial_axis(t, e[2]);
+
+        const Real phi = ax.value * ay.value * az.value;
+
+        Real d_dr = Real(0), d_ds = Real(0), d_dt = Real(0);
+        if (want_gradients || want_hessians) {
+            d_dr = ax.first * ay.value * az.value;
+            d_ds = ax.value * ay.first * az.value;
+            d_dt = ax.value * ay.value * az.first;
         }
-        H(1, 0) = H(0, 1);
-        H(2, 0) = H(0, 2);
-        H(2, 1) = H(1, 2);
-        internal_hessians[i] = H;
-    }
-}
-
-void eval_wedge15_polynomial(Real r,
-                             Real s,
-                             Real t,
-                             std::span<Real> values,
-                             std::span<Gradient> gradients,
-                             std::span<Hessian> hessians) {
-    Real phi[15]{};
-    Real dr[15]{};
-    Real ds[15]{};
-    Real dt[15]{};
-    Real drr[15]{};
-    Real dss[15]{};
-    Real dtt[15]{};
-    Real drs[15]{};
-    Real drt[15]{};
-    Real dst[15]{};
-
-    const auto rp = quadratic_powers(r);
-    const auto sp = quadratic_powers(s);
-    const auto tp = quadratic_powers(t);
-
-    for (int j = 0; j < 15; ++j) {
-        const auto& exponent = kWedge15MonomialExponents[static_cast<std::size_t>(j)];
-        const int a = exponent[0];
-        const int b = exponent[1];
-        const int c = exponent[2];
-        const auto ar = static_cast<std::size_t>(a);
-        const auto bs = static_cast<std::size_t>(b);
-        const auto ct = static_cast<std::size_t>(c);
-
-        const Real ra = rp[ar];
-        const Real sb = sp[bs];
-        const Real tc = tp[ct];
 
-        if (!values.empty()) {
-            phi[j] = ra * sb * tc;
-        }
-        if (!gradients.empty()) {
-            dr[j] = (a > 0) ? Real(a) * rp[ar - 1u] * sb * tc : Real(0);
-            ds[j] = (b > 0) ? ra * Real(b) * sp[bs - 1u] * tc : Real(0);
-            dt[j] = (c > 0) ? ra * sb * Real(c) * tp[ct - 1u] : Real(0);
-        }
-        if (!hessians.empty()) {
-            drr[j] = (a > 1) ? Real(a * (a - 1)) * rp[ar - 2u] * sb * tc : Real(0);
-            dss[j] = (b > 1) ? ra * Real(b * (b - 1)) * sp[bs - 2u] * tc : Real(0);
-            dtt[j] = (c > 1) ? ra * sb * Real(c * (c - 1)) * tp[ct - 2u] : Real(0);
-            drs[j] = (a > 0 && b > 0) ? Real(a * b) * rp[ar - 1u] * sp[bs - 1u] * tc : Real(0);
-            drt[j] = (a > 0 && c > 0) ? Real(a * c) * rp[ar - 1u] * sb * tp[ct - 1u] : Real(0);
-            dst[j] = (b > 0 && c > 0) ? ra * Real(b * c) * sp[bs - 1u] * tp[ct - 1u] : Real(0);
+        Real d_drr = Real(0), d_dss = Real(0), d_dtt = Real(0);
+        Real d_drs = Real(0), d_drt = Real(0), d_dst = Real(0);
+        if (want_hessians) {
+            d_drr = ax.second * ay.value * az.value;
+            d_dss = ax.value * ay.second * az.value;
+            d_dtt = ax.value * ay.value * az.second;
+            d_drs = ax.first * ay.first * az.value;
+            d_drt = ax.first * ay.value * az.first;
+            d_dst = ax.value * ay.first * az.first;
         }
-    }
 
-    for (int i = 0; i < 15; ++i) {
-        Real value = Real(0);
-        Real gr = Real(0);
-        Real gs = Real(0);
-        Real gt = Real(0);
-        Hessian H = Hessian::Zero();
-        for (int j = 0; j < 15; ++j) {
-            const Real coefficient =
-                kWedge15Coefficients[static_cast<std::size_t>(j)][static_cast<std::size_t>(i)];
-            if (!values.empty()) {
-                value += coefficient * phi[j];
+        for (std::size_t slot = 0; slot < count; ++slot) {
+            const std::size_t basis_index =
+                table_to_output_order.empty() ? slot : table_to_output_order[slot];
+            const Real c = coeff(j, basis_index);
+            if (want_values) {
+                values[slot] += c * phi;
             }
-            if (!gradients.empty()) {
-                gr += coefficient * dr[j];
-                gs += coefficient * ds[j];
-                gt += coefficient * dt[j];
+            if (want_gradients) {
+                Gradient& g = gradients[slot];
+                g[0] += c * d_dr;
+                g[1] += c * d_ds;
+                g[2] += c * d_dt;
             }
-            if (!hessians.empty()) {
-                H(0, 0) += coefficient * drr[j];
-                H(1, 1) += coefficient * dss[j];
-                H(2, 2) += coefficient * dtt[j];
-                H(0, 1) += coefficient * drs[j];
-                H(0, 2) += coefficient * drt[j];
-                H(1, 2) += coefficient * dst[j];
+            if (want_hessians) {
+                Hessian& h = hessians[slot];
+                h(0, 0) += c * d_drr;
+                h(1, 1) += c * d_dss;
+                h(2, 2) += c * d_dtt;
+                h(0, 1) += c * d_drs;
+                h(1, 0) += c * d_drs;
+                h(0, 2) += c * d_drt;
+                h(2, 0) += c * d_drt;
+                h(1, 2) += c * d_dst;
+                h(2, 1) += c * d_dst;
             }
         }
-
-        const std::size_t index = static_cast<std::size_t>(i);
-        if (!values.empty()) {
-            values[index] = value;
-        }
-        if (!gradients.empty()) {
-            gradients[index][0] = gr;
-            gradients[index][1] = gs;
-            gradients[index][2] = gt;
-        }
-        if (!hessians.empty()) {
-            H(1, 0) = H(0, 1);
-            H(2, 0) = H(0, 2);
-            H(2, 1) = H(1, 2);
-            hessians[index] = H;
-        }
     }
 }
 
@@ -581,43 +467,20 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,
             SVMP_HERE,
             "SerendipityBasis: quadrilateral interpolation tables are not initialized for value evaluation");
 
-        for (std::size_t j = 0; j < size_; ++j) {
-            const auto [ax, ay] = quad_monomial_exponents_[j];
-            const Real value = std::pow(x, ax) * std::pow(y, ay);
-            const Real dx =
-                (ax > 0) ? Real(ax) * std::pow(x, ax - 1) * std::pow(y, ay) : Real(0);
-            const Real dy =
-                (ay > 0) ? std::pow(x, ax) * Real(ay) * std::pow(y, ay - 1) : Real(0);
-            const Real dxx =
-                (ax > 1) ? Real(ax * (ax - 1)) * std::pow(x, ax - 2) * std::pow(y, ay)
-                         : Real(0);
-            const Real dxy =
-                (ax > 0 && ay > 0)
-                    ? Real(ax * ay) * std::pow(x, ax - 1) * std::pow(y, ay - 1)
-                    : Real(0);
-            const Real dyy =
-                (ay > 1) ? Real(ay * (ay - 1)) * std::pow(x, ax) * std::pow(y, ay - 2)
-                         : Real(0);
-
-            for (std::size_t i = 0; i < size_; ++i) {
-                const Real coeff = quad_inv_vandermonde_[j * size_ + i];
-                if (!values_out.empty()) {
-                    values_out[i] += value * coeff;
-                }
-                if (!gradients_out.empty()) {
-                    Gradient& g = gradients_out[i];
-                    g[0] += dx * coeff;
-                    g[1] += dy * coeff;
-                }
-                if (!hessians_out.empty()) {
-                    Hessian& h = hessians_out[i];
-                    h(0, 0) += dxx * coeff;
-                    h(0, 1) += dxy * coeff;
-                    h(1, 0) += dxy * coeff;
-                    h(1, 1) += dyy * coeff;
-                }
-            }
-        }
+        // Quadrilateral serendipity monomials are planar; the through-axis
+        // exponent is zero, so all z derivatives vanish. The inverse Vandermonde
+        // is already in public node order (identity output ordering).
+        eval_monomial_basis(
+            x, y, z, size_,
+            [this](std::size_t j) {
+                const auto& e = quad_monomial_exponents_[j];
+                return std::array<int, 3>{e[0], e[1], 0};
+            },
+            [this](std::size_t j, std::size_t i) {
+                return quad_inv_vandermonde_[j * size_ + i];
+            },
+            std::span<const std::size_t>{},
+            values_out, gradients_out, hessians_out);
         return;
     }
 
@@ -627,41 +490,29 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,
     }
 
     if (element_type_ == ElementType::Hex20) {
+        // The Hex20 coefficient table is authored in an internal node order, so
+        // results are remapped into the public node layout through mesh_to_basis.
         const auto mesh_to_basis = ReferenceNodeLayout::mesh_to_basis_ordering(element_type_);
         FE::throw_if<BasisEvaluationException>(mesh_to_basis.size() != size_, SVMP_HERE,
                                                "Hex20 mesh-to-basis ordering is not registered");
-
-        if (!values_out.empty()) {
-            std::array<Real, 20u> internal_vals{};
-            eval_hex20_internal(x, y, z, internal_vals);
-            for (std::size_t i = 0; i < 20u; ++i) {
-                values_out[i] = internal_vals[mesh_to_basis[i]];
-            }
-        }
-        if (!gradients_out.empty()) {
-            std::array<Gradient, 20u> internal_grads{};
-            eval_hex20_grad_internal(x, y, z, internal_grads);
-            for (std::size_t i = 0; i < 20u; ++i) {
-                gradients_out[i] = internal_grads[mesh_to_basis[i]];
-            }
-        }
-        if (!hessians_out.empty()) {
-            std::array<Hessian, 20u> internal_hessians{};
-            eval_hex20_hess_internal(x, y, z, internal_hessians);
-            for (std::size_t i = 0; i < 20u; ++i) {
-                hessians_out[i] = internal_hessians[mesh_to_basis[i]];
-            }
-        }
+        eval_monomial_basis(
+            x, y, z, size_,
+            [](std::size_t j) { return kHex20MonomialExponents[j]; },
+            [](std::size_t j, std::size_t i) { return kHex20Coefficients[j][i]; },
+            mesh_to_basis,
+            values_out, gradients_out, hessians_out);
         return;
     }
 
     if (element_type_ == ElementType::Wedge15) {
-        eval_wedge15_polynomial(x,
-                                 y,
-                                 z,
-                                 values_out,
-                                 gradients_out,
-                                 hessians_out);
+        // The Wedge15 coefficient table is authored directly in public node
+        // order, so no output reordering is applied (identity permutation).
+        eval_monomial_basis(
+            x, y, z, size_,
+            [](std::size_t j) { return kWedge15MonomialExponents[j]; },
+            [](std::size_t j, std::size_t i) { return kWedge15Coefficients[j][i]; },
+            std::span<const std::size_t>{},
+            values_out, gradients_out, hessians_out);
         return;
     }
 

From 56bd86ce57908fad86cf85547fdc5a9d4133d263 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 17 Jun 2026 13:59:14 -0700
Subject: [PATCH 29/91] adding a `nodes()` method to the base abstraction for
 basis functions and finalizing LagrangeBasis

---
 Code/Source/solver/FE/Basis/BasisFunction.cpp  |  7 +++++++
 Code/Source/solver/FE/Basis/BasisFunction.h    | 12 ++++++++++++
 Code/Source/solver/FE/Basis/LagrangeBasis.h    | 14 +++++++-------
 Code/Source/solver/FE/Basis/SerendipityBasis.h |  2 +-
 4 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisFunction.cpp b/Code/Source/solver/FE/Basis/BasisFunction.cpp
index 1c8c31e5d..56a9238c9 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFunction.cpp
@@ -32,6 +32,13 @@ void require_span_size(std::size_t actual,
 
 } // namespace
 
+const std::vector<math::Vector<Real, 3>>& BasisFunction::nodes() const noexcept {
+    // Default for bases that do not expose interpolation nodes; nodal families
+    // (LagrangeBasis, SerendipityBasis) override this to return their layout.
+    static const std::vector<math::Vector<Real, 3>> kNoNodes;
+    return kNoNodes;
+}
+
 void BasisFunction::evaluate_gradients(const math::Vector<Real, 3>& xi,
                                        std::vector<Gradient>& gradients) const {
     (void)xi;
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index 8327ffda9..0cffe52a9 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -194,6 +194,18 @@ class BasisFunction {
     /// \return Basis function count.
     virtual std::size_t size() const noexcept = 0;
 
+    /// \brief Return the reference interpolation nodes in basis ordering.
+    ///
+    /// \details Nodal families return one reference-element coordinate per basis
+    /// function, in the same order as the evaluator outputs. Bases that do not
+    /// define interpolation nodes (non-nodal families, or abstract base usage)
+    /// return an empty vector. The returned reference is valid for the lifetime
+    /// of the basis object.
+    ///
+    /// \return Reference node coordinates: size() entries for nodal families,
+    ///         empty otherwise.
+    virtual const std::vector<math::Vector<Real, 3>>& nodes() const noexcept;
+
     /// \brief Evaluate basis function values at a reference coordinate.
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
     /// \param values Receives one value per basis function.
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.h b/Code/Source/solver/FE/Basis/LagrangeBasis.h
index 6137a557a..9546e64f7 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.h
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.h
@@ -61,7 +61,7 @@ namespace basis {
 /// The vector-returning evaluators are convenient API wrappers. The `*_to`
 /// methods write to caller-provided spans and are intended for assembly paths
 /// that avoid temporary allocations.
-class LagrangeBasis : public BasisFunction {
+class LagrangeBasis final : public BasisFunction {
 public:
     /// \brief Axis-index tuple for tensor-product reference nodes.
     using TensorNodeIndex = std::array<std::size_t, 3>;
@@ -88,19 +88,19 @@ class LagrangeBasis : public BasisFunction {
     LagrangeBasis(ElementType type, int order);
 
     /// \copydoc BasisFunction::basis_type()
-    BasisType basis_type() const noexcept override { return BasisType::Lagrange; }
+    BasisType basis_type() const noexcept final { return BasisType::Lagrange; }
 
     /// \copydoc BasisFunction::element_type()
-    ElementType element_type() const noexcept override { return element_type_; }
+    ElementType element_type() const noexcept final { return element_type_; }
 
     /// \copydoc BasisFunction::dimension()
-    int dimension() const noexcept override { return dimension_; }
+    int dimension() const noexcept final { return dimension_; }
 
     /// \copydoc BasisFunction::order()
-    int order() const noexcept override { return order_; }
+    int order() const noexcept final { return order_; }
 
     /// \copydoc BasisFunction::size()
-    std::size_t size() const noexcept override { return nodes_.size(); }
+    std::size_t size() const noexcept final { return nodes_.size(); }
 
     /// \brief Return the reference interpolation nodes in basis ordering.
     ///
@@ -111,7 +111,7 @@ class LagrangeBasis : public BasisFunction {
     /// reference coordinates with a \f$[-1,1]\f$ through-axis coordinate.
     ///
     /// \return Reference node coordinates, one per basis function.
-    const std::vector<math::Vector<Real, 3>>& nodes() const noexcept { return nodes_; }
+    const std::vector<math::Vector<Real, 3>>& nodes() const noexcept final { return nodes_; }
 
     /// \brief Evaluate Lagrange basis function values at a reference coordinate.
     ///
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.h b/Code/Source/solver/FE/Basis/SerendipityBasis.h
index c395fcea1..7cab0d4fd 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.h
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.h
@@ -100,7 +100,7 @@ class SerendipityBasis final : public BasisFunction {
     /// ReferenceNodeLayout.
     ///
     /// \return Reference node coordinates, one per basis function.
-    const std::vector<math::Vector<Real, 3>>& nodes() const noexcept { return nodes_; }
+    const std::vector<math::Vector<Real, 3>>& nodes() const noexcept final { return nodes_; }
 
     /// \brief Evaluate serendipity basis function values at a reference coordinate.
     ///

From bcf2b3d270e15b8af97c72184f56977a3f383580 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 17 Jun 2026 14:44:50 -0700
Subject: [PATCH 30/91] Deleted the unused size-family traits and their tests
 for LagrangeBasis

---
 Code/Source/solver/FE/Basis/BasisTraits.h     | 46 -------------------
 .../FE/Basis/test_ConstexprBasis.cpp          |  7 ---
 2 files changed, 53 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisTraits.h b/Code/Source/solver/FE/Basis/BasisTraits.h
index eca5c1c69..861c7c83d 100644
--- a/Code/Source/solver/FE/Basis/BasisTraits.h
+++ b/Code/Source/solver/FE/Basis/BasisTraits.h
@@ -174,52 +174,6 @@ namespace detail {
     }
 }
 
-[[nodiscard]] constexpr std::size_t line_lagrange_size(int order) noexcept {
-    return order >= 0 ? static_cast<std::size_t>(order + 1) : 0u;
-}
-
-[[nodiscard]] constexpr std::size_t triangle_lagrange_size(int order) noexcept {
-    return order >= 0 ? static_cast<std::size_t>((order + 1) * (order + 2) / 2) : 0u;
-}
-
-[[nodiscard]] constexpr std::size_t quad_lagrange_size(int order) noexcept {
-    return order >= 0 ? static_cast<std::size_t>((order + 1) * (order + 1)) : 0u;
-}
-
-[[nodiscard]] constexpr std::size_t tetra_lagrange_size(int order) noexcept {
-    return order >= 0 ? static_cast<std::size_t>((order + 1) * (order + 2) * (order + 3) / 6) : 0u;
-}
-
-[[nodiscard]] constexpr std::size_t hex_lagrange_size(int order) noexcept {
-    return order >= 0 ? static_cast<std::size_t>((order + 1) * (order + 1) * (order + 1)) : 0u;
-}
-
-[[nodiscard]] constexpr std::size_t wedge_lagrange_size(int order) noexcept {
-    return triangle_lagrange_size(order) * line_lagrange_size(order);
-}
-
-[[nodiscard]] constexpr std::size_t complete_lagrange_alias_size(ElementType type) noexcept {
-    const int order = complete_lagrange_alias_order(type);
-    switch (canonical_lagrange_type(type)) {
-        case ElementType::Point1:
-            return 1u;
-        case ElementType::Line2:
-            return line_lagrange_size(order);
-        case ElementType::Triangle3:
-            return triangle_lagrange_size(order);
-        case ElementType::Quad4:
-            return quad_lagrange_size(order);
-        case ElementType::Tetra4:
-            return tetra_lagrange_size(order);
-        case ElementType::Hex8:
-            return hex_lagrange_size(order);
-        case ElementType::Wedge6:
-            return wedge_lagrange_size(order);
-        default:
-            return 0u;
-    }
-}
-
 } // namespace basis
 } // namespace FE
 } // namespace svmp
diff --git a/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp b/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
index 44e588fdc..31b95eb77 100644
--- a/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
@@ -39,13 +39,6 @@ static_assert(canonical_lagrange_type(ElementType::Hex27) == ElementType::Hex8);
 static_assert(canonical_lagrange_type(ElementType::Pyramid13) == ElementType::Pyramid13);
 static_assert(complete_lagrange_alias_order(ElementType::Wedge18) == 2);
 static_assert(complete_lagrange_alias_order(ElementType::Pyramid14) == -1);
-static_assert(line_lagrange_size(2) == 3u);
-static_assert(triangle_lagrange_size(2) == 6u);
-static_assert(quad_lagrange_size(2) == 9u);
-static_assert(tetra_lagrange_size(2) == 10u);
-static_assert(hex_lagrange_size(2) == 27u);
-static_assert(wedge_lagrange_size(2) == 18u);
-static_assert(complete_lagrange_alias_size(ElementType::Pyramid14) == 0u);
 static_assert(detail::basis_abs(Real(-2)) == Real(2));
 static_assert(detail::basis_max(Real(2), Real(3)) == Real(3));
 static_assert(detail::basis_near_zero(std::numeric_limits<Real>::epsilon() * Real(32)));

From 57d79b47bf4019417b7c12aefa2624bb87abbdba Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 17 Jun 2026 15:26:57 -0700
Subject: [PATCH 31/91] defaulting BasisRequest's enum fields, swapping the
 shared thread-local evaluation scratch for buffers and validating Lagrange
 node coordinates against the reference lattice

---
 Code/Source/solver/FE/Basis/BasisFactory.h    |  4 ++--
 Code/Source/solver/FE/Basis/BasisFunction.cpp | 20 +++--------------
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp | 22 +++++++++++++++++--
 3 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisFactory.h b/Code/Source/solver/FE/Basis/BasisFactory.h
index 3922d5ced..b14cc5501 100644
--- a/Code/Source/solver/FE/Basis/BasisFactory.h
+++ b/Code/Source/solver/FE/Basis/BasisFactory.h
@@ -20,8 +20,8 @@ namespace FE {
 namespace basis {
 
 struct BasisRequest {
-    ElementType element_type;
-    BasisType basis_type;
+    ElementType element_type{ElementType::Unknown};
+    BasisType basis_type{BasisType::Lagrange};
     std::optional<int> order{};
     Continuity continuity{Continuity::C0};
     FieldType field_type{FieldType::Scalar};
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.cpp b/Code/Source/solver/FE/Basis/BasisFunction.cpp
index 56a9238c9..d10be7133 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFunction.cpp
@@ -12,17 +12,6 @@ namespace basis {
 
 namespace {
 
-struct BasisFunctionScratch {
-    std::vector<Real> values;
-    std::vector<Gradient> gradients;
-    std::vector<Hessian> hessians;
-};
-
-BasisFunctionScratch& scratch() {
-    static thread_local BasisFunctionScratch data;
-    return data;
-}
-
 void require_span_size(std::size_t actual,
                        std::size_t expected,
                        const char* label) {
@@ -67,8 +56,7 @@ void BasisFunction::evaluate_all(const math::Vector<Real, 3>& xi,
 void BasisFunction::evaluate_values_to(const math::Vector<Real, 3>& xi,
                                        std::span<Real> values_out) const {
     require_span_size(values_out.size(), size(), "evaluate_values_to");
-    auto& tmp = scratch().values;
-    tmp.resize(size());
+    std::vector<Real> tmp(size());
     evaluate_values(xi, tmp);
     std::copy_n(tmp.begin(), tmp.size(), values_out.begin());
 }
@@ -76,8 +64,7 @@ void BasisFunction::evaluate_values_to(const math::Vector<Real, 3>& xi,
 void BasisFunction::evaluate_gradients_to(const math::Vector<Real, 3>& xi,
                                           std::span<Gradient> gradients_out) const {
     require_span_size(gradients_out.size(), size(), "evaluate_gradients_to");
-    auto& tmp = scratch().gradients;
-    tmp.resize(size());
+    std::vector<Gradient> tmp(size());
     evaluate_gradients(xi, tmp);
     std::copy_n(tmp.begin(), tmp.size(), gradients_out.begin());
 }
@@ -85,8 +72,7 @@ void BasisFunction::evaluate_gradients_to(const math::Vector<Real, 3>& xi,
 void BasisFunction::evaluate_hessians_to(const math::Vector<Real, 3>& xi,
                                          std::span<Hessian> hessians_out) const {
     require_span_size(hessians_out.size(), size(), "evaluate_hessians_to");
-    auto& tmp = scratch().hessians;
-    tmp.resize(size());
+    std::vector<Hessian> tmp(size());
     evaluate_hessians(xi, tmp);
     std::copy_n(tmp.begin(), tmp.size(), hessians_out.begin());
 }
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index ab5e73ac7..b368ec47f 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -92,7 +92,13 @@ std::size_t axis_index_pm_one(Real coord, int order) {
         return 0u;
     }
     const Real scaled = (coord + Real(1)) * Real(order) / Real(2);
-    return static_cast<std::size_t>(std::llround(scaled));
+    const long long rounded = std::llround(scaled);
+    FE::throw_if<BasisConstructionException>(
+        rounded < 0 || rounded > static_cast<long long>(order) ||
+            !detail::basis_nearly_equal(scaled, static_cast<Real>(rounded)),
+        SVMP_HERE,
+        "LagrangeBasis: tensor-product node coordinate is off the equispaced lattice");
+    return static_cast<std::size_t>(rounded);
 }
 
 // Convert a simplex barycentric coordinate to a lattice index.
@@ -100,7 +106,14 @@ int simplex_lattice_index(Real value, int order) {
     if (order <= 0) {
         return 0;
     }
-    return static_cast<int>(std::llround(value * Real(order)));
+    const Real scaled = value * Real(order);
+    const long long rounded = std::llround(scaled);
+    FE::throw_if<BasisConstructionException>(
+        rounded < 0 || rounded > static_cast<long long>(order) ||
+            !detail::basis_nearly_equal(scaled, static_cast<Real>(rounded)),
+        SVMP_HERE,
+        "LagrangeBasis: simplex node coordinate is off the lattice");
+    return static_cast<int>(rounded);
 }
 
 // Compute simplex interpolation exponents from a reference node.
@@ -121,6 +134,11 @@ LagrangeBasis::SimplexExponent simplex_exponent_from_point(const Vec3& p,
         e[3] = simplex_lattice_index(p[2], order);
         e[0] = order - e[1] - e[2] - e[3];
     }
+    // e[0] is order minus the other exponents, so the exponents sum to order by
+    // construction; a negative e[0] means the node coordinates are off-lattice.
+    FE::throw_if<BasisConstructionException>(
+        e[0] < 0, SVMP_HERE,
+        "LagrangeBasis: simplex node coordinate yields a negative implied exponent");
     return e;
 }
 

From 6eaa60c29d2b58e30ea7a694e788fc19e1372e04 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 17 Jun 2026 15:43:25 -0700
Subject: [PATCH 32/91] fixing the Hex8 Hessian-delegation doc, renaming the
 serendipity test, stopped the LU factorization from reporting pivot
 magnitudes as SVD singular-value/condition diagnostics

---
 Code/Source/solver/FE/Basis/SerendipityBasis.h  |  6 +++---
 .../solver/FE/Math/DenseLinearAlgebra.cpp       | 17 +++++++----------
 Code/Source/solver/FE/Math/DenseLinearAlgebra.h |  2 ++
 ...ensorModal.cpp => test_SerendipityBasis.cpp} |  4 ++--
 4 files changed, 14 insertions(+), 15 deletions(-)
 rename tests/unitTests/FE/Basis/{test_SerendipityTensorModal.cpp => test_SerendipityBasis.cpp} (98%)

diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.h b/Code/Source/solver/FE/Basis/SerendipityBasis.h
index 7cab0d4fd..95a9d0ad9 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.h
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.h
@@ -134,9 +134,9 @@ class SerendipityBasis final : public BasisFunction {
     /// \details Hessians are second derivatives in reference coordinates and
     /// are stored as 3-by-3 matrices. Quadrilateral Hessians use second
     /// derivatives of the monomial vector and inverse Vandermonde coefficients.
-    /// Hex8 Hessians are delegated to the linear Lagrange Hex8 basis. Hex20 and
-    /// Wedge15 Hessians are computed by differentiating their polynomial
-    /// coefficient tables twice.
+    /// Hex8 Hessians are computed directly from the trilinear corner products.
+    /// Hex20 and Wedge15 Hessians are computed by differentiating their
+    /// polynomial coefficient tables twice.
     ///
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
     /// \param hessians Receives one 3-by-3 Hessian per basis function.
diff --git a/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp b/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
index c5ed67bec..0260ad2b1 100644
--- a/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
+++ b/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
@@ -185,15 +185,13 @@ DenseLUSolver factor_dense_matrix(std::vector<Real> matrix,
         min_pivot_abs = std::min(min_pivot_abs, pivot_magnitude);
     }
 
+    // PartialPivLU is not rank-revealing, so expose only what the pivots
+    // legitimately convey: the factorization passed the pivot-tolerance check
+    // above (full rank) and the pivot magnitudes.
     solver.diagnostics.rank = n;
     solver.diagnostics.tolerance = solver.pivot_tolerance;
-    solver.diagnostics.largest_singular_value = max_abs;
-    solver.diagnostics.smallest_retained_singular_value =
-        std::isfinite(min_pivot_abs) ? min_pivot_abs : Real(0);
-    if (solver.diagnostics.smallest_retained_singular_value > Real(0)) {
-        solver.diagnostics.condition_estimate =
-            max_pivot_abs / solver.diagnostics.smallest_retained_singular_value;
-    }
+    solver.max_pivot = max_pivot_abs;
+    solver.min_pivot = std::isfinite(min_pivot_abs) ? min_pivot_abs : Real(0);
     return solver;
 }
 
@@ -213,8 +211,7 @@ DenseInverseResult invert_dense_matrix_with_diagnostics(
         dense_matrix_diagnostics(std::span<const Real>(matrix.data(), matrix.size()),
                                  n, n, label);
 
-    if (std::isfinite(solver.diagnostics.condition_estimate) &&
-        std::isfinite(result.diagnostics.condition_estimate) &&
+    if (std::isfinite(result.diagnostics.condition_estimate) &&
         result.diagnostics.condition_estimate > dense_matrix_condition_fallback_threshold()) {
         const DenseMatrix dense = map_row_major(matrix, n, n);
         Eigen::JacobiSVD<DenseMatrix> svd(dense,
@@ -224,7 +221,7 @@ DenseInverseResult invert_dense_matrix_with_diagnostics(
         const auto& singular_values = svd.singularValues();
         for (Eigen::Index i = 0; i < singular_values.size(); ++i) {
             ::svmp::FE::check_arg<FEException>(
-                singular_values[i] > solver.diagnostics.tolerance, SVMP_HERE,
+                singular_values[i] > result.diagnostics.tolerance, SVMP_HERE,
                 std::string(label) + ": high-condition SVD fallback encountered a dropped singular value");
             sigma_inverse(i, i) = Real(1) / singular_values[i];
         }
diff --git a/Code/Source/solver/FE/Math/DenseLinearAlgebra.h b/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
index d322ef958..08669de74 100644
--- a/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
+++ b/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
@@ -65,6 +65,8 @@ struct DenseLUSolver {
     Eigen::PartialPivLU<DenseMatrix> lu;
     DenseMatrixDiagnostics diagnostics;
     Real pivot_tolerance{0};
+    Real min_pivot{0};
+    Real max_pivot{0};
     std::string label;
 
     [[nodiscard]] bool empty() const noexcept { return n == 0; }
diff --git a/tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
similarity index 98%
rename from tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp
rename to tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
index 12062e4f6..d44631734 100644
--- a/tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp
+++ b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
@@ -1,6 +1,6 @@
 /**
- * @file test_SerendipityTensorModal.cpp
- * @brief Tests for the migrated Serendipity basis subset.
+ * @file test_SerendipityBasis.cpp
+ * @brief Nodal-delta, partition-of-unity, and polynomial-reproduction tests for SerendipityBasis.
  */
 
 #include <gtest/gtest.h>

From d4d3c4ac17eb81f0546d47893ef8c48cc64486ef Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 17 Jun 2026 18:38:32 -0700
Subject: [PATCH 33/91] deriving the basis topology() and shape predicates from
 the single mesh cell-family classification

---
 Code/Source/solver/FE/Basis/BasisTraits.h | 67 +++++++++++------------
 1 file changed, 32 insertions(+), 35 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisTraits.h b/Code/Source/solver/FE/Basis/BasisTraits.h
index 861c7c83d..184aff041 100644
--- a/Code/Source/solver/FE/Basis/BasisTraits.h
+++ b/Code/Source/solver/FE/Basis/BasisTraits.h
@@ -55,37 +55,59 @@ namespace detail {
 
 } // namespace detail
 
+// Reference-cell topology is derived from the single mesh cell-family
+// classification (to_mesh_family) so the basis layer never maintains a parallel
+// ElementType->shape switch; adding an ElementType updates only to_mesh_family.
+// ElementType::Unknown must stay Unknown here: CellFamily has no "unknown"
+// member, so to_mesh_family() falls back to Point for unrecognized types.
+[[nodiscard]] constexpr BasisTopology topology(ElementType type) noexcept {
+    if (type == ElementType::Unknown) {
+        return BasisTopology::Unknown;
+    }
+    switch (to_mesh_family(type)) {
+        case CellFamily::Point:    return BasisTopology::Point;
+        case CellFamily::Line:     return BasisTopology::Line;
+        case CellFamily::Triangle: return BasisTopology::Triangle;
+        case CellFamily::Quad:     return BasisTopology::Quadrilateral;
+        case CellFamily::Tetra:    return BasisTopology::Tetrahedron;
+        case CellFamily::Hex:      return BasisTopology::Hexahedron;
+        case CellFamily::Wedge:    return BasisTopology::Wedge;
+        // Pyramid/Polygon/Polyhedron are outside the current basis scope.
+        default:                   return BasisTopology::Unknown;
+    }
+}
+
+// The shape predicates derive from topology() so they share its single source.
 [[nodiscard]] constexpr bool is_point(ElementType type) noexcept {
-    return type == ElementType::Point1;
+    return topology(type) == BasisTopology::Point;
 }
 
 [[nodiscard]] constexpr bool is_line(ElementType type) noexcept {
-    return type == ElementType::Line2 || type == ElementType::Line3;
+    return topology(type) == BasisTopology::Line;
 }
 
 [[nodiscard]] constexpr bool is_triangle(ElementType type) noexcept {
-    return type == ElementType::Triangle3 || type == ElementType::Triangle6;
+    return topology(type) == BasisTopology::Triangle;
 }
 
 [[nodiscard]] constexpr bool is_quadrilateral(ElementType type) noexcept {
-    return type == ElementType::Quad4 || type == ElementType::Quad8 ||
-           type == ElementType::Quad9;
+    return topology(type) == BasisTopology::Quadrilateral;
 }
 
 [[nodiscard]] constexpr bool is_tetrahedron(ElementType type) noexcept {
-    return type == ElementType::Tetra4 || type == ElementType::Tetra10;
+    return topology(type) == BasisTopology::Tetrahedron;
 }
 
 [[nodiscard]] constexpr bool is_hexahedron(ElementType type) noexcept {
-    return type == ElementType::Hex8 || type == ElementType::Hex20 ||
-           type == ElementType::Hex27;
+    return topology(type) == BasisTopology::Hexahedron;
 }
 
 [[nodiscard]] constexpr bool is_wedge(ElementType type) noexcept {
-    return type == ElementType::Wedge6 || type == ElementType::Wedge15 ||
-           type == ElementType::Wedge18;
+    return topology(type) == BasisTopology::Wedge;
 }
 
+// Pyramids are outside the current basis scope, so topology() maps them to
+// Unknown and there is no BasisTopology::Pyramid to test against here.
 [[nodiscard]] constexpr bool is_pyramid(ElementType type) noexcept {
     (void)type;
     return false;
@@ -103,31 +125,6 @@ namespace detail {
     return element_dimension(type);
 }
 
-[[nodiscard]] constexpr BasisTopology topology(ElementType type) noexcept {
-    if (is_point(type)) {
-        return BasisTopology::Point;
-    }
-    if (is_line(type)) {
-        return BasisTopology::Line;
-    }
-    if (is_triangle(type)) {
-        return BasisTopology::Triangle;
-    }
-    if (is_quadrilateral(type)) {
-        return BasisTopology::Quadrilateral;
-    }
-    if (is_tetrahedron(type)) {
-        return BasisTopology::Tetrahedron;
-    }
-    if (is_hexahedron(type)) {
-        return BasisTopology::Hexahedron;
-    }
-    if (is_wedge(type)) {
-        return BasisTopology::Wedge;
-    }
-    return BasisTopology::Unknown;
-}
-
 [[nodiscard]] constexpr ElementType canonical_lagrange_type(ElementType type) noexcept {
     switch (type) {
         case ElementType::Line2:

From 03a69be3724820d79afac2db1ffc154b2dcd0cb0 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 17 Jun 2026 23:49:53 -0700
Subject: [PATCH 34/91] consolidating span-size checks and
 equispaced-coordinate formula

---
 Code/Source/solver/FE/Basis/BasisFunction.cpp | 12 ++--
 Code/Source/solver/FE/Basis/BasisFunction.h   | 16 +++++
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp | 59 ++++++-------------
 .../FE/Basis/NodeOrderingConventions.cpp      |  7 ---
 .../solver/FE/Basis/NodeOrderingConventions.h | 11 ++++
 .../solver/FE/Basis/SerendipityBasis.cpp      | 22 +------
 6 files changed, 52 insertions(+), 75 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisFunction.cpp b/Code/Source/solver/FE/Basis/BasisFunction.cpp
index d10be7133..a349b4f80 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFunction.cpp
@@ -10,17 +10,13 @@ namespace svmp {
 namespace FE {
 namespace basis {
 
-namespace {
-
 void require_span_size(std::size_t actual,
                        std::size_t expected,
                        const char* label) {
     FE::throw_if<BasisEvaluationException>(actual < expected, SVMP_HERE,
-        std::string("BasisFunction::") + label + ": output span is smaller than basis size");
+        std::string(label) + ": output span is smaller than basis size");
 }
 
-} // namespace
-
 const std::vector<math::Vector<Real, 3>>& BasisFunction::nodes() const noexcept {
     // Default for bases that do not expose interpolation nodes; nodal families
     // (LagrangeBasis, SerendipityBasis) override this to return their layout.
@@ -55,7 +51,7 @@ void BasisFunction::evaluate_all(const math::Vector<Real, 3>& xi,
 
 void BasisFunction::evaluate_values_to(const math::Vector<Real, 3>& xi,
                                        std::span<Real> values_out) const {
-    require_span_size(values_out.size(), size(), "evaluate_values_to");
+    require_span_size(values_out.size(), size(), "BasisFunction::evaluate_values_to");
     std::vector<Real> tmp(size());
     evaluate_values(xi, tmp);
     std::copy_n(tmp.begin(), tmp.size(), values_out.begin());
@@ -63,7 +59,7 @@ void BasisFunction::evaluate_values_to(const math::Vector<Real, 3>& xi,
 
 void BasisFunction::evaluate_gradients_to(const math::Vector<Real, 3>& xi,
                                           std::span<Gradient> gradients_out) const {
-    require_span_size(gradients_out.size(), size(), "evaluate_gradients_to");
+    require_span_size(gradients_out.size(), size(), "BasisFunction::evaluate_gradients_to");
     std::vector<Gradient> tmp(size());
     evaluate_gradients(xi, tmp);
     std::copy_n(tmp.begin(), tmp.size(), gradients_out.begin());
@@ -71,7 +67,7 @@ void BasisFunction::evaluate_gradients_to(const math::Vector<Real, 3>& xi,
 
 void BasisFunction::evaluate_hessians_to(const math::Vector<Real, 3>& xi,
                                          std::span<Hessian> hessians_out) const {
-    require_span_size(hessians_out.size(), size(), "evaluate_hessians_to");
+    require_span_size(hessians_out.size(), size(), "BasisFunction::evaluate_hessians_to");
     std::vector<Hessian> tmp(size());
     evaluate_hessians(xi, tmp);
     std::copy_n(tmp.begin(), tmp.size(), hessians_out.begin());
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index 0cffe52a9..77e5dc2f8 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -159,6 +159,22 @@ inline void add_scaled_hessian(Hessian& target,
     }
 }
 
+/// \brief Throw BasisEvaluationException when an output span is smaller than the
+/// basis size. \p label is the full "Class::method" context used in the message,
+/// so each basis family passes its own qualified name.
+void require_span_size(std::size_t actual, std::size_t expected, const char* label);
+
+/// \brief Check a requested output span unless it is empty, following the
+/// "skip this output" convention used by the combined evaluators.
+template <typename T>
+void require_requested_span_size(std::span<T> output,
+                                 std::size_t expected,
+                                 const char* label) {
+    if (!output.empty()) {
+        require_span_size(output.size(), expected, label);
+    }
+}
+
 /// \brief Abstract interface for finite-element basis-function families.
 /// \ingroup FE_Basis
 ///
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index b368ec47f..fdc6d912b 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -19,14 +19,6 @@ namespace {
 
 using Vec3 = math::Vector<Real, 3>;
 
-// Return the equispaced 1D reference coordinate in [-1, 1].
-inline constexpr Real equispaced_pm_one_coord(int i, int order) {
-    if (order <= 0) {
-        return Real(0);
-    }
-    return Real(-1) + Real(2) * static_cast<Real>(i) / static_cast<Real>(order);
-}
-
 struct AxisEval {
     std::vector<Real> value;
     std::vector<Real> first;
@@ -53,20 +45,16 @@ BasisTopology supported_lagrange_topology(ElementType type) {
 }
 
 // Normalize named higher-order element requests to base Lagrange topologies.
+//
+// This function only adds the LagrangeBasis routing policy: serendipity
+// layouts and pyramids are rejected, and a named quadratic alias
+// (Line3, Triangle6, Quad9, Tetra10, Hex27, Wedge18) is floored
+// to at least quadratic order. The floor only raises the
+// order: a higher requested order on an alias is honored, so
+// LagrangeBasis(Hex27, 5) yields an order-5 basis on the Hex8 topology rather
+// than rejecting the over-specified order.
 NormalizedLagrangeRequest normalize_lagrange_request(ElementType element_type, int order) {
     switch (element_type) {
-        case ElementType::Line3:
-            return {ElementType::Line2, std::max(order, 2)};
-        case ElementType::Triangle6:
-            return {ElementType::Triangle3, std::max(order, 2)};
-        case ElementType::Quad9:
-            return {ElementType::Quad4, std::max(order, 2)};
-        case ElementType::Tetra10:
-            return {ElementType::Tetra4, std::max(order, 2)};
-        case ElementType::Hex27:
-            return {ElementType::Hex8, std::max(order, 2)};
-        case ElementType::Wedge18:
-            return {ElementType::Wedge6, std::max(order, 2)};
         case ElementType::Quad8:
             FE::raise<BasisElementCompatibilityException>(SVMP_HERE,
                 "LagrangeBasis: Quad8 is serendipity; use SerendipityBasis");
@@ -82,8 +70,13 @@ NormalizedLagrangeRequest normalize_lagrange_request(ElementType element_type, i
             FE::raise<BasisElementCompatibilityException>(SVMP_HERE,
                 "LagrangeBasis: pyramid support is not within the current solver basis scope");
         default:
-            return {element_type, order};
+            break;
     }
+
+    const ElementType canonical = canonical_lagrange_type(element_type);
+    const bool is_quadratic_alias = canonical != element_type;
+    const int normalized_order = is_quadratic_alias ? std::max(order, 2) : order;
+    return {canonical, normalized_order};
 }
 
 // Convert a coordinate on [-1, 1] to an equispaced axis node index.
@@ -304,22 +297,6 @@ void evaluate_simplex(const Vec3& xi,
     }
 }
 
-void require_output_span_size(std::size_t actual,
-                              std::size_t expected,
-                              const char* label) {
-    FE::throw_if<BasisEvaluationException>(actual < expected, SVMP_HERE,
-        std::string(label) + ": output span is smaller than basis size");
-}
-
-template<typename T>
-void require_requested_span_size(std::span<T> output,
-                                 std::size_t expected,
-                                 const char* label) {
-    if (!output.empty()) {
-        require_output_span_size(output.size(), expected, label);
-    }
-}
-
 } // namespace
 
 LagrangeBasis::LagrangeBasis(ElementType type, int order)
@@ -340,7 +317,7 @@ void LagrangeBasis::init_equispaced_1d_nodes() {
     nodes_1d_.resize(static_cast<std::size_t>(order_ + 1));
     for (int i = 0; i <= order_; ++i) {
         nodes_1d_[static_cast<std::size_t>(i)] =
-            equispaced_pm_one_coord(i, order_);
+            line_coord_pm_one(i, order_);
     }
 }
 
@@ -633,19 +610,19 @@ void LagrangeBasis::evaluate_all(const Vec3& xi,
 
 void LagrangeBasis::evaluate_values_to(const Vec3& xi,
                                        std::span<Real> values_out) const {
-    require_output_span_size(values_out.size(), size(), "LagrangeBasis::evaluate_values_to");
+    require_span_size(values_out.size(), size(), "LagrangeBasis::evaluate_values_to");
     evaluate_all_to(xi, values_out, std::span<Gradient>{}, std::span<Hessian>{});
 }
 
 void LagrangeBasis::evaluate_gradients_to(const Vec3& xi,
                                           std::span<Gradient> gradients_out) const {
-    require_output_span_size(gradients_out.size(), size(), "LagrangeBasis::evaluate_gradients_to");
+    require_span_size(gradients_out.size(), size(), "LagrangeBasis::evaluate_gradients_to");
     evaluate_all_to(xi, std::span<Real>{}, gradients_out, std::span<Hessian>{});
 }
 
 void LagrangeBasis::evaluate_hessians_to(const Vec3& xi,
                                          std::span<Hessian> hessians_out) const {
-    require_output_span_size(hessians_out.size(), size(), "LagrangeBasis::evaluate_hessians_to");
+    require_span_size(hessians_out.size(), size(), "LagrangeBasis::evaluate_hessians_to");
     evaluate_all_to(xi, std::span<Real>{}, std::span<Gradient>{}, hessians_out);
 }
 
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
index 946bc6f59..081637dc1 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
@@ -22,13 +22,6 @@ constexpr std::array<std::size_t, 20> kHex20MeshToBasisOrder = {
     16, 17, 19, 18
 };
 
-Real line_coord_pm_one(int i, int order) {
-    if (order <= 0) {
-        return Real(0);
-    }
-    return Real(-1) + Real(2) * static_cast<Real>(i) / static_cast<Real>(order);
-}
-
 Real line_coord_zero_one(int i, int order) {
     if (order <= 0) {
         return Real(0);
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
index 4b11cca32..4f9d1525a 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
@@ -15,6 +15,17 @@ namespace svmp {
 namespace FE {
 namespace basis {
 
+/// \brief Equispaced 1D reference coordinate on [-1, 1]: -1 + 2 i / order.
+///
+/// Shared by the reference-node layout generators and the Lagrange tensor-axis
+/// node initialization so the lattice formula lives in a single place.
+[[nodiscard]] inline constexpr Real line_coord_pm_one(int i, int order) noexcept {
+    if (order <= 0) {
+        return Real(0);
+    }
+    return Real(-1) + Real(2) * static_cast<Real>(i) / static_cast<Real>(order);
+}
+
 class ReferenceNodeLayout {
 public:
     static math::Vector<Real, 3> get_node_coords(ElementType elem_type,
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index 3ad253696..6c9a66efd 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -364,22 +364,6 @@ void eval_monomial_basis(Real r, Real s, Real t,
     }
 }
 
-void require_output_span_size(std::size_t actual,
-                              std::size_t expected,
-                              const char* label) {
-    FE::throw_if<BasisEvaluationException>(actual < expected, SVMP_HERE,
-        std::string(label) + ": output span is smaller than basis size");
-}
-
-template<typename T>
-void require_requested_span_size(std::span<T> output,
-                                 std::size_t expected,
-                                 const char* label) {
-    if (!output.empty()) {
-        require_output_span_size(output.size(), expected, label);
-    }
-}
-
 } // namespace
 
 SerendipityBasis::SerendipityBasis(ElementType type, int order)
@@ -553,19 +537,19 @@ void SerendipityBasis::evaluate_all(const math::Vector<Real, 3>& xi,
 
 void SerendipityBasis::evaluate_values_to(const math::Vector<Real, 3>& xi,
                                           std::span<Real> values_out) const {
-    require_output_span_size(values_out.size(), size_, "SerendipityBasis::evaluate_values_to");
+    require_span_size(values_out.size(), size_, "SerendipityBasis::evaluate_values_to");
     evaluate_all_to(xi, values_out, std::span<Gradient>{}, std::span<Hessian>{});
 }
 
 void SerendipityBasis::evaluate_gradients_to(const math::Vector<Real, 3>& xi,
                                              std::span<Gradient> gradients_out) const {
-    require_output_span_size(gradients_out.size(), size_, "SerendipityBasis::evaluate_gradients_to");
+    require_span_size(gradients_out.size(), size_, "SerendipityBasis::evaluate_gradients_to");
     evaluate_all_to(xi, std::span<Real>{}, gradients_out, std::span<Hessian>{});
 }
 
 void SerendipityBasis::evaluate_hessians_to(const math::Vector<Real, 3>& xi,
                                             std::span<Hessian> hessians_out) const {
-    require_output_span_size(hessians_out.size(), size_, "SerendipityBasis::evaluate_hessians_to");
+    require_span_size(hessians_out.size(), size_, "SerendipityBasis::evaluate_hessians_to");
     evaluate_all_to(xi, std::span<Real>{}, std::span<Gradient>{}, hessians_out);
 }
 

From 50b00c59503e5972e90d39bc560afbb94ce81396 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Thu, 18 Jun 2026 00:25:09 -0700
Subject: [PATCH 35/91] removing add_scaled_hessian from the abstract
 basisfunction file and removing the pyramid element type since we are
 currently not supporting it

---
 Code/Source/solver/FE/Basis/BasisFunction.h      | 10 ----------
 Code/Source/solver/FE/Basis/BasisTraits.h        |  9 +++++----
 tests/unitTests/FE/Basis/test_ConstexprBasis.cpp |  4 ++--
 3 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index 77e5dc2f8..6fd3c68a2 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -149,16 +149,6 @@ using Hessian  = math::Matrix<Real, 3, 3>;
     return hessian;
 }
 
-inline void add_scaled_hessian(Hessian& target,
-                               const Hessian& source,
-                               Real scale) noexcept {
-    for (std::size_t r = 0; r < 3u; ++r) {
-        for (std::size_t c = 0; c < 3u; ++c) {
-            target(r, c) += scale * source(r, c);
-        }
-    }
-}
-
 /// \brief Throw BasisEvaluationException when an output span is smaller than the
 /// basis size. \p label is the full "Class::method" context used in the message,
 /// so each basis family passes its own qualified name.
diff --git a/Code/Source/solver/FE/Basis/BasisTraits.h b/Code/Source/solver/FE/Basis/BasisTraits.h
index 184aff041..484e7c588 100644
--- a/Code/Source/solver/FE/Basis/BasisTraits.h
+++ b/Code/Source/solver/FE/Basis/BasisTraits.h
@@ -106,11 +106,12 @@ namespace detail {
     return topology(type) == BasisTopology::Wedge;
 }
 
-// Pyramids are outside the current basis scope, so topology() maps them to
-// Unknown and there is no BasisTopology::Pyramid to test against here.
+// Pyramids are a valid mesh cell family but not a supported basis topology, so
+// this classifier reads the mesh family directly: topology() maps pyramids to
+// Unknown, yet a truthful is_pyramid keeps the predicate set complete and ready
+// for future pyramid support.
 [[nodiscard]] constexpr bool is_pyramid(ElementType type) noexcept {
-    (void)type;
-    return false;
+    return to_mesh_family(type) == CellFamily::Pyramid;
 }
 
 [[nodiscard]] constexpr bool is_simplex(ElementType type) noexcept {
diff --git a/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp b/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
index 31b95eb77..e2a7bfb6d 100644
--- a/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
@@ -25,8 +25,8 @@ static_assert(is_quadrilateral(ElementType::Quad8));
 static_assert(is_tetrahedron(ElementType::Tetra10));
 static_assert(is_hexahedron(ElementType::Hex20));
 static_assert(is_wedge(ElementType::Wedge18));
-static_assert(!is_pyramid(ElementType::Pyramid5));
-static_assert(!is_pyramid(ElementType::Pyramid14));
+static_assert(is_pyramid(ElementType::Pyramid5));
+static_assert(is_pyramid(ElementType::Pyramid14));
 static_assert(is_simplex(ElementType::Triangle3));
 static_assert(is_simplex(ElementType::Tetra4));
 static_assert(!is_simplex(ElementType::Wedge6));

From 4f7cb90ae515ec71948fa394392721e68bb8b7fe Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Thu, 18 Jun 2026 00:44:31 -0700
Subject: [PATCH 36/91] Canonicalize exception helper wrappers in svmp::Core
 and migrate FE call sites to use the svmp::* helpers

---
 Code/Source/solver/Core/Exception.h           |  32 +++++
 Code/Source/solver/FE/Basis/BasisFactory.cpp  |  12 +-
 Code/Source/solver/FE/Basis/BasisFunction.cpp |   6 +-
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp |  24 ++--
 .../FE/Basis/NodeOrderingConventions.cpp      |  12 +-
 .../solver/FE/Basis/SerendipityBasis.cpp      |  22 ++--
 Code/Source/solver/FE/Common/FEException.h    | 123 ++----------------
 .../solver/FE/Math/DenseLinearAlgebra.cpp     |  28 ++--
 .../solver/FE/Math/DenseTransformKernels.h    |  10 +-
 Code/Source/solver/cep_ion.cpp                |   2 +-
 Code/Source/solver/fs.cpp                     |   2 +-
 Code/Source/solver/ionic_model.cpp            |   8 +-
 Code/Source/solver/mat_fun.h                  |   2 +-
 Code/Source/solver/nn.cpp                     |  56 ++++----
 Code/Source/solver/post.cpp                   |   4 +-
 Code/Source/solver/read_files.cpp             |   2 +-
 .../FE/Basis/test_BasisErrorPaths.cpp         |  33 ++---
 17 files changed, 155 insertions(+), 223 deletions(-)

diff --git a/Code/Source/solver/Core/Exception.h b/Code/Source/solver/Core/Exception.h
index 80e6968ec..2e7e48d6f 100644
--- a/Code/Source/solver/Core/Exception.h
+++ b/Code/Source/solver/Core/Exception.h
@@ -397,6 +397,38 @@ void check_not_null(PointerT ptr, SourceLocation location, Args&&... args)
     }
 }
 
+template <class ExceptionT, class... Args>
+void throw_if(bool condition, SourceLocation location, Args&&... args)
+{
+    if (condition) {
+        raise<ExceptionT>(location, std::forward<Args>(args)...);
+    }
+}
+
+template <class ExceptionT, class IndexT, class SizeT>
+void check_index(IndexT index, SizeT size, SourceLocation location)
+{
+    const long long index_value = static_cast<long long>(index);
+    const long long size_value = static_cast<long long>(size);
+    check_arg<ExceptionT>(
+        index_value >= 0 && index_value < size_value,
+        location,
+        "Index " + std::to_string(index_value) + " out of bounds [0, " +
+            std::to_string(size_value) + ")");
+}
+
+template <class ExceptionT, class... Args>
+[[noreturn]] void not_implemented(SourceLocation location, Args&&... args)
+{
+    raise<ExceptionT>(location, std::forward<Args>(args)...);
+}
+
+template <class ExceptionT, class FeatureT>
+[[noreturn]] void not_implemented(FeatureT&& feature, SourceLocation location)
+{
+    raise<ExceptionT>(location, std::forward<FeatureT>(feature));
+}
+
 } // namespace svmp
 
 #define SVMP_HERE ::svmp::SourceLocation{__FILE__, __LINE__, __func__}
diff --git a/Code/Source/solver/FE/Basis/BasisFactory.cpp b/Code/Source/solver/FE/Basis/BasisFactory.cpp
index c3130d16f..22ae7183e 100644
--- a/Code/Source/solver/FE/Basis/BasisFactory.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFactory.cpp
@@ -16,18 +16,18 @@ namespace {
 int require_basis_order(const BasisRequest& req,
                         const char* missing_message,
                         const char* negative_message) {
-    FE::throw_if<BasisConfigurationException>(!req.order.has_value(), SVMP_HERE,
+    svmp::throw_if<BasisConfigurationException>(!req.order.has_value(), SVMP_HERE,
                                               missing_message);
-    FE::throw_if<BasisConfigurationException>(*req.order < 0, SVMP_HERE,
+    svmp::throw_if<BasisConfigurationException>(*req.order < 0, SVMP_HERE,
                                               negative_message);
     return *req.order;
 }
 
 void require_scalar_c0_request(const BasisRequest& req) {
-    FE::throw_if<BasisConfigurationException>(
+    svmp::throw_if<BasisConfigurationException>(
         req.field_type != FieldType::Scalar, SVMP_HERE,
         "BasisFactory: Lagrange/Serendipity bases support scalar fields only");
-    FE::throw_if<BasisConfigurationException>(
+    svmp::throw_if<BasisConfigurationException>(
         req.continuity != Continuity::C0, SVMP_HERE,
         "BasisFactory: Lagrange/Serendipity bases support C0 continuity only");
 }
@@ -61,7 +61,7 @@ std::shared_ptr<BasisFunction> create(const BasisRequest& req) {
         case BasisType::Serendipity:
             return create_serendipity(req);
         default:
-            FE::raise<BasisConfigurationException>(SVMP_HERE,
+            svmp::raise<BasisConfigurationException>(SVMP_HERE,
                 "BasisFactory: requested basis family is outside the scalar Lagrange/Serendipity scope");
     }
 }
@@ -81,7 +81,7 @@ BasisRequest default_basis_request(ElementType element_type) {
             if (order >= 0) {
                 return BasisRequest{element_type, BasisType::Lagrange, order};
             }
-            FE::raise<BasisElementCompatibilityException>(SVMP_HERE,
+            svmp::raise<BasisElementCompatibilityException>(SVMP_HERE,
                 "BasisFactory: no default basis is defined for the requested element type");
         }
     }
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.cpp b/Code/Source/solver/FE/Basis/BasisFunction.cpp
index a349b4f80..abcb5096b 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFunction.cpp
@@ -13,7 +13,7 @@ namespace basis {
 void require_span_size(std::size_t actual,
                        std::size_t expected,
                        const char* label) {
-    FE::throw_if<BasisEvaluationException>(actual < expected, SVMP_HERE,
+    svmp::throw_if<BasisEvaluationException>(actual < expected, SVMP_HERE,
         std::string(label) + ": output span is smaller than basis size");
 }
 
@@ -28,7 +28,7 @@ void BasisFunction::evaluate_gradients(const math::Vector<Real, 3>& xi,
                                        std::vector<Gradient>& gradients) const {
     (void)xi;
     (void)gradients;
-    FE::raise<BasisEvaluationException>(SVMP_HERE,
+    svmp::raise<BasisEvaluationException>(SVMP_HERE,
         "Analytic gradient evaluation is not implemented for this basis");
 }
 
@@ -36,7 +36,7 @@ void BasisFunction::evaluate_hessians(const math::Vector<Real, 3>& xi,
                                       std::vector<Hessian>& hessians) const {
     (void)xi;
     (void)hessians;
-    FE::raise<BasisEvaluationException>(SVMP_HERE,
+    svmp::raise<BasisEvaluationException>(SVMP_HERE,
         "Analytic Hessian evaluation is not implemented for this basis");
 }
 
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index fdc6d912b..26e96d4da 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -39,7 +39,7 @@ struct NormalizedLagrangeRequest {
 // Validate and return the supported basis topology for a Lagrange element type.
 BasisTopology supported_lagrange_topology(ElementType type) {
     const BasisTopology top = topology(type);
-    FE::throw_if<BasisElementCompatibilityException>(top == BasisTopology::Unknown, SVMP_HERE,
+    svmp::throw_if<BasisElementCompatibilityException>(top == BasisTopology::Unknown, SVMP_HERE,
                                                      "LagrangeBasis: unsupported element type");
     return top;
 }
@@ -56,18 +56,18 @@ BasisTopology supported_lagrange_topology(ElementType type) {
 NormalizedLagrangeRequest normalize_lagrange_request(ElementType element_type, int order) {
     switch (element_type) {
         case ElementType::Quad8:
-            FE::raise<BasisElementCompatibilityException>(SVMP_HERE,
+            svmp::raise<BasisElementCompatibilityException>(SVMP_HERE,
                 "LagrangeBasis: Quad8 is serendipity; use SerendipityBasis");
         case ElementType::Hex20:
-            FE::raise<BasisElementCompatibilityException>(SVMP_HERE,
+            svmp::raise<BasisElementCompatibilityException>(SVMP_HERE,
                 "LagrangeBasis: Hex20 is serendipity; use SerendipityBasis");
         case ElementType::Wedge15:
-            FE::raise<BasisElementCompatibilityException>(SVMP_HERE,
+            svmp::raise<BasisElementCompatibilityException>(SVMP_HERE,
                 "LagrangeBasis: Wedge15 is serendipity; use SerendipityBasis");
         case ElementType::Pyramid5:
         case ElementType::Pyramid13:
         case ElementType::Pyramid14:
-            FE::raise<BasisElementCompatibilityException>(SVMP_HERE,
+            svmp::raise<BasisElementCompatibilityException>(SVMP_HERE,
                 "LagrangeBasis: pyramid support is not within the current solver basis scope");
         default:
             break;
@@ -86,7 +86,7 @@ std::size_t axis_index_pm_one(Real coord, int order) {
     }
     const Real scaled = (coord + Real(1)) * Real(order) / Real(2);
     const long long rounded = std::llround(scaled);
-    FE::throw_if<BasisConstructionException>(
+    svmp::throw_if<BasisConstructionException>(
         rounded < 0 || rounded > static_cast<long long>(order) ||
             !detail::basis_nearly_equal(scaled, static_cast<Real>(rounded)),
         SVMP_HERE,
@@ -101,7 +101,7 @@ int simplex_lattice_index(Real value, int order) {
     }
     const Real scaled = value * Real(order);
     const long long rounded = std::llround(scaled);
-    FE::throw_if<BasisConstructionException>(
+    svmp::throw_if<BasisConstructionException>(
         rounded < 0 || rounded > static_cast<long long>(order) ||
             !detail::basis_nearly_equal(scaled, static_cast<Real>(rounded)),
         SVMP_HERE,
@@ -129,7 +129,7 @@ LagrangeBasis::SimplexExponent simplex_exponent_from_point(const Vec3& p,
     }
     // e[0] is order minus the other exponents, so the exponents sum to order by
     // construction; a negative e[0] means the node coordinates are off-lattice.
-    FE::throw_if<BasisConstructionException>(
+    svmp::throw_if<BasisConstructionException>(
         e[0] < 0, SVMP_HERE,
         "LagrangeBasis: simplex node coordinate yields a negative implied exponent");
     return e;
@@ -304,7 +304,7 @@ LagrangeBasis::LagrangeBasis(ElementType type, int order)
     const auto normalized = normalize_lagrange_request(element_type_, order_);
     element_type_ = normalized.element_type;
     order_ = normalized.order;
-    FE::throw_if<BasisConfigurationException>(order_ < 0, SVMP_HERE,
+    svmp::throw_if<BasisConfigurationException>(order_ < 0, SVMP_HERE,
                                               "LagrangeBasis requires non-negative polynomial order");
 
     topology_ = supported_lagrange_topology(element_type_);
@@ -349,7 +349,7 @@ void LagrangeBasis::init_nodes() {
             break;
     }
 
-    FE::raise<BasisElementCompatibilityException>(SVMP_HERE,
+    svmp::raise<BasisElementCompatibilityException>(SVMP_HERE,
         "Unsupported element type in LagrangeBasis::init_nodes");
 }
 
@@ -402,7 +402,7 @@ void LagrangeBasis::build_wedge_nodes() {
         const auto tri_exp =
             simplex_exponent_from_point(node, BasisTopology::Triangle, order_);
         auto it = std::find(simplex_exponents_.begin(), simplex_exponents_.end(), tri_exp);
-        FE::throw_if<BasisConstructionException>(it == simplex_exponents_.end(), SVMP_HERE,
+        svmp::throw_if<BasisConstructionException>(it == simplex_exponents_.end(), SVMP_HERE,
                                                  "LagrangeBasis: wedge node triangle index lookup failed");
         const std::size_t tri_index =
             static_cast<std::size_t>(std::distance(simplex_exponents_.begin(), it));
@@ -573,7 +573,7 @@ void LagrangeBasis::evaluate_all_to(const Vec3& xi,
             break;
     }
 
-    FE::raise<BasisEvaluationException>(SVMP_HERE,
+    svmp::raise<BasisEvaluationException>(SVMP_HERE,
         "Unsupported element in LagrangeBasis evaluation");
 }
 
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
index 081637dc1..a058133dd 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
@@ -323,7 +323,7 @@ std::vector<Point> generate_wedge_nodes(int order) {
 }
 
 std::vector<Point> complete_lagrange_nodes(ElementType canonical_type, int order) {
-    FE::throw_if<BasisNodeOrderingException>(order < 0, SVMP_HERE,
+    svmp::throw_if<BasisNodeOrderingException>(order < 0, SVMP_HERE,
                                              "ReferenceNodeLayout requires non-negative Lagrange order");
     const ElementType type = canonical_lagrange_type(canonical_type);
     switch (type) {
@@ -342,10 +342,10 @@ std::vector<Point> complete_lagrange_nodes(ElementType canonical_type, int order
         case ElementType::Wedge6:
             return generate_wedge_nodes(order);
         case ElementType::Pyramid5:
-            FE::raise<BasisNodeOrderingException>(SVMP_HERE,
+            svmp::raise<BasisNodeOrderingException>(SVMP_HERE,
                 "ReferenceNodeLayout: pyramid node ordering is disabled");
         default:
-            FE::raise<BasisNodeOrderingException>(SVMP_HERE,
+            svmp::raise<BasisNodeOrderingException>(SVMP_HERE,
                 "ReferenceNodeLayout: unsupported Lagrange topology");
     }
 }
@@ -373,10 +373,10 @@ std::vector<Point> element_nodes(ElementType elem_type) {
             return nodes;
         }
         case ElementType::Pyramid13:
-            FE::raise<BasisNodeOrderingException>(SVMP_HERE,
+            svmp::raise<BasisNodeOrderingException>(SVMP_HERE,
                 "ReferenceNodeLayout: pyramid node ordering is disabled");
         default:
-            FE::raise<BasisNodeOrderingException>(SVMP_HERE,
+            svmp::raise<BasisNodeOrderingException>(SVMP_HERE,
                 "ReferenceNodeLayout: unknown element type");
     }
 }
@@ -386,7 +386,7 @@ std::vector<Point> element_nodes(ElementType elem_type) {
 math::Vector<Real, 3> ReferenceNodeLayout::get_node_coords(ElementType elem_type,
                                                            std::size_t local_node) {
     const auto nodes = element_nodes(elem_type);
-    FE::throw_if<BasisNodeOrderingException>(local_node >= nodes.size(), SVMP_HERE,
+    svmp::throw_if<BasisNodeOrderingException>(local_node >= nodes.size(), SVMP_HERE,
                                              "ReferenceNodeLayout::get_node_coords: node index out of range");
     return nodes[local_node];
 }
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index 6c9a66efd..f40550880 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -109,7 +109,7 @@ std::vector<Vec3> quad_serendipity_nodes(int order, std::size_t total_size) {
         nodes.push_back(Vec3{Real(-1), Real(1) - Real(2 * i) * inv_order, Real(0)});
     }
 
-    FE::throw_if<BasisConstructionException>(
+    svmp::throw_if<BasisConstructionException>(
         nodes.size() > total_size, SVMP_HERE,
         "SerendipityBasis: quadrilateral serendipity boundary nodes exceed requested size");
 
@@ -149,7 +149,7 @@ std::vector<Vec3> quad_serendipity_nodes(int order, std::size_t total_size) {
                   return a[0] < b[0];
               });
 
-    FE::throw_if<BasisConstructionException>(
+    svmp::throw_if<BasisConstructionException>(
         interior_count > interior_candidates.size(), SVMP_HERE,
         "SerendipityBasis: insufficient quadrilateral interior nodes for requested serendipity order");
 
@@ -171,7 +171,7 @@ std::vector<Real> quad_serendipity_inverse_vandermonde(
     std::span<const std::array<int, 2>> exponents,
     int order) {
     const int n = static_cast<int>(nodes.size());
-    FE::throw_if<BasisConstructionException>(
+    svmp::throw_if<BasisConstructionException>(
         n == 0 || exponents.size() != nodes.size(), SVMP_HERE,
         "SerendipityBasis: invalid quadrilateral serendipity interpolation setup");
 
@@ -373,13 +373,13 @@ SerendipityBasis::SerendipityBasis(ElementType type, int order)
         if (order_ < 1) {
             order_ = 1;
         }
-        FE::throw_if<BasisConfigurationException>(
+        svmp::throw_if<BasisConfigurationException>(
             type == ElementType::Quad8 && order_ != 2, SVMP_HERE,
             "SerendipityBasis: Quad8 is only valid for quadratic order 2; use Quad4 for higher-order quadrilateral serendipity");
         quad_monomial_exponents_ = quad_serendipity_exponents(order_);
         size_ = quad_monomial_exponents_.size();
         nodes_ = quad_serendipity_nodes(order_, size_);
-        FE::throw_if<BasisConstructionException>(
+        svmp::throw_if<BasisConstructionException>(
             nodes_.size() != size_, SVMP_HERE,
             "SerendipityBasis: quadrilateral serendipity setup produced inconsistent sizes");
         quad_inv_vandermonde_ = quad_serendipity_inverse_vandermonde(nodes_, quad_monomial_exponents_, order_);
@@ -391,7 +391,7 @@ SerendipityBasis::SerendipityBasis(ElementType type, int order)
         } else if (order_ == 2) {
             size_ = 20;
         } else {
-            FE::raise<BasisConfigurationException>(SVMP_HERE,
+            svmp::raise<BasisConfigurationException>(SVMP_HERE,
                 "SerendipityBasis supports up to quadratic on hexahedra");
         }
     } else if (type == ElementType::Wedge15) {
@@ -402,11 +402,11 @@ SerendipityBasis::SerendipityBasis(ElementType type, int order)
         if (order_ == 2) {
             size_ = 15;
         } else {
-            FE::raise<BasisConfigurationException>(SVMP_HERE,
+            svmp::raise<BasisConfigurationException>(SVMP_HERE,
                 "SerendipityBasis supports up to quadratic on wedge15");
         }
     } else {
-        FE::raise<BasisElementCompatibilityException>(SVMP_HERE,
+        svmp::raise<BasisElementCompatibilityException>(SVMP_HERE,
             "SerendipityBasis supports Quad4/Quad8, Hex8/Hex20, and Wedge15 elements");
     }
 
@@ -445,7 +445,7 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,
     const Real z = xi[2];
 
     if (dimension_ == 2) {
-        FE::throw_if<BasisEvaluationException>(
+        svmp::throw_if<BasisEvaluationException>(
             quad_monomial_exponents_.size() != size_ ||
                 quad_inv_vandermonde_.size() != size_ * size_,
             SVMP_HERE,
@@ -477,7 +477,7 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,
         // The Hex20 coefficient table is authored in an internal node order, so
         // results are remapped into the public node layout through mesh_to_basis.
         const auto mesh_to_basis = ReferenceNodeLayout::mesh_to_basis_ordering(element_type_);
-        FE::throw_if<BasisEvaluationException>(mesh_to_basis.size() != size_, SVMP_HERE,
+        svmp::throw_if<BasisEvaluationException>(mesh_to_basis.size() != size_, SVMP_HERE,
                                                "Hex20 mesh-to-basis ordering is not registered");
         eval_monomial_basis(
             x, y, z, size_,
@@ -500,7 +500,7 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,
         return;
     }
 
-    FE::raise<BasisEvaluationException>(SVMP_HERE,
+    svmp::raise<BasisEvaluationException>(SVMP_HERE,
         "SerendipityBasis::evaluate_all_to: unsupported serendipity configuration");
 }
 
diff --git a/Code/Source/solver/FE/Common/FEException.h b/Code/Source/solver/FE/Common/FEException.h
index 84192e293..5fdb4b365 100644
--- a/Code/Source/solver/FE/Common/FEException.h
+++ b/Code/Source/solver/FE/Common/FEException.h
@@ -24,30 +24,28 @@ namespace FE {
 
 /// \defgroup FE_CommonExceptions Exceptions
 /// \ingroup FE_Common
-/// \brief FE exception hierarchy and throw/check helper functions.
+/// \brief FE exception hierarchy.
 ///
 /// \details All FE-specific exceptions derive from FEException, which itself
 /// derives from the shared solver ExceptionBase. Specialized subclasses carry
 /// structured context (element type, DOF index, backend name and error code,
 /// iteration counts, Jacobian determinants) so call sites can report
-/// actionable diagnostics. The free helper templates raise(), throw_if(),
-/// check_arg(), check_not_null(), and check_index() wrap common validation
-/// patterns with source-location capture.
+/// actionable diagnostics.
 ///
-/// Canonical FE code should throw through this helper layer instead of calling
-/// the core ::svmp helpers or constructing exceptions directly:
+/// Throw FE exceptions through the canonical core helpers in Core/Exception.h:
 ///
 /// \code
-/// FE::raise<ExceptionT>(SVMP_HERE, message);
-/// FE::throw_if<ExceptionT>(failure_condition, SVMP_HERE, message);
-/// FE::check_arg<ExceptionT>(valid_condition, SVMP_HERE, message);
-/// FE::check_not_null<ExceptionT>(ptr, SVMP_HERE, message);
-/// FE::check_index<ExceptionT>(index, size, SVMP_HERE);
-/// FE::not_implemented(feature, SVMP_HERE);
+/// svmp::raise<ExceptionT>(SVMP_HERE, message);
+/// svmp::throw_if<ExceptionT>(failure_condition, SVMP_HERE, message);
+/// svmp::check_arg<ExceptionT>(valid_condition, SVMP_HERE, message);
+/// svmp::check_not_null<ExceptionT>(ptr, SVMP_HERE, message);
+/// svmp::check_index<ExceptionT>(index, size, SVMP_HERE);
+/// svmp::not_implemented<ExceptionT>(feature, SVMP_HERE);
 /// \endcode
 ///
 /// throw_if() is failure-condition based. check_arg() is
-/// success-condition based.
+/// success-condition based. FE owns exception types; helper spelling is owned
+/// by the core layer.
 /// @{
 
 /**
@@ -441,105 +439,6 @@ class SingularMappingException : public FEException {
     double jacobian_det_ = 0.0;
 };
 
-/**
- * @brief Throw an FE exception with source-location capture
- * @tparam ExceptionT Exception type to throw.
- * @tparam Args Constructor argument types forwarded to the exception.
- * @param location Source location to record in the exception.
- * @param args Arguments forwarded to the exception constructor.
- */
-template <class ExceptionT, class... Args>
-[[noreturn]] inline void raise(SourceLocation location, Args&&... args)
-{
-    ::svmp::raise<ExceptionT>(location, std::forward<Args>(args)...);
-}
-
-/**
- * @brief Throw an FE exception when a condition holds
- * @tparam ExceptionT Exception type to throw; defaults to FEException.
- * @tparam Args Constructor argument types forwarded to the exception.
- * @param condition Condition that triggers the throw when true.
- * @param location Source location to record in the exception.
- * @param args Arguments forwarded to the exception constructor.
- */
-template <class ExceptionT = FEException, class... Args>
-inline void throw_if(bool condition, SourceLocation location, Args&&... args)
-{
-    if (condition) {
-        ::svmp::FE::raise<ExceptionT>(location, std::forward<Args>(args)...);
-    }
-}
-
-/**
- * @brief Validate an argument condition, throwing when it fails
- * @tparam ExceptionT Exception type to throw; defaults to InvalidArgumentException.
- * @tparam Args Constructor argument types forwarded to the exception.
- * @param condition Condition that must hold for the argument to be valid.
- * @param location Source location to record in the exception.
- * @param args Arguments forwarded to the exception constructor.
- */
-template <class ExceptionT = InvalidArgumentException, class... Args>
-inline void check_arg(bool condition, SourceLocation location, Args&&... args)
-{
-    if (!condition) {
-        ::svmp::FE::raise<ExceptionT>(location, std::forward<Args>(args)...);
-    }
-}
-
-/**
- * @brief Validate that a pointer is non-null, throwing when it is null
- * @tparam ExceptionT Exception type to throw; defaults to InvalidArgumentException.
- * @tparam PointerT Pointer-like type being checked.
- * @tparam Args Constructor argument types forwarded to the exception.
- * @param ptr Pointer to validate.
- * @param location Source location to record in the exception.
- * @param args Arguments forwarded to the exception constructor.
- */
-template <class ExceptionT = InvalidArgumentException, class PointerT,
-          class... Args>
-inline void check_not_null(PointerT ptr, SourceLocation location,
-                           Args&&... args)
-{
-    if (ptr == nullptr) {
-        ::svmp::FE::raise<ExceptionT>(location, std::forward<Args>(args)...);
-    }
-}
-
-/**
- * @brief Validate that an index lies in [0, size), throwing when out of bounds
- * @tparam ExceptionT Exception type to throw; defaults to InvalidArgumentException.
- * @tparam IndexT Integral index type.
- * @tparam SizeT Integral size type.
- * @param index Index to validate.
- * @param size Exclusive upper bound for the index.
- * @param location Source location to record in the exception.
- */
-template <class ExceptionT = InvalidArgumentException, class IndexT,
-          class SizeT>
-inline void check_index(IndexT index, SizeT size, SourceLocation location)
-{
-    const long long fe_check_index_value = static_cast<long long>(index);
-    const long long fe_check_size_value = static_cast<long long>(size);
-
-    ::svmp::FE::check_arg<ExceptionT>(
-        fe_check_index_value >= 0 &&
-            fe_check_index_value < fe_check_size_value,
-        location,
-        "Index " + std::to_string(fe_check_index_value) +
-            " out of bounds [0, " + std::to_string(fe_check_size_value) + ")");
-}
-
-/**
- * @brief Throw NotImplementedException for a missing feature
- * @param feature Description of the unimplemented feature.
- * @param location Source location to record in the exception.
- */
-[[noreturn]] inline void not_implemented(const std::string& feature,
-                                         SourceLocation location)
-{
-    ::svmp::FE::raise<NotImplementedException>(location, feature);
-}
-
 /// @}
 
 } // namespace FE
diff --git a/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp b/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
index 0260ad2b1..5675bde86 100644
--- a/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
+++ b/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
@@ -83,13 +83,13 @@ void DenseLUSolver::solve_in_place(std::span<Real> rhs) const {
 
 void DenseLUSolver::solve_in_place(std::span<Real> rhs,
                                    std::size_t rhs_count) const {
-    ::svmp::FE::check_arg<FEException>(
+    ::svmp::check_arg<FEException>(
         rhs_count > 0, SVMP_HERE,
         label + ": dense solve requires at least one right-hand side");
-    ::svmp::FE::check_arg<FEException>(
+    ::svmp::check_arg<FEException>(
         rhs.size() == n * rhs_count, SVMP_HERE,
         label + ": dense multi-RHS solve size mismatch");
-    ::svmp::FE::check_arg<FEException>(
+    ::svmp::check_arg<FEException>(
         lu.rows() == static_cast<Eigen::Index>(n), SVMP_HERE,
         label + ": dense solver is not factorized");
     if (n == 0) {
@@ -115,10 +115,10 @@ DenseMatrixDiagnostics dense_matrix_diagnostics(
     std::size_t rows,
     std::size_t cols,
     std::string_view label) {
-    ::svmp::FE::check_arg<FEException>(
+    ::svmp::check_arg<FEException>(
         matrix.size() == rows * cols, SVMP_HERE,
         std::string(label) + ": diagnostic size mismatch");
-    ::svmp::FE::check_arg<FEException>(
+    ::svmp::check_arg<FEException>(
         rows > 0 && cols > 0, SVMP_HERE,
         std::string(label) + ": diagnostics require a nonempty matrix");
 
@@ -155,7 +155,7 @@ DenseMatrixDiagnostics dense_matrix_diagnostics(
 DenseLUSolver factor_dense_matrix(std::vector<Real> matrix,
                                   std::size_t n,
                                   std::string_view label) {
-    ::svmp::FE::check_arg<FEException>(
+    ::svmp::check_arg<FEException>(
         matrix.size() == n * n, SVMP_HERE,
         std::string(label) + ": dense factorization size mismatch");
 
@@ -175,7 +175,7 @@ DenseLUSolver factor_dense_matrix(std::vector<Real> matrix,
     const auto diagonal = solver.lu.matrixLU().diagonal();
     for (Eigen::Index col = 0; col < diagonal.size(); ++col) {
         const Real pivot_magnitude = std::abs(diagonal[col]);
-        ::svmp::FE::check_arg<FEException>(
+        ::svmp::check_arg<FEException>(
             pivot_magnitude > solver.pivot_tolerance, SVMP_HERE,
             solver.label + ": rank-deficient matrix (rank " +
                 std::to_string(col) + " of " + std::to_string(n) +
@@ -199,7 +199,7 @@ DenseInverseResult invert_dense_matrix_with_diagnostics(
     std::vector<Real> matrix,
     std::size_t n,
     std::string_view label) {
-    ::svmp::FE::check_arg<FEException>(
+    ::svmp::check_arg<FEException>(
         matrix.size() == n * n, SVMP_HERE,
         std::string(label) + ": dense inverse size mismatch");
     std::vector<Real> matrix_for_lu = matrix;
@@ -220,7 +220,7 @@ DenseInverseResult invert_dense_matrix_with_diagnostics(
                                                       static_cast<Eigen::Index>(n));
         const auto& singular_values = svd.singularValues();
         for (Eigen::Index i = 0; i < singular_values.size(); ++i) {
-            ::svmp::FE::check_arg<FEException>(
+            ::svmp::check_arg<FEException>(
                 singular_values[i] > result.diagnostics.tolerance, SVMP_HERE,
                 std::string(label) + ": high-condition SVD fallback encountered a dropped singular value");
             sigma_inverse(i, i) = Real(1) / singular_values[i];
@@ -241,7 +241,7 @@ void validate_dense_inverse_diagnostics(
     std::size_t expected_rank,
     std::string_view label,
     Real max_condition) {
-    ::svmp::FE::check_arg<FEException>(
+    ::svmp::check_arg<FEException>(
         result.diagnostics.rank == expected_rank, SVMP_HERE,
         std::string(label) + ": rank-deficient matrix (rank " +
             std::to_string(result.diagnostics.rank) + " of " +
@@ -251,7 +251,7 @@ void validate_dense_inverse_diagnostics(
         return;
     }
 
-    ::svmp::FE::check_arg<FEException>(
+    ::svmp::check_arg<FEException>(
         result.diagnostics.condition_estimate <= max_condition, SVMP_HERE,
         std::string(label) + ": condition estimate " +
             std::to_string(result.diagnostics.condition_estimate) +
@@ -271,7 +271,7 @@ std::vector<Real> invert_dense_matrix(std::vector<Real> matrix,
 std::size_t dense_matrix_rank(std::vector<Real> matrix,
                               std::size_t rows,
                               std::size_t cols) {
-    ::svmp::FE::check_arg<FEException>(
+    ::svmp::check_arg<FEException>(
         matrix.size() == rows * cols, SVMP_HERE,
         "dense_matrix_rank: size mismatch");
 
@@ -299,10 +299,10 @@ DensePseudoInverseResult rank_revealing_pseudo_inverse(
     std::size_t rows,
     std::size_t cols,
     std::string_view label) {
-    ::svmp::FE::check_arg<FEException>(
+    ::svmp::check_arg<FEException>(
         matrix.size() == rows * cols, SVMP_HERE,
         std::string(label) + ": pseudo-inverse size mismatch");
-    ::svmp::FE::check_arg<FEException>(
+    ::svmp::check_arg<FEException>(
         rows > 0 && cols > 0, SVMP_HERE,
         std::string(label) + ": pseudo-inverse requires a nonempty matrix");
 
diff --git a/Code/Source/solver/FE/Math/DenseTransformKernels.h b/Code/Source/solver/FE/Math/DenseTransformKernels.h
index f6639dcd3..1905f2776 100644
--- a/Code/Source/solver/FE/Math/DenseTransformKernels.h
+++ b/Code/Source/solver/FE/Math/DenseTransformKernels.h
@@ -36,16 +36,16 @@ inline void dense_transform_batched_row_major(
         return;
     }
 
-    FE::throw_if<FEException>(matrix.size() < rows * cols, SVMP_HERE,
+    svmp::throw_if<FEException>(matrix.size() < rows * cols, SVMP_HERE,
                               "dense_transform_batched_row_major: matrix span is too small");
-    FE::throw_if<FEException>(input_row_stride < rhs_count, SVMP_HERE,
+    svmp::throw_if<FEException>(input_row_stride < rhs_count, SVMP_HERE,
                               "dense_transform_batched_row_major: input stride is smaller than RHS count");
-    FE::throw_if<FEException>(output_row_stride < rhs_count, SVMP_HERE,
+    svmp::throw_if<FEException>(output_row_stride < rhs_count, SVMP_HERE,
                               "dense_transform_batched_row_major: output stride is smaller than RHS count");
-    FE::throw_if<FEException>(
+    svmp::throw_if<FEException>(
         input.size() < (cols - 1u) * input_row_stride + rhs_count, SVMP_HERE,
         "dense_transform_batched_row_major: input span is too small");
-    FE::throw_if<FEException>(
+    svmp::throw_if<FEException>(
         output.size() < (rows - 1u) * output_row_stride + rhs_count, SVMP_HERE,
         "dense_transform_batched_row_major: output span is too small");
 
diff --git a/Code/Source/solver/cep_ion.cpp b/Code/Source/solver/cep_ion.cpp
index eb64d7ffd..8c91a54fd 100644
--- a/Code/Source/solver/cep_ion.cpp
+++ b/Code/Source/solver/cep_ion.cpp
@@ -330,7 +330,7 @@ void cep_integ_l(CepMod &cep_mod, cepModelType &cep, Vector<double> &X,
   dmsg << "cep.odes.tIntTyp: " << cep.odes.tIntType;
   #endif
 
-  svmp::FE::check_not_null<svmp::FE::NotInitializedException>(
+  svmp::check_not_null<svmp::FE::NotInitializedException>(
       cep.ionic_model, SVMP_HERE, "ionic model was not constructed.");
 
   const double eps = std::numeric_limits<double>::epsilon();
diff --git a/Code/Source/solver/fs.cpp b/Code/Source/solver/fs.cpp
index 8ea252a04..81cf91466 100644
--- a/Code/Source/solver/fs.cpp
+++ b/Code/Source/solver/fs.cpp
@@ -377,7 +377,7 @@ void set_thood_fs(fsType& fs, consts::ElementType eType)
     break;
 
     default:
-      fe::raise<fe::InvalidElementException>(
+      svmp::raise<fe::InvalidElementException>(
           SVMP_HERE, "Cannot choose Taylor-Hood basis", element_name(eType));
   }
 }
diff --git a/Code/Source/solver/ionic_model.cpp b/Code/Source/solver/ionic_model.cpp
index e1f3251ad..df7d9c25e 100644
--- a/Code/Source/solver/ionic_model.cpp
+++ b/Code/Source/solver/ionic_model.cpp
@@ -40,7 +40,7 @@ void IonicModel::distribute_parameters(const CmMod &cm_mod, const cmType &cm) {
 
 void IonicModel::init(Vector<double> &X, Vector<double> &Xg) const {
   if (initial_X.size() != X.size()) {
-    svmp::FE::raise<svmp::FE::InvalidArgumentException>(
+    svmp::raise<svmp::FE::InvalidArgumentException>(
         SVMP_HERE, "Initial conditions size for X does not match vector size.");
   }
 
@@ -48,7 +48,7 @@ void IonicModel::init(Vector<double> &X, Vector<double> &Xg) const {
     X[i] = initial_X[i].second;
 
   if (initial_Xg.size() != Xg.size()) {
-    svmp::FE::raise<svmp::FE::InvalidArgumentException>(
+    svmp::raise<svmp::FE::InvalidArgumentException>(
         SVMP_HERE,
         "Initial conditions size for Xg does not match vector size.");
   }
@@ -76,7 +76,7 @@ void IonicModel::integ(const odeType &ode_solver_params, const int zone_id,
     break;
 
   default:
-    svmp::FE::raise<svmp::FE::InvalidArgumentException>(
+    svmp::raise<svmp::FE::InvalidArgumentException>(
         SVMP_HERE,
         "Unknown time integration type: " +
             std::to_string(static_cast<int>(ode_solver_params.tIntType)));
@@ -263,7 +263,7 @@ IonicModelFactory::create_model(const std::string &name) {
 
   auto iter = factory_instance.children.find(name);
   if (iter == factory_instance.children.end()) {
-    svmp::FE::raise<svmp::FE::InvalidArgumentException>(
+    svmp::raise<svmp::FE::InvalidArgumentException>(
         SVMP_HERE, "No model with name '" + name +
                        "' was registered in the ionic model factory.");
   }
diff --git a/Code/Source/solver/mat_fun.h b/Code/Source/solver/mat_fun.h
index d0c63d272..598aa1827 100644
--- a/Code/Source/solver/mat_fun.h
+++ b/Code/Source/solver/mat_fun.h
@@ -52,7 +52,7 @@ namespace mat_fun {
         if ((mat.rows() != dest.nrows()) || (mat.cols() != dest.ncols())) { 
           auto mat_dims = (std::stringstream() << "(" << mat.rows()  << "x" << mat.cols() << ")").str();
           auto dest_dims = (std::stringstream() << "(" << dest.nrows()  << "x" << dest.ncols() << ")").str();
-          svmp::FE::raise<svmp::FE::InvalidArgumentException>(
+          svmp::raise<svmp::FE::InvalidArgumentException>(
               SVMP_HERE, "The 'mat" + mat_dims + "' and 'dest" + dest_dims +
               "' arrays have incompatible sizes.");
         }
diff --git a/Code/Source/solver/nn.cpp b/Code/Source/solver/nn.cpp
index 2e08768e4..4fe2399cb 100644
--- a/Code/Source/solver/nn.cpp
+++ b/Code/Source/solver/nn.cpp
@@ -136,7 +136,7 @@ const febasis::BasisFunction& basis_for_solver_element(consts::ElementType eType
 
   const auto fe_type = to_fe_element_type(eType);
   if (!fe_type) {
-    fe::raise<febasis::BasisElementCompatibilityException>(SVMP_HERE,
+    svmp::raise<febasis::BasisElementCompatibilityException>(SVMP_HERE,
         "No FE Basis selection for solver element " + solver_element_name(eType));
   }
 
@@ -191,7 +191,7 @@ std::span<const std::size_t> solver_to_basis_node_map(consts::ElementType eType)
 std::size_t basis_index_for_solver_node(consts::ElementType eType, const int solver_node)
 {
   if (solver_node < 0) {
-    fe::raise<febasis::BasisNodeOrderingException>(SVMP_HERE,
+    svmp::raise<febasis::BasisNodeOrderingException>(SVMP_HERE,
         "Solver node " + std::to_string(solver_node) +
             " is outside node map for " + solver_element_name(eType));
   }
@@ -204,7 +204,7 @@ std::size_t basis_index_for_solver_node(consts::ElementType eType, const int sol
   if (node < map.size()) {
     return map[node];
   }
-  fe::raise<febasis::BasisNodeOrderingException>(SVMP_HERE,
+  svmp::raise<febasis::BasisNodeOrderingException>(SVMP_HERE,
       "Solver node " + std::to_string(solver_node) +
           " is outside node map for " + solver_element_name(eType));
 }
@@ -218,7 +218,7 @@ fe::math::Vector<fe::Real, 3> make_basis_point(const febasis::BasisFunction& bas
                                                const Array<double>& xi)
 {
   if (xi.nrows() < basis.dimension()) {
-    fe::raise<febasis::BasisConfigurationException>(SVMP_HERE,
+    svmp::raise<febasis::BasisConfigurationException>(SVMP_HERE,
         "xi has " + std::to_string(xi.nrows()) +
             " rows but FE Basis element requires " + std::to_string(basis.dimension()) +
             " reference coordinates");
@@ -246,12 +246,12 @@ void copy_basis_values_to_solver_arrays(consts::ElementType eType,
                                         Array3<double>& Nx)
 {
   if (values.size() != static_cast<std::size_t>(eNoN)) {
-    fe::raise<febasis::BasisEvaluationException>(SVMP_HERE,
+    svmp::raise<febasis::BasisEvaluationException>(SVMP_HERE,
         "FE Basis value count " + std::to_string(values.size()) +
             " does not match solver eNoN " + std::to_string(eNoN));
   }
   if (gradients.size() != static_cast<std::size_t>(eNoN)) {
-    fe::raise<febasis::BasisEvaluationException>(SVMP_HERE,
+    svmp::raise<febasis::BasisEvaluationException>(SVMP_HERE,
         "FE Basis gradient count " + std::to_string(gradients.size()) +
             " does not match solver eNoN " + std::to_string(eNoN));
   }
@@ -259,7 +259,7 @@ void copy_basis_values_to_solver_arrays(consts::ElementType eType,
   for (int a = 0; a < eNoN; ++a) {
     const auto basis_index = basis_index_for_solver_node(eType, a);
     if (basis_index >= values.size() || basis_index >= gradients.size()) {
-      fe::raise<febasis::BasisNodeOrderingException>(SVMP_HERE,
+      svmp::raise<febasis::BasisNodeOrderingException>(SVMP_HERE,
           "Solver node " + std::to_string(a) + " maps to FE Basis node " +
               std::to_string(basis_index) + " outside basis output for " +
               solver_element_name(eType));
@@ -290,7 +290,7 @@ void evaluate_basis_values_and_gradients(const int insd,
 {
   const auto& basis = basis_for_solver_element(eType);
   if (insd < basis.dimension()) {
-    fe::raise<febasis::BasisConfigurationException>(SVMP_HERE,
+    svmp::raise<febasis::BasisConfigurationException>(SVMP_HERE,
         "solver insd " + std::to_string(insd) +
             " is smaller than FE Basis reference dimension " + std::to_string(basis.dimension()));
   }
@@ -332,7 +332,7 @@ int required_nxx_components_for_dimension(const int dimension)
     case 3:
       return 6;
     default:
-      fe::raise<febasis::BasisConfigurationException>(SVMP_HERE,
+      svmp::raise<febasis::BasisConfigurationException>(SVMP_HERE,
           "Unsupported FE Basis reference dimension " + std::to_string(dimension));
   }
 }
@@ -349,14 +349,14 @@ void copy_basis_hessians_to_solver_nxx(consts::ElementType eType,
                                        Array3<double>& Nxx)
 {
   if (hessians.size() != static_cast<std::size_t>(eNoN)) {
-    fe::raise<febasis::BasisEvaluationException>(SVMP_HERE,
+    svmp::raise<febasis::BasisEvaluationException>(SVMP_HERE,
         "FE Basis Hessian count " + std::to_string(hessians.size()) +
             " does not match solver eNoN " + std::to_string(eNoN));
   }
 
   const int required_components = required_nxx_components_for_dimension(dimension);
   if (Nxx.nrows() < required_components) {
-    fe::raise<febasis::BasisConfigurationException>(SVMP_HERE,
+    svmp::raise<febasis::BasisConfigurationException>(SVMP_HERE,
         "solver Nxx has " + std::to_string(Nxx.nrows()) +
             " rows but FE Basis Hessian packing requires " + std::to_string(required_components));
   }
@@ -368,7 +368,7 @@ void copy_basis_hessians_to_solver_nxx(consts::ElementType eType,
 
     const auto basis_index = basis_index_for_solver_node(eType, a);
     if (basis_index >= hessians.size()) {
-      fe::raise<febasis::BasisNodeOrderingException>(SVMP_HERE,
+      svmp::raise<febasis::BasisNodeOrderingException>(SVMP_HERE,
           "Solver node " + std::to_string(a) + " maps to FE Basis Hessian node " +
               std::to_string(basis_index) + " outside basis output for " +
               solver_element_name(eType));
@@ -402,14 +402,14 @@ void evaluate_basis_hessians(const int insd,
 {
   const auto& basis = basis_for_solver_element(eType);
   if (insd < basis.dimension()) {
-    fe::raise<febasis::BasisConfigurationException>(SVMP_HERE,
+    svmp::raise<febasis::BasisConfigurationException>(SVMP_HERE,
         "solver insd " + std::to_string(insd) +
             " is smaller than FE Basis reference dimension " + std::to_string(basis.dimension()));
   }
 
   const int required_components = required_nxx_components_for_dimension(basis.dimension());
   if (ind2 < required_components) {
-    fe::raise<febasis::BasisConfigurationException>(SVMP_HERE,
+    svmp::raise<febasis::BasisConfigurationException>(SVMP_HERE,
         "solver ind2 " + std::to_string(ind2) +
             " is smaller than packed Hessian component count " + std::to_string(required_components));
   }
@@ -441,7 +441,7 @@ void get_gip(const int insd, consts::ElementType eType, const int nG, Vector<dou
   try {
     get_element_gauss_int_data[eType](insd, nG, w, xi);
   } catch (const std::bad_function_call& exception) {
-    fe::raise<fe::InvalidElementException>(SVMP_HERE,
+    svmp::raise<fe::InvalidElementException>(SVMP_HERE,
         "No support in 'get_element_gauss_int_data'",
         solver_element_name(eType));
   }
@@ -456,7 +456,7 @@ void get_gip(mshType& mesh)
   try {
     set_element_gauss_int_data[mesh.eType](mesh);
   } catch (const std::bad_function_call& exception) {
-    fe::raise<fe::InvalidElementException>(SVMP_HERE,
+    svmp::raise<fe::InvalidElementException>(SVMP_HERE,
         "No support in 'set_element_gauss_int_data'",
         solver_element_name(mesh.eType));
   }
@@ -467,7 +467,7 @@ void get_gip(Simulation* simulation, faceType& face)
   try {
     set_face_gauss_int_data[face.eType](face);
   } catch (const std::bad_function_call& exception) {
-    fe::raise<fe::InvalidElementException>(SVMP_HERE,
+    svmp::raise<fe::InvalidElementException>(SVMP_HERE,
         "No support in 'set_face_gauss_int_data'",
         solver_element_name(face.eType));
   }
@@ -479,7 +479,7 @@ void get_gnn(const int insd, consts::ElementType eType, const int eNoN, const in
     Array<double>& N, Array3<double>& Nx)
 {
   if (!to_fe_element_type(eType).has_value()) {
-    fe::raise<febasis::BasisElementCompatibilityException>(SVMP_HERE,
+    svmp::raise<febasis::BasisElementCompatibilityException>(SVMP_HERE,
         "[get_gnn] FE Basis does not support solver element " + solver_element_name(eType));
   }
 
@@ -513,7 +513,7 @@ void get_gnn(Simulation* simulation, int gaus_pt, faceType& face)
 {
   using consts::ElementType;
 
-  fe::throw_if<fe::NotImplementedException>(face.eType == ElementType::NRB, SVMP_HERE,
+  svmp::throw_if<fe::NotImplementedException>(face.eType == ElementType::NRB, SVMP_HERE,
       "[get_gnn(face)] NRB face shape functions are unsupported by FE Basis");
 
   if (face.eType == ElementType::PNT) {
@@ -527,7 +527,7 @@ void get_gnn(Simulation* simulation, int gaus_pt, faceType& face)
     return;
   }
 
-  fe::raise<febasis::BasisElementCompatibilityException>(SVMP_HERE,
+  svmp::raise<febasis::BasisElementCompatibilityException>(SVMP_HERE,
       "[get_gnn(face)] FE Basis does not support face element " + solver_element_name(face.eType));
 }
 
@@ -546,7 +546,7 @@ void get_gn_nxx(const int insd, const int ind2, consts::ElementType eType, const
   }
 
   if (!to_fe_element_type(eType).has_value()) {
-    fe::raise<febasis::BasisElementCompatibilityException>(SVMP_HERE,
+    svmp::raise<febasis::BasisElementCompatibilityException>(SVMP_HERE,
         "[get_gn_nxx] FE Basis Hessian evaluation does not support solver element " +
             solver_element_name(eType));
   }
@@ -735,7 +735,7 @@ void get_nnx(const int nsd, const consts::ElementType eType, const int eNoN, con
 
   l1 = (l1 && l2 && l3 && l4);
 
-  fe::throw_if<fe::InvalidArgumentException>(!l1, SVMP_HERE,
+  svmp::throw_if<fe::InvalidArgumentException>(!l1, SVMP_HERE,
       "Error in computing shape functions");
 }
 
@@ -984,7 +984,7 @@ void gnnb(const ComMod& com_mod, const faceType& lFa, const int e, const int g,
     }
 
     if (!found_node) {
-      fe::raise<fe::InvalidArgumentException>(SVMP_HERE,
+      svmp::raise<fe::InvalidArgumentException>(SVMP_HERE,
           "[svMultiPhysics::gnnb] ERROR: The '" + lFa.name + "' face node " +
               std::to_string(Ac) + " could not be matched to a node in the '" +
               msh.name + "' volume mesh.");
@@ -1036,7 +1036,7 @@ void gnnb(const ComMod& com_mod, const faceType& lFa, const int e, const int g,
           }
           break;
         default:
-          fe::raise<fe::InvalidArgumentException>(SVMP_HERE,
+          svmp::raise<fe::InvalidArgumentException>(SVMP_HERE,
               "gnnb: invalid MechanicalConfigurationType provided");
       }
     }
@@ -1225,7 +1225,7 @@ void gn_nxx(const int l, const int eNoN, const int nsd, const int insd, Array<do
 
     dgesv_(&l, &eNoN, K.data(), &l, IPIV.data(), B.data(), &l, &INFO);
 
-    fe::throw_if<fe::BackendException>(INFO != 0, SVMP_HERE,
+    svmp::throw_if<fe::BackendException>(INFO != 0, SVMP_HERE,
         "[gn_nxx] Error in Lapack", "LAPACK dgesv", INFO);
 
     Nxx = B;
@@ -1295,7 +1295,7 @@ void gn_nxx(const int l, const int eNoN, const int nsd, const int insd, Array<do
 
     dgesv_(&l, &eNoN, K.data(), &l, IPIV.data(), B.data(), &l, &INFO);
 
-    fe::throw_if<fe::BackendException>(INFO != 0, SVMP_HERE,
+    svmp::throw_if<fe::BackendException>(INFO != 0, SVMP_HERE,
         "[gn_nxx] Error in Lapack", "LAPACK dgesv", INFO);
 
     Nxx = B;
@@ -1343,7 +1343,7 @@ void select_ele(const ComMod& com_mod, mshType& mesh)
       set_1d_element_props[mesh.eNoN](insd, mesh);
     }
   } catch (const std::bad_function_call& exception) {
-      fe::raise<fe::InvalidElementException>(SVMP_HERE,
+      svmp::raise<fe::InvalidElementException>(SVMP_HERE,
           "[select_ele] No support for " + std::to_string(mesh.eNoN) +
               " noded " + std::to_string(insd) + "D elements.",
           solver_element_name(mesh.eType));
@@ -1402,7 +1402,7 @@ void select_eleb(Simulation* simulation, mshType& mesh, faceType& face)
   try {
     set_face_element_props[face.eNoN](insd, face);
   } catch (const std::bad_function_call& exception) {
-    fe::raise<fe::InvalidElementException>(SVMP_HERE,
+    svmp::raise<fe::InvalidElementException>(SVMP_HERE,
         "No support for " + std::to_string(face.eNoN) + " noded " +
             std::to_string(insd) + "D elements in 'set_face_element_props'.",
         solver_element_name(face.eType));
diff --git a/Code/Source/solver/post.cpp b/Code/Source/solver/post.cpp
index 50872304e..84b2c23c8 100644
--- a/Code/Source/solver/post.cpp
+++ b/Code/Source/solver/post.cpp
@@ -804,13 +804,13 @@ void fib_stretch_rate(const ComMod& com_mod, const int iEq, const mshType& lM, c
   int nNo = lM.nNo;
 
   if (dt <= 0.0) {
-    svmp::FE::raise<svmp::FE::InvalidArgumentException>(
+    svmp::raise<svmp::FE::InvalidArgumentException>(
         SVMP_HERE,
         "[fib_stretch_rate] Expected com_mod.dt > 0, but got " + std::to_string(dt) + ".");
   }
 
   if (res.size() != nNo) {
-    svmp::FE::raise<svmp::FE::InvalidArgumentException>(
+    svmp::raise<svmp::FE::InvalidArgumentException>(
         SVMP_HERE,
         "[fib_stretch_rate] Expected res size " + std::to_string(nNo) + ", but got " + std::to_string(res.size()) + ".");
   }
diff --git a/Code/Source/solver/read_files.cpp b/Code/Source/solver/read_files.cpp
index f5812bf3d..164de7147 100644
--- a/Code/Source/solver/read_files.cpp
+++ b/Code/Source/solver/read_files.cpp
@@ -2384,7 +2384,7 @@ void read_outputs(Simulation* simulation, EquationParameters* eq_params, eqType&
         if (dmn.phys != consts::EquationType::phys_CEP)
           continue;
 
-        svmp::FE::check_not_null<svmp::FE::NotInitializedException>(
+        svmp::check_not_null<svmp::FE::NotInitializedException>(
             dmn.cep.ionic_model, SVMP_HERE, "ionic model was not constructed.");
 
         const auto registered_outputs =
diff --git a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
index 1b9e63329..51df2d593 100644
--- a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
@@ -138,7 +138,7 @@ void expect_source_location(const FEException& e)
 }
 
 template <class Thrower>
-void expect_fe_helper_preserves_source_location(Thrower&& thrower)
+void expect_core_helper_preserves_source_location(Thrower&& thrower)
 {
     try {
         thrower();
@@ -206,45 +206,46 @@ TEST(BasisErrorPaths, BasisFactoryInvalidRequestsThrowBasisExceptions) {
 
 TEST(BasisErrorPaths, BasisExceptionsUseCommonStatusCodes) {
     try {
-        svmp::FE::raise<BasisConfigurationException>(SVMP_HERE, "invalid config");
+        svmp::raise<BasisConfigurationException>(SVMP_HERE, "invalid config");
     } catch (const FEException& e) {
         EXPECT_EQ(e.status(), svmp::StatusCode::InvalidArgument);
     }
 
     try {
-        svmp::FE::raise<BasisConstructionException>(SVMP_HERE, "construction failure");
+        svmp::raise<BasisConstructionException>(SVMP_HERE, "construction failure");
     } catch (const FEException& e) {
         EXPECT_EQ(e.status(), svmp::StatusCode::InternalError);
     }
 }
 
-TEST(BasisErrorPaths, FEHelpersPreserveSourceLocation) {
-    expect_fe_helper_preserves_source_location([] {
-        svmp::FE::raise<BasisEvaluationException>(SVMP_HERE, "raise location");
+TEST(BasisErrorPaths, CoreHelpersPreserveSourceLocation) {
+    expect_core_helper_preserves_source_location([] {
+        svmp::raise<BasisEvaluationException>(SVMP_HERE, "raise location");
     });
 
-    expect_fe_helper_preserves_source_location([] {
-        svmp::FE::throw_if<BasisEvaluationException>(
+    expect_core_helper_preserves_source_location([] {
+        svmp::throw_if<BasisEvaluationException>(
             true, SVMP_HERE, "throw_if location");
     });
 
-    expect_fe_helper_preserves_source_location([] {
-        svmp::FE::check_arg<BasisEvaluationException>(
+    expect_core_helper_preserves_source_location([] {
+        svmp::check_arg<BasisEvaluationException>(
             false, SVMP_HERE, "check_arg location");
     });
 
-    expect_fe_helper_preserves_source_location([] {
+    expect_core_helper_preserves_source_location([] {
         const int* ptr = nullptr;
-        svmp::FE::check_not_null<BasisEvaluationException>(
+        svmp::check_not_null<BasisEvaluationException>(
             ptr, SVMP_HERE, "check_not_null location");
     });
 
-    expect_fe_helper_preserves_source_location([] {
-        svmp::FE::check_index<BasisEvaluationException>(1, 1, SVMP_HERE);
+    expect_core_helper_preserves_source_location([] {
+        svmp::check_index<BasisEvaluationException>(1, 1, SVMP_HERE);
     });
 
-    expect_fe_helper_preserves_source_location([] {
-        svmp::FE::not_implemented("test feature", SVMP_HERE);
+    expect_core_helper_preserves_source_location([] {
+        svmp::not_implemented<NotImplementedException>(
+            "test feature", SVMP_HERE);
     });
 }
 

From 78ed2b980bd81034dabc03805146acb823a4922a Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Thu, 18 Jun 2026 01:47:21 -0700
Subject: [PATCH 37/91] FE basis/math test suite with improved documentation
 and clarity

---
 .../solver/FE/Math/DenseLinearAlgebra.cpp     |  5 ++
 .../unitTests/FE/Basis/test_BasisHessians.cpp | 49 -----------------
 .../FE/Basis/test_ConstexprBasis.cpp          | 17 +++---
 .../FE/Basis/test_HigherOrderWedge.cpp        |  3 +-
 .../unitTests/FE/Basis/test_LagrangeBasis.cpp | 53 ++++++++++++++++++-
 5 files changed, 69 insertions(+), 58 deletions(-)

diff --git a/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp b/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
index 5675bde86..662bec8fc 100644
--- a/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
+++ b/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
@@ -220,6 +220,11 @@ DenseInverseResult invert_dense_matrix_with_diagnostics(
                                                       static_cast<Eigen::Index>(n));
         const auto& singular_values = svd.singularValues();
         for (Eigen::Index i = 0; i < singular_values.size(); ++i) {
+            // Defensive: this branch runs only when condition_estimate is finite,
+            // and dense_matrix_diagnostics leaves it infinite whenever it drops a
+            // singular value (rank < full_rank). A sub-tolerance singular value
+            // therefore cannot reach here in current code; the guard protects
+            // against future refactors that derive the fallback condition differently.
             ::svmp::check_arg<FEException>(
                 singular_values[i] > result.diagnostics.tolerance, SVMP_HERE,
                 std::string(label) + ": high-condition SVD fallback encountered a dropped singular value");
diff --git a/tests/unitTests/FE/Basis/test_BasisHessians.cpp b/tests/unitTests/FE/Basis/test_BasisHessians.cpp
index 9ad458c0b..fb1d9e26f 100644
--- a/tests/unitTests/FE/Basis/test_BasisHessians.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisHessians.cpp
@@ -144,51 +144,6 @@ void expect_hessians_match_numerical(const BasisFunction& basis,
     }
 }
 
-void expect_partition_hessian_sum_zero(const LagrangeBasis& basis,
-                                       const math::Vector<Real, 3>& xi,
-                                       Real tol)
-{
-    std::vector<Hessian> hessians;
-    basis.evaluate_hessians(xi, hessians);
-
-    Hessian sum = Hessian::Zero();
-    for (const auto& hessian : hessians) {
-        for (std::size_t r = 0; r < 3u; ++r) {
-            for (std::size_t c = 0; c < 3u; ++c) {
-                sum(r, c) += hessian(r, c);
-            }
-        }
-    }
-
-    for (int r = 0; r < basis.dimension(); ++r) {
-        for (int c = 0; c < basis.dimension(); ++c) {
-            EXPECT_NEAR(sum(static_cast<std::size_t>(r), static_cast<std::size_t>(c)),
-                        Real(0),
-                        tol)
-                << "element " << static_cast<int>(basis.element_type())
-                << ", order " << basis.order();
-        }
-    }
-}
-
-void expect_hessians_symmetric(const LagrangeBasis& basis,
-                               const math::Vector<Real, 3>& xi,
-                               Real tol)
-{
-    std::vector<Hessian> hessians;
-    basis.evaluate_hessians(xi, hessians);
-
-    for (const auto& hessian : hessians) {
-        for (int r = 0; r < basis.dimension(); ++r) {
-            for (int c = r + 1; c < basis.dimension(); ++c) {
-                const std::size_t sr = static_cast<std::size_t>(r);
-                const std::size_t sc = static_cast<std::size_t>(c);
-                EXPECT_NEAR(hessian(sr, sc), hessian(sc, sr), tol);
-            }
-        }
-    }
-}
-
 void expect_partition_hessian_sum_zero(const BasisFunction& basis,
                                        const math::Vector<Real, 3>& xi,
                                        Real tol)
@@ -331,15 +286,11 @@ TEST(BasisHessians, SolverMappedVolumeSelectionsSatisfyInvariants) {
         {ElementType::Wedge6, BasisType::Lagrange, 1, {Real(0.2), Real(0.15), Real(-0.3)}, Real(1e-12)},
     };
 
-    int covered = 0;
     for (const auto& c : cases) {
         auto basis = basis_factory::create(BasisRequest{c.type, c.basis_type, c.order});
         expect_partition_hessian_sum_zero(*basis, c.xi, c.tol);
         expect_hessians_symmetric(*basis, c.xi, c.tol);
-        ++covered;
     }
-
-    EXPECT_EQ(covered, 13);
 }
 
 // Gradients must match centered finite differences of values. This is the only
diff --git a/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp b/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
index e2a7bfb6d..add2d256c 100644
--- a/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
@@ -41,10 +41,10 @@ static_assert(complete_lagrange_alias_order(ElementType::Wedge18) == 2);
 static_assert(complete_lagrange_alias_order(ElementType::Pyramid14) == -1);
 static_assert(detail::basis_abs(Real(-2)) == Real(2));
 static_assert(detail::basis_max(Real(2), Real(3)) == Real(3));
-static_assert(detail::basis_near_zero(std::numeric_limits<Real>::epsilon() * Real(32)));
+static_assert(detail::basis_near_zero(detail::basis_scaled_tolerance() * Real(0.5)));
 static_assert(detail::basis_nearly_equal(
     Real(1),
-    Real(1) + std::numeric_limits<Real>::epsilon() * Real(32)));
+    Real(1) + detail::basis_scaled_tolerance() * Real(0.5)));
 
 TEST(ConstexprBasis, FixedNodeTableSizesForSupportedLayouts) {
     const std::vector<std::pair<ElementType, std::size_t>> expected = {
@@ -72,11 +72,14 @@ TEST(ConstexprBasis, FixedNodeTableSizesForSupportedLayouts) {
 
 TEST(ConstexprBasis, TraitToleranceScalesWithRealPrecision) {
     const Real eps = std::numeric_limits<Real>::epsilon();
-    EXPECT_GT(detail::basis_scaled_tolerance(), eps);
-    EXPECT_TRUE(detail::basis_near_zero(eps * Real(32)));
-    EXPECT_FALSE(detail::basis_near_zero(eps * Real(128)));
-    EXPECT_TRUE(detail::basis_nearly_equal(Real(1), Real(1) + eps * Real(32)));
-    EXPECT_FALSE(detail::basis_nearly_equal(Real(1), Real(1) + eps * Real(128)));
+    const Real tol = detail::basis_scaled_tolerance();
+    // Probes straddle the tolerance itself rather than hardcoding the multiplier,
+    // so retuning basis_scaled_tolerance cannot silently invalidate them.
+    EXPECT_GT(tol, eps);
+    EXPECT_TRUE(detail::basis_near_zero(tol * Real(0.5)));
+    EXPECT_FALSE(detail::basis_near_zero(tol * Real(2)));
+    EXPECT_TRUE(detail::basis_nearly_equal(Real(1), Real(1) + tol * Real(0.5)));
+    EXPECT_FALSE(detail::basis_nearly_equal(Real(1), Real(1) + tol * Real(2)));
 }
 
 TEST(ConstexprBasis, CompleteAliasTablesMatchGeneratedLagrangeNodes) {
diff --git a/tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp b/tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp
index 8827eebb0..3ba943e04 100644
--- a/tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp
+++ b/tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp
@@ -144,7 +144,8 @@ TEST(HigherOrderWedge, OrderFourEvaluationsRemainFinite) {
 TEST(HigherOrderWedge, OrderFourIsNodalAndPartitionsUnity) {
     LagrangeBasis wedge(ElementType::Wedge6, 4);
 
-    EXPECT_EQ(wedge.size(), 75u);
+    // Order-4 wedge = triangle(order 4) x line(order 4) = 15 x 5 nodes.
+    EXPECT_EQ(wedge.size(), 15u * 5u);
     expect_kronecker_at_nodes(wedge, Real(1e-9));
     expect_partition_gradient_hessian_sums(
         wedge,
diff --git a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
index 68232d216..8288b4c37 100644
--- a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
@@ -285,6 +285,16 @@ TEST(LagrangeBasis, CompleteAliasesNormalizeToCanonicalBases) {
     }
 }
 
+// CompleteAliasesNormalizeToCanonicalBases pins the alias floor (a named
+// quadratic alias requested below order 2 is raised to 2). This pins the
+// complementary direction documented in normalize_lagrange_request: a higher
+// requested order on an alias is honored, not clamped to the alias order.
+TEST(LagrangeBasis, QuadraticAliasHonorsHigherRequestedOrder) {
+    const LagrangeBasis basis(ElementType::Hex27, 3);
+    EXPECT_EQ(basis.element_type(), ElementType::Hex8);
+    EXPECT_EQ(basis.order(), 3);
+}
+
 TEST(LagrangeBasis, NodeOrderingMatchesPublicAliasLayouts) {
     const std::vector<std::tuple<ElementType, ElementType, int>> aliases = {
         {ElementType::Line2, ElementType::Line2, 1},
@@ -330,6 +340,11 @@ TEST(LagrangeBasis, RemovedOrSerendipityFamiliesAreRejected) {
     }
 }
 
+// The polynomial-reproduction and higher-order-lattice tests here validate
+// VALUES and derivative invariants (gradient/Hessian sums). The authoritative
+// finite-difference checks of gradient and Hessian *values* live in
+// test_BasisHessians.cpp (BasisGradients/BasisHessians suites), covering the
+// canonical Lagrange topologies and the serendipity families.
 TEST(LagrangeBasis, LinearPolynomialReproductionAcrossLinearTopologies) {
     const std::vector<std::pair<ElementType, Point>> cases = {
         {ElementType::Line2, {Real(-0.2), Real(0), Real(0)}},
@@ -379,7 +394,7 @@ TEST(LagrangeBasis, QuadraticPolynomialReproductionAcrossQuadraticAliases) {
     };
 
     for (const auto& [type, point] : cases) {
-        LagrangeBasis basis(type, 1);
+        LagrangeBasis basis(type, 2);
         std::vector<Real> values;
         basis.evaluate_values(point, values);
 
@@ -422,6 +437,42 @@ TEST(LagrangeBasis, HigherOrderLatticesAreNodalAndPartitionUnity) {
     }
 }
 
+// The Kronecker test above proves the order-3 hex lattice is nodal, but a
+// permuted-yet-consistent face ordering would also pass it. This pins the
+// load-bearing external contract of the order>=3 face-interior emission: the
+// six face-interior blocks appear in VTK face order (-X, +X, -Y, +Y, -Z, +Z)
+// and lie on the correct face. (The within-face traversal is an internal
+// convention and is not separately pinned here.)
+TEST(LagrangeBasis, HigherOrderHexFaceInteriorFollowsVtkFaceOrder) {
+    // Order-3 hex (64 nodes): 8 vertices + 24 edge nodes + 24 face-interior
+    // (6 faces x (order-1)^2 = 4 each) + 8 volume. Face-interior block at [32, 56).
+    const auto nodes = ReferenceNodeLayout::get_lagrange_node_coords(ElementType::Hex8, 3);
+    ASSERT_EQ(nodes.size(), 64u);
+
+    struct FaceBlock {
+        std::size_t axis;  // constant axis: 0=x, 1=y, 2=z
+        Real value;        // constant coordinate on the face
+    };
+    const FaceBlock blocks[] = {
+        {0u, Real(-1)},  // -X
+        {0u, Real(1)},   // +X
+        {1u, Real(-1)},  // -Y
+        {1u, Real(1)},   // +Y
+        {2u, Real(-1)},  // -Z
+        {2u, Real(1)},   // +Z
+    };
+
+    constexpr std::size_t kFaceStart = 32u;
+    constexpr std::size_t kPerFace = 4u;  // (order-1)^2 at order 3
+    for (std::size_t f = 0; f < 6u; ++f) {
+        for (std::size_t m = 0; m < kPerFace; ++m) {
+            const auto& node = nodes[kFaceStart + f * kPerFace + m];
+            EXPECT_NEAR(node[blocks[f].axis], blocks[f].value, Real(1e-14))
+                << "face block " << f << ", node " << m;
+        }
+    }
+}
+
 TEST(LagrangeBasis, CubicPolynomialReproductionAtOrderThree) {
     const std::vector<std::pair<ElementType, Point>> cases = {
         {ElementType::Tetra4, {Real(0.15), Real(0.2), Real(0.25)}},

From 58b80a253646a3d0cbaaa7a56f243a382b5c75bd Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Thu, 18 Jun 2026 02:44:12 -0700
Subject: [PATCH 38/91] removed the DenseTransformKernels.h and the associated
 unit tests

---
 .../solver/FE/Math/DenseTransformKernels.h    | 81 -------------------
 1 file changed, 81 deletions(-)
 delete mode 100644 Code/Source/solver/FE/Math/DenseTransformKernels.h

diff --git a/Code/Source/solver/FE/Math/DenseTransformKernels.h b/Code/Source/solver/FE/Math/DenseTransformKernels.h
deleted file mode 100644
index 1905f2776..000000000
--- a/Code/Source/solver/FE/Math/DenseTransformKernels.h
+++ /dev/null
@@ -1,81 +0,0 @@
-// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef SVMP_FE_MATH_DENSETRANSFORMKERNELS_H
-#define SVMP_FE_MATH_DENSETRANSFORMKERNELS_H
-
-#include "FEException.h"
-#include "Types.h"
-
-#include <Eigen/Core>
-
-#include <cstddef>
-#include <span>
-
-namespace svmp {
-namespace FE {
-namespace math {
-
-/// \brief Apply a row-major dense matrix to a batch of right-hand sides.
-///
-/// Computes output = matrix * input where matrix is rows-by-cols (row-major),
-/// input holds cols rows of rhs_count values each (row stride
-/// input_row_stride), and output holds rows rows of rhs_count values each
-/// (row stride output_row_stride). Strides may exceed rhs_count for padded
-/// layouts; padding entries are left untouched.
-inline void dense_transform_batched_row_major(
-    std::span<const Real> matrix,
-    std::size_t rows,
-    std::size_t cols,
-    std::span<const Real> input,
-    std::size_t input_row_stride,
-    std::span<Real> output,
-    std::size_t output_row_stride,
-    std::size_t rhs_count) {
-    if (rows == 0u || cols == 0u || rhs_count == 0u) {
-        return;
-    }
-
-    svmp::throw_if<FEException>(matrix.size() < rows * cols, SVMP_HERE,
-                              "dense_transform_batched_row_major: matrix span is too small");
-    svmp::throw_if<FEException>(input_row_stride < rhs_count, SVMP_HERE,
-                              "dense_transform_batched_row_major: input stride is smaller than RHS count");
-    svmp::throw_if<FEException>(output_row_stride < rhs_count, SVMP_HERE,
-                              "dense_transform_batched_row_major: output stride is smaller than RHS count");
-    svmp::throw_if<FEException>(
-        input.size() < (cols - 1u) * input_row_stride + rhs_count, SVMP_HERE,
-        "dense_transform_batched_row_major: input span is too small");
-    svmp::throw_if<FEException>(
-        output.size() < (rows - 1u) * output_row_stride + rhs_count, SVMP_HERE,
-        "dense_transform_batched_row_major: output span is too small");
-
-    using RowMajorMatrix =
-        Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
-    using ConstMap = Eigen::Map<const RowMajorMatrix>;
-    using ConstStridedMap =
-        Eigen::Map<const RowMajorMatrix, Eigen::Unaligned, Eigen::OuterStride<>>;
-    using StridedMap =
-        Eigen::Map<RowMajorMatrix, Eigen::Unaligned, Eigen::OuterStride<>>;
-
-    const ConstMap matrix_map(matrix.data(),
-                              static_cast<Eigen::Index>(rows),
-                              static_cast<Eigen::Index>(cols));
-    const ConstStridedMap input_map(
-        input.data(),
-        static_cast<Eigen::Index>(cols),
-        static_cast<Eigen::Index>(rhs_count),
-        Eigen::OuterStride<>(static_cast<Eigen::Index>(input_row_stride)));
-    StridedMap output_map(
-        output.data(),
-        static_cast<Eigen::Index>(rows),
-        static_cast<Eigen::Index>(rhs_count),
-        Eigen::OuterStride<>(static_cast<Eigen::Index>(output_row_stride)));
-
-    output_map.noalias() = matrix_map * input_map;
-}
-
-} // namespace math
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_MATH_DENSETRANSFORMKERNELS_H

From 1d93c5cfd75d43f1e763cff3b943fdca671bfd72 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Thu, 18 Jun 2026 02:55:15 -0700
Subject: [PATCH 39/91] added quadrilateral-serendipity justification and
 removed exposed Eigen types

---
 .../solver/FE/Basis/SerendipityBasis.cpp      |  4 ++++
 .../solver/FE/Math/DenseLinearAlgebra.cpp     | 24 +++++++++++++------
 .../solver/FE/Math/DenseLinearAlgebra.h       | 16 +++++++++----
 3 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index f40550880..9e118c6da 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -189,6 +189,10 @@ std::vector<Real> quad_serendipity_inverse_vandermonde(
         }
     }
 
+    // Quadrilateral serendipity bases are generated from the requested
+    // monomial space, so a small dense inverse produces the nodal coefficient
+    // table at construction time. Hex20 and Wedge15 use fixed tables because
+    // only their quadratic layouts are supported here.
     const std::string label = "Quad order " + std::to_string(order);
     return invert_dense_matrix(std::move(vandermonde), n, label.c_str());
 }
diff --git a/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp b/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
index 662bec8fc..db3f8561d 100644
--- a/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
+++ b/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
@@ -19,7 +19,7 @@ namespace math {
 
 namespace {
 
-using DenseMatrix = DenseLUSolver::DenseMatrix;
+using DenseMatrix = Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic>;
 using RowMajorMatrix =
     Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
 using ConstRowMajorMap = Eigen::Map<const RowMajorMatrix>;
@@ -41,6 +41,15 @@ void copy_to_row_major(const DenseMatrix& source, std::vector<Real>& dest) {
 
 } // namespace
 
+struct DenseLUSolver::Impl {
+    Eigen::PartialPivLU<DenseMatrix> lu;
+};
+
+DenseLUSolver::DenseLUSolver() = default;
+DenseLUSolver::~DenseLUSolver() = default;
+DenseLUSolver::DenseLUSolver(DenseLUSolver&&) noexcept = default;
+DenseLUSolver& DenseLUSolver::operator=(DenseLUSolver&&) noexcept = default;
+
 Real dense_matrix_max_abs(std::span<const Real> matrix) noexcept {
     Real max_abs = Real(0);
     for (const Real value : matrix) {
@@ -90,7 +99,7 @@ void DenseLUSolver::solve_in_place(std::span<Real> rhs,
         rhs.size() == n * rhs_count, SVMP_HERE,
         label + ": dense multi-RHS solve size mismatch");
     ::svmp::check_arg<FEException>(
-        lu.rows() == static_cast<Eigen::Index>(n), SVMP_HERE,
+        impl && impl->lu.rows() == static_cast<Eigen::Index>(n), SVMP_HERE,
         label + ": dense solver is not factorized");
     if (n == 0) {
         return;
@@ -100,7 +109,7 @@ void DenseLUSolver::solve_in_place(std::span<Real> rhs,
                                        static_cast<Eigen::Index>(n),
                                        static_cast<Eigen::Index>(rhs_count));
     // Evaluate into a temporary: lu.solve cannot alias its argument.
-    const DenseMatrix solution = lu.solve(rhs_map);
+    const DenseMatrix solution = impl->lu.solve(rhs_map);
     rhs_map = solution;
 }
 
@@ -166,13 +175,14 @@ DenseLUSolver factor_dense_matrix(std::vector<Real> matrix,
         dense_matrix_max_abs(std::span<const Real>(matrix.data(), matrix.size()));
     solver.pivot_tolerance = dense_matrix_pivot_tolerance(n, n, max_abs);
 
-    solver.lu.compute(map_row_major(matrix, n, n));
+    solver.impl = std::make_unique<DenseLUSolver::Impl>();
+    solver.impl->lu.compute(map_row_major(matrix, n, n));
 
     // Partial pivoting leaves the pivots on the diagonal of the packed LU
     // factor; a pivot below the scale-aware tolerance marks rank deficiency.
     Real max_pivot_abs = Real(0);
     Real min_pivot_abs = std::numeric_limits<Real>::infinity();
-    const auto diagonal = solver.lu.matrixLU().diagonal();
+    const auto diagonal = solver.impl->lu.matrixLU().diagonal();
     for (Eigen::Index col = 0; col < diagonal.size(); ++col) {
         const Real pivot_magnitude = std::abs(diagonal[col]);
         ::svmp::check_arg<FEException>(
@@ -236,7 +246,7 @@ DenseInverseResult invert_dense_matrix_with_diagnostics(
         return result;
     }
 
-    const DenseMatrix inverse = solver.lu.inverse();
+    const DenseMatrix inverse = solver.impl->lu.inverse();
     copy_to_row_major(inverse, result.inverse);
     return result;
 }
@@ -267,7 +277,7 @@ std::vector<Real> invert_dense_matrix(std::vector<Real> matrix,
                                       std::size_t n,
                                       std::string_view label) {
     const DenseLUSolver solver = factor_dense_matrix(std::move(matrix), n, label);
-    const DenseMatrix inverse = solver.lu.inverse();
+    const DenseMatrix inverse = solver.impl->lu.inverse();
     std::vector<Real> result;
     copy_to_row_major(inverse, result);
     return result;
diff --git a/Code/Source/solver/FE/Math/DenseLinearAlgebra.h b/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
index 08669de74..c94351bb3 100644
--- a/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
+++ b/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
@@ -6,10 +6,9 @@
 
 #include "Types.h"
 
-#include <Eigen/Dense>
-
 #include <cstddef>
 #include <limits>
+#include <memory>
 #include <span>
 #include <string>
 #include <string_view>
@@ -20,7 +19,7 @@ namespace FE {
 namespace math {
 
 // Dense solve, inverse, rank, and pseudo-inverse support for FE construction
-// utilities, backed by Eigen. Matrices are row-major: matrix[row * cols + col].
+// utilities. Matrices are row-major: matrix[row * cols + col].
 [[nodiscard]] Real dense_matrix_max_abs(std::span<const Real> matrix) noexcept;
 
 [[nodiscard]] Real dense_matrix_pivot_tolerance(std::size_t rows,
@@ -59,15 +58,22 @@ struct DenseInverseResult {
 [[nodiscard]] Real dense_matrix_condition_error_threshold() noexcept;
 
 struct DenseLUSolver {
-    using DenseMatrix = Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic>;
+    struct Impl;
+
+    DenseLUSolver();
+    ~DenseLUSolver();
+    DenseLUSolver(DenseLUSolver&&) noexcept;
+    DenseLUSolver& operator=(DenseLUSolver&&) noexcept;
+    DenseLUSolver(const DenseLUSolver&) = delete;
+    DenseLUSolver& operator=(const DenseLUSolver&) = delete;
 
     std::size_t n{0};
-    Eigen::PartialPivLU<DenseMatrix> lu;
     DenseMatrixDiagnostics diagnostics;
     Real pivot_tolerance{0};
     Real min_pivot{0};
     Real max_pivot{0};
     std::string label;
+    std::unique_ptr<Impl> impl;
 
     [[nodiscard]] bool empty() const noexcept { return n == 0; }
 

From 7ba882874ac597dd07ba1887394e54afcec320fe Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Thu, 18 Jun 2026 08:35:58 -0700
Subject: [PATCH 40/91] replaced quadrilateral serendipity interior-node
 heuristic with triangular lattice and high-order unisolvence regression
 coverage

---
 .../solver/FE/Basis/SerendipityBasis.cpp      |  79 ++++---
 .../Source/solver/FE/Basis/SerendipityBasis.h |  39 +++-
 .../unitTests/FE/Basis/test_BasisHessians.cpp |  55 +++++
 .../FE/Basis/test_SerendipityBasis.cpp        | 212 +++++++++++++++++-
 4 files changed, 338 insertions(+), 47 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index 9e118c6da..ed8c073f3 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -83,6 +83,40 @@ std::vector<std::array<int, 2>> quad_serendipity_exponents(int order) {
     return exponents;
 }
 
+std::size_t quad_serendipity_interior_count(int order) {
+    if (order < 4) {
+        return 0u;
+    }
+    const auto m = static_cast<std::size_t>(order - 4);
+    return (m + 1u) * (m + 2u) / 2u;
+}
+
+// Interior nodes are a triangular row set for P_m, m = order - 4. If a
+// serendipity polynomial vanishes at the p + 1 boundary nodes on each edge,
+// each edge restriction is identically zero and the polynomial factors as
+// (1 - x^2)(1 - y^2) q with q in P_m. Row 0 has m + 1 distinct x-values; if q
+// vanishes there, q(x, y_0) is the zero one-variable polynomial and
+// q = (y - y_0) q_1 with q_1 in P_{m-1}. Repeating over the remaining rows
+// proves q = 0, so the full quadrilateral serendipity Vandermonde is
+// nonsingular for this node set.
+void append_quad_serendipity_interior_nodes(std::vector<Vec3>& nodes, int order) {
+    if (order < 4) {
+        return;
+    }
+
+    const int m = order - 4;
+    const Real y_denominator = Real(m + 2);
+    for (int row = 0; row <= m; ++row) {
+        const int row_count = m + 1 - row;
+        const Real y = Real(-1) + Real(2) * Real(row + 1) / y_denominator;
+        const Real x_denominator = Real(row_count + 1);
+        for (int col = 0; col < row_count; ++col) {
+            const Real x = Real(-1) + Real(2) * Real(col + 1) / x_denominator;
+            nodes.push_back(Vec3{x, y, Real(0)});
+        }
+    }
+}
+
 std::vector<Vec3> quad_serendipity_nodes(int order, std::size_t total_size) {
     std::vector<Vec3> nodes;
     if (order <= 0) {
@@ -113,49 +147,12 @@ std::vector<Vec3> quad_serendipity_nodes(int order, std::size_t total_size) {
         nodes.size() > total_size, SVMP_HERE,
         "SerendipityBasis: quadrilateral serendipity boundary nodes exceed requested size");
 
-    const std::size_t interior_count = total_size - nodes.size();
-    if (interior_count == 0u) {
-        return nodes;
-    }
-
-    std::vector<Vec3> interior_candidates;
-    interior_candidates.reserve(static_cast<std::size_t>((order - 1) * (order - 1)));
-    for (int j = 1; j < order; ++j) {
-        for (int i = 1; i < order; ++i) {
-            interior_candidates.push_back(
-                Vec3{Real(-1) + Real(2 * i) * inv_order,
-                     Real(-1) + Real(2 * j) * inv_order,
-                     Real(0)});
-        }
-    }
-
-    std::sort(interior_candidates.begin(), interior_candidates.end(),
-              [](const Vec3& a, const Vec3& b) {
-                  const Real a_linf = std::max(std::abs(a[0]), std::abs(a[1]));
-                  const Real b_linf = std::max(std::abs(b[0]), std::abs(b[1]));
-                  if (a_linf != b_linf) {
-                      return a_linf < b_linf;
-                  }
-
-                  const Real a_l1 = std::abs(a[0]) + std::abs(a[1]);
-                  const Real b_l1 = std::abs(b[0]) + std::abs(b[1]);
-                  if (a_l1 != b_l1) {
-                      return a_l1 < b_l1;
-                  }
-
-                  if (a[1] != b[1]) {
-                      return a[1] < b[1];
-                  }
-                  return a[0] < b[0];
-              });
-
+    const std::size_t interior_count = quad_serendipity_interior_count(order);
     svmp::throw_if<BasisConstructionException>(
-        interior_count > interior_candidates.size(), SVMP_HERE,
-        "SerendipityBasis: insufficient quadrilateral interior nodes for requested serendipity order");
+        nodes.size() + interior_count != total_size, SVMP_HERE,
+        "SerendipityBasis: quadrilateral serendipity monomial/node count mismatch");
 
-    nodes.insert(nodes.end(),
-                 interior_candidates.begin(),
-                 interior_candidates.begin() + static_cast<std::ptrdiff_t>(interior_count));
+    append_quad_serendipity_interior_nodes(nodes, order);
     return nodes;
 }
 
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.h b/Code/Source/solver/FE/Basis/SerendipityBasis.h
index 95a9d0ad9..930875fe7 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.h
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.h
@@ -50,6 +50,35 @@ namespace basis {
 /// matrix at the selected reference nodes. Values, gradients, and Hessians are
 /// then evaluated by differentiating the monomial vector and applying the
 /// inverse Vandermonde coefficients.
+/// For order \f$p \ge 1\f$, this space has \f$4p\f$ boundary modes for
+/// \f$p \le 3\f$ and
+/// \f[
+///   4p + \frac{(p - 3)(p - 2)}{2}
+/// \f]
+/// modes for \f$p \ge 4\f$.
+///
+/// The quadrilateral node set is unisolvent by construction. If
+/// \f$s(x,y)\f$ in this space vanishes at the \f$p + 1\f$ distinct nodes on
+/// every edge, each edge restriction is a degree-\f$p\f$ one-variable
+/// polynomial with \f$p + 1\f$ roots, so all edge restrictions vanish. Thus
+/// \f$s\f$ is divisible by the boundary bubble
+/// \f$(1 - x^2)(1 - y^2)\f$, and the quotient lies in
+/// \f$P_{p-4}\f$ (with no quotient for \f$p < 4\f$). For \f$p \ge 4\f$, the
+/// interior nodes form triangular rows for \f$P_{p-4}\f$: the first row has
+/// \f$m + 1\f$ distinct \f$x\f$ values, the next row has \f$m\f$, and so on
+/// for \f$m = p - 4\f$. A total-degree polynomial that vanishes on those rows
+/// is zero by induction over rows, because each vanished row factors out one
+/// linear term in \f$y\f$. The interpolation Vandermonde is therefore
+/// nonsingular for the implemented quadrilateral serendipity space.
+///
+/// `SerendipityBasis(ElementType::Quad4, p)` supports explicit
+/// arbitrary-order quadrilateral serendipity requests for \f$p \ge 1\f$
+/// (requests below one are normalized to one). `ElementType::Quad8` remains
+/// the standard quadratic eight-node layout and is valid only with order 2.
+/// Solver-default basis selection remains separate: `basis_factory` maps the
+/// complete Quad4 layout to the default linear Lagrange basis and maps Quad8 to
+/// quadratic serendipity unless a caller explicitly requests a different
+/// supported basis.
 ///
 /// Hex8 uses the standard trilinear corner basis
 /// \f$(1 \pm r)(1 \pm s)(1 \pm t)/8\f$. Hex20 and Wedge15 use tabulated
@@ -67,7 +96,9 @@ class SerendipityBasis final : public BasisFunction {
     /// invert a Vandermonde matrix for the selected serendipity monomials.
     /// Hex20 and Wedge15 use fixed coefficient tables. For hexahedra, only
     /// linear Hex8 and quadratic Hex20 serendipity spaces are supported. For
-    /// wedges, only quadratic Wedge15 is supported.
+    /// wedges, only quadratic Wedge15 is supported. Quad4 supports explicit
+    /// quadrilateral serendipity requests of any order \f$p \ge 1\f$; Quad8 is
+    /// restricted to order 2.
     ///
     /// \param type Element type used to determine topology and reference-node layout.
     /// \param order Requested polynomial order.
@@ -97,7 +128,11 @@ class SerendipityBasis final : public BasisFunction {
     /// placed first on the boundary and then, for higher order requests, at the
     /// selected interior points needed to make the reduced monomial space
     /// unisolvent. Hexahedral and wedge nodes are taken from
-    /// ReferenceNodeLayout.
+    /// ReferenceNodeLayout. For high-order Quad4 serendipity, the deterministic
+    /// interior row ordering is an implementation convention; callers should
+    /// pair it with basis values from the same object rather than assume an
+    /// external mesh ordering contract beyond the supported Quad4/Quad8
+    /// production layouts.
     ///
     /// \return Reference node coordinates, one per basis function.
     const std::vector<math::Vector<Real, 3>>& nodes() const noexcept final { return nodes_; }
diff --git a/tests/unitTests/FE/Basis/test_BasisHessians.cpp b/tests/unitTests/FE/Basis/test_BasisHessians.cpp
index fb1d9e26f..0bf0b3d33 100644
--- a/tests/unitTests/FE/Basis/test_BasisHessians.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisHessians.cpp
@@ -189,6 +189,38 @@ void expect_hessians_symmetric(const BasisFunction& basis,
     }
 }
 
+void expect_inactive_z_derivatives_zero(const BasisFunction& basis,
+                                        const std::vector<math::Vector<Real, 3>>& points,
+                                        Real tol)
+{
+    ASSERT_EQ(basis.dimension(), 2);
+    for (const auto& xi : points) {
+        std::vector<Gradient> gradients;
+        std::vector<Hessian> hessians;
+        basis.evaluate_gradients(xi, gradients);
+        basis.evaluate_hessians(xi, hessians);
+
+        ASSERT_EQ(gradients.size(), basis.size());
+        ASSERT_EQ(hessians.size(), basis.size());
+        for (std::size_t n = 0; n < basis.size(); ++n) {
+            EXPECT_NEAR(gradients[n][2], Real(0), tol)
+                << "basis " << n << ", element "
+                << static_cast<int>(basis.element_type())
+                << ", order " << basis.order();
+            for (std::size_t d = 0; d < 3u; ++d) {
+                EXPECT_NEAR(hessians[n](2, d), Real(0), tol)
+                    << "basis " << n << ", component (2," << d
+                    << "), element " << static_cast<int>(basis.element_type())
+                    << ", order " << basis.order();
+                EXPECT_NEAR(hessians[n](d, 2), Real(0), tol)
+                    << "basis " << n << ", component (" << d
+                    << ",2), element " << static_cast<int>(basis.element_type())
+                    << ", order " << basis.order();
+            }
+        }
+    }
+}
+
 std::vector<math::Vector<Real, 3>> serendipity_sample_points(ElementType type) {
     if (type == ElementType::Quad4 || type == ElementType::Quad8) {
         return {{Real(0.17), Real(-0.31), Real(0)}, {Real(-0.45), Real(0.25), Real(0)}};
@@ -333,6 +365,7 @@ TEST(BasisGradients, SerendipityFamiliesMatchNumericalGradients) {
         {ElementType::Quad8, 2, Real(1e-7)},
         {ElementType::Quad4, 3, Real(1e-7)},
         {ElementType::Quad4, 4, Real(5e-7)},
+        {ElementType::Quad4, 6, Real(2e-6)},
         {ElementType::Hex8, 1, Real(1e-8)},
         {ElementType::Hex20, 2, Real(1e-7)},
         {ElementType::Wedge15, 2, Real(1e-7)},
@@ -344,6 +377,27 @@ TEST(BasisGradients, SerendipityFamiliesMatchNumericalGradients) {
     }
 }
 
+TEST(BasisGradients, QuadrilateralSerendipityInactiveZDerivativesRemainZero) {
+    const struct Case {
+        ElementType type;
+        int order;
+    } cases[] = {
+        {ElementType::Quad4, 1},
+        {ElementType::Quad8, 2},
+        {ElementType::Quad4, 4},
+        {ElementType::Quad4, 6},
+        {ElementType::Quad4, 10},
+    };
+
+    for (const auto& c : cases) {
+        SerendipityBasis basis(c.type, c.order);
+        expect_inactive_z_derivatives_zero(
+            basis,
+            serendipity_sample_points(c.type),
+            Real(1e-12));
+    }
+}
+
 TEST(BasisHessians, SerendipityFamiliesMatchNumericalHessians) {
     const struct Case {
         ElementType type;
@@ -354,6 +408,7 @@ TEST(BasisHessians, SerendipityFamiliesMatchNumericalHessians) {
         {ElementType::Quad8, 2, Real(1e-6)},
         {ElementType::Quad4, 3, Real(1e-6)},
         {ElementType::Quad4, 4, Real(5e-6)},
+        {ElementType::Quad4, 6, Real(2e-5)},
         {ElementType::Hex8, 1, Real(1e-6)},
         {ElementType::Hex20, 2, Real(1e-6)},
         {ElementType::Wedge15, 2, Real(1e-6)},
diff --git a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
index d44631734..4fbc321a8 100644
--- a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
@@ -8,7 +8,10 @@
 #include "FE/Basis/LagrangeBasis.h"
 #include "FE/Basis/NodeOrderingConventions.h"
 #include "FE/Basis/SerendipityBasis.h"
+#include "FE/Math/DenseLinearAlgebra.h"
 
+#include <algorithm>
+#include <array>
 #include <cmath>
 #include <vector>
 
@@ -86,6 +89,87 @@ Real interpolate_nodal_function(const SerendipityBasis& basis,
     return result;
 }
 
+int quad_serendipity_superlinear_degree_for_test(int ax, int ay) {
+    return (ax > 1 ? ax : 0) + (ay > 1 ? ay : 0);
+}
+
+std::vector<std::array<int, 2>> quad_serendipity_exponents_for_test(int order) {
+    std::vector<std::array<int, 2>> exponents;
+    for (int ay = 0; ay <= order; ++ay) {
+        for (int ax = 0; ax <= order; ++ax) {
+            if (quad_serendipity_superlinear_degree_for_test(ax, ay) <= order) {
+                exponents.push_back({ax, ay});
+            }
+        }
+    }
+    return exponents;
+}
+
+std::size_t expected_quad_serendipity_size(int order) {
+    const auto p = static_cast<std::size_t>(order);
+    const std::size_t boundary = 4u * p;
+    if (order < 4) {
+        return boundary;
+    }
+    const auto m = static_cast<std::size_t>(order - 4);
+    return boundary + (m + 1u) * (m + 2u) / 2u;
+}
+
+Real integer_power_for_test(Real base, int exponent) {
+    Real result = Real(1);
+    for (int k = 0; k < exponent; ++k) {
+        result *= base;
+    }
+    return result;
+}
+
+Real monomial_value_for_test(const math::Vector<Real, 3>& p,
+                             const std::array<int, 2>& exponent) {
+    return integer_power_for_test(p[0], exponent[0]) *
+           integer_power_for_test(p[1], exponent[1]);
+}
+
+std::vector<Real> quadrilateral_vandermonde_for_test(
+    const std::vector<math::Vector<Real, 3>>& nodes,
+    const std::vector<std::array<int, 2>>& exponents)
+{
+    const std::size_t n = nodes.size();
+    std::vector<Real> vandermonde(n * n, Real(0));
+    for (std::size_t row = 0; row < n; ++row) {
+        for (std::size_t col = 0; col < n; ++col) {
+            vandermonde[row * n + col] =
+                monomial_value_for_test(nodes[row], exponents[col]);
+        }
+    }
+    return vandermonde;
+}
+
+void expect_no_duplicate_nodes(const std::vector<math::Vector<Real, 3>>& nodes,
+                               Real tolerance)
+{
+    for (std::size_t a = 0; a < nodes.size(); ++a) {
+        for (std::size_t b = a + 1u; b < nodes.size(); ++b) {
+            const Real dx = std::abs(nodes[a][0] - nodes[b][0]);
+            const Real dy = std::abs(nodes[a][1] - nodes[b][1]);
+            EXPECT_GT(std::max(dx, dy), tolerance)
+                << "duplicate nodes " << a << " and " << b;
+        }
+    }
+}
+
+void expect_nodes_near(const std::vector<math::Vector<Real, 3>>& actual,
+                       const std::vector<math::Vector<Real, 3>>& expected,
+                       Real tolerance)
+{
+    ASSERT_EQ(actual.size(), expected.size());
+    for (std::size_t i = 0; i < actual.size(); ++i) {
+        for (std::size_t d = 0; d < 3u; ++d) {
+            EXPECT_NEAR(actual[i][d], expected[i][d], tolerance)
+                << "node=" << i << " component=" << d;
+        }
+    }
+}
+
 // Every monomial here has superlinear degree at most three, so it lies in the
 // order-three quadrilateral serendipity space.
 Real cubic_serendipity_function(const math::Vector<Real, 3>& p) {
@@ -104,8 +188,10 @@ Real bilinear_function(const math::Vector<Real, 3>& p) {
 
 TEST(SerendipityBasis, Quad8IsNodalAndPartitionsUnity) {
     SerendipityBasis basis(ElementType::Quad8, 2);
+    SerendipityBasis explicit_quad4_basis(ElementType::Quad4, 2);
 
     EXPECT_EQ(basis.size(), 8u);
+    expect_nodes_near(basis.nodes(), explicit_quad4_basis.nodes(), Real(1e-14));
     expect_nodal_delta(basis, basis.nodes(), Real(1e-10));
     expect_partition_of_unity(basis, {Real(0.17), Real(-0.31), Real(0)});
 }
@@ -135,12 +221,21 @@ TEST(SerendipityBasis, RejectsUnsupportedSerendipityAliases) {
     EXPECT_THROW(SerendipityBasis(ElementType::Pyramid13, 2), FEException);
     EXPECT_THROW(SerendipityBasis(ElementType::Pyramid14, 2), FEException);
     EXPECT_THROW(SerendipityBasis(ElementType::Quad8, 3), FEException);
+    EXPECT_THROW(SerendipityBasis(ElementType::Quad8, 1), FEException);
 }
 
-// Orders other than two run the generic quadrilateral path: serendipity
-// monomial selection, boundary plus interior node placement, and a runtime
-// Vandermonde inversion whose unisolvence is assumed rather than tabulated.
-// Order four is the first order that selects an interior node.
+TEST(SerendipityBasis, QuadrilateralOrderZeroNormalizesToLinear) {
+    SerendipityBasis basis(ElementType::Quad4, 0);
+
+    EXPECT_EQ(basis.order(), 1);
+    EXPECT_EQ(basis.size(), 4u);
+    expect_nodal_delta(basis, basis.nodes(), Real(1e-12));
+}
+
+// Explicit Quad4 serendipity orders run the documented monomial selection,
+// boundary plus triangular interior node placement, and runtime Vandermonde
+// inversion. Order four is the first order with an interior residual
+// polynomial, so it is the first order that appends an interior node.
 TEST(SerendipityBasis, QuadrilateralOrdersOneThreeFourAreNodalAndPartitionUnity) {
     const struct Case {
         int order;
@@ -169,6 +264,68 @@ TEST(SerendipityBasis, QuadrilateralOrdersOneThreeFourAreNodalAndPartitionUnity)
     }
 }
 
+TEST(SerendipityBasis, QuadrilateralNodesFollowDocumentedConstructionThroughOrderTen) {
+    constexpr Real kTol = Real(1e-14);
+
+    for (int order = 1; order <= 10; ++order) {
+        SerendipityBasis basis(ElementType::Quad4, order);
+        const auto& nodes = basis.nodes();
+        const std::size_t expected_size = expected_quad_serendipity_size(order);
+        const std::size_t boundary_count = static_cast<std::size_t>(4 * order);
+
+        ASSERT_EQ(basis.size(), expected_size) << "order=" << order;
+        ASSERT_EQ(nodes.size(), expected_size) << "order=" << order;
+        EXPECT_EQ(quad_serendipity_exponents_for_test(order).size(),
+                  expected_size) << "order=" << order;
+        expect_no_duplicate_nodes(nodes, kTol);
+
+        for (std::size_t i = 0; i < nodes.size(); ++i) {
+            EXPECT_NEAR(nodes[i][2], Real(0), kTol) << "order=" << order
+                                                    << " node=" << i;
+            EXPECT_LE(std::abs(nodes[i][0]), Real(1)) << "order=" << order
+                                                       << " node=" << i;
+            EXPECT_LE(std::abs(nodes[i][1]), Real(1)) << "order=" << order
+                                                       << " node=" << i;
+
+            const bool on_boundary =
+                std::abs(std::abs(nodes[i][0]) - Real(1)) <= kTol ||
+                std::abs(std::abs(nodes[i][1]) - Real(1)) <= kTol;
+            if (i < boundary_count) {
+                EXPECT_TRUE(on_boundary) << "order=" << order << " node=" << i;
+            } else {
+                EXPECT_FALSE(on_boundary) << "order=" << order << " node=" << i;
+                EXPECT_LT(std::abs(nodes[i][0]), Real(1)) << "order=" << order
+                                                           << " node=" << i;
+                EXPECT_LT(std::abs(nodes[i][1]), Real(1)) << "order=" << order
+                                                           << " node=" << i;
+            }
+        }
+
+        std::size_t index = boundary_count;
+        if (order >= 4) {
+            const int m = order - 4;
+            const Real y_denominator = Real(m + 2);
+            for (int row = 0; row <= m; ++row) {
+                const int row_count = m + 1 - row;
+                const Real expected_y =
+                    Real(-1) + Real(2) * Real(row + 1) / y_denominator;
+                const Real x_denominator = Real(row_count + 1);
+                for (int col = 0; col < row_count; ++col) {
+                    ASSERT_LT(index, nodes.size());
+                    const Real expected_x =
+                        Real(-1) + Real(2) * Real(col + 1) / x_denominator;
+                    EXPECT_NEAR(nodes[index][0], expected_x, kTol)
+                        << "order=" << order << " row=" << row << " col=" << col;
+                    EXPECT_NEAR(nodes[index][1], expected_y, kTol)
+                        << "order=" << order << " row=" << row << " col=" << col;
+                    ++index;
+                }
+            }
+        }
+        EXPECT_EQ(index, nodes.size()) << "order=" << order;
+    }
+}
+
 TEST(SerendipityBasis, QuadrilateralOrderOneReproducesBilinearFunctions) {
     SerendipityBasis basis(ElementType::Quad4, 1);
 
@@ -197,6 +354,53 @@ TEST(SerendipityBasis, QuadrilateralOrderThreeReproducesSerendipityCubics) {
     }
 }
 
+TEST(SerendipityBasis, QuadrilateralOrdersReproduceEverySerendipityMonomial) {
+    const std::vector<math::Vector<Real, 3>> points = {
+        {Real(0.25), Real(-0.4), Real(0)},
+        {Real(-0.7), Real(0.6), Real(0)},
+        {Real(0.11), Real(0.23), Real(0)},
+    };
+
+    for (int order = 1; order <= 10; ++order) {
+        SerendipityBasis basis(ElementType::Quad4, order);
+        const auto exponents = quad_serendipity_exponents_for_test(order);
+        ASSERT_EQ(exponents.size(), basis.size()) << "order=" << order;
+
+        const Real tolerance = (order <= 7) ? Real(1e-10) : Real(2e-8);
+        for (const auto& exponent : exponents) {
+            for (const auto& xi : points) {
+                const Real interpolated =
+                    interpolate_nodal_function(
+                        basis,
+                        xi,
+                        [&exponent](const math::Vector<Real, 3>& node) {
+                            return monomial_value_for_test(node, exponent);
+                        });
+                const Real expected = monomial_value_for_test(xi, exponent);
+                EXPECT_NEAR(interpolated, expected, tolerance)
+                    << "order=" << order << " ax=" << exponent[0]
+                    << " ay=" << exponent[1] << " xi=(" << xi[0] << ","
+                    << xi[1] << ")";
+            }
+        }
+    }
+}
+
+TEST(SerendipityBasis, QuadrilateralVandermondeHasFullRankThroughOrderTen) {
+    for (int order = 1; order <= 10; ++order) {
+        SerendipityBasis basis(ElementType::Quad4, order);
+        const auto exponents = quad_serendipity_exponents_for_test(order);
+        const auto vandermonde =
+            quadrilateral_vandermonde_for_test(basis.nodes(), exponents);
+        const std::size_t n = basis.size();
+
+        ASSERT_EQ(exponents.size(), n) << "order=" << order;
+        ASSERT_EQ(vandermonde.size(), n * n) << "order=" << order;
+        EXPECT_EQ(math::dense_matrix_rank(vandermonde, n, n), n)
+            << "order=" << order;
+    }
+}
+
 // SerendipityBasis(Hex8, 1) is the only route to the hand-written trilinear
 // corner evaluator (values, gradients, and Hessians); it must agree with the
 // trilinear Lagrange basis on the same element.

From f5e7828494802419a45d40e6bd91c5297ccd58fd Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Thu, 18 Jun 2026 10:44:01 -0700
Subject: [PATCH 41/91] adding comments to clarify coefficient calculated
 tables and node ordering

---
 .../solver/FE/Basis/NodeOrderingConventions.cpp       |  4 ++++
 Code/Source/solver/FE/Basis/SerendipityBasis.cpp      | 11 +++++++++++
 2 files changed, 15 insertions(+)

diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
index a058133dd..79d5b7f21 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
@@ -15,6 +15,10 @@ namespace {
 
 using Point = math::Vector<Real, 3>;
 
+// Maps public Hex20 ReferenceNodeLayout slots to the internal coefficient-table
+// basis columns used by kHex20Coefficients. Wedge15 and quadrilateral
+// serendipity tables are stored directly in public node order and need no
+// equivalent permutation.
 constexpr std::array<std::size_t, 20> kHex20MeshToBasisOrder = {
     0, 1, 2, 3, 4, 5, 6, 7,
     8, 13, 10, 12,
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index ed8c073f3..a7063b915 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -211,6 +211,11 @@ constexpr std::array<std::array<int, 3>, 15> kWedge15MonomialExponents = {{
     {{2, 0, 1}}
 }};
 
+// Coefficients for the quadratic Wedge15 nodal serendipity basis. Rows are
+// monomials in kWedge15MonomialExponents order; columns are basis functions in
+// public Wedge15 node order. The table is the inverse of
+// V[node][monomial] = r^a s^b t^c evaluated at ReferenceNodeLayout Wedge15
+// nodes, so V * kWedge15Coefficients is the identity.
 constexpr std::array<std::array<Real, 15>, 15> kWedge15Coefficients = {{
     {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0}},
     {{-0.5, 0, 0, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}},
@@ -236,6 +241,12 @@ constexpr std::array<std::array<int, 3>, 20> kHex20MonomialExponents = {{
     {{1, 2, 1}}, {{2, 0, 0}}, {{2, 0, 1}}, {{2, 1, 0}}, {{2, 1, 1}}
 }};
 
+// Coefficients for the quadratic Hex20 nodal serendipity basis. Rows are
+// monomials in kHex20MonomialExponents order; columns are basis functions in
+// the internal Hex20 coefficient-table order. The table is the inverse of
+// V[node][monomial] = r^a s^b t^c evaluated at the corresponding Hex20
+// reference nodes, so V * kHex20Coefficients is the identity. Evaluation
+// remaps public output slots through ReferenceNodeLayout::mesh_to_basis_ordering.
 constexpr std::array<std::array<Real, 20>, 20> kHex20Coefficients = {{
     {{-0.25, -0.25, -0.25, -0.25, -0.25, -0.25, -0.25, -0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25}},
     {{0.125, 0.125, 0.125, 0.125, -0.125, -0.125, -0.125, -0.125, -0.25, 0.25, -0.25, 0.25, -0.25, -0.25, 0.25, 0.25, 0, 0, 0, 0}},

From a07a66cb1a3fcbf93b70e4a1d2902fce2e0e832e Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Thu, 18 Jun 2026 12:47:35 -0700
Subject: [PATCH 42/91] removed is_simplex and is_tensor_product. added
 clarifying comments to node ordering conventions

---
 .../FE/Basis/NodeOrderingConventions.cpp      |  8 --------
 .../solver/FE/Basis/NodeOrderingConventions.h |  2 --
 .../solver/FE/Basis/SerendipityBasis.cpp      | 19 +++++++++----------
 .../FE/Math/test_DenseLinearAlgebra.cpp       |  7 +++----
 4 files changed, 12 insertions(+), 24 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
index 79d5b7f21..2a922c6a2 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
@@ -412,14 +412,6 @@ std::span<const std::size_t> ReferenceNodeLayout::mesh_to_basis_ordering(Element
     return {};
 }
 
-bool ReferenceNodeLayout::is_simplex(ElementType elem_type) {
-    return svmp::FE::basis::is_simplex(elem_type);
-}
-
-bool ReferenceNodeLayout::is_tensor_product(ElementType elem_type) {
-    return svmp::FE::basis::is_tensor_product(elem_type);
-}
-
 } // namespace basis
 } // namespace FE
 } // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
index 4f9d1525a..5dde6c493 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
@@ -36,8 +36,6 @@ class ReferenceNodeLayout {
     get_lagrange_node_coords(ElementType canonical_type, int order);
 
     static std::span<const std::size_t> mesh_to_basis_ordering(ElementType elem_type);
-    static bool is_simplex(ElementType elem_type);
-    static bool is_tensor_product(ElementType elem_type);
 };
 
 } // namespace basis
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index a7063b915..3a340351b 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -7,7 +7,6 @@
 
 #include <algorithm>
 #include <array>
-#include <cmath>
 #include <span>
 #include <string>
 
@@ -71,6 +70,14 @@ int quad_serendipity_superlinear_degree(int ax, int ay) {
     return (ax > 1 ? ax : 0) + (ay > 1 ? ay : 0);
 }
 
+inline Real integer_power(Real base, int exponent) {
+    Real result = Real(1);
+    for (int k = 0; k < exponent; ++k) {
+        result *= base;
+    }
+    return result;
+}
+
 std::vector<std::array<int, 2>> quad_serendipity_exponents(int order) {
     std::vector<std::array<int, 2>> exponents;
     for (int ay = 0; ay <= order; ++ay) {
@@ -182,7 +189,7 @@ std::vector<Real> quad_serendipity_inverse_vandermonde(
         const Real y = nodes[static_cast<std::size_t>(row)][1];
         for (int col = 0; col < n; ++col) {
             const auto [ax, ay] = exponents[static_cast<std::size_t>(col)];
-            vandermonde[idx(row, col)] = std::pow(x, ax) * std::pow(y, ay);
+            vandermonde[idx(row, col)] = integer_power(x, ax) * integer_power(y, ay);
         }
     }
 
@@ -278,14 +285,6 @@ struct MonomialAxis {
     Real second;  ///< d^2/dx^2 (x^a) = a (a-1) x^(a-2)
 };
 
-inline Real integer_power(Real base, int exponent) {
-    Real result = Real(1);
-    for (int k = 0; k < exponent; ++k) {
-        result *= base;
-    }
-    return result;
-}
-
 inline MonomialAxis monomial_axis(Real x, int exponent) {
     MonomialAxis axis;
     axis.value = integer_power(x, exponent);
diff --git a/tests/unitTests/FE/Math/test_DenseLinearAlgebra.cpp b/tests/unitTests/FE/Math/test_DenseLinearAlgebra.cpp
index 9e9e08e95..f21e37dd4 100644
--- a/tests/unitTests/FE/Math/test_DenseLinearAlgebra.cpp
+++ b/tests/unitTests/FE/Math/test_DenseLinearAlgebra.cpp
@@ -109,10 +109,9 @@ TEST(DenseLinearAlgebra, FactorizationSolvesDenseRightHandSideBlock) {
 }
 
 // Every other matrix in this file already has its largest pivot on the
-// diagonal, so without these cases the row-exchange branch in
-// factor_dense_matrix and the permutation replay in solve_in_place never
-// execute. SerendipityBasis inverts its Vandermonde matrices through this
-// code in production.
+// diagonal, so these cases cover the row-exchange branch in factor_dense_matrix,
+// the inverse path used by SerendipityBasis, and the permutation replay in
+// solve_in_place.
 TEST(DenseLinearAlgebra, FactorizationPivotsThroughZeroLeadingDiagonal) {
     const std::vector<Real> swap_2x2{
         Real(0), Real(1),

From d442f70c30abe06c3aeccdd06fb558e7aa362b06 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Mon, 22 Jun 2026 11:36:39 -0700
Subject: [PATCH 43/91] removing `Real` alias for `double`

---
 Code/Source/solver/FE/Basis/BasisFactory.h    |   8 +-
 Code/Source/solver/FE/Basis/BasisFunction.cpp |  48 ++--
 Code/Source/solver/FE/Basis/BasisFunction.h   |  46 ++--
 Code/Source/solver/FE/Basis/BasisTraits.h     |  28 +--
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp | 150 +++++------
 Code/Source/solver/FE/Basis/LagrangeBasis.h   |  44 ++--
 .../FE/Basis/NodeOrderingConventions.cpp      | 162 ++++++------
 .../solver/FE/Basis/NodeOrderingConventions.h |  10 +-
 .../solver/FE/Basis/SerendipityBasis.cpp      | 156 ++++++------
 .../Source/solver/FE/Basis/SerendipityBasis.h |  30 +--
 Code/Source/solver/FE/Common/Types.h          |  12 +-
 .../solver/FE/Math/DenseLinearAlgebra.cpp     | 114 ++++-----
 .../solver/FE/Math/DenseLinearAlgebra.h       |  62 ++---
 Code/Source/solver/nn.cpp                     |   8 +-
 .../FE/Basis/test_BasisErrorPaths.cpp         |  90 +++----
 .../unitTests/FE/Basis/test_BasisHessians.cpp | 208 ++++++++--------
 .../FE/Basis/test_ConstexprBasis.cpp          |  24 +-
 .../FE/Basis/test_HigherOrderWedge.cpp        |  60 ++---
 .../unitTests/FE/Basis/test_LagrangeBasis.cpp | 232 +++++++++---------
 .../FE/Basis/test_SerendipityBasis.cpp        | 178 +++++++-------
 .../FE/Math/test_DenseLinearAlgebra.cpp       | 228 ++++++++---------
 21 files changed, 948 insertions(+), 950 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisFactory.h b/Code/Source/solver/FE/Basis/BasisFactory.h
index b14cc5501..2e1154c10 100644
--- a/Code/Source/solver/FE/Basis/BasisFactory.h
+++ b/Code/Source/solver/FE/Basis/BasisFactory.h
@@ -25,11 +25,11 @@ struct BasisRequest {
     std::optional<int> order{};
     Continuity continuity{Continuity::C0};
     FieldType field_type{FieldType::Scalar};
-    std::vector<Real> knot_vector{};
-    std::vector<Real> weights{};
+    std::vector<double> knot_vector{};
+    std::vector<double> weights{};
     std::vector<int> axis_orders{};
-    std::vector<std::vector<Real>> axis_knot_vectors{};
-    std::vector<std::vector<Real>> axis_weights{};
+    std::vector<std::vector<double>> axis_knot_vectors{};
+    std::vector<std::vector<double>> axis_weights{};
     std::vector<int> tensor_extents{};
     std::string custom_id{};
 };
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.cpp b/Code/Source/solver/FE/Basis/BasisFunction.cpp
index abcb5096b..583692ca4 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFunction.cpp
@@ -17,14 +17,14 @@ void require_span_size(std::size_t actual,
         std::string(label) + ": output span is smaller than basis size");
 }
 
-const std::vector<math::Vector<Real, 3>>& BasisFunction::nodes() const noexcept {
+const std::vector<math::Vector<double, 3>>& BasisFunction::nodes() const noexcept {
     // Default for bases that do not expose interpolation nodes; nodal families
     // (LagrangeBasis, SerendipityBasis) override this to return their layout.
-    static const std::vector<math::Vector<Real, 3>> kNoNodes;
+    static const std::vector<math::Vector<double, 3>> kNoNodes;
     return kNoNodes;
 }
 
-void BasisFunction::evaluate_gradients(const math::Vector<Real, 3>& xi,
+void BasisFunction::evaluate_gradients(const math::Vector<double, 3>& xi,
                                        std::vector<Gradient>& gradients) const {
     (void)xi;
     (void)gradients;
@@ -32,7 +32,7 @@ void BasisFunction::evaluate_gradients(const math::Vector<Real, 3>& xi,
         "Analytic gradient evaluation is not implemented for this basis");
 }
 
-void BasisFunction::evaluate_hessians(const math::Vector<Real, 3>& xi,
+void BasisFunction::evaluate_hessians(const math::Vector<double, 3>& xi,
                                       std::vector<Hessian>& hessians) const {
     (void)xi;
     (void)hessians;
@@ -40,8 +40,8 @@ void BasisFunction::evaluate_hessians(const math::Vector<Real, 3>& xi,
         "Analytic Hessian evaluation is not implemented for this basis");
 }
 
-void BasisFunction::evaluate_all(const math::Vector<Real, 3>& xi,
-                                 std::vector<Real>& values,
+void BasisFunction::evaluate_all(const math::Vector<double, 3>& xi,
+                                 std::vector<double>& values,
                                  std::vector<Gradient>& gradients,
                                  std::vector<Hessian>& hessians) const {
     evaluate_values(xi, values);
@@ -49,15 +49,15 @@ void BasisFunction::evaluate_all(const math::Vector<Real, 3>& xi,
     evaluate_hessians(xi, hessians);
 }
 
-void BasisFunction::evaluate_values_to(const math::Vector<Real, 3>& xi,
-                                       std::span<Real> values_out) const {
+void BasisFunction::evaluate_values_to(const math::Vector<double, 3>& xi,
+                                       std::span<double> values_out) const {
     require_span_size(values_out.size(), size(), "BasisFunction::evaluate_values_to");
-    std::vector<Real> tmp(size());
+    std::vector<double> tmp(size());
     evaluate_values(xi, tmp);
     std::copy_n(tmp.begin(), tmp.size(), values_out.begin());
 }
 
-void BasisFunction::evaluate_gradients_to(const math::Vector<Real, 3>& xi,
+void BasisFunction::evaluate_gradients_to(const math::Vector<double, 3>& xi,
                                           std::span<Gradient> gradients_out) const {
     require_span_size(gradients_out.size(), size(), "BasisFunction::evaluate_gradients_to");
     std::vector<Gradient> tmp(size());
@@ -65,7 +65,7 @@ void BasisFunction::evaluate_gradients_to(const math::Vector<Real, 3>& xi,
     std::copy_n(tmp.begin(), tmp.size(), gradients_out.begin());
 }
 
-void BasisFunction::evaluate_hessians_to(const math::Vector<Real, 3>& xi,
+void BasisFunction::evaluate_hessians_to(const math::Vector<double, 3>& xi,
                                          std::span<Hessian> hessians_out) const {
     require_span_size(hessians_out.size(), size(), "BasisFunction::evaluate_hessians_to");
     std::vector<Hessian> tmp(size());
@@ -73,41 +73,41 @@ void BasisFunction::evaluate_hessians_to(const math::Vector<Real, 3>& xi,
     std::copy_n(tmp.begin(), tmp.size(), hessians_out.begin());
 }
 
-void BasisFunction::numerical_gradient(const math::Vector<Real, 3>& xi,
+void BasisFunction::numerical_gradient(const math::Vector<double, 3>& xi,
                                        std::vector<Gradient>& gradients,
-                                       Real eps) const {
-    std::vector<Real> base;
+                                       double eps) const {
+    std::vector<double> base;
     evaluate_values(xi, base);
     gradients.assign(base.size(), Gradient::Zero());
 
     for (int d = 0; d < dimension(); ++d) {
-        math::Vector<Real, 3> forward = xi;
-        math::Vector<Real, 3> backward = xi;
+        math::Vector<double, 3> forward = xi;
+        math::Vector<double, 3> backward = xi;
         const auto idx = static_cast<std::size_t>(d);
         forward[idx] += eps;
         backward[idx] -= eps;
 
-        std::vector<Real> fwd;
-        std::vector<Real> bwd;
+        std::vector<double> fwd;
+        std::vector<double> bwd;
         evaluate_values(forward, fwd);
         evaluate_values(backward, bwd);
 
         for (std::size_t i = 0; i < base.size(); ++i) {
-            gradients[i][idx] = (fwd[i] - bwd[i]) / (Real(2) * eps);
+            gradients[i][idx] = (fwd[i] - bwd[i]) / (double(2) * eps);
         }
     }
 }
 
-void BasisFunction::numerical_hessian(const math::Vector<Real, 3>& xi,
+void BasisFunction::numerical_hessian(const math::Vector<double, 3>& xi,
                                       std::vector<Hessian>& hessians,
-                                      Real eps) const {
+                                      double eps) const {
     std::vector<Gradient> base_grad;
     evaluate_gradients(xi, base_grad);
     hessians.assign(base_grad.size(), Hessian::Zero());
 
     for (int d = 0; d < dimension(); ++d) {
-        math::Vector<Real, 3> forward = xi;
-        math::Vector<Real, 3> backward = xi;
+        math::Vector<double, 3> forward = xi;
+        math::Vector<double, 3> backward = xi;
         const auto col = static_cast<std::size_t>(d);
         forward[col] += eps;
         backward[col] -= eps;
@@ -121,7 +121,7 @@ void BasisFunction::numerical_hessian(const math::Vector<Real, 3>& xi,
             for (int k = 0; k < dimension(); ++k) {
                 const auto row = static_cast<std::size_t>(k);
                 hessians[i](row, col) =
-                    (g_forward[i][row] - g_backward[i][row]) / (Real(2) * eps);
+                    (g_forward[i][row] - g_backward[i][row]) / (double(2) * eps);
             }
         }
     }
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index 6fd3c68a2..be6418c50 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -128,17 +128,17 @@ namespace FE {
 namespace basis {
 
 /// \brief Gradient vector type used by basis evaluators.
-using Gradient = math::Vector<Real, 3>;
+using Gradient = math::Vector<double, 3>;
 
 /// \brief Hessian matrix type used by basis evaluators.
-using Hessian  = math::Matrix<Real, 3, 3>;
+using Hessian  = math::Matrix<double, 3, 3>;
 
-[[nodiscard]] inline Hessian make_symmetric_hessian(Real xx,
-                                                    Real yy,
-                                                    Real zz,
-                                                    Real xy,
-                                                    Real xz,
-                                                    Real yz) {
+[[nodiscard]] inline Hessian make_symmetric_hessian(double xx,
+                                                    double yy,
+                                                    double zz,
+                                                    double xy,
+                                                    double xz,
+                                                    double yz) {
     Hessian hessian = Hessian::Zero();
     hessian(0, 0) = xx;
     hessian(1, 1) = yy;
@@ -210,26 +210,26 @@ class BasisFunction {
     ///
     /// \return Reference node coordinates: size() entries for nodal families,
     ///         empty otherwise.
-    virtual const std::vector<math::Vector<Real, 3>>& nodes() const noexcept;
+    virtual const std::vector<math::Vector<double, 3>>& nodes() const noexcept;
 
     /// \brief Evaluate basis function values at a reference coordinate.
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
     /// \param values Receives one value per basis function.
-    virtual void evaluate_values(const math::Vector<Real, 3>& xi,
-                                 std::vector<Real>& values) const = 0;
+    virtual void evaluate_values(const math::Vector<double, 3>& xi,
+                                 std::vector<double>& values) const = 0;
 
     /// \brief Evaluate basis gradients at a reference coordinate.
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
     /// \param gradients Receives one three-component gradient per basis function.
     /// \throws BasisEvaluationException If gradients are not available for the basis.
-    virtual void evaluate_gradients(const math::Vector<Real, 3>& xi,
+    virtual void evaluate_gradients(const math::Vector<double, 3>& xi,
                                     std::vector<Gradient>& gradients) const;
 
     /// \brief Evaluate basis Hessians at a reference coordinate.
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
     /// \param hessians Receives one 3-by-3 Hessian per basis function.
     /// \throws BasisEvaluationException If Hessians are not available for the basis.
-    virtual void evaluate_hessians(const math::Vector<Real, 3>& xi,
+    virtual void evaluate_hessians(const math::Vector<double, 3>& xi,
                                    std::vector<Hessian>& hessians) const;
 
     /// \brief Evaluate values, gradients, and Hessians together.
@@ -237,27 +237,27 @@ class BasisFunction {
     /// \param values Receives one value per basis function.
     /// \param gradients Receives one three-component gradient per basis function.
     /// \param hessians Receives one 3-by-3 Hessian per basis function.
-    virtual void evaluate_all(const math::Vector<Real, 3>& xi,
-                              std::vector<Real>& values,
+    virtual void evaluate_all(const math::Vector<double, 3>& xi,
+                              std::vector<double>& values,
                               std::vector<Gradient>& gradients,
                               std::vector<Hessian>& hessians) const;
 
     /// \brief Evaluate basis values into caller-provided storage.
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
     /// \param values_out Output span with at least size() entries.
-    virtual void evaluate_values_to(const math::Vector<Real, 3>& xi,
-                                    std::span<Real> values_out) const;
+    virtual void evaluate_values_to(const math::Vector<double, 3>& xi,
+                                    std::span<double> values_out) const;
 
     /// \brief Evaluate basis gradients into caller-provided storage.
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
     /// \param gradients_out Output span with at least size() entries.
-    virtual void evaluate_gradients_to(const math::Vector<Real, 3>& xi,
+    virtual void evaluate_gradients_to(const math::Vector<double, 3>& xi,
                                        std::span<Gradient> gradients_out) const;
 
     /// \brief Evaluate basis Hessians into caller-provided storage.
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
     /// \param hessians_out Output span with at least size() entries.
-    virtual void evaluate_hessians_to(const math::Vector<Real, 3>& xi,
+    virtual void evaluate_hessians_to(const math::Vector<double, 3>& xi,
                                       std::span<Hessian> hessians_out) const;
 
 protected:
@@ -270,9 +270,9 @@ class BasisFunction {
     /// analytical gradients when available because finite differences introduce
     /// truncation/roundoff sensitivity and require multiple value evaluations
     /// per reference coordinate.
-    void numerical_gradient(const math::Vector<Real, 3>& xi,
+    void numerical_gradient(const math::Vector<double, 3>& xi,
                             std::vector<Gradient>& gradients,
-                            Real eps = Real(1e-6)) const;
+                            double eps = double(1e-6)) const;
 
     /// \brief Approximate Hessians by centered finite differences of gradients.
     ///
@@ -284,9 +284,9 @@ class BasisFunction {
     /// Hessians should be used in performance-sensitive solver paths because
     /// finite-difference Hessians amplify numerical error and require repeated
     /// gradient evaluations.
-    void numerical_hessian(const math::Vector<Real, 3>& xi,
+    void numerical_hessian(const math::Vector<double, 3>& xi,
                            std::vector<Hessian>& hessians,
-                           Real eps = Real(1e-5)) const;
+                           double eps = double(1e-5)) const;
 };
 
 } // namespace basis
diff --git a/Code/Source/solver/FE/Basis/BasisTraits.h b/Code/Source/solver/FE/Basis/BasisTraits.h
index 484e7c588..c9df8789c 100644
--- a/Code/Source/solver/FE/Basis/BasisTraits.h
+++ b/Code/Source/solver/FE/Basis/BasisTraits.h
@@ -26,30 +26,30 @@ enum class BasisTopology {
 
 namespace detail {
 
-[[nodiscard]] constexpr Real basis_abs(Real value) noexcept {
-    return value < Real(0) ? -value : value;
+[[nodiscard]] constexpr double basis_abs(double value) noexcept {
+    return value < double(0) ? -value : value;
 }
 
-[[nodiscard]] constexpr Real basis_max(Real lhs, Real rhs) noexcept {
+[[nodiscard]] constexpr double basis_max(double lhs, double rhs) noexcept {
     return lhs < rhs ? rhs : lhs;
 }
 
-[[nodiscard]] constexpr Real basis_scaled_tolerance(Real scale = Real(1),
-                                                    Real multiplier = Real(64)) noexcept {
-    return multiplier * std::numeric_limits<Real>::epsilon() *
-           basis_max(Real(1), basis_abs(scale));
+[[nodiscard]] constexpr double basis_scaled_tolerance(double scale = double(1),
+                                                    double multiplier = double(64)) noexcept {
+    return multiplier * std::numeric_limits<double>::epsilon() *
+           basis_max(double(1), basis_abs(scale));
 }
 
-[[nodiscard]] constexpr bool basis_near_zero(Real value,
-                                             Real scale = Real(1),
-                                             Real multiplier = Real(64)) noexcept {
+[[nodiscard]] constexpr bool basis_near_zero(double value,
+                                             double scale = double(1),
+                                             double multiplier = double(64)) noexcept {
     return basis_abs(value) <= basis_scaled_tolerance(scale, multiplier);
 }
 
-[[nodiscard]] constexpr bool basis_nearly_equal(Real a,
-                                                Real b,
-                                                Real multiplier = Real(64)) noexcept {
-    const Real scale = basis_max(Real(1), basis_max(basis_abs(a), basis_abs(b)));
+[[nodiscard]] constexpr bool basis_nearly_equal(double a,
+                                                double b,
+                                                double multiplier = double(64)) noexcept {
+    const double scale = basis_max(double(1), basis_max(basis_abs(a), basis_abs(b)));
     return basis_abs(a - b) <= basis_scaled_tolerance(scale, multiplier);
 }
 
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index 26e96d4da..0233eb18a 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -17,16 +17,16 @@ namespace basis {
 
 namespace {
 
-using Vec3 = math::Vector<Real, 3>;
+using Vec3 = math::Vector<double, 3>;
 
 struct AxisEval {
-    std::vector<Real> value;
-    std::vector<Real> first;
-    std::vector<Real> second;
+    std::vector<double> value;
+    std::vector<double> first;
+    std::vector<double> second;
 };
 
 struct SimplexEval {
-    std::vector<Real> value;
+    std::vector<double> value;
     std::vector<Gradient> gradient;
     std::vector<Hessian> hessian;
 };
@@ -80,30 +80,30 @@ NormalizedLagrangeRequest normalize_lagrange_request(ElementType element_type, i
 }
 
 // Convert a coordinate on [-1, 1] to an equispaced axis node index.
-std::size_t axis_index_pm_one(Real coord, int order) {
+std::size_t axis_index_pm_one(double coord, int order) {
     if (order <= 0) {
         return 0u;
     }
-    const Real scaled = (coord + Real(1)) * Real(order) / Real(2);
+    const double scaled = (coord + double(1)) * double(order) / double(2);
     const long long rounded = std::llround(scaled);
     svmp::throw_if<BasisConstructionException>(
         rounded < 0 || rounded > static_cast<long long>(order) ||
-            !detail::basis_nearly_equal(scaled, static_cast<Real>(rounded)),
+            !detail::basis_nearly_equal(scaled, static_cast<double>(rounded)),
         SVMP_HERE,
         "LagrangeBasis: tensor-product node coordinate is off the equispaced lattice");
     return static_cast<std::size_t>(rounded);
 }
 
 // Convert a simplex barycentric coordinate to a lattice index.
-int simplex_lattice_index(Real value, int order) {
+int simplex_lattice_index(double value, int order) {
     if (order <= 0) {
         return 0;
     }
-    const Real scaled = value * Real(order);
+    const double scaled = value * double(order);
     const long long rounded = std::llround(scaled);
     svmp::throw_if<BasisConstructionException>(
         rounded < 0 || rounded > static_cast<long long>(order) ||
-            !detail::basis_nearly_equal(scaled, static_cast<Real>(rounded)),
+            !detail::basis_nearly_equal(scaled, static_cast<double>(rounded)),
         SVMP_HERE,
         "LagrangeBasis: simplex node coordinate is off the lattice");
     return static_cast<int>(rounded);
@@ -139,14 +139,14 @@ LagrangeBasis::SimplexExponent simplex_exponent_from_point(const Vec3& p,
 constexpr std::size_t kNoSkip = std::numeric_limits<std::size_t>::max();
 
 // Evaluate 1D Lagrange polynomials and derivatives at a point.
-void evaluate_1d_lagrange(Real x, const std::vector<Real>& nodes, AxisEval& out) {
+void evaluate_1d_lagrange(double x, const std::vector<double>& nodes, AxisEval& out) {
     const std::size_t n = nodes.size();
-    out.value.assign(n, Real(0));
-    out.first.assign(n, Real(0));
-    out.second.assign(n, Real(0));
+    out.value.assign(n, double(0));
+    out.first.assign(n, double(0));
+    out.second.assign(n, double(0));
 
     if (n == 1u) {
-        out.value[0] = Real(1);
+        out.value[0] = double(1);
         return;
     }
 
@@ -155,7 +155,7 @@ void evaluate_1d_lagrange(Real x, const std::vector<Real>& nodes, AxisEval& out)
         // Each derivative order drops one additional factor from the product.
         const auto product_excluding = [&](std::size_t skip1 = kNoSkip,
                                            std::size_t skip2 = kNoSkip) {
-            Real product = Real(1);
+            double product = double(1);
             for (std::size_t j = 0; j < n; ++j) {
                 if (j != i && j != skip1 && j != skip2) {
                     product *= x - nodes[j];
@@ -164,7 +164,7 @@ void evaluate_1d_lagrange(Real x, const std::vector<Real>& nodes, AxisEval& out)
             return product;
         };
 
-        Real denom = Real(1);
+        double denom = double(1);
         for (std::size_t j = 0; j < n; ++j) {
             if (j != i) {
                 denom *= nodes[i] - nodes[j];
@@ -173,7 +173,7 @@ void evaluate_1d_lagrange(Real x, const std::vector<Real>& nodes, AxisEval& out)
 
         out.value[i] = product_excluding() / denom;
 
-        Real first = Real(0);
+        double first = double(0);
         for (std::size_t m = 0; m < n; ++m) {
             if (m != i) {
                 first += product_excluding(m);
@@ -181,7 +181,7 @@ void evaluate_1d_lagrange(Real x, const std::vector<Real>& nodes, AxisEval& out)
         }
         out.first[i] = first / denom;
 
-        Real second = Real(0);
+        double second = double(0);
         for (std::size_t m = 0; m < n; ++m) {
             if (m == i) {
                 continue;
@@ -197,20 +197,20 @@ void evaluate_1d_lagrange(Real x, const std::vector<Real>& nodes, AxisEval& out)
 }
 
 // Evaluate one barycentric polynomial factor and derivatives.
-std::array<Real, 3> simplex_factor(int alpha, Real lambda, int order) {
-    Real value = Real(1);
-    Real first = Real(0);
-    Real second = Real(0);
+std::array<double, 3> simplex_factor(int alpha, double lambda, int order) {
+    double value = double(1);
+    double first = double(0);
+    double second = double(0);
 
     for (int m = 0; m < alpha; ++m) {
-        const Real factor = Real(order) * lambda - Real(m);
-        const Real inv = Real(1) / Real(m + 1);
-        const Real old_value = value;
-        const Real old_first = first;
-        const Real old_second = second;
+        const double factor = double(order) * lambda - double(m);
+        const double inv = double(1) / double(m + 1);
+        const double old_value = value;
+        const double old_first = first;
+        const double old_second = second;
         value = old_value * factor * inv;
-        first = (old_first * factor + old_value * Real(order)) * inv;
-        second = (old_second * factor + Real(2) * old_first * Real(order)) * inv;
+        first = (old_first * factor + old_value * double(order)) * inv;
+        second = (old_second * factor + double(2) * old_first * double(order)) * inv;
     }
 
     return {value, first, second};
@@ -223,51 +223,51 @@ void evaluate_simplex(const Vec3& xi,
                       const std::vector<LagrangeBasis::SimplexExponent>& exponents,
                       SimplexEval& out) {
     const std::size_t n = exponents.size();
-    out.value.assign(n, Real(0));
+    out.value.assign(n, double(0));
     out.gradient.assign(n, Gradient::Zero());
     out.hessian.assign(n, Hessian::Zero());
 
     if (n == 1u && order == 0) {
-        out.value[0] = Real(1);
+        out.value[0] = double(1);
         return;
     }
 
     const std::size_t bary_count = top == BasisTopology::Triangle ? 3u : 4u;
-    std::array<Real, 4> lambda{Real(0), Real(0), Real(0), Real(0)};
+    std::array<double, 4> lambda{double(0), double(0), double(0), double(0)};
     std::array<Gradient, 4> lambda_grad;
     lambda_grad.fill(Gradient::Zero());
 
     lambda[1] = xi[0];
     lambda[2] = xi[1];
-    lambda_grad[1][0] = Real(1);
-    lambda_grad[2][1] = Real(1);
+    lambda_grad[1][0] = double(1);
+    lambda_grad[2][1] = double(1);
     if (top == BasisTopology::Triangle) {
-        lambda[0] = Real(1) - xi[0] - xi[1];
-        lambda_grad[0][0] = Real(-1);
-        lambda_grad[0][1] = Real(-1);
+        lambda[0] = double(1) - xi[0] - xi[1];
+        lambda_grad[0][0] = double(-1);
+        lambda_grad[0][1] = double(-1);
     } else {
         lambda[3] = xi[2];
-        lambda[0] = Real(1) - xi[0] - xi[1] - xi[2];
-        lambda_grad[0][0] = Real(-1);
-        lambda_grad[0][1] = Real(-1);
-        lambda_grad[0][2] = Real(-1);
-        lambda_grad[3][2] = Real(1);
+        lambda[0] = double(1) - xi[0] - xi[1] - xi[2];
+        lambda_grad[0][0] = double(-1);
+        lambda_grad[0][1] = double(-1);
+        lambda_grad[0][2] = double(-1);
+        lambda_grad[3][2] = double(1);
     }
 
     for (std::size_t i = 0; i < n; ++i) {
-        std::array<std::array<Real, 3>, 4> f{};
+        std::array<std::array<double, 3>, 4> f{};
         for (std::size_t a = 0; a < bary_count; ++a) {
             f[a] = simplex_factor(exponents[i][a], lambda[a], order);
         }
 
-        Real value = Real(1);
+        double value = double(1);
         for (std::size_t a = 0; a < bary_count; ++a) {
             value *= f[a][0];
         }
         out.value[i] = value;
 
         for (std::size_t a = 0; a < bary_count; ++a) {
-            Real product = f[a][1];
+            double product = f[a][1];
             for (std::size_t b = 0; b < bary_count; ++b) {
                 if (b != a) {
                     product *= f[b][0];
@@ -280,7 +280,7 @@ void evaluate_simplex(const Vec3& xi,
 
         for (std::size_t a = 0; a < bary_count; ++a) {
             for (std::size_t b = 0; b < bary_count; ++b) {
-                Real product = (a == b) ? f[a][2] : f[a][1] * f[b][1];
+                double product = (a == b) ? f[a][2] : f[a][1] * f[b][1];
                 for (std::size_t k = 0; k < bary_count; ++k) {
                     if (k != a && k != b) {
                         product *= f[k][0];
@@ -355,7 +355,7 @@ void LagrangeBasis::init_nodes() {
 
 // Build the single reference node for a point basis.
 void LagrangeBasis::build_point_nodes() {
-    nodes_.push_back(Vec3{Real(0), Real(0), Real(0)});
+    nodes_.push_back(Vec3{double(0), double(0), double(0)});
 }
 
 // Build nodes and axis indices for tensor-product elements.
@@ -411,11 +411,11 @@ void LagrangeBasis::build_wedge_nodes() {
 }
 
 // Evaluate the constant point basis.
-void LagrangeBasis::evaluate_point_to(std::span<Real> values_out,
+void LagrangeBasis::evaluate_point_to(std::span<double> values_out,
                                       std::span<Gradient> gradients_out,
                                       std::span<Hessian> hessians_out) const {
     if (!values_out.empty()) {
-        values_out[0] = Real(1);
+        values_out[0] = double(1);
     }
     if (!gradients_out.empty()) {
         gradients_out[0] = Gradient::Zero();
@@ -427,7 +427,7 @@ void LagrangeBasis::evaluate_point_to(std::span<Real> values_out,
 
 // Evaluate line, quadrilateral, and hexahedron bases as axis-polynomial products.
 void LagrangeBasis::evaluate_tensor_product_to(const Vec3& xi,
-                                               std::span<Real> values_out,
+                                               std::span<double> values_out,
                                                std::span<Gradient> gradients_out,
                                                std::span<Hessian> hessians_out) const {
     AxisEval ax;
@@ -443,15 +443,15 @@ void LagrangeBasis::evaluate_tensor_product_to(const Vec3& xi,
 
     for (std::size_t node = 0; node < tensor_indices_.size(); ++node) {
         const auto& idx = tensor_indices_[node];
-        const Real vx = ax.value[idx[0]];
-        const Real dx = ax.first[idx[0]];
-        const Real d2x = ax.second[idx[0]];
-        const Real vy = dimension_ >= 2 ? ay.value[idx[1]] : Real(1);
-        const Real dy = dimension_ >= 2 ? ay.first[idx[1]] : Real(0);
-        const Real d2y = dimension_ >= 2 ? ay.second[idx[1]] : Real(0);
-        const Real vz = dimension_ >= 3 ? az.value[idx[2]] : Real(1);
-        const Real dz = dimension_ >= 3 ? az.first[idx[2]] : Real(0);
-        const Real d2z = dimension_ >= 3 ? az.second[idx[2]] : Real(0);
+        const double vx = ax.value[idx[0]];
+        const double dx = ax.first[idx[0]];
+        const double d2x = ax.second[idx[0]];
+        const double vy = dimension_ >= 2 ? ay.value[idx[1]] : double(1);
+        const double dy = dimension_ >= 2 ? ay.first[idx[1]] : double(0);
+        const double d2y = dimension_ >= 2 ? ay.second[idx[1]] : double(0);
+        const double vz = dimension_ >= 3 ? az.value[idx[2]] : double(1);
+        const double dz = dimension_ >= 3 ? az.first[idx[2]] : double(0);
+        const double d2z = dimension_ >= 3 ? az.second[idx[2]] : double(0);
 
         if (!values_out.empty()) {
             values_out[node] = vx * vy * vz;
@@ -479,7 +479,7 @@ void LagrangeBasis::evaluate_tensor_product_to(const Vec3& xi,
 
 // Evaluate triangle and tetrahedron bases from barycentric factors.
 void LagrangeBasis::evaluate_simplex_to(const Vec3& xi,
-                                        std::span<Real> values_out,
+                                        std::span<double> values_out,
                                         std::span<Gradient> gradients_out,
                                         std::span<Hessian> hessians_out) const {
     SimplexEval simplex;
@@ -499,7 +499,7 @@ void LagrangeBasis::evaluate_simplex_to(const Vec3& xi,
 
 // Evaluate wedge bases as triangle/through-axis products.
 void LagrangeBasis::evaluate_wedge_to(const Vec3& xi,
-                                      std::span<Real> values_out,
+                                      std::span<double> values_out,
                                       std::span<Gradient> gradients_out,
                                       std::span<Hessian> hessians_out) const {
     SimplexEval tri;
@@ -509,10 +509,10 @@ void LagrangeBasis::evaluate_wedge_to(const Vec3& xi,
 
     for (std::size_t node = 0; node < wedge_indices_.size(); ++node) {
         const auto [tri_idx, z_idx] = wedge_indices_[node];
-        const Real tv = tri.value[tri_idx];
-        const Real zv = z_axis.value[z_idx];
-        const Real dz = z_axis.first[z_idx];
-        const Real d2z = z_axis.second[z_idx];
+        const double tv = tri.value[tri_idx];
+        const double zv = z_axis.value[z_idx];
+        const double dz = z_axis.first[z_idx];
+        const double d2z = z_axis.second[z_idx];
 
         if (!values_out.empty()) {
             values_out[node] = tv * zv;
@@ -542,7 +542,7 @@ void LagrangeBasis::evaluate_wedge_to(const Vec3& xi,
 
 // Evaluate requested basis quantities into caller-provided spans.
 void LagrangeBasis::evaluate_all_to(const Vec3& xi,
-                                    std::span<Real> values_out,
+                                    std::span<double> values_out,
                                     std::span<Gradient> gradients_out,
                                     std::span<Hessian> hessians_out) const {
     require_requested_span_size(values_out, size(), "LagrangeBasis::evaluate_all_to values");
@@ -578,9 +578,9 @@ void LagrangeBasis::evaluate_all_to(const Vec3& xi,
 }
 
 void LagrangeBasis::evaluate_values(const Vec3& xi,
-                                    std::vector<Real>& values) const {
+                                    std::vector<double>& values) const {
     values.resize(size());
-    evaluate_values_to(xi, std::span<Real>(values.data(), values.size()));
+    evaluate_values_to(xi, std::span<double>(values.data(), values.size()));
 }
 
 void LagrangeBasis::evaluate_gradients(const Vec3& xi,
@@ -596,20 +596,20 @@ void LagrangeBasis::evaluate_hessians(const Vec3& xi,
 }
 
 void LagrangeBasis::evaluate_all(const Vec3& xi,
-                                 std::vector<Real>& values,
+                                 std::vector<double>& values,
                                  std::vector<Gradient>& gradients,
                                  std::vector<Hessian>& hessians) const {
     values.resize(size());
     gradients.resize(size());
     hessians.resize(size());
     evaluate_all_to(xi,
-                    std::span<Real>(values.data(), values.size()),
+                    std::span<double>(values.data(), values.size()),
                     std::span<Gradient>(gradients.data(), gradients.size()),
                     std::span<Hessian>(hessians.data(), hessians.size()));
 }
 
 void LagrangeBasis::evaluate_values_to(const Vec3& xi,
-                                       std::span<Real> values_out) const {
+                                       std::span<double> values_out) const {
     require_span_size(values_out.size(), size(), "LagrangeBasis::evaluate_values_to");
     evaluate_all_to(xi, values_out, std::span<Gradient>{}, std::span<Hessian>{});
 }
@@ -617,13 +617,13 @@ void LagrangeBasis::evaluate_values_to(const Vec3& xi,
 void LagrangeBasis::evaluate_gradients_to(const Vec3& xi,
                                           std::span<Gradient> gradients_out) const {
     require_span_size(gradients_out.size(), size(), "LagrangeBasis::evaluate_gradients_to");
-    evaluate_all_to(xi, std::span<Real>{}, gradients_out, std::span<Hessian>{});
+    evaluate_all_to(xi, std::span<double>{}, gradients_out, std::span<Hessian>{});
 }
 
 void LagrangeBasis::evaluate_hessians_to(const Vec3& xi,
                                          std::span<Hessian> hessians_out) const {
     require_span_size(hessians_out.size(), size(), "LagrangeBasis::evaluate_hessians_to");
-    evaluate_all_to(xi, std::span<Real>{}, std::span<Gradient>{}, hessians_out);
+    evaluate_all_to(xi, std::span<double>{}, std::span<Gradient>{}, hessians_out);
 }
 
 } // namespace basis
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.h b/Code/Source/solver/FE/Basis/LagrangeBasis.h
index 9546e64f7..be41d9b54 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.h
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.h
@@ -111,7 +111,7 @@ class LagrangeBasis final : public BasisFunction {
     /// reference coordinates with a \f$[-1,1]\f$ through-axis coordinate.
     ///
     /// \return Reference node coordinates, one per basis function.
-    const std::vector<math::Vector<Real, 3>>& nodes() const noexcept final { return nodes_; }
+    const std::vector<math::Vector<double, 3>>& nodes() const noexcept final { return nodes_; }
 
     /// \brief Evaluate Lagrange basis function values at a reference coordinate.
     ///
@@ -123,8 +123,8 @@ class LagrangeBasis final : public BasisFunction {
     ///
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
     /// \param values Receives one value per basis function.
-    void evaluate_values(const math::Vector<Real, 3>& xi,
-                         std::vector<Real>& values) const final;
+    void evaluate_values(const math::Vector<double, 3>& xi,
+                         std::vector<double>& values) const final;
 
     /// \brief Evaluate analytical Lagrange basis gradients at a reference coordinate.
     ///
@@ -138,7 +138,7 @@ class LagrangeBasis final : public BasisFunction {
     ///
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
     /// \param gradients Receives one three-component gradient per basis function.
-    void evaluate_gradients(const math::Vector<Real, 3>& xi,
+    void evaluate_gradients(const math::Vector<double, 3>& xi,
                             std::vector<Gradient>& gradients) const final;
 
     /// \brief Evaluate analytical Lagrange basis Hessians at a reference coordinate.
@@ -153,7 +153,7 @@ class LagrangeBasis final : public BasisFunction {
     ///
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
     /// \param hessians Receives one 3-by-3 Hessian per basis function.
-    void evaluate_hessians(const math::Vector<Real, 3>& xi,
+    void evaluate_hessians(const math::Vector<double, 3>& xi,
                            std::vector<Hessian>& hessians) const final;
 
     /// \brief Evaluate Lagrange values, gradients, and Hessians together.
@@ -167,8 +167,8 @@ class LagrangeBasis final : public BasisFunction {
     /// \param values Receives one value per basis function.
     /// \param gradients Receives one three-component gradient per basis function.
     /// \param hessians Receives one 3-by-3 Hessian per basis function.
-    void evaluate_all(const math::Vector<Real, 3>& xi,
-                      std::vector<Real>& values,
+    void evaluate_all(const math::Vector<double, 3>& xi,
+                      std::vector<double>& values,
                       std::vector<Gradient>& gradients,
                       std::vector<Hessian>& hessians) const final;
 
@@ -180,8 +180,8 @@ class LagrangeBasis final : public BasisFunction {
     ///
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
     /// \param values_out Output span with at least size() entries.
-    void evaluate_values_to(const math::Vector<Real, 3>& xi,
-                            std::span<Real> values_out) const final;
+    void evaluate_values_to(const math::Vector<double, 3>& xi,
+                            std::span<double> values_out) const final;
 
     /// \brief Evaluate Lagrange basis gradients into caller-provided storage.
     ///
@@ -190,7 +190,7 @@ class LagrangeBasis final : public BasisFunction {
     ///
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
     /// \param gradients_out Output span with at least size() entries.
-    void evaluate_gradients_to(const math::Vector<Real, 3>& xi,
+    void evaluate_gradients_to(const math::Vector<double, 3>& xi,
                                std::span<Gradient> gradients_out) const final;
 
     /// \brief Evaluate Lagrange basis Hessians into caller-provided storage.
@@ -200,7 +200,7 @@ class LagrangeBasis final : public BasisFunction {
     ///
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
     /// \param hessians_out Output span with at least size() entries.
-    void evaluate_hessians_to(const math::Vector<Real, 3>& xi,
+    void evaluate_hessians_to(const math::Vector<double, 3>& xi,
                               std::span<Hessian> hessians_out) const final;
 
 private:
@@ -209,8 +209,8 @@ class LagrangeBasis final : public BasisFunction {
     int dimension_{0};
     int order_{0};
 
-    std::vector<Real> nodes_1d_;
-    std::vector<math::Vector<Real, 3>> nodes_;
+    std::vector<double> nodes_1d_;
+    std::vector<math::Vector<double, 3>> nodes_;
     std::vector<TensorNodeIndex> tensor_indices_;
     std::vector<SimplexExponent> simplex_exponents_;
     std::vector<WedgeNodeIndex> wedge_indices_;
@@ -222,23 +222,23 @@ class LagrangeBasis final : public BasisFunction {
     void build_wedge_nodes();
     void init_equispaced_1d_nodes();
 
-    void evaluate_all_to(const math::Vector<Real, 3>& xi,
-                         std::span<Real> values_out,
+    void evaluate_all_to(const math::Vector<double, 3>& xi,
+                         std::span<double> values_out,
                          std::span<Gradient> gradients_out,
                          std::span<Hessian> hessians_out) const;
-    void evaluate_point_to(std::span<Real> values_out,
+    void evaluate_point_to(std::span<double> values_out,
                            std::span<Gradient> gradients_out,
                            std::span<Hessian> hessians_out) const;
-    void evaluate_tensor_product_to(const math::Vector<Real, 3>& xi,
-                                    std::span<Real> values_out,
+    void evaluate_tensor_product_to(const math::Vector<double, 3>& xi,
+                                    std::span<double> values_out,
                                     std::span<Gradient> gradients_out,
                                     std::span<Hessian> hessians_out) const;
-    void evaluate_simplex_to(const math::Vector<Real, 3>& xi,
-                             std::span<Real> values_out,
+    void evaluate_simplex_to(const math::Vector<double, 3>& xi,
+                             std::span<double> values_out,
                              std::span<Gradient> gradients_out,
                              std::span<Hessian> hessians_out) const;
-    void evaluate_wedge_to(const math::Vector<Real, 3>& xi,
-                           std::span<Real> values_out,
+    void evaluate_wedge_to(const math::Vector<double, 3>& xi,
+                           std::span<double> values_out,
                            std::span<Gradient> gradients_out,
                            std::span<Hessian> hessians_out) const;
 };
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
index 2a922c6a2..b09892507 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
@@ -13,7 +13,7 @@ namespace basis {
 
 namespace {
 
-using Point = math::Vector<Real, 3>;
+using Point = math::Vector<double, 3>;
 
 // Maps public Hex20 ReferenceNodeLayout slots to the internal coefficient-table
 // basis columns used by kHex20Coefficients. Wedge15 and quadrilateral
@@ -26,11 +26,11 @@ constexpr std::array<std::size_t, 20> kHex20MeshToBasisOrder = {
     16, 17, 19, 18
 };
 
-Real line_coord_zero_one(int i, int order) {
+double line_coord_zero_one(int i, int order) {
     if (order <= 0) {
-        return Real(0);
+        return double(0);
     }
-    return static_cast<Real>(i) / static_cast<Real>(order);
+    return static_cast<double>(i) / static_cast<double>(order);
 }
 
 void append_triangle_face_interior(std::vector<Point>& nodes,
@@ -41,87 +41,87 @@ void append_triangle_face_interior(std::vector<Point>& nodes,
     for (int c = 1; c <= order - 2; ++c) {
         for (int b = 1; b <= order - c - 1; ++b) {
             const int a = order - b - c;
-            const Real inv = Real(1) / Real(order);
-            nodes.push_back(v0 * (Real(a) * inv) +
-                            v1 * (Real(b) * inv) +
-                            v2 * (Real(c) * inv));
+            const double inv = double(1) / double(order);
+            nodes.push_back(v0 * (double(a) * inv) +
+                            v1 * (double(b) * inv) +
+                            v2 * (double(c) * inv));
         }
     }
 }
 
 std::vector<Point> generate_line_nodes(int order) {
     if (order == 0) {
-        return {Point{Real(0), Real(0), Real(0)}};
+        return {Point{double(0), double(0), double(0)}};
     }
 
     std::vector<Point> nodes;
     nodes.reserve(static_cast<std::size_t>(order + 1));
-    nodes.push_back(Point{Real(-1), Real(0), Real(0)});
-    nodes.push_back(Point{Real(1), Real(0), Real(0)});
+    nodes.push_back(Point{double(-1), double(0), double(0)});
+    nodes.push_back(Point{double(1), double(0), double(0)});
     for (int i = 1; i < order; ++i) {
-        nodes.push_back(Point{line_coord_pm_one(i, order), Real(0), Real(0)});
+        nodes.push_back(Point{line_coord_pm_one(i, order), double(0), double(0)});
     }
     return nodes;
 }
 
 std::vector<Point> generate_triangle_nodes(int order) {
     if (order == 0) {
-        return {Point{Real(1) / Real(3), Real(1) / Real(3), Real(0)}};
+        return {Point{double(1) / double(3), double(1) / double(3), double(0)}};
     }
 
     std::vector<Point> nodes;
     nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 2) / 2));
-    nodes.push_back(Point{Real(0), Real(0), Real(0)});
-    nodes.push_back(Point{Real(1), Real(0), Real(0)});
-    nodes.push_back(Point{Real(0), Real(1), Real(0)});
+    nodes.push_back(Point{double(0), double(0), double(0)});
+    nodes.push_back(Point{double(1), double(0), double(0)});
+    nodes.push_back(Point{double(0), double(1), double(0)});
 
     for (int m = 1; m < order; ++m) {
-        nodes.push_back(Point{line_coord_zero_one(m, order), Real(0), Real(0)});
+        nodes.push_back(Point{line_coord_zero_one(m, order), double(0), double(0)});
     }
     for (int m = 1; m < order; ++m) {
         nodes.push_back(Point{line_coord_zero_one(order - m, order),
-                              line_coord_zero_one(m, order), Real(0)});
+                              line_coord_zero_one(m, order), double(0)});
     }
     for (int m = 1; m < order; ++m) {
-        nodes.push_back(Point{Real(0), line_coord_zero_one(order - m, order), Real(0)});
+        nodes.push_back(Point{double(0), line_coord_zero_one(order - m, order), double(0)});
     }
 
     append_triangle_face_interior(nodes,
-                                  Point{Real(0), Real(0), Real(0)},
-                                  Point{Real(1), Real(0), Real(0)},
-                                  Point{Real(0), Real(1), Real(0)},
+                                  Point{double(0), double(0), double(0)},
+                                  Point{double(1), double(0), double(0)},
+                                  Point{double(0), double(1), double(0)},
                                   order);
     return nodes;
 }
 
 std::vector<Point> generate_quad_nodes(int order) {
     if (order == 0) {
-        return {Point{Real(0), Real(0), Real(0)}};
+        return {Point{double(0), double(0), double(0)}};
     }
 
     std::vector<Point> nodes;
     nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 1)));
-    nodes.push_back(Point{Real(-1), Real(-1), Real(0)});
-    nodes.push_back(Point{Real(1), Real(-1), Real(0)});
-    nodes.push_back(Point{Real(1), Real(1), Real(0)});
-    nodes.push_back(Point{Real(-1), Real(1), Real(0)});
+    nodes.push_back(Point{double(-1), double(-1), double(0)});
+    nodes.push_back(Point{double(1), double(-1), double(0)});
+    nodes.push_back(Point{double(1), double(1), double(0)});
+    nodes.push_back(Point{double(-1), double(1), double(0)});
 
     for (int i = 1; i < order; ++i) {
-        nodes.push_back(Point{line_coord_pm_one(i, order), Real(-1), Real(0)});
+        nodes.push_back(Point{line_coord_pm_one(i, order), double(-1), double(0)});
     }
     for (int j = 1; j < order; ++j) {
-        nodes.push_back(Point{Real(1), line_coord_pm_one(j, order), Real(0)});
+        nodes.push_back(Point{double(1), line_coord_pm_one(j, order), double(0)});
     }
     for (int i = order - 1; i >= 1; --i) {
-        nodes.push_back(Point{line_coord_pm_one(i, order), Real(1), Real(0)});
+        nodes.push_back(Point{line_coord_pm_one(i, order), double(1), double(0)});
     }
     for (int j = order - 1; j >= 1; --j) {
-        nodes.push_back(Point{Real(-1), line_coord_pm_one(j, order), Real(0)});
+        nodes.push_back(Point{double(-1), line_coord_pm_one(j, order), double(0)});
     }
     for (int j = 1; j < order; ++j) {
         for (int i = 1; i < order; ++i) {
             nodes.push_back(Point{line_coord_pm_one(i, order),
-                                  line_coord_pm_one(j, order), Real(0)});
+                                  line_coord_pm_one(j, order), double(0)});
         }
     }
     return nodes;
@@ -129,14 +129,14 @@ std::vector<Point> generate_quad_nodes(int order) {
 
 std::vector<Point> generate_tetra_nodes(int order) {
     if (order == 0) {
-        return {Point{Real(0.25), Real(0.25), Real(0.25)}};
+        return {Point{double(0.25), double(0.25), double(0.25)}};
     }
 
     const Point verts[] = {
-        Point{Real(0), Real(0), Real(0)},
-        Point{Real(1), Real(0), Real(0)},
-        Point{Real(0), Real(1), Real(0)},
-        Point{Real(0), Real(0), Real(1)},
+        Point{double(0), double(0), double(0)},
+        Point{double(1), double(0), double(0)},
+        Point{double(0), double(1), double(0)},
+        Point{double(0), double(0), double(1)},
     };
 
     std::vector<Point> nodes;
@@ -148,8 +148,8 @@ std::vector<Point> generate_tetra_nodes(int order) {
     const int edges[6][2] = {{0, 1}, {1, 2}, {2, 0}, {0, 3}, {1, 3}, {2, 3}};
     for (const auto& edge : edges) {
         for (int m = 1; m < order; ++m) {
-            const Real t = static_cast<Real>(m) / static_cast<Real>(order);
-            nodes.push_back(verts[edge[0]] * (Real(1) - t) + verts[edge[1]] * t);
+            const double t = static_cast<double>(m) / static_cast<double>(order);
+            nodes.push_back(verts[edge[0]] * (double(1) - t) + verts[edge[1]] * t);
         }
     }
 
@@ -165,9 +165,9 @@ std::vector<Point> generate_tetra_nodes(int order) {
     for (int l = 1; l <= order - 3; ++l) {
         for (int k = 1; k <= order - l - 2; ++k) {
             for (int j = 1; j <= order - l - k - 1; ++j) {
-                nodes.push_back(Point{Real(j) / Real(order),
-                                      Real(k) / Real(order),
-                                      Real(l) / Real(order)});
+                nodes.push_back(Point{double(j) / double(order),
+                                      double(k) / double(order),
+                                      double(l) / double(order)});
             }
         }
     }
@@ -176,18 +176,18 @@ std::vector<Point> generate_tetra_nodes(int order) {
 
 std::vector<Point> generate_hex_nodes(int order) {
     if (order == 0) {
-        return {Point{Real(0), Real(0), Real(0)}};
+        return {Point{double(0), double(0), double(0)}};
     }
 
     const Point verts[] = {
-        Point{Real(-1), Real(-1), Real(-1)},
-        Point{Real(1), Real(-1), Real(-1)},
-        Point{Real(1), Real(1), Real(-1)},
-        Point{Real(-1), Real(1), Real(-1)},
-        Point{Real(-1), Real(-1), Real(1)},
-        Point{Real(1), Real(-1), Real(1)},
-        Point{Real(1), Real(1), Real(1)},
-        Point{Real(-1), Real(1), Real(1)},
+        Point{double(-1), double(-1), double(-1)},
+        Point{double(1), double(-1), double(-1)},
+        Point{double(1), double(1), double(-1)},
+        Point{double(-1), double(1), double(-1)},
+        Point{double(-1), double(-1), double(1)},
+        Point{double(1), double(-1), double(1)},
+        Point{double(1), double(1), double(1)},
+        Point{double(-1), double(1), double(1)},
     };
 
     std::vector<Point> nodes;
@@ -203,8 +203,8 @@ std::vector<Point> generate_hex_nodes(int order) {
     };
     for (const auto& edge : edges) {
         for (int m = 1; m < order; ++m) {
-            const Real t = static_cast<Real>(m) / static_cast<Real>(order);
-            nodes.push_back(verts[edge[0]] * (Real(1) - t) + verts[edge[1]] * t);
+            const double t = static_cast<double>(m) / static_cast<double>(order);
+            nodes.push_back(verts[edge[0]] * (double(1) - t) + verts[edge[1]] * t);
         }
     }
 
@@ -217,37 +217,37 @@ std::vector<Point> generate_hex_nodes(int order) {
     // -X face (x = -1)
     for (int k = 1; k < order; ++k) {
         for (int j = order - 1; j >= 1; --j) {
-            nodes.push_back(Point{Real(-1), line_coord_pm_one(j, order), line_coord_pm_one(k, order)});
+            nodes.push_back(Point{double(-1), line_coord_pm_one(j, order), line_coord_pm_one(k, order)});
         }
     }
     // +X face (x = +1)
     for (int k = 1; k < order; ++k) {
         for (int j = 1; j < order; ++j) {
-            nodes.push_back(Point{Real(1), line_coord_pm_one(j, order), line_coord_pm_one(k, order)});
+            nodes.push_back(Point{double(1), line_coord_pm_one(j, order), line_coord_pm_one(k, order)});
         }
     }
     // -Y face (y = -1)
     for (int k = 1; k < order; ++k) {
         for (int i = 1; i < order; ++i) {
-            nodes.push_back(Point{line_coord_pm_one(i, order), Real(-1), line_coord_pm_one(k, order)});
+            nodes.push_back(Point{line_coord_pm_one(i, order), double(-1), line_coord_pm_one(k, order)});
         }
     }
     // +Y face (y = +1)
     for (int k = 1; k < order; ++k) {
         for (int i = order - 1; i >= 1; --i) {
-            nodes.push_back(Point{line_coord_pm_one(i, order), Real(1), line_coord_pm_one(k, order)});
+            nodes.push_back(Point{line_coord_pm_one(i, order), double(1), line_coord_pm_one(k, order)});
         }
     }
     // -Z face (z = -1)
     for (int j = 1; j < order; ++j) {
         for (int i = 1; i < order; ++i) {
-            nodes.push_back(Point{line_coord_pm_one(i, order), line_coord_pm_one(j, order), Real(-1)});
+            nodes.push_back(Point{line_coord_pm_one(i, order), line_coord_pm_one(j, order), double(-1)});
         }
     }
     // +Z face (z = +1)
     for (int j = 1; j < order; ++j) {
         for (int i = 1; i < order; ++i) {
-            nodes.push_back(Point{line_coord_pm_one(i, order), line_coord_pm_one(j, order), Real(1)});
+            nodes.push_back(Point{line_coord_pm_one(i, order), line_coord_pm_one(j, order), double(1)});
         }
     }
     for (int k = 1; k < order; ++k) {
@@ -264,16 +264,16 @@ std::vector<Point> generate_hex_nodes(int order) {
 
 std::vector<Point> generate_wedge_nodes(int order) {
     if (order == 0) {
-        return {Point{Real(1) / Real(3), Real(1) / Real(3), Real(0)}};
+        return {Point{double(1) / double(3), double(1) / double(3), double(0)}};
     }
 
     const Point verts[] = {
-        Point{Real(0), Real(0), Real(-1)},
-        Point{Real(1), Real(0), Real(-1)},
-        Point{Real(0), Real(1), Real(-1)},
-        Point{Real(0), Real(0), Real(1)},
-        Point{Real(1), Real(0), Real(1)},
-        Point{Real(0), Real(1), Real(1)},
+        Point{double(0), double(0), double(-1)},
+        Point{double(1), double(0), double(-1)},
+        Point{double(0), double(1), double(-1)},
+        Point{double(0), double(0), double(1)},
+        Point{double(1), double(0), double(1)},
+        Point{double(0), double(1), double(1)},
     };
 
     std::vector<Point> nodes;
@@ -289,8 +289,8 @@ std::vector<Point> generate_wedge_nodes(int order) {
     };
     for (const auto& edge : edges) {
         for (int m = 1; m < order; ++m) {
-            const Real t = static_cast<Real>(m) / static_cast<Real>(order);
-            nodes.push_back(verts[edge[0]] * (Real(1) - t) + verts[edge[1]] * t);
+            const double t = static_cast<double>(m) / static_cast<double>(order);
+            nodes.push_back(verts[edge[0]] * (double(1) - t) + verts[edge[1]] * t);
         }
     }
 
@@ -298,27 +298,27 @@ std::vector<Point> generate_wedge_nodes(int order) {
     append_triangle_face_interior(nodes, verts[3], verts[4], verts[5], order);
 
     for (int r = 1; r < order; ++r) {
-        const Real z = line_coord_pm_one(r, order);
+        const double z = line_coord_pm_one(r, order);
         for (int m = 1; m < order; ++m) {
-            const Real t = static_cast<Real>(m) / static_cast<Real>(order);
-            nodes.push_back(Point{t, Real(0), z});
+            const double t = static_cast<double>(m) / static_cast<double>(order);
+            nodes.push_back(Point{t, double(0), z});
         }
         for (int m = 1; m < order; ++m) {
-            const Real t = static_cast<Real>(m) / static_cast<Real>(order);
-            nodes.push_back(Point{Real(1) - t, t, z});
+            const double t = static_cast<double>(m) / static_cast<double>(order);
+            nodes.push_back(Point{double(1) - t, t, z});
         }
         for (int m = 1; m < order; ++m) {
-            const Real t = static_cast<Real>(m) / static_cast<Real>(order);
-            nodes.push_back(Point{Real(0), Real(1) - t, z});
+            const double t = static_cast<double>(m) / static_cast<double>(order);
+            nodes.push_back(Point{double(0), double(1) - t, z});
         }
     }
 
     for (int r = 1; r < order; ++r) {
-        const Real z = line_coord_pm_one(r, order);
+        const double z = line_coord_pm_one(r, order);
         for (int c = 1; c <= order - 2; ++c) {
             for (int b = 1; b <= order - c - 1; ++b) {
-                nodes.push_back(Point{Real(b) / Real(order),
-                                      Real(c) / Real(order),
+                nodes.push_back(Point{double(b) / double(order),
+                                      double(c) / double(order),
                                       z});
             }
         }
@@ -332,7 +332,7 @@ std::vector<Point> complete_lagrange_nodes(ElementType canonical_type, int order
     const ElementType type = canonical_lagrange_type(canonical_type);
     switch (type) {
         case ElementType::Point1:
-            return {Point{Real(0), Real(0), Real(0)}};
+            return {Point{double(0), double(0), double(0)}};
         case ElementType::Line2:
             return generate_line_nodes(order);
         case ElementType::Triangle3:
@@ -387,7 +387,7 @@ std::vector<Point> element_nodes(ElementType elem_type) {
 
 } // namespace
 
-math::Vector<Real, 3> ReferenceNodeLayout::get_node_coords(ElementType elem_type,
+math::Vector<double, 3> ReferenceNodeLayout::get_node_coords(ElementType elem_type,
                                                            std::size_t local_node) {
     const auto nodes = element_nodes(elem_type);
     svmp::throw_if<BasisNodeOrderingException>(local_node >= nodes.size(), SVMP_HERE,
@@ -399,7 +399,7 @@ std::size_t ReferenceNodeLayout::num_nodes(ElementType elem_type) {
     return element_nodes(elem_type).size();
 }
 
-std::vector<math::Vector<Real, 3>>
+std::vector<math::Vector<double, 3>>
 ReferenceNodeLayout::get_lagrange_node_coords(ElementType canonical_type, int order) {
     return complete_lagrange_nodes(canonical_type, order);
 }
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
index 5dde6c493..043668c47 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
@@ -19,20 +19,20 @@ namespace basis {
 ///
 /// Shared by the reference-node layout generators and the Lagrange tensor-axis
 /// node initialization so the lattice formula lives in a single place.
-[[nodiscard]] inline constexpr Real line_coord_pm_one(int i, int order) noexcept {
+[[nodiscard]] inline constexpr double line_coord_pm_one(int i, int order) noexcept {
     if (order <= 0) {
-        return Real(0);
+        return double(0);
     }
-    return Real(-1) + Real(2) * static_cast<Real>(i) / static_cast<Real>(order);
+    return double(-1) + double(2) * static_cast<double>(i) / static_cast<double>(order);
 }
 
 class ReferenceNodeLayout {
 public:
-    static math::Vector<Real, 3> get_node_coords(ElementType elem_type,
+    static math::Vector<double, 3> get_node_coords(ElementType elem_type,
                                                  std::size_t local_node);
     static std::size_t num_nodes(ElementType elem_type);
 
-    static std::vector<math::Vector<Real, 3>>
+    static std::vector<math::Vector<double, 3>>
     get_lagrange_node_coords(ElementType canonical_type, int order);
 
     static std::span<const std::size_t> mesh_to_basis_ordering(ElementType elem_type);
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index 3a340351b..141b0df77 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -15,12 +15,12 @@ namespace FE {
 namespace basis {
 
 namespace {
-using Vec3 = math::Vector<Real, 3>;
+using Vec3 = math::Vector<double, 3>;
 
-void evaluate_hex8_reference(Real r,
-                             Real s,
-                             Real t,
-                             std::span<Real> values,
+void evaluate_hex8_reference(double r,
+                             double s,
+                             double t,
+                             std::span<double> values,
                              std::span<Gradient> gradients,
                              std::span<Hessian> hessians) {
     static constexpr int signs[8][3] = {
@@ -35,33 +35,33 @@ void evaluate_hex8_reference(Real r,
     };
 
     for (std::size_t i = 0; i < 8u; ++i) {
-        const Real a = Real(signs[i][0]);
-        const Real b = Real(signs[i][1]);
-        const Real c = Real(signs[i][2]);
-        const Real ar = Real(1) + a * r;
-        const Real bs = Real(1) + b * s;
-        const Real ct = Real(1) + c * t;
+        const double a = double(signs[i][0]);
+        const double b = double(signs[i][1]);
+        const double c = double(signs[i][2]);
+        const double ar = double(1) + a * r;
+        const double bs = double(1) + b * s;
+        const double ct = double(1) + c * t;
 
         if (!values.empty()) {
-            values[i] = Real(0.125) * ar * bs * ct;
+            values[i] = double(0.125) * ar * bs * ct;
         }
         if (!gradients.empty()) {
             Gradient& g = gradients[i];
-            g[0] = Real(0.125) * a * bs * ct;
-            g[1] = Real(0.125) * b * ar * ct;
-            g[2] = Real(0.125) * c * ar * bs;
+            g[0] = double(0.125) * a * bs * ct;
+            g[1] = double(0.125) * b * ar * ct;
+            g[2] = double(0.125) * c * ar * bs;
         }
         if (!hessians.empty()) {
             Hessian& h = hessians[i];
-            h(0, 0) = Real(0);
-            h(0, 1) = Real(0.125) * a * b * ct;
-            h(0, 2) = Real(0.125) * a * c * bs;
+            h(0, 0) = double(0);
+            h(0, 1) = double(0.125) * a * b * ct;
+            h(0, 2) = double(0.125) * a * c * bs;
             h(1, 0) = h(0, 1);
-            h(1, 1) = Real(0);
-            h(1, 2) = Real(0.125) * b * c * ar;
+            h(1, 1) = double(0);
+            h(1, 2) = double(0.125) * b * c * ar;
             h(2, 0) = h(0, 2);
             h(2, 1) = h(1, 2);
-            h(2, 2) = Real(0);
+            h(2, 2) = double(0);
         }
     }
 }
@@ -70,8 +70,8 @@ int quad_serendipity_superlinear_degree(int ax, int ay) {
     return (ax > 1 ? ax : 0) + (ay > 1 ? ay : 0);
 }
 
-inline Real integer_power(Real base, int exponent) {
-    Real result = Real(1);
+inline double integer_power(double base, int exponent) {
+    double result = double(1);
     for (int k = 0; k < exponent; ++k) {
         result *= base;
     }
@@ -112,14 +112,14 @@ void append_quad_serendipity_interior_nodes(std::vector<Vec3>& nodes, int order)
     }
 
     const int m = order - 4;
-    const Real y_denominator = Real(m + 2);
+    const double y_denominator = double(m + 2);
     for (int row = 0; row <= m; ++row) {
         const int row_count = m + 1 - row;
-        const Real y = Real(-1) + Real(2) * Real(row + 1) / y_denominator;
-        const Real x_denominator = Real(row_count + 1);
+        const double y = double(-1) + double(2) * double(row + 1) / y_denominator;
+        const double x_denominator = double(row_count + 1);
         for (int col = 0; col < row_count; ++col) {
-            const Real x = Real(-1) + Real(2) * Real(col + 1) / x_denominator;
-            nodes.push_back(Vec3{x, y, Real(0)});
+            const double x = double(-1) + double(2) * double(col + 1) / x_denominator;
+            nodes.push_back(Vec3{x, y, double(0)});
         }
     }
 }
@@ -130,24 +130,24 @@ std::vector<Vec3> quad_serendipity_nodes(int order, std::size_t total_size) {
         return nodes;
     }
 
-    const Real inv_order = Real(1) / Real(order);
+    const double inv_order = double(1) / double(order);
 
-    nodes.push_back(Vec3{Real(-1), Real(-1), Real(0)});
-    nodes.push_back(Vec3{Real(1),  Real(-1), Real(0)});
-    nodes.push_back(Vec3{Real(1),  Real(1),  Real(0)});
-    nodes.push_back(Vec3{Real(-1), Real(1),  Real(0)});
+    nodes.push_back(Vec3{double(-1), double(-1), double(0)});
+    nodes.push_back(Vec3{double(1),  double(-1), double(0)});
+    nodes.push_back(Vec3{double(1),  double(1),  double(0)});
+    nodes.push_back(Vec3{double(-1), double(1),  double(0)});
 
     for (int i = 1; i < order; ++i) {
-        nodes.push_back(Vec3{Real(-1) + Real(2 * i) * inv_order, Real(-1), Real(0)});
+        nodes.push_back(Vec3{double(-1) + double(2 * i) * inv_order, double(-1), double(0)});
     }
     for (int i = 1; i < order; ++i) {
-        nodes.push_back(Vec3{Real(1), Real(-1) + Real(2 * i) * inv_order, Real(0)});
+        nodes.push_back(Vec3{double(1), double(-1) + double(2 * i) * inv_order, double(0)});
     }
     for (int i = 1; i < order; ++i) {
-        nodes.push_back(Vec3{Real(1) - Real(2 * i) * inv_order, Real(1), Real(0)});
+        nodes.push_back(Vec3{double(1) - double(2 * i) * inv_order, double(1), double(0)});
     }
     for (int i = 1; i < order; ++i) {
-        nodes.push_back(Vec3{Real(-1), Real(1) - Real(2 * i) * inv_order, Real(0)});
+        nodes.push_back(Vec3{double(-1), double(1) - double(2 * i) * inv_order, double(0)});
     }
 
     svmp::throw_if<BasisConstructionException>(
@@ -163,14 +163,14 @@ std::vector<Vec3> quad_serendipity_nodes(int order, std::size_t total_size) {
     return nodes;
 }
 
-std::vector<Real> invert_dense_matrix(std::vector<Real> matrix, int n, const char* label) {
+std::vector<double> invert_dense_matrix(std::vector<double> matrix, int n, const char* label) {
     return math::invert_dense_matrix(
         std::move(matrix),
         static_cast<std::size_t>(n),
         std::string("SerendipityBasis interpolation matrix for ") + label);
 }
 
-std::vector<Real> quad_serendipity_inverse_vandermonde(
+std::vector<double> quad_serendipity_inverse_vandermonde(
     std::span<const Vec3> nodes,
     std::span<const std::array<int, 2>> exponents,
     int order) {
@@ -179,14 +179,14 @@ std::vector<Real> quad_serendipity_inverse_vandermonde(
         n == 0 || exponents.size() != nodes.size(), SVMP_HERE,
         "SerendipityBasis: invalid quadrilateral serendipity interpolation setup");
 
-    std::vector<Real> vandermonde(static_cast<std::size_t>(n * n), Real(0));
+    std::vector<double> vandermonde(static_cast<std::size_t>(n * n), double(0));
     auto idx = [n](int row, int col) -> std::size_t {
         return static_cast<std::size_t>(row * n + col);
     };
 
     for (int row = 0; row < n; ++row) {
-        const Real x = nodes[static_cast<std::size_t>(row)][0];
-        const Real y = nodes[static_cast<std::size_t>(row)][1];
+        const double x = nodes[static_cast<std::size_t>(row)][0];
+        const double y = nodes[static_cast<std::size_t>(row)][1];
         for (int col = 0; col < n; ++col) {
             const auto [ax, ay] = exponents[static_cast<std::size_t>(col)];
             vandermonde[idx(row, col)] = integer_power(x, ax) * integer_power(y, ay);
@@ -223,7 +223,7 @@ constexpr std::array<std::array<int, 3>, 15> kWedge15MonomialExponents = {{
 // public Wedge15 node order. The table is the inverse of
 // V[node][monomial] = r^a s^b t^c evaluated at ReferenceNodeLayout Wedge15
 // nodes, so V * kWedge15Coefficients is the identity.
-constexpr std::array<std::array<Real, 15>, 15> kWedge15Coefficients = {{
+constexpr std::array<std::array<double, 15>, 15> kWedge15Coefficients = {{
     {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0}},
     {{-0.5, 0, 0, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}},
     {{0.5, -0, -0, 0.5, -0, -0, -0, -0, -0, -0, -0, -0, -1, -0, -0}},
@@ -254,7 +254,7 @@ constexpr std::array<std::array<int, 3>, 20> kHex20MonomialExponents = {{
 // V[node][monomial] = r^a s^b t^c evaluated at the corresponding Hex20
 // reference nodes, so V * kHex20Coefficients is the identity. Evaluation
 // remaps public output slots through ReferenceNodeLayout::mesh_to_basis_ordering.
-constexpr std::array<std::array<Real, 20>, 20> kHex20Coefficients = {{
+constexpr std::array<std::array<double, 20>, 20> kHex20Coefficients = {{
     {{-0.25, -0.25, -0.25, -0.25, -0.25, -0.25, -0.25, -0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25}},
     {{0.125, 0.125, 0.125, 0.125, -0.125, -0.125, -0.125, -0.125, -0.25, 0.25, -0.25, 0.25, -0.25, -0.25, 0.25, 0.25, 0, 0, 0, 0}},
     {{0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0, 0, 0, 0, 0, 0, 0, 0, -0.25, -0.25, -0.25, -0.25}},
@@ -280,18 +280,18 @@ constexpr std::array<std::array<Real, 20>, 20> kHex20Coefficients = {{
 // Value and first/second derivatives of the 1D monomial x^a. The derivative of
 // a constant or linear term collapses to zero, so negative powers never arise.
 struct MonomialAxis {
-    Real value;   ///< x^a
-    Real first;   ///< d/dx (x^a)     = a x^(a-1)
-    Real second;  ///< d^2/dx^2 (x^a) = a (a-1) x^(a-2)
+    double value;   ///< x^a
+    double first;   ///< d/dx (x^a)     = a x^(a-1)
+    double second;  ///< d^2/dx^2 (x^a) = a (a-1) x^(a-2)
 };
 
-inline MonomialAxis monomial_axis(Real x, int exponent) {
+inline MonomialAxis monomial_axis(double x, int exponent) {
     MonomialAxis axis;
     axis.value = integer_power(x, exponent);
-    axis.first = (exponent > 0) ? Real(exponent) * integer_power(x, exponent - 1) : Real(0);
+    axis.first = (exponent > 0) ? double(exponent) * integer_power(x, exponent - 1) : double(0);
     axis.second = (exponent > 1)
-                      ? Real(exponent * (exponent - 1)) * integer_power(x, exponent - 2)
-                      : Real(0);
+                      ? double(exponent * (exponent - 1)) * integer_power(x, exponent - 2)
+                      : double(0);
     return axis;
 }
 
@@ -308,12 +308,12 @@ inline MonomialAxis monomial_axis(Real x, int exponent) {
 // because its table is authored in an internal node order, while Wedge15 and the
 // quadrilateral serendipity tables are authored directly in public order.
 template <typename ExponentFn, typename CoeffFn>
-void eval_monomial_basis(Real r, Real s, Real t,
+void eval_monomial_basis(double r, double s, double t,
                          std::size_t count,
                          ExponentFn&& exponent,
                          CoeffFn&& coeff,
                          std::span<const std::size_t> table_to_output_order,
-                         std::span<Real> values,
+                         std::span<double> values,
                          std::span<Gradient> gradients,
                          std::span<Hessian> hessians) {
     const bool want_values = !values.empty();
@@ -326,17 +326,17 @@ void eval_monomial_basis(Real r, Real s, Real t,
         const MonomialAxis ay = monomial_axis(s, e[1]);
         const MonomialAxis az = monomial_axis(t, e[2]);
 
-        const Real phi = ax.value * ay.value * az.value;
+        const double phi = ax.value * ay.value * az.value;
 
-        Real d_dr = Real(0), d_ds = Real(0), d_dt = Real(0);
+        double d_dr = double(0), d_ds = double(0), d_dt = double(0);
         if (want_gradients || want_hessians) {
             d_dr = ax.first * ay.value * az.value;
             d_ds = ax.value * ay.first * az.value;
             d_dt = ax.value * ay.value * az.first;
         }
 
-        Real d_drr = Real(0), d_dss = Real(0), d_dtt = Real(0);
-        Real d_drs = Real(0), d_drt = Real(0), d_dst = Real(0);
+        double d_drr = double(0), d_dss = double(0), d_dtt = double(0);
+        double d_drs = double(0), d_drt = double(0), d_dst = double(0);
         if (want_hessians) {
             d_drr = ax.second * ay.value * az.value;
             d_dss = ax.value * ay.second * az.value;
@@ -349,7 +349,7 @@ void eval_monomial_basis(Real r, Real s, Real t,
         for (std::size_t slot = 0; slot < count; ++slot) {
             const std::size_t basis_index =
                 table_to_output_order.empty() ? slot : table_to_output_order[slot];
-            const Real c = coeff(j, basis_index);
+            const double c = coeff(j, basis_index);
             if (want_values) {
                 values[slot] += c * phi;
             }
@@ -429,8 +429,8 @@ SerendipityBasis::SerendipityBasis(ElementType type, int order)
     }
 }
 
-void SerendipityBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,
-                                       std::span<Real> values_out,
+void SerendipityBasis::evaluate_all_to(const math::Vector<double, 3>& xi,
+                                       std::span<double> values_out,
                                        std::span<Gradient> gradients_out,
                                        std::span<Hessian> hessians_out) const {
     require_requested_span_size(values_out, size_, "SerendipityBasis::evaluate_all_to values");
@@ -442,7 +442,7 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,
     }
 
     if (!values_out.empty()) {
-        std::fill(values_out.begin(), values_out.end(), Real(0));
+        std::fill(values_out.begin(), values_out.end(), double(0));
     }
     if (!gradients_out.empty()) {
         std::fill(gradients_out.begin(), gradients_out.end(), Gradient::Zero());
@@ -451,9 +451,9 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,
         std::fill(hessians_out.begin(), hessians_out.end(), Hessian::Zero());
     }
 
-    const Real x = xi[0];
-    const Real y = xi[1];
-    const Real z = xi[2];
+    const double x = xi[0];
+    const double y = xi[1];
+    const double z = xi[2];
 
     if (dimension_ == 2) {
         svmp::throw_if<BasisEvaluationException>(
@@ -515,53 +515,53 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,
         "SerendipityBasis::evaluate_all_to: unsupported serendipity configuration");
 }
 
-void SerendipityBasis::evaluate_values(const math::Vector<Real, 3>& xi,
-                                       std::vector<Real>& values) const {
+void SerendipityBasis::evaluate_values(const math::Vector<double, 3>& xi,
+                                       std::vector<double>& values) const {
     values.resize(size_);
-    evaluate_values_to(xi, std::span<Real>(values.data(), values.size()));
+    evaluate_values_to(xi, std::span<double>(values.data(), values.size()));
 }
 
-void SerendipityBasis::evaluate_gradients(const math::Vector<Real, 3>& xi,
+void SerendipityBasis::evaluate_gradients(const math::Vector<double, 3>& xi,
                                           std::vector<Gradient>& gradients) const {
     gradients.resize(size_);
     evaluate_gradients_to(xi, std::span<Gradient>(gradients.data(), gradients.size()));
 }
 
-void SerendipityBasis::evaluate_hessians(const math::Vector<Real, 3>& xi,
+void SerendipityBasis::evaluate_hessians(const math::Vector<double, 3>& xi,
                                          std::vector<Hessian>& hessians) const {
     hessians.resize(size_);
     evaluate_hessians_to(xi, std::span<Hessian>(hessians.data(), hessians.size()));
 }
 
-void SerendipityBasis::evaluate_all(const math::Vector<Real, 3>& xi,
-                                    std::vector<Real>& values,
+void SerendipityBasis::evaluate_all(const math::Vector<double, 3>& xi,
+                                    std::vector<double>& values,
                                     std::vector<Gradient>& gradients,
                                     std::vector<Hessian>& hessians) const {
     values.resize(size_);
     gradients.resize(size_);
     hessians.resize(size_);
     evaluate_all_to(xi,
-                    std::span<Real>(values.data(), values.size()),
+                    std::span<double>(values.data(), values.size()),
                     std::span<Gradient>(gradients.data(), gradients.size()),
                     std::span<Hessian>(hessians.data(), hessians.size()));
 }
 
-void SerendipityBasis::evaluate_values_to(const math::Vector<Real, 3>& xi,
-                                          std::span<Real> values_out) const {
+void SerendipityBasis::evaluate_values_to(const math::Vector<double, 3>& xi,
+                                          std::span<double> values_out) const {
     require_span_size(values_out.size(), size_, "SerendipityBasis::evaluate_values_to");
     evaluate_all_to(xi, values_out, std::span<Gradient>{}, std::span<Hessian>{});
 }
 
-void SerendipityBasis::evaluate_gradients_to(const math::Vector<Real, 3>& xi,
+void SerendipityBasis::evaluate_gradients_to(const math::Vector<double, 3>& xi,
                                              std::span<Gradient> gradients_out) const {
     require_span_size(gradients_out.size(), size_, "SerendipityBasis::evaluate_gradients_to");
-    evaluate_all_to(xi, std::span<Real>{}, gradients_out, std::span<Hessian>{});
+    evaluate_all_to(xi, std::span<double>{}, gradients_out, std::span<Hessian>{});
 }
 
-void SerendipityBasis::evaluate_hessians_to(const math::Vector<Real, 3>& xi,
+void SerendipityBasis::evaluate_hessians_to(const math::Vector<double, 3>& xi,
                                             std::span<Hessian> hessians_out) const {
     require_span_size(hessians_out.size(), size_, "SerendipityBasis::evaluate_hessians_to");
-    evaluate_all_to(xi, std::span<Real>{}, std::span<Gradient>{}, hessians_out);
+    evaluate_all_to(xi, std::span<double>{}, std::span<Gradient>{}, hessians_out);
 }
 
 } // namespace basis
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.h b/Code/Source/solver/FE/Basis/SerendipityBasis.h
index 930875fe7..5a1471436 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.h
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.h
@@ -135,7 +135,7 @@ class SerendipityBasis final : public BasisFunction {
     /// production layouts.
     ///
     /// \return Reference node coordinates, one per basis function.
-    const std::vector<math::Vector<Real, 3>>& nodes() const noexcept final { return nodes_; }
+    const std::vector<math::Vector<double, 3>>& nodes() const noexcept final { return nodes_; }
 
     /// \brief Evaluate serendipity basis function values at a reference coordinate.
     ///
@@ -147,8 +147,8 @@ class SerendipityBasis final : public BasisFunction {
     ///
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
     /// \param values Receives one value per basis function.
-    void evaluate_values(const math::Vector<Real, 3>& xi,
-                         std::vector<Real>& values) const final;
+    void evaluate_values(const math::Vector<double, 3>& xi,
+                         std::vector<double>& values) const final;
 
     /// \brief Evaluate analytical serendipity basis gradients at a reference coordinate.
     ///
@@ -161,7 +161,7 @@ class SerendipityBasis final : public BasisFunction {
     ///
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
     /// \param gradients Receives one three-component gradient per basis function.
-    void evaluate_gradients(const math::Vector<Real, 3>& xi,
+    void evaluate_gradients(const math::Vector<double, 3>& xi,
                             std::vector<Gradient>& gradients) const final;
 
     /// \brief Evaluate analytical serendipity basis Hessians at a reference coordinate.
@@ -175,7 +175,7 @@ class SerendipityBasis final : public BasisFunction {
     ///
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
     /// \param hessians Receives one 3-by-3 Hessian per basis function.
-    void evaluate_hessians(const math::Vector<Real, 3>& xi,
+    void evaluate_hessians(const math::Vector<double, 3>& xi,
                            std::vector<Hessian>& hessians) const final;
 
     /// \brief Evaluate serendipity values, gradients, and Hessians together.
@@ -188,27 +188,27 @@ class SerendipityBasis final : public BasisFunction {
     /// \param values Receives one value per basis function.
     /// \param gradients Receives one three-component gradient per basis function.
     /// \param hessians Receives one 3-by-3 Hessian per basis function.
-    void evaluate_all(const math::Vector<Real, 3>& xi,
-                      std::vector<Real>& values,
+    void evaluate_all(const math::Vector<double, 3>& xi,
+                      std::vector<double>& values,
                       std::vector<Gradient>& gradients,
                       std::vector<Hessian>& hessians) const final;
 
     /// \brief Evaluate serendipity basis values into caller-provided storage.
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
     /// \param values_out Output span with at least size() entries.
-    void evaluate_values_to(const math::Vector<Real, 3>& xi,
-                            std::span<Real> values_out) const final;
+    void evaluate_values_to(const math::Vector<double, 3>& xi,
+                            std::span<double> values_out) const final;
 
     /// \brief Evaluate serendipity basis gradients into caller-provided storage.
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
     /// \param gradients_out Output span with at least size() entries.
-    void evaluate_gradients_to(const math::Vector<Real, 3>& xi,
+    void evaluate_gradients_to(const math::Vector<double, 3>& xi,
                                std::span<Gradient> gradients_out) const final;
 
     /// \brief Evaluate serendipity basis Hessians into caller-provided storage.
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
     /// \param hessians_out Output span with at least size() entries.
-    void evaluate_hessians_to(const math::Vector<Real, 3>& xi,
+    void evaluate_hessians_to(const math::Vector<double, 3>& xi,
                               std::span<Hessian> hessians_out) const final;
 
 private:
@@ -216,13 +216,13 @@ class SerendipityBasis final : public BasisFunction {
     int dimension_;
     int order_;
     std::size_t size_;
-    std::vector<math::Vector<Real, 3>> nodes_;
+    std::vector<math::Vector<double, 3>> nodes_;
     std::vector<std::array<int, 2>> quad_monomial_exponents_;
     // Row-major inverse Vandermonde, indexed as [monomial, basis].
-    std::vector<Real> quad_inv_vandermonde_;
+    std::vector<double> quad_inv_vandermonde_;
 
-    void evaluate_all_to(const math::Vector<Real, 3>& xi,
-                         std::span<Real> values_out,
+    void evaluate_all_to(const math::Vector<double, 3>& xi,
+                         std::span<double> values_out,
                          std::span<Gradient> gradients_out,
                          std::span<Hessian> hessians_out) const;
 };
diff --git a/Code/Source/solver/FE/Common/Types.h b/Code/Source/solver/FE/Common/Types.h
index 462b7ca76..38afe4086 100644
--- a/Code/Source/solver/FE/Common/Types.h
+++ b/Code/Source/solver/FE/Common/Types.h
@@ -139,12 +139,10 @@ using BlockId = std::uint16_t;
 using MeshIndex = svmp::index_t;        ///< Local mesh entity index, shared with the Mesh library.
 using MeshOffset = svmp::offset_t;      ///< Offset type for mesh connectivity arrays.
 using MeshGlobalId = svmp::gid_t;       ///< Global mesh entity identifier.
-using Real = svmp::real_t;              ///< Floating-point scalar type; same precision as the Mesh library.
 #else
 using MeshIndex = std::int32_t;         ///< Local mesh entity index, shared with the Mesh library.
 using MeshOffset = std::int64_t;        ///< Offset type for mesh connectivity arrays.
 using MeshGlobalId = std::int64_t;      ///< Global mesh entity identifier.
-using Real = double;                    ///< Floating-point scalar type; same precision as the Mesh library.
 #endif
 
 // ============================================================================
@@ -202,7 +200,7 @@ constexpr int MAX_FIELD_VALUE_COMPONENTS = 9;
 struct FieldValueEntry {
     FieldId field{INVALID_FIELD_ID};                  ///< Field this value belongs to.
     int n_components{0};                              ///< Number of valid entries in components.
-    Real components[MAX_FIELD_VALUE_COMPONENTS]{};    ///< Component values, row-major for tensors.
+    double components[MAX_FIELD_VALUE_COMPONENTS]{};    ///< Component values, row-major for tensors.
 };
 
 // ============================================================================
@@ -337,12 +335,12 @@ enum class FEStatus : std::uint8_t {
  * @tparam Dim Reference-space dimension
  */
 template<int Dim>
-using ReferencePoint = std::array<Real, static_cast<std::size_t>(Dim)>;
+using ReferencePoint = std::array<double, static_cast<std::size_t>(Dim)>;
 
 /**
  * @brief Point in physical coordinates
  */
-using PhysicalPoint = std::array<Real, 3>;
+using PhysicalPoint = std::array<double, 3>;
 
 /**
  * @brief Jacobian matrix type
@@ -350,7 +348,7 @@ using PhysicalPoint = std::array<Real, 3>;
  * @tparam ReferenceDim Reference-space dimension (columns)
  */
 template<int SpatialDim, int ReferenceDim = SpatialDim>
-using Jacobian = std::array<std::array<Real, static_cast<std::size_t>(ReferenceDim)>, static_cast<std::size_t>(SpatialDim)>;
+using Jacobian = std::array<std::array<double, static_cast<std::size_t>(ReferenceDim)>, static_cast<std::size_t>(SpatialDim)>;
 
 // ============================================================================
 // Strong Type Wrappers (C++17 idiom for type safety)
@@ -423,7 +421,7 @@ struct BasisGradientTag {};     ///< Tag type for basis-function gradients.
 /// Type-safe index of a quadrature point within a rule.
 using QuadraturePointIndex = StrongType<LocalIndex, QuadraturePointTag>;
 /// Type-safe quadrature weight value.
-using QuadratureWeight = StrongType<Real, QuadratureWeightTag>;
+using QuadratureWeight = StrongType<double, QuadratureWeightTag>;
 
 // ============================================================================
 // Type Traits
diff --git a/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp b/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
index db3f8561d..df06700f9 100644
--- a/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
+++ b/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
@@ -19,12 +19,12 @@ namespace math {
 
 namespace {
 
-using DenseMatrix = Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic>;
+using DenseMatrix = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>;
 using RowMajorMatrix =
-    Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+    Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
 using ConstRowMajorMap = Eigen::Map<const RowMajorMatrix>;
 
-ConstRowMajorMap map_row_major(std::span<const Real> matrix,
+ConstRowMajorMap map_row_major(std::span<const double> matrix,
                                std::size_t rows,
                                std::size_t cols) {
     return ConstRowMajorMap(matrix.data(),
@@ -32,7 +32,7 @@ ConstRowMajorMap map_row_major(std::span<const Real> matrix,
                             static_cast<Eigen::Index>(cols));
 }
 
-void copy_to_row_major(const DenseMatrix& source, std::vector<Real>& dest) {
+void copy_to_row_major(const DenseMatrix& source, std::vector<double>& dest) {
     const auto rows = static_cast<std::size_t>(source.rows());
     const auto cols = static_cast<std::size_t>(source.cols());
     dest.resize(rows * cols);
@@ -50,47 +50,47 @@ DenseLUSolver::~DenseLUSolver() = default;
 DenseLUSolver::DenseLUSolver(DenseLUSolver&&) noexcept = default;
 DenseLUSolver& DenseLUSolver::operator=(DenseLUSolver&&) noexcept = default;
 
-Real dense_matrix_max_abs(std::span<const Real> matrix) noexcept {
-    Real max_abs = Real(0);
-    for (const Real value : matrix) {
+double dense_matrix_max_abs(std::span<const double> matrix) noexcept {
+    double max_abs = double(0);
+    for (const double value : matrix) {
         max_abs = std::max(max_abs, std::abs(value));
     }
     return max_abs;
 }
 
-Real dense_matrix_pivot_tolerance(std::size_t rows,
+double dense_matrix_pivot_tolerance(std::size_t rows,
                                   std::size_t cols,
-                                  Real max_abs,
-                                  Real multiplier) noexcept {
-    const Real size_scale = static_cast<Real>(std::max<std::size_t>(rows, cols));
-    const Real value_scale = std::max(Real(1), max_abs);
-    return multiplier * std::numeric_limits<Real>::epsilon() *
-           std::max(Real(1), size_scale) * value_scale;
+                                  double max_abs,
+                                  double multiplier) noexcept {
+    const double size_scale = static_cast<double>(std::max<std::size_t>(rows, cols));
+    const double value_scale = std::max(double(1), max_abs);
+    return multiplier * std::numeric_limits<double>::epsilon() *
+           std::max(double(1), size_scale) * value_scale;
 }
 
-Real dense_matrix_singular_value_tolerance(std::size_t rows,
+double dense_matrix_singular_value_tolerance(std::size_t rows,
                                            std::size_t cols,
-                                           Real largest_singular_value,
-                                           Real multiplier) noexcept {
-    const Real size_scale = static_cast<Real>(std::max<std::size_t>(rows, cols));
-    return multiplier * std::numeric_limits<Real>::epsilon() *
-           std::max(Real(1), size_scale) *
-           std::max(Real(1), largest_singular_value);
+                                           double largest_singular_value,
+                                           double multiplier) noexcept {
+    const double size_scale = static_cast<double>(std::max<std::size_t>(rows, cols));
+    return multiplier * std::numeric_limits<double>::epsilon() *
+           std::max(double(1), size_scale) *
+           std::max(double(1), largest_singular_value);
 }
 
-Real dense_matrix_condition_fallback_threshold() noexcept {
-    return Real(1.0e12);
+double dense_matrix_condition_fallback_threshold() noexcept {
+    return double(1.0e12);
 }
 
-Real dense_matrix_condition_error_threshold() noexcept {
-    return Real(1.0e14);
+double dense_matrix_condition_error_threshold() noexcept {
+    return double(1.0e14);
 }
 
-void DenseLUSolver::solve_in_place(std::span<Real> rhs) const {
+void DenseLUSolver::solve_in_place(std::span<double> rhs) const {
     solve_in_place(rhs, 1u);
 }
 
-void DenseLUSolver::solve_in_place(std::span<Real> rhs,
+void DenseLUSolver::solve_in_place(std::span<double> rhs,
                                    std::size_t rhs_count) const {
     ::svmp::check_arg<FEException>(
         rhs_count > 0, SVMP_HERE,
@@ -113,14 +113,14 @@ void DenseLUSolver::solve_in_place(std::span<Real> rhs,
     rhs_map = solution;
 }
 
-std::vector<Real> DenseLUSolver::solve(std::span<const Real> rhs) const {
-    std::vector<Real> x(rhs.begin(), rhs.end());
-    solve_in_place(std::span<Real>(x.data(), x.size()));
+std::vector<double> DenseLUSolver::solve(std::span<const double> rhs) const {
+    std::vector<double> x(rhs.begin(), rhs.end());
+    solve_in_place(std::span<double>(x.data(), x.size()));
     return x;
 }
 
 DenseMatrixDiagnostics dense_matrix_diagnostics(
-    std::span<const Real> matrix,
+    std::span<const double> matrix,
     std::size_t rows,
     std::size_t cols,
     std::string_view label) {
@@ -137,13 +137,13 @@ DenseMatrixDiagnostics dense_matrix_diagnostics(
     DenseMatrixDiagnostics diagnostics;
     const auto& singular_values = svd.singularValues();
     diagnostics.largest_singular_value =
-        (singular_values.size() > 0) ? singular_values[0] : Real(0);
+        (singular_values.size() > 0) ? singular_values[0] : double(0);
     diagnostics.tolerance =
         dense_matrix_singular_value_tolerance(rows, cols,
                                               diagnostics.largest_singular_value);
 
     for (Eigen::Index i = 0; i < singular_values.size(); ++i) {
-        const Real sigma = singular_values[i];
+        const double sigma = singular_values[i];
         if (sigma <= diagnostics.tolerance) {
             continue;
         }
@@ -153,7 +153,7 @@ DenseMatrixDiagnostics dense_matrix_diagnostics(
 
     const std::size_t full_rank = std::min(rows, cols);
     if (diagnostics.rank == full_rank &&
-        diagnostics.smallest_retained_singular_value > Real(0)) {
+        diagnostics.smallest_retained_singular_value > double(0)) {
         diagnostics.condition_estimate =
             diagnostics.largest_singular_value /
             diagnostics.smallest_retained_singular_value;
@@ -161,7 +161,7 @@ DenseMatrixDiagnostics dense_matrix_diagnostics(
     return diagnostics;
 }
 
-DenseLUSolver factor_dense_matrix(std::vector<Real> matrix,
+DenseLUSolver factor_dense_matrix(std::vector<double> matrix,
                                   std::size_t n,
                                   std::string_view label) {
     ::svmp::check_arg<FEException>(
@@ -171,8 +171,8 @@ DenseLUSolver factor_dense_matrix(std::vector<Real> matrix,
     DenseLUSolver solver;
     solver.n = n;
     solver.label = std::string(label);
-    const Real max_abs =
-        dense_matrix_max_abs(std::span<const Real>(matrix.data(), matrix.size()));
+    const double max_abs =
+        dense_matrix_max_abs(std::span<const double>(matrix.data(), matrix.size()));
     solver.pivot_tolerance = dense_matrix_pivot_tolerance(n, n, max_abs);
 
     solver.impl = std::make_unique<DenseLUSolver::Impl>();
@@ -180,11 +180,11 @@ DenseLUSolver factor_dense_matrix(std::vector<Real> matrix,
 
     // Partial pivoting leaves the pivots on the diagonal of the packed LU
     // factor; a pivot below the scale-aware tolerance marks rank deficiency.
-    Real max_pivot_abs = Real(0);
-    Real min_pivot_abs = std::numeric_limits<Real>::infinity();
+    double max_pivot_abs = double(0);
+    double min_pivot_abs = std::numeric_limits<double>::infinity();
     const auto diagonal = solver.impl->lu.matrixLU().diagonal();
     for (Eigen::Index col = 0; col < diagonal.size(); ++col) {
-        const Real pivot_magnitude = std::abs(diagonal[col]);
+        const double pivot_magnitude = std::abs(diagonal[col]);
         ::svmp::check_arg<FEException>(
             pivot_magnitude > solver.pivot_tolerance, SVMP_HERE,
             solver.label + ": rank-deficient matrix (rank " +
@@ -201,24 +201,24 @@ DenseLUSolver factor_dense_matrix(std::vector<Real> matrix,
     solver.diagnostics.rank = n;
     solver.diagnostics.tolerance = solver.pivot_tolerance;
     solver.max_pivot = max_pivot_abs;
-    solver.min_pivot = std::isfinite(min_pivot_abs) ? min_pivot_abs : Real(0);
+    solver.min_pivot = std::isfinite(min_pivot_abs) ? min_pivot_abs : double(0);
     return solver;
 }
 
 DenseInverseResult invert_dense_matrix_with_diagnostics(
-    std::vector<Real> matrix,
+    std::vector<double> matrix,
     std::size_t n,
     std::string_view label) {
     ::svmp::check_arg<FEException>(
         matrix.size() == n * n, SVMP_HERE,
         std::string(label) + ": dense inverse size mismatch");
-    std::vector<Real> matrix_for_lu = matrix;
+    std::vector<double> matrix_for_lu = matrix;
     const DenseLUSolver solver =
         factor_dense_matrix(std::move(matrix_for_lu), n, label);
 
     DenseInverseResult result;
     result.diagnostics =
-        dense_matrix_diagnostics(std::span<const Real>(matrix.data(), matrix.size()),
+        dense_matrix_diagnostics(std::span<const double>(matrix.data(), matrix.size()),
                                  n, n, label);
 
     if (std::isfinite(result.diagnostics.condition_estimate) &&
@@ -238,7 +238,7 @@ DenseInverseResult invert_dense_matrix_with_diagnostics(
             ::svmp::check_arg<FEException>(
                 singular_values[i] > result.diagnostics.tolerance, SVMP_HERE,
                 std::string(label) + ": high-condition SVD fallback encountered a dropped singular value");
-            sigma_inverse(i, i) = Real(1) / singular_values[i];
+            sigma_inverse(i, i) = double(1) / singular_values[i];
         }
         const DenseMatrix inverse = svd.matrixV() * sigma_inverse * svd.matrixU().transpose();
         copy_to_row_major(inverse, result.inverse);
@@ -255,7 +255,7 @@ void validate_dense_inverse_diagnostics(
     const DenseInverseResult& result,
     std::size_t expected_rank,
     std::string_view label,
-    Real max_condition) {
+    double max_condition) {
     ::svmp::check_arg<FEException>(
         result.diagnostics.rank == expected_rank, SVMP_HERE,
         std::string(label) + ": rank-deficient matrix (rank " +
@@ -273,17 +273,17 @@ void validate_dense_inverse_diagnostics(
             " exceeds supported threshold " + std::to_string(max_condition));
 }
 
-std::vector<Real> invert_dense_matrix(std::vector<Real> matrix,
+std::vector<double> invert_dense_matrix(std::vector<double> matrix,
                                       std::size_t n,
                                       std::string_view label) {
     const DenseLUSolver solver = factor_dense_matrix(std::move(matrix), n, label);
     const DenseMatrix inverse = solver.impl->lu.inverse();
-    std::vector<Real> result;
+    std::vector<double> result;
     copy_to_row_major(inverse, result);
     return result;
 }
 
-std::size_t dense_matrix_rank(std::vector<Real> matrix,
+std::size_t dense_matrix_rank(std::vector<double> matrix,
                               std::size_t rows,
                               std::size_t cols) {
     ::svmp::check_arg<FEException>(
@@ -291,13 +291,13 @@ std::size_t dense_matrix_rank(std::vector<Real> matrix,
         "dense_matrix_rank: size mismatch");
 
     const DenseMatrix dense =
-        map_row_major(std::span<const Real>(matrix.data(), matrix.size()), rows, cols);
+        map_row_major(std::span<const double>(matrix.data(), matrix.size()), rows, cols);
     Eigen::JacobiSVD<DenseMatrix> svd(dense);
 
     const auto& singular_values = svd.singularValues();
-    const Real largest =
-        (singular_values.size() > 0) ? singular_values[0] : Real(0);
-    const Real tolerance =
+    const double largest =
+        (singular_values.size() > 0) ? singular_values[0] : double(0);
+    const double tolerance =
         dense_matrix_singular_value_tolerance(rows, cols, largest);
 
     std::size_t rank = 0;
@@ -310,7 +310,7 @@ std::size_t dense_matrix_rank(std::vector<Real> matrix,
 }
 
 DensePseudoInverseResult rank_revealing_pseudo_inverse(
-    std::span<const Real> matrix,
+    std::span<const double> matrix,
     std::size_t rows,
     std::size_t cols,
     std::string_view label) {
@@ -328,18 +328,18 @@ DensePseudoInverseResult rank_revealing_pseudo_inverse(
 
     const auto& singular_values = svd.singularValues();
     result.largest_singular_value =
-        (singular_values.size() > 0) ? singular_values[0] : Real(0);
+        (singular_values.size() > 0) ? singular_values[0] : double(0);
     result.tolerance =
         dense_matrix_singular_value_tolerance(rows, cols, result.largest_singular_value);
 
     DenseMatrix sigma_inverse = DenseMatrix::Zero(static_cast<Eigen::Index>(cols),
                                                   static_cast<Eigen::Index>(rows));
     for (Eigen::Index i = 0; i < singular_values.size(); ++i) {
-        const Real sigma = singular_values[i];
+        const double sigma = singular_values[i];
         if (sigma <= result.tolerance) {
             continue;
         }
-        sigma_inverse(i, i) = Real(1) / sigma;
+        sigma_inverse(i, i) = double(1) / sigma;
         ++result.rank;
         result.smallest_retained_singular_value = sigma;
     }
diff --git a/Code/Source/solver/FE/Math/DenseLinearAlgebra.h b/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
index c94351bb3..440df817c 100644
--- a/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
+++ b/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
@@ -20,42 +20,42 @@ namespace math {
 
 // Dense solve, inverse, rank, and pseudo-inverse support for FE construction
 // utilities. Matrices are row-major: matrix[row * cols + col].
-[[nodiscard]] Real dense_matrix_max_abs(std::span<const Real> matrix) noexcept;
+[[nodiscard]] double dense_matrix_max_abs(std::span<const double> matrix) noexcept;
 
-[[nodiscard]] Real dense_matrix_pivot_tolerance(std::size_t rows,
+[[nodiscard]] double dense_matrix_pivot_tolerance(std::size_t rows,
                                                 std::size_t cols,
-                                                Real max_abs,
-                                                Real multiplier = Real(64)) noexcept;
+                                                double max_abs,
+                                                double multiplier = double(64)) noexcept;
 
-[[nodiscard]] Real dense_matrix_singular_value_tolerance(std::size_t rows,
+[[nodiscard]] double dense_matrix_singular_value_tolerance(std::size_t rows,
                                                          std::size_t cols,
-                                                         Real largest_singular_value,
-                                                         Real multiplier = Real(64)) noexcept;
+                                                         double largest_singular_value,
+                                                         double multiplier = double(64)) noexcept;
 
 struct DensePseudoInverseResult {
-    std::vector<Real> inverse;
+    std::vector<double> inverse;
     std::size_t rank{0};
-    Real tolerance{0};
-    Real largest_singular_value{0};
-    Real smallest_retained_singular_value{0};
+    double tolerance{0};
+    double largest_singular_value{0};
+    double smallest_retained_singular_value{0};
 };
 
 struct DenseMatrixDiagnostics {
     std::size_t rank{0};
-    Real tolerance{0};
-    Real largest_singular_value{0};
-    Real smallest_retained_singular_value{0};
-    Real condition_estimate{std::numeric_limits<Real>::infinity()};
+    double tolerance{0};
+    double largest_singular_value{0};
+    double smallest_retained_singular_value{0};
+    double condition_estimate{std::numeric_limits<double>::infinity()};
 };
 
 struct DenseInverseResult {
-    std::vector<Real> inverse;
+    std::vector<double> inverse;
     DenseMatrixDiagnostics diagnostics;
     bool used_svd_fallback{false};
 };
 
-[[nodiscard]] Real dense_matrix_condition_fallback_threshold() noexcept;
-[[nodiscard]] Real dense_matrix_condition_error_threshold() noexcept;
+[[nodiscard]] double dense_matrix_condition_fallback_threshold() noexcept;
+[[nodiscard]] double dense_matrix_condition_error_threshold() noexcept;
 
 struct DenseLUSolver {
     struct Impl;
@@ -69,37 +69,37 @@ struct DenseLUSolver {
 
     std::size_t n{0};
     DenseMatrixDiagnostics diagnostics;
-    Real pivot_tolerance{0};
-    Real min_pivot{0};
-    Real max_pivot{0};
+    double pivot_tolerance{0};
+    double min_pivot{0};
+    double max_pivot{0};
     std::string label;
     std::unique_ptr<Impl> impl;
 
     [[nodiscard]] bool empty() const noexcept { return n == 0; }
 
-    void solve_in_place(std::span<Real> rhs) const;
-    void solve_in_place(std::span<Real> rhs, std::size_t rhs_count) const;
-    [[nodiscard]] std::vector<Real> solve(std::span<const Real> rhs) const;
+    void solve_in_place(std::span<double> rhs) const;
+    void solve_in_place(std::span<double> rhs, std::size_t rhs_count) const;
+    [[nodiscard]] std::vector<double> solve(std::span<const double> rhs) const;
 };
 
 // Inverses and pseudo-inverses keep the same row-major convention for their
 // returned dimensions.
 [[nodiscard]] DenseMatrixDiagnostics dense_matrix_diagnostics(
-    std::span<const Real> matrix,
+    std::span<const double> matrix,
     std::size_t rows,
     std::size_t cols,
     std::string_view label = "dense matrix");
 
-[[nodiscard]] DenseLUSolver factor_dense_matrix(std::vector<Real> matrix,
+[[nodiscard]] DenseLUSolver factor_dense_matrix(std::vector<double> matrix,
                                                 std::size_t n,
                                                 std::string_view label = "dense matrix");
 
-[[nodiscard]] std::vector<Real> invert_dense_matrix(std::vector<Real> matrix,
+[[nodiscard]] std::vector<double> invert_dense_matrix(std::vector<double> matrix,
                                                     std::size_t n,
                                                     std::string_view label = "dense matrix");
 
 [[nodiscard]] DenseInverseResult invert_dense_matrix_with_diagnostics(
-    std::vector<Real> matrix,
+    std::vector<double> matrix,
     std::size_t n,
     std::string_view label = "dense matrix");
 
@@ -107,14 +107,14 @@ void validate_dense_inverse_diagnostics(
     const DenseInverseResult& result,
     std::size_t expected_rank,
     std::string_view label = "dense matrix",
-    Real max_condition = dense_matrix_condition_error_threshold());
+    double max_condition = dense_matrix_condition_error_threshold());
 
-[[nodiscard]] std::size_t dense_matrix_rank(std::vector<Real> matrix,
+[[nodiscard]] std::size_t dense_matrix_rank(std::vector<double> matrix,
                                             std::size_t rows,
                                             std::size_t cols);
 
 [[nodiscard]] DensePseudoInverseResult rank_revealing_pseudo_inverse(
-    std::span<const Real> matrix,
+    std::span<const double> matrix,
     std::size_t rows,
     std::size_t cols,
     std::string_view label = "dense matrix");
diff --git a/Code/Source/solver/nn.cpp b/Code/Source/solver/nn.cpp
index 4fe2399cb..9a145964e 100644
--- a/Code/Source/solver/nn.cpp
+++ b/Code/Source/solver/nn.cpp
@@ -213,7 +213,7 @@ std::size_t basis_index_for_solver_node(consts::ElementType eType, const int sol
 /// xi array, zero-filling the trailing components that are inactive for
 /// lower-dimensional elements. Throws BasisConfigurationException when xi has
 /// fewer rows than the basis reference dimension.
-fe::math::Vector<fe::Real, 3> make_basis_point(const febasis::BasisFunction& basis,
+fe::math::Vector<double, 3> make_basis_point(const febasis::BasisFunction& basis,
                                                const int g,
                                                const Array<double>& xi)
 {
@@ -226,7 +226,7 @@ fe::math::Vector<fe::Real, 3> make_basis_point(const febasis::BasisFunction& bas
 
   // Inactive trailing components must be zero for lower-dimensional elements;
   // Eigen-backed vectors are not zero-initialized by default construction.
-  fe::math::Vector<fe::Real, 3> point = fe::math::Vector<fe::Real, 3>::Zero();
+  fe::math::Vector<double, 3> point = fe::math::Vector<double, 3>::Zero();
   for (int d = 0; d < basis.dimension(); ++d) {
     point[static_cast<std::size_t>(d)] = xi(d, g);
   }
@@ -240,7 +240,7 @@ fe::math::Vector<fe::Real, 3> make_basis_point(const febasis::BasisFunction& bas
 void copy_basis_values_to_solver_arrays(consts::ElementType eType,
                                         const int eNoN,
                                         const int g,
-                                        const std::vector<fe::Real>& values,
+                                        const std::vector<double>& values,
                                         const std::vector<febasis::Gradient>& gradients,
                                         Array<double>& N,
                                         Array3<double>& Nx)
@@ -296,7 +296,7 @@ void evaluate_basis_values_and_gradients(const int insd,
   }
 
   const auto point = make_basis_point(basis, g, xi);
-  std::vector<fe::Real> values;
+  std::vector<double> values;
   std::vector<febasis::Gradient> gradients;
   basis.evaluate_values(point, values);
   basis.evaluate_gradients(point, gradients);
diff --git a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
index 51df2d593..0a3048d65 100644
--- a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
@@ -28,10 +28,10 @@ class MinimalScalarBasis : public BasisFunction {
     int order() const noexcept override { return 1; }
     std::size_t size() const noexcept override { return 2u; }
 
-    void evaluate_values(const math::Vector<Real, 3>&,
-                         std::vector<Real>& values) const override
+    void evaluate_values(const math::Vector<double, 3>&,
+                         std::vector<double>& values) const override
     {
-        values.assign(size(), Real(0));
+        values.assign(size(), double(0));
     }
 };
 
@@ -50,43 +50,43 @@ class ExactQuadraticBasis : public BasisFunction {
     int order() const noexcept override { return 2; }
     std::size_t size() const noexcept override { return 2u; }
 
-    void evaluate_values(const math::Vector<Real, 3>& xi,
-                         std::vector<Real>& values) const override
+    void evaluate_values(const math::Vector<double, 3>& xi,
+                         std::vector<double>& values) const override
     {
-        const Real x = xi[0];
-        const Real y = xi[1];
-        const Real z = xi[2];
+        const double x = xi[0];
+        const double y = xi[1];
+        const double z = xi[2];
         values.resize(size());
-        values[0] = Real(1) + Real(2) * x - y + Real(0.5) * z +
-                    x * x + Real(0.75) * y * y - Real(0.25) * z * z +
-                    Real(0.2) * x * y - Real(0.3) * x * z + Real(0.4) * y * z;
-        values[1] = Real(3) - x + Real(2) * y + z +
-                    Real(0.5) * x * x - y * y + z * z +
+        values[0] = double(1) + double(2) * x - y + double(0.5) * z +
+                    x * x + double(0.75) * y * y - double(0.25) * z * z +
+                    double(0.2) * x * y - double(0.3) * x * z + double(0.4) * y * z;
+        values[1] = double(3) - x + double(2) * y + z +
+                    double(0.5) * x * x - y * y + z * z +
                     x * y + x * z - y * z;
     }
 
-    void evaluate_gradients(const math::Vector<Real, 3>& xi,
+    void evaluate_gradients(const math::Vector<double, 3>& xi,
                             std::vector<Gradient>& gradients) const override
     {
-        const Real x = xi[0];
-        const Real y = xi[1];
-        const Real z = xi[2];
+        const double x = xi[0];
+        const double y = xi[1];
+        const double z = xi[2];
         gradients.assign(size(), Gradient::Zero());
-        gradients[0][0] = Real(2) + Real(2) * x + Real(0.2) * y - Real(0.3) * z;
-        gradients[0][1] = Real(-1) + Real(1.5) * y + Real(0.2) * x + Real(0.4) * z;
-        gradients[0][2] = Real(0.5) - Real(0.5) * z - Real(0.3) * x + Real(0.4) * y;
-        gradients[1][0] = Real(-1) + x + y + z;
-        gradients[1][1] = Real(2) - Real(2) * y + x - z;
-        gradients[1][2] = Real(1) + Real(2) * z + x - y;
+        gradients[0][0] = double(2) + double(2) * x + double(0.2) * y - double(0.3) * z;
+        gradients[0][1] = double(-1) + double(1.5) * y + double(0.2) * x + double(0.4) * z;
+        gradients[0][2] = double(0.5) - double(0.5) * z - double(0.3) * x + double(0.4) * y;
+        gradients[1][0] = double(-1) + x + y + z;
+        gradients[1][1] = double(2) - double(2) * y + x - z;
+        gradients[1][2] = double(1) + double(2) * z + x - y;
     }
 
     void exact_hessians(std::vector<Hessian>& hessians) const
     {
         hessians.assign(size(), Hessian::Zero());
-        hessians[0] = make_symmetric_hessian(Real(2), Real(1.5), Real(-0.5),
-                                             Real(0.2), Real(-0.3), Real(0.4));
-        hessians[1] = make_symmetric_hessian(Real(1), Real(-2), Real(2),
-                                             Real(1), Real(1), Real(-1));
+        hessians[0] = make_symmetric_hessian(double(2), double(1.5), double(-0.5),
+                                             double(0.2), double(-0.3), double(0.4));
+        hessians[1] = make_symmetric_hessian(double(1), double(-2), double(2),
+                                             double(1), double(1), double(-1));
     }
 };
 
@@ -98,32 +98,32 @@ class CompleteFallbackBasis : public BasisFunction {
     int order() const noexcept override { return 1; }
     std::size_t size() const noexcept override { return 2u; }
 
-    void evaluate_values(const math::Vector<Real, 3>& xi,
-                         std::vector<Real>& values) const override
+    void evaluate_values(const math::Vector<double, 3>& xi,
+                         std::vector<double>& values) const override
     {
         values.resize(size());
-        values[0] = Real(1) + xi[0];
-        values[1] = Real(2) + xi[1];
+        values[0] = double(1) + xi[0];
+        values[1] = double(2) + xi[1];
     }
 
-    void evaluate_gradients(const math::Vector<Real, 3>&,
+    void evaluate_gradients(const math::Vector<double, 3>&,
                             std::vector<Gradient>& gradients) const override
     {
         gradients.assign(size(), Gradient::Zero());
-        gradients[0][0] = Real(1);
-        gradients[1][1] = Real(1);
+        gradients[0][0] = double(1);
+        gradients[1][1] = double(1);
     }
 
-    void evaluate_hessians(const math::Vector<Real, 3>& xi,
+    void evaluate_hessians(const math::Vector<double, 3>& xi,
                            std::vector<Hessian>& hessians) const override
     {
         hessians.assign(size(), Hessian::Zero());
         for (std::size_t d = 0; d < hessians.size(); ++d) {
             for (std::size_t r = 0; r < 3u; ++r) {
                 for (std::size_t c = 0; c < 3u; ++c) {
-                    hessians[d](r, c) = Real(100) * static_cast<Real>(d + 1u) +
-                                        Real(10) * static_cast<Real>(r) +
-                                        static_cast<Real>(c) + xi[2];
+                    hessians[d](r, c) = double(100) * static_cast<double>(d + 1u) +
+                                        double(10) * static_cast<double>(r) +
+                                        static_cast<double>(c) + xi[2];
                 }
             }
         }
@@ -260,7 +260,7 @@ TEST(BasisErrorPaths, NodeOrderingInvalidNodeThrows) {
 
 TEST(BasisErrorPaths, BasisFunctionDefaultsThrowForMissingDerivatives) {
     MinimalScalarBasis basis;
-    const math::Vector<Real, 3> xi{Real(0), Real(0), Real(0)};
+    const math::Vector<double, 3> xi{double(0), double(0), double(0)};
     std::vector<Gradient> gradients;
     std::vector<Hessian> hessians;
 
@@ -270,7 +270,7 @@ TEST(BasisErrorPaths, BasisFunctionDefaultsThrowForMissingDerivatives) {
 
 TEST(BasisErrorPaths, NumericalDerivativeHelpersMatchAnalyticDerivatives) {
     ExactQuadraticBasis basis;
-    const math::Vector<Real, 3> xi{Real(0.2), Real(-0.35), Real(0.4)};
+    const math::Vector<double, 3> xi{double(0.2), double(-0.35), double(0.4)};
 
     std::vector<Gradient> exact_gradients;
     basis.evaluate_gradients(xi, exact_gradients);
@@ -281,7 +281,7 @@ TEST(BasisErrorPaths, NumericalDerivativeHelpersMatchAnalyticDerivatives) {
     for (std::size_t n = 0; n < basis.size(); ++n) {
         for (int d = 0; d < basis.dimension(); ++d) {
             const std::size_t sd = static_cast<std::size_t>(d);
-            EXPECT_NEAR(approx_gradients[n][sd], exact_gradients[n][sd], Real(1e-8))
+            EXPECT_NEAR(approx_gradients[n][sd], exact_gradients[n][sd], double(1e-8))
                 << "basis=" << n << " component=" << d;
         }
     }
@@ -298,7 +298,7 @@ TEST(BasisErrorPaths, NumericalDerivativeHelpersMatchAnalyticDerivatives) {
                 const std::size_t sr = static_cast<std::size_t>(r);
                 const std::size_t sc = static_cast<std::size_t>(c);
                 EXPECT_NEAR(approx_hessians[n](sr, sc), exact_hessians[n](sr, sc),
-                            Real(1e-8))
+                            double(1e-8))
                     << "basis=" << n << " component=(" << r << "," << c << ")";
             }
         }
@@ -307,16 +307,16 @@ TEST(BasisErrorPaths, NumericalDerivativeHelpersMatchAnalyticDerivatives) {
 
 TEST(BasisErrorPaths, BasisFunctionFallbackWritesSpanOutputs) {
     CompleteFallbackBasis basis;
-    const math::Vector<Real, 3> point{Real(0.25), Real(0.5), Real(-0.25)};
+    const math::Vector<double, 3> point{double(0.25), double(0.5), double(-0.25)};
 
-    std::vector<Real> span_values(basis.size());
+    std::vector<double> span_values(basis.size());
     std::vector<Gradient> span_gradients(basis.size());
     std::vector<Hessian> span_hessians(basis.size());
     basis.evaluate_values_to(point, span_values);
     basis.evaluate_gradients_to(point, span_gradients);
     basis.evaluate_hessians_to(point, span_hessians);
 
-    std::vector<Real> expected_values;
+    std::vector<double> expected_values;
     std::vector<Gradient> expected_gradients;
     std::vector<Hessian> expected_hessians;
     basis.evaluate_all(point, expected_values, expected_gradients, expected_hessians);
diff --git a/tests/unitTests/FE/Basis/test_BasisHessians.cpp b/tests/unitTests/FE/Basis/test_BasisHessians.cpp
index 0bf0b3d33..bc0fea554 100644
--- a/tests/unitTests/FE/Basis/test_BasisHessians.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisHessians.cpp
@@ -19,44 +19,44 @@ using namespace svmp::FE::basis;
 namespace {
 
 void numerical_gradient_helper(const BasisFunction& basis,
-                               const math::Vector<Real, 3>& xi,
+                               const math::Vector<double, 3>& xi,
                                std::vector<Gradient>& gradients,
-                               Real eps = Real(1e-6))
+                               double eps = double(1e-6))
 {
-    std::vector<Real> base;
+    std::vector<double> base;
     basis.evaluate_values(xi, base);
     gradients.assign(base.size(), Gradient::Zero());
 
     for (int d = 0; d < basis.dimension(); ++d) {
         const std::size_t sd = static_cast<std::size_t>(d);
-        math::Vector<Real, 3> xi_p = xi;
-        math::Vector<Real, 3> xi_m = xi;
+        math::Vector<double, 3> xi_p = xi;
+        math::Vector<double, 3> xi_m = xi;
         xi_p[sd] += eps;
         xi_m[sd] -= eps;
 
-        std::vector<Real> v_p;
-        std::vector<Real> v_m;
+        std::vector<double> v_p;
+        std::vector<double> v_m;
         basis.evaluate_values(xi_p, v_p);
         basis.evaluate_values(xi_m, v_m);
 
         for (std::size_t n = 0; n < base.size(); ++n) {
-            gradients[n][sd] = (v_p[n] - v_m[n]) / (Real(2) * eps);
+            gradients[n][sd] = (v_p[n] - v_m[n]) / (double(2) * eps);
         }
     }
 }
 
 void numerical_hessian_helper(const BasisFunction& basis,
-                              const math::Vector<Real, 3>& xi,
+                              const math::Vector<double, 3>& xi,
                               std::vector<Hessian>& hessians,
-                              Real eps = Real(1e-5))
+                              double eps = double(1e-5))
 {
     hessians.assign(basis.size(), Hessian::Zero());
     const int dim = basis.dimension();
 
     for (int i = 0; i < dim; ++i) {
         for (int j = 0; j < dim; ++j) {
-            math::Vector<Real, 3> xi_p = xi;
-            math::Vector<Real, 3> xi_m = xi;
+            math::Vector<double, 3> xi_p = xi;
+            math::Vector<double, 3> xi_m = xi;
             const std::size_t sj = static_cast<std::size_t>(j);
             xi_p[sj] += eps;
             xi_m[sj] -= eps;
@@ -68,35 +68,35 @@ void numerical_hessian_helper(const BasisFunction& basis,
 
             for (std::size_t n = 0; n < basis.size(); ++n) {
                 const std::size_t si = static_cast<std::size_t>(i);
-                hessians[n](si, sj) = (g_p[n][si] - g_m[n][si]) / (Real(2) * eps);
+                hessians[n](si, sj) = (g_p[n][si] - g_m[n][si]) / (double(2) * eps);
             }
         }
     }
 }
 
-std::vector<math::Vector<Real, 3>> sample_points_for(ElementType type) {
+std::vector<math::Vector<double, 3>> sample_points_for(ElementType type) {
     switch (type) {
         case ElementType::Line2:
-            return {{Real(-0.35), Real(0), Real(0)}, {Real(0.2), Real(0), Real(0)}};
+            return {{double(-0.35), double(0), double(0)}, {double(0.2), double(0), double(0)}};
         case ElementType::Triangle3:
-            return {{Real(0.15), Real(0.2), Real(0)}, {Real(0.25), Real(0.1), Real(0)}};
+            return {{double(0.15), double(0.2), double(0)}, {double(0.25), double(0.1), double(0)}};
         case ElementType::Quad4:
-            return {{Real(0.2), Real(-0.3), Real(0)}, {Real(-0.45), Real(0.25), Real(0)}};
+            return {{double(0.2), double(-0.3), double(0)}, {double(-0.45), double(0.25), double(0)}};
         case ElementType::Tetra4:
-            return {{Real(0.12), Real(0.18), Real(0.16)}, {Real(0.2), Real(0.1), Real(0.18)}};
+            return {{double(0.12), double(0.18), double(0.16)}, {double(0.2), double(0.1), double(0.18)}};
         case ElementType::Hex8:
-            return {{Real(0.1), Real(-0.2), Real(0.3)}, {Real(-0.35), Real(0.25), Real(-0.15)}};
+            return {{double(0.1), double(-0.2), double(0.3)}, {double(-0.35), double(0.25), double(-0.15)}};
         case ElementType::Wedge6:
-            return {{Real(0.18), Real(0.22), Real(-0.2)}, {Real(0.12), Real(0.16), Real(0.1)}};
+            return {{double(0.18), double(0.22), double(-0.2)}, {double(0.12), double(0.16), double(0.1)}};
         default:
-            return {{Real(0), Real(0), Real(0)}};
+            return {{double(0), double(0), double(0)}};
     }
 }
 
 void expect_gradients_match_numerical(const BasisFunction& basis,
-                                      const std::vector<math::Vector<Real, 3>>& points,
-                                      Real tol,
-                                      Real eps = Real(1e-6))
+                                      const std::vector<math::Vector<double, 3>>& points,
+                                      double tol,
+                                      double eps = double(1e-6))
 {
     for (const auto& xi : points) {
         std::vector<Gradient> analytical;
@@ -118,9 +118,9 @@ void expect_gradients_match_numerical(const BasisFunction& basis,
 }
 
 void expect_hessians_match_numerical(const BasisFunction& basis,
-                                     const std::vector<math::Vector<Real, 3>>& points,
-                                     Real tol,
-                                     Real eps = Real(1e-5))
+                                     const std::vector<math::Vector<double, 3>>& points,
+                                     double tol,
+                                     double eps = double(1e-5))
 {
     for (const auto& xi : points) {
         std::vector<Hessian> analytical;
@@ -145,8 +145,8 @@ void expect_hessians_match_numerical(const BasisFunction& basis,
 }
 
 void expect_partition_hessian_sum_zero(const BasisFunction& basis,
-                                       const math::Vector<Real, 3>& xi,
-                                       Real tol)
+                                       const math::Vector<double, 3>& xi,
+                                       double tol)
 {
     std::vector<Hessian> hessians;
     basis.evaluate_hessians(xi, hessians);
@@ -163,7 +163,7 @@ void expect_partition_hessian_sum_zero(const BasisFunction& basis,
     for (int r = 0; r < basis.dimension(); ++r) {
         for (int c = 0; c < basis.dimension(); ++c) {
             EXPECT_NEAR(sum(static_cast<std::size_t>(r), static_cast<std::size_t>(c)),
-                        Real(0),
+                        double(0),
                         tol)
                 << "element " << static_cast<int>(basis.element_type())
                 << ", order " << basis.order();
@@ -172,8 +172,8 @@ void expect_partition_hessian_sum_zero(const BasisFunction& basis,
 }
 
 void expect_hessians_symmetric(const BasisFunction& basis,
-                               const math::Vector<Real, 3>& xi,
-                               Real tol)
+                               const math::Vector<double, 3>& xi,
+                               double tol)
 {
     std::vector<Hessian> hessians;
     basis.evaluate_hessians(xi, hessians);
@@ -190,8 +190,8 @@ void expect_hessians_symmetric(const BasisFunction& basis,
 }
 
 void expect_inactive_z_derivatives_zero(const BasisFunction& basis,
-                                        const std::vector<math::Vector<Real, 3>>& points,
-                                        Real tol)
+                                        const std::vector<math::Vector<double, 3>>& points,
+                                        double tol)
 {
     ASSERT_EQ(basis.dimension(), 2);
     for (const auto& xi : points) {
@@ -203,16 +203,16 @@ void expect_inactive_z_derivatives_zero(const BasisFunction& basis,
         ASSERT_EQ(gradients.size(), basis.size());
         ASSERT_EQ(hessians.size(), basis.size());
         for (std::size_t n = 0; n < basis.size(); ++n) {
-            EXPECT_NEAR(gradients[n][2], Real(0), tol)
+            EXPECT_NEAR(gradients[n][2], double(0), tol)
                 << "basis " << n << ", element "
                 << static_cast<int>(basis.element_type())
                 << ", order " << basis.order();
             for (std::size_t d = 0; d < 3u; ++d) {
-                EXPECT_NEAR(hessians[n](2, d), Real(0), tol)
+                EXPECT_NEAR(hessians[n](2, d), double(0), tol)
                     << "basis " << n << ", component (2," << d
                     << "), element " << static_cast<int>(basis.element_type())
                     << ", order " << basis.order();
-                EXPECT_NEAR(hessians[n](d, 2), Real(0), tol)
+                EXPECT_NEAR(hessians[n](d, 2), double(0), tol)
                     << "basis " << n << ", component (" << d
                     << ",2), element " << static_cast<int>(basis.element_type())
                     << ", order " << basis.order();
@@ -221,14 +221,14 @@ void expect_inactive_z_derivatives_zero(const BasisFunction& basis,
     }
 }
 
-std::vector<math::Vector<Real, 3>> serendipity_sample_points(ElementType type) {
+std::vector<math::Vector<double, 3>> serendipity_sample_points(ElementType type) {
     if (type == ElementType::Quad4 || type == ElementType::Quad8) {
-        return {{Real(0.17), Real(-0.31), Real(0)}, {Real(-0.45), Real(0.25), Real(0)}};
+        return {{double(0.17), double(-0.31), double(0)}, {double(-0.45), double(0.25), double(0)}};
     }
     if (type == ElementType::Hex8 || type == ElementType::Hex20) {
-        return {{Real(0.2), Real(-0.1), Real(0.3)}, {Real(-0.35), Real(0.25), Real(-0.15)}};
+        return {{double(0.2), double(-0.1), double(0.3)}, {double(-0.35), double(0.25), double(-0.15)}};
     }
-    return {{Real(0.2), Real(0.3), Real(0.1)}, {Real(0.12), Real(0.16), Real(-0.2)}};
+    return {{double(0.2), double(0.3), double(0.1)}, {double(0.12), double(0.16), double(-0.2)}};
 }
 
 } // namespace
@@ -237,15 +237,15 @@ TEST(BasisHessians, LagrangeCanonicalTopologiesMatchNumericalHessians) {
     const struct Case {
         ElementType type;
         int order;
-        Real tol;
-        Real eps;
+        double tol;
+        double eps;
     } cases[] = {
-        {ElementType::Line2, 3, Real(1e-7), Real(1e-5)},
-        {ElementType::Triangle3, 3, Real(2e-6), Real(1e-5)},
-        {ElementType::Quad4, 3, Real(1e-6), Real(1e-5)},
-        {ElementType::Tetra4, 2, Real(1e-6), Real(1e-5)},
-        {ElementType::Hex8, 2, Real(1e-6), Real(1e-5)},
-        {ElementType::Wedge6, 2, Real(1e-5), Real(1e-5)},
+        {ElementType::Line2, 3, double(1e-7), double(1e-5)},
+        {ElementType::Triangle3, 3, double(2e-6), double(1e-5)},
+        {ElementType::Quad4, 3, double(1e-6), double(1e-5)},
+        {ElementType::Tetra4, 2, double(1e-6), double(1e-5)},
+        {ElementType::Hex8, 2, double(1e-6), double(1e-5)},
+        {ElementType::Wedge6, 2, double(1e-5), double(1e-5)},
     };
 
     for (const auto& c : cases) {
@@ -258,20 +258,20 @@ TEST(BasisHessians, LagrangeHessiansSumToZeroAndAreSymmetric) {
     const struct Case {
         ElementType type;
         int order;
-        math::Vector<Real, 3> xi;
-        Real tol;
+        math::Vector<double, 3> xi;
+        double tol;
     } cases[] = {
-        {ElementType::Line2, 3, {Real(0.15), Real(0), Real(0)}, Real(1e-12)},
-        {ElementType::Triangle3, 3, {Real(0.2), Real(0.25), Real(0)}, Real(1e-10)},
-        {ElementType::Quad4, 3, {Real(0.3), Real(-0.2), Real(0)}, Real(1e-12)},
-        {ElementType::Tetra4, 2, {Real(0.15), Real(0.2), Real(0.1)}, Real(1e-10)},
-        {ElementType::Hex8, 2, {Real(0.1), Real(-0.2), Real(0.3)}, Real(1e-12)},
-        {ElementType::Wedge6, 2, {Real(0.2), Real(0.15), Real(-0.3)}, Real(1e-10)},
+        {ElementType::Line2, 3, {double(0.15), double(0), double(0)}, double(1e-12)},
+        {ElementType::Triangle3, 3, {double(0.2), double(0.25), double(0)}, double(1e-10)},
+        {ElementType::Quad4, 3, {double(0.3), double(-0.2), double(0)}, double(1e-12)},
+        {ElementType::Tetra4, 2, {double(0.15), double(0.2), double(0.1)}, double(1e-10)},
+        {ElementType::Hex8, 2, {double(0.1), double(-0.2), double(0.3)}, double(1e-12)},
+        {ElementType::Wedge6, 2, {double(0.2), double(0.15), double(-0.3)}, double(1e-10)},
     };
 
     for (const auto& c : cases) {
         LagrangeBasis basis(c.type, c.order);
-        expect_partition_hessian_sum_zero(basis, c.xi, Real(10) * c.tol);
+        expect_partition_hessian_sum_zero(basis, c.xi, double(10) * c.tol);
         expect_hessians_symmetric(basis, c.xi, c.tol);
     }
 }
@@ -280,12 +280,12 @@ TEST(BasisHessians, SerendipityHessiansSumToZeroAndAreSymmetric) {
     const struct Case {
         ElementType type;
         int order;
-        math::Vector<Real, 3> xi;
-        Real tol;
+        math::Vector<double, 3> xi;
+        double tol;
     } cases[] = {
-        {ElementType::Quad8, 2, {Real(0.17), Real(-0.31), Real(0)}, Real(1e-10)},
-        {ElementType::Hex20, 2, {Real(0.2), Real(-0.1), Real(0.3)}, Real(1e-10)},
-        {ElementType::Wedge15, 2, {Real(0.2), Real(0.3), Real(0.1)}, Real(1e-10)},
+        {ElementType::Quad8, 2, {double(0.17), double(-0.31), double(0)}, double(1e-10)},
+        {ElementType::Hex20, 2, {double(0.2), double(-0.1), double(0.3)}, double(1e-10)},
+        {ElementType::Wedge15, 2, {double(0.2), double(0.3), double(0.1)}, double(1e-10)},
     };
 
     for (const auto& c : cases) {
@@ -300,22 +300,22 @@ TEST(BasisHessians, SolverMappedVolumeSelectionsSatisfyInvariants) {
         ElementType type;
         BasisType basis_type;
         int order;
-        math::Vector<Real, 3> xi;
-        Real tol;
+        math::Vector<double, 3> xi;
+        double tol;
     } cases[] = {
-        {ElementType::Line2, BasisType::Lagrange, 1, {Real(0.15), Real(0), Real(0)}, Real(1e-12)},
-        {ElementType::Line3, BasisType::Lagrange, 2, {Real(-0.25), Real(0), Real(0)}, Real(1e-12)},
-        {ElementType::Triangle3, BasisType::Lagrange, 1, {Real(0.2), Real(0.25), Real(0)}, Real(1e-12)},
-        {ElementType::Triangle6, BasisType::Lagrange, 2, {Real(0.2), Real(0.25), Real(0)}, Real(1e-12)},
-        {ElementType::Quad4, BasisType::Lagrange, 1, {Real(0.3), Real(-0.2), Real(0)}, Real(1e-12)},
-        {ElementType::Quad8, BasisType::Serendipity, 2, {Real(0.17), Real(-0.31), Real(0)}, Real(1e-10)},
-        {ElementType::Quad9, BasisType::Lagrange, 2, {Real(0.3), Real(-0.2), Real(0)}, Real(1e-12)},
-        {ElementType::Tetra4, BasisType::Lagrange, 1, {Real(0.15), Real(0.2), Real(0.1)}, Real(1e-12)},
-        {ElementType::Tetra10, BasisType::Lagrange, 2, {Real(0.15), Real(0.2), Real(0.1)}, Real(1e-10)},
-        {ElementType::Hex8, BasisType::Lagrange, 1, {Real(0.1), Real(-0.2), Real(0.3)}, Real(1e-12)},
-        {ElementType::Hex20, BasisType::Serendipity, 2, {Real(0.2), Real(-0.1), Real(0.3)}, Real(1e-10)},
-        {ElementType::Hex27, BasisType::Lagrange, 2, {Real(0.1), Real(-0.2), Real(0.3)}, Real(1e-12)},
-        {ElementType::Wedge6, BasisType::Lagrange, 1, {Real(0.2), Real(0.15), Real(-0.3)}, Real(1e-12)},
+        {ElementType::Line2, BasisType::Lagrange, 1, {double(0.15), double(0), double(0)}, double(1e-12)},
+        {ElementType::Line3, BasisType::Lagrange, 2, {double(-0.25), double(0), double(0)}, double(1e-12)},
+        {ElementType::Triangle3, BasisType::Lagrange, 1, {double(0.2), double(0.25), double(0)}, double(1e-12)},
+        {ElementType::Triangle6, BasisType::Lagrange, 2, {double(0.2), double(0.25), double(0)}, double(1e-12)},
+        {ElementType::Quad4, BasisType::Lagrange, 1, {double(0.3), double(-0.2), double(0)}, double(1e-12)},
+        {ElementType::Quad8, BasisType::Serendipity, 2, {double(0.17), double(-0.31), double(0)}, double(1e-10)},
+        {ElementType::Quad9, BasisType::Lagrange, 2, {double(0.3), double(-0.2), double(0)}, double(1e-12)},
+        {ElementType::Tetra4, BasisType::Lagrange, 1, {double(0.15), double(0.2), double(0.1)}, double(1e-12)},
+        {ElementType::Tetra10, BasisType::Lagrange, 2, {double(0.15), double(0.2), double(0.1)}, double(1e-10)},
+        {ElementType::Hex8, BasisType::Lagrange, 1, {double(0.1), double(-0.2), double(0.3)}, double(1e-12)},
+        {ElementType::Hex20, BasisType::Serendipity, 2, {double(0.2), double(-0.1), double(0.3)}, double(1e-10)},
+        {ElementType::Hex27, BasisType::Lagrange, 2, {double(0.1), double(-0.2), double(0.3)}, double(1e-12)},
+        {ElementType::Wedge6, BasisType::Lagrange, 1, {double(0.2), double(0.15), double(-0.3)}, double(1e-12)},
     };
 
     for (const auto& c : cases) {
@@ -333,14 +333,14 @@ TEST(BasisGradients, LagrangeCanonicalTopologiesMatchNumericalGradients) {
     const struct Case {
         ElementType type;
         int order;
-        Real tol;
+        double tol;
     } cases[] = {
-        {ElementType::Line2, 3, Real(1e-8)},
-        {ElementType::Triangle3, 3, Real(1e-7)},
-        {ElementType::Quad4, 3, Real(1e-7)},
-        {ElementType::Tetra4, 2, Real(1e-7)},
-        {ElementType::Hex8, 2, Real(1e-7)},
-        {ElementType::Wedge6, 2, Real(1e-7)},
+        {ElementType::Line2, 3, double(1e-8)},
+        {ElementType::Triangle3, 3, double(1e-7)},
+        {ElementType::Quad4, 3, double(1e-7)},
+        {ElementType::Tetra4, 2, double(1e-7)},
+        {ElementType::Hex8, 2, double(1e-7)},
+        {ElementType::Wedge6, 2, double(1e-7)},
     };
 
     for (const auto& c : cases) {
@@ -359,16 +359,16 @@ TEST(BasisGradients, SerendipityFamiliesMatchNumericalGradients) {
     const struct Case {
         ElementType type;
         int order;
-        Real tol;
+        double tol;
     } cases[] = {
-        {ElementType::Quad4, 1, Real(1e-8)},
-        {ElementType::Quad8, 2, Real(1e-7)},
-        {ElementType::Quad4, 3, Real(1e-7)},
-        {ElementType::Quad4, 4, Real(5e-7)},
-        {ElementType::Quad4, 6, Real(2e-6)},
-        {ElementType::Hex8, 1, Real(1e-8)},
-        {ElementType::Hex20, 2, Real(1e-7)},
-        {ElementType::Wedge15, 2, Real(1e-7)},
+        {ElementType::Quad4, 1, double(1e-8)},
+        {ElementType::Quad8, 2, double(1e-7)},
+        {ElementType::Quad4, 3, double(1e-7)},
+        {ElementType::Quad4, 4, double(5e-7)},
+        {ElementType::Quad4, 6, double(2e-6)},
+        {ElementType::Hex8, 1, double(1e-8)},
+        {ElementType::Hex20, 2, double(1e-7)},
+        {ElementType::Wedge15, 2, double(1e-7)},
     };
 
     for (const auto& c : cases) {
@@ -394,7 +394,7 @@ TEST(BasisGradients, QuadrilateralSerendipityInactiveZDerivativesRemainZero) {
         expect_inactive_z_derivatives_zero(
             basis,
             serendipity_sample_points(c.type),
-            Real(1e-12));
+            double(1e-12));
     }
 }
 
@@ -402,16 +402,16 @@ TEST(BasisHessians, SerendipityFamiliesMatchNumericalHessians) {
     const struct Case {
         ElementType type;
         int order;
-        Real tol;
+        double tol;
     } cases[] = {
-        {ElementType::Quad4, 1, Real(1e-6)},
-        {ElementType::Quad8, 2, Real(1e-6)},
-        {ElementType::Quad4, 3, Real(1e-6)},
-        {ElementType::Quad4, 4, Real(5e-6)},
-        {ElementType::Quad4, 6, Real(2e-5)},
-        {ElementType::Hex8, 1, Real(1e-6)},
-        {ElementType::Hex20, 2, Real(1e-6)},
-        {ElementType::Wedge15, 2, Real(1e-6)},
+        {ElementType::Quad4, 1, double(1e-6)},
+        {ElementType::Quad8, 2, double(1e-6)},
+        {ElementType::Quad4, 3, double(1e-6)},
+        {ElementType::Quad4, 4, double(5e-6)},
+        {ElementType::Quad4, 6, double(2e-5)},
+        {ElementType::Hex8, 1, double(1e-6)},
+        {ElementType::Hex20, 2, double(1e-6)},
+        {ElementType::Wedge15, 2, double(1e-6)},
     };
 
     for (const auto& c : cases) {
diff --git a/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp b/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
index add2d256c..6f7d67809 100644
--- a/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
@@ -39,12 +39,12 @@ static_assert(canonical_lagrange_type(ElementType::Hex27) == ElementType::Hex8);
 static_assert(canonical_lagrange_type(ElementType::Pyramid13) == ElementType::Pyramid13);
 static_assert(complete_lagrange_alias_order(ElementType::Wedge18) == 2);
 static_assert(complete_lagrange_alias_order(ElementType::Pyramid14) == -1);
-static_assert(detail::basis_abs(Real(-2)) == Real(2));
-static_assert(detail::basis_max(Real(2), Real(3)) == Real(3));
-static_assert(detail::basis_near_zero(detail::basis_scaled_tolerance() * Real(0.5)));
+static_assert(detail::basis_abs(double(-2)) == double(2));
+static_assert(detail::basis_max(double(2), double(3)) == double(3));
+static_assert(detail::basis_near_zero(detail::basis_scaled_tolerance() * double(0.5)));
 static_assert(detail::basis_nearly_equal(
-    Real(1),
-    Real(1) + detail::basis_scaled_tolerance() * Real(0.5)));
+    double(1),
+    double(1) + detail::basis_scaled_tolerance() * double(0.5)));
 
 TEST(ConstexprBasis, FixedNodeTableSizesForSupportedLayouts) {
     const std::vector<std::pair<ElementType, std::size_t>> expected = {
@@ -70,16 +70,16 @@ TEST(ConstexprBasis, FixedNodeTableSizesForSupportedLayouts) {
     }
 }
 
-TEST(ConstexprBasis, TraitToleranceScalesWithRealPrecision) {
-    const Real eps = std::numeric_limits<Real>::epsilon();
-    const Real tol = detail::basis_scaled_tolerance();
+TEST(ConstexprBasis, TraitToleranceScalesWithDoublePrecision) {
+    const double eps = std::numeric_limits<double>::epsilon();
+    const double tol = detail::basis_scaled_tolerance();
     // Probes straddle the tolerance itself rather than hardcoding the multiplier,
     // so retuning basis_scaled_tolerance cannot silently invalidate them.
     EXPECT_GT(tol, eps);
-    EXPECT_TRUE(detail::basis_near_zero(tol * Real(0.5)));
-    EXPECT_FALSE(detail::basis_near_zero(tol * Real(2)));
-    EXPECT_TRUE(detail::basis_nearly_equal(Real(1), Real(1) + tol * Real(0.5)));
-    EXPECT_FALSE(detail::basis_nearly_equal(Real(1), Real(1) + tol * Real(2)));
+    EXPECT_TRUE(detail::basis_near_zero(tol * double(0.5)));
+    EXPECT_FALSE(detail::basis_near_zero(tol * double(2)));
+    EXPECT_TRUE(detail::basis_nearly_equal(double(1), double(1) + tol * double(0.5)));
+    EXPECT_FALSE(detail::basis_nearly_equal(double(1), double(1) + tol * double(2)));
 }
 
 TEST(ConstexprBasis, CompleteAliasTablesMatchGeneratedLagrangeNodes) {
diff --git a/tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp b/tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp
index 3ba943e04..0994d6e42 100644
--- a/tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp
+++ b/tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp
@@ -16,9 +16,9 @@ using namespace svmp::FE::basis;
 
 namespace {
 
-void expect_nodes_close(const std::vector<math::Vector<Real, 3>>& lhs,
-                        const std::vector<math::Vector<Real, 3>>& rhs,
-                        Real tol)
+void expect_nodes_close(const std::vector<math::Vector<double, 3>>& lhs,
+                        const std::vector<math::Vector<double, 3>>& rhs,
+                        double tol)
 {
     ASSERT_EQ(lhs.size(), rhs.size());
     for (std::size_t i = 0; i < lhs.size(); ++i) {
@@ -28,17 +28,17 @@ void expect_nodes_close(const std::vector<math::Vector<Real, 3>>& lhs,
     }
 }
 
-void expect_kronecker_at_nodes(const LagrangeBasis& basis, Real tol)
+void expect_kronecker_at_nodes(const LagrangeBasis& basis, double tol)
 {
     const auto& nodes = basis.nodes();
     ASSERT_EQ(nodes.size(), basis.size());
 
-    std::vector<Real> values;
+    std::vector<double> values;
     for (std::size_t node = 0; node < nodes.size(); ++node) {
         basis.evaluate_values(nodes[node], values);
         ASSERT_EQ(values.size(), basis.size());
         for (std::size_t i = 0; i < values.size(); ++i) {
-            const Real expected = (i == node) ? Real(1) : Real(0);
+            const double expected = (i == node) ? double(1) : double(0);
             EXPECT_NEAR(values[i], expected, tol)
                 << "node " << node << ", basis " << i;
         }
@@ -46,17 +46,17 @@ void expect_kronecker_at_nodes(const LagrangeBasis& basis, Real tol)
 }
 
 void expect_partition_gradient_hessian_sums(const LagrangeBasis& basis,
-                                            const std::vector<math::Vector<Real, 3>>& points,
-                                            Real value_tol,
-                                            Real derivative_tol)
+                                            const std::vector<math::Vector<double, 3>>& points,
+                                            double value_tol,
+                                            double derivative_tol)
 {
     for (const auto& xi : points) {
-        std::vector<Real> values;
+        std::vector<double> values;
         std::vector<Gradient> gradients;
         std::vector<Hessian> hessians;
         basis.evaluate_all(xi, values, gradients, hessians);
 
-        Real value_sum = Real(0);
+        double value_sum = double(0);
         Gradient gradient_sum = Gradient::Zero();
         Hessian hessian_sum = Hessian::Zero();
         for (std::size_t i = 0; i < values.size(); ++i) {
@@ -69,13 +69,13 @@ void expect_partition_gradient_hessian_sums(const LagrangeBasis& basis,
             }
         }
 
-        EXPECT_NEAR(value_sum, Real(1), value_tol);
+        EXPECT_NEAR(value_sum, double(1), value_tol);
         for (int d = 0; d < basis.dimension(); ++d) {
-            EXPECT_NEAR(gradient_sum[static_cast<std::size_t>(d)], Real(0), derivative_tol);
+            EXPECT_NEAR(gradient_sum[static_cast<std::size_t>(d)], double(0), derivative_tol);
             for (int e = 0; e < basis.dimension(); ++e) {
                 EXPECT_NEAR(hessian_sum(static_cast<std::size_t>(d),
                                         static_cast<std::size_t>(e)),
-                            Real(0),
+                            double(0),
                             derivative_tol);
             }
         }
@@ -83,9 +83,9 @@ void expect_partition_gradient_hessian_sums(const LagrangeBasis& basis,
 }
 
 void expect_all_entries_finite(const LagrangeBasis& basis,
-                               const math::Vector<Real, 3>& xi)
+                               const math::Vector<double, 3>& xi)
 {
-    std::vector<Real> values;
+    std::vector<double> values;
     std::vector<Gradient> gradients;
     std::vector<Hessian> hessians;
     basis.evaluate_all(xi, values, gradients, hessians);
@@ -113,29 +113,29 @@ TEST(HigherOrderWedge, CompleteAliasMatchesGeneratedNodeLayout) {
     ASSERT_EQ(generated.size(), ReferenceNodeLayout::num_nodes(ElementType::Wedge18));
     EXPECT_EQ(alias_basis.element_type(), ElementType::Wedge6);
     EXPECT_EQ(alias_basis.order(), 2);
-    expect_nodes_close(alias_basis.nodes(), generated, Real(1e-14));
+    expect_nodes_close(alias_basis.nodes(), generated, double(1e-14));
 }
 
 TEST(HigherOrderWedge, OrderThreeIsNodalAndPartitionsUnity) {
     LagrangeBasis wedge(ElementType::Wedge6, 3);
 
-    expect_kronecker_at_nodes(wedge, Real(2e-10));
+    expect_kronecker_at_nodes(wedge, double(2e-10));
     expect_partition_gradient_hessian_sums(
         wedge,
         {
-            {Real(0.18), Real(0.22), Real(-0.2)},
-            {Real(0.12), Real(0.16), Real(0.1)},
-            {Real(0.25), Real(0.15), Real(0.45)},
+            {double(0.18), double(0.22), double(-0.2)},
+            {double(0.12), double(0.16), double(0.1)},
+            {double(0.25), double(0.15), double(0.45)},
         },
-        Real(1e-12),
-        Real(1e-9));
+        double(1e-12),
+        double(1e-9));
 }
 
 TEST(HigherOrderWedge, OrderFourEvaluationsRemainFinite) {
     LagrangeBasis wedge(ElementType::Wedge6, 4);
 
-    expect_all_entries_finite(wedge, {Real(0.2), Real(0.1), Real(-0.6)});
-    expect_all_entries_finite(wedge, {Real(0.05), Real(0.8), Real(0.3)});
+    expect_all_entries_finite(wedge, {double(0.2), double(0.1), double(-0.6)});
+    expect_all_entries_finite(wedge, {double(0.05), double(0.8), double(0.3)});
 }
 
 // Finiteness alone cannot detect a wrong triangle-index or axis-index lookup;
@@ -146,13 +146,13 @@ TEST(HigherOrderWedge, OrderFourIsNodalAndPartitionsUnity) {
 
     // Order-4 wedge = triangle(order 4) x line(order 4) = 15 x 5 nodes.
     EXPECT_EQ(wedge.size(), 15u * 5u);
-    expect_kronecker_at_nodes(wedge, Real(1e-9));
+    expect_kronecker_at_nodes(wedge, double(1e-9));
     expect_partition_gradient_hessian_sums(
         wedge,
         {
-            {Real(0.18), Real(0.22), Real(-0.2)},
-            {Real(0.25), Real(0.15), Real(0.45)},
+            {double(0.18), double(0.22), double(-0.2)},
+            {double(0.25), double(0.15), double(0.45)},
         },
-        Real(1e-12),
-        Real(1e-7));
+        double(1e-12),
+        double(1e-7));
 }
diff --git a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
index 8288b4c37..6a5ad186e 100644
--- a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
@@ -21,7 +21,7 @@ using namespace svmp::FE::basis;
 
 namespace {
 
-using Point = math::Vector<Real, 3>;
+using Point = math::Vector<double, 3>;
 
 struct CanonicalCase {
     ElementType type;
@@ -29,29 +29,29 @@ struct CanonicalCase {
     std::size_t size;
     int dimension;
     std::vector<Point> points;
-    Real derivative_tol;
+    double derivative_tol;
 };
 
 const std::vector<CanonicalCase>& canonical_cases() {
     static const std::vector<CanonicalCase> cases = {
         {ElementType::Line2, 3, 4u, 1,
-         {{Real(-0.35), Real(0), Real(0)}, {Real(0.2), Real(0), Real(0)}},
-         Real(1e-11)},
+         {{double(-0.35), double(0), double(0)}, {double(0.2), double(0), double(0)}},
+         double(1e-11)},
         {ElementType::Triangle3, 3, 10u, 2,
-         {{Real(0.15), Real(0.2), Real(0)}, {Real(0.25), Real(0.1), Real(0)}},
-         Real(1e-9)},
+         {{double(0.15), double(0.2), double(0)}, {double(0.25), double(0.1), double(0)}},
+         double(1e-9)},
         {ElementType::Quad4, 3, 16u, 2,
-         {{Real(0.2), Real(-0.3), Real(0)}, {Real(-0.45), Real(0.25), Real(0)}},
-         Real(1e-11)},
+         {{double(0.2), double(-0.3), double(0)}, {double(-0.45), double(0.25), double(0)}},
+         double(1e-11)},
         {ElementType::Tetra4, 2, 10u, 3,
-         {{Real(0.12), Real(0.18), Real(0.16)}, {Real(0.2), Real(0.1), Real(0.18)}},
-         Real(1e-9)},
+         {{double(0.12), double(0.18), double(0.16)}, {double(0.2), double(0.1), double(0.18)}},
+         double(1e-9)},
         {ElementType::Hex8, 2, 27u, 3,
-         {{Real(0.1), Real(-0.2), Real(0.3)}, {Real(-0.35), Real(0.25), Real(-0.15)}},
-         Real(1e-10)},
+         {{double(0.1), double(-0.2), double(0.3)}, {double(-0.35), double(0.25), double(-0.15)}},
+         double(1e-10)},
         {ElementType::Wedge6, 2, 18u, 3,
-         {{Real(0.18), Real(0.22), Real(-0.2)}, {Real(0.12), Real(0.16), Real(0.1)}},
-         Real(1e-9)},
+         {{double(0.18), double(0.22), double(-0.2)}, {double(0.12), double(0.16), double(0.1)}},
+         double(1e-9)},
     };
     return cases;
 }
@@ -65,17 +65,17 @@ std::vector<Point> sample_points_for(ElementType type) {
     return {};
 }
 
-void expect_kronecker_at_nodes(const LagrangeBasis& basis, Real tol)
+void expect_kronecker_at_nodes(const LagrangeBasis& basis, double tol)
 {
     const auto& nodes = basis.nodes();
     ASSERT_EQ(nodes.size(), basis.size());
 
-    std::vector<Real> values;
+    std::vector<double> values;
     for (std::size_t node = 0; node < nodes.size(); ++node) {
         basis.evaluate_values(nodes[node], values);
         ASSERT_EQ(values.size(), basis.size());
         for (std::size_t i = 0; i < values.size(); ++i) {
-            EXPECT_NEAR(values[i], i == node ? Real(1) : Real(0), tol)
+            EXPECT_NEAR(values[i], i == node ? double(1) : double(0), tol)
                 << "node=" << node << " basis=" << i;
         }
     }
@@ -83,15 +83,15 @@ void expect_kronecker_at_nodes(const LagrangeBasis& basis, Real tol)
 
 void expect_partition_gradient_hessian_sums(const LagrangeBasis& basis,
                                             const std::vector<Point>& points,
-                                            Real derivative_tol)
+                                            double derivative_tol)
 {
     for (const auto& xi : points) {
-        std::vector<Real> values;
+        std::vector<double> values;
         std::vector<Gradient> gradients;
         std::vector<Hessian> hessians;
         basis.evaluate_all(xi, values, gradients, hessians);
 
-        Real value_sum = Real(0);
+        double value_sum = double(0);
         Gradient gradient_sum = Gradient::Zero();
         Hessian hessian_sum = Hessian::Zero();
         for (std::size_t i = 0; i < values.size(); ++i) {
@@ -104,13 +104,13 @@ void expect_partition_gradient_hessian_sums(const LagrangeBasis& basis,
             }
         }
 
-        EXPECT_NEAR(value_sum, Real(1), Real(1e-12));
+        EXPECT_NEAR(value_sum, double(1), double(1e-12));
         for (int d = 0; d < basis.dimension(); ++d) {
-            EXPECT_NEAR(gradient_sum[static_cast<std::size_t>(d)], Real(0), derivative_tol);
+            EXPECT_NEAR(gradient_sum[static_cast<std::size_t>(d)], double(0), derivative_tol);
             for (int e = 0; e < basis.dimension(); ++e) {
                 EXPECT_NEAR(hessian_sum(static_cast<std::size_t>(d),
                                         static_cast<std::size_t>(e)),
-                            Real(0),
+                            double(0),
                             derivative_tol);
             }
         }
@@ -120,12 +120,12 @@ void expect_partition_gradient_hessian_sums(const LagrangeBasis& basis,
 void expect_span_sinks_match_vector_evaluation(const LagrangeBasis& basis,
                                                const Point& xi)
 {
-    std::vector<Real> values;
+    std::vector<double> values;
     std::vector<Gradient> gradients;
     std::vector<Hessian> hessians;
     basis.evaluate_all(xi, values, gradients, hessians);
 
-    std::vector<Real> span_values(basis.size());
+    std::vector<double> span_values(basis.size());
     std::vector<Gradient> span_gradients(basis.size());
     std::vector<Hessian> span_hessians(basis.size());
     basis.evaluate_values_to(xi, span_values);
@@ -133,13 +133,13 @@ void expect_span_sinks_match_vector_evaluation(const LagrangeBasis& basis,
     basis.evaluate_hessians_to(xi, span_hessians);
 
     for (std::size_t i = 0; i < basis.size(); ++i) {
-        EXPECT_NEAR(span_values[i], values[i], Real(1e-14));
+        EXPECT_NEAR(span_values[i], values[i], double(1e-14));
         for (std::size_t d = 0; d < 3u; ++d) {
-            EXPECT_NEAR(span_gradients[i][d], gradients[i][d], Real(1e-14));
+            EXPECT_NEAR(span_gradients[i][d], gradients[i][d], double(1e-14));
             for (std::size_t e = 0; e < 3u; ++e) {
                 EXPECT_NEAR(span_hessians[i](d, e),
                             hessians[i](d, e),
-                            Real(1e-14));
+                            double(1e-14));
             }
         }
     }
@@ -147,7 +147,7 @@ void expect_span_sinks_match_vector_evaluation(const LagrangeBasis& basis,
 
 void expect_nodes_close(const std::vector<Point>& lhs,
                         const std::vector<Point>& rhs,
-                        Real tol)
+                        double tol)
 {
     ASSERT_EQ(lhs.size(), rhs.size());
     for (std::size_t i = 0; i < lhs.size(); ++i) {
@@ -160,13 +160,13 @@ void expect_nodes_close(const std::vector<Point>& lhs,
 void expect_evaluations_match(const LagrangeBasis& lhs,
                               const LagrangeBasis& rhs,
                               const std::vector<Point>& points,
-                              Real tol)
+                              double tol)
 {
     ASSERT_EQ(lhs.size(), rhs.size());
 
     for (const auto& xi : points) {
-        std::vector<Real> lhs_values;
-        std::vector<Real> rhs_values;
+        std::vector<double> lhs_values;
+        std::vector<double> rhs_values;
         std::vector<Gradient> lhs_gradients;
         std::vector<Gradient> rhs_gradients;
         std::vector<Hessian> lhs_hessians;
@@ -187,43 +187,43 @@ void expect_evaluations_match(const LagrangeBasis& lhs,
     }
 }
 
-Real linear_function(const Point& p) {
-    return Real(2) + Real(3) * p[0] - Real(4) * p[1] + Real(5) * p[2];
+double linear_function(const Point& p) {
+    return double(2) + double(3) * p[0] - double(4) * p[1] + double(5) * p[2];
 }
 
 Gradient linear_gradient() {
     Gradient g = Gradient::Zero();
-    g[0] = Real(3);
-    g[1] = Real(-4);
-    g[2] = Real(5);
+    g[0] = double(3);
+    g[1] = double(-4);
+    g[2] = double(5);
     return g;
 }
 
-Real quadratic_function(const Point& p) {
-    return Real(1) + Real(2) * p[0] - p[1] + Real(0.5) * p[2] +
-           p[0] * p[0] + Real(0.75) * p[1] * p[1] - Real(0.25) * p[2] * p[2] +
-           Real(0.2) * p[0] * p[1] - Real(0.3) * p[0] * p[2] +
-           Real(0.4) * p[1] * p[2];
+double quadratic_function(const Point& p) {
+    return double(1) + double(2) * p[0] - p[1] + double(0.5) * p[2] +
+           p[0] * p[0] + double(0.75) * p[1] * p[1] - double(0.25) * p[2] * p[2] +
+           double(0.2) * p[0] * p[1] - double(0.3) * p[0] * p[2] +
+           double(0.4) * p[1] * p[2];
 }
 
 // Total degree three, so it lies in both the P3 simplex space and the Q3
 // tensor-product space.
-Real cubic_function(const Point& p) {
+double cubic_function(const Point& p) {
     return quadratic_function(p) +
-           Real(0.1) * p[0] * p[0] * p[0] -
-           Real(0.2) * p[1] * p[1] * p[1] +
-           Real(0.3) * p[2] * p[2] * p[2] +
-           Real(0.15) * p[0] * p[0] * p[1] -
-           Real(0.12) * p[0] * p[2] * p[2] +
-           Real(0.08) * p[0] * p[1] * p[2];
+           double(0.1) * p[0] * p[0] * p[0] -
+           double(0.2) * p[1] * p[1] * p[1] +
+           double(0.3) * p[2] * p[2] * p[2] +
+           double(0.15) * p[0] * p[0] * p[1] -
+           double(0.12) * p[0] * p[2] * p[2] +
+           double(0.08) * p[0] * p[1] * p[2];
 }
 
 template<typename Function>
-Real interpolate_value(const LagrangeBasis& basis,
-                       const std::vector<Real>& values,
+double interpolate_value(const LagrangeBasis& basis,
+                       const std::vector<double>& values,
                        Function&& nodal_function)
 {
-    Real result = Real(0);
+    double result = double(0);
     const auto& nodes = basis.nodes();
     for (std::size_t i = 0; i < values.size(); ++i) {
         result += values[i] * nodal_function(nodes[i]);
@@ -247,7 +247,7 @@ TEST(LagrangeBasis, CanonicalTopologiesHaveExpectedSizesAndDimensions) {
 TEST(LagrangeBasis, CanonicalTopologiesAreNodalAndPartitionUnity) {
     for (const auto& c : canonical_cases()) {
         LagrangeBasis basis(c.type, c.order);
-        expect_kronecker_at_nodes(basis, Real(2e-10));
+        expect_kronecker_at_nodes(basis, double(2e-10));
         expect_partition_gradient_hessian_sums(basis, c.points, c.derivative_tol);
     }
 }
@@ -276,12 +276,12 @@ TEST(LagrangeBasis, CompleteAliasesNormalizeToCanonicalBases) {
 
         EXPECT_EQ(alias_basis.element_type(), canonical);
         EXPECT_EQ(alias_basis.order(), order);
-        expect_nodes_close(alias_basis.nodes(), generated, Real(1e-14));
-        expect_nodes_close(alias_basis.nodes(), canonical_basis.nodes(), Real(1e-14));
+        expect_nodes_close(alias_basis.nodes(), generated, double(1e-14));
+        expect_nodes_close(alias_basis.nodes(), canonical_basis.nodes(), double(1e-14));
         expect_evaluations_match(alias_basis,
                                  canonical_basis,
                                  sample_points_for(canonical),
-                                 Real(1e-12));
+                                 double(1e-12));
     }
 }
 
@@ -317,9 +317,9 @@ TEST(LagrangeBasis, NodeOrderingMatchesPublicAliasLayouts) {
 
         for (std::size_t i = 0; i < generated.size(); ++i) {
             const auto public_node = ReferenceNodeLayout::get_node_coords(alias, i);
-            EXPECT_NEAR(public_node[0], generated[i][0], Real(1e-14)) << "node=" << i;
-            EXPECT_NEAR(public_node[1], generated[i][1], Real(1e-14)) << "node=" << i;
-            EXPECT_NEAR(public_node[2], generated[i][2], Real(1e-14)) << "node=" << i;
+            EXPECT_NEAR(public_node[0], generated[i][0], double(1e-14)) << "node=" << i;
+            EXPECT_NEAR(public_node[1], generated[i][1], double(1e-14)) << "node=" << i;
+            EXPECT_NEAR(public_node[2], generated[i][2], double(1e-14)) << "node=" << i;
         }
     }
 }
@@ -347,29 +347,29 @@ TEST(LagrangeBasis, RemovedOrSerendipityFamiliesAreRejected) {
 // canonical Lagrange topologies and the serendipity families.
 TEST(LagrangeBasis, LinearPolynomialReproductionAcrossLinearTopologies) {
     const std::vector<std::pair<ElementType, Point>> cases = {
-        {ElementType::Line2, {Real(-0.2), Real(0), Real(0)}},
-        {ElementType::Triangle3, {Real(0.2), Real(0.3), Real(0)}},
-        {ElementType::Quad4, {Real(0.25), Real(-0.4), Real(0)}},
-        {ElementType::Tetra4, {Real(0.1), Real(0.2), Real(0.3)}},
-        {ElementType::Hex8, {Real(0.15), Real(-0.2), Real(0.25)}},
-        {ElementType::Wedge6, {Real(0.2), Real(0.15), Real(-0.3)}},
+        {ElementType::Line2, {double(-0.2), double(0), double(0)}},
+        {ElementType::Triangle3, {double(0.2), double(0.3), double(0)}},
+        {ElementType::Quad4, {double(0.25), double(-0.4), double(0)}},
+        {ElementType::Tetra4, {double(0.1), double(0.2), double(0.3)}},
+        {ElementType::Hex8, {double(0.15), double(-0.2), double(0.25)}},
+        {ElementType::Wedge6, {double(0.2), double(0.15), double(-0.3)}},
     };
     const Gradient expected_gradient = linear_gradient();
 
     for (const auto& [type, point] : cases) {
         LagrangeBasis basis(type, 1);
-        std::vector<Real> values;
+        std::vector<double> values;
         std::vector<Gradient> gradients;
         basis.evaluate_values(point, values);
         basis.evaluate_gradients(point, gradients);
 
-        const Real interpolated =
+        const double interpolated =
             interpolate_value(basis, values, linear_function);
-        EXPECT_NEAR(interpolated, linear_function(point), Real(1e-12));
+        EXPECT_NEAR(interpolated, linear_function(point), double(1e-12));
 
         Gradient interpolated_gradient = Gradient::Zero();
         for (std::size_t i = 0; i < gradients.size(); ++i) {
-            const Real nodal_value = linear_function(basis.nodes()[i]);
+            const double nodal_value = linear_function(basis.nodes()[i]);
             for (int d = 0; d < basis.dimension(); ++d) {
                 interpolated_gradient[static_cast<std::size_t>(d)] +=
                     nodal_value * gradients[i][static_cast<std::size_t>(d)];
@@ -378,29 +378,29 @@ TEST(LagrangeBasis, LinearPolynomialReproductionAcrossLinearTopologies) {
         for (int d = 0; d < basis.dimension(); ++d) {
             EXPECT_NEAR(interpolated_gradient[static_cast<std::size_t>(d)],
                         expected_gradient[static_cast<std::size_t>(d)],
-                        Real(1e-12));
+                        double(1e-12));
         }
     }
 }
 
 TEST(LagrangeBasis, QuadraticPolynomialReproductionAcrossQuadraticAliases) {
     const std::vector<std::pair<ElementType, Point>> cases = {
-        {ElementType::Line3, {Real(-0.2), Real(0), Real(0)}},
-        {ElementType::Triangle6, {Real(0.2), Real(0.3), Real(0)}},
-        {ElementType::Quad9, {Real(0.25), Real(-0.4), Real(0)}},
-        {ElementType::Tetra10, {Real(0.1), Real(0.2), Real(0.3)}},
-        {ElementType::Hex27, {Real(0.15), Real(-0.2), Real(0.25)}},
-        {ElementType::Wedge18, {Real(0.2), Real(0.15), Real(-0.3)}},
+        {ElementType::Line3, {double(-0.2), double(0), double(0)}},
+        {ElementType::Triangle6, {double(0.2), double(0.3), double(0)}},
+        {ElementType::Quad9, {double(0.25), double(-0.4), double(0)}},
+        {ElementType::Tetra10, {double(0.1), double(0.2), double(0.3)}},
+        {ElementType::Hex27, {double(0.15), double(-0.2), double(0.25)}},
+        {ElementType::Wedge18, {double(0.2), double(0.15), double(-0.3)}},
     };
 
     for (const auto& [type, point] : cases) {
         LagrangeBasis basis(type, 2);
-        std::vector<Real> values;
+        std::vector<double> values;
         basis.evaluate_values(point, values);
 
-        const Real interpolated =
+        const double interpolated =
             interpolate_value(basis, values, quadratic_function);
-        EXPECT_NEAR(interpolated, quadratic_function(point), Real(5e-12))
+        EXPECT_NEAR(interpolated, quadratic_function(point), double(5e-12))
             << "element=" << static_cast<int>(type);
     }
 }
@@ -417,16 +417,16 @@ TEST(LagrangeBasis, HigherOrderLatticesAreNodalAndPartitionUnity) {
         ElementType type;
         int order;
         std::size_t size;
-        Real kronecker_tol;
-        Real derivative_tol;
+        double kronecker_tol;
+        double derivative_tol;
         std::vector<Point> points;
     } cases[] = {
-        {ElementType::Tetra4, 3, 20u, Real(5e-10), Real(1e-8),
-         {{Real(0.12), Real(0.18), Real(0.16)}, {Real(0.3), Real(0.2), Real(0.25)}}},
-        {ElementType::Tetra4, 4, 35u, Real(1e-9), Real(1e-7),
-         {{Real(0.12), Real(0.18), Real(0.16)}, {Real(0.2), Real(0.1), Real(0.18)}}},
-        {ElementType::Hex8, 3, 64u, Real(5e-10), Real(1e-8),
-         {{Real(0.1), Real(-0.2), Real(0.3)}, {Real(-0.35), Real(0.25), Real(-0.15)}}},
+        {ElementType::Tetra4, 3, 20u, double(5e-10), double(1e-8),
+         {{double(0.12), double(0.18), double(0.16)}, {double(0.3), double(0.2), double(0.25)}}},
+        {ElementType::Tetra4, 4, 35u, double(1e-9), double(1e-7),
+         {{double(0.12), double(0.18), double(0.16)}, {double(0.2), double(0.1), double(0.18)}}},
+        {ElementType::Hex8, 3, 64u, double(5e-10), double(1e-8),
+         {{double(0.1), double(-0.2), double(0.3)}, {double(-0.35), double(0.25), double(-0.15)}}},
     };
 
     for (const auto& c : cases) {
@@ -451,15 +451,15 @@ TEST(LagrangeBasis, HigherOrderHexFaceInteriorFollowsVtkFaceOrder) {
 
     struct FaceBlock {
         std::size_t axis;  // constant axis: 0=x, 1=y, 2=z
-        Real value;        // constant coordinate on the face
+        double value;        // constant coordinate on the face
     };
     const FaceBlock blocks[] = {
-        {0u, Real(-1)},  // -X
-        {0u, Real(1)},   // +X
-        {1u, Real(-1)},  // -Y
-        {1u, Real(1)},   // +Y
-        {2u, Real(-1)},  // -Z
-        {2u, Real(1)},   // +Z
+        {0u, double(-1)},  // -X
+        {0u, double(1)},   // +X
+        {1u, double(-1)},  // -Y
+        {1u, double(1)},   // +Y
+        {2u, double(-1)},  // -Z
+        {2u, double(1)},   // +Z
     };
 
     constexpr std::size_t kFaceStart = 32u;
@@ -467,7 +467,7 @@ TEST(LagrangeBasis, HigherOrderHexFaceInteriorFollowsVtkFaceOrder) {
     for (std::size_t f = 0; f < 6u; ++f) {
         for (std::size_t m = 0; m < kPerFace; ++m) {
             const auto& node = nodes[kFaceStart + f * kPerFace + m];
-            EXPECT_NEAR(node[blocks[f].axis], blocks[f].value, Real(1e-14))
+            EXPECT_NEAR(node[blocks[f].axis], blocks[f].value, double(1e-14))
                 << "face block " << f << ", node " << m;
         }
     }
@@ -475,17 +475,17 @@ TEST(LagrangeBasis, HigherOrderHexFaceInteriorFollowsVtkFaceOrder) {
 
 TEST(LagrangeBasis, CubicPolynomialReproductionAtOrderThree) {
     const std::vector<std::pair<ElementType, Point>> cases = {
-        {ElementType::Tetra4, {Real(0.15), Real(0.2), Real(0.25)}},
-        {ElementType::Hex8, {Real(0.15), Real(-0.2), Real(0.25)}},
+        {ElementType::Tetra4, {double(0.15), double(0.2), double(0.25)}},
+        {ElementType::Hex8, {double(0.15), double(-0.2), double(0.25)}},
     };
 
     for (const auto& [type, point] : cases) {
         LagrangeBasis basis(type, 3);
-        std::vector<Real> values;
+        std::vector<double> values;
         basis.evaluate_values(point, values);
 
-        const Real interpolated = interpolate_value(basis, values, cubic_function);
-        EXPECT_NEAR(interpolated, cubic_function(point), Real(1e-10))
+        const double interpolated = interpolate_value(basis, values, cubic_function);
+        EXPECT_NEAR(interpolated, cubic_function(point), double(1e-10))
             << "element=" << static_cast<int>(type);
     }
 }
@@ -498,40 +498,40 @@ TEST(LagrangeBasis, PointTopologyEvaluatesConstantUnity) {
     EXPECT_EQ(basis.dimension(), 0);
     ASSERT_EQ(basis.nodes().size(), 1u);
 
-    const Point xi{Real(0.3), Real(-0.4), Real(0.1)};
-    std::vector<Real> values;
+    const Point xi{double(0.3), double(-0.4), double(0.1)};
+    std::vector<double> values;
     std::vector<Gradient> gradients;
     std::vector<Hessian> hessians;
     basis.evaluate_all(xi, values, gradients, hessians);
 
     ASSERT_EQ(values.size(), 1u);
-    EXPECT_EQ(values[0], Real(1));
+    EXPECT_EQ(values[0], double(1));
     for (std::size_t d = 0; d < 3u; ++d) {
-        EXPECT_EQ(gradients[0][d], Real(0));
+        EXPECT_EQ(gradients[0][d], double(0));
         for (std::size_t e = 0; e < 3u; ++e) {
-            EXPECT_EQ(hessians[0](d, e), Real(0));
+            EXPECT_EQ(hessians[0](d, e), double(0));
         }
     }
 
-    Real span_value = Real(-1);
+    double span_value = double(-1);
     Gradient span_gradient;
-    span_gradient[0] = span_gradient[1] = span_gradient[2] = Real(-1);
+    span_gradient[0] = span_gradient[1] = span_gradient[2] = double(-1);
     Hessian span_hessian;
     for (std::size_t d = 0; d < 3u; ++d) {
         for (std::size_t e = 0; e < 3u; ++e) {
-            span_hessian(d, e) = Real(-1);
+            span_hessian(d, e) = double(-1);
         }
     }
-    basis.evaluate_values_to(xi, std::span<Real>(&span_value, 1u));
+    basis.evaluate_values_to(xi, std::span<double>(&span_value, 1u));
     basis.evaluate_gradients_to(xi, std::span<Gradient>(&span_gradient, 1u));
     basis.evaluate_hessians_to(xi, std::span<Hessian>(&span_hessian, 1u));
-    EXPECT_EQ(span_value, Real(1));
+    EXPECT_EQ(span_value, double(1));
     for (std::size_t d = 0; d < 3u; ++d) {
-        EXPECT_EQ(span_gradient[d], Real(0));
+        EXPECT_EQ(span_gradient[d], double(0));
     }
     for (std::size_t d = 0; d < 3u; ++d) {
         for (std::size_t e = 0; e < 3u; ++e) {
-            EXPECT_EQ(span_hessian(d, e), Real(0));
+            EXPECT_EQ(span_hessian(d, e), double(0));
         }
     }
 }
@@ -555,18 +555,18 @@ TEST(LagrangeBasis, OrderZeroBasesAreConstantUnity) {
         EXPECT_EQ(basis.size(), 1u) << "element=" << static_cast<int>(type);
 
         for (const auto& xi : sample_points_for(type)) {
-            std::vector<Real> values;
+            std::vector<double> values;
             std::vector<Gradient> gradients;
             std::vector<Hessian> hessians;
             basis.evaluate_all(xi, values, gradients, hessians);
 
             ASSERT_EQ(values.size(), 1u);
-            EXPECT_NEAR(values[0], Real(1), Real(1e-14))
+            EXPECT_NEAR(values[0], double(1), double(1e-14))
                 << "element=" << static_cast<int>(type);
             for (std::size_t d = 0; d < 3u; ++d) {
-                EXPECT_NEAR(gradients[0][d], Real(0), Real(1e-14));
+                EXPECT_NEAR(gradients[0][d], double(0), double(1e-14));
                 for (std::size_t e = 0; e < 3u; ++e) {
-                    EXPECT_NEAR(hessians[0](d, e), Real(0), Real(1e-14));
+                    EXPECT_NEAR(hessians[0](d, e), double(0), double(1e-14));
                 }
             }
         }
diff --git a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
index 4fbc321a8..55b580219 100644
--- a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
@@ -21,15 +21,15 @@ using namespace svmp::FE::basis;
 namespace {
 
 void expect_partition_of_unity(const SerendipityBasis& basis,
-                               const math::Vector<Real, 3>& xi,
-                               Real tolerance = Real(1e-10))
+                               const math::Vector<double, 3>& xi,
+                               double tolerance = double(1e-10))
 {
-    std::vector<Real> values;
+    std::vector<double> values;
     std::vector<Gradient> gradients;
     basis.evaluate_values(xi, values);
     basis.evaluate_gradients(xi, gradients);
 
-    Real value_sum = Real(0);
+    double value_sum = double(0);
     Gradient gradient_sum = Gradient::Zero();
     for (std::size_t i = 0; i < values.size(); ++i) {
         value_sum += values[i];
@@ -38,34 +38,34 @@ void expect_partition_of_unity(const SerendipityBasis& basis,
         }
     }
 
-    EXPECT_NEAR(value_sum, Real(1), tolerance);
+    EXPECT_NEAR(value_sum, double(1), tolerance);
     for (int component = 0; component < basis.dimension(); ++component) {
         EXPECT_NEAR(gradient_sum[static_cast<std::size_t>(component)],
-                    Real(0),
+                    double(0),
                     tolerance);
     }
 }
 
 void expect_nodal_delta(const SerendipityBasis& basis,
-                        const std::vector<math::Vector<Real, 3>>& nodes,
-                        Real tolerance)
+                        const std::vector<math::Vector<double, 3>>& nodes,
+                        double tolerance)
 {
     ASSERT_EQ(nodes.size(), basis.size());
     for (std::size_t node = 0; node < nodes.size(); ++node) {
-        std::vector<Real> values;
+        std::vector<double> values;
         basis.evaluate_values(nodes[node], values);
         ASSERT_EQ(values.size(), basis.size());
         for (std::size_t dof = 0; dof < values.size(); ++dof) {
-            EXPECT_NEAR(values[dof], dof == node ? Real(1) : Real(0), tolerance)
+            EXPECT_NEAR(values[dof], dof == node ? double(1) : double(0), tolerance)
                 << "node=" << node << " dof=" << dof;
         }
     }
 }
 
-std::vector<math::Vector<Real, 3>> reference_nodes(ElementType type,
+std::vector<math::Vector<double, 3>> reference_nodes(ElementType type,
                                                    std::size_t count)
 {
-    std::vector<math::Vector<Real, 3>> nodes;
+    std::vector<math::Vector<double, 3>> nodes;
     nodes.reserve(count);
     for (std::size_t i = 0; i < count; ++i) {
         nodes.push_back(ReferenceNodeLayout::get_node_coords(type, i));
@@ -74,14 +74,14 @@ std::vector<math::Vector<Real, 3>> reference_nodes(ElementType type,
 }
 
 template<typename Function>
-Real interpolate_nodal_function(const SerendipityBasis& basis,
-                                const math::Vector<Real, 3>& xi,
+double interpolate_nodal_function(const SerendipityBasis& basis,
+                                const math::Vector<double, 3>& xi,
                                 Function&& nodal_function)
 {
-    std::vector<Real> values;
+    std::vector<double> values;
     basis.evaluate_values(xi, values);
 
-    Real result = Real(0);
+    double result = double(0);
     const auto& nodes = basis.nodes();
     for (std::size_t i = 0; i < values.size(); ++i) {
         result += values[i] * nodal_function(nodes[i]);
@@ -115,26 +115,26 @@ std::size_t expected_quad_serendipity_size(int order) {
     return boundary + (m + 1u) * (m + 2u) / 2u;
 }
 
-Real integer_power_for_test(Real base, int exponent) {
-    Real result = Real(1);
+double integer_power_for_test(double base, int exponent) {
+    double result = double(1);
     for (int k = 0; k < exponent; ++k) {
         result *= base;
     }
     return result;
 }
 
-Real monomial_value_for_test(const math::Vector<Real, 3>& p,
+double monomial_value_for_test(const math::Vector<double, 3>& p,
                              const std::array<int, 2>& exponent) {
     return integer_power_for_test(p[0], exponent[0]) *
            integer_power_for_test(p[1], exponent[1]);
 }
 
-std::vector<Real> quadrilateral_vandermonde_for_test(
-    const std::vector<math::Vector<Real, 3>>& nodes,
+std::vector<double> quadrilateral_vandermonde_for_test(
+    const std::vector<math::Vector<double, 3>>& nodes,
     const std::vector<std::array<int, 2>>& exponents)
 {
     const std::size_t n = nodes.size();
-    std::vector<Real> vandermonde(n * n, Real(0));
+    std::vector<double> vandermonde(n * n, double(0));
     for (std::size_t row = 0; row < n; ++row) {
         for (std::size_t col = 0; col < n; ++col) {
             vandermonde[row * n + col] =
@@ -144,22 +144,22 @@ std::vector<Real> quadrilateral_vandermonde_for_test(
     return vandermonde;
 }
 
-void expect_no_duplicate_nodes(const std::vector<math::Vector<Real, 3>>& nodes,
-                               Real tolerance)
+void expect_no_duplicate_nodes(const std::vector<math::Vector<double, 3>>& nodes,
+                               double tolerance)
 {
     for (std::size_t a = 0; a < nodes.size(); ++a) {
         for (std::size_t b = a + 1u; b < nodes.size(); ++b) {
-            const Real dx = std::abs(nodes[a][0] - nodes[b][0]);
-            const Real dy = std::abs(nodes[a][1] - nodes[b][1]);
+            const double dx = std::abs(nodes[a][0] - nodes[b][0]);
+            const double dy = std::abs(nodes[a][1] - nodes[b][1]);
             EXPECT_GT(std::max(dx, dy), tolerance)
                 << "duplicate nodes " << a << " and " << b;
         }
     }
 }
 
-void expect_nodes_near(const std::vector<math::Vector<Real, 3>>& actual,
-                       const std::vector<math::Vector<Real, 3>>& expected,
-                       Real tolerance)
+void expect_nodes_near(const std::vector<math::Vector<double, 3>>& actual,
+                       const std::vector<math::Vector<double, 3>>& expected,
+                       double tolerance)
 {
     ASSERT_EQ(actual.size(), expected.size());
     for (std::size_t i = 0; i < actual.size(); ++i) {
@@ -172,16 +172,16 @@ void expect_nodes_near(const std::vector<math::Vector<Real, 3>>& actual,
 
 // Every monomial here has superlinear degree at most three, so it lies in the
 // order-three quadrilateral serendipity space.
-Real cubic_serendipity_function(const math::Vector<Real, 3>& p) {
-    const Real x = p[0];
-    const Real y = p[1];
-    return Real(1) + Real(2) * x - y + Real(3) * x * y +
-           x * x * x - Real(2) * y * y * y +
-           Real(0.5) * x * x * x * y - Real(0.25) * x * y * y * y;
+double cubic_serendipity_function(const math::Vector<double, 3>& p) {
+    const double x = p[0];
+    const double y = p[1];
+    return double(1) + double(2) * x - y + double(3) * x * y +
+           x * x * x - double(2) * y * y * y +
+           double(0.5) * x * x * x * y - double(0.25) * x * y * y * y;
 }
 
-Real bilinear_function(const math::Vector<Real, 3>& p) {
-    return Real(2) - Real(3) * p[0] + Real(4) * p[1] + Real(0.5) * p[0] * p[1];
+double bilinear_function(const math::Vector<double, 3>& p) {
+    return double(2) - double(3) * p[0] + double(4) * p[1] + double(0.5) * p[0] * p[1];
 }
 
 } // namespace
@@ -191,9 +191,9 @@ TEST(SerendipityBasis, Quad8IsNodalAndPartitionsUnity) {
     SerendipityBasis explicit_quad4_basis(ElementType::Quad4, 2);
 
     EXPECT_EQ(basis.size(), 8u);
-    expect_nodes_near(basis.nodes(), explicit_quad4_basis.nodes(), Real(1e-14));
-    expect_nodal_delta(basis, basis.nodes(), Real(1e-10));
-    expect_partition_of_unity(basis, {Real(0.17), Real(-0.31), Real(0)});
+    expect_nodes_near(basis.nodes(), explicit_quad4_basis.nodes(), double(1e-14));
+    expect_nodal_delta(basis, basis.nodes(), double(1e-10));
+    expect_partition_of_unity(basis, {double(0.17), double(-0.31), double(0)});
 }
 
 TEST(SerendipityBasis, Hex20IsNodalAndPartitionsUnity) {
@@ -202,8 +202,8 @@ TEST(SerendipityBasis, Hex20IsNodalAndPartitionsUnity) {
     EXPECT_EQ(basis.size(), 20u);
     expect_nodal_delta(basis,
                        reference_nodes(ElementType::Hex20, basis.size()),
-                       Real(1e-10));
-    expect_partition_of_unity(basis, {Real(0.2), Real(-0.1), Real(0.3)});
+                       double(1e-10));
+    expect_partition_of_unity(basis, {double(0.2), double(-0.1), double(0.3)});
 }
 
 TEST(SerendipityBasis, Wedge15IsNodalAndPartitionsUnity) {
@@ -212,8 +212,8 @@ TEST(SerendipityBasis, Wedge15IsNodalAndPartitionsUnity) {
     EXPECT_EQ(basis.size(), 15u);
     expect_nodal_delta(basis,
                        reference_nodes(ElementType::Wedge15, basis.size()),
-                       Real(1e-9));
-    expect_partition_of_unity(basis, {Real(0.2), Real(0.3), Real(0.1)});
+                       double(1e-9));
+    expect_partition_of_unity(basis, {double(0.2), double(0.3), double(0.1)});
 }
 
 TEST(SerendipityBasis, RejectsUnsupportedSerendipityAliases) {
@@ -229,7 +229,7 @@ TEST(SerendipityBasis, QuadrilateralOrderZeroNormalizesToLinear) {
 
     EXPECT_EQ(basis.order(), 1);
     EXPECT_EQ(basis.size(), 4u);
-    expect_nodal_delta(basis, basis.nodes(), Real(1e-12));
+    expect_nodal_delta(basis, basis.nodes(), double(1e-12));
 }
 
 // Explicit Quad4 serendipity orders run the documented monomial selection,
@@ -254,18 +254,18 @@ TEST(SerendipityBasis, QuadrilateralOrdersOneThreeFourAreNodalAndPartitionUnity)
         ASSERT_EQ(basis.nodes().size(), c.size);
 
         for (const auto& node : basis.nodes()) {
-            EXPECT_LE(std::abs(node[0]), Real(1));
-            EXPECT_LE(std::abs(node[1]), Real(1));
+            EXPECT_LE(std::abs(node[0]), double(1));
+            EXPECT_LE(std::abs(node[1]), double(1));
         }
 
-        expect_nodal_delta(basis, basis.nodes(), Real(1e-9));
-        expect_partition_of_unity(basis, {Real(0.17), Real(-0.31), Real(0)}, Real(1e-9));
-        expect_partition_of_unity(basis, {Real(-0.45), Real(0.25), Real(0)}, Real(1e-9));
+        expect_nodal_delta(basis, basis.nodes(), double(1e-9));
+        expect_partition_of_unity(basis, {double(0.17), double(-0.31), double(0)}, double(1e-9));
+        expect_partition_of_unity(basis, {double(-0.45), double(0.25), double(0)}, double(1e-9));
     }
 }
 
 TEST(SerendipityBasis, QuadrilateralNodesFollowDocumentedConstructionThroughOrderTen) {
-    constexpr Real kTol = Real(1e-14);
+    constexpr double kTol = double(1e-14);
 
     for (int order = 1; order <= 10; ++order) {
         SerendipityBasis basis(ElementType::Quad4, order);
@@ -280,23 +280,23 @@ TEST(SerendipityBasis, QuadrilateralNodesFollowDocumentedConstructionThroughOrde
         expect_no_duplicate_nodes(nodes, kTol);
 
         for (std::size_t i = 0; i < nodes.size(); ++i) {
-            EXPECT_NEAR(nodes[i][2], Real(0), kTol) << "order=" << order
+            EXPECT_NEAR(nodes[i][2], double(0), kTol) << "order=" << order
                                                     << " node=" << i;
-            EXPECT_LE(std::abs(nodes[i][0]), Real(1)) << "order=" << order
+            EXPECT_LE(std::abs(nodes[i][0]), double(1)) << "order=" << order
                                                        << " node=" << i;
-            EXPECT_LE(std::abs(nodes[i][1]), Real(1)) << "order=" << order
+            EXPECT_LE(std::abs(nodes[i][1]), double(1)) << "order=" << order
                                                        << " node=" << i;
 
             const bool on_boundary =
-                std::abs(std::abs(nodes[i][0]) - Real(1)) <= kTol ||
-                std::abs(std::abs(nodes[i][1]) - Real(1)) <= kTol;
+                std::abs(std::abs(nodes[i][0]) - double(1)) <= kTol ||
+                std::abs(std::abs(nodes[i][1]) - double(1)) <= kTol;
             if (i < boundary_count) {
                 EXPECT_TRUE(on_boundary) << "order=" << order << " node=" << i;
             } else {
                 EXPECT_FALSE(on_boundary) << "order=" << order << " node=" << i;
-                EXPECT_LT(std::abs(nodes[i][0]), Real(1)) << "order=" << order
+                EXPECT_LT(std::abs(nodes[i][0]), double(1)) << "order=" << order
                                                            << " node=" << i;
-                EXPECT_LT(std::abs(nodes[i][1]), Real(1)) << "order=" << order
+                EXPECT_LT(std::abs(nodes[i][1]), double(1)) << "order=" << order
                                                            << " node=" << i;
             }
         }
@@ -304,16 +304,16 @@ TEST(SerendipityBasis, QuadrilateralNodesFollowDocumentedConstructionThroughOrde
         std::size_t index = boundary_count;
         if (order >= 4) {
             const int m = order - 4;
-            const Real y_denominator = Real(m + 2);
+            const double y_denominator = double(m + 2);
             for (int row = 0; row <= m; ++row) {
                 const int row_count = m + 1 - row;
-                const Real expected_y =
-                    Real(-1) + Real(2) * Real(row + 1) / y_denominator;
-                const Real x_denominator = Real(row_count + 1);
+                const double expected_y =
+                    double(-1) + double(2) * double(row + 1) / y_denominator;
+                const double x_denominator = double(row_count + 1);
                 for (int col = 0; col < row_count; ++col) {
                     ASSERT_LT(index, nodes.size());
-                    const Real expected_x =
-                        Real(-1) + Real(2) * Real(col + 1) / x_denominator;
+                    const double expected_x =
+                        double(-1) + double(2) * double(col + 1) / x_denominator;
                     EXPECT_NEAR(nodes[index][0], expected_x, kTol)
                         << "order=" << order << " row=" << row << " col=" << col;
                     EXPECT_NEAR(nodes[index][1], expected_y, kTol)
@@ -329,36 +329,36 @@ TEST(SerendipityBasis, QuadrilateralNodesFollowDocumentedConstructionThroughOrde
 TEST(SerendipityBasis, QuadrilateralOrderOneReproducesBilinearFunctions) {
     SerendipityBasis basis(ElementType::Quad4, 1);
 
-    const std::vector<math::Vector<Real, 3>> points = {
-        {Real(0.25), Real(-0.4), Real(0)},
-        {Real(-0.7), Real(0.6), Real(0)},
+    const std::vector<math::Vector<double, 3>> points = {
+        {double(0.25), double(-0.4), double(0)},
+        {double(-0.7), double(0.6), double(0)},
     };
     for (const auto& xi : points) {
         EXPECT_NEAR(interpolate_nodal_function(basis, xi, bilinear_function),
                     bilinear_function(xi),
-                    Real(1e-12));
+                    double(1e-12));
     }
 }
 
 TEST(SerendipityBasis, QuadrilateralOrderThreeReproducesSerendipityCubics) {
     SerendipityBasis basis(ElementType::Quad4, 3);
 
-    const std::vector<math::Vector<Real, 3>> points = {
-        {Real(0.25), Real(-0.4), Real(0)},
-        {Real(-0.7), Real(0.6), Real(0)},
+    const std::vector<math::Vector<double, 3>> points = {
+        {double(0.25), double(-0.4), double(0)},
+        {double(-0.7), double(0.6), double(0)},
     };
     for (const auto& xi : points) {
         EXPECT_NEAR(interpolate_nodal_function(basis, xi, cubic_serendipity_function),
                     cubic_serendipity_function(xi),
-                    Real(1e-11));
+                    double(1e-11));
     }
 }
 
 TEST(SerendipityBasis, QuadrilateralOrdersReproduceEverySerendipityMonomial) {
-    const std::vector<math::Vector<Real, 3>> points = {
-        {Real(0.25), Real(-0.4), Real(0)},
-        {Real(-0.7), Real(0.6), Real(0)},
-        {Real(0.11), Real(0.23), Real(0)},
+    const std::vector<math::Vector<double, 3>> points = {
+        {double(0.25), double(-0.4), double(0)},
+        {double(-0.7), double(0.6), double(0)},
+        {double(0.11), double(0.23), double(0)},
     };
 
     for (int order = 1; order <= 10; ++order) {
@@ -366,17 +366,17 @@ TEST(SerendipityBasis, QuadrilateralOrdersReproduceEverySerendipityMonomial) {
         const auto exponents = quad_serendipity_exponents_for_test(order);
         ASSERT_EQ(exponents.size(), basis.size()) << "order=" << order;
 
-        const Real tolerance = (order <= 7) ? Real(1e-10) : Real(2e-8);
+        const double tolerance = (order <= 7) ? double(1e-10) : double(2e-8);
         for (const auto& exponent : exponents) {
             for (const auto& xi : points) {
-                const Real interpolated =
+                const double interpolated =
                     interpolate_nodal_function(
                         basis,
                         xi,
-                        [&exponent](const math::Vector<Real, 3>& node) {
+                        [&exponent](const math::Vector<double, 3>& node) {
                             return monomial_value_for_test(node, exponent);
                         });
-                const Real expected = monomial_value_for_test(xi, exponent);
+                const double expected = monomial_value_for_test(xi, exponent);
                 EXPECT_NEAR(interpolated, expected, tolerance)
                     << "order=" << order << " ax=" << exponent[0]
                     << " ay=" << exponent[1] << " xi=(" << xi[0] << ","
@@ -412,15 +412,15 @@ TEST(SerendipityBasis, TrilinearHexMatchesLagrangeHex8) {
     EXPECT_EQ(serendipity.dimension(), 3);
     expect_nodal_delta(serendipity,
                        reference_nodes(ElementType::Hex8, serendipity.size()),
-                       Real(1e-12));
+                       double(1e-12));
 
-    const std::vector<math::Vector<Real, 3>> points = {
-        {Real(0.2), Real(-0.1), Real(0.3)},
-        {Real(-0.35), Real(0.25), Real(-0.15)},
+    const std::vector<math::Vector<double, 3>> points = {
+        {double(0.2), double(-0.1), double(0.3)},
+        {double(-0.35), double(0.25), double(-0.15)},
     };
     for (const auto& xi : points) {
-        std::vector<Real> s_values;
-        std::vector<Real> l_values;
+        std::vector<double> s_values;
+        std::vector<double> l_values;
         std::vector<Gradient> s_gradients;
         std::vector<Gradient> l_gradients;
         std::vector<Hessian> s_hessians;
@@ -430,11 +430,11 @@ TEST(SerendipityBasis, TrilinearHexMatchesLagrangeHex8) {
 
         ASSERT_EQ(s_values.size(), l_values.size());
         for (std::size_t i = 0; i < s_values.size(); ++i) {
-            EXPECT_NEAR(s_values[i], l_values[i], Real(1e-13));
+            EXPECT_NEAR(s_values[i], l_values[i], double(1e-13));
             for (std::size_t d = 0; d < 3u; ++d) {
-                EXPECT_NEAR(s_gradients[i][d], l_gradients[i][d], Real(1e-13));
+                EXPECT_NEAR(s_gradients[i][d], l_gradients[i][d], double(1e-13));
                 for (std::size_t e = 0; e < 3u; ++e) {
-                    EXPECT_NEAR(s_hessians[i](d, e), l_hessians[i](d, e), Real(1e-13));
+                    EXPECT_NEAR(s_hessians[i](d, e), l_hessians[i](d, e), double(1e-13));
                 }
             }
         }
diff --git a/tests/unitTests/FE/Math/test_DenseLinearAlgebra.cpp b/tests/unitTests/FE/Math/test_DenseLinearAlgebra.cpp
index f21e37dd4..96890f241 100644
--- a/tests/unitTests/FE/Math/test_DenseLinearAlgebra.cpp
+++ b/tests/unitTests/FE/Math/test_DenseLinearAlgebra.cpp
@@ -17,12 +17,12 @@ using namespace svmp::FE::math;
 
 namespace {
 
-Real multiply_entry(const std::vector<Real>& A,
-                    const std::vector<Real>& B,
+double multiply_entry(const std::vector<double>& A,
+                    const std::vector<double>& B,
                     std::size_t n,
                     std::size_t row,
                     std::size_t col) {
-    Real sum = Real(0);
+    double sum = double(0);
     for (std::size_t k = 0; k < n; ++k) {
         sum += A[row * n + k] * B[k * n + col];
     }
@@ -32,78 +32,78 @@ Real multiply_entry(const std::vector<Real>& A,
 } // namespace
 
 TEST(DenseLinearAlgebra, InvertsScaledMatrix) {
-    const std::vector<Real> A{
-        Real(1.0e9), Real(2.0e6),
-        Real(3.0e3), Real(4.0)
+    const std::vector<double> A{
+        double(1.0e9), double(2.0e6),
+        double(3.0e3), double(4.0)
     };
 
     const auto inv = invert_dense_matrix(A, 2u, "scaled 2x2");
     for (std::size_t row = 0; row < 2u; ++row) {
         for (std::size_t col = 0; col < 2u; ++col) {
-            const Real expected = (row == col) ? Real(1) : Real(0);
-            EXPECT_NEAR(multiply_entry(A, inv, 2u, row, col), expected, Real(1.0e-10));
+            const double expected = (row == col) ? double(1) : double(0);
+            EXPECT_NEAR(multiply_entry(A, inv, 2u, row, col), expected, double(1.0e-10));
         }
     }
 }
 
 TEST(DenseLinearAlgebra, FactorizationSolvesMultipleRightHandSides) {
-    const std::vector<Real> A{
-        Real(4), Real(2), Real(0),
-        Real(2), Real(5), Real(1),
-        Real(0), Real(1), Real(3)
+    const std::vector<double> A{
+        double(4), double(2), double(0),
+        double(2), double(5), double(1),
+        double(0), double(1), double(3)
     };
 
     const auto solver = factor_dense_matrix(A, 3u, "symmetric 3x3");
     EXPECT_EQ(solver.diagnostics.rank, 3u);
 
-    const std::vector<Real> rhs{Real(2), Real(4), Real(6)};
-    const auto x = solver.solve(std::span<const Real>(rhs.data(), rhs.size()));
+    const std::vector<double> rhs{double(2), double(4), double(6)};
+    const auto x = solver.solve(std::span<const double>(rhs.data(), rhs.size()));
     ASSERT_EQ(x.size(), 3u);
 
     for (std::size_t row = 0; row < 3u; ++row) {
-        Real ax = Real(0);
+        double ax = double(0);
         for (std::size_t col = 0; col < 3u; ++col) {
             ax += A[row * 3u + col] * x[col];
         }
-        EXPECT_NEAR(ax, rhs[row], Real(1.0e-12));
+        EXPECT_NEAR(ax, rhs[row], double(1.0e-12));
     }
 
-    std::vector<Real> second_rhs{Real(1), Real(-2), Real(0.5)};
+    std::vector<double> second_rhs{double(1), double(-2), double(0.5)};
     const auto original_second_rhs = second_rhs;
-    solver.solve_in_place(std::span<Real>(second_rhs.data(), second_rhs.size()));
+    solver.solve_in_place(std::span<double>(second_rhs.data(), second_rhs.size()));
     for (std::size_t row = 0; row < 3u; ++row) {
-        Real ax = Real(0);
+        double ax = double(0);
         for (std::size_t col = 0; col < 3u; ++col) {
             ax += A[row * 3u + col] * second_rhs[col];
         }
-        EXPECT_NEAR(ax, original_second_rhs[row], Real(1.0e-12));
+        EXPECT_NEAR(ax, original_second_rhs[row], double(1.0e-12));
     }
 }
 
 TEST(DenseLinearAlgebra, FactorizationSolvesDenseRightHandSideBlock) {
-    const std::vector<Real> A{
-        Real(4), Real(2), Real(0),
-        Real(2), Real(5), Real(1),
-        Real(0), Real(1), Real(3)
+    const std::vector<double> A{
+        double(4), double(2), double(0),
+        double(2), double(5), double(1),
+        double(0), double(1), double(3)
     };
 
     const auto solver = factor_dense_matrix(A, 3u, "symmetric 3x3 block");
 
-    std::vector<Real> rhs{
-        Real(2), Real(1),
-        Real(4), Real(-2),
-        Real(6), Real(0.5)
+    std::vector<double> rhs{
+        double(2), double(1),
+        double(4), double(-2),
+        double(6), double(0.5)
     };
     const auto original_rhs = rhs;
-    solver.solve_in_place(std::span<Real>(rhs.data(), rhs.size()), 2u);
+    solver.solve_in_place(std::span<double>(rhs.data(), rhs.size()), 2u);
 
     for (std::size_t rhs_col = 0; rhs_col < 2u; ++rhs_col) {
         for (std::size_t row = 0; row < 3u; ++row) {
-            Real ax = Real(0);
+            double ax = double(0);
             for (std::size_t col = 0; col < 3u; ++col) {
                 ax += A[row * 3u + col] * rhs[col * 2u + rhs_col];
             }
-            EXPECT_NEAR(ax, original_rhs[row * 2u + rhs_col], Real(1.0e-12));
+            EXPECT_NEAR(ax, original_rhs[row * 2u + rhs_col], double(1.0e-12));
         }
     }
 }
@@ -113,40 +113,40 @@ TEST(DenseLinearAlgebra, FactorizationSolvesDenseRightHandSideBlock) {
 // the inverse path used by SerendipityBasis, and the permutation replay in
 // solve_in_place.
 TEST(DenseLinearAlgebra, FactorizationPivotsThroughZeroLeadingDiagonal) {
-    const std::vector<Real> swap_2x2{
-        Real(0), Real(1),
-        Real(1), Real(0)
+    const std::vector<double> swap_2x2{
+        double(0), double(1),
+        double(1), double(0)
     };
 
     const auto solver = factor_dense_matrix(swap_2x2, 2u, "swap 2x2");
-    const std::vector<Real> rhs{Real(3), Real(7)};
-    const auto x = solver.solve(std::span<const Real>(rhs.data(), rhs.size()));
+    const std::vector<double> rhs{double(3), double(7)};
+    const auto x = solver.solve(std::span<const double>(rhs.data(), rhs.size()));
     ASSERT_EQ(x.size(), 2u);
-    EXPECT_NEAR(x[0], Real(7), Real(1.0e-14));
-    EXPECT_NEAR(x[1], Real(3), Real(1.0e-14));
+    EXPECT_NEAR(x[0], double(7), double(1.0e-14));
+    EXPECT_NEAR(x[1], double(3), double(1.0e-14));
 
     const auto inv = invert_dense_matrix(swap_2x2, 2u, "swap 2x2");
     for (std::size_t row = 0; row < 2u; ++row) {
         for (std::size_t col = 0; col < 2u; ++col) {
-            EXPECT_NEAR(inv[row * 2u + col], swap_2x2[row * 2u + col], Real(1.0e-14));
+            EXPECT_NEAR(inv[row * 2u + col], swap_2x2[row * 2u + col], double(1.0e-14));
         }
     }
 
     // Every column requires a row exchange during elimination.
-    const std::vector<Real> permuted_scaled{
-        Real(0), Real(0), Real(1), Real(0),
-        Real(1), Real(0), Real(0), Real(0),
-        Real(0), Real(0), Real(0), Real(2),
-        Real(0), Real(3), Real(0), Real(0)
+    const std::vector<double> permuted_scaled{
+        double(0), double(0), double(1), double(0),
+        double(1), double(0), double(0), double(0),
+        double(0), double(0), double(0), double(2),
+        double(0), double(3), double(0), double(0)
     };
 
     const auto inv4 = invert_dense_matrix(permuted_scaled, 4u, "permuted scaled 4x4");
     for (std::size_t row = 0; row < 4u; ++row) {
         for (std::size_t col = 0; col < 4u; ++col) {
-            const Real expected = (row == col) ? Real(1) : Real(0);
+            const double expected = (row == col) ? double(1) : double(0);
             EXPECT_NEAR(multiply_entry(permuted_scaled, inv4, 4u, row, col),
                         expected,
-                        Real(1.0e-14));
+                        double(1.0e-14));
         }
     }
 }
@@ -154,59 +154,59 @@ TEST(DenseLinearAlgebra, FactorizationPivotsThroughZeroLeadingDiagonal) {
 TEST(DenseLinearAlgebra, WideMultiRhsSolveWithPivoting) {
     // Requires a row swap in column 0 and uses a wide right-hand-side block to
     // exercise the row-interleaved multi-RHS layout end to end.
-    const std::vector<Real> A{
-        Real(0), Real(2), Real(1),
-        Real(4), Real(1), Real(0),
-        Real(1), Real(0), Real(3)
+    const std::vector<double> A{
+        double(0), double(2), double(1),
+        double(4), double(1), double(0),
+        double(1), double(0), double(3)
     };
     constexpr std::size_t kRhsCount = 33u;
 
     const auto solver = factor_dense_matrix(A, 3u, "pivoting 3x3");
 
-    std::vector<Real> rhs(3u * kRhsCount, Real(0));
+    std::vector<double> rhs(3u * kRhsCount, double(0));
     for (std::size_t row = 0; row < 3u; ++row) {
         for (std::size_t r = 0; r < kRhsCount; ++r) {
             rhs[row * kRhsCount + r] =
-                Real(1) + static_cast<Real>(row) - Real(0.25) * static_cast<Real>(r % 7u);
+                double(1) + static_cast<double>(row) - double(0.25) * static_cast<double>(r % 7u);
         }
     }
     const auto original_rhs = rhs;
 
-    solver.solve_in_place(std::span<Real>(rhs.data(), rhs.size()), kRhsCount);
+    solver.solve_in_place(std::span<double>(rhs.data(), rhs.size()), kRhsCount);
 
     for (std::size_t r = 0; r < kRhsCount; ++r) {
         for (std::size_t row = 0; row < 3u; ++row) {
-            Real ax = Real(0);
+            double ax = double(0);
             for (std::size_t col = 0; col < 3u; ++col) {
                 ax += A[row * 3u + col] * rhs[col * kRhsCount + r];
             }
-            EXPECT_NEAR(ax, original_rhs[row * kRhsCount + r], Real(1.0e-12))
+            EXPECT_NEAR(ax, original_rhs[row * kRhsCount + r], double(1.0e-12))
                 << "rhs column " << r << ", row " << row;
         }
     }
 }
 
 TEST(DenseLinearAlgebra, SolveInPlaceValidatesInputs) {
-    const std::vector<Real> identity{
-        Real(1), Real(0),
-        Real(0), Real(1)
+    const std::vector<double> identity{
+        double(1), double(0),
+        double(0), double(1)
     };
     const auto solver = factor_dense_matrix(identity, 2u, "identity 2x2");
 
-    std::vector<Real> rhs{Real(1), Real(2)};
-    EXPECT_THROW(solver.solve_in_place(std::span<Real>(rhs.data(), rhs.size()), 0u),
+    std::vector<double> rhs{double(1), double(2)};
+    EXPECT_THROW(solver.solve_in_place(std::span<double>(rhs.data(), rhs.size()), 0u),
                  FEException);
 
-    std::vector<Real> wrong_size{Real(1), Real(2), Real(3)};
+    std::vector<double> wrong_size{double(1), double(2), double(3)};
     EXPECT_THROW(
-        solver.solve_in_place(std::span<Real>(wrong_size.data(), wrong_size.size()), 1u),
+        solver.solve_in_place(std::span<double>(wrong_size.data(), wrong_size.size()), 1u),
         FEException);
 
     DenseLUSolver unfactored;
     unfactored.n = 2u;
     unfactored.label = "unfactored";
     EXPECT_FALSE(unfactored.empty());
-    EXPECT_THROW(unfactored.solve_in_place(std::span<Real>(rhs.data(), rhs.size()), 1u),
+    EXPECT_THROW(unfactored.solve_in_place(std::span<double>(rhs.data(), rhs.size()), 1u),
                  FEException);
 }
 
@@ -219,24 +219,24 @@ TEST(DenseLinearAlgebra, DiagnosticValidationRejectsRankMismatch) {
 }
 
 TEST(DenseLinearAlgebra, RankHandlesNonSquareMatrices) {
-    const std::vector<Real> wide_full{
-        Real(1), Real(0), Real(2),
-        Real(0), Real(1), Real(-1)
+    const std::vector<double> wide_full{
+        double(1), double(0), double(2),
+        double(0), double(1), double(-1)
     };
     EXPECT_EQ(dense_matrix_rank(wide_full, 2u, 3u), 2u);
 
-    const std::vector<Real> tall_rank_one{
-        Real(1), Real(2),
-        Real(2), Real(4),
-        Real(3), Real(6)
+    const std::vector<double> tall_rank_one{
+        double(1), double(2),
+        double(2), double(4),
+        double(3), double(6)
     };
     EXPECT_EQ(dense_matrix_rank(tall_rank_one, 3u, 2u), 1u);
 }
 
 TEST(DenseLinearAlgebra, HighConditionInverseUsesSvdFallback) {
-    const std::vector<Real> high_condition{
-        Real(1), Real(0),
-        Real(0), Real(1.0e-13)
+    const std::vector<double> high_condition{
+        double(1), double(0),
+        double(0), double(1.0e-13)
     };
 
     const auto result =
@@ -248,10 +248,10 @@ TEST(DenseLinearAlgebra, HighConditionInverseUsesSvdFallback) {
 
     for (std::size_t row = 0; row < 2u; ++row) {
         for (std::size_t col = 0; col < 2u; ++col) {
-            const Real expected = (row == col) ? Real(1) : Real(0);
+            const double expected = (row == col) ? double(1) : double(0);
             EXPECT_NEAR(multiply_entry(high_condition, result.inverse, 2u, row, col),
                         expected,
-                        Real(1.0e-12));
+                        double(1.0e-12));
         }
     }
 }
@@ -260,7 +260,7 @@ TEST(DenseLinearAlgebra, DiagnosticValidationRejectsUnsupportedCondition) {
     DenseInverseResult result;
     result.diagnostics.rank = 2u;
     result.diagnostics.condition_estimate =
-        dense_matrix_condition_error_threshold() * Real(10);
+        dense_matrix_condition_error_threshold() * double(10);
 
     EXPECT_GT(result.diagnostics.condition_estimate,
               dense_matrix_condition_error_threshold());
@@ -270,9 +270,9 @@ TEST(DenseLinearAlgebra, DiagnosticValidationRejectsUnsupportedCondition) {
 }
 
 TEST(DenseLinearAlgebra, ThrowsForScaleAwareSingularPivot) {
-    const std::vector<Real> singular{
-        Real(1.0e12), Real(2.0e12),
-        Real(0.5e12), Real(1.0e12)
+    const std::vector<double> singular{
+        double(1.0e12), double(2.0e12),
+        double(0.5e12), double(1.0e12)
     };
 
     EXPECT_THROW((void)invert_dense_matrix(singular, 2u, "singular 2x2"),
@@ -280,9 +280,9 @@ TEST(DenseLinearAlgebra, ThrowsForScaleAwareSingularPivot) {
 }
 
 TEST(DenseLinearAlgebra, FactorizationThrowsForRankDeficientMatrix) {
-    const std::vector<Real> singular{
-        Real(1), Real(2),
-        Real(2), Real(4)
+    const std::vector<double> singular{
+        double(1), double(2),
+        double(2), double(4)
     };
 
     EXPECT_THROW((void)factor_dense_matrix(singular, 2u, "rank-one 2x2"),
@@ -290,34 +290,34 @@ TEST(DenseLinearAlgebra, FactorizationThrowsForRankDeficientMatrix) {
 }
 
 TEST(DenseLinearAlgebra, RankUsesScaleAwareTolerance) {
-    const std::vector<Real> rank_one{
-        Real(1.0e8), Real(2.0e8),
-        Real(3.0e8), Real(6.0e8)
+    const std::vector<double> rank_one{
+        double(1.0e8), double(2.0e8),
+        double(3.0e8), double(6.0e8)
     };
     EXPECT_EQ(dense_matrix_rank(rank_one, 2u, 2u), 1u);
 
-    const std::vector<Real> full_rank{
-        Real(1.0e8), Real(2.0e8),
-        Real(3.0e8), Real(6.1e8)
+    const std::vector<double> full_rank{
+        double(1.0e8), double(2.0e8),
+        double(3.0e8), double(6.1e8)
     };
     EXPECT_EQ(dense_matrix_rank(full_rank, 2u, 2u), 2u);
 }
 
 TEST(DenseLinearAlgebra, DiagnosticsReportRankAndConditionEstimate) {
-    const std::vector<Real> diagonal{
-        Real(4), Real(0),
-        Real(0), Real(0.5)
+    const std::vector<double> diagonal{
+        double(4), double(0),
+        double(0), double(0.5)
     };
     const auto full =
         dense_matrix_diagnostics(diagonal, 2u, 2u, "diagonal 2x2");
     EXPECT_EQ(full.rank, 2u);
-    EXPECT_NEAR(full.largest_singular_value, Real(4), Real(1.0e-14));
-    EXPECT_NEAR(full.smallest_retained_singular_value, Real(0.5), Real(1.0e-14));
-    EXPECT_NEAR(full.condition_estimate, Real(8), Real(1.0e-14));
+    EXPECT_NEAR(full.largest_singular_value, double(4), double(1.0e-14));
+    EXPECT_NEAR(full.smallest_retained_singular_value, double(0.5), double(1.0e-14));
+    EXPECT_NEAR(full.condition_estimate, double(8), double(1.0e-14));
 
-    const std::vector<Real> rank_one{
-        Real(1), Real(2),
-        Real(2), Real(4)
+    const std::vector<double> rank_one{
+        double(1), double(2),
+        double(2), double(4)
     };
     const auto deficient =
         dense_matrix_diagnostics(rank_one, 2u, 2u, "rank-one 2x2");
@@ -326,20 +326,20 @@ TEST(DenseLinearAlgebra, DiagnosticsReportRankAndConditionEstimate) {
 }
 
 TEST(DenseLinearAlgebra, PseudoInverseHandlesSingularMatrixWithoutNormalEquations) {
-    const std::vector<Real> rank_one{
-        Real(1), Real(2),
-        Real(2), Real(4)
+    const std::vector<double> rank_one{
+        double(1), double(2),
+        double(2), double(4)
     };
 
     const auto pinv =
         rank_revealing_pseudo_inverse(rank_one, 2u, 2u, "rank-one 2x2");
     EXPECT_EQ(pinv.rank, 1u);
-    EXPECT_NEAR(pinv.inverse[0], Real(0.04), Real(1.0e-13));
-    EXPECT_NEAR(pinv.inverse[1], Real(0.08), Real(1.0e-13));
-    EXPECT_NEAR(pinv.inverse[2], Real(0.08), Real(1.0e-13));
-    EXPECT_NEAR(pinv.inverse[3], Real(0.16), Real(1.0e-13));
+    EXPECT_NEAR(pinv.inverse[0], double(0.04), double(1.0e-13));
+    EXPECT_NEAR(pinv.inverse[1], double(0.08), double(1.0e-13));
+    EXPECT_NEAR(pinv.inverse[2], double(0.08), double(1.0e-13));
+    EXPECT_NEAR(pinv.inverse[3], double(0.16), double(1.0e-13));
 
-    std::vector<Real> projection(4u, Real(0));
+    std::vector<double> projection(4u, double(0));
     for (std::size_t row = 0; row < 2u; ++row) {
         for (std::size_t col = 0; col < 2u; ++col) {
             for (std::size_t a = 0; a < 2u; ++a) {
@@ -351,23 +351,23 @@ TEST(DenseLinearAlgebra, PseudoInverseHandlesSingularMatrixWithoutNormalEquation
             }
             EXPECT_NEAR(projection[row * 2u + col],
                         rank_one[row * 2u + col],
-                        Real(1.0e-12));
+                        double(1.0e-12));
         }
     }
 }
 
 TEST(DenseLinearAlgebra, PseudoInverseDropsNearZeroSingularValues) {
-    const std::vector<Real> near_singular{
-        Real(1), Real(0),
-        Real(0), Real(1.0e-18)
+    const std::vector<double> near_singular{
+        double(1), double(0),
+        double(0), double(1.0e-18)
     };
 
     const auto pinv =
         rank_revealing_pseudo_inverse(near_singular, 2u, 2u, "near-singular 2x2");
     EXPECT_EQ(pinv.rank, 1u);
-    EXPECT_GT(pinv.tolerance, Real(1.0e-18));
-    EXPECT_NEAR(pinv.inverse[0], Real(1), Real(1.0e-14));
-    EXPECT_NEAR(pinv.inverse[1], Real(0), Real(1.0e-14));
-    EXPECT_NEAR(pinv.inverse[2], Real(0), Real(1.0e-14));
-    EXPECT_NEAR(pinv.inverse[3], Real(0), Real(1.0e-14));
+    EXPECT_GT(pinv.tolerance, double(1.0e-18));
+    EXPECT_NEAR(pinv.inverse[0], double(1), double(1.0e-14));
+    EXPECT_NEAR(pinv.inverse[1], double(0), double(1.0e-14));
+    EXPECT_NEAR(pinv.inverse[2], double(0), double(1.0e-14));
+    EXPECT_NEAR(pinv.inverse[3], double(0), double(1.0e-14));
 }

From 113984e86a8f03dc2b854765e66ae1156abac3d5 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Mon, 22 Jun 2026 11:48:55 -0700
Subject: [PATCH 44/91] using cmake c++ standard variable for the Trilinos
 flags

---
 Code/Source/solver/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Code/Source/solver/CMakeLists.txt b/Code/Source/solver/CMakeLists.txt
index eace4d0b2..bac65c976 100644
--- a/Code/Source/solver/CMakeLists.txt
+++ b/Code/Source/solver/CMakeLists.txt
@@ -89,7 +89,7 @@ endif()
 # add trilinos flags and defines
 if(USE_TRILINOS)
   ADD_DEFINITIONS(-DWITH_TRILINOS)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++20")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++${CMAKE_CXX_STANDARD}")
 endif()
 
 # Build with the PETSc linear algebra package.

From 661c1d3a6abcf6ddfca65836d69b70762b15fca6 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Mon, 22 Jun 2026 12:52:45 -0700
Subject: [PATCH 45/91] tet/wedge no longer build and discard per-node Hessians

---
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp | 164 +++++++++++-------
 1 file changed, 106 insertions(+), 58 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index 0233eb18a..42f052579 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -138,8 +138,13 @@ LagrangeBasis::SimplexExponent simplex_exponent_from_point(const Vec3& p,
 // Sentinel node index meaning "skip nothing" in product_excluding below.
 constexpr std::size_t kNoSkip = std::numeric_limits<std::size_t>::max();
 
-// Evaluate 1D Lagrange polynomials and derivatives at a point.
-void evaluate_1d_lagrange(double x, const std::vector<double>& nodes, AxisEval& out) {
+// Evaluate 1D Lagrange polynomials and derivatives at a point. `level` selects
+// how many derivative orders to compute: 0 for values only, 1 to also fill the
+// first derivative, and 2 to also fill the second. The output arrays stay sized
+// at n regardless of level so the tensor-product assembly can index them
+// unconditionally; only the higher-order computation loops are skipped.
+void evaluate_1d_lagrange(double x, const std::vector<double>& nodes, AxisEval& out,
+                          int level) {
     const std::size_t n = nodes.size();
     out.value.assign(n, double(0));
     out.first.assign(n, double(0));
@@ -173,31 +178,37 @@ void evaluate_1d_lagrange(double x, const std::vector<double>& nodes, AxisEval&
 
         out.value[i] = product_excluding() / denom;
 
-        double first = double(0);
-        for (std::size_t m = 0; m < n; ++m) {
-            if (m != i) {
-                first += product_excluding(m);
+        if (level >= 1) {
+            double first = double(0);
+            for (std::size_t m = 0; m < n; ++m) {
+                if (m != i) {
+                    first += product_excluding(m);
+                }
             }
+            out.first[i] = first / denom;
         }
-        out.first[i] = first / denom;
 
-        double second = double(0);
-        for (std::size_t m = 0; m < n; ++m) {
-            if (m == i) {
-                continue;
-            }
-            for (std::size_t l = 0; l < n; ++l) {
-                if (l != i && l != m) {
-                    second += product_excluding(m, l);
+        if (level >= 2) {
+            double second = double(0);
+            for (std::size_t m = 0; m < n; ++m) {
+                if (m == i) {
+                    continue;
+                }
+                for (std::size_t l = 0; l < n; ++l) {
+                    if (l != i && l != m) {
+                        second += product_excluding(m, l);
+                    }
                 }
             }
+            out.second[i] = second / denom;
         }
-        out.second[i] = second / denom;
     }
 }
 
-// Evaluate one barycentric polynomial factor and derivatives.
-std::array<double, 3> simplex_factor(int alpha, double lambda, int order) {
+// Evaluate one barycentric polynomial factor and its derivatives. `level`
+// selects how far the recurrence runs: 0 for the value only, 1 to also produce
+// the first derivative, and 2 to also produce the second.
+std::array<double, 3> simplex_factor(int alpha, double lambda, int order, int level) {
     double value = double(1);
     double first = double(0);
     double second = double(0);
@@ -209,29 +220,42 @@ std::array<double, 3> simplex_factor(int alpha, double lambda, int order) {
         const double old_first = first;
         const double old_second = second;
         value = old_value * factor * inv;
-        first = (old_first * factor + old_value * double(order)) * inv;
-        second = (old_second * factor + double(2) * old_first * double(order)) * inv;
+        if (level >= 1) {
+            first = (old_first * factor + old_value * double(order)) * inv;
+        }
+        if (level >= 2) {
+            second = (old_second * factor + double(2) * old_first * double(order)) * inv;
+        }
     }
 
     return {value, first, second};
 }
 
-// Evaluate simplex Lagrange basis functions and derivatives.
+// Evaluate simplex Lagrange basis functions and the requested derivatives.
+// Gradients and Hessians are assembled only when asked for; `out.gradient` and
+// `out.hessian` are left empty otherwise so a values-only request neither
+// allocates those buffers nor runs the derivative loops.
 void evaluate_simplex(const Vec3& xi,
                       BasisTopology top,
                       int order,
                       const std::vector<LagrangeBasis::SimplexExponent>& exponents,
-                      SimplexEval& out) {
+                      SimplexEval& out,
+                      bool want_gradient,
+                      bool want_hessian) {
     const std::size_t n = exponents.size();
     out.value.assign(n, double(0));
-    out.gradient.assign(n, Gradient::Zero());
-    out.hessian.assign(n, Hessian::Zero());
+    out.gradient.assign(want_gradient ? n : std::size_t{0}, Gradient::Zero());
+    out.hessian.assign(want_hessian ? n : std::size_t{0}, Hessian::Zero());
 
     if (n == 1u && order == 0) {
         out.value[0] = double(1);
         return;
     }
 
+    // A Hessian factor also needs the first-derivative recurrence, so the
+    // per-factor work runs to the highest requested order.
+    const int factor_level = want_hessian ? 2 : (want_gradient ? 1 : 0);
+
     const std::size_t bary_count = top == BasisTopology::Triangle ? 3u : 4u;
     std::array<double, 4> lambda{double(0), double(0), double(0), double(0)};
     std::array<Gradient, 4> lambda_grad;
@@ -257,7 +281,7 @@ void evaluate_simplex(const Vec3& xi,
     for (std::size_t i = 0; i < n; ++i) {
         std::array<std::array<double, 3>, 4> f{};
         for (std::size_t a = 0; a < bary_count; ++a) {
-            f[a] = simplex_factor(exponents[i][a], lambda[a], order);
+            f[a] = simplex_factor(exponents[i][a], lambda[a], order, factor_level);
         }
 
         double value = double(1);
@@ -266,30 +290,34 @@ void evaluate_simplex(const Vec3& xi,
         }
         out.value[i] = value;
 
-        for (std::size_t a = 0; a < bary_count; ++a) {
-            double product = f[a][1];
-            for (std::size_t b = 0; b < bary_count; ++b) {
-                if (b != a) {
-                    product *= f[b][0];
+        if (want_gradient) {
+            for (std::size_t a = 0; a < bary_count; ++a) {
+                double product = f[a][1];
+                for (std::size_t b = 0; b < bary_count; ++b) {
+                    if (b != a) {
+                        product *= f[b][0];
+                    }
+                }
+                for (std::size_t c = 0; c < 3u; ++c) {
+                    out.gradient[i][c] += product * lambda_grad[a][c];
                 }
-            }
-            for (std::size_t c = 0; c < 3u; ++c) {
-                out.gradient[i][c] += product * lambda_grad[a][c];
             }
         }
 
-        for (std::size_t a = 0; a < bary_count; ++a) {
-            for (std::size_t b = 0; b < bary_count; ++b) {
-                double product = (a == b) ? f[a][2] : f[a][1] * f[b][1];
-                for (std::size_t k = 0; k < bary_count; ++k) {
-                    if (k != a && k != b) {
-                        product *= f[k][0];
+        if (want_hessian) {
+            for (std::size_t a = 0; a < bary_count; ++a) {
+                for (std::size_t b = 0; b < bary_count; ++b) {
+                    double product = (a == b) ? f[a][2] : f[a][1] * f[b][1];
+                    for (std::size_t k = 0; k < bary_count; ++k) {
+                        if (k != a && k != b) {
+                            product *= f[k][0];
+                        }
                     }
-                }
-                for (std::size_t r = 0; r < 3u; ++r) {
-                    for (std::size_t c = 0; c < 3u; ++c) {
-                        out.hessian[i](r, c) +=
-                            product * lambda_grad[a][r] * lambda_grad[b][c];
+                    for (std::size_t r = 0; r < 3u; ++r) {
+                        for (std::size_t c = 0; c < 3u; ++c) {
+                            out.hessian[i](r, c) +=
+                                product * lambda_grad[a][r] * lambda_grad[b][c];
+                        }
                     }
                 }
             }
@@ -430,15 +458,17 @@ void LagrangeBasis::evaluate_tensor_product_to(const Vec3& xi,
                                                std::span<double> values_out,
                                                std::span<Gradient> gradients_out,
                                                std::span<Hessian> hessians_out) const {
+    const int level = !hessians_out.empty() ? 2 : (!gradients_out.empty() ? 1 : 0);
+
     AxisEval ax;
     AxisEval ay;
     AxisEval az;
-    evaluate_1d_lagrange(xi[0], nodes_1d_, ax);
+    evaluate_1d_lagrange(xi[0], nodes_1d_, ax, level);
     if (dimension_ >= 2) {
-        evaluate_1d_lagrange(xi[1], nodes_1d_, ay);
+        evaluate_1d_lagrange(xi[1], nodes_1d_, ay, level);
     }
     if (dimension_ >= 3) {
-        evaluate_1d_lagrange(xi[2], nodes_1d_, az);
+        evaluate_1d_lagrange(xi[2], nodes_1d_, az, level);
     }
 
     for (std::size_t node = 0; node < tensor_indices_.size(); ++node) {
@@ -482,16 +512,21 @@ void LagrangeBasis::evaluate_simplex_to(const Vec3& xi,
                                         std::span<double> values_out,
                                         std::span<Gradient> gradients_out,
                                         std::span<Hessian> hessians_out) const {
+    const bool want_values = !values_out.empty();
+    const bool want_gradients = !gradients_out.empty();
+    const bool want_hessians = !hessians_out.empty();
+
     SimplexEval simplex;
-    evaluate_simplex(xi, topology_, order_, simplex_exponents_, simplex);
+    evaluate_simplex(xi, topology_, order_, simplex_exponents_, simplex,
+                     want_gradients, want_hessians);
     for (std::size_t i = 0; i < simplex.value.size(); ++i) {
-        if (!values_out.empty()) {
+        if (want_values) {
             values_out[i] = simplex.value[i];
         }
-        if (!gradients_out.empty()) {
+        if (want_gradients) {
             gradients_out[i] = simplex.gradient[i];
         }
-        if (!hessians_out.empty()) {
+        if (want_hessians) {
             hessians_out[i] = simplex.hessian[i];
         }
     }
@@ -502,28 +537,41 @@ void LagrangeBasis::evaluate_wedge_to(const Vec3& xi,
                                       std::span<double> values_out,
                                       std::span<Gradient> gradients_out,
                                       std::span<Hessian> hessians_out) const {
+    const bool want_values = !values_out.empty();
+    const bool want_gradients = !gradients_out.empty();
+    const bool want_hessians = !hessians_out.empty();
+
+    // The wedge gradient pairs the triangle gradient with the through-axis value,
+    // and the wedge Hessian reuses the triangle gradient for its mixed terms, so
+    // the triangle factor must supply gradients whenever the wedge needs either
+    // gradients or Hessians.
+    const bool want_tri_gradient = want_gradients || want_hessians;
+    const int z_level = want_hessians ? 2 : (want_gradients ? 1 : 0);
+
     SimplexEval tri;
     AxisEval z_axis;
-    evaluate_simplex(xi, BasisTopology::Triangle, order_, simplex_exponents_, tri);
-    evaluate_1d_lagrange(xi[2], nodes_1d_, z_axis);
+    evaluate_simplex(xi, BasisTopology::Triangle, order_, simplex_exponents_, tri,
+                     want_tri_gradient, want_hessians);
+    evaluate_1d_lagrange(xi[2], nodes_1d_, z_axis, z_level);
 
     for (std::size_t node = 0; node < wedge_indices_.size(); ++node) {
         const auto [tri_idx, z_idx] = wedge_indices_[node];
         const double tv = tri.value[tri_idx];
         const double zv = z_axis.value[z_idx];
-        const double dz = z_axis.first[z_idx];
-        const double d2z = z_axis.second[z_idx];
 
-        if (!values_out.empty()) {
+        if (want_values) {
             values_out[node] = tv * zv;
         }
-        if (!gradients_out.empty()) {
+        if (want_gradients) {
+            const double dz = z_axis.first[z_idx];
             Gradient& g = gradients_out[node];
             g[0] = tri.gradient[tri_idx][0] * zv;
             g[1] = tri.gradient[tri_idx][1] * zv;
             g[2] = tv * dz;
         }
-        if (!hessians_out.empty()) {
+        if (want_hessians) {
+            const double dz = z_axis.first[z_idx];
+            const double d2z = z_axis.second[z_idx];
             Hessian& h = hessians_out[node];
             const Hessian& th = tri.hessian[tri_idx];
             const Gradient& tg = tri.gradient[tri_idx];

From 361edebbf2626d51601385a37108b642add09ad7 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Mon, 22 Jun 2026 15:09:25 -0700
Subject: [PATCH 46/91] reword the doc comment to match intent for numerical
 gradient and hessian

---
 Code/Source/solver/FE/Basis/BasisFunction.h | 28 +++++++++++----------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index be6418c50..583f6da2f 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -263,27 +263,29 @@ class BasisFunction {
 protected:
     /// \brief Approximate gradients by centered finite differences of values.
     ///
-    /// \details This helper exists as a development and fallback utility for
-    /// basis implementations that do not yet provide analytical gradients. It
-    /// is useful for prototyping new basis families and for checking analytical
-    /// derivative formulas in tests. Production element assembly should prefer
-    /// analytical gradients when available because finite differences introduce
+    /// \details This helper is primarily a verification utility for tests: it
+    /// provides a basis-independent reference that checks a concrete basis's
+    /// analytical evaluate_gradients() against centered finite differences of
+    /// evaluate_values(). It lives on the base class so any BasisFunction can be
+    /// checked uniformly, and having no production caller is by design — every
+    /// shipped basis supplies analytical gradients. Centered differences add
     /// truncation/roundoff sensitivity and require multiple value evaluations
-    /// per reference coordinate.
+    /// per reference coordinate, so analytical gradients are always preferred
+    /// outside this testing context.
     void numerical_gradient(const math::Vector<double, 3>& xi,
                             std::vector<Gradient>& gradients,
                             double eps = double(1e-6)) const;
 
     /// \brief Approximate Hessians by centered finite differences of gradients.
     ///
-    /// \details This helper exists for the same reason as numerical_gradient:
-    /// it provides a simple reference implementation for prototyping and
-    /// derivative verification when analytical second derivatives are not yet
-    /// implemented. It depends on evaluate_gradients(), so it is only available
-    /// for basis implementations that can already provide gradients. Analytical
-    /// Hessians should be used in performance-sensitive solver paths because
+    /// \details Companion verification utility to numerical_gradient: it checks
+    /// a basis's analytical evaluate_hessians() against centered finite
+    /// differences of evaluate_gradients(). Because it differentiates gradients,
+    /// it is only meaningful for bases that already provide them. Like
+    /// numerical_gradient it is test-support rather than a production fallback —
     /// finite-difference Hessians amplify numerical error and require repeated
-    /// gradient evaluations.
+    /// gradient evaluations, so analytical Hessians are used everywhere outside
+    /// tests.
     void numerical_hessian(const math::Vector<double, 3>& xi,
                            std::vector<Hessian>& hessians,
                            double eps = double(1e-5)) const;

From c0ecbdb951549c4ef2a805da6a844b229bdeada6 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Mon, 22 Jun 2026 15:39:48 -0700
Subject: [PATCH 47/91] removing code that is no longer within the production
 code for basis functions

---
 Code/Source/solver/FE/Basis/BasisFunction.h   | 16 -------
 Code/Source/solver/FE/Basis/BasisTraits.h     | 16 -------
 Code/Source/solver/FE/Common/Types.h          | 42 -------------------
 .../FE/Basis/test_BasisErrorPaths.cpp         | 18 ++++++++
 .../FE/Basis/test_ConstexprBasis.cpp          |  9 ----
 5 files changed, 18 insertions(+), 83 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index 583f6da2f..5b7d22a67 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -133,22 +133,6 @@ using Gradient = math::Vector<double, 3>;
 /// \brief Hessian matrix type used by basis evaluators.
 using Hessian  = math::Matrix<double, 3, 3>;
 
-[[nodiscard]] inline Hessian make_symmetric_hessian(double xx,
-                                                    double yy,
-                                                    double zz,
-                                                    double xy,
-                                                    double xz,
-                                                    double yz) {
-    Hessian hessian = Hessian::Zero();
-    hessian(0, 0) = xx;
-    hessian(1, 1) = yy;
-    hessian(2, 2) = zz;
-    hessian(0, 1) = hessian(1, 0) = xy;
-    hessian(0, 2) = hessian(2, 0) = xz;
-    hessian(1, 2) = hessian(2, 1) = yz;
-    return hessian;
-}
-
 /// \brief Throw BasisEvaluationException when an output span is smaller than the
 /// basis size. \p label is the full "Class::method" context used in the message,
 /// so each basis family passes its own qualified name.
diff --git a/Code/Source/solver/FE/Basis/BasisTraits.h b/Code/Source/solver/FE/Basis/BasisTraits.h
index c9df8789c..9c5e0945d 100644
--- a/Code/Source/solver/FE/Basis/BasisTraits.h
+++ b/Code/Source/solver/FE/Basis/BasisTraits.h
@@ -106,22 +106,6 @@ namespace detail {
     return topology(type) == BasisTopology::Wedge;
 }
 
-// Pyramids are a valid mesh cell family but not a supported basis topology, so
-// this classifier reads the mesh family directly: topology() maps pyramids to
-// Unknown, yet a truthful is_pyramid keeps the predicate set complete and ready
-// for future pyramid support.
-[[nodiscard]] constexpr bool is_pyramid(ElementType type) noexcept {
-    return to_mesh_family(type) == CellFamily::Pyramid;
-}
-
-[[nodiscard]] constexpr bool is_simplex(ElementType type) noexcept {
-    return is_triangle(type) || is_tetrahedron(type);
-}
-
-[[nodiscard]] constexpr bool is_tensor_product(ElementType type) noexcept {
-    return is_line(type) || is_quadrilateral(type) || is_hexahedron(type);
-}
-
 [[nodiscard]] constexpr int reference_dimension(ElementType type) noexcept {
     return element_dimension(type);
 }
diff --git a/Code/Source/solver/FE/Common/Types.h b/Code/Source/solver/FE/Common/Types.h
index 38afe4086..f085e7e8f 100644
--- a/Code/Source/solver/FE/Common/Types.h
+++ b/Code/Source/solver/FE/Common/Types.h
@@ -307,25 +307,6 @@ enum class AssemblyStrategy : std::uint8_t {
     Hybrid             ///< Mixed strategy
 };
 
-/**
- * @brief Status codes for FE operations
- */
-enum class FEStatus : std::uint8_t {
-    Success           = 0,    ///< Operation completed successfully
-    InvalidArgument   = 1,    ///< An argument failed validation
-    InvalidElement    = 2,    ///< Unsupported or malformed element
-    SingularMapping   = 3,    ///< Element mapping Jacobian is singular
-    QuadratureError   = 4,    ///< Quadrature rule construction or evaluation failed
-    AssemblyError     = 5,    ///< Global assembly failure
-    BackendError      = 6,    ///< Linear-algebra backend failure
-    NotImplemented    = 7,    ///< Requested feature is not implemented
-    ConvergenceError  = 8,    ///< Iterative process failed to converge
-    AllocationError   = 9,    ///< Memory allocation failure
-    MPIError          = 10,   ///< MPI communication failure
-    IOError           = 11,   ///< File or stream I/O failure
-    Unknown           = 255   ///< Unclassified error
-};
-
 // ============================================================================
 // Geometric Types
 // ============================================================================
@@ -545,29 +526,6 @@ constexpr int element_dimension(ElementType elem) noexcept {
     }
 }
 
-/**
- * @brief Convert status code to string for error reporting
- * @param status Status code to describe.
- * @return Static human-readable description of the status.
- */
-inline const char* status_to_string(FEStatus status) noexcept {
-    switch(status) {
-        case FEStatus::Success:          return "Success";
-        case FEStatus::InvalidArgument:  return "Invalid argument";
-        case FEStatus::InvalidElement:   return "Invalid element";
-        case FEStatus::SingularMapping:  return "Singular mapping";
-        case FEStatus::QuadratureError:  return "Quadrature error";
-        case FEStatus::AssemblyError:    return "Assembly error";
-        case FEStatus::BackendError:     return "Backend error";
-        case FEStatus::NotImplemented:   return "Not implemented";
-        case FEStatus::ConvergenceError: return "Convergence error";
-        case FEStatus::AllocationError:  return "Allocation error";
-        case FEStatus::MPIError:         return "MPI error";
-        case FEStatus::IOError:          return "I/O error";
-        default:                         return "Unknown error";
-    }
-}
-
 /// @}
 
 } // namespace FE
diff --git a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
index 0a3048d65..05b657a7a 100644
--- a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
@@ -20,6 +20,24 @@ using namespace svmp::FE::basis;
 
 namespace {
 
+// Build a symmetric 3x3 Hessian from its six independent components. Local to
+// this test; the production basis evaluators fill Hessians directly.
+[[nodiscard]] Hessian make_symmetric_hessian(double xx,
+                                             double yy,
+                                             double zz,
+                                             double xy,
+                                             double xz,
+                                             double yz) {
+    Hessian hessian = Hessian::Zero();
+    hessian(0, 0) = xx;
+    hessian(1, 1) = yy;
+    hessian(2, 2) = zz;
+    hessian(0, 1) = hessian(1, 0) = xy;
+    hessian(0, 2) = hessian(2, 0) = xz;
+    hessian(1, 2) = hessian(2, 1) = yz;
+    return hessian;
+}
+
 class MinimalScalarBasis : public BasisFunction {
 public:
     BasisType basis_type() const noexcept override { return BasisType::Lagrange; }
diff --git a/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp b/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
index 6f7d67809..b1215c354 100644
--- a/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
@@ -25,15 +25,6 @@ static_assert(is_quadrilateral(ElementType::Quad8));
 static_assert(is_tetrahedron(ElementType::Tetra10));
 static_assert(is_hexahedron(ElementType::Hex20));
 static_assert(is_wedge(ElementType::Wedge18));
-static_assert(is_pyramid(ElementType::Pyramid5));
-static_assert(is_pyramid(ElementType::Pyramid14));
-static_assert(is_simplex(ElementType::Triangle3));
-static_assert(is_simplex(ElementType::Tetra4));
-static_assert(!is_simplex(ElementType::Wedge6));
-static_assert(is_tensor_product(ElementType::Line2));
-static_assert(is_tensor_product(ElementType::Quad9));
-static_assert(is_tensor_product(ElementType::Hex27));
-static_assert(!is_tensor_product(ElementType::Wedge6));
 static_assert(topology(ElementType::Pyramid5) == BasisTopology::Unknown);
 static_assert(canonical_lagrange_type(ElementType::Hex27) == ElementType::Hex8);
 static_assert(canonical_lagrange_type(ElementType::Pyramid13) == ElementType::Pyramid13);

From 5bd59f611a13e3ae8d65df72d5bae46edc23c175 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Tue, 23 Jun 2026 11:29:15 -0700
Subject: [PATCH 48/91] updating doxygen documentation to use the block comment
 style

---
 Code/Source/solver/FE/Basis/BasisFactory.h    |  44 +-
 Code/Source/solver/FE/Basis/BasisFunction.h   | 424 ++++++++++--------
 Code/Source/solver/FE/Basis/LagrangeBasis.h   | 310 +++++++------
 .../solver/FE/Basis/NodeOrderingConventions.h |  10 +-
 .../Source/solver/FE/Basis/SerendipityBasis.h | 332 +++++++-------
 Code/Source/solver/FE/Common/FEException.h    | 256 ++++++-----
 Code/Source/solver/FE/Common/Types.h          | 150 ++++---
 Code/Source/solver/FE/FE.h                    |  30 +-
 Code/Source/solver/FE/Math/Matrix.h           |   8 +-
 Code/Source/solver/FE/Math/Vector.h           |  24 +-
 10 files changed, 876 insertions(+), 712 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisFactory.h b/Code/Source/solver/FE/Basis/BasisFactory.h
index 2e1154c10..252a76226 100644
--- a/Code/Source/solver/FE/Basis/BasisFactory.h
+++ b/Code/Source/solver/FE/Basis/BasisFactory.h
@@ -38,28 +38,32 @@ namespace basis_factory {
 
 [[nodiscard]] std::shared_ptr<BasisFunction> create(const BasisRequest& req);
 
-/// \brief Return the default basis request (family and order) for an element type.
-///
-/// \details This is the single source of truth for which basis family and
-/// polynomial order a given element type uses by default: serendipity node
-/// layouts (Quad8, Hex20, Wedge15) select the quadratic serendipity family,
-/// and every complete Lagrange element selects the Lagrange family at the
-/// order given by its node layout. Solver-facing adapters should translate
-/// their element names to ElementType and delegate the basis choice here
-/// rather than tabulating family/order themselves.
-///
-/// \param element_type Element type to select a default basis for.
-/// \return Basis request suitable for create().
-/// \throws BasisElementCompatibilityException If no default basis is defined
-///         for the element type.
+/**
+ * @brief Return the default basis request (family and order) for an element type.
+ *
+ * @details This is the single source of truth for which basis family and
+ * polynomial order a given element type uses by default: serendipity node
+ * layouts (Quad8, Hex20, Wedge15) select the quadratic serendipity family,
+ * and every complete Lagrange element selects the Lagrange family at the
+ * order given by its node layout. Solver-facing adapters should translate
+ * their element names to ElementType and delegate the basis choice here
+ * rather than tabulating family/order themselves.
+ *
+ * @param element_type Element type to select a default basis for.
+ * @return Basis request suitable for create().
+ * @throws BasisElementCompatibilityException If no default basis is defined
+ *         for the element type.
+ */
 [[nodiscard]] BasisRequest default_basis_request(ElementType element_type);
 
-/// \brief Create the default basis for an element type.
-///
-/// \details Equivalent to create(default_basis_request(element_type)).
-///
-/// \param element_type Element type to create a default basis for.
-/// \return Shared basis instance.
+/**
+ * @brief Create the default basis for an element type.
+ *
+ * @details Equivalent to create(default_basis_request(element_type)).
+ *
+ * @param element_type Element type to create a default basis for.
+ * @return Shared basis instance.
+ */
 [[nodiscard]] std::shared_ptr<BasisFunction> create_default_for(ElementType element_type);
 
 } // namespace basis_factory
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index 5b7d22a67..6144b4274 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -13,133 +13,139 @@
 #include <span>
 #include <vector>
 
-/// \defgroup FE_Basis Basis
-/// \ingroup FE
-/// \brief Basis-function interfaces, concrete basis families, and reference-node conventions.
-///
-/// \details
-/// ## Scope
-///
-/// The Basis module owns reference-element shape functions. It provides the
-/// number of basis functions and the values and derivatives,
-/// \f$N_i\f$, \f$\partial N_i / \partial \xi_j\f$, and
-/// \f$\partial^2 N_i / \partial \xi_j \partial \xi_k\f$ at reference
-/// points. It does not own mesh storage, quadrature selection, field
-/// formulation policy, or transformation of derivatives to physical
-/// coordinates. Those decisions stay with the solver layer that has the mesh,
-/// material model, and equation context.
-///
-/// The main pieces are:
-/// - BasisFunction (BasisFunction.h): the abstract query and evaluation
-///   contract for code that does not need to know the concrete family.
-/// - \ref FE_LagrangeBasis "LagrangeBasis" and
-///   \ref FE_SerendipityBasis "SerendipityBasis": the implemented nodal
-///   families, including analytical first and second derivatives in reference
-///   coordinates.
-/// - basis_factory (BasisFactory.h): runtime construction from a BasisRequest.
-///   basis_factory::default_basis_request() centralizes the family/order that
-///   matches each supported element's public node layout.
-/// - ReferenceNodeLayout (NodeOrderingConventions.h): canonical reference-node
-///   coordinates and the output ordering used by every basis evaluator.
-/// - BasisTraits.h and BasisExceptions.h: topology classification,
-///   compile-time helpers, and module-specific exception types.
-///
-/// ## Object and evaluation contract
-///
-/// A basis object is immutable after construction. It represents one reference
-/// topology, basis family, and effective polynomial order, and can be shared
-/// safely across evaluations. Construction may build node lattices or invert
-/// interpolation matrices, so callers should construct through basis_factory
-/// and cache one instance for each distinct basis request instead of rebuilding
-/// inside element loops.
-///
-/// Every evaluator takes a three-component reference coordinate. For
-/// lower-dimensional elements, only the first dimension() components are
-/// active. Returned gradients always have three components and Hessians are
-/// always 3-by-3 matrices; inactive reference directions are expected to be
-/// zero for conforming lower-dimensional bases. The std::vector overloads are
-/// convenient for setup, tests, and adapter code. The *_to overloads write to
-/// caller-owned spans and are the allocation-free path for assembly.
-///
-/// Outputs are in ReferenceNodeLayout basis order, not necessarily the mesh or
-/// solver's native node order. A caller that stores elements in another local
-/// ordering must apply the appropriate permutation at the boundary between the
-/// basis module and that storage format.
-///
-/// ## Inputs and ownership
-///
-/// Constructing and evaluating a basis combines several independent choices:
-///
-/// - **Element topology comes from the mesh.** The mesh cell type is translated
-///   to ElementType, which defines the reference topology and public node
-///   layout. This is structural information, not a complete discretization
-///   policy.
-/// - **Geometry interpolation follows the mesh nodes.** The basis used for the
-///   reference-to-physical map must be compatible with the element's node
-///   count and ordering. For that case, callers normally use
-///   basis_factory::create_default_for(element_type), which selects the
-///   Lagrange or serendipity space associated with that element layout. A
-///   Tetra10 mesh therefore implies a quadratic geometry map; a Hex20 mesh
-///   implies the supported Hex20 serendipity geometry basis.
-/// - **Field approximation is chosen by the formulation.** Field bases do not
-///   have to match the geometry map. Mixed formulations, stabilized methods,
-///   enrichment, and convergence studies may use different families or orders
-///   for different fields on the same mesh topology. Those bases should be
-///   requested explicitly with basis_factory::create() and a BasisRequest
-///   naming the desired family and order.
-/// - **Evaluation points come from the caller.** Quadrature rules, probe
-///   points, interpolation targets, and error-sampling locations are outside
-///   this module. The basis only evaluates at the reference coordinates it is
-///   given.
-///
-/// \dot "Basis inputs and responsibilities"
-/// digraph fe_basis_information_flow {
-///   rankdir=LR;
-///   node [shape=box, fontname=Helvetica, fontsize=10];
-///   mesh     [label="Mesh element type"];
-///   request  [label="BasisRequest\nfamily + order"];
-///   topology [label="Reference topology\nand node layout"];
-///   basis    [label="Basis object", style=filled, fillcolor=lightgray];
-///   points   [label="Reference points"];
-///   outputs  [label="Reference values\nand derivatives"];
-///   mesh -> topology;
-///   request -> basis;
-///   topology -> basis;
-///   basis -> outputs;
-///   points -> outputs;
-/// }
-/// \enddot
-///
-/// ## Reference scope and the solver adapter
-///
-/// The solver-facing adapter in nn.cpp is the boundary between this reference
-/// basis contract and legacy solver storage. It translates solver element
-/// enums to ElementType, obtains cached default bases for mesh/face shape
-/// tables, permutes from ReferenceNodeLayout order into solver node order, and
-/// stores N, Nx, and, where needed, packed Nxx at Gauss points. At that stage
-/// Nx and Nxx are still derivatives with respect to reference coordinates.
-/// Physical-coordinate derivatives are formed later, for a particular
-/// configuration and element geometry, by composing the cached reference data
-/// with the mapping Jacobian (nn::gnn for first derivatives and nn::gn_nxx for
-/// second derivatives).
+/**
+ * @defgroup FE_Basis Basis
+ * @ingroup FE
+ * @brief Basis-function interfaces, concrete basis families, and reference-node conventions.
+ *
+ * @details
+ * ## Scope
+ *
+ * The Basis module owns reference-element shape functions. It provides the
+ * number of basis functions and the values and derivatives,
+ * @f$N_i@f$, @f$\partial N_i / \partial \xi_j@f$, and
+ * @f$\partial^2 N_i / \partial \xi_j \partial \xi_k@f$ at reference
+ * points. It does not own mesh storage, quadrature selection, field
+ * formulation policy, or transformation of derivatives to physical
+ * coordinates. Those decisions stay with the solver layer that has the mesh,
+ * material model, and equation context.
+ *
+ * The main pieces are:
+ * - BasisFunction (BasisFunction.h): the abstract query and evaluation
+ *   contract for code that does not need to know the concrete family.
+ * - @ref FE_LagrangeBasis "LagrangeBasis" and
+ *   @ref FE_SerendipityBasis "SerendipityBasis": the implemented nodal
+ *   families, including analytical first and second derivatives in reference
+ *   coordinates.
+ * - basis_factory (BasisFactory.h): runtime construction from a BasisRequest.
+ *   basis_factory::default_basis_request() centralizes the family/order that
+ *   matches each supported element's public node layout.
+ * - ReferenceNodeLayout (NodeOrderingConventions.h): canonical reference-node
+ *   coordinates and the output ordering used by every basis evaluator.
+ * - BasisTraits.h and BasisExceptions.h: topology classification,
+ *   compile-time helpers, and module-specific exception types.
+ *
+ * ## Object and evaluation contract
+ *
+ * A basis object is immutable after construction. It represents one reference
+ * topology, basis family, and effective polynomial order, and can be shared
+ * safely across evaluations. Construction may build node lattices or invert
+ * interpolation matrices, so callers should construct through basis_factory
+ * and cache one instance for each distinct basis request instead of rebuilding
+ * inside element loops.
+ *
+ * Every evaluator takes a three-component reference coordinate. For
+ * lower-dimensional elements, only the first dimension() components are
+ * active. Returned gradients always have three components and Hessians are
+ * always 3-by-3 matrices; inactive reference directions are expected to be
+ * zero for conforming lower-dimensional bases. The std::vector overloads are
+ * convenient for setup, tests, and adapter code. The *_to overloads write to
+ * caller-owned spans and are the allocation-free path for assembly.
+ *
+ * Outputs are in ReferenceNodeLayout basis order, not necessarily the mesh or
+ * solver's native node order. A caller that stores elements in another local
+ * ordering must apply the appropriate permutation at the boundary between the
+ * basis module and that storage format.
+ *
+ * ## Inputs and ownership
+ *
+ * Constructing and evaluating a basis combines several independent choices:
+ *
+ * - **Element topology comes from the mesh.** The mesh cell type is translated
+ *   to ElementType, which defines the reference topology and public node
+ *   layout. This is structural information, not a complete discretization
+ *   policy.
+ * - **Geometry interpolation follows the mesh nodes.** The basis used for the
+ *   reference-to-physical map must be compatible with the element's node
+ *   count and ordering. For that case, callers normally use
+ *   basis_factory::create_default_for(element_type), which selects the
+ *   Lagrange or serendipity space associated with that element layout. A
+ *   Tetra10 mesh therefore implies a quadratic geometry map; a Hex20 mesh
+ *   implies the supported Hex20 serendipity geometry basis.
+ * - **Field approximation is chosen by the formulation.** Field bases do not
+ *   have to match the geometry map. Mixed formulations, stabilized methods,
+ *   enrichment, and convergence studies may use different families or orders
+ *   for different fields on the same mesh topology. Those bases should be
+ *   requested explicitly with basis_factory::create() and a BasisRequest
+ *   naming the desired family and order.
+ * - **Evaluation points come from the caller.** Quadrature rules, probe
+ *   points, interpolation targets, and error-sampling locations are outside
+ *   this module. The basis only evaluates at the reference coordinates it is
+ *   given.
+ *
+ * @dot "Basis inputs and responsibilities"
+ * digraph fe_basis_information_flow {
+ *   rankdir=LR;
+ *   node [shape=box, fontname=Helvetica, fontsize=10];
+ *   mesh     [label="Mesh element type"];
+ *   request  [label="BasisRequest\nfamily + order"];
+ *   topology [label="Reference topology\nand node layout"];
+ *   basis    [label="Basis object", style=filled, fillcolor=lightgray];
+ *   points   [label="Reference points"];
+ *   outputs  [label="Reference values\nand derivatives"];
+ *   mesh -> topology;
+ *   request -> basis;
+ *   topology -> basis;
+ *   basis -> outputs;
+ *   points -> outputs;
+ * }
+ * @enddot
+ *
+ * ## Reference scope and the solver adapter
+ *
+ * The solver-facing adapter in nn.cpp is the boundary between this reference
+ * basis contract and legacy solver storage. It translates solver element
+ * enums to ElementType, obtains cached default bases for mesh/face shape
+ * tables, permutes from ReferenceNodeLayout order into solver node order, and
+ * stores N, Nx, and, where needed, packed Nxx at Gauss points. At that stage
+ * Nx and Nxx are still derivatives with respect to reference coordinates.
+ * Physical-coordinate derivatives are formed later, for a particular
+ * configuration and element geometry, by composing the cached reference data
+ * with the mapping Jacobian (nn::gnn for first derivatives and nn::gn_nxx for
+ * second derivatives).
+ */
 
 namespace svmp {
 namespace FE {
 namespace basis {
 
-/// \brief Gradient vector type used by basis evaluators.
+/** @brief Gradient vector type used by basis evaluators. */
 using Gradient = math::Vector<double, 3>;
 
-/// \brief Hessian matrix type used by basis evaluators.
+/** @brief Hessian matrix type used by basis evaluators. */
 using Hessian  = math::Matrix<double, 3, 3>;
 
-/// \brief Throw BasisEvaluationException when an output span is smaller than the
-/// basis size. \p label is the full "Class::method" context used in the message,
-/// so each basis family passes its own qualified name.
+/**
+ * @brief Throw BasisEvaluationException when an output span is smaller than the
+ * basis size. \p label is the full "Class::method" context used in the message,
+ * so each basis family passes its own qualified name.
+ */
 void require_span_size(std::size_t actual, std::size_t expected, const char* label);
 
-/// \brief Check a requested output span unless it is empty, following the
-/// "skip this output" convention used by the combined evaluators.
+/**
+ * @brief Check a requested output span unless it is empty, following the
+ * "skip this output" convention used by the combined evaluators.
+ */
 template <typename T>
 void require_requested_span_size(std::span<T> output,
                                  std::size_t expected,
@@ -149,127 +155,159 @@ void require_requested_span_size(std::span<T> output,
     }
 }
 
-/// \brief Abstract interface for finite-element basis-function families.
-/// \ingroup FE_Basis
-///
-/// BasisFunction defines the common query and evaluation API used by solver
-/// code that does not need to know the concrete basis implementation. Derived
-/// classes provide values at minimum and can override analytical gradients,
-/// Hessians, combined evaluation, and span output paths. The interface
-/// is deliberately limited to reference-space quantities; callers own node
-/// ordering translation, physical mapping, and any field-level discretization
-/// policy.
+/**
+ * @brief Abstract interface for finite-element basis-function families.
+ * @ingroup FE_Basis
+ *
+ * BasisFunction defines the common query and evaluation API used by solver
+ * code that does not need to know the concrete basis implementation. Derived
+ * classes provide values at minimum and can override analytical gradients,
+ * Hessians, combined evaluation, and span output paths. The interface
+ * is deliberately limited to reference-space quantities; callers own node
+ * ordering translation, physical mapping, and any field-level discretization
+ * policy.
+ */
 class BasisFunction {
 public:
-    /// \brief Destroy a basis function through the abstract interface.
+    /** @brief Destroy a basis function through the abstract interface. */
     virtual ~BasisFunction() = default;
 
-    /// \brief Return the concrete basis family.
-    /// \return Basis family identifier.
+    /**
+     * @brief Return the concrete basis family.
+     * @return Basis family identifier.
+     */
     virtual BasisType basis_type() const noexcept = 0;
 
-    /// \brief Return the canonical element type represented by this basis.
-    /// \return Element type used for node layout and evaluation.
+    /**
+     * @brief Return the canonical element type represented by this basis.
+     * @return Element type used for node layout and evaluation.
+     */
     virtual ElementType element_type() const noexcept = 0;
 
-    /// \brief Return the reference-space dimension of the basis.
-    /// \return Reference dimension, from zero for points through three for volume elements.
+    /**
+     * @brief Return the reference-space dimension of the basis.
+     * @return Reference dimension, from zero for points through three for volume elements.
+     */
     virtual int dimension() const noexcept = 0;
 
-    /// \brief Return the polynomial order represented by this basis.
-    /// \return Effective polynomial order after any element-family normalization.
+    /**
+     * @brief Return the polynomial order represented by this basis.
+     * @return Effective polynomial order after any element-family normalization.
+     */
     virtual int order() const noexcept = 0;
 
-    /// \brief Return the number of basis functions and reference nodes.
-    /// \return Basis function count.
+    /**
+     * @brief Return the number of basis functions and reference nodes.
+     * @return Basis function count.
+     */
     virtual std::size_t size() const noexcept = 0;
 
-    /// \brief Return the reference interpolation nodes in basis ordering.
-    ///
-    /// \details Nodal families return one reference-element coordinate per basis
-    /// function, in the same order as the evaluator outputs. Bases that do not
-    /// define interpolation nodes (non-nodal families, or abstract base usage)
-    /// return an empty vector. The returned reference is valid for the lifetime
-    /// of the basis object.
-    ///
-    /// \return Reference node coordinates: size() entries for nodal families,
-    ///         empty otherwise.
+    /**
+     * @brief Return the reference interpolation nodes in basis ordering.
+     *
+     * @details Nodal families return one reference-element coordinate per basis
+     * function, in the same order as the evaluator outputs. Bases that do not
+     * define interpolation nodes (non-nodal families, or abstract base usage)
+     * return an empty vector. The returned reference is valid for the lifetime
+     * of the basis object.
+     *
+     * @return Reference node coordinates: size() entries for nodal families,
+     *         empty otherwise.
+     */
     virtual const std::vector<math::Vector<double, 3>>& nodes() const noexcept;
 
-    /// \brief Evaluate basis function values at a reference coordinate.
-    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param values Receives one value per basis function.
+    /**
+     * @brief Evaluate basis function values at a reference coordinate.
+     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+     * @param values Receives one value per basis function.
+     */
     virtual void evaluate_values(const math::Vector<double, 3>& xi,
                                  std::vector<double>& values) const = 0;
 
-    /// \brief Evaluate basis gradients at a reference coordinate.
-    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param gradients Receives one three-component gradient per basis function.
-    /// \throws BasisEvaluationException If gradients are not available for the basis.
+    /**
+     * @brief Evaluate basis gradients at a reference coordinate.
+     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+     * @param gradients Receives one three-component gradient per basis function.
+     * @throws BasisEvaluationException If gradients are not available for the basis.
+     */
     virtual void evaluate_gradients(const math::Vector<double, 3>& xi,
                                     std::vector<Gradient>& gradients) const;
 
-    /// \brief Evaluate basis Hessians at a reference coordinate.
-    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param hessians Receives one 3-by-3 Hessian per basis function.
-    /// \throws BasisEvaluationException If Hessians are not available for the basis.
+    /**
+     * @brief Evaluate basis Hessians at a reference coordinate.
+     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+     * @param hessians Receives one 3-by-3 Hessian per basis function.
+     * @throws BasisEvaluationException If Hessians are not available for the basis.
+     */
     virtual void evaluate_hessians(const math::Vector<double, 3>& xi,
                                    std::vector<Hessian>& hessians) const;
 
-    /// \brief Evaluate values, gradients, and Hessians together.
-    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param values Receives one value per basis function.
-    /// \param gradients Receives one three-component gradient per basis function.
-    /// \param hessians Receives one 3-by-3 Hessian per basis function.
+    /**
+     * @brief Evaluate values, gradients, and Hessians together.
+     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+     * @param values Receives one value per basis function.
+     * @param gradients Receives one three-component gradient per basis function.
+     * @param hessians Receives one 3-by-3 Hessian per basis function.
+     */
     virtual void evaluate_all(const math::Vector<double, 3>& xi,
                               std::vector<double>& values,
                               std::vector<Gradient>& gradients,
                               std::vector<Hessian>& hessians) const;
 
-    /// \brief Evaluate basis values into caller-provided storage.
-    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param values_out Output span with at least size() entries.
+    /**
+     * @brief Evaluate basis values into caller-provided storage.
+     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+     * @param values_out Output span with at least size() entries.
+     */
     virtual void evaluate_values_to(const math::Vector<double, 3>& xi,
                                     std::span<double> values_out) const;
 
-    /// \brief Evaluate basis gradients into caller-provided storage.
-    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param gradients_out Output span with at least size() entries.
+    /**
+     * @brief Evaluate basis gradients into caller-provided storage.
+     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+     * @param gradients_out Output span with at least size() entries.
+     */
     virtual void evaluate_gradients_to(const math::Vector<double, 3>& xi,
                                        std::span<Gradient> gradients_out) const;
 
-    /// \brief Evaluate basis Hessians into caller-provided storage.
-    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param hessians_out Output span with at least size() entries.
+    /**
+     * @brief Evaluate basis Hessians into caller-provided storage.
+     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+     * @param hessians_out Output span with at least size() entries.
+     */
     virtual void evaluate_hessians_to(const math::Vector<double, 3>& xi,
                                       std::span<Hessian> hessians_out) const;
 
 protected:
-    /// \brief Approximate gradients by centered finite differences of values.
-    ///
-    /// \details This helper is primarily a verification utility for tests: it
-    /// provides a basis-independent reference that checks a concrete basis's
-    /// analytical evaluate_gradients() against centered finite differences of
-    /// evaluate_values(). It lives on the base class so any BasisFunction can be
-    /// checked uniformly, and having no production caller is by design — every
-    /// shipped basis supplies analytical gradients. Centered differences add
-    /// truncation/roundoff sensitivity and require multiple value evaluations
-    /// per reference coordinate, so analytical gradients are always preferred
-    /// outside this testing context.
+    /**
+     * @brief Approximate gradients by centered finite differences of values.
+     *
+     * @details This helper is primarily a verification utility for tests: it
+     * provides a basis-independent reference that checks a concrete basis's
+     * analytical evaluate_gradients() against centered finite differences of
+     * evaluate_values(). It lives on the base class so any BasisFunction can be
+     * checked uniformly, and having no production caller is by design — every
+     * shipped basis supplies analytical gradients. Centered differences add
+     * truncation/roundoff sensitivity and require multiple value evaluations
+     * per reference coordinate, so analytical gradients are always preferred
+     * outside this testing context.
+     */
     void numerical_gradient(const math::Vector<double, 3>& xi,
                             std::vector<Gradient>& gradients,
                             double eps = double(1e-6)) const;
 
-    /// \brief Approximate Hessians by centered finite differences of gradients.
-    ///
-    /// \details Companion verification utility to numerical_gradient: it checks
-    /// a basis's analytical evaluate_hessians() against centered finite
-    /// differences of evaluate_gradients(). Because it differentiates gradients,
-    /// it is only meaningful for bases that already provide them. Like
-    /// numerical_gradient it is test-support rather than a production fallback —
-    /// finite-difference Hessians amplify numerical error and require repeated
-    /// gradient evaluations, so analytical Hessians are used everywhere outside
-    /// tests.
+    /**
+     * @brief Approximate Hessians by centered finite differences of gradients.
+     *
+     * @details Companion verification utility to numerical_gradient: it checks
+     * a basis's analytical evaluate_hessians() against centered finite
+     * differences of evaluate_gradients(). Because it differentiates gradients,
+     * it is only meaningful for bases that already provide them. Like
+     * numerical_gradient it is test-support rather than a production fallback —
+     * finite-difference Hessians amplify numerical error and require repeated
+     * gradient evaluations, so analytical Hessians are used everywhere outside
+     * tests.
+     */
     void numerical_hessian(const math::Vector<double, 3>& xi,
                            std::vector<Hessian>& hessians,
                            double eps = double(1e-5)) const;
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.h b/Code/Source/solver/FE/Basis/LagrangeBasis.h
index be41d9b54..fc21599f3 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.h
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.h
@@ -15,191 +15,213 @@ namespace svmp {
 namespace FE {
 namespace basis {
 
-/// \defgroup FE_LagrangeBasis LagrangeBasis
-/// \ingroup FE_Basis
-/// \brief Construction and evaluation API for nodal Lagrange finite-element bases.
-///
-/// \details This group documents the complete nodal Lagrange basis evaluator
-/// used by the FE library. The implementation covers tensor-product,
-/// simplex, and wedge reference topologies with exact analytical first and
-/// second derivatives in reference coordinates.
-/// @{
-
-/// \brief Nodal Lagrange basis on supported reference finite elements.
-///
-/// \details LagrangeBasis represents the nodal interpolation basis associated
-/// with an equispaced reference-node lattice. It supports point, line,
-/// quadrilateral, hexahedron, triangle, tetrahedron, and wedge reference
-/// elements. Named complete quadratic elements such as Line3, Triangle6,
-/// Quad9, Tetra10, Hex27, and Wedge18 are normalized to their canonical
-/// linear topology plus effective order 2.
-///
-/// Tensor-product elements use the one-dimensional nodal polynomials
-/// \f[
-///   l_i(x) = \prod_{j \ne i} \frac{x - x_j}{x_i - x_j}
-/// \f]
-/// on equispaced coordinates in \f$[-1, 1]\f$. Multi-dimensional basis
-/// functions are products of the active axis polynomials, for example
-/// \f$N_{ijk}(r,s,t) = l_i(r)l_j(s)l_k(t)\f$ on a hexahedron.
-///
-/// Simplex elements use barycentric coordinates and integer lattice
-/// exponents. For a node with exponent tuple \f$\alpha\f$, where
-/// \f$\sum_a \alpha_a = p\f$, the basis is assembled from scaled
-/// falling-factorial factors,
-/// \f[
-///   N_\alpha(\lambda) =
-///   \prod_a \prod_{m=0}^{\alpha_a-1}
-///   \frac{p\lambda_a - m}{m + 1}.
-/// \f]
-/// Gradients and Hessians are evaluated analytically by differentiating these
-/// factors and applying the barycentric-coordinate chain rule.
-///
-/// Wedge elements are treated as a tensor product between a triangle simplex
-/// basis and a one-dimensional through-axis basis:
-/// \f$N_{a k}(r,s,t) = T_a(r,s)l_k(t)\f$.
-///
-/// The vector-returning evaluators are convenient API wrappers. The `*_to`
-/// methods write to caller-provided spans and are intended for assembly paths
-/// that avoid temporary allocations.
+/**
+ * @defgroup FE_LagrangeBasis LagrangeBasis
+ * @ingroup FE_Basis
+ * @brief Construction and evaluation API for nodal Lagrange finite-element bases.
+ *
+ * @details This group documents the complete nodal Lagrange basis evaluator
+ * used by the FE library. The implementation covers tensor-product,
+ * simplex, and wedge reference topologies with exact analytical first and
+ * second derivatives in reference coordinates.
+ * @{
+ */
+
+/**
+ * @brief Nodal Lagrange basis on supported reference finite elements.
+ *
+ * @details LagrangeBasis represents the nodal interpolation basis associated
+ * with an equispaced reference-node lattice. It supports point, line,
+ * quadrilateral, hexahedron, triangle, tetrahedron, and wedge reference
+ * elements. Named complete quadratic elements such as Line3, Triangle6,
+ * Quad9, Tetra10, Hex27, and Wedge18 are normalized to their canonical
+ * linear topology plus effective order 2.
+ *
+ * Tensor-product elements use the one-dimensional nodal polynomials
+ * @f[
+ *   l_i(x) = \prod_{j \ne i} \frac{x - x_j}{x_i - x_j}
+ * @f]
+ * on equispaced coordinates in @f$[-1, 1]@f$. Multi-dimensional basis
+ * functions are products of the active axis polynomials, for example
+ * @f$N_{ijk}(r,s,t) = l_i(r)l_j(s)l_k(t)@f$ on a hexahedron.
+ *
+ * Simplex elements use barycentric coordinates and integer lattice
+ * exponents. For a node with exponent tuple @f$\alpha@f$, where
+ * @f$\sum_a \alpha_a = p@f$, the basis is assembled from scaled
+ * falling-factorial factors,
+ * @f[
+ *   N_\alpha(\lambda) =
+ *   \prod_a \prod_{m=0}^{\alpha_a-1}
+ *   \frac{p\lambda_a - m}{m + 1}.
+ * @f]
+ * Gradients and Hessians are evaluated analytically by differentiating these
+ * factors and applying the barycentric-coordinate chain rule.
+ *
+ * Wedge elements are treated as a tensor product between a triangle simplex
+ * basis and a one-dimensional through-axis basis:
+ * @f$N_{a k}(r,s,t) = T_a(r,s)l_k(t)@f$.
+ *
+ * The vector-returning evaluators are convenient API wrappers. The `*_to`
+ * methods write to caller-provided spans and are intended for assembly paths
+ * that avoid temporary allocations.
+ */
 class LagrangeBasis final : public BasisFunction {
 public:
-    /// \brief Axis-index tuple for tensor-product reference nodes.
+    /** @brief Axis-index tuple for tensor-product reference nodes. */
     using TensorNodeIndex = std::array<std::size_t, 3>;
 
-    /// \brief Barycentric exponent tuple for simplex reference nodes.
+    /** @brief Barycentric exponent tuple for simplex reference nodes. */
     using SimplexExponent = std::array<int, 4>;
 
-    /// \brief Triangle-node and axis-node tuple for wedge reference nodes.
+    /** @brief Triangle-node and axis-node tuple for wedge reference nodes. */
     using WedgeNodeIndex = std::array<std::size_t, 2>;
 
-    /// \brief Construct a Lagrange basis for an element type and polynomial order.
-    ///
-    /// \details The constructor normalizes complete higher-order aliases to the
-    /// canonical topology and effective polynomial order, builds the reference
-    /// node coordinates, and precomputes topology-specific lookup data used by
-    /// evaluation. Tensor-product bases store per-axis node indices, simplex
-    /// bases store barycentric exponent tuples, and wedge bases store the
-    /// triangle-node/axis-node decomposition.
-    ///
-    /// \param type Element type used to determine topology and reference-node layout.
-    /// \param order Requested polynomial order.
-    /// \throws BasisConfigurationException If the effective order is negative.
-    /// \throws BasisElementCompatibilityException If the element type is unsupported.
+    /**
+     * @brief Construct a Lagrange basis for an element type and polynomial order.
+     *
+     * @details The constructor normalizes complete higher-order aliases to the
+     * canonical topology and effective polynomial order, builds the reference
+     * node coordinates, and precomputes topology-specific lookup data used by
+     * evaluation. Tensor-product bases store per-axis node indices, simplex
+     * bases store barycentric exponent tuples, and wedge bases store the
+     * triangle-node/axis-node decomposition.
+     *
+     * @param type Element type used to determine topology and reference-node layout.
+     * @param order Requested polynomial order.
+     * @throws BasisConfigurationException If the effective order is negative.
+     * @throws BasisElementCompatibilityException If the element type is unsupported.
+     */
     LagrangeBasis(ElementType type, int order);
 
-    /// \copydoc BasisFunction::basis_type()
+    /** @copydoc BasisFunction::basis_type() */
     BasisType basis_type() const noexcept final { return BasisType::Lagrange; }
 
-    /// \copydoc BasisFunction::element_type()
+    /** @copydoc BasisFunction::element_type() */
     ElementType element_type() const noexcept final { return element_type_; }
 
-    /// \copydoc BasisFunction::dimension()
+    /** @copydoc BasisFunction::dimension() */
     int dimension() const noexcept final { return dimension_; }
 
-    /// \copydoc BasisFunction::order()
+    /** @copydoc BasisFunction::order() */
     int order() const noexcept final { return order_; }
 
-    /// \copydoc BasisFunction::size()
+    /** @copydoc BasisFunction::size() */
     std::size_t size() const noexcept final { return nodes_.size(); }
 
-    /// \brief Return the reference interpolation nodes in basis ordering.
-    ///
-    /// \details The returned node order matches the basis-function order used
-    /// by all evaluators. Coordinates are reference-element coordinates:
-    /// tensor-product axes use \f$[-1,1]\f$, triangles and tetrahedra use the
-    /// repository's simplex reference coordinates, and wedges combine triangle
-    /// reference coordinates with a \f$[-1,1]\f$ through-axis coordinate.
-    ///
-    /// \return Reference node coordinates, one per basis function.
+    /**
+     * @brief Return the reference interpolation nodes in basis ordering.
+     *
+     * @details The returned node order matches the basis-function order used
+     * by all evaluators. Coordinates are reference-element coordinates:
+     * tensor-product axes use @f$[-1,1]@f$, triangles and tetrahedra use the
+     * repository's simplex reference coordinates, and wedges combine triangle
+     * reference coordinates with a @f$[-1,1]@f$ through-axis coordinate.
+     *
+     * @return Reference node coordinates, one per basis function.
+     */
     const std::vector<math::Vector<double, 3>>& nodes() const noexcept final { return nodes_; }
 
-    /// \brief Evaluate Lagrange basis function values at a reference coordinate.
-    ///
-    /// \details Values satisfy the nodal interpolation property
-    /// \f$N_i(x_j)=\delta_{ij}\f$ at the basis nodes. Tensor-product values are
-    /// products of one-dimensional Lagrange polynomials. Simplex values are
-    /// products of barycentric falling-factorial factors. Wedge values are
-    /// products of triangle simplex values and through-axis Lagrange values.
-    ///
-    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param values Receives one value per basis function.
+    /**
+     * @brief Evaluate Lagrange basis function values at a reference coordinate.
+     *
+     * @details Values satisfy the nodal interpolation property
+     * @f$N_i(x_j)=\delta_{ij}@f$ at the basis nodes. Tensor-product values are
+     * products of one-dimensional Lagrange polynomials. Simplex values are
+     * products of barycentric falling-factorial factors. Wedge values are
+     * products of triangle simplex values and through-axis Lagrange values.
+     *
+     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+     * @param values Receives one value per basis function.
+     */
     void evaluate_values(const math::Vector<double, 3>& xi,
                          std::vector<double>& values) const final;
 
-    /// \brief Evaluate analytical Lagrange basis gradients at a reference coordinate.
-    ///
-    /// \details Gradients are derivatives with respect to reference
-    /// coordinates, not physical coordinates. Tensor-product gradients apply
-    /// the product rule to the active axis polynomials. Simplex gradients
-    /// differentiate the barycentric factors and multiply by the constant
-    /// gradients of the barycentric coordinates. Wedge gradients combine the
-    /// triangle gradient in the first two components with the through-axis
-    /// derivative in the third component.
-    ///
-    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param gradients Receives one three-component gradient per basis function.
+    /**
+     * @brief Evaluate analytical Lagrange basis gradients at a reference coordinate.
+     *
+     * @details Gradients are derivatives with respect to reference
+     * coordinates, not physical coordinates. Tensor-product gradients apply
+     * the product rule to the active axis polynomials. Simplex gradients
+     * differentiate the barycentric factors and multiply by the constant
+     * gradients of the barycentric coordinates. Wedge gradients combine the
+     * triangle gradient in the first two components with the through-axis
+     * derivative in the third component.
+     *
+     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+     * @param gradients Receives one three-component gradient per basis function.
+     */
     void evaluate_gradients(const math::Vector<double, 3>& xi,
                             std::vector<Gradient>& gradients) const final;
 
-    /// \brief Evaluate analytical Lagrange basis Hessians at a reference coordinate.
-    ///
-    /// \details Hessians are second derivatives in reference coordinates and
-    /// are stored as 3-by-3 matrices. Tensor-product Hessians contain pure
-    /// second axis derivatives on the diagonal and mixed product-rule terms
-    /// off diagonal. Simplex Hessians are assembled from first and second
-    /// derivatives of the barycentric factors. Wedge Hessians contain triangle
-    /// Hessian terms, through-axis second derivatives, and mixed
-    /// triangle/through-axis derivative products.
-    ///
-    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param hessians Receives one 3-by-3 Hessian per basis function.
+    /**
+     * @brief Evaluate analytical Lagrange basis Hessians at a reference coordinate.
+     *
+     * @details Hessians are second derivatives in reference coordinates and
+     * are stored as 3-by-3 matrices. Tensor-product Hessians contain pure
+     * second axis derivatives on the diagonal and mixed product-rule terms
+     * off diagonal. Simplex Hessians are assembled from first and second
+     * derivatives of the barycentric factors. Wedge Hessians contain triangle
+     * Hessian terms, through-axis second derivatives, and mixed
+     * triangle/through-axis derivative products.
+     *
+     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+     * @param hessians Receives one 3-by-3 Hessian per basis function.
+     */
     void evaluate_hessians(const math::Vector<double, 3>& xi,
                            std::vector<Hessian>& hessians) const final;
 
-    /// \brief Evaluate Lagrange values, gradients, and Hessians together.
-    ///
-    /// \details This is the allocation-friendly vector API for callers that
-    /// need all basis quantities at the same quadrature point. The underlying
-    /// evaluator computes only topology-local polynomial data once and then
-    /// fills all requested outputs.
-    ///
-    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param values Receives one value per basis function.
-    /// \param gradients Receives one three-component gradient per basis function.
-    /// \param hessians Receives one 3-by-3 Hessian per basis function.
+    /**
+     * @brief Evaluate Lagrange values, gradients, and Hessians together.
+     *
+     * @details This is the allocation-friendly vector API for callers that
+     * need all basis quantities at the same quadrature point. The underlying
+     * evaluator computes only topology-local polynomial data once and then
+     * fills all requested outputs.
+     *
+     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+     * @param values Receives one value per basis function.
+     * @param gradients Receives one three-component gradient per basis function.
+     * @param hessians Receives one 3-by-3 Hessian per basis function.
+     */
     void evaluate_all(const math::Vector<double, 3>& xi,
                       std::vector<double>& values,
                       std::vector<Gradient>& gradients,
                       std::vector<Hessian>& hessians) const final;
 
-    /// \brief Evaluate Lagrange basis values into caller-provided storage.
-    ///
-    /// \details This is the low-allocation API intended for element assembly
-    /// loops. The span is filled in basis-node order and no vector resizing is
-    /// performed.
-    ///
-    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param values_out Output span with at least size() entries.
+    /**
+     * @brief Evaluate Lagrange basis values into caller-provided storage.
+     *
+     * @details This is the low-allocation API intended for element assembly
+     * loops. The span is filled in basis-node order and no vector resizing is
+     * performed.
+     *
+     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+     * @param values_out Output span with at least size() entries.
+     */
     void evaluate_values_to(const math::Vector<double, 3>& xi,
                             std::span<double> values_out) const final;
 
-    /// \brief Evaluate Lagrange basis gradients into caller-provided storage.
-    ///
-    /// \details Gradients are written in basis-node order with one
-    /// three-component gradient per node.
-    ///
-    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param gradients_out Output span with at least size() entries.
+    /**
+     * @brief Evaluate Lagrange basis gradients into caller-provided storage.
+     *
+     * @details Gradients are written in basis-node order with one
+     * three-component gradient per node.
+     *
+     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+     * @param gradients_out Output span with at least size() entries.
+     */
     void evaluate_gradients_to(const math::Vector<double, 3>& xi,
                                std::span<Gradient> gradients_out) const final;
 
-    /// \brief Evaluate Lagrange basis Hessians into caller-provided storage.
-    ///
-    /// \details Hessians are written in basis-node order with one 3-by-3
-    /// Hessian per node.
-    ///
-    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param hessians_out Output span with at least size() entries.
+    /**
+     * @brief Evaluate Lagrange basis Hessians into caller-provided storage.
+     *
+     * @details Hessians are written in basis-node order with one 3-by-3
+     * Hessian per node.
+     *
+     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+     * @param hessians_out Output span with at least size() entries.
+     */
     void evaluate_hessians_to(const math::Vector<double, 3>& xi,
                               std::span<Hessian> hessians_out) const final;
 
@@ -243,7 +265,7 @@ class LagrangeBasis final : public BasisFunction {
                            std::span<Hessian> hessians_out) const;
 };
 
-/// @}
+/** @} */
 
 } // namespace basis
 } // namespace FE
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
index 043668c47..951bd854c 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
@@ -15,10 +15,12 @@ namespace svmp {
 namespace FE {
 namespace basis {
 
-/// \brief Equispaced 1D reference coordinate on [-1, 1]: -1 + 2 i / order.
-///
-/// Shared by the reference-node layout generators and the Lagrange tensor-axis
-/// node initialization so the lattice formula lives in a single place.
+/**
+ * @brief Equispaced 1D reference coordinate on [-1, 1]: -1 + 2 i / order.
+ *
+ * Shared by the reference-node layout generators and the Lagrange tensor-axis
+ * node initialization so the lattice formula lives in a single place.
+ */
 [[nodiscard]] inline constexpr double line_coord_pm_one(int i, int order) noexcept {
     if (order <= 0) {
         return double(0);
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.h b/Code/Source/solver/FE/Basis/SerendipityBasis.h
index 5a1471436..29bc2e7da 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.h
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.h
@@ -18,196 +18,218 @@ namespace svmp {
 namespace FE {
 namespace basis {
 
-/// \defgroup FE_SerendipityBasis SerendipityBasis
-/// \ingroup FE_Basis
-/// \brief Construction and evaluation API for reduced serendipity finite-element bases.
-///
-/// \details This group documents reduced degree-of-freedom basis families that
-/// preserve nodal interpolation on supported element boundaries while omitting
-/// selected interior tensor-product modes. These bases are used for standard
-/// serendipity elements and geometry-mode mappings that intentionally use a
-/// lower-order interpolation space.
-/// @{
-
-/// \brief Reduced-degree-of-freedom serendipity basis on supported reference elements.
-///
-/// \details SerendipityBasis implements nodal bases for Quad4/Quad8,
-/// Hex8/Hex20, and Wedge15. Compared with a complete tensor-product Lagrange
-/// basis of the same nominal order, a serendipity basis removes selected
-/// interior modes while retaining nodal interpolation on the supported node
-/// layout.
-///
-/// Quadrilateral serendipity bases are built from monomials
-/// \f$x^{a_x}y^{a_y}\f$ whose superlinear degree is at most the requested
-/// order. In this implementation the superlinear degree is
-/// \f[
-///   sldeg(x^{a_x}y^{a_y}) =
-///   \begin{cases} a_x, & a_x > 1 \\ 0, & a_x \le 1 \end{cases}
-///   +
-///   \begin{cases} a_y, & a_y > 1 \\ 0, & a_y \le 1 \end{cases}.
-/// \f]
-/// The nodal basis is recovered by inverting the Vandermonde interpolation
-/// matrix at the selected reference nodes. Values, gradients, and Hessians are
-/// then evaluated by differentiating the monomial vector and applying the
-/// inverse Vandermonde coefficients.
-/// For order \f$p \ge 1\f$, this space has \f$4p\f$ boundary modes for
-/// \f$p \le 3\f$ and
-/// \f[
-///   4p + \frac{(p - 3)(p - 2)}{2}
-/// \f]
-/// modes for \f$p \ge 4\f$.
-///
-/// The quadrilateral node set is unisolvent by construction. If
-/// \f$s(x,y)\f$ in this space vanishes at the \f$p + 1\f$ distinct nodes on
-/// every edge, each edge restriction is a degree-\f$p\f$ one-variable
-/// polynomial with \f$p + 1\f$ roots, so all edge restrictions vanish. Thus
-/// \f$s\f$ is divisible by the boundary bubble
-/// \f$(1 - x^2)(1 - y^2)\f$, and the quotient lies in
-/// \f$P_{p-4}\f$ (with no quotient for \f$p < 4\f$). For \f$p \ge 4\f$, the
-/// interior nodes form triangular rows for \f$P_{p-4}\f$: the first row has
-/// \f$m + 1\f$ distinct \f$x\f$ values, the next row has \f$m\f$, and so on
-/// for \f$m = p - 4\f$. A total-degree polynomial that vanishes on those rows
-/// is zero by induction over rows, because each vanished row factors out one
-/// linear term in \f$y\f$. The interpolation Vandermonde is therefore
-/// nonsingular for the implemented quadrilateral serendipity space.
-///
-/// `SerendipityBasis(ElementType::Quad4, p)` supports explicit
-/// arbitrary-order quadrilateral serendipity requests for \f$p \ge 1\f$
-/// (requests below one are normalized to one). `ElementType::Quad8` remains
-/// the standard quadratic eight-node layout and is valid only with order 2.
-/// Solver-default basis selection remains separate: `basis_factory` maps the
-/// complete Quad4 layout to the default linear Lagrange basis and maps Quad8 to
-/// quadratic serendipity unless a caller explicitly requests a different
-/// supported basis.
-///
-/// Hex8 uses the standard trilinear corner basis
-/// \f$(1 \pm r)(1 \pm s)(1 \pm t)/8\f$. Hex20 and Wedge15 use tabulated
-/// polynomial coefficient tables over monomial bases; analytical gradients and
-/// Hessians are obtained by differentiating those monomials. Hex20 evaluation
-/// is reordered through ReferenceNodeLayout so the output matches the public
-/// basis ordering.
+/**
+ * @defgroup FE_SerendipityBasis SerendipityBasis
+ * @ingroup FE_Basis
+ * @brief Construction and evaluation API for reduced serendipity finite-element bases.
+ *
+ * @details This group documents reduced degree-of-freedom basis families that
+ * preserve nodal interpolation on supported element boundaries while omitting
+ * selected interior tensor-product modes. These bases are used for standard
+ * serendipity elements and geometry-mode mappings that intentionally use a
+ * lower-order interpolation space.
+ * @{
+ */
+
+/**
+ * @brief Reduced-degree-of-freedom serendipity basis on supported reference elements.
+ *
+ * @details SerendipityBasis implements nodal bases for Quad4/Quad8,
+ * Hex8/Hex20, and Wedge15. Compared with a complete tensor-product Lagrange
+ * basis of the same nominal order, a serendipity basis removes selected
+ * interior modes while retaining nodal interpolation on the supported node
+ * layout.
+ *
+ * Quadrilateral serendipity bases are built from monomials
+ * @f$x^{a_x}y^{a_y}@f$ whose superlinear degree is at most the requested
+ * order. In this implementation the superlinear degree is
+ * @f[
+ *   sldeg(x^{a_x}y^{a_y}) =
+ *   \begin{cases} a_x, & a_x > 1 \\ 0, & a_x \le 1 \end{cases}
+ *   +
+ *   \begin{cases} a_y, & a_y > 1 \\ 0, & a_y \le 1 \end{cases}.
+ * @f]
+ * The nodal basis is recovered by inverting the Vandermonde interpolation
+ * matrix at the selected reference nodes. Values, gradients, and Hessians are
+ * then evaluated by differentiating the monomial vector and applying the
+ * inverse Vandermonde coefficients.
+ * For order @f$p \ge 1@f$, this space has @f$4p@f$ boundary modes for
+ * @f$p \le 3@f$ and
+ * @f[
+ *   4p + \frac{(p - 3)(p - 2)}{2}
+ * @f]
+ * modes for @f$p \ge 4@f$.
+ *
+ * The quadrilateral node set is unisolvent by construction. If
+ * @f$s(x,y)@f$ in this space vanishes at the @f$p + 1@f$ distinct nodes on
+ * every edge, each edge restriction is a degree-@f$p@f$ one-variable
+ * polynomial with @f$p + 1@f$ roots, so all edge restrictions vanish. Thus
+ * @f$s@f$ is divisible by the boundary bubble
+ * @f$(1 - x^2)(1 - y^2)@f$, and the quotient lies in
+ * @f$P_{p-4}@f$ (with no quotient for @f$p < 4@f$). For @f$p \ge 4@f$, the
+ * interior nodes form triangular rows for @f$P_{p-4}@f$: the first row has
+ * @f$m + 1@f$ distinct @f$x@f$ values, the next row has @f$m@f$, and so on
+ * for @f$m = p - 4@f$. A total-degree polynomial that vanishes on those rows
+ * is zero by induction over rows, because each vanished row factors out one
+ * linear term in @f$y@f$. The interpolation Vandermonde is therefore
+ * nonsingular for the implemented quadrilateral serendipity space.
+ *
+ * `SerendipityBasis(ElementType::Quad4, p)` supports explicit
+ * arbitrary-order quadrilateral serendipity requests for @f$p \ge 1@f$
+ * (requests below one are normalized to one). `ElementType::Quad8` remains
+ * the standard quadratic eight-node layout and is valid only with order 2.
+ * Solver-default basis selection remains separate: `basis_factory` maps the
+ * complete Quad4 layout to the default linear Lagrange basis and maps Quad8 to
+ * quadratic serendipity unless a caller explicitly requests a different
+ * supported basis.
+ *
+ * Hex8 uses the standard trilinear corner basis
+ * @f$(1 \pm r)(1 \pm s)(1 \pm t)/8@f$. Hex20 and Wedge15 use tabulated
+ * polynomial coefficient tables over monomial bases; analytical gradients and
+ * Hessians are obtained by differentiating those monomials. Hex20 evaluation
+ * is reordered through ReferenceNodeLayout so the output matches the public
+ * basis ordering.
+ */
 class SerendipityBasis final : public BasisFunction {
 public:
-    /// \brief Construct a serendipity basis for an element type and polynomial order.
-    ///
-    /// \details The constructor selects the topology-specific interpolation
-    /// space, computes the reference node coordinates, and initializes any
-    /// coefficient tables needed for evaluation. Quadrilateral bases build and
-    /// invert a Vandermonde matrix for the selected serendipity monomials.
-    /// Hex20 and Wedge15 use fixed coefficient tables. For hexahedra, only
-    /// linear Hex8 and quadratic Hex20 serendipity spaces are supported. For
-    /// wedges, only quadratic Wedge15 is supported. Quad4 supports explicit
-    /// quadrilateral serendipity requests of any order \f$p \ge 1\f$; Quad8 is
-    /// restricted to order 2.
-    ///
-    /// \param type Element type used to determine topology and reference-node layout.
-    /// \param order Requested polynomial order.
-    /// \throws BasisConfigurationException If the requested order is invalid.
-    /// \throws BasisElementCompatibilityException If the element type is unsupported.
+    /**
+     * @brief Construct a serendipity basis for an element type and polynomial order.
+     *
+     * @details The constructor selects the topology-specific interpolation
+     * space, computes the reference node coordinates, and initializes any
+     * coefficient tables needed for evaluation. Quadrilateral bases build and
+     * invert a Vandermonde matrix for the selected serendipity monomials.
+     * Hex20 and Wedge15 use fixed coefficient tables. For hexahedra, only
+     * linear Hex8 and quadratic Hex20 serendipity spaces are supported. For
+     * wedges, only quadratic Wedge15 is supported. Quad4 supports explicit
+     * quadrilateral serendipity requests of any order @f$p \ge 1@f$; Quad8 is
+     * restricted to order 2.
+     *
+     * @param type Element type used to determine topology and reference-node layout.
+     * @param order Requested polynomial order.
+     * @throws BasisConfigurationException If the requested order is invalid.
+     * @throws BasisElementCompatibilityException If the element type is unsupported.
+     */
     SerendipityBasis(ElementType type, int order);
 
-    /// \copydoc BasisFunction::basis_type()
+    /** @copydoc BasisFunction::basis_type() */
     BasisType basis_type() const noexcept final { return BasisType::Serendipity; }
 
-    /// \copydoc BasisFunction::element_type()
+    /** @copydoc BasisFunction::element_type() */
     ElementType element_type() const noexcept final { return element_type_; }
 
-    /// \copydoc BasisFunction::dimension()
+    /** @copydoc BasisFunction::dimension() */
     int dimension() const noexcept final { return dimension_; }
 
-    /// \copydoc BasisFunction::order()
+    /** @copydoc BasisFunction::order() */
     int order() const noexcept final { return order_; }
 
-    /// \copydoc BasisFunction::size()
+    /** @copydoc BasisFunction::size() */
     std::size_t size() const noexcept final { return size_; }
 
-    /// \brief Return the reference interpolation nodes in basis ordering.
-    ///
-    /// \details Node coordinates are the points at which the serendipity basis
-    /// satisfies the nodal interpolation property. Quadrilateral nodes are
-    /// placed first on the boundary and then, for higher order requests, at the
-    /// selected interior points needed to make the reduced monomial space
-    /// unisolvent. Hexahedral and wedge nodes are taken from
-    /// ReferenceNodeLayout. For high-order Quad4 serendipity, the deterministic
-    /// interior row ordering is an implementation convention; callers should
-    /// pair it with basis values from the same object rather than assume an
-    /// external mesh ordering contract beyond the supported Quad4/Quad8
-    /// production layouts.
-    ///
-    /// \return Reference node coordinates, one per basis function.
+    /**
+     * @brief Return the reference interpolation nodes in basis ordering.
+     *
+     * @details Node coordinates are the points at which the serendipity basis
+     * satisfies the nodal interpolation property. Quadrilateral nodes are
+     * placed first on the boundary and then, for higher order requests, at the
+     * selected interior points needed to make the reduced monomial space
+     * unisolvent. Hexahedral and wedge nodes are taken from
+     * ReferenceNodeLayout. For high-order Quad4 serendipity, the deterministic
+     * interior row ordering is an implementation convention; callers should
+     * pair it with basis values from the same object rather than assume an
+     * external mesh ordering contract beyond the supported Quad4/Quad8
+     * production layouts.
+     *
+     * @return Reference node coordinates, one per basis function.
+     */
     const std::vector<math::Vector<double, 3>>& nodes() const noexcept final { return nodes_; }
 
-    /// \brief Evaluate serendipity basis function values at a reference coordinate.
-    ///
-    /// \details For quadrilateral bases, this evaluates the serendipity
-    /// monomial vector and multiplies by the inverse Vandermonde matrix to
-    /// obtain nodal shape-function values. For Hex8, values are the standard
-    /// trilinear corner products. For Hex20 and Wedge15, values are evaluated
-    /// from the stored polynomial coefficient tables.
-    ///
-    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param values Receives one value per basis function.
+    /**
+     * @brief Evaluate serendipity basis function values at a reference coordinate.
+     *
+     * @details For quadrilateral bases, this evaluates the serendipity
+     * monomial vector and multiplies by the inverse Vandermonde matrix to
+     * obtain nodal shape-function values. For Hex8, values are the standard
+     * trilinear corner products. For Hex20 and Wedge15, values are evaluated
+     * from the stored polynomial coefficient tables.
+     *
+     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+     * @param values Receives one value per basis function.
+     */
     void evaluate_values(const math::Vector<double, 3>& xi,
                          std::vector<double>& values) const final;
 
-    /// \brief Evaluate analytical serendipity basis gradients at a reference coordinate.
-    ///
-    /// \details Gradients are derivatives with respect to reference
-    /// coordinates. Quadrilateral gradients differentiate the monomial vector
-    /// before applying the inverse Vandermonde coefficients. Hex8 gradients are
-    /// direct derivatives of the trilinear corner products. Hex20 and Wedge15
-    /// gradients are computed by differentiating the tabulated monomial
-    /// expansions.
-    ///
-    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param gradients Receives one three-component gradient per basis function.
+    /**
+     * @brief Evaluate analytical serendipity basis gradients at a reference coordinate.
+     *
+     * @details Gradients are derivatives with respect to reference
+     * coordinates. Quadrilateral gradients differentiate the monomial vector
+     * before applying the inverse Vandermonde coefficients. Hex8 gradients are
+     * direct derivatives of the trilinear corner products. Hex20 and Wedge15
+     * gradients are computed by differentiating the tabulated monomial
+     * expansions.
+     *
+     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+     * @param gradients Receives one three-component gradient per basis function.
+     */
     void evaluate_gradients(const math::Vector<double, 3>& xi,
                             std::vector<Gradient>& gradients) const final;
 
-    /// \brief Evaluate analytical serendipity basis Hessians at a reference coordinate.
-    ///
-    /// \details Hessians are second derivatives in reference coordinates and
-    /// are stored as 3-by-3 matrices. Quadrilateral Hessians use second
-    /// derivatives of the monomial vector and inverse Vandermonde coefficients.
-    /// Hex8 Hessians are computed directly from the trilinear corner products.
-    /// Hex20 and Wedge15 Hessians are computed by differentiating their
-    /// polynomial coefficient tables twice.
-    ///
-    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param hessians Receives one 3-by-3 Hessian per basis function.
+    /**
+     * @brief Evaluate analytical serendipity basis Hessians at a reference coordinate.
+     *
+     * @details Hessians are second derivatives in reference coordinates and
+     * are stored as 3-by-3 matrices. Quadrilateral Hessians use second
+     * derivatives of the monomial vector and inverse Vandermonde coefficients.
+     * Hex8 Hessians are computed directly from the trilinear corner products.
+     * Hex20 and Wedge15 Hessians are computed by differentiating their
+     * polynomial coefficient tables twice.
+     *
+     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+     * @param hessians Receives one 3-by-3 Hessian per basis function.
+     */
     void evaluate_hessians(const math::Vector<double, 3>& xi,
                            std::vector<Hessian>& hessians) const final;
 
-    /// \brief Evaluate serendipity values, gradients, and Hessians together.
-    ///
-    /// \details This vector API is backed by the same span-based evaluator as
-    /// the assembly-oriented `*_to` methods, so topology-specific polynomial
-    /// setup can be shared for a quadrature point.
-    ///
-    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param values Receives one value per basis function.
-    /// \param gradients Receives one three-component gradient per basis function.
-    /// \param hessians Receives one 3-by-3 Hessian per basis function.
+    /**
+     * @brief Evaluate serendipity values, gradients, and Hessians together.
+     *
+     * @details This vector API is backed by the same span-based evaluator as
+     * the assembly-oriented `*_to` methods, so topology-specific polynomial
+     * setup can be shared for a quadrature point.
+     *
+     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+     * @param values Receives one value per basis function.
+     * @param gradients Receives one three-component gradient per basis function.
+     * @param hessians Receives one 3-by-3 Hessian per basis function.
+     */
     void evaluate_all(const math::Vector<double, 3>& xi,
                       std::vector<double>& values,
                       std::vector<Gradient>& gradients,
                       std::vector<Hessian>& hessians) const final;
 
-    /// \brief Evaluate serendipity basis values into caller-provided storage.
-    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param values_out Output span with at least size() entries.
+    /**
+     * @brief Evaluate serendipity basis values into caller-provided storage.
+     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+     * @param values_out Output span with at least size() entries.
+     */
     void evaluate_values_to(const math::Vector<double, 3>& xi,
                             std::span<double> values_out) const final;
 
-    /// \brief Evaluate serendipity basis gradients into caller-provided storage.
-    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param gradients_out Output span with at least size() entries.
+    /**
+     * @brief Evaluate serendipity basis gradients into caller-provided storage.
+     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+     * @param gradients_out Output span with at least size() entries.
+     */
     void evaluate_gradients_to(const math::Vector<double, 3>& xi,
                                std::span<Gradient> gradients_out) const final;
 
-    /// \brief Evaluate serendipity basis Hessians into caller-provided storage.
-    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param hessians_out Output span with at least size() entries.
+    /**
+     * @brief Evaluate serendipity basis Hessians into caller-provided storage.
+     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+     * @param hessians_out Output span with at least size() entries.
+     */
     void evaluate_hessians_to(const math::Vector<double, 3>& xi,
                               std::span<Hessian> hessians_out) const final;
 
@@ -227,7 +249,7 @@ class SerendipityBasis final : public BasisFunction {
                          std::span<Hessian> hessians_out) const;
 };
 
-/// @}
+/** @} */
 
 } // namespace basis
 } // namespace FE
diff --git a/Code/Source/solver/FE/Common/FEException.h b/Code/Source/solver/FE/Common/FEException.h
index 5fdb4b365..d8779aa22 100644
--- a/Code/Source/solver/FE/Common/FEException.h
+++ b/Code/Source/solver/FE/Common/FEException.h
@@ -22,31 +22,33 @@
 namespace svmp {
 namespace FE {
 
-/// \defgroup FE_CommonExceptions Exceptions
-/// \ingroup FE_Common
-/// \brief FE exception hierarchy.
-///
-/// \details All FE-specific exceptions derive from FEException, which itself
-/// derives from the shared solver ExceptionBase. Specialized subclasses carry
-/// structured context (element type, DOF index, backend name and error code,
-/// iteration counts, Jacobian determinants) so call sites can report
-/// actionable diagnostics.
-///
-/// Throw FE exceptions through the canonical core helpers in Core/Exception.h:
-///
-/// \code
-/// svmp::raise<ExceptionT>(SVMP_HERE, message);
-/// svmp::throw_if<ExceptionT>(failure_condition, SVMP_HERE, message);
-/// svmp::check_arg<ExceptionT>(valid_condition, SVMP_HERE, message);
-/// svmp::check_not_null<ExceptionT>(ptr, SVMP_HERE, message);
-/// svmp::check_index<ExceptionT>(index, size, SVMP_HERE);
-/// svmp::not_implemented<ExceptionT>(feature, SVMP_HERE);
-/// \endcode
-///
-/// throw_if() is failure-condition based. check_arg() is
-/// success-condition based. FE owns exception types; helper spelling is owned
-/// by the core layer.
-/// @{
+/**
+ * @defgroup FE_CommonExceptions Exceptions
+ * @ingroup FE_Common
+ * @brief FE exception hierarchy.
+ *
+ * @details All FE-specific exceptions derive from FEException, which itself
+ * derives from the shared solver ExceptionBase. Specialized subclasses carry
+ * structured context (element type, DOF index, backend name and error code,
+ * iteration counts, Jacobian determinants) so call sites can report
+ * actionable diagnostics.
+ *
+ * Throw FE exceptions through the canonical core helpers in Core/Exception.h:
+ *
+ * @code
+ * svmp::raise<ExceptionT>(SVMP_HERE, message);
+ * svmp::throw_if<ExceptionT>(failure_condition, SVMP_HERE, message);
+ * svmp::check_arg<ExceptionT>(valid_condition, SVMP_HERE, message);
+ * svmp::check_not_null<ExceptionT>(ptr, SVMP_HERE, message);
+ * svmp::check_index<ExceptionT>(index, size, SVMP_HERE);
+ * svmp::not_implemented<ExceptionT>(feature, SVMP_HERE);
+ * @endcode
+ *
+ * throw_if() is failure-condition based. check_arg() is
+ * success-condition based. FE owns exception types; helper spelling is owned
+ * by the core layer.
+ * @{
+ */
 
 /**
  * @brief Base exception type for errors originating in the FE library
@@ -57,12 +59,14 @@ namespace FE {
  */
 class FEException : public ExceptionBase {
 public:
-    /// @brief Construct with a message and optional status code and source location.
-    /// @param message Human-readable error description.
-    /// @param status Status code classifying the failure.
-    /// @param file Source file where the error was raised.
-    /// @param line Source line where the error was raised.
-    /// @param function Function where the error was raised.
+    /**
+     * @brief Construct with a message and optional status code and source location.
+     * @param message Human-readable error description.
+     * @param status Status code classifying the failure.
+     * @param file Source file where the error was raised.
+     * @param line Source line where the error was raised.
+     * @param function Function where the error was raised.
+     */
     FEException(const std::string& message,
                 StatusCode status = StatusCode::Unknown,
                 const char* file = "",
@@ -77,11 +81,13 @@ class FEException : public ExceptionBase {
     {
     }
 
-    /// @brief Construct with a message and source location, using an Unknown status.
-    /// @param message Human-readable error description.
-    /// @param file Source file where the error was raised.
-    /// @param line Source line where the error was raised.
-    /// @param function Function where the error was raised.
+    /**
+     * @brief Construct with a message and source location, using an Unknown status.
+     * @param message Human-readable error description.
+     * @param file Source file where the error was raised.
+     * @param line Source line where the error was raised.
+     * @param function Function where the error was raised.
+     */
     FEException(const std::string& message,
                 const char* file,
                 int line,
@@ -90,8 +96,10 @@ class FEException : public ExceptionBase {
     {
     }
 
-    /// @brief Status code classifying the failure.
-    /// @return The status code recorded at construction.
+    /**
+     * @brief Status code classifying the failure.
+     * @return The status code recorded at construction.
+     */
     StatusCode status() const noexcept { return status_code(); }
 };
 
@@ -100,11 +108,13 @@ class FEException : public ExceptionBase {
  */
 class InvalidArgumentException : public FEException {
 public:
-    /// @brief Construct with a message and optional source location.
-    /// @param message Human-readable error description.
-    /// @param file Source file where the error was raised.
-    /// @param line Source line where the error was raised.
-    /// @param function Function where the error was raised.
+    /**
+     * @brief Construct with a message and optional source location.
+     * @param message Human-readable error description.
+     * @param file Source file where the error was raised.
+     * @param line Source line where the error was raised.
+     * @param function Function where the error was raised.
+     */
     InvalidArgumentException(const std::string& message,
                              const char* file = "",
                              int line = 0,
@@ -122,12 +132,14 @@ class InvalidArgumentException : public FEException {
  */
 class InvalidElementException : public FEException {
 public:
-    /// @brief Construct with a message and optional element-type context.
-    /// @param message Human-readable error description.
-    /// @param element_type Name of the offending element type; appended to the message when non-empty.
-    /// @param file Source file where the error was raised.
-    /// @param line Source line where the error was raised.
-    /// @param function Function where the error was raised.
+    /**
+     * @brief Construct with a message and optional element-type context.
+     * @param message Human-readable error description.
+     * @param element_type Name of the offending element type; appended to the message when non-empty.
+     * @param file Source file where the error was raised.
+     * @param line Source line where the error was raised.
+     * @param function Function where the error was raised.
+     */
     InvalidElementException(const std::string& message,
                             std::string element_type = "",
                             const char* file = "",
@@ -142,8 +154,10 @@ class InvalidElementException : public FEException {
     {
     }
 
-    /// @brief Name of the offending element type.
-    /// @return Element-type name; empty when not provided.
+    /**
+     * @brief Name of the offending element type.
+     * @return Element-type name; empty when not provided.
+     */
     const std::string& element_type() const noexcept { return element_type_; }
 
 private:
@@ -167,12 +181,14 @@ class InvalidElementException : public FEException {
  */
 class DofException : public FEException {
 public:
-    /// @brief Construct with a message and optional DOF-index context.
-    /// @param message Human-readable error description.
-    /// @param dof_index Offending DOF index; appended to the message unless it equals invalid_dof_index().
-    /// @param file Source file where the error was raised.
-    /// @param line Source line where the error was raised.
-    /// @param function Function where the error was raised.
+    /**
+     * @brief Construct with a message and optional DOF-index context.
+     * @param message Human-readable error description.
+     * @param dof_index Offending DOF index; appended to the message unless it equals invalid_dof_index().
+     * @param file Source file where the error was raised.
+     * @param line Source line where the error was raised.
+     * @param function Function where the error was raised.
+     */
     DofException(const std::string& message,
                  long long dof_index = invalid_dof_index(),
                  const char* file = "",
@@ -187,11 +203,15 @@ class DofException : public FEException {
     {
     }
 
-    /// @brief Offending DOF index.
-    /// @return DOF index; invalid_dof_index() when not provided.
+    /**
+     * @brief Offending DOF index.
+     * @return DOF index; invalid_dof_index() when not provided.
+     */
     long long dof_index() const noexcept { return dof_index_; }
-    /// @brief Sentinel meaning "no DOF index attached".
-    /// @return The sentinel value -1.
+    /**
+     * @brief Sentinel meaning "no DOF index attached".
+     * @return The sentinel value -1.
+     */
     static constexpr long long invalid_dof_index() noexcept { return -1; }
 
 private:
@@ -213,11 +233,13 @@ class DofException : public FEException {
  */
 class AssemblyException : public FEException {
 public:
-    /// @brief Construct with a message and optional source location.
-    /// @param message Human-readable error description.
-    /// @param file Source file where the error was raised.
-    /// @param line Source line where the error was raised.
-    /// @param function Function where the error was raised.
+    /**
+     * @brief Construct with a message and optional source location.
+     * @param message Human-readable error description.
+     * @param file Source file where the error was raised.
+     * @param line Source line where the error was raised.
+     * @param function Function where the error was raised.
+     */
     AssemblyException(const std::string& message,
                       const char* file = "",
                       int line = 0,
@@ -235,13 +257,15 @@ class AssemblyException : public FEException {
  */
 class BackendException : public FEException {
 public:
-    /// @brief Construct with a message and optional backend context.
-    /// @param message Human-readable error description.
-    /// @param backend_name Name of the failing backend; appended to the message when non-empty.
-    /// @param error_code Backend-native error code; appended to the message when nonzero.
-    /// @param file Source file where the error was raised.
-    /// @param line Source line where the error was raised.
-    /// @param function Function where the error was raised.
+    /**
+     * @brief Construct with a message and optional backend context.
+     * @param message Human-readable error description.
+     * @param backend_name Name of the failing backend; appended to the message when non-empty.
+     * @param error_code Backend-native error code; appended to the message when nonzero.
+     * @param file Source file where the error was raised.
+     * @param line Source line where the error was raised.
+     * @param function Function where the error was raised.
+     */
     BackendException(const std::string& message,
                      std::string backend_name = "",
                      int error_code = 0,
@@ -258,11 +282,15 @@ class BackendException : public FEException {
     {
     }
 
-    /// @brief Name of the failing backend.
-    /// @return Backend name; empty when not provided.
+    /**
+     * @brief Name of the failing backend.
+     * @return Backend name; empty when not provided.
+     */
     const std::string& backend_name() const noexcept { return backend_name_; }
-    /// @brief Backend-native error code.
-    /// @return Error code; zero when not provided.
+    /**
+     * @brief Backend-native error code.
+     * @return Error code; zero when not provided.
+     */
     int error_code() const noexcept { return error_code_; }
 
 private:
@@ -297,11 +325,13 @@ class BackendException : public FEException {
  */
 class NotImplementedException : public FEException {
 public:
-    /// @brief Construct from the name of the missing feature.
-    /// @param feature Description of the unimplemented feature.
-    /// @param file Source file where the error was raised.
-    /// @param line Source line where the error was raised.
-    /// @param function Function where the error was raised.
+    /**
+     * @brief Construct from the name of the missing feature.
+     * @param feature Description of the unimplemented feature.
+     * @param file Source file where the error was raised.
+     * @param line Source line where the error was raised.
+     * @param function Function where the error was raised.
+     */
     NotImplementedException(const std::string& feature,
                             const char* file = "",
                             int line = 0,
@@ -320,11 +350,13 @@ class NotImplementedException : public FEException {
  */
 class NotInitializedException : public FEException {
 public:
-  /// @brief Construct from the name of the uninitialized feature.
-  /// @param feature Description of the missing initialization.
-  /// @param file Source file where the error was raised.
-  /// @param line Source line where the error was raised.
-  /// @param function Function where the error was raised.
+  /**
+   * @brief Construct from the name of the uninitialized feature.
+   * @param feature Description of the missing initialization.
+   * @param file Source file where the error was raised.
+   * @param line Source line where the error was raised.
+   * @param function Function where the error was raised.
+   */
   NotInitializedException(const std::string &feature,
                           const char *file,
                           int line = 0,
@@ -346,13 +378,15 @@ class NotInitializedException : public FEException {
  */
 class ConvergenceException : public FEException {
 public:
-    /// @brief Construct with a message and optional iteration context.
-    /// @param message Human-readable error description.
-    /// @param iteration Iteration at which the failure was detected; appended to the message when non-negative.
-    /// @param residual Final residual; appended to the message when positive.
-    /// @param file Source file where the error was raised.
-    /// @param line Source line where the error was raised.
-    /// @param function Function where the error was raised.
+    /**
+     * @brief Construct with a message and optional iteration context.
+     * @param message Human-readable error description.
+     * @param iteration Iteration at which the failure was detected; appended to the message when non-negative.
+     * @param residual Final residual; appended to the message when positive.
+     * @param file Source file where the error was raised.
+     * @param line Source line where the error was raised.
+     * @param function Function where the error was raised.
+     */
     ConvergenceException(const std::string& message,
                          int iteration = -1,
                          double residual = 0.0,
@@ -369,11 +403,15 @@ class ConvergenceException : public FEException {
     {
     }
 
-    /// @brief Iteration at which the failure was detected.
-    /// @return Iteration count; -1 when not provided.
+    /**
+     * @brief Iteration at which the failure was detected.
+     * @return Iteration count; -1 when not provided.
+     */
     int iteration() const noexcept { return iteration_; }
-    /// @brief Final residual value.
-    /// @return Residual; 0.0 when not provided.
+    /**
+     * @brief Final residual value.
+     * @return Residual; 0.0 when not provided.
+     */
     double residual() const noexcept { return residual_; }
 
 private:
@@ -405,12 +443,14 @@ class ConvergenceException : public FEException {
  */
 class SingularMappingException : public FEException {
 public:
-    /// @brief Construct with a message and the offending Jacobian determinant.
-    /// @param message Human-readable error description.
-    /// @param jacobian_det Jacobian determinant at the failure point; appended to the message.
-    /// @param file Source file where the error was raised.
-    /// @param line Source line where the error was raised.
-    /// @param function Function where the error was raised.
+    /**
+     * @brief Construct with a message and the offending Jacobian determinant.
+     * @param message Human-readable error description.
+     * @param jacobian_det Jacobian determinant at the failure point; appended to the message.
+     * @param file Source file where the error was raised.
+     * @param line Source line where the error was raised.
+     * @param function Function where the error was raised.
+     */
     SingularMappingException(const std::string& message,
                              double jacobian_det = 0.0,
                              const char* file = "",
@@ -425,8 +465,10 @@ class SingularMappingException : public FEException {
     {
     }
 
-    /// @brief Jacobian determinant at the failure point.
-    /// @return The determinant recorded at construction.
+    /**
+     * @brief Jacobian determinant at the failure point.
+     * @return The determinant recorded at construction.
+     */
     double jacobian_det() const noexcept { return jacobian_det_; }
 
 private:
@@ -439,7 +481,7 @@ class SingularMappingException : public FEException {
     double jacobian_det_ = 0.0;
 };
 
-/// @}
+/** @} */
 
 } // namespace FE
 } // namespace svmp
diff --git a/Code/Source/solver/FE/Common/Types.h b/Code/Source/solver/FE/Common/Types.h
index f085e7e8f..d90388114 100644
--- a/Code/Source/solver/FE/Common/Types.h
+++ b/Code/Source/solver/FE/Common/Types.h
@@ -16,18 +16,18 @@
 
 #if defined(SVMP_FE_WITH_MESH) && SVMP_FE_WITH_MESH
 #  include "Mesh/Core/MeshTypes.h"
-/// Nonzero when FE shares scalar/index types with the Mesh library.
+/** Nonzero when FE shares scalar/index types with the Mesh library. */
 #  define SVMP_FE_HAS_MESH_TYPES 1
 #else
 // Build FE without Mesh types unless explicitly enabled.
-/// Nonzero when FE shares scalar/index types with the Mesh library.
+/** Nonzero when FE shares scalar/index types with the Mesh library. */
 #  define SVMP_FE_HAS_MESH_TYPES 0
 #endif
 
 #if !SVMP_FE_HAS_MESH_TYPES
 namespace svmp {
 #ifndef SVMP_CELL_FAMILY_DEFINED
-/// Guard marking that svmp::CellFamily has been defined.
+/** Guard marking that svmp::CellFamily has been defined. */
 #define SVMP_CELL_FAMILY_DEFINED 1
 /**
  * @brief Minimal fallback for svmp::CellFamily when the Mesh library is unavailable
@@ -58,29 +58,33 @@ enum class CellFamily {
 #include <type_traits>
 #include <limits>
 
-/// \defgroup FE_Common Common
-/// \ingroup FE
-/// \brief Shared vocabulary types, constants, and exception infrastructure used by every FE module.
-///
-/// \details The Common module collects the foundational definitions that the
-/// rest of the FE library builds on: index and scalar type aliases; element,
-/// basis, quadrature, and field enumerations; sentinel constants and strong
-/// type wrappers; and the FE exception hierarchy together with its
-/// argument-checking helpers.
+/**
+ * @defgroup FE_Common Common
+ * @ingroup FE
+ * @brief Shared vocabulary types, constants, and exception infrastructure used by every FE module.
+ *
+ * @details The Common module collects the foundational definitions that the
+ * rest of the FE library builds on: index and scalar type aliases; element,
+ * basis, quadrature, and field enumerations; sentinel constants and strong
+ * type wrappers; and the FE exception hierarchy together with its
+ * argument-checking helpers.
+ */
 
 namespace svmp {
 namespace FE {
 
-/// \defgroup FE_CommonTypes Types
-/// \ingroup FE_Common
-/// \brief Core type aliases, enumerations, constants, geometric types, and compile-time traits.
-///
-/// \details This group documents the index and identifier types used for
-/// element-local and global numbering, the element/basis/quadrature/field
-/// enumerations shared across modules, sentinel constants, reference- and
-/// physical-space geometric aliases, and the strong-type utilities that
-/// prevent accidental mixing of conceptually distinct values.
-/// @{
+/**
+ * @defgroup FE_CommonTypes Types
+ * @ingroup FE_Common
+ * @brief Core type aliases, enumerations, constants, geometric types, and compile-time traits.
+ *
+ * @details This group documents the index and identifier types used for
+ * element-local and global numbering, the element/basis/quadrature/field
+ * enumerations shared across modules, sentinel constants, reference- and
+ * physical-space geometric aliases, and the strong-type utilities that
+ * prevent accidental mixing of conceptually distinct values.
+ * @{
+ */
 
 // ============================================================================
 // Index Types
@@ -111,14 +115,20 @@ using GlobalIndex = std::int64_t;
 struct DofIndex {
     GlobalIndex value;  ///< Underlying global DOF index; negative values are invalid.
 
-    /// @brief Construct a DOF index, defaulting to the invalid sentinel.
-    /// @param v Global DOF index value.
+    /**
+     * @brief Construct a DOF index, defaulting to the invalid sentinel.
+     * @param v Global DOF index value.
+     */
     constexpr explicit DofIndex(GlobalIndex v = -1) noexcept : value(v) {}
-    /// @brief Convert to the underlying global index value.
-    /// @return The stored global index.
+    /**
+     * @brief Convert to the underlying global index value.
+     * @return The stored global index.
+     */
     constexpr operator GlobalIndex() const noexcept { return value; }
-    /// @brief Check whether this index refers to a valid DOF.
-    /// @return True when the stored value is non-negative.
+    /**
+     * @brief Check whether this index refers to a valid DOF.
+     * @return True when the stored value is non-negative.
+     */
     constexpr bool is_valid() const noexcept { return value >= 0; }
 };
 
@@ -149,17 +159,19 @@ using MeshGlobalId = std::int64_t;      ///< Global mesh entity identifier.
 // Constants
 // ============================================================================
 
-/// Sentinel for an unset or out-of-range local index.
+/** Sentinel for an unset or out-of-range local index. */
 constexpr LocalIndex INVALID_LOCAL_INDEX = std::numeric_limits<LocalIndex>::max();
-/// Sentinel for an unset or out-of-range global index.
+/** Sentinel for an unset or out-of-range global index. */
 constexpr GlobalIndex INVALID_GLOBAL_INDEX = -1;
-/// Sentinel FieldId meaning "uninitialized / no field".
+/** Sentinel FieldId meaning "uninitialized / no field". */
 constexpr FieldId INVALID_FIELD_ID = std::numeric_limits<FieldId>::max();
-/// Sentinel FieldId for geometry-only quantities (no DOF dependence).
-/// Uses first registered field's space for quadrature, but logically decoupled
-/// from any specific field's DOFs.
+/**
+ * Sentinel FieldId for geometry-only quantities (no DOF dependence).
+ * Uses first registered field's space for quadrature, but logically decoupled
+ * from any specific field's DOFs.
+ */
 constexpr FieldId GEOMETRY_FIELD_ID = std::numeric_limits<FieldId>::max() - 1;
-/// Sentinel for an unset or out-of-range block identifier.
+/** Sentinel for an unset or out-of-range block identifier. */
 constexpr BlockId INVALID_BLOCK_ID = std::numeric_limits<BlockId>::max();
 
 /**
@@ -177,17 +189,17 @@ constexpr BlockId INVALID_BLOCK_ID = std::numeric_limits<BlockId>::max();
  */
 constexpr FieldId CURRENT_SOLUTION_FIELD_ID = std::numeric_limits<FieldId>::max();
 
-/// Preferred cache-line/SIMD alignment for performance-critical arrays.
+/** Preferred cache-line/SIMD alignment for performance-critical arrays. */
 inline constexpr std::size_t kFEPreferredAlignmentBytes = 64u;
 
-/// Alignment for small fixed-size math objects that are commonly passed by value.
+/** Alignment for small fixed-size math objects that are commonly passed by value. */
 inline constexpr std::size_t kFEFixedObjectAlignmentBytes = 32u;
 
 // ============================================================================
 // Field Value Entry (for point evaluation of field-dependent expressions)
 // ============================================================================
 
-/// Maximum number of components in a FieldValueEntry (3x3 tensor).
+/** Maximum number of components in a FieldValueEntry (3x3 tensor). */
 constexpr int MAX_FIELD_VALUE_COMPONENTS = 9;
 
 /**
@@ -347,44 +359,58 @@ using Jacobian = std::array<std::array<double, static_cast<std::size_t>(Referenc
 template<typename T, typename Tag>
 class StrongType {
 public:
-    /// @brief Underlying value type.
+    /** @brief Underlying value type. */
     using ValueType = T;
 
-    /// @brief Value-initialize the wrapped value.
+    /** @brief Value-initialize the wrapped value. */
     constexpr StrongType() noexcept(std::is_nothrow_default_constructible_v<T>)
         : value_{} {}
 
-    /// @brief Wrap an explicit value.
-    /// @param value Value to store.
+    /**
+     * @brief Wrap an explicit value.
+     * @param value Value to store.
+     */
     constexpr explicit StrongType(T value) noexcept(std::is_nothrow_move_constructible_v<T>)
         : value_(std::move(value)) {}
 
-    /// @brief Access the wrapped value.
-    /// @return Reference to the wrapped value.
+    /**
+     * @brief Access the wrapped value.
+     * @return Reference to the wrapped value.
+     */
     constexpr T& get() noexcept { return value_; }
-    /// @brief Access the wrapped value.
-    /// @return Reference to the wrapped value.
+    /**
+     * @brief Access the wrapped value.
+     * @return Reference to the wrapped value.
+     */
     constexpr const T& get() const noexcept { return value_; }
 
-    /// @brief Explicitly convert back to the underlying type.
-    /// @return Copy of the wrapped value.
+    /**
+     * @brief Explicitly convert back to the underlying type.
+     * @return Copy of the wrapped value.
+     */
     constexpr explicit operator T() const noexcept { return value_; }
 
-    /// @brief Compare wrapped values for equality.
-    /// @param other Wrapper to compare against.
-    /// @return True when the wrapped values are equal.
+    /**
+     * @brief Compare wrapped values for equality.
+     * @param other Wrapper to compare against.
+     * @return True when the wrapped values are equal.
+     */
     constexpr bool operator==(const StrongType& other) const noexcept {
         return value_ == other.value_;
     }
-    /// @brief Compare wrapped values for inequality.
-    /// @param other Wrapper to compare against.
-    /// @return True when the wrapped values differ.
+    /**
+     * @brief Compare wrapped values for inequality.
+     * @param other Wrapper to compare against.
+     * @return True when the wrapped values differ.
+     */
     constexpr bool operator!=(const StrongType& other) const noexcept {
         return value_ != other.value_;
     }
-    /// @brief Order by wrapped value.
-    /// @param other Wrapper to compare against.
-    /// @return True when this wrapped value orders before the other.
+    /**
+     * @brief Order by wrapped value.
+     * @param other Wrapper to compare against.
+     * @return True when this wrapped value orders before the other.
+     */
     constexpr bool operator<(const StrongType& other) const noexcept {
         return value_ < other.value_;
     }
@@ -399,9 +425,9 @@ struct QuadratureWeightTag {};  ///< Tag type for quadrature weights.
 struct BasisValueTag {};        ///< Tag type for basis-function values.
 struct BasisGradientTag {};     ///< Tag type for basis-function gradients.
 
-/// Type-safe index of a quadrature point within a rule.
+/** Type-safe index of a quadrature point within a rule. */
 using QuadraturePointIndex = StrongType<LocalIndex, QuadraturePointTag>;
-/// Type-safe quadrature weight value.
+/** Type-safe quadrature weight value. */
 using QuadratureWeight = StrongType<double, QuadratureWeightTag>;
 
 // ============================================================================
@@ -423,7 +449,7 @@ struct is_index_type<GlobalIndex> : std::true_type {};
 template<>
 struct is_index_type<DofIndex> : std::true_type {};
 
-/// Convenience variable template for is_index_type.
+/** Convenience variable template for is_index_type. */
 template<typename T>
 inline constexpr bool is_index_type_v = is_index_type<T>::value;
 
@@ -436,7 +462,7 @@ struct is_field_type : std::false_type {};
 template<>
 struct is_field_type<FieldType> : std::true_type {};
 
-/// Convenience variable template for is_field_type.
+/** Convenience variable template for is_field_type. */
 template<typename T>
 inline constexpr bool is_field_type_v = is_field_type<T>::value;
 
@@ -526,7 +552,7 @@ constexpr int element_dimension(ElementType elem) noexcept {
     }
 }
 
-/// @}
+/** @} */
 
 } // namespace FE
 } // namespace svmp
diff --git a/Code/Source/solver/FE/FE.h b/Code/Source/solver/FE/FE.h
index 1d3bba72b..125660942 100644
--- a/Code/Source/solver/FE/FE.h
+++ b/Code/Source/solver/FE/FE.h
@@ -4,19 +4,23 @@
 #ifndef SVMP_FE_FE_H
 #define SVMP_FE_FE_H
 
-/// \file FE.h
-/// \brief Library-level Doxygen group for the finite-element support code.
-///
-/// This header intentionally contains no declarations. It gives Doxygen a
-/// header-based home for the top-level FE group; submodule groups attach to it
-/// from their own headers, including FE_Basis (Basis/BasisFunction.h),
-/// FE_Common (Common/Types.h), and FE_Math (Math/Vector.h).
+/**
+ * @file FE.h
+ * @brief Library-level Doxygen group for the finite-element support code.
+ *
+ * This header intentionally contains no declarations. It gives Doxygen a
+ * header-based home for the top-level FE group; submodule groups attach to it
+ * from their own headers, including FE_Basis (Basis/BasisFunction.h),
+ * FE_Common (Common/Types.h), and FE_Math (Math/Vector.h).
+ */
 
-/// \defgroup FE FE Library
-/// \brief Finite-element interfaces and utilities used by the solver.
-///
-/// The FE library groups basis functions, math utilities, assembly interfaces,
-/// and related support code that can be built and consumed as a coherent
-/// finite-element component.
+/**
+ * @defgroup FE FE Library
+ * @brief Finite-element interfaces and utilities used by the solver.
+ *
+ * The FE library groups basis functions, math utilities, assembly interfaces,
+ * and related support code that can be built and consumed as a coherent
+ * finite-element component.
+ */
 
 #endif // SVMP_FE_FE_H
diff --git a/Code/Source/solver/FE/Math/Matrix.h b/Code/Source/solver/FE/Math/Matrix.h
index ce1d4a612..90aa7681a 100644
--- a/Code/Source/solver/FE/Math/Matrix.h
+++ b/Code/Source/solver/FE/Math/Matrix.h
@@ -22,9 +22,11 @@
 
 #include <cstddef>
 
-/// \defgroup FE_MatrixMath Matrix
-/// \ingroup FE_Math
-/// \brief Fixed-size matrix type aliases.
+/**
+ * @defgroup FE_MatrixMath Matrix
+ * @ingroup FE_Math
+ * @brief Fixed-size matrix type aliases.
+ */
 
 namespace svmp {
 namespace FE {
diff --git a/Code/Source/solver/FE/Math/Vector.h b/Code/Source/solver/FE/Math/Vector.h
index b234bac49..41466e9db 100644
--- a/Code/Source/solver/FE/Math/Vector.h
+++ b/Code/Source/solver/FE/Math/Vector.h
@@ -19,17 +19,19 @@
 
 #include <cstddef>
 
-/// \defgroup FE_Math Math
-/// \ingroup FE
-/// \brief Linear algebra vocabulary types and dense utilities for finite-element computations.
-///
-/// \details The Math module defines the fixed-size vector and matrix types
-/// used in element-level kernels (as aliases of Eigen types) and dense linear
-/// algebra utilities used by basis construction and local transforms.
-///
-/// \defgroup FE_VectorMath Vector
-/// \ingroup FE_Math
-/// \brief Fixed-size vector type aliases.
+/**
+ * @defgroup FE_Math Math
+ * @ingroup FE
+ * @brief Linear algebra vocabulary types and dense utilities for finite-element computations.
+ *
+ * @details The Math module defines the fixed-size vector and matrix types
+ * used in element-level kernels (as aliases of Eigen types) and dense linear
+ * algebra utilities used by basis construction and local transforms.
+ *
+ * @defgroup FE_VectorMath Vector
+ * @ingroup FE_Math
+ * @brief Fixed-size vector type aliases.
+ */
 
 namespace svmp {
 namespace FE {

From d56d39091041010d8c28ff685b82733740f1656a Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Tue, 23 Jun 2026 16:21:31 -0700
Subject: [PATCH 49/91] Hex8 and Hex20 constructor branch now validates the
 element/order pairing

---
 .../Source/solver/FE/Basis/SerendipityBasis.cpp | 17 +++++++++--------
 .../unitTests/FE/Basis/test_BasisErrorPaths.cpp |  6 ++++++
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index 141b0df77..f73557473 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -396,15 +396,16 @@ SerendipityBasis::SerendipityBasis(ElementType type, int order)
         quad_inv_vandermonde_ = quad_serendipity_inverse_vandermonde(nodes_, quad_monomial_exponents_, order_);
     } else if (type == ElementType::Hex8 || type == ElementType::Hex20) {
         dimension_ = 3;
-        if (order_ < 1) order_ = 1;
-        if (order_ == 1) {
-            size_ = 8;
-        } else if (order_ == 2) {
-            size_ = 20;
-        } else {
-            svmp::raise<BasisConfigurationException>(SVMP_HERE,
-                "SerendipityBasis supports up to quadratic on hexahedra");
+        if (order_ < 1) {
+            order_ = 1;
         }
+        svmp::throw_if<BasisConfigurationException>(
+            type == ElementType::Hex8 && order_ != 1, SVMP_HERE,
+            "SerendipityBasis: Hex8 is the trilinear 8-node basis (order 1 only); use Hex20 for quadratic serendipity");
+        svmp::throw_if<BasisConfigurationException>(
+            type == ElementType::Hex20 && order_ != 2, SVMP_HERE,
+            "SerendipityBasis: Hex20 is the 20-node quadratic serendipity layout (order 2 only)");
+        size_ = (type == ElementType::Hex8) ? 8u : 20u;
     } else if (type == ElementType::Wedge15) {
         dimension_ = 3;
         if (order_ < 2) {
diff --git a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
index 05b657a7a..2699da052 100644
--- a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
@@ -186,6 +186,12 @@ TEST(BasisErrorPaths, SerendipityInvalidRequestsThrowBasisExceptions) {
                  BasisElementCompatibilityException);
     EXPECT_THROW(SerendipityBasis(ElementType::Pyramid14, 2),
                  BasisElementCompatibilityException);
+    EXPECT_THROW(SerendipityBasis(ElementType::Hex8, 2),
+                 BasisConfigurationException);
+    EXPECT_THROW(SerendipityBasis(ElementType::Hex20, 1),
+                 BasisConfigurationException);
+    EXPECT_THROW(SerendipityBasis(ElementType::Hex20, 3),
+                 BasisConfigurationException);
 }
 
 TEST(BasisErrorPaths, BasisFactoryRejectsNonC0Continuity) {

From edd52b7359e32514ef5afc94ccd0a0cd136d9404 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Tue, 23 Jun 2026 23:02:21 -0700
Subject: [PATCH 50/91] remove FP processing and passing indexing directly for
 lagrange basis functions

---
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp | 136 +++----
 .../FE/Basis/NodeOrderingConventions.cpp      | 357 +++++++++++++-----
 .../solver/FE/Basis/NodeOrderingConventions.h |  37 ++
 .../unitTests/FE/Basis/test_LagrangeBasis.cpp |  66 ++++
 4 files changed, 416 insertions(+), 180 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index 42f052579..cdb1eeed4 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -6,7 +6,6 @@
 
 #include <algorithm>
 #include <array>
-#include <cmath>
 #include <limits>
 #include <span>
 #include <string>
@@ -79,59 +78,22 @@ NormalizedLagrangeRequest normalize_lagrange_request(ElementType element_type, i
     return {canonical, normalized_order};
 }
 
-// Convert a coordinate on [-1, 1] to an equispaced axis node index.
-std::size_t axis_index_pm_one(double coord, int order) {
-    if (order <= 0) {
-        return 0u;
-    }
-    const double scaled = (coord + double(1)) * double(order) / double(2);
-    const long long rounded = std::llround(scaled);
-    svmp::throw_if<BasisConstructionException>(
-        rounded < 0 || rounded > static_cast<long long>(order) ||
-            !detail::basis_nearly_equal(scaled, static_cast<double>(rounded)),
-        SVMP_HERE,
-        "LagrangeBasis: tensor-product node coordinate is off the equispaced lattice");
-    return static_cast<std::size_t>(rounded);
-}
-
-// Convert a simplex barycentric coordinate to a lattice index.
-int simplex_lattice_index(double value, int order) {
-    if (order <= 0) {
-        return 0;
-    }
-    const double scaled = value * double(order);
-    const long long rounded = std::llround(scaled);
-    svmp::throw_if<BasisConstructionException>(
-        rounded < 0 || rounded > static_cast<long long>(order) ||
-            !detail::basis_nearly_equal(scaled, static_cast<double>(rounded)),
-        SVMP_HERE,
-        "LagrangeBasis: simplex node coordinate is off the lattice");
-    return static_cast<int>(rounded);
-}
-
-// Compute simplex interpolation exponents from a reference node.
-LagrangeBasis::SimplexExponent simplex_exponent_from_point(const Vec3& p,
-                                                           BasisTopology top,
-                                                           int order) {
+// Convert an integer lattice index (i, j[, k]) into the barycentric exponent
+// tuple (order - i - j - k, i, j, k). The lattice already carries the exact
+// coordinate indices, so no floating-point round-trip is needed; the accessor's
+// structural invariants guarantee i + j + k <= order, hence e[0] >= 0.
+LagrangeBasis::SimplexExponent simplex_exponent_from_lattice(const std::array<int, 3>& idx,
+                                                            BasisTopology top,
+                                                            int order) {
     LagrangeBasis::SimplexExponent e{0, 0, 0, 0};
-    if (order <= 0) {
-        return e;
-    }
-    if (top == BasisTopology::Triangle) {
-        e[1] = simplex_lattice_index(p[0], order);
-        e[2] = simplex_lattice_index(p[1], order);
-        e[0] = order - e[1] - e[2];
+    e[1] = idx[0];
+    e[2] = idx[1];
+    if (top == BasisTopology::Tetrahedron) {
+        e[3] = idx[2];
+        e[0] = order - idx[0] - idx[1] - idx[2];
     } else {
-        e[1] = simplex_lattice_index(p[0], order);
-        e[2] = simplex_lattice_index(p[1], order);
-        e[3] = simplex_lattice_index(p[2], order);
-        e[0] = order - e[1] - e[2] - e[3];
+        e[0] = order - idx[0] - idx[1];
     }
-    // e[0] is order minus the other exponents, so the exponents sum to order by
-    // construction; a negative e[0] means the node coordinates are off-lattice.
-    svmp::throw_if<BasisConstructionException>(
-        e[0] < 0, SVMP_HERE,
-        "LagrangeBasis: simplex node coordinate yields a negative implied exponent");
     return e;
 }
 
@@ -389,52 +351,62 @@ void LagrangeBasis::build_point_nodes() {
 // Build nodes and axis indices for tensor-product elements.
 void LagrangeBasis::build_tensor_product_nodes() {
     init_equispaced_1d_nodes();
-    nodes_ = ReferenceNodeLayout::get_lagrange_node_coords(element_type_, order_);
-    tensor_indices_.reserve(nodes_.size());
-    for (const auto& node : nodes_) {
-        TensorNodeIndex idx{0u, 0u, 0u};
-        idx[0] = axis_index_pm_one(node[0], order_);
-        if (dimension_ >= 2) {
-            idx[1] = axis_index_pm_one(node[1], order_);
-        }
-        if (dimension_ >= 3) {
-            idx[2] = axis_index_pm_one(node[2], order_);
-        }
-        tensor_indices_.push_back(idx);
+    const auto layout = ReferenceNodeLayout::get_lagrange_lattice(element_type_, order_);
+    nodes_ = layout.coords;
+    tensor_indices_.reserve(layout.lattice.size());
+    for (const auto& idx : layout.lattice) {
+        // The lattice already holds the per-axis equispaced node index (unused
+        // axes are zero), so no coordinate-to-index inversion is needed.
+        tensor_indices_.push_back(TensorNodeIndex{
+            static_cast<std::size_t>(idx[0]),
+            static_cast<std::size_t>(idx[1]),
+            static_cast<std::size_t>(idx[2])});
     }
 }
 
 // Build nodes and barycentric exponents for simplex elements.
 void LagrangeBasis::build_simplex_nodes() {
-    nodes_ = ReferenceNodeLayout::get_lagrange_node_coords(element_type_, order_);
-    simplex_exponents_.reserve(nodes_.size());
-    for (const auto& node : nodes_) {
-        simplex_exponents_.push_back(simplex_exponent_from_point(node, topology_, order_));
+    const auto layout = ReferenceNodeLayout::get_lagrange_lattice(element_type_, order_);
+    nodes_ = layout.coords;
+    simplex_exponents_.reserve(layout.lattice.size());
+    for (const auto& idx : layout.lattice) {
+        simplex_exponents_.push_back(simplex_exponent_from_lattice(idx, topology_, order_));
     }
 }
 
 // Build nodes and mixed triangle-axis lookup data for wedge elements.
 void LagrangeBasis::build_wedge_nodes() {
     init_equispaced_1d_nodes();
-    nodes_ = ReferenceNodeLayout::get_lagrange_node_coords(element_type_, order_);
-    const auto tri_nodes =
-        ReferenceNodeLayout::get_lagrange_node_coords(ElementType::Triangle3, order_);
-    simplex_exponents_.reserve(tri_nodes.size());
-    for (const auto& tri_node : tri_nodes) {
+    const auto layout = ReferenceNodeLayout::get_lagrange_lattice(element_type_, order_);
+    nodes_ = layout.coords;
+
+    const auto tri_layout =
+        ReferenceNodeLayout::get_lagrange_lattice(ElementType::Triangle3, order_);
+    simplex_exponents_.reserve(tri_layout.lattice.size());
+    for (const auto& idx : tri_layout.lattice) {
         simplex_exponents_.push_back(
-            simplex_exponent_from_point(tri_node, BasisTopology::Triangle, order_));
+            simplex_exponent_from_lattice(idx, BasisTopology::Triangle, order_));
+    }
+
+    // Map a triangle cross-section lattice (i, j) to its triangle-node ordinal
+    // through the flat key i * (order + 1) + j, so each wedge node's triangle
+    // index is an exact integer lookup.
+    const int stride = order_ + 1;
+    std::vector<int> tri_ordinal_for_key(static_cast<std::size_t>(stride * stride), -1);
+    for (std::size_t t = 0; t < tri_layout.lattice.size(); ++t) {
+        const auto& idx = tri_layout.lattice[t];
+        tri_ordinal_for_key[static_cast<std::size_t>(idx[0] * stride + idx[1])] =
+            static_cast<int>(t);
     }
 
-    wedge_indices_.reserve(nodes_.size());
-    for (const auto& node : nodes_) {
-        const auto tri_exp =
-            simplex_exponent_from_point(node, BasisTopology::Triangle, order_);
-        auto it = std::find(simplex_exponents_.begin(), simplex_exponents_.end(), tri_exp);
-        svmp::throw_if<BasisConstructionException>(it == simplex_exponents_.end(), SVMP_HERE,
+    wedge_indices_.reserve(layout.lattice.size());
+    for (const auto& idx : layout.lattice) {
+        const int tri_ordinal =
+            tri_ordinal_for_key[static_cast<std::size_t>(idx[0] * stride + idx[1])];
+        svmp::throw_if<BasisConstructionException>(tri_ordinal < 0, SVMP_HERE,
                                                  "LagrangeBasis: wedge node triangle index lookup failed");
-        const std::size_t tri_index =
-            static_cast<std::size_t>(std::distance(simplex_exponents_.begin(), it));
-        wedge_indices_.push_back({tri_index, axis_index_pm_one(node[2], order_)});
+        wedge_indices_.push_back({static_cast<std::size_t>(tri_ordinal),
+                                  static_cast<std::size_t>(idx[2])});
     }
 }
 
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
index b09892507..bf488da5f 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
@@ -14,6 +14,7 @@ namespace basis {
 namespace {
 
 using Point = math::Vector<double, 3>;
+using Lattice = std::array<int, 3>;
 
 // Maps public Hex20 ReferenceNodeLayout slots to the internal coefficient-table
 // basis columns used by kHex20Coefficients. Wedge15 and quadrilateral
@@ -33,103 +34,176 @@ double line_coord_zero_one(int i, int order) {
     return static_cast<double>(i) / static_cast<double>(order);
 }
 
-void append_triangle_face_interior(std::vector<Point>& nodes,
+// Interpolate an integer lattice index along an edge between two corner
+// vertices: index = (LA * (order - m) + LB * m) / order. The division is exact
+// because edge endpoints are element corners (each component is 0 or order), so
+// the result is the integer lattice point at parameter m / order.
+Lattice lerp_lattice(const Lattice& a, const Lattice& b, int m, int order) {
+    Lattice result{0, 0, 0};
+    for (std::size_t d = 0; d < 3u; ++d) {
+        const int numerator = a[d] * (order - m) + b[d] * m;
+        svmp::throw_if<BasisConstructionException>(
+            numerator % order != 0, SVMP_HERE,
+            "ReferenceNodeLayout: non-integral edge lattice index");
+        result[d] = numerator / order;
+    }
+    return result;
+}
+
+// Barycentric combination of three corner lattice indices for a triangular
+// face-interior node: index = (a * L0 + b * L1 + c * L2) / order, with
+// a + b + c == order. Exact for corner inputs (components 0 or order).
+Lattice combine_lattice(const Lattice& l0, const Lattice& l1, const Lattice& l2,
+                        int a, int b, int c, int order) {
+    Lattice result{0, 0, 0};
+    for (std::size_t d = 0; d < 3u; ++d) {
+        const int numerator = a * l0[d] + b * l1[d] + c * l2[d];
+        svmp::throw_if<BasisConstructionException>(
+            numerator % order != 0, SVMP_HERE,
+            "ReferenceNodeLayout: non-integral face-interior lattice index");
+        result[d] = numerator / order;
+    }
+    return result;
+}
+
+// Append the interior nodes of a triangular face spanned by v0, v1, v2 (with
+// matching corner lattice indices l0, l1, l2), emitting both the coordinate and
+// its integer lattice index. Shared by triangle interiors, tetra faces, and the
+// two wedge caps.
+void append_triangle_face_interior(LagrangeNodeLayout& out,
                                    const Point& v0,
                                    const Point& v1,
                                    const Point& v2,
+                                   const Lattice& l0,
+                                   const Lattice& l1,
+                                   const Lattice& l2,
                                    int order) {
     for (int c = 1; c <= order - 2; ++c) {
         for (int b = 1; b <= order - c - 1; ++b) {
             const int a = order - b - c;
             const double inv = double(1) / double(order);
-            nodes.push_back(v0 * (double(a) * inv) +
-                            v1 * (double(b) * inv) +
-                            v2 * (double(c) * inv));
+            out.coords.push_back(v0 * (double(a) * inv) +
+                                 v1 * (double(b) * inv) +
+                                 v2 * (double(c) * inv));
+            out.lattice.push_back(combine_lattice(l0, l1, l2, a, b, c, order));
         }
     }
 }
 
-std::vector<Point> generate_line_nodes(int order) {
+LagrangeNodeLayout generate_line_nodes(int order) {
+    LagrangeNodeLayout out;
     if (order == 0) {
-        return {Point{double(0), double(0), double(0)}};
+        out.coords.push_back(Point{double(0), double(0), double(0)});
+        out.lattice.push_back(Lattice{0, 0, 0});
+        return out;
     }
 
-    std::vector<Point> nodes;
-    nodes.reserve(static_cast<std::size_t>(order + 1));
-    nodes.push_back(Point{double(-1), double(0), double(0)});
-    nodes.push_back(Point{double(1), double(0), double(0)});
+    out.coords.reserve(static_cast<std::size_t>(order + 1));
+    out.lattice.reserve(static_cast<std::size_t>(order + 1));
+    out.coords.push_back(Point{double(-1), double(0), double(0)});
+    out.lattice.push_back(Lattice{0, 0, 0});
+    out.coords.push_back(Point{double(1), double(0), double(0)});
+    out.lattice.push_back(Lattice{order, 0, 0});
     for (int i = 1; i < order; ++i) {
-        nodes.push_back(Point{line_coord_pm_one(i, order), double(0), double(0)});
+        out.coords.push_back(Point{line_coord_pm_one(i, order), double(0), double(0)});
+        out.lattice.push_back(Lattice{i, 0, 0});
     }
-    return nodes;
+    return out;
 }
 
-std::vector<Point> generate_triangle_nodes(int order) {
+LagrangeNodeLayout generate_triangle_nodes(int order) {
+    LagrangeNodeLayout out;
     if (order == 0) {
-        return {Point{double(1) / double(3), double(1) / double(3), double(0)}};
+        out.coords.push_back(Point{double(1) / double(3), double(1) / double(3), double(0)});
+        out.lattice.push_back(Lattice{0, 0, 0});
+        return out;
     }
 
-    std::vector<Point> nodes;
-    nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 2) / 2));
-    nodes.push_back(Point{double(0), double(0), double(0)});
-    nodes.push_back(Point{double(1), double(0), double(0)});
-    nodes.push_back(Point{double(0), double(1), double(0)});
+    out.coords.reserve(static_cast<std::size_t>((order + 1) * (order + 2) / 2));
+    out.lattice.reserve(static_cast<std::size_t>((order + 1) * (order + 2) / 2));
+    out.coords.push_back(Point{double(0), double(0), double(0)});
+    out.lattice.push_back(Lattice{0, 0, 0});
+    out.coords.push_back(Point{double(1), double(0), double(0)});
+    out.lattice.push_back(Lattice{order, 0, 0});
+    out.coords.push_back(Point{double(0), double(1), double(0)});
+    out.lattice.push_back(Lattice{0, order, 0});
 
     for (int m = 1; m < order; ++m) {
-        nodes.push_back(Point{line_coord_zero_one(m, order), double(0), double(0)});
+        out.coords.push_back(Point{line_coord_zero_one(m, order), double(0), double(0)});
+        out.lattice.push_back(Lattice{m, 0, 0});
     }
     for (int m = 1; m < order; ++m) {
-        nodes.push_back(Point{line_coord_zero_one(order - m, order),
-                              line_coord_zero_one(m, order), double(0)});
+        out.coords.push_back(Point{line_coord_zero_one(order - m, order),
+                                   line_coord_zero_one(m, order), double(0)});
+        out.lattice.push_back(Lattice{order - m, m, 0});
     }
     for (int m = 1; m < order; ++m) {
-        nodes.push_back(Point{double(0), line_coord_zero_one(order - m, order), double(0)});
+        out.coords.push_back(Point{double(0), line_coord_zero_one(order - m, order), double(0)});
+        out.lattice.push_back(Lattice{0, order - m, 0});
     }
 
-    append_triangle_face_interior(nodes,
+    append_triangle_face_interior(out,
                                   Point{double(0), double(0), double(0)},
                                   Point{double(1), double(0), double(0)},
                                   Point{double(0), double(1), double(0)},
+                                  Lattice{0, 0, 0},
+                                  Lattice{order, 0, 0},
+                                  Lattice{0, order, 0},
                                   order);
-    return nodes;
+    return out;
 }
 
-std::vector<Point> generate_quad_nodes(int order) {
+LagrangeNodeLayout generate_quad_nodes(int order) {
+    LagrangeNodeLayout out;
     if (order == 0) {
-        return {Point{double(0), double(0), double(0)}};
-    }
-
-    std::vector<Point> nodes;
-    nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 1)));
-    nodes.push_back(Point{double(-1), double(-1), double(0)});
-    nodes.push_back(Point{double(1), double(-1), double(0)});
-    nodes.push_back(Point{double(1), double(1), double(0)});
-    nodes.push_back(Point{double(-1), double(1), double(0)});
+        out.coords.push_back(Point{double(0), double(0), double(0)});
+        out.lattice.push_back(Lattice{0, 0, 0});
+        return out;
+    }
+
+    out.coords.reserve(static_cast<std::size_t>((order + 1) * (order + 1)));
+    out.lattice.reserve(static_cast<std::size_t>((order + 1) * (order + 1)));
+    out.coords.push_back(Point{double(-1), double(-1), double(0)});
+    out.lattice.push_back(Lattice{0, 0, 0});
+    out.coords.push_back(Point{double(1), double(-1), double(0)});
+    out.lattice.push_back(Lattice{order, 0, 0});
+    out.coords.push_back(Point{double(1), double(1), double(0)});
+    out.lattice.push_back(Lattice{order, order, 0});
+    out.coords.push_back(Point{double(-1), double(1), double(0)});
+    out.lattice.push_back(Lattice{0, order, 0});
 
     for (int i = 1; i < order; ++i) {
-        nodes.push_back(Point{line_coord_pm_one(i, order), double(-1), double(0)});
+        out.coords.push_back(Point{line_coord_pm_one(i, order), double(-1), double(0)});
+        out.lattice.push_back(Lattice{i, 0, 0});
     }
     for (int j = 1; j < order; ++j) {
-        nodes.push_back(Point{double(1), line_coord_pm_one(j, order), double(0)});
+        out.coords.push_back(Point{double(1), line_coord_pm_one(j, order), double(0)});
+        out.lattice.push_back(Lattice{order, j, 0});
     }
     for (int i = order - 1; i >= 1; --i) {
-        nodes.push_back(Point{line_coord_pm_one(i, order), double(1), double(0)});
+        out.coords.push_back(Point{line_coord_pm_one(i, order), double(1), double(0)});
+        out.lattice.push_back(Lattice{i, order, 0});
     }
     for (int j = order - 1; j >= 1; --j) {
-        nodes.push_back(Point{double(-1), line_coord_pm_one(j, order), double(0)});
+        out.coords.push_back(Point{double(-1), line_coord_pm_one(j, order), double(0)});
+        out.lattice.push_back(Lattice{0, j, 0});
     }
     for (int j = 1; j < order; ++j) {
         for (int i = 1; i < order; ++i) {
-            nodes.push_back(Point{line_coord_pm_one(i, order),
-                                  line_coord_pm_one(j, order), double(0)});
+            out.coords.push_back(Point{line_coord_pm_one(i, order),
+                                       line_coord_pm_one(j, order), double(0)});
+            out.lattice.push_back(Lattice{i, j, 0});
         }
     }
-    return nodes;
+    return out;
 }
 
-std::vector<Point> generate_tetra_nodes(int order) {
+LagrangeNodeLayout generate_tetra_nodes(int order) {
+    LagrangeNodeLayout out;
     if (order == 0) {
-        return {Point{double(0.25), double(0.25), double(0.25)}};
+        out.coords.push_back(Point{double(0.25), double(0.25), double(0.25)});
+        out.lattice.push_back(Lattice{0, 0, 0});
+        return out;
     }
 
     const Point verts[] = {
@@ -138,45 +212,56 @@ std::vector<Point> generate_tetra_nodes(int order) {
         Point{double(0), double(1), double(0)},
         Point{double(0), double(0), double(1)},
     };
+    const Lattice vert_lattice[] = {
+        Lattice{0, 0, 0},
+        Lattice{order, 0, 0},
+        Lattice{0, order, 0},
+        Lattice{0, 0, order},
+    };
 
-    std::vector<Point> nodes;
-    nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 2) * (order + 3) / 6));
-    for (const auto& v : verts) {
-        nodes.push_back(v);
+    out.coords.reserve(static_cast<std::size_t>((order + 1) * (order + 2) * (order + 3) / 6));
+    out.lattice.reserve(static_cast<std::size_t>((order + 1) * (order + 2) * (order + 3) / 6));
+    for (std::size_t v = 0; v < 4u; ++v) {
+        out.coords.push_back(verts[v]);
+        out.lattice.push_back(vert_lattice[v]);
     }
 
     const int edges[6][2] = {{0, 1}, {1, 2}, {2, 0}, {0, 3}, {1, 3}, {2, 3}};
     for (const auto& edge : edges) {
         for (int m = 1; m < order; ++m) {
             const double t = static_cast<double>(m) / static_cast<double>(order);
-            nodes.push_back(verts[edge[0]] * (double(1) - t) + verts[edge[1]] * t);
+            out.coords.push_back(verts[edge[0]] * (double(1) - t) + verts[edge[1]] * t);
+            out.lattice.push_back(lerp_lattice(vert_lattice[edge[0]], vert_lattice[edge[1]], m, order));
         }
     }
 
     const int faces[4][3] = {{0, 1, 2}, {0, 1, 3}, {1, 2, 3}, {0, 2, 3}};
     for (const auto& face : faces) {
-        append_triangle_face_interior(nodes,
-                                      verts[face[0]],
-                                      verts[face[1]],
-                                      verts[face[2]],
+        append_triangle_face_interior(out,
+                                      verts[face[0]], verts[face[1]], verts[face[2]],
+                                      vert_lattice[face[0]], vert_lattice[face[1]], vert_lattice[face[2]],
                                       order);
     }
 
     for (int l = 1; l <= order - 3; ++l) {
         for (int k = 1; k <= order - l - 2; ++k) {
             for (int j = 1; j <= order - l - k - 1; ++j) {
-                nodes.push_back(Point{double(j) / double(order),
-                                      double(k) / double(order),
-                                      double(l) / double(order)});
+                out.coords.push_back(Point{double(j) / double(order),
+                                           double(k) / double(order),
+                                           double(l) / double(order)});
+                out.lattice.push_back(Lattice{j, k, l});
             }
         }
     }
-    return nodes;
+    return out;
 }
 
-std::vector<Point> generate_hex_nodes(int order) {
+LagrangeNodeLayout generate_hex_nodes(int order) {
+    LagrangeNodeLayout out;
     if (order == 0) {
-        return {Point{double(0), double(0), double(0)}};
+        out.coords.push_back(Point{double(0), double(0), double(0)});
+        out.lattice.push_back(Lattice{0, 0, 0});
+        return out;
     }
 
     const Point verts[] = {
@@ -189,11 +274,22 @@ std::vector<Point> generate_hex_nodes(int order) {
         Point{double(1), double(1), double(1)},
         Point{double(-1), double(1), double(1)},
     };
+    const Lattice vert_lattice[] = {
+        Lattice{0, 0, 0},
+        Lattice{order, 0, 0},
+        Lattice{order, order, 0},
+        Lattice{0, order, 0},
+        Lattice{0, 0, order},
+        Lattice{order, 0, order},
+        Lattice{order, order, order},
+        Lattice{0, order, order},
+    };
 
-    std::vector<Point> nodes;
-    nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 1) * (order + 1)));
-    for (const auto& v : verts) {
-        nodes.push_back(v);
+    out.coords.reserve(static_cast<std::size_t>((order + 1) * (order + 1) * (order + 1)));
+    out.lattice.reserve(static_cast<std::size_t>((order + 1) * (order + 1) * (order + 1)));
+    for (std::size_t v = 0; v < 8u; ++v) {
+        out.coords.push_back(verts[v]);
+        out.lattice.push_back(vert_lattice[v]);
     }
 
     const int edges[12][2] = {
@@ -204,7 +300,8 @@ std::vector<Point> generate_hex_nodes(int order) {
     for (const auto& edge : edges) {
         for (int m = 1; m < order; ++m) {
             const double t = static_cast<double>(m) / static_cast<double>(order);
-            nodes.push_back(verts[edge[0]] * (double(1) - t) + verts[edge[1]] * t);
+            out.coords.push_back(verts[edge[0]] * (double(1) - t) + verts[edge[1]] * t);
+            out.lattice.push_back(lerp_lattice(vert_lattice[edge[0]], vert_lattice[edge[1]], m, order));
         }
     }
 
@@ -217,54 +314,64 @@ std::vector<Point> generate_hex_nodes(int order) {
     // -X face (x = -1)
     for (int k = 1; k < order; ++k) {
         for (int j = order - 1; j >= 1; --j) {
-            nodes.push_back(Point{double(-1), line_coord_pm_one(j, order), line_coord_pm_one(k, order)});
+            out.coords.push_back(Point{double(-1), line_coord_pm_one(j, order), line_coord_pm_one(k, order)});
+            out.lattice.push_back(Lattice{0, j, k});
         }
     }
     // +X face (x = +1)
     for (int k = 1; k < order; ++k) {
         for (int j = 1; j < order; ++j) {
-            nodes.push_back(Point{double(1), line_coord_pm_one(j, order), line_coord_pm_one(k, order)});
+            out.coords.push_back(Point{double(1), line_coord_pm_one(j, order), line_coord_pm_one(k, order)});
+            out.lattice.push_back(Lattice{order, j, k});
         }
     }
     // -Y face (y = -1)
     for (int k = 1; k < order; ++k) {
         for (int i = 1; i < order; ++i) {
-            nodes.push_back(Point{line_coord_pm_one(i, order), double(-1), line_coord_pm_one(k, order)});
+            out.coords.push_back(Point{line_coord_pm_one(i, order), double(-1), line_coord_pm_one(k, order)});
+            out.lattice.push_back(Lattice{i, 0, k});
         }
     }
     // +Y face (y = +1)
     for (int k = 1; k < order; ++k) {
         for (int i = order - 1; i >= 1; --i) {
-            nodes.push_back(Point{line_coord_pm_one(i, order), double(1), line_coord_pm_one(k, order)});
+            out.coords.push_back(Point{line_coord_pm_one(i, order), double(1), line_coord_pm_one(k, order)});
+            out.lattice.push_back(Lattice{i, order, k});
         }
     }
     // -Z face (z = -1)
     for (int j = 1; j < order; ++j) {
         for (int i = 1; i < order; ++i) {
-            nodes.push_back(Point{line_coord_pm_one(i, order), line_coord_pm_one(j, order), double(-1)});
+            out.coords.push_back(Point{line_coord_pm_one(i, order), line_coord_pm_one(j, order), double(-1)});
+            out.lattice.push_back(Lattice{i, j, 0});
         }
     }
     // +Z face (z = +1)
     for (int j = 1; j < order; ++j) {
         for (int i = 1; i < order; ++i) {
-            nodes.push_back(Point{line_coord_pm_one(i, order), line_coord_pm_one(j, order), double(1)});
+            out.coords.push_back(Point{line_coord_pm_one(i, order), line_coord_pm_one(j, order), double(1)});
+            out.lattice.push_back(Lattice{i, j, order});
         }
     }
     for (int k = 1; k < order; ++k) {
         for (int j = 1; j < order; ++j) {
             for (int i = 1; i < order; ++i) {
-                nodes.push_back(Point{line_coord_pm_one(i, order),
-                                      line_coord_pm_one(j, order),
-                                      line_coord_pm_one(k, order)});
+                out.coords.push_back(Point{line_coord_pm_one(i, order),
+                                           line_coord_pm_one(j, order),
+                                           line_coord_pm_one(k, order)});
+                out.lattice.push_back(Lattice{i, j, k});
             }
         }
     }
-    return nodes;
+    return out;
 }
 
-std::vector<Point> generate_wedge_nodes(int order) {
+LagrangeNodeLayout generate_wedge_nodes(int order) {
+    LagrangeNodeLayout out;
     if (order == 0) {
-        return {Point{double(1) / double(3), double(1) / double(3), double(0)}};
+        out.coords.push_back(Point{double(1) / double(3), double(1) / double(3), double(0)});
+        out.lattice.push_back(Lattice{0, 0, 0});
+        return out;
     }
 
     const Point verts[] = {
@@ -275,11 +382,20 @@ std::vector<Point> generate_wedge_nodes(int order) {
         Point{double(1), double(0), double(1)},
         Point{double(0), double(1), double(1)},
     };
+    const Lattice vert_lattice[] = {
+        Lattice{0, 0, 0},
+        Lattice{order, 0, 0},
+        Lattice{0, order, 0},
+        Lattice{0, 0, order},
+        Lattice{order, 0, order},
+        Lattice{0, order, order},
+    };
 
-    std::vector<Point> nodes;
-    nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 1) * (order + 2) / 2));
-    for (const auto& v : verts) {
-        nodes.push_back(v);
+    out.coords.reserve(static_cast<std::size_t>((order + 1) * (order + 1) * (order + 2) / 2));
+    out.lattice.reserve(static_cast<std::size_t>((order + 1) * (order + 1) * (order + 2) / 2));
+    for (std::size_t v = 0; v < 6u; ++v) {
+        out.coords.push_back(verts[v]);
+        out.lattice.push_back(vert_lattice[v]);
     }
 
     const int edges[9][2] = {
@@ -290,26 +406,32 @@ std::vector<Point> generate_wedge_nodes(int order) {
     for (const auto& edge : edges) {
         for (int m = 1; m < order; ++m) {
             const double t = static_cast<double>(m) / static_cast<double>(order);
-            nodes.push_back(verts[edge[0]] * (double(1) - t) + verts[edge[1]] * t);
+            out.coords.push_back(verts[edge[0]] * (double(1) - t) + verts[edge[1]] * t);
+            out.lattice.push_back(lerp_lattice(vert_lattice[edge[0]], vert_lattice[edge[1]], m, order));
         }
     }
 
-    append_triangle_face_interior(nodes, verts[0], verts[1], verts[2], order);
-    append_triangle_face_interior(nodes, verts[3], verts[4], verts[5], order);
+    append_triangle_face_interior(out, verts[0], verts[1], verts[2],
+                                  vert_lattice[0], vert_lattice[1], vert_lattice[2], order);
+    append_triangle_face_interior(out, verts[3], verts[4], verts[5],
+                                  vert_lattice[3], vert_lattice[4], vert_lattice[5], order);
 
     for (int r = 1; r < order; ++r) {
         const double z = line_coord_pm_one(r, order);
         for (int m = 1; m < order; ++m) {
             const double t = static_cast<double>(m) / static_cast<double>(order);
-            nodes.push_back(Point{t, double(0), z});
+            out.coords.push_back(Point{t, double(0), z});
+            out.lattice.push_back(Lattice{m, 0, r});
         }
         for (int m = 1; m < order; ++m) {
             const double t = static_cast<double>(m) / static_cast<double>(order);
-            nodes.push_back(Point{double(1) - t, t, z});
+            out.coords.push_back(Point{double(1) - t, t, z});
+            out.lattice.push_back(Lattice{order - m, m, r});
         }
         for (int m = 1; m < order; ++m) {
             const double t = static_cast<double>(m) / static_cast<double>(order);
-            nodes.push_back(Point{double(0), double(1) - t, z});
+            out.coords.push_back(Point{double(0), double(1) - t, z});
+            out.lattice.push_back(Lattice{0, order - m, r});
         }
     }
 
@@ -317,22 +439,27 @@ std::vector<Point> generate_wedge_nodes(int order) {
         const double z = line_coord_pm_one(r, order);
         for (int c = 1; c <= order - 2; ++c) {
             for (int b = 1; b <= order - c - 1; ++b) {
-                nodes.push_back(Point{double(b) / double(order),
-                                      double(c) / double(order),
-                                      z});
+                out.coords.push_back(Point{double(b) / double(order),
+                                           double(c) / double(order),
+                                           z});
+                out.lattice.push_back(Lattice{b, c, r});
             }
         }
     }
-    return nodes;
+    return out;
 }
 
-std::vector<Point> complete_lagrange_nodes(ElementType canonical_type, int order) {
+LagrangeNodeLayout complete_lagrange_nodes(ElementType canonical_type, int order) {
     svmp::throw_if<BasisNodeOrderingException>(order < 0, SVMP_HERE,
                                              "ReferenceNodeLayout requires non-negative Lagrange order");
     const ElementType type = canonical_lagrange_type(canonical_type);
     switch (type) {
-        case ElementType::Point1:
-            return {Point{double(0), double(0), double(0)}};
+        case ElementType::Point1: {
+            LagrangeNodeLayout out;
+            out.coords.push_back(Point{double(0), double(0), double(0)});
+            out.lattice.push_back(Lattice{0, 0, 0});
+            return out;
+        }
         case ElementType::Line2:
             return generate_line_nodes(order);
         case ElementType::Triangle3:
@@ -357,22 +484,22 @@ std::vector<Point> complete_lagrange_nodes(ElementType canonical_type, int order
 std::vector<Point> element_nodes(ElementType elem_type) {
     const int order = complete_lagrange_alias_order(elem_type);
     if (order >= 0) {
-        return complete_lagrange_nodes(elem_type, order);
+        return complete_lagrange_nodes(elem_type, order).coords;
     }
 
     switch (elem_type) {
         case ElementType::Quad8: {
-            auto nodes = generate_quad_nodes(2);
+            auto nodes = generate_quad_nodes(2).coords;
             nodes.resize(8u);
             return nodes;
         }
         case ElementType::Hex20: {
-            auto nodes = generate_hex_nodes(2);
+            auto nodes = generate_hex_nodes(2).coords;
             nodes.resize(20u);
             return nodes;
         }
         case ElementType::Wedge15: {
-            auto nodes = generate_wedge_nodes(2);
+            auto nodes = generate_wedge_nodes(2).coords;
             nodes.resize(15u);
             return nodes;
         }
@@ -385,6 +512,33 @@ std::vector<Point> element_nodes(ElementType elem_type) {
     }
 }
 
+// Structural invariants the lattice must satisfy, checked before the accessor
+// hands it out. These replace the floating-point round-trip's near-equality
+// guards with exact integer checks.
+void validate_lattice(const LagrangeNodeLayout& layout, ElementType type, int order) {
+    svmp::throw_if<BasisConstructionException>(
+        layout.coords.size() != layout.lattice.size(), SVMP_HERE,
+        "ReferenceNodeLayout: lattice/coordinate count mismatch");
+
+    const BasisTopology top = topology(type);
+    for (const auto& idx : layout.lattice) {
+        for (std::size_t d = 0; d < 3u; ++d) {
+            svmp::throw_if<BasisConstructionException>(
+                idx[d] < 0 || idx[d] > order, SVMP_HERE,
+                "ReferenceNodeLayout: lattice index outside [0, order]");
+        }
+        if (top == BasisTopology::Triangle || top == BasisTopology::Tetrahedron) {
+            svmp::throw_if<BasisConstructionException>(
+                idx[0] + idx[1] + idx[2] > order, SVMP_HERE,
+                "ReferenceNodeLayout: simplex lattice index sum exceeds order");
+        } else if (top == BasisTopology::Wedge) {
+            svmp::throw_if<BasisConstructionException>(
+                idx[0] + idx[1] > order, SVMP_HERE,
+                "ReferenceNodeLayout: wedge triangle lattice index sum exceeds order");
+        }
+    }
+}
+
 } // namespace
 
 math::Vector<double, 3> ReferenceNodeLayout::get_node_coords(ElementType elem_type,
@@ -401,7 +555,14 @@ std::size_t ReferenceNodeLayout::num_nodes(ElementType elem_type) {
 
 std::vector<math::Vector<double, 3>>
 ReferenceNodeLayout::get_lagrange_node_coords(ElementType canonical_type, int order) {
-    return complete_lagrange_nodes(canonical_type, order);
+    return complete_lagrange_nodes(canonical_type, order).coords;
+}
+
+LagrangeNodeLayout
+ReferenceNodeLayout::get_lagrange_lattice(ElementType canonical_type, int order) {
+    LagrangeNodeLayout layout = complete_lagrange_nodes(canonical_type, order);
+    validate_lattice(layout, canonical_type, order);
+    return layout;
 }
 
 std::span<const std::size_t> ReferenceNodeLayout::mesh_to_basis_ordering(ElementType elem_type) {
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
index 951bd854c..89f0365eb 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
@@ -7,6 +7,7 @@
 #include "Math/Vector.h"
 #include "Types.h"
 
+#include <array>
 #include <cstddef>
 #include <span>
 #include <vector>
@@ -28,6 +29,26 @@ namespace basis {
     return double(-1) + double(2) * static_cast<double>(i) / static_cast<double>(order);
 }
 
+/**
+ * @brief Reference Lagrange node coordinates paired with their integer lattice
+ * index.
+ *
+ * @details `lattice[n]` is the exact integer index of `coords[n]` in the
+ * element's natural index space, with every component in `[0, order]`:
+ * - tensor topologies (line/quad/hex): axis indices `(i, j, k)`, unused axes `0`;
+ * - simplex topologies (triangle/tetra): off-origin barycentric indices
+ *   `(i, j, k)` (with `k = 0` for triangles) satisfying `i + j + k <= order`;
+ * - wedge: triangle lattice `(i, j)` in the first two components and the
+ *   through-axis index `r` in the third.
+ *
+ * Emitting the lattice alongside the coordinate lets callers consume the integer
+ * index directly instead of reconstructing it from the floating-point coordinate.
+ */
+struct LagrangeNodeLayout {
+    std::vector<math::Vector<double, 3>> coords;
+    std::vector<std::array<int, 3>>      lattice;
+};
+
 class ReferenceNodeLayout {
 public:
     static math::Vector<double, 3> get_node_coords(ElementType elem_type,
@@ -37,6 +58,22 @@ class ReferenceNodeLayout {
     static std::vector<math::Vector<double, 3>>
     get_lagrange_node_coords(ElementType canonical_type, int order);
 
+    /**
+     * @brief Reference Lagrange nodes with their integer lattice indices.
+     *
+     * @details Returns the same coordinates as get_lagrange_node_coords(), paired
+     * with the integer lattice index of each node (see LagrangeNodeLayout). The
+     * structural invariants in the contract (size match, components in
+     * `[0, order]`, simplex/wedge sum bounds) are validated before returning.
+     *
+     * @param canonical_type Canonical Lagrange element type (or Point1).
+     * @param order Polynomial order.
+     * @return Coordinates and matching lattice indices, one entry per node.
+     * @throws BasisConstructionException If a structural invariant is violated.
+     */
+    static LagrangeNodeLayout
+    get_lagrange_lattice(ElementType canonical_type, int order);
+
     static std::span<const std::size_t> mesh_to_basis_ordering(ElementType elem_type);
 };
 
diff --git a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
index 6a5ad186e..42b5e7349 100644
--- a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
@@ -324,6 +324,72 @@ TEST(LagrangeBasis, NodeOrderingMatchesPublicAliasLayouts) {
     }
 }
 
+// The lattice emitted with each node must be the exact forward image of the
+// coordinate: tensor axes invert through line_coord_pm_one, simplex axes through
+// the [0, 1] equispaced map, and the wedge combines the two. This pins the
+// integer-lattice contract that replaced the floating-point round-trip, so a
+// generator that emitted a coordinate and a mismatched index would fail here.
+TEST(LagrangeBasis, LatticeIsExactForwardImageOfCoordinates) {
+    constexpr double kTol = double(1e-14);
+
+    const std::vector<std::tuple<ElementType, int, int>> tensor_cases = {
+        {ElementType::Line2, 1, 1}, {ElementType::Line2, 4, 1},
+        {ElementType::Quad4, 1, 2}, {ElementType::Quad4, 4, 2},
+        {ElementType::Hex8, 1, 3},  {ElementType::Hex8, 3, 3},
+    };
+    for (const auto& [type, order, dim] : tensor_cases) {
+        const auto layout = ReferenceNodeLayout::get_lagrange_lattice(type, order);
+        ASSERT_EQ(layout.coords.size(), layout.lattice.size())
+            << "type=" << static_cast<int>(type);
+        for (std::size_t n = 0; n < layout.coords.size(); ++n) {
+            for (int d = 0; d < dim; ++d) {
+                const auto sd = static_cast<std::size_t>(d);
+                EXPECT_NEAR(layout.coords[n][sd],
+                            line_coord_pm_one(layout.lattice[n][sd], order), kTol)
+                    << "type=" << static_cast<int>(type) << " node=" << n << " axis=" << d;
+            }
+        }
+    }
+
+    const std::vector<std::tuple<ElementType, int, int>> simplex_cases = {
+        {ElementType::Triangle3, 1, 2}, {ElementType::Triangle3, 4, 2},
+        {ElementType::Tetra4, 1, 3},    {ElementType::Tetra4, 4, 3},
+    };
+    for (const auto& [type, order, dim] : simplex_cases) {
+        const auto layout = ReferenceNodeLayout::get_lagrange_lattice(type, order);
+        ASSERT_EQ(layout.coords.size(), layout.lattice.size())
+            << "type=" << static_cast<int>(type);
+        for (std::size_t n = 0; n < layout.coords.size(); ++n) {
+            for (int d = 0; d < dim; ++d) {
+                const auto sd = static_cast<std::size_t>(d);
+                EXPECT_NEAR(layout.coords[n][sd],
+                            static_cast<double>(layout.lattice[n][sd]) /
+                                static_cast<double>(order),
+                            kTol)
+                    << "type=" << static_cast<int>(type) << " node=" << n << " axis=" << d;
+            }
+        }
+    }
+
+    for (const int order : {1, 2, 3, 4}) {
+        const auto layout =
+            ReferenceNodeLayout::get_lagrange_lattice(ElementType::Wedge6, order);
+        ASSERT_EQ(layout.coords.size(), layout.lattice.size()) << "wedge order=" << order;
+        for (std::size_t n = 0; n < layout.coords.size(); ++n) {
+            // (x, y) are triangle [0, 1] indices; z inverts through line_coord_pm_one.
+            EXPECT_NEAR(layout.coords[n][0],
+                        static_cast<double>(layout.lattice[n][0]) / static_cast<double>(order),
+                        kTol) << "wedge order=" << order << " node=" << n;
+            EXPECT_NEAR(layout.coords[n][1],
+                        static_cast<double>(layout.lattice[n][1]) / static_cast<double>(order),
+                        kTol) << "wedge order=" << order << " node=" << n;
+            EXPECT_NEAR(layout.coords[n][2],
+                        line_coord_pm_one(layout.lattice[n][2], order), kTol)
+                << "wedge order=" << order << " node=" << n;
+        }
+    }
+}
+
 TEST(LagrangeBasis, RemovedOrSerendipityFamiliesAreRejected) {
     const std::array<ElementType, 6> unsupported = {
         ElementType::Quad8,

From f6528c0f535cf187aea85cd12705bf5244276837 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Tue, 23 Jun 2026 23:23:50 -0700
Subject: [PATCH 51/91] fixing comment because LagrangeBasis consumes the
 integer layout.lattice directly

---
 tests/unitTests/FE/Basis/test_LagrangeBasis.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
index 42b5e7349..744c46c5d 100644
--- a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
@@ -475,9 +475,9 @@ TEST(LagrangeBasis, QuadraticPolynomialReproductionAcrossQuadraticAliases) {
 // activates the volume-interior lattice, and hex order >= 3 activates the six
 // orientation-specific face traversals in NodeOrderingConventions. None of
 // those generation paths run at the orders covered elsewhere; the Kronecker
-// test is what validates the node lattice together with its llround-based
-// inverse index mapping (a duplicated or missing node makes the basis
-// non-nodal here).
+// test is what validates the node lattice together with the integer
+// lattice-index mapping the basis builds from it (a duplicated or missing
+// node makes the basis non-nodal here).
 TEST(LagrangeBasis, HigherOrderLatticesAreNodalAndPartitionUnity) {
     const struct Case {
         ElementType type;

From d2b3af4227b96d1d2fdaa6900cba1f22f3d47823 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Tue, 23 Jun 2026 23:56:59 -0700
Subject: [PATCH 52/91] adding serendipity_subset_nodes(top, complete_layout,
 keep_count, complete_count)

---
 .../FE/Basis/NodeOrderingConventions.cpp      | 87 +++++++++++++++----
 1 file changed, 72 insertions(+), 15 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
index bf488da5f..dd8b22ff2 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
@@ -6,6 +6,7 @@
 #include "BasisTraits.h"
 
 #include <array>
+#include <utility>
 
 namespace svmp {
 namespace FE {
@@ -481,6 +482,68 @@ LagrangeNodeLayout complete_lagrange_nodes(ElementType canonical_type, int order
     }
 }
 
+// Topological interior dimension of an integer lattice node: the number of
+// independent directions in which the point sits in the relative interior of
+// the reference cell. A vertex gives 0, an edge-interior node 1, a
+// face-interior node 2, and a volume-interior node 3.
+int serendipity_interior_dim(BasisTopology top, const Lattice& idx, int order) {
+    const auto tensor_interior = [order](int v) { return (v > 0 && v < order) ? 1 : 0; };
+    switch (top) {
+        case BasisTopology::Quadrilateral:
+            return tensor_interior(idx[0]) + tensor_interior(idx[1]);
+        case BasisTopology::Hexahedron:
+            return tensor_interior(idx[0]) + tensor_interior(idx[1]) +
+                   tensor_interior(idx[2]);
+        case BasisTopology::Wedge: {
+            // (idx[0], idx[1]) is the triangle cross-section with implied third
+            // barycentric index k; idx[2] is the tensor through-axis. A triangle
+            // vertex contributes 0, a triangle edge 1, and the triangle interior 2.
+            const int i = idx[0];
+            const int j = idx[1];
+            const int k = order - i - j;
+            const bool tri_vertex = (i == order) || (j == order) || (i + j == 0);
+            const bool tri_interior = (i > 0) && (j > 0) && (k > 0);
+            const int tri_dim = tri_vertex ? 0 : (tri_interior ? 2 : 1);
+            return tri_dim + tensor_interior(idx[2]);
+        }
+        default:
+            return 0;
+    }
+}
+
+// Build a serendipity reference layout (Quad8, Hex20, Wedge15) from the complete
+// quadratic layout of the same topology. Serendipity layouts keep only the
+// element's vertices and edge midpoints and drop the face- and volume-interior
+// nodes; the complete-quadratic generators emit the vertex/edge nodes first, so
+// the serendipity set is exactly the leading keep_count nodes.
+std::vector<Point> serendipity_subset_nodes(BasisTopology top,
+                                            LagrangeNodeLayout complete,
+                                            std::size_t keep_count,
+                                            std::size_t complete_count) {
+    constexpr int kQuadraticOrder = 2;
+    svmp::throw_if<BasisConstructionException>(
+        complete.coords.size() != complete_count ||
+            complete.lattice.size() != complete_count,
+        SVMP_HERE,
+        "ReferenceNodeLayout: unexpected complete-quadratic node count for serendipity layout");
+    svmp::throw_if<BasisConstructionException>(
+        keep_count >= complete_count, SVMP_HERE,
+        "ReferenceNodeLayout: serendipity node count must be smaller than the complete layout");
+
+    for (std::size_t n = 0; n < complete.lattice.size(); ++n) {
+        const bool on_skeleton =
+            serendipity_interior_dim(top, complete.lattice[n], kQuadraticOrder) <= 1;
+        const bool kept = n < keep_count;
+        svmp::throw_if<BasisConstructionException>(
+            kept != on_skeleton, SVMP_HERE,
+            "ReferenceNodeLayout: serendipity truncation does not separate skeleton nodes from interior nodes");
+    }
+
+    std::vector<Point> nodes = std::move(complete.coords);
+    nodes.resize(keep_count);
+    return nodes;
+}
+
 std::vector<Point> element_nodes(ElementType elem_type) {
     const int order = complete_lagrange_alias_order(elem_type);
     if (order >= 0) {
@@ -488,21 +551,15 @@ std::vector<Point> element_nodes(ElementType elem_type) {
     }
 
     switch (elem_type) {
-        case ElementType::Quad8: {
-            auto nodes = generate_quad_nodes(2).coords;
-            nodes.resize(8u);
-            return nodes;
-        }
-        case ElementType::Hex20: {
-            auto nodes = generate_hex_nodes(2).coords;
-            nodes.resize(20u);
-            return nodes;
-        }
-        case ElementType::Wedge15: {
-            auto nodes = generate_wedge_nodes(2).coords;
-            nodes.resize(15u);
-            return nodes;
-        }
+        case ElementType::Quad8:
+            return serendipity_subset_nodes(BasisTopology::Quadrilateral,
+                                            generate_quad_nodes(2), 8u, 9u);
+        case ElementType::Hex20:
+            return serendipity_subset_nodes(BasisTopology::Hexahedron,
+                                            generate_hex_nodes(2), 20u, 27u);
+        case ElementType::Wedge15:
+            return serendipity_subset_nodes(BasisTopology::Wedge,
+                                            generate_wedge_nodes(2), 15u, 18u);
         case ElementType::Pyramid13:
             svmp::raise<BasisNodeOrderingException>(SVMP_HERE,
                 "ReferenceNodeLayout: pyramid node ordering is disabled");

From b27f37d53b2b4cbbe5d7f57ec58e45449f00cc93 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 24 Jun 2026 00:06:58 -0700
Subject: [PATCH 53/91] normalizing named serendipity schemes to proper
 dimensionality and ordering expectations

---
 .../solver/FE/Basis/SerendipityBasis.cpp      | 100 +++++++++++-------
 .../FE/Basis/test_BasisErrorPaths.cpp         |   4 +
 2 files changed, 65 insertions(+), 39 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index f73557473..6009484df 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -375,58 +375,80 @@ void eval_monomial_basis(double r, double s, double t,
     }
 }
 
+struct NormalizedSerendipityRequest {
+    int dimension;
+    int order;
+};
+
+// Validate the element/order pairing and return the normalized reference
+// dimension and effective order. The fixed-layout serendipity elements (Quad8,
+// Hex8, Hex20, Wedge15) are each pinned to a single polynomial order by their
+// node count, so a mismatched explicit order is rejected uniformly rather than
+// silently reinterpreted. Quad4 is the arbitrary-order quadrilateral
+// serendipity entry point and only floors sub-linear requests to order 1.
+NormalizedSerendipityRequest normalize_serendipity_request(ElementType type, int order) {
+    const int floored_order = std::max(order, 1);
+    switch (type) {
+        case ElementType::Quad4:
+            return {2, floored_order};
+        case ElementType::Quad8:
+            svmp::throw_if<BasisConfigurationException>(floored_order != 2, SVMP_HERE,
+                "SerendipityBasis: Quad8 is only valid for quadratic order 2; use Quad4 for higher-order quadrilateral serendipity");
+            return {2, 2};
+        case ElementType::Hex8:
+            svmp::throw_if<BasisConfigurationException>(floored_order != 1, SVMP_HERE,
+                "SerendipityBasis: Hex8 is the trilinear 8-node basis (order 1 only); use Hex20 for quadratic serendipity");
+            return {3, 1};
+        case ElementType::Hex20:
+            svmp::throw_if<BasisConfigurationException>(floored_order != 2, SVMP_HERE,
+                "SerendipityBasis: Hex20 is the 20-node quadratic serendipity layout (order 2 only)");
+            return {3, 2};
+        case ElementType::Wedge15:
+            svmp::throw_if<BasisConfigurationException>(floored_order != 2, SVMP_HERE,
+                "SerendipityBasis: Wedge15 is the 15-node quadratic serendipity layout (order 2 only)");
+            return {3, 2};
+        default:
+            svmp::raise<BasisElementCompatibilityException>(SVMP_HERE,
+                "SerendipityBasis supports Quad4/Quad8, Hex8/Hex20, and Wedge15 elements");
+    }
+}
+
 } // namespace
 
 SerendipityBasis::SerendipityBasis(ElementType type, int order)
     : element_type_(type), dimension_(0), order_(order), size_(0) {
+    const NormalizedSerendipityRequest normalized = normalize_serendipity_request(type, order);
+    dimension_ = normalized.dimension;
+    order_ = normalized.order;
+
     if (type == ElementType::Quad4 || type == ElementType::Quad8) {
-        dimension_ = 2;
-        if (order_ < 1) {
-            order_ = 1;
-        }
-        svmp::throw_if<BasisConfigurationException>(
-            type == ElementType::Quad8 && order_ != 2, SVMP_HERE,
-            "SerendipityBasis: Quad8 is only valid for quadratic order 2; use Quad4 for higher-order quadrilateral serendipity");
+        // Quadrilateral serendipity is generated from its monomial space, so the
+        // basis size, reference nodes, and nodal coefficient table are all built
+        // here from the effective order.
         quad_monomial_exponents_ = quad_serendipity_exponents(order_);
         size_ = quad_monomial_exponents_.size();
         nodes_ = quad_serendipity_nodes(order_, size_);
         svmp::throw_if<BasisConstructionException>(
             nodes_.size() != size_, SVMP_HERE,
             "SerendipityBasis: quadrilateral serendipity setup produced inconsistent sizes");
-        quad_inv_vandermonde_ = quad_serendipity_inverse_vandermonde(nodes_, quad_monomial_exponents_, order_);
-    } else if (type == ElementType::Hex8 || type == ElementType::Hex20) {
-        dimension_ = 3;
-        if (order_ < 1) {
-            order_ = 1;
-        }
-        svmp::throw_if<BasisConfigurationException>(
-            type == ElementType::Hex8 && order_ != 1, SVMP_HERE,
-            "SerendipityBasis: Hex8 is the trilinear 8-node basis (order 1 only); use Hex20 for quadratic serendipity");
-        svmp::throw_if<BasisConfigurationException>(
-            type == ElementType::Hex20 && order_ != 2, SVMP_HERE,
-            "SerendipityBasis: Hex20 is the 20-node quadratic serendipity layout (order 2 only)");
-        size_ = (type == ElementType::Hex8) ? 8u : 20u;
-    } else if (type == ElementType::Wedge15) {
-        dimension_ = 3;
-        if (order_ < 2) {
-            order_ = 2;
-        }
-        if (order_ == 2) {
-            size_ = 15;
-        } else {
-            svmp::raise<BasisConfigurationException>(SVMP_HERE,
-                "SerendipityBasis supports up to quadratic on wedge15");
-        }
-    } else {
-        svmp::raise<BasisElementCompatibilityException>(SVMP_HERE,
-            "SerendipityBasis supports Quad4/Quad8, Hex8/Hex20, and Wedge15 elements");
+        quad_inv_vandermonde_ =
+            quad_serendipity_inverse_vandermonde(nodes_, quad_monomial_exponents_, order_);
+        return;
     }
 
-    if (nodes_.empty()) {
-        nodes_.reserve(size_);
-        for (std::size_t i = 0; i < size_; ++i) {
-            nodes_.push_back(ReferenceNodeLayout::get_node_coords(element_type_, i));
-        }
+    // Hex8/Hex20/Wedge15 use fixed node layouts with tabulated coefficients: the
+    // size is pinned by the layout and the reference nodes come straight from
+    // ReferenceNodeLayout.
+    if (type == ElementType::Hex8) {
+        size_ = 8u;
+    } else if (type == ElementType::Hex20) {
+        size_ = 20u;
+    } else {
+        size_ = 15u;  // Wedge15
+    }
+    nodes_.reserve(size_);
+    for (std::size_t i = 0; i < size_; ++i) {
+        nodes_.push_back(ReferenceNodeLayout::get_node_coords(element_type_, i));
     }
 }
 
diff --git a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
index 2699da052..f4f1446da 100644
--- a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
@@ -192,6 +192,10 @@ TEST(BasisErrorPaths, SerendipityInvalidRequestsThrowBasisExceptions) {
                  BasisConfigurationException);
     EXPECT_THROW(SerendipityBasis(ElementType::Hex20, 3),
                  BasisConfigurationException);
+    EXPECT_THROW(SerendipityBasis(ElementType::Wedge15, 1),
+                 BasisConfigurationException);
+    EXPECT_THROW(SerendipityBasis(ElementType::Wedge15, 3),
+                 BasisConfigurationException);
 }
 
 TEST(BasisErrorPaths, BasisFactoryRejectsNonC0Continuity) {

From 6d7ebd7ed190a45f67ebad4212eade6c16aec6c7 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 24 Jun 2026 00:31:03 -0700
Subject: [PATCH 54/91] addressing minor doc and scaling concerns

---
 Code/Source/solver/FE/Basis/BasisFunction.cpp |   4 +
 Code/Source/solver/FE/Basis/BasisFunction.h   |  13 ++-
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp | 110 ++++++++----------
 Code/Source/solver/FE/Basis/LagrangeBasis.h   |   1 +
 .../FE/Basis/NodeOrderingConventions.cpp      |   5 +
 .../solver/FE/Basis/NodeOrderingConventions.h |  13 +++
 .../solver/FE/Basis/SerendipityBasis.cpp      |   8 +-
 .../FE/Basis/test_SerendipityBasis.cpp        |   6 +
 8 files changed, 96 insertions(+), 64 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisFunction.cpp b/Code/Source/solver/FE/Basis/BasisFunction.cpp
index 583692ca4..4849fabda 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFunction.cpp
@@ -49,6 +49,10 @@ void BasisFunction::evaluate_all(const math::Vector<double, 3>& xi,
     evaluate_hessians(xi, hessians);
 }
 
+// The base-class *_to overloads are a correct fallback for bases that implement
+// only the vector evaluators: they evaluate into a temporary and copy into the
+// caller's span. The concrete nodal families (LagrangeBasis, SerendipityBasis)
+// override these to compute directly into the span without the temporary.
 void BasisFunction::evaluate_values_to(const math::Vector<double, 3>& xi,
                                        std::span<double> values_out) const {
     require_span_size(values_out.size(), size(), "BasisFunction::evaluate_values_to");
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index 6144b4274..56683a1b0 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -60,7 +60,12 @@
  * always 3-by-3 matrices; inactive reference directions are expected to be
  * zero for conforming lower-dimensional bases. The std::vector overloads are
  * convenient for setup, tests, and adapter code. The *_to overloads write to
- * caller-owned spans and are the allocation-free path for assembly.
+ * caller-owned spans; the concrete nodal families (LagrangeBasis,
+ * SerendipityBasis) compute directly into the span and so provide the
+ * allocation-free path for assembly. The base-class defaults instead evaluate
+ * into a temporary and copy into the span, so a basis that implements only the
+ * vector form still works through the span API, just without the allocation
+ * savings.
  *
  * Outputs are in ReferenceNodeLayout basis order, not necessarily the mesh or
  * solver's native node order. A caller that stores elements in another local
@@ -258,6 +263,8 @@ class BasisFunction {
      * @brief Evaluate basis values into caller-provided storage.
      * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
      * @param values_out Output span with at least size() entries.
+     * @note The base-class default evaluates into a temporary and copies; nodal
+     *       families override this to write directly into the span.
      */
     virtual void evaluate_values_to(const math::Vector<double, 3>& xi,
                                     std::span<double> values_out) const;
@@ -266,6 +273,8 @@ class BasisFunction {
      * @brief Evaluate basis gradients into caller-provided storage.
      * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
      * @param gradients_out Output span with at least size() entries.
+     * @note The base-class default evaluates into a temporary and copies; nodal
+     *       families override this to write directly into the span.
      */
     virtual void evaluate_gradients_to(const math::Vector<double, 3>& xi,
                                        std::span<Gradient> gradients_out) const;
@@ -274,6 +283,8 @@ class BasisFunction {
      * @brief Evaluate basis Hessians into caller-provided storage.
      * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
      * @param hessians_out Output span with at least size() entries.
+     * @note The base-class default evaluates into a temporary and copies; nodal
+     *       families override this to write directly into the span.
      */
     virtual void evaluate_hessians_to(const math::Vector<double, 3>& xi,
                                       std::span<Hessian> hessians_out) const;
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index cdb1eeed4..8f90a94c6 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -6,7 +6,6 @@
 
 #include <algorithm>
 #include <array>
-#include <limits>
 #include <span>
 #include <string>
 
@@ -97,72 +96,47 @@ LagrangeBasis::SimplexExponent simplex_exponent_from_lattice(const std::array<in
     return e;
 }
 
-// Sentinel node index meaning "skip nothing" in product_excluding below.
-constexpr std::size_t kNoSkip = std::numeric_limits<std::size_t>::max();
-
-// Evaluate 1D Lagrange polynomials and derivatives at a point. `level` selects
-// how many derivative orders to compute: 0 for values only, 1 to also fill the
-// first derivative, and 2 to also fill the second. The output arrays stay sized
-// at n regardless of level so the tensor-product assembly can index them
-// unconditionally; only the higher-order computation loops are skipped.
-void evaluate_1d_lagrange(double x, const std::vector<double>& nodes, AxisEval& out,
+// Evaluate 1D Lagrange polynomials and their derivatives at a point in the
+// barycentric form l_i(x) = w_i * prod_{j!=i}(x - x_j), where the weights
+// w_i = 1 / prod_{j!=i}(x_i - x_j) depend only on the fixed node set and are
+// precomputed once by the caller. For each i the numerator and its first and
+// second derivatives are built by a single product-rule accumulation over the
+// remaining nodes.
+void evaluate_1d_lagrange(double x,
+                          const std::vector<double>& nodes,
+                          const std::vector<double>& weights,
+                          AxisEval& out,
                           int level) {
     const std::size_t n = nodes.size();
     out.value.assign(n, double(0));
     out.first.assign(n, double(0));
     out.second.assign(n, double(0));
 
-    if (n == 1u) {
-        out.value[0] = double(1);
-        return;
-    }
-
     for (std::size_t i = 0; i < n; ++i) {
-        // Product of (x - nodes[j]) over all j except i and the listed skips.
-        // Each derivative order drops one additional factor from the product.
-        const auto product_excluding = [&](std::size_t skip1 = kNoSkip,
-                                           std::size_t skip2 = kNoSkip) {
-            double product = double(1);
-            for (std::size_t j = 0; j < n; ++j) {
-                if (j != i && j != skip1 && j != skip2) {
-                    product *= x - nodes[j];
-                }
-            }
-            return product;
-        };
-
-        double denom = double(1);
+        double value = double(1);
+        double first = double(0);
+        double second = double(0);
         for (std::size_t j = 0; j < n; ++j) {
-            if (j != i) {
-                denom *= nodes[i] - nodes[j];
+            if (j == i) {
+                continue;
+            }
+            const double f = x - nodes[j];
+            if (level >= 2) {
+                second = second * f + double(2) * first;
             }
+            if (level >= 1) {
+                first = first * f + value;
+            }
+            value = value * f;
         }
 
-        out.value[i] = product_excluding() / denom;
-
+        const double w = weights[i];
+        out.value[i] = value * w;
         if (level >= 1) {
-            double first = double(0);
-            for (std::size_t m = 0; m < n; ++m) {
-                if (m != i) {
-                    first += product_excluding(m);
-                }
-            }
-            out.first[i] = first / denom;
+            out.first[i] = first * w;
         }
-
         if (level >= 2) {
-            double second = double(0);
-            for (std::size_t m = 0; m < n; ++m) {
-                if (m == i) {
-                    continue;
-                }
-                for (std::size_t l = 0; l < n; ++l) {
-                    if (l != i && l != m) {
-                        second += product_excluding(m, l);
-                    }
-                }
-            }
-            out.second[i] = second / denom;
+            out.second[i] = second * w;
         }
     }
 }
@@ -302,19 +276,37 @@ LagrangeBasis::LagrangeBasis(ElementType type, int order)
     init_nodes();
 }
 
-// Initialize equispaced 1D interpolation nodes for tensor-product axes.
+// Initialize equispaced 1D interpolation nodes and their barycentric weights for
+// tensor-product axes.
 void LagrangeBasis::init_equispaced_1d_nodes() {
-    nodes_1d_.resize(static_cast<std::size_t>(order_ + 1));
+    const std::size_t n = static_cast<std::size_t>(order_ + 1);
+    nodes_1d_.resize(n);
     for (int i = 0; i <= order_; ++i) {
         nodes_1d_[static_cast<std::size_t>(i)] =
             line_coord_pm_one(i, order_);
     }
+
+    // Barycentric weights w_i = 1 / prod_{j!=i}(x_i - x_j); the nodes are
+    // distinct so every denominator is nonzero. Precomputing here keeps the
+    // per-evaluation 1D Lagrange work at O(n^2) without recomputing the weights
+    // on every call.
+    nodes_1d_weights_.assign(n, double(1));
+    for (std::size_t i = 0; i < n; ++i) {
+        double denom = double(1);
+        for (std::size_t j = 0; j < n; ++j) {
+            if (j != i) {
+                denom *= nodes_1d_[i] - nodes_1d_[j];
+            }
+        }
+        nodes_1d_weights_[i] = double(1) / denom;
+    }
 }
 
 // Initialize reference nodes and topology-specific lookup data.
 void LagrangeBasis::init_nodes() {
     nodes_.clear();
     nodes_1d_.clear();
+    nodes_1d_weights_.clear();
     tensor_indices_.clear();
     simplex_exponents_.clear();
     wedge_indices_.clear();
@@ -435,12 +427,12 @@ void LagrangeBasis::evaluate_tensor_product_to(const Vec3& xi,
     AxisEval ax;
     AxisEval ay;
     AxisEval az;
-    evaluate_1d_lagrange(xi[0], nodes_1d_, ax, level);
+    evaluate_1d_lagrange(xi[0], nodes_1d_, nodes_1d_weights_, ax, level);
     if (dimension_ >= 2) {
-        evaluate_1d_lagrange(xi[1], nodes_1d_, ay, level);
+        evaluate_1d_lagrange(xi[1], nodes_1d_, nodes_1d_weights_, ay, level);
     }
     if (dimension_ >= 3) {
-        evaluate_1d_lagrange(xi[2], nodes_1d_, az, level);
+        evaluate_1d_lagrange(xi[2], nodes_1d_, nodes_1d_weights_, az, level);
     }
 
     for (std::size_t node = 0; node < tensor_indices_.size(); ++node) {
@@ -524,7 +516,7 @@ void LagrangeBasis::evaluate_wedge_to(const Vec3& xi,
     AxisEval z_axis;
     evaluate_simplex(xi, BasisTopology::Triangle, order_, simplex_exponents_, tri,
                      want_tri_gradient, want_hessians);
-    evaluate_1d_lagrange(xi[2], nodes_1d_, z_axis, z_level);
+    evaluate_1d_lagrange(xi[2], nodes_1d_, nodes_1d_weights_, z_axis, z_level);
 
     for (std::size_t node = 0; node < wedge_indices_.size(); ++node) {
         const auto [tri_idx, z_idx] = wedge_indices_[node];
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.h b/Code/Source/solver/FE/Basis/LagrangeBasis.h
index fc21599f3..c3276341a 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.h
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.h
@@ -232,6 +232,7 @@ class LagrangeBasis final : public BasisFunction {
     int order_{0};
 
     std::vector<double> nodes_1d_;
+    std::vector<double> nodes_1d_weights_;
     std::vector<math::Vector<double, 3>> nodes_;
     std::vector<TensorNodeIndex> tensor_indices_;
     std::vector<SimplexExponent> simplex_exponents_;
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
index dd8b22ff2..0fe344551 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
@@ -610,6 +610,11 @@ std::size_t ReferenceNodeLayout::num_nodes(ElementType elem_type) {
     return element_nodes(elem_type).size();
 }
 
+std::vector<math::Vector<double, 3>>
+ReferenceNodeLayout::node_coords(ElementType elem_type) {
+    return element_nodes(elem_type);
+}
+
 std::vector<math::Vector<double, 3>>
 ReferenceNodeLayout::get_lagrange_node_coords(ElementType canonical_type, int order) {
     return complete_lagrange_nodes(canonical_type, order).coords;
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
index 89f0365eb..c6bf6da27 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
@@ -55,6 +55,19 @@ class ReferenceNodeLayout {
                                                  std::size_t local_node);
     static std::size_t num_nodes(ElementType elem_type);
 
+    /**
+     * @brief All reference node coordinates for an element type, in public layout order.
+     *
+     * @details Returns the complete public reference layout for @p elem_type
+     * (the same coordinates get_node_coords() returns one at a time), including
+     * the serendipity layouts. Prefer this single call when the whole layout is
+     * needed: get_node_coords() regenerates the full list on every call.
+     *
+     * @param elem_type Element type to look up.
+     * @return Reference node coordinates, one per node.
+     */
+    static std::vector<math::Vector<double, 3>> node_coords(ElementType elem_type);
+
     static std::vector<math::Vector<double, 3>>
     get_lagrange_node_coords(ElementType canonical_type, int order);
 
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index 6009484df..938ed6713 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -446,10 +446,10 @@ SerendipityBasis::SerendipityBasis(ElementType type, int order)
     } else {
         size_ = 15u;  // Wedge15
     }
-    nodes_.reserve(size_);
-    for (std::size_t i = 0; i < size_; ++i) {
-        nodes_.push_back(ReferenceNodeLayout::get_node_coords(element_type_, i));
-    }
+    nodes_ = ReferenceNodeLayout::node_coords(element_type_);
+    svmp::throw_if<BasisConstructionException>(
+        nodes_.size() != size_, SVMP_HERE,
+        "SerendipityBasis: fixed serendipity layout node count does not match basis size");
 }
 
 void SerendipityBasis::evaluate_all_to(const math::Vector<double, 3>& xi,
diff --git a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
index 55b580219..32406400c 100644
--- a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
@@ -89,6 +89,12 @@ double interpolate_nodal_function(const SerendipityBasis& basis,
     return result;
 }
 
+// The _for_test helpers below intentionally re-derive the production monomial
+// selection (superlinear-degree rule, exponent enumeration, and size formula)
+// independently of SerendipityBasis, so the basis is checked against an external
+// oracle rather than against its own code. If the production formula in
+// SerendipityBasis.cpp is changed deliberately, update these copies to match; an
+// accidental drift between the two is meant to surface here as a test failure.
 int quad_serendipity_superlinear_degree_for_test(int ax, int ay) {
     return (ax > 1 ? ax : 0) + (ay > 1 ? ay : 0);
 }

From 8b786837fc8f1644ac614e06c4360ee5f9992702 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 24 Jun 2026 09:37:21 -0700
Subject: [PATCH 55/91] removing vector and matrix aliases

---
 Code/Source/solver/FE/Math/Matrix.h | 19 -------------------
 Code/Source/solver/FE/Math/Vector.h | 20 --------------------
 2 files changed, 39 deletions(-)

diff --git a/Code/Source/solver/FE/Math/Matrix.h b/Code/Source/solver/FE/Math/Matrix.h
index 90aa7681a..e9aa6510d 100644
--- a/Code/Source/solver/FE/Math/Matrix.h
+++ b/Code/Source/solver/FE/Math/Matrix.h
@@ -42,25 +42,6 @@ namespace math {
 template<typename T, std::size_t M, std::size_t N>
 using Matrix = Eigen::Matrix<T, static_cast<int>(M), static_cast<int>(N)>;
 
-// Type aliases for common matrix types
-template<typename T> using Matrix2x2 = Matrix<T, 2, 2>;
-template<typename T> using Matrix3x3 = Matrix<T, 3, 3>;
-template<typename T> using Matrix4x4 = Matrix<T, 4, 4>;
-template<typename T> using Matrix2x3 = Matrix<T, 2, 3>;
-template<typename T> using Matrix3x2 = Matrix<T, 3, 2>;
-template<typename T> using Matrix3x4 = Matrix<T, 3, 4>;
-template<typename T> using Matrix4x3 = Matrix<T, 4, 3>;
-
-// Double precision aliases
-using Matrix2x2d = Matrix2x2<double>;
-using Matrix3x3d = Matrix3x3<double>;
-using Matrix4x4d = Matrix4x4<double>;
-
-// Single precision aliases
-using Matrix2x2f = Matrix2x2<float>;
-using Matrix3x3f = Matrix3x3<float>;
-using Matrix4x4f = Matrix4x4<float>;
-
 } // namespace math
 } // namespace FE
 } // namespace svmp
diff --git a/Code/Source/solver/FE/Math/Vector.h b/Code/Source/solver/FE/Math/Vector.h
index 41466e9db..efa573dc8 100644
--- a/Code/Source/solver/FE/Math/Vector.h
+++ b/Code/Source/solver/FE/Math/Vector.h
@@ -46,26 +46,6 @@ namespace math {
 template<typename T, std::size_t N>
 using Vector = Eigen::Matrix<T, static_cast<int>(N), 1>;
 
-// Type aliases for common vector types
-template<typename T> using Vector2 = Vector<T, 2>;
-template<typename T> using Vector3 = Vector<T, 3>;
-template<typename T> using Vector4 = Vector<T, 4>;
-
-// Double precision aliases
-using Vector2d = Vector2<double>;
-using Vector3d = Vector3<double>;
-using Vector4d = Vector4<double>;
-
-// Single precision aliases
-using Vector2f = Vector2<float>;
-using Vector3f = Vector3<float>;
-using Vector4f = Vector4<float>;
-
-// Integer aliases
-using Vector2i = Vector2<int>;
-using Vector3i = Vector3<int>;
-using Vector4i = Vector4<int>;
-
 } // namespace math
 } // namespace FE
 } // namespace svmp

From 62cc262d8e20a51200f021c309115fb616431741 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 24 Jun 2026 10:39:42 -0700
Subject: [PATCH 56/91] tables for serendipity basis are generated by inverting
 the Vandermonde

---
 .../FE/Basis/NodeOrderingConventions.cpp      |  19 --
 .../solver/FE/Basis/NodeOrderingConventions.h |   2 -
 .../solver/FE/Basis/SerendipityBasis.cpp      | 241 ++++++---------
 .../Source/solver/FE/Basis/SerendipityBasis.h |  32 +-
 .../FE/Basis/test_SerendipityBasis.cpp        | 275 ++++++++++++++++++
 5 files changed, 374 insertions(+), 195 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
index 0fe344551..b0c46cc93 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
@@ -17,17 +17,6 @@ namespace {
 using Point = math::Vector<double, 3>;
 using Lattice = std::array<int, 3>;
 
-// Maps public Hex20 ReferenceNodeLayout slots to the internal coefficient-table
-// basis columns used by kHex20Coefficients. Wedge15 and quadrilateral
-// serendipity tables are stored directly in public node order and need no
-// equivalent permutation.
-constexpr std::array<std::size_t, 20> kHex20MeshToBasisOrder = {
-    0, 1, 2, 3, 4, 5, 6, 7,
-    8, 13, 10, 12,
-    9, 15, 11, 14,
-    16, 17, 19, 18
-};
-
 double line_coord_zero_one(int i, int order) {
     if (order <= 0) {
         return double(0);
@@ -627,14 +616,6 @@ ReferenceNodeLayout::get_lagrange_lattice(ElementType canonical_type, int order)
     return layout;
 }
 
-std::span<const std::size_t> ReferenceNodeLayout::mesh_to_basis_ordering(ElementType elem_type) {
-    if (elem_type == ElementType::Hex20) {
-        return std::span<const std::size_t>(kHex20MeshToBasisOrder.data(),
-                                            kHex20MeshToBasisOrder.size());
-    }
-    return {};
-}
-
 } // namespace basis
 } // namespace FE
 } // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
index c6bf6da27..f8251c866 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
@@ -86,8 +86,6 @@ class ReferenceNodeLayout {
      */
     static LagrangeNodeLayout
     get_lagrange_lattice(ElementType canonical_type, int order);
-
-    static std::span<const std::size_t> mesh_to_basis_ordering(ElementType elem_type);
 };
 
 } // namespace basis
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index 938ed6713..33d46b0dd 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -163,43 +163,36 @@ std::vector<Vec3> quad_serendipity_nodes(int order, std::size_t total_size) {
     return nodes;
 }
 
-std::vector<double> invert_dense_matrix(std::vector<double> matrix, int n, const char* label) {
-    return math::invert_dense_matrix(
-        std::move(matrix),
-        static_cast<std::size_t>(n),
-        std::string("SerendipityBasis interpolation matrix for ") + label);
-}
-
-std::vector<double> quad_serendipity_inverse_vandermonde(
+// Build the nodal coefficient table for a monomial-generated serendipity family:
+// assemble V[node][monomial] = r^a s^b t^c at the public-order reference nodes and
+// invert it. Because the nodes are in public order, the inverse is already in
+// public basis order and needs no output permutation. The same routine serves the
+// quadrilateral, Hex20, and Wedge15 spaces.
+std::vector<double> build_inverse_vandermonde(
     std::span<const Vec3> nodes,
-    std::span<const std::array<int, 2>> exponents,
-    int order) {
-    const int n = static_cast<int>(nodes.size());
+    std::span<const std::array<int, 3>> exponents,
+    const std::string& label) {
+    const std::size_t n = nodes.size();
     svmp::throw_if<BasisConstructionException>(
-        n == 0 || exponents.size() != nodes.size(), SVMP_HERE,
-        "SerendipityBasis: invalid quadrilateral serendipity interpolation setup");
-
-    std::vector<double> vandermonde(static_cast<std::size_t>(n * n), double(0));
-    auto idx = [n](int row, int col) -> std::size_t {
-        return static_cast<std::size_t>(row * n + col);
-    };
-
-    for (int row = 0; row < n; ++row) {
-        const double x = nodes[static_cast<std::size_t>(row)][0];
-        const double y = nodes[static_cast<std::size_t>(row)][1];
-        for (int col = 0; col < n; ++col) {
-            const auto [ax, ay] = exponents[static_cast<std::size_t>(col)];
-            vandermonde[idx(row, col)] = integer_power(x, ax) * integer_power(y, ay);
+        n == 0 || exponents.size() != n, SVMP_HERE,
+        "SerendipityBasis: invalid serendipity interpolation setup");
+
+    std::vector<double> vandermonde(n * n, double(0));
+    for (std::size_t row = 0; row < n; ++row) {
+        const Vec3& p = nodes[row];
+        for (std::size_t col = 0; col < n; ++col) {
+            const auto& e = exponents[col];
+            vandermonde[row * n + col] =
+                integer_power(p[0], e[0]) * integer_power(p[1], e[1]) *
+                integer_power(p[2], e[2]);
         }
     }
 
-    // Quadrilateral serendipity bases are generated from the requested
-    // monomial space, so a small dense inverse produces the nodal coefficient
-    // table at construction time. Hex20 and Wedge15 use fixed tables because
-    // only their quadratic layouts are supported here.
-    const std::string label = "Quad order " + std::to_string(order);
-    return invert_dense_matrix(std::move(vandermonde), n, label.c_str());
+    return math::invert_dense_matrix(
+        std::move(vandermonde), n,
+        "SerendipityBasis interpolation matrix for " + label);
 }
+
 constexpr std::array<std::array<int, 3>, 15> kWedge15MonomialExponents = {{
     {{0, 0, 0}},
     {{0, 0, 1}},
@@ -218,29 +211,6 @@ constexpr std::array<std::array<int, 3>, 15> kWedge15MonomialExponents = {{
     {{2, 0, 1}}
 }};
 
-// Coefficients for the quadratic Wedge15 nodal serendipity basis. Rows are
-// monomials in kWedge15MonomialExponents order; columns are basis functions in
-// public Wedge15 node order. The table is the inverse of
-// V[node][monomial] = r^a s^b t^c evaluated at ReferenceNodeLayout Wedge15
-// nodes, so V * kWedge15Coefficients is the identity.
-constexpr std::array<std::array<double, 15>, 15> kWedge15Coefficients = {{
-    {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0}},
-    {{-0.5, 0, 0, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}},
-    {{0.5, -0, -0, 0.5, -0, -0, -0, -0, -0, -0, -0, -0, -1, -0, -0}},
-    {{-1, 0, -1, -1, 0, -1, 0, 0, 2, 0, 0, 2, -1, 0, 1}},
-    {{1.5, 0, 0.5, -1.5, 0, -0.5, 0, 0, -2, 0, 0, 2, 0, 0, 0}},
-    {{-0.5, -0, 0.5, -0.5, -0, 0.5, -0, -0, -0, -0, -0, -0, 1, -0, -1}},
-    {{1, 0, 1, 1, 0, 1, 0, 0, -2, 0, 0, -2, 0, 0, 0}},
-    {{-1, 0, -1, 1, 0, 1, 0, 0, 2, 0, 0, -2, 0, 0, 0}},
-    {{-1, -1, 0, -1, -1, 0, 2, 0, 0, 2, 0, 0, -1, 1, 0}},
-    {{1.5, 0.5, 0, -1.5, -0.5, 0, -2, 0, 0, 2, 0, 0, 0, 0, 0}},
-    {{-0.5, 0.5, -0, -0.5, 0.5, -0, -0, -0, -0, -0, -0, -0, 1, -1, -0}},
-    {{2, 0, -0, 2, 0, -0, -2, 2, -2, -2, 2, -2, -0, -0, -0}},
-    {{-2, 0, 0, 2, 0, 0, 2, -2, 2, -2, 2, -2, 0, 0, 0}},
-    {{1, 1, -0, 1, 1, -0, -2, -0, -0, -2, -0, -0, -0, -0, -0}},
-    {{-1, -1, -0, 1, 1, -0, 2, -0, -0, -2, -0, -0, -0, -0, -0}}
-}};
-
 constexpr std::array<std::array<int, 3>, 20> kHex20MonomialExponents = {{
     {{0, 0, 0}}, {{0, 0, 1}}, {{0, 0, 2}}, {{0, 1, 0}}, {{0, 1, 1}},
     {{0, 1, 2}}, {{0, 2, 0}}, {{0, 2, 1}}, {{1, 0, 0}}, {{1, 0, 1}},
@@ -248,35 +218,6 @@ constexpr std::array<std::array<int, 3>, 20> kHex20MonomialExponents = {{
     {{1, 2, 1}}, {{2, 0, 0}}, {{2, 0, 1}}, {{2, 1, 0}}, {{2, 1, 1}}
 }};
 
-// Coefficients for the quadratic Hex20 nodal serendipity basis. Rows are
-// monomials in kHex20MonomialExponents order; columns are basis functions in
-// the internal Hex20 coefficient-table order. The table is the inverse of
-// V[node][monomial] = r^a s^b t^c evaluated at the corresponding Hex20
-// reference nodes, so V * kHex20Coefficients is the identity. Evaluation
-// remaps public output slots through ReferenceNodeLayout::mesh_to_basis_ordering.
-constexpr std::array<std::array<double, 20>, 20> kHex20Coefficients = {{
-    {{-0.25, -0.25, -0.25, -0.25, -0.25, -0.25, -0.25, -0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25}},
-    {{0.125, 0.125, 0.125, 0.125, -0.125, -0.125, -0.125, -0.125, -0.25, 0.25, -0.25, 0.25, -0.25, -0.25, 0.25, 0.25, 0, 0, 0, 0}},
-    {{0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0, 0, 0, 0, 0, 0, 0, 0, -0.25, -0.25, -0.25, -0.25}},
-    {{0.125, 0.125, -0.125, -0.125, 0.125, 0.125, -0.125, -0.125, -0.25, -0.25, 0.25, 0.25, 0, 0, 0, 0, -0.25, -0.25, 0.25, 0.25}},
-    {{0, 0, 0, 0, 0, 0, 0, 0, 0.25, -0.25, -0.25, 0.25, 0, 0, 0, 0, 0, 0, 0, 0}},
-    {{-0.125, -0.125, 0.125, 0.125, -0.125, -0.125, 0.125, 0.125, 0, 0, 0, 0, 0, 0, 0, 0, 0.25, 0.25, -0.25, -0.25}},
-    {{0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0, 0, 0, 0, -0.25, -0.25, -0.25, -0.25, 0, 0, 0, 0}},
-    {{-0.125, -0.125, -0.125, -0.125, 0.125, 0.125, 0.125, 0.125, 0, 0, 0, 0, 0.25, 0.25, -0.25, -0.25, 0, 0, 0, 0}},
-    {{0.125, -0.125, -0.125, 0.125, 0.125, -0.125, -0.125, 0.125, 0, 0, 0, 0, -0.25, 0.25, -0.25, 0.25, -0.25, 0.25, -0.25, 0.25}},
-    {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.25, -0.25, -0.25, 0.25, 0, 0, 0, 0}},
-    {{-0.125, 0.125, 0.125, -0.125, -0.125, 0.125, 0.125, -0.125, 0, 0, 0, 0, 0, 0, 0, 0, 0.25, -0.25, 0.25, -0.25}},
-    {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.25, -0.25, -0.25, 0.25}},
-    {{-0.125, 0.125, -0.125, 0.125, 0.125, -0.125, 0.125, -0.125, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}},
-    {{0.125, -0.125, 0.125, -0.125, 0.125, -0.125, 0.125, -0.125, 0, 0, 0, 0, 0, 0, 0, 0, -0.25, 0.25, 0.25, -0.25}},
-    {{-0.125, 0.125, 0.125, -0.125, -0.125, 0.125, 0.125, -0.125, 0, 0, 0, 0, 0.25, -0.25, 0.25, -0.25, 0, 0, 0, 0}},
-    {{0.125, -0.125, -0.125, 0.125, -0.125, 0.125, 0.125, -0.125, 0, 0, 0, 0, -0.25, 0.25, 0.25, -0.25, 0, 0, 0, 0}},
-    {{0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, -0.25, -0.25, -0.25, -0.25, 0, 0, 0, 0, 0, 0, 0, 0}},
-    {{-0.125, -0.125, -0.125, -0.125, 0.125, 0.125, 0.125, 0.125, 0.25, -0.25, 0.25, -0.25, 0, 0, 0, 0, 0, 0, 0, 0}},
-    {{-0.125, -0.125, 0.125, 0.125, -0.125, -0.125, 0.125, 0.125, 0.25, 0.25, -0.25, -0.25, 0, 0, 0, 0, 0, 0, 0, 0}},
-    {{0.125, 0.125, -0.125, -0.125, -0.125, -0.125, 0.125, 0.125, -0.25, 0.25, 0.25, -0.25, 0, 0, 0, 0, 0, 0, 0, 0}}
-}};
-
 // Value and first/second derivatives of the 1D monomial x^a. The derivative of
 // a constant or linear term collapses to zero, so negative powers never arise.
 struct MonomialAxis {
@@ -299,20 +240,14 @@ inline MonomialAxis monomial_axis(double x, int exponent) {
 // reference point. For each monomial j the routine forms x^a y^b z^c and the
 // requested derivatives, then accumulates the coefficient-weighted sum into the
 // output slots. `count` is both the number of monomials and the number of basis
-// functions (the coefficient table is square). Outputs are assumed pre-zeroed by
-// the caller; an empty span skips that quantity.
-//
-// `table_to_output_order` maps each output slot to the basis column of the
-// coefficient table. An empty span means the table is already in output (public
-// node) order, i.e. the identity permutation: Hex20 supplies a real permutation
-// because its table is authored in an internal node order, while Wedge15 and the
-// quadrilateral serendipity tables are authored directly in public order.
+// functions (the coefficient table is square). The table is in public basis
+// order, so output slot i reads coefficient column i directly. Outputs are
+// assumed pre-zeroed by the caller; an empty span skips that quantity.
 template <typename ExponentFn, typename CoeffFn>
 void eval_monomial_basis(double r, double s, double t,
                          std::size_t count,
                          ExponentFn&& exponent,
                          CoeffFn&& coeff,
-                         std::span<const std::size_t> table_to_output_order,
                          std::span<double> values,
                          std::span<Gradient> gradients,
                          std::span<Hessian> hessians) {
@@ -347,9 +282,7 @@ void eval_monomial_basis(double r, double s, double t,
         }
 
         for (std::size_t slot = 0; slot < count; ++slot) {
-            const std::size_t basis_index =
-                table_to_output_order.empty() ? slot : table_to_output_order[slot];
-            const double c = coeff(j, basis_index);
+            const double c = coeff(j, slot);
             if (want_values) {
                 values[slot] += c * phi;
             }
@@ -425,31 +358,59 @@ SerendipityBasis::SerendipityBasis(ElementType type, int order)
         // Quadrilateral serendipity is generated from its monomial space, so the
         // basis size, reference nodes, and nodal coefficient table are all built
         // here from the effective order.
-        quad_monomial_exponents_ = quad_serendipity_exponents(order_);
-        size_ = quad_monomial_exponents_.size();
+        const auto quad_exponents = quad_serendipity_exponents(order_);
+        monomial_exponents_.clear();
+        monomial_exponents_.reserve(quad_exponents.size());
+        for (const auto& e : quad_exponents) {
+            monomial_exponents_.push_back({e[0], e[1], 0});
+        }
+        size_ = monomial_exponents_.size();
         nodes_ = quad_serendipity_nodes(order_, size_);
         svmp::throw_if<BasisConstructionException>(
             nodes_.size() != size_, SVMP_HERE,
             "SerendipityBasis: quadrilateral serendipity setup produced inconsistent sizes");
-        quad_inv_vandermonde_ =
-            quad_serendipity_inverse_vandermonde(nodes_, quad_monomial_exponents_, order_);
+        inv_vandermonde_ = build_inverse_vandermonde(
+            nodes_, monomial_exponents_, "Quad order " + std::to_string(order_));
         return;
     }
 
-    // Hex8/Hex20/Wedge15 use fixed node layouts with tabulated coefficients: the
-    // size is pinned by the layout and the reference nodes come straight from
-    // ReferenceNodeLayout.
     if (type == ElementType::Hex8) {
+        // Hex8 is the standard trilinear corner basis, evaluated directly rather
+        // than through a generated coefficient table.
         size_ = 8u;
-    } else if (type == ElementType::Hex20) {
+        nodes_ = ReferenceNodeLayout::node_coords(element_type_);
+        svmp::throw_if<BasisConstructionException>(
+            nodes_.size() != size_, SVMP_HERE,
+            "SerendipityBasis: Hex8 layout node count does not match basis size");
+        return;
+    }
+
+    // Hex20 and Wedge15 have fixed monomial spaces and reference layouts. Their
+    // nodal coefficient tables are generated by inverting the Vandermonde built
+    // from the public-order reference nodes, exactly like the quadrilateral, so
+    // no transcribed tables or output permutation are needed.
+    std::span<const std::array<int, 3>> family_exponents;
+    std::string label;
+    if (type == ElementType::Hex20) {
         size_ = 20u;
-    } else {
-        size_ = 15u;  // Wedge15
+        family_exponents = std::span<const std::array<int, 3>>(
+            kHex20MonomialExponents.data(), kHex20MonomialExponents.size());
+        label = "Hex20";
+    } else {  // Wedge15
+        size_ = 15u;
+        family_exponents = std::span<const std::array<int, 3>>(
+            kWedge15MonomialExponents.data(), kWedge15MonomialExponents.size());
+        label = "Wedge15";
     }
     nodes_ = ReferenceNodeLayout::node_coords(element_type_);
     svmp::throw_if<BasisConstructionException>(
         nodes_.size() != size_, SVMP_HERE,
         "SerendipityBasis: fixed serendipity layout node count does not match basis size");
+    svmp::throw_if<BasisConstructionException>(
+        family_exponents.size() != size_, SVMP_HERE,
+        "SerendipityBasis: serendipity monomial count does not match basis size");
+    monomial_exponents_.assign(family_exponents.begin(), family_exponents.end());
+    inv_vandermonde_ = build_inverse_vandermonde(nodes_, monomial_exponents_, label);
 }
 
 void SerendipityBasis::evaluate_all_to(const math::Vector<double, 3>& xi,
@@ -478,64 +439,26 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<double, 3>& xi,
     const double y = xi[1];
     const double z = xi[2];
 
-    if (dimension_ == 2) {
-        svmp::throw_if<BasisEvaluationException>(
-            quad_monomial_exponents_.size() != size_ ||
-                quad_inv_vandermonde_.size() != size_ * size_,
-            SVMP_HERE,
-            "SerendipityBasis: quadrilateral interpolation tables are not initialized for value evaluation");
-
-        // Quadrilateral serendipity monomials are planar; the through-axis
-        // exponent is zero, so all z derivatives vanish. The inverse Vandermonde
-        // is already in public node order (identity output ordering).
-        eval_monomial_basis(
-            x, y, z, size_,
-            [this](std::size_t j) {
-                const auto& e = quad_monomial_exponents_[j];
-                return std::array<int, 3>{e[0], e[1], 0};
-            },
-            [this](std::size_t j, std::size_t i) {
-                return quad_inv_vandermonde_[j * size_ + i];
-            },
-            std::span<const std::size_t>{},
-            values_out, gradients_out, hessians_out);
-        return;
-    }
-
-    if (dimension_ == 3 && order_ == 1) {
+    if (element_type_ == ElementType::Hex8) {
         evaluate_hex8_reference(x, y, z, values_out, gradients_out, hessians_out);
         return;
     }
 
-    if (element_type_ == ElementType::Hex20) {
-        // The Hex20 coefficient table is authored in an internal node order, so
-        // results are remapped into the public node layout through mesh_to_basis.
-        const auto mesh_to_basis = ReferenceNodeLayout::mesh_to_basis_ordering(element_type_);
-        svmp::throw_if<BasisEvaluationException>(mesh_to_basis.size() != size_, SVMP_HERE,
-                                               "Hex20 mesh-to-basis ordering is not registered");
-        eval_monomial_basis(
-            x, y, z, size_,
-            [](std::size_t j) { return kHex20MonomialExponents[j]; },
-            [](std::size_t j, std::size_t i) { return kHex20Coefficients[j][i]; },
-            mesh_to_basis,
-            values_out, gradients_out, hessians_out);
-        return;
-    }
-
-    if (element_type_ == ElementType::Wedge15) {
-        // The Wedge15 coefficient table is authored directly in public node
-        // order, so no output reordering is applied (identity permutation).
-        eval_monomial_basis(
-            x, y, z, size_,
-            [](std::size_t j) { return kWedge15MonomialExponents[j]; },
-            [](std::size_t j, std::size_t i) { return kWedge15Coefficients[j][i]; },
-            std::span<const std::size_t>{},
-            values_out, gradients_out, hessians_out);
-        return;
-    }
-
-    svmp::raise<BasisEvaluationException>(SVMP_HERE,
-        "SerendipityBasis::evaluate_all_to: unsupported serendipity configuration");
+    // Quad, Hex20, and Wedge15 evaluate through their generated coefficient
+    // table, which is already in public basis order.
+    svmp::throw_if<BasisEvaluationException>(
+        monomial_exponents_.size() != size_ ||
+            inv_vandermonde_.size() != size_ * size_,
+        SVMP_HERE,
+        "SerendipityBasis: interpolation tables are not initialized for evaluation");
+
+    eval_monomial_basis(
+        x, y, z, size_,
+        [this](std::size_t j) { return monomial_exponents_[j]; },
+        [this](std::size_t j, std::size_t i) {
+            return inv_vandermonde_[j * size_ + i];
+        },
+        values_out, gradients_out, hessians_out);
 }
 
 void SerendipityBasis::evaluate_values(const math::Vector<double, 3>& xi,
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.h b/Code/Source/solver/FE/Basis/SerendipityBasis.h
index 29bc2e7da..0e09ef203 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.h
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.h
@@ -84,11 +84,12 @@ namespace basis {
  * supported basis.
  *
  * Hex8 uses the standard trilinear corner basis
- * @f$(1 \pm r)(1 \pm s)(1 \pm t)/8@f$. Hex20 and Wedge15 use tabulated
- * polynomial coefficient tables over monomial bases; analytical gradients and
- * Hessians are obtained by differentiating those monomials. Hex20 evaluation
- * is reordered through ReferenceNodeLayout so the output matches the public
- * basis ordering.
+ * @f$(1 \pm r)(1 \pm s)(1 \pm t)/8@f$. Hex20 and Wedge15 use fixed monomial
+ * spaces whose nodal coefficient tables are generated at construction by
+ * inverting the Vandermonde built from their public-order reference nodes, the
+ * same way the quadrilateral space is built; analytical gradients and Hessians
+ * are obtained by differentiating those monomials. Because the tables are
+ * generated in public node order, evaluation needs no output reordering.
  */
 class SerendipityBasis final : public BasisFunction {
 public:
@@ -96,11 +97,11 @@ class SerendipityBasis final : public BasisFunction {
      * @brief Construct a serendipity basis for an element type and polynomial order.
      *
      * @details The constructor selects the topology-specific interpolation
-     * space, computes the reference node coordinates, and initializes any
-     * coefficient tables needed for evaluation. Quadrilateral bases build and
-     * invert a Vandermonde matrix for the selected serendipity monomials.
-     * Hex20 and Wedge15 use fixed coefficient tables. For hexahedra, only
-     * linear Hex8 and quadratic Hex20 serendipity spaces are supported. For
+     * space, computes the reference node coordinates, and builds the nodal
+     * coefficient table needed for evaluation. Quadrilateral, Hex20, and Wedge15
+     * bases build and invert a Vandermonde matrix for their serendipity
+     * monomials; Hex8 uses the trilinear corner basis directly. For hexahedra,
+     * only linear Hex8 and quadratic Hex20 serendipity spaces are supported. For
      * wedges, only quadratic Wedge15 is supported. Quad4 supports explicit
      * quadrilateral serendipity requests of any order @f$p \ge 1@f$; Quad8 is
      * restricted to order 2.
@@ -152,7 +153,7 @@ class SerendipityBasis final : public BasisFunction {
      * monomial vector and multiplies by the inverse Vandermonde matrix to
      * obtain nodal shape-function values. For Hex8, values are the standard
      * trilinear corner products. For Hex20 and Wedge15, values are evaluated
-     * from the stored polynomial coefficient tables.
+     * from their generated nodal coefficient tables.
      *
      * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
      * @param values Receives one value per basis function.
@@ -167,7 +168,7 @@ class SerendipityBasis final : public BasisFunction {
      * coordinates. Quadrilateral gradients differentiate the monomial vector
      * before applying the inverse Vandermonde coefficients. Hex8 gradients are
      * direct derivatives of the trilinear corner products. Hex20 and Wedge15
-     * gradients are computed by differentiating the tabulated monomial
+     * gradients are computed by differentiating their generated monomial
      * expansions.
      *
      * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
@@ -184,7 +185,7 @@ class SerendipityBasis final : public BasisFunction {
      * derivatives of the monomial vector and inverse Vandermonde coefficients.
      * Hex8 Hessians are computed directly from the trilinear corner products.
      * Hex20 and Wedge15 Hessians are computed by differentiating their
-     * polynomial coefficient tables twice.
+     * generated monomial expansions twice.
      *
      * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
      * @param hessians Receives one 3-by-3 Hessian per basis function.
@@ -239,9 +240,10 @@ class SerendipityBasis final : public BasisFunction {
     int order_;
     std::size_t size_;
     std::vector<math::Vector<double, 3>> nodes_;
-    std::vector<std::array<int, 2>> quad_monomial_exponents_;
+    // Monomial exponents (r^a s^b t^c) spanning the family's polynomial space.
+    std::vector<std::array<int, 3>> monomial_exponents_;
     // Row-major inverse Vandermonde, indexed as [monomial, basis].
-    std::vector<double> quad_inv_vandermonde_;
+    std::vector<double> inv_vandermonde_;
 
     void evaluate_all_to(const math::Vector<double, 3>& xi,
                          std::span<double> values_out,
diff --git a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
index 32406400c..18f0e7b94 100644
--- a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
@@ -190,6 +190,130 @@ double bilinear_function(const math::Vector<double, 3>& p) {
     return double(2) - double(3) * p[0] + double(4) * p[1] + double(0.5) * p[0] * p[1];
 }
 
+// --- 3D serendipity guard-test helpers (Hex20, Wedge15) --------------------
+//
+// Like the quadrilateral _for_test helpers above, these re-derive the monomial
+// selection and reference-node placement from the mathematical definition of
+// each serendipity space, independently of the production tables in
+// SerendipityBasis.cpp
+
+double monomial_value_3d_for_test(const math::Vector<double, 3>& p,
+                                const std::array<int, 3>& exponent) {
+    return integer_power_for_test(p[0], exponent[0]) *
+           integer_power_for_test(p[1], exponent[1]) *
+           integer_power_for_test(p[2], exponent[2]);
+}
+
+std::vector<double> vandermonde_3d_for_test(
+    const std::vector<math::Vector<double, 3>>& nodes,
+    const std::vector<std::array<int, 3>>& exponents) {
+    const std::size_t n = nodes.size();
+    std::vector<double> vandermonde(n * n, double(0));
+    for (std::size_t row = 0; row < n; ++row) {
+        for (std::size_t col = 0; col < n; ++col) {
+            vandermonde[row * n + col] =
+                monomial_value_3d_for_test(nodes[row], exponents[col]);
+        }
+    }
+    return vandermonde;
+}
+
+// Superlinear degree generalized to three axes (the quadrilateral rule extended
+// to t). An exponent contributes only when it exceeds one.
+int superlinear_degree_3d_for_test(int ax, int ay, int az) {
+    return (ax > 1 ? ax : 0) + (ay > 1 ? ay : 0) + (az > 1 ? az : 0);
+}
+
+// Hex20 serendipity span: every (ax, ay, az) in {0,1,2}^3 with superlinear
+// degree at most two.
+std::vector<std::array<int, 3>> hex20_serendipity_exponents_for_test() {
+    std::vector<std::array<int, 3>> exponents;
+    for (int ax = 0; ax <= 2; ++ax) {
+        for (int ay = 0; ay <= 2; ++ay) {
+            for (int az = 0; az <= 2; ++az) {
+                if (superlinear_degree_3d_for_test(ax, ay, az) <= 2) {
+                    exponents.push_back({ax, ay, az});
+                }
+            }
+        }
+    }
+    return exponents;
+}
+
+// Wedge15 serendipity span: triangle monomials (ax, ay) with ax + ay <= 2,
+// tensored with the through-axis. Linear triangle monomials (ax + ay <= 1) carry
+// t-degree up to two; quadratic triangle monomials (ax + ay == 2) carry t-degree
+// up to one.
+std::vector<std::array<int, 3>> wedge15_serendipity_exponents_for_test() {
+    std::vector<std::array<int, 3>> exponents;
+    for (int ax = 0; ax <= 2; ++ax) {
+        for (int ay = 0; ax + ay <= 2; ++ay) {
+            const int triangle_degree = ax + ay;
+            const int max_t = (triangle_degree <= 1) ? 2 : 1;
+            for (int az = 0; az <= max_t; ++az) {
+                exponents.push_back({ax, ay, az});
+            }
+        }
+    }
+    return exponents;
+}
+
+// Independent Hex20 reference layout: the eight cube corners followed by the
+// twelve edge midpoints, in the corner/edge order the reference layout uses.
+std::vector<math::Vector<double, 3>> hex20_reference_nodes_for_test() {
+    std::vector<math::Vector<double, 3>> corners;
+    corners.push_back({double(-1), double(-1), double(-1)});
+    corners.push_back({double(1), double(-1), double(-1)});
+    corners.push_back({double(1), double(1), double(-1)});
+    corners.push_back({double(-1), double(1), double(-1)});
+    corners.push_back({double(-1), double(-1), double(1)});
+    corners.push_back({double(1), double(-1), double(1)});
+    corners.push_back({double(1), double(1), double(1)});
+    corners.push_back({double(-1), double(1), double(1)});
+
+    const int edges[12][2] = {
+        {0, 1}, {1, 2}, {2, 3}, {3, 0},
+        {4, 5}, {5, 6}, {6, 7}, {7, 4},
+        {0, 4}, {1, 5}, {2, 6}, {3, 7},
+    };
+
+    std::vector<math::Vector<double, 3>> nodes = corners;
+    for (const auto& edge : edges) {
+        const math::Vector<double, 3> midpoint =
+            (corners[static_cast<std::size_t>(edge[0])] +
+             corners[static_cast<std::size_t>(edge[1])]) * double(0.5);
+        nodes.push_back(midpoint);
+    }
+    return nodes;
+}
+
+// Independent Wedge15 reference layout: the six prism corners followed by the
+// nine edge midpoints, in reference-layout order.
+std::vector<math::Vector<double, 3>> wedge15_reference_nodes_for_test() {
+    std::vector<math::Vector<double, 3>> corners;
+    corners.push_back({double(0), double(0), double(-1)});
+    corners.push_back({double(1), double(0), double(-1)});
+    corners.push_back({double(0), double(1), double(-1)});
+    corners.push_back({double(0), double(0), double(1)});
+    corners.push_back({double(1), double(0), double(1)});
+    corners.push_back({double(0), double(1), double(1)});
+
+    const int edges[9][2] = {
+        {0, 1}, {1, 2}, {2, 0},
+        {3, 4}, {4, 5}, {5, 3},
+        {0, 3}, {1, 4}, {2, 5},
+    };
+
+    std::vector<math::Vector<double, 3>> nodes = corners;
+    for (const auto& edge : edges) {
+        const math::Vector<double, 3> midpoint =
+            (corners[static_cast<std::size_t>(edge[0])] +
+             corners[static_cast<std::size_t>(edge[1])]) * double(0.5);
+        nodes.push_back(midpoint);
+    }
+    return nodes;
+}
+
 } // namespace
 
 TEST(SerendipityBasis, Quad8IsNodalAndPartitionsUnity) {
@@ -446,3 +570,154 @@ TEST(SerendipityBasis, TrilinearHexMatchesLagrangeHex8) {
         }
     }
 }
+
+// The Hex20 and Wedge15 guard tests below pin the nodal coefficients away from
+// the reference nodes. Each builds an independent Vandermonde from the
+// re-derived monomial span and reference nodes, so a coefficient error that
+// still vanishes at the nodes is caught.
+
+TEST(SerendipityBasis, Hex20VandermondeHasFullRank) {
+    SerendipityBasis basis(ElementType::Hex20, 2);
+    const auto exponents = hex20_serendipity_exponents_for_test();
+    const std::size_t n = basis.size();
+    ASSERT_EQ(exponents.size(), n);
+    const auto vandermonde = vandermonde_3d_for_test(basis.nodes(), exponents);
+    ASSERT_EQ(vandermonde.size(), n * n);
+    EXPECT_EQ(math::dense_matrix_rank(vandermonde, n, n), n);
+}
+
+TEST(SerendipityBasis, Wedge15VandermondeHasFullRank) {
+    SerendipityBasis basis(ElementType::Wedge15, 2);
+    const auto exponents = wedge15_serendipity_exponents_for_test();
+    const std::size_t n = basis.size();
+    ASSERT_EQ(exponents.size(), n);
+    const auto vandermonde = vandermonde_3d_for_test(basis.nodes(), exponents);
+    ASSERT_EQ(vandermonde.size(), n * n);
+    EXPECT_EQ(math::dense_matrix_rank(vandermonde, n, n), n);
+}
+
+// V * C == I guard: independently invert the Vandermonde and confirm the basis
+// evaluates to the same inverse-Vandermonde nodal functions, without reading the
+// basis's internal coefficient table.
+TEST(SerendipityBasis, Hex20MatchesIndependentlyInvertedVandermonde) {
+    SerendipityBasis basis(ElementType::Hex20, 2);
+    const auto exponents = hex20_serendipity_exponents_for_test();
+    const std::size_t n = basis.size();
+    ASSERT_EQ(exponents.size(), n);
+    auto vandermonde = vandermonde_3d_for_test(basis.nodes(), exponents);
+    const auto coefficients =
+        math::invert_dense_matrix(std::move(vandermonde), n, "Hex20 test Vandermonde");
+
+    const std::vector<math::Vector<double, 3>> points = {
+        {double(0.2), double(-0.1), double(0.3)},
+        {double(-0.35), double(0.25), double(-0.15)},
+        {double(0.11), double(0.23), double(-0.42)},
+    };
+    for (const auto& xi : points) {
+        std::vector<double> values;
+        basis.evaluate_values(xi, values);
+        ASSERT_EQ(values.size(), n);
+        for (std::size_t i = 0; i < n; ++i) {
+            double expected = double(0);
+            for (std::size_t j = 0; j < n; ++j) {
+                expected += coefficients[j * n + i] *
+                            monomial_value_3d_for_test(xi, exponents[j]);
+            }
+            EXPECT_NEAR(values[i], expected, double(1e-10)) << "basis=" << i;
+        }
+    }
+}
+
+TEST(SerendipityBasis, Wedge15MatchesIndependentlyInvertedVandermonde) {
+    SerendipityBasis basis(ElementType::Wedge15, 2);
+    const auto exponents = wedge15_serendipity_exponents_for_test();
+    const std::size_t n = basis.size();
+    ASSERT_EQ(exponents.size(), n);
+    auto vandermonde = vandermonde_3d_for_test(basis.nodes(), exponents);
+    const auto coefficients =
+        math::invert_dense_matrix(std::move(vandermonde), n, "Wedge15 test Vandermonde");
+
+    const std::vector<math::Vector<double, 3>> points = {
+        {double(0.2), double(0.3), double(0.1)},
+        {double(0.25), double(0.25), double(-0.4)},
+        {double(0.1), double(0.6), double(0.5)},
+    };
+    for (const auto& xi : points) {
+        std::vector<double> values;
+        basis.evaluate_values(xi, values);
+        ASSERT_EQ(values.size(), n);
+        for (std::size_t i = 0; i < n; ++i) {
+            double expected = double(0);
+            for (std::size_t j = 0; j < n; ++j) {
+                expected += coefficients[j * n + i] *
+                            monomial_value_3d_for_test(xi, exponents[j]);
+            }
+            EXPECT_NEAR(values[i], expected, double(1e-10)) << "basis=" << i;
+        }
+    }
+}
+
+// Non-nodal polynomial reproduction: the basis must reproduce every monomial in
+// its span at interior points, not just interpolate at the nodes.
+TEST(SerendipityBasis, Hex20ReproducesEverySerendipityMonomial) {
+    SerendipityBasis basis(ElementType::Hex20, 2);
+    const auto exponents = hex20_serendipity_exponents_for_test();
+    ASSERT_EQ(exponents.size(), basis.size());
+
+    const std::vector<math::Vector<double, 3>> points = {
+        {double(0.2), double(-0.1), double(0.3)},
+        {double(-0.35), double(0.25), double(-0.15)},
+        {double(0.11), double(0.23), double(-0.42)},
+    };
+    for (const auto& exponent : exponents) {
+        for (const auto& xi : points) {
+            const double interpolated = interpolate_nodal_function(
+                basis, xi,
+                [&exponent](const math::Vector<double, 3>& node) {
+                    return monomial_value_3d_for_test(node, exponent);
+                });
+            EXPECT_NEAR(interpolated, monomial_value_3d_for_test(xi, exponent),
+                        double(1e-10))
+                << "ax=" << exponent[0] << " ay=" << exponent[1]
+                << " az=" << exponent[2];
+        }
+    }
+}
+
+TEST(SerendipityBasis, Wedge15ReproducesEverySerendipityMonomial) {
+    SerendipityBasis basis(ElementType::Wedge15, 2);
+    const auto exponents = wedge15_serendipity_exponents_for_test();
+    ASSERT_EQ(exponents.size(), basis.size());
+
+    const std::vector<math::Vector<double, 3>> points = {
+        {double(0.2), double(0.3), double(0.1)},
+        {double(0.25), double(0.25), double(-0.4)},
+        {double(0.1), double(0.6), double(0.5)},
+    };
+    for (const auto& exponent : exponents) {
+        for (const auto& xi : points) {
+            const double interpolated = interpolate_nodal_function(
+                basis, xi,
+                [&exponent](const math::Vector<double, 3>& node) {
+                    return monomial_value_3d_for_test(node, exponent);
+                });
+            EXPECT_NEAR(interpolated, monomial_value_3d_for_test(xi, exponent),
+                        double(1e-10))
+                << "ax=" << exponent[0] << " ay=" << exponent[1]
+                << " az=" << exponent[2];
+        }
+    }
+}
+
+// Independent node-coordinate anchor: the reference nodes must be the cube/prism
+// corners and edge midpoints, breaking the loop where the basis and its node
+// table are otherwise only checked against each other.
+TEST(SerendipityBasis, Hex20ReferenceNodesMatchIndependentConstruction) {
+    SerendipityBasis basis(ElementType::Hex20, 2);
+    expect_nodes_near(basis.nodes(), hex20_reference_nodes_for_test(), double(1e-14));
+}
+
+TEST(SerendipityBasis, Wedge15ReferenceNodesMatchIndependentConstruction) {
+    SerendipityBasis basis(ElementType::Wedge15, 2);
+    expect_nodes_near(basis.nodes(), wedge15_reference_nodes_for_test(), double(1e-14));
+}

From f11f3d034d014ce6ad07703576e3e7ed39a3f357 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 24 Jun 2026 11:01:52 -0700
Subject: [PATCH 57/91] fixed quad8 layout to one standard

---
 .../solver/FE/Basis/SerendipityBasis.cpp      | 11 +++--
 .../Source/solver/FE/Basis/SerendipityBasis.h | 38 +++++++++-------
 .../FE/Basis/test_SerendipityBasis.cpp        | 44 +++++++++++++++++++
 3 files changed, 72 insertions(+), 21 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index 33d46b0dd..d87a4ab7e 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -355,9 +355,10 @@ SerendipityBasis::SerendipityBasis(ElementType type, int order)
     order_ = normalized.order;
 
     if (type == ElementType::Quad4 || type == ElementType::Quad8) {
-        // Quadrilateral serendipity is generated from its monomial space, so the
-        // basis size, reference nodes, and nodal coefficient table are all built
-        // here from the effective order.
+        // Quadrilateral serendipity is generated from its monomial space: the
+        // basis size, monomial exponents, and nodal coefficient table are built
+        // here from the effective order, and the coefficient table is the inverse
+        // Vandermonde of those monomials at the reference nodes.
         const auto quad_exponents = quad_serendipity_exponents(order_);
         monomial_exponents_.clear();
         monomial_exponents_.reserve(quad_exponents.size());
@@ -365,7 +366,9 @@ SerendipityBasis::SerendipityBasis(ElementType type, int order)
             monomial_exponents_.push_back({e[0], e[1], 0});
         }
         size_ = monomial_exponents_.size();
-        nodes_ = quad_serendipity_nodes(order_, size_);
+        nodes_ = (type == ElementType::Quad8)
+                     ? ReferenceNodeLayout::node_coords(element_type_)
+                     : quad_serendipity_nodes(order_, size_);
         svmp::throw_if<BasisConstructionException>(
             nodes_.size() != size_, SVMP_HERE,
             "SerendipityBasis: quadrilateral serendipity setup produced inconsistent sizes");
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.h b/Code/Source/solver/FE/Basis/SerendipityBasis.h
index 0e09ef203..14c0b151f 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.h
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.h
@@ -76,20 +76,23 @@ namespace basis {
  *
  * `SerendipityBasis(ElementType::Quad4, p)` supports explicit
  * arbitrary-order quadrilateral serendipity requests for @f$p \ge 1@f$
- * (requests below one are normalized to one). `ElementType::Quad8` remains
- * the standard quadratic eight-node layout and is valid only with order 2.
- * Solver-default basis selection remains separate: `basis_factory` maps the
- * complete Quad4 layout to the default linear Lagrange basis and maps Quad8 to
- * quadratic serendipity unless a caller explicitly requests a different
- * supported basis.
+ * (requests below one are normalized to one) and generates its own reference
+ * nodes, since the higher-order interior ordering is an implementation
+ * convention rather than a public layout. `ElementType::Quad8` is the named
+ * quadratic eight-node layout (valid only with order 2) and, like Hex20 and
+ * Wedge15, takes its reference nodes from ReferenceNodeLayout so that all named
+ * fixed layouts share the single public node ordering. Solver-default basis
+ * selection remains separate: `basis_factory` maps the complete Quad4 layout to
+ * the default linear Lagrange basis and maps Quad8 to quadratic serendipity
+ * unless a caller explicitly requests a different supported basis.
  *
  * Hex8 uses the standard trilinear corner basis
- * @f$(1 \pm r)(1 \pm s)(1 \pm t)/8@f$. Hex20 and Wedge15 use fixed monomial
- * spaces whose nodal coefficient tables are generated at construction by
- * inverting the Vandermonde built from their public-order reference nodes, the
- * same way the quadrilateral space is built; analytical gradients and Hessians
- * are obtained by differentiating those monomials. Because the tables are
- * generated in public node order, evaluation needs no output reordering.
+ * @f$(1 \pm r)(1 \pm s)(1 \pm t)/8@f$. Quad8, Hex20, and Wedge15 use fixed
+ * monomial spaces whose nodal coefficient tables are generated at construction
+ * by inverting the Vandermonde built from their public-order ReferenceNodeLayout
+ * nodes; analytical gradients and Hessians are obtained by differentiating those
+ * monomials. Because the tables are generated in public node order, evaluation
+ * needs no output reordering.
  */
 class SerendipityBasis final : public BasisFunction {
 public:
@@ -132,11 +135,12 @@ class SerendipityBasis final : public BasisFunction {
      * @brief Return the reference interpolation nodes in basis ordering.
      *
      * @details Node coordinates are the points at which the serendipity basis
-     * satisfies the nodal interpolation property. Quadrilateral nodes are
-     * placed first on the boundary and then, for higher order requests, at the
-     * selected interior points needed to make the reduced monomial space
-     * unisolvent. Hexahedral and wedge nodes are taken from
-     * ReferenceNodeLayout. For high-order Quad4 serendipity, the deterministic
+     * satisfies the nodal interpolation property. quadrilateral, hexahedral, and wedge
+     * nodes are taken from ReferenceNodeLayout, the public node-ordering source
+     * the solver adapter permutes against. Arbitrary-order Quad4 nodes are
+     * generated here instead: boundary nodes first and then, for higher order
+     * requests, the selected interior points needed to make the reduced monomial
+     * space unisolvent. For high-order Quad4 serendipity, the deterministic
      * interior row ordering is an implementation convention; callers should
      * pair it with basis values from the same object rather than assume an
      * external mesh ordering contract beyond the supported Quad4/Quad8
diff --git a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
index 18f0e7b94..e500a803a 100644
--- a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
@@ -314,6 +314,28 @@ std::vector<math::Vector<double, 3>> wedge15_reference_nodes_for_test() {
     return nodes;
 }
 
+// Independent Quad8 reference layout: the four quad corners followed by the four
+// edge midpoints, in the corner/edge order the reference layout uses (the VTK
+// quad boundary traversal). Mirrors the Hex20/Wedge15 anchors above.
+std::vector<math::Vector<double, 3>> quad8_reference_nodes_for_test() {
+    std::vector<math::Vector<double, 3>> corners;
+    corners.push_back({double(-1), double(-1), double(0)});
+    corners.push_back({double(1), double(-1), double(0)});
+    corners.push_back({double(1), double(1), double(0)});
+    corners.push_back({double(-1), double(1), double(0)});
+
+    const int edges[4][2] = {{0, 1}, {1, 2}, {2, 3}, {3, 0}};
+
+    std::vector<math::Vector<double, 3>> nodes = corners;
+    for (const auto& edge : edges) {
+        const math::Vector<double, 3> midpoint =
+            (corners[static_cast<std::size_t>(edge[0])] +
+             corners[static_cast<std::size_t>(edge[1])]) * double(0.5);
+        nodes.push_back(midpoint);
+    }
+    return nodes;
+}
+
 } // namespace
 
 TEST(SerendipityBasis, Quad8IsNodalAndPartitionsUnity) {
@@ -321,11 +343,33 @@ TEST(SerendipityBasis, Quad8IsNodalAndPartitionsUnity) {
     SerendipityBasis explicit_quad4_basis(ElementType::Quad4, 2);
 
     EXPECT_EQ(basis.size(), 8u);
+    // Quad8 sources its nodes from ReferenceNodeLayout while explicit Quad4 order
+    // 2 uses the local arbitrary-order generator, so this also pins the two
+    // independent quadrilateral node sources to agree at the production order.
     expect_nodes_near(basis.nodes(), explicit_quad4_basis.nodes(), double(1e-14));
     expect_nodal_delta(basis, basis.nodes(), double(1e-10));
     expect_partition_of_unity(basis, {double(0.17), double(-0.31), double(0)});
 }
 
+// Quad8 takes its reference nodes from ReferenceNodeLayout -- the single public
+// node-ordering source the solver adapter permutes against, the same source
+// Hex20 and Wedge15 use.
+TEST(SerendipityBasis, Quad8ReferenceNodesComeFromReferenceNodeLayout) {
+    SerendipityBasis basis(ElementType::Quad8, 2);
+    expect_nodes_near(basis.nodes(),
+                      ReferenceNodeLayout::node_coords(ElementType::Quad8),
+                      double(1e-14));
+}
+
+// Independent node-coordinate anchor for the Quad8 layout: the four corners
+// followed by the four edge midpoints, breaking the loop where the basis and the
+// reference layout are otherwise only checked against each other. Mirrors the
+// Hex20/Wedge15 independent-construction anchors.
+TEST(SerendipityBasis, Quad8ReferenceNodesMatchIndependentConstruction) {
+    SerendipityBasis basis(ElementType::Quad8, 2);
+    expect_nodes_near(basis.nodes(), quad8_reference_nodes_for_test(), double(1e-14));
+}
+
 TEST(SerendipityBasis, Hex20IsNodalAndPartitionsUnity) {
     SerendipityBasis basis(ElementType::Hex20, 2);
 

From 5a0efdae1bb3f812a8196efd70ee87a899b1ce9b Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 24 Jun 2026 12:26:50 -0700
Subject: [PATCH 58/91] making basis construction based on topology and order
 for arbitrary order basis functions. otherwise named basis functions (Line3,
 Tet10) imply specific order and topology

---
 Code/Source/solver/FE/Basis/BasisFunction.h   |  27 ++-
 Code/Source/solver/FE/Basis/BasisTraits.h     |  97 +++++++++++
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp |  69 ++++----
 Code/Source/solver/FE/Basis/LagrangeBasis.h   |  55 ++++--
 .../solver/FE/Basis/SerendipityBasis.cpp      | 135 ++++++++------
 .../Source/solver/FE/Basis/SerendipityBasis.h | 103 +++++++----
 .../FE/Basis/test_BasisErrorPaths.cpp         |  30 ++++
 .../unitTests/FE/Basis/test_BasisHessians.cpp | 149 ++++++++--------
 .../FE/Basis/test_ConstexprBasis.cpp          |  16 ++
 .../FE/Basis/test_HigherOrderWedge.cpp        |  11 +-
 .../unitTests/FE/Basis/test_LagrangeBasis.cpp | 164 +++++++++++-------
 .../FE/Basis/test_SerendipityBasis.cpp        |  53 ++++--
 12 files changed, 605 insertions(+), 304 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index 56683a1b0..c2c61ed6e 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -5,6 +5,7 @@
 #define SVMP_FE_BASIS_BASISFUNCTION_H
 
 #include "BasisExceptions.h"
+#include "BasisTraits.h"
 #include "Math/Matrix.h"
 #include "Math/Vector.h"
 #include "Types.h"
@@ -184,8 +185,30 @@ class BasisFunction {
     virtual BasisType basis_type() const noexcept = 0;
 
     /**
-     * @brief Return the canonical element type represented by this basis.
-     * @return Element type used for node layout and evaluation.
+     * @brief Return the reference topology of this basis.
+     *
+     * @details Together with order(), this is the authoritative identity of a
+     * basis: a topology plus a polynomial order, with no node-count assumption.
+     * Arbitrary-order bases are constructed from a BasisTopology and an order;
+     * named ElementType layouts (Hex8, Hex27, ...) are a fixed-order shorthand
+     * that maps to the same (topology, order) pair.
+     *
+     * @return Reference topology.
+     */
+    virtual BasisTopology topology() const noexcept = 0;
+
+    /**
+     * @brief Return the named element type for this basis, if one exists.
+     *
+     * @details Convenience accessor that round-trips to a named mesh element
+     * when one is defined for this (topology(), order(), basis_type()) triple
+     * (orders 0-2), and returns ElementType::Unknown otherwise (for example an
+     * order-0 P0 basis on a volume topology, or any order >= 3 that has no named
+     * layout). topology() + order() are the authoritative identity; this should
+     * not be used as a discriminator for high-order or topology-constructed
+     * bases.
+     *
+     * @return Named element type, or ElementType::Unknown when none applies.
      */
     virtual ElementType element_type() const noexcept = 0;
 
diff --git a/Code/Source/solver/FE/Basis/BasisTraits.h b/Code/Source/solver/FE/Basis/BasisTraits.h
index 9c5e0945d..aeb79498e 100644
--- a/Code/Source/solver/FE/Basis/BasisTraits.h
+++ b/Code/Source/solver/FE/Basis/BasisTraits.h
@@ -156,6 +156,103 @@ namespace detail {
     }
 }
 
+// Reference-space dimension of a basis topology: 0 for points up to 3 for
+// volume topologies; -1 for Unknown.
+[[nodiscard]] constexpr int topology_dimension(BasisTopology top) noexcept {
+    switch (top) {
+        case BasisTopology::Point:         return 0;
+        case BasisTopology::Line:          return 1;
+        case BasisTopology::Triangle:
+        case BasisTopology::Quadrilateral: return 2;
+        case BasisTopology::Tetrahedron:
+        case BasisTopology::Hexahedron:
+        case BasisTopology::Wedge:         return 3;
+        default:                           return -1;
+    }
+}
+
+// Lowest-order named element that represents a topology. Used internally to
+// drive the reference-node generators, which key on a canonical ElementType
+// (and re-canonicalize it). This is the inverse of topology() for the linear
+// elements and is purely an implementation detail: the node-count name never
+// leaks into the public basis identity.
+[[nodiscard]] constexpr ElementType lagrange_topology_representative(BasisTopology top) noexcept {
+    switch (top) {
+        case BasisTopology::Point:         return ElementType::Point1;
+        case BasisTopology::Line:          return ElementType::Line2;
+        case BasisTopology::Triangle:      return ElementType::Triangle3;
+        case BasisTopology::Quadrilateral: return ElementType::Quad4;
+        case BasisTopology::Tetrahedron:   return ElementType::Tetra4;
+        case BasisTopology::Hexahedron:    return ElementType::Hex8;
+        case BasisTopology::Wedge:         return ElementType::Wedge6;
+        default:                           return ElementType::Unknown;
+    }
+}
+
+// Polynomial order baked into a named Lagrange element layout: 0 for the point,
+// 1 for the linear elements, 2 for the complete-quadratic aliases; -1 for types
+// with no complete-Lagrange order (serendipity, pyramid, Unknown). Unlike
+// complete_lagrange_alias_order this also maps Point1 -> 0, so it is the single
+// source of truth the (ElementType, order) constructor validates against.
+[[nodiscard]] constexpr int named_lagrange_order(ElementType type) noexcept {
+    if (type == ElementType::Point1) {
+        return 0;
+    }
+    return complete_lagrange_alias_order(type);
+}
+
+// Inverse of (topology(), order()) for the named layouts: the ElementType that a
+// (topology, order, family) triple denotes, or Unknown when no named layout
+// exists (order 0 on a non-point topology, any order >= 3, or a reduced family
+// at an unsupported order). topology() + order() remain the authoritative
+// identity; this only backs the element_type() convenience accessor.
+[[nodiscard]] constexpr ElementType named_element_for(BasisTopology top, int order,
+                                                      BasisType family) noexcept {
+    if (family == BasisType::Serendipity) {
+        switch (top) {
+            case BasisTopology::Quadrilateral:
+                return order == 2 ? ElementType::Quad8 : ElementType::Unknown;
+            case BasisTopology::Hexahedron:
+                if (order == 1) { return ElementType::Hex8; }
+                if (order == 2) { return ElementType::Hex20; }
+                return ElementType::Unknown;
+            case BasisTopology::Wedge:
+                return order == 2 ? ElementType::Wedge15 : ElementType::Unknown;
+            default:
+                return ElementType::Unknown;
+        }
+    }
+
+    // Lagrange (and any nodal family built on the complete layouts).
+    if (top == BasisTopology::Point) {
+        return order == 0 ? ElementType::Point1 : ElementType::Unknown;
+    }
+    switch (order) {
+        case 1:
+            switch (top) {
+                case BasisTopology::Line:          return ElementType::Line2;
+                case BasisTopology::Triangle:      return ElementType::Triangle3;
+                case BasisTopology::Quadrilateral: return ElementType::Quad4;
+                case BasisTopology::Tetrahedron:   return ElementType::Tetra4;
+                case BasisTopology::Hexahedron:    return ElementType::Hex8;
+                case BasisTopology::Wedge:         return ElementType::Wedge6;
+                default:                           return ElementType::Unknown;
+            }
+        case 2:
+            switch (top) {
+                case BasisTopology::Line:          return ElementType::Line3;
+                case BasisTopology::Triangle:      return ElementType::Triangle6;
+                case BasisTopology::Quadrilateral: return ElementType::Quad9;
+                case BasisTopology::Tetrahedron:   return ElementType::Tetra10;
+                case BasisTopology::Hexahedron:    return ElementType::Hex27;
+                case BasisTopology::Wedge:         return ElementType::Wedge18;
+                default:                           return ElementType::Unknown;
+            }
+        default:
+            return ElementType::Unknown;
+    }
+}
+
 } // namespace basis
 } // namespace FE
 } // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index 8f90a94c6..ae3d33829 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -29,29 +29,16 @@ struct SimplexEval {
     std::vector<Hessian> hessian;
 };
 
-struct NormalizedLagrangeRequest {
-    ElementType element_type;
-    int order;
-};
-
-// Validate and return the supported basis topology for a Lagrange element type.
-BasisTopology supported_lagrange_topology(ElementType type) {
-    const BasisTopology top = topology(type);
-    svmp::throw_if<BasisElementCompatibilityException>(top == BasisTopology::Unknown, SVMP_HERE,
-                                                     "LagrangeBasis: unsupported element type");
-    return top;
-}
-
-// Normalize named higher-order element requests to base Lagrange topologies.
+// Validate a named-element Lagrange request and return its reference topology.
 //
-// This function only adds the LagrangeBasis routing policy: serendipity
-// layouts and pyramids are rejected, and a named quadratic alias
-// (Line3, Triangle6, Quad9, Tetra10, Hex27, Wedge18) is floored
-// to at least quadratic order. The floor only raises the
-// order: a higher requested order on an alias is honored, so
-// LagrangeBasis(Hex27, 5) yields an order-5 basis on the Hex8 topology rather
-// than rejecting the over-specified order.
-NormalizedLagrangeRequest normalize_lagrange_request(ElementType element_type, int order) {
+// Serendipity layouts and pyramids are rejected. The requested order must equal
+// the order baked into the named layout (0 for Point1, 1 for the linear
+// elements, 2 for the complete-quadratic aliases Line3/Triangle6/Quad9/Tetra10/
+// Hex27/Wedge18). A named element therefore cannot carry a conflicting order;
+// arbitrary orders are requested through the BasisTopology constructor, so a
+// reader never has to read a polynomial order out of a node-count name such as
+// Hex8 or Tetra10.
+BasisTopology validated_lagrange_topology(ElementType element_type, int order) {
     switch (element_type) {
         case ElementType::Quad8:
             svmp::raise<BasisElementCompatibilityException>(SVMP_HERE,
@@ -71,10 +58,15 @@ NormalizedLagrangeRequest normalize_lagrange_request(ElementType element_type, i
             break;
     }
 
-    const ElementType canonical = canonical_lagrange_type(element_type);
-    const bool is_quadratic_alias = canonical != element_type;
-    const int normalized_order = is_quadratic_alias ? std::max(order, 2) : order;
-    return {canonical, normalized_order};
+    const BasisTopology top = topology(element_type);
+    svmp::throw_if<BasisElementCompatibilityException>(top == BasisTopology::Unknown, SVMP_HERE,
+                                                     "LagrangeBasis: unsupported element type");
+
+    const int baked_order = named_lagrange_order(element_type);
+    svmp::throw_if<BasisConfigurationException>(order != baked_order, SVMP_HERE,
+        "LagrangeBasis: a named element layout has a fixed polynomial order; request the matching "
+        "BasisTopology with an explicit order to choose a different order");
+    return top;
 }
 
 // Convert an integer lattice index (i, j[, k]) into the barycentric exponent
@@ -263,19 +255,19 @@ void evaluate_simplex(const Vec3& xi,
 
 } // namespace
 
-LagrangeBasis::LagrangeBasis(ElementType type, int order)
-    : element_type_(type), order_(order) {
-    const auto normalized = normalize_lagrange_request(element_type_, order_);
-    element_type_ = normalized.element_type;
-    order_ = normalized.order;
+LagrangeBasis::LagrangeBasis(BasisTopology topology, int order)
+    : topology_(topology), order_(order) {
+    svmp::throw_if<BasisElementCompatibilityException>(topology_ == BasisTopology::Unknown, SVMP_HERE,
+                                                     "LagrangeBasis: unknown reference topology");
     svmp::throw_if<BasisConfigurationException>(order_ < 0, SVMP_HERE,
                                               "LagrangeBasis requires non-negative polynomial order");
-
-    topology_ = supported_lagrange_topology(element_type_);
-    dimension_ = reference_dimension(element_type_);
+    dimension_ = topology_dimension(topology_);
     init_nodes();
 }
 
+LagrangeBasis::LagrangeBasis(ElementType type, int order)
+    : LagrangeBasis(validated_lagrange_topology(type, order), order) {}
+
 // Initialize equispaced 1D interpolation nodes and their barycentric weights for
 // tensor-product axes.
 void LagrangeBasis::init_equispaced_1d_nodes() {
@@ -343,7 +335,8 @@ void LagrangeBasis::build_point_nodes() {
 // Build nodes and axis indices for tensor-product elements.
 void LagrangeBasis::build_tensor_product_nodes() {
     init_equispaced_1d_nodes();
-    const auto layout = ReferenceNodeLayout::get_lagrange_lattice(element_type_, order_);
+    const auto layout =
+        ReferenceNodeLayout::get_lagrange_lattice(lagrange_topology_representative(topology_), order_);
     nodes_ = layout.coords;
     tensor_indices_.reserve(layout.lattice.size());
     for (const auto& idx : layout.lattice) {
@@ -358,7 +351,8 @@ void LagrangeBasis::build_tensor_product_nodes() {
 
 // Build nodes and barycentric exponents for simplex elements.
 void LagrangeBasis::build_simplex_nodes() {
-    const auto layout = ReferenceNodeLayout::get_lagrange_lattice(element_type_, order_);
+    const auto layout =
+        ReferenceNodeLayout::get_lagrange_lattice(lagrange_topology_representative(topology_), order_);
     nodes_ = layout.coords;
     simplex_exponents_.reserve(layout.lattice.size());
     for (const auto& idx : layout.lattice) {
@@ -369,7 +363,8 @@ void LagrangeBasis::build_simplex_nodes() {
 // Build nodes and mixed triangle-axis lookup data for wedge elements.
 void LagrangeBasis::build_wedge_nodes() {
     init_equispaced_1d_nodes();
-    const auto layout = ReferenceNodeLayout::get_lagrange_lattice(element_type_, order_);
+    const auto layout =
+        ReferenceNodeLayout::get_lagrange_lattice(lagrange_topology_representative(topology_), order_);
     nodes_ = layout.coords;
 
     const auto tri_layout =
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.h b/Code/Source/solver/FE/Basis/LagrangeBasis.h
index c3276341a..79e4a25b0 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.h
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.h
@@ -33,9 +33,13 @@ namespace basis {
  * @details LagrangeBasis represents the nodal interpolation basis associated
  * with an equispaced reference-node lattice. It supports point, line,
  * quadrilateral, hexahedron, triangle, tetrahedron, and wedge reference
- * elements. Named complete quadratic elements such as Line3, Triangle6,
- * Quad9, Tetra10, Hex27, and Wedge18 are normalized to their canonical
- * linear topology plus effective order 2.
+ * topologies. The primary constructor takes a BasisTopology and an explicit
+ * polynomial order, so an arbitrary order carries no node-count assumption
+ * (an order-2 hexahedron is BasisTopology::Hexahedron with order 2). A named
+ * ElementType such as Line3, Quad9, Tetra10, or Hex27 is a fixed-order
+ * shorthand: it maps to the same (topology, order) pair and the requested order
+ * must equal the order baked into that layout (1 for the linear elements, 2 for
+ * the complete-quadratic aliases, 0 for Point1).
  *
  * Tensor-product elements use the one-dimensional nodal polynomials
  * @f[
@@ -77,18 +81,35 @@ class LagrangeBasis final : public BasisFunction {
     using WedgeNodeIndex = std::array<std::size_t, 2>;
 
     /**
-     * @brief Construct a Lagrange basis for an element type and polynomial order.
+     * @brief Construct a Lagrange basis on a reference topology at a polynomial order.
      *
-     * @details The constructor normalizes complete higher-order aliases to the
-     * canonical topology and effective polynomial order, builds the reference
-     * node coordinates, and precomputes topology-specific lookup data used by
-     * evaluation. Tensor-product bases store per-axis node indices, simplex
-     * bases store barycentric exponent tuples, and wedge bases store the
-     * triangle-node/axis-node decomposition.
+     * @details This is the primary, arbitrary-order entry point: a BasisTopology
+     * carries no node-count assumption, so any supported order is requested
+     * explicitly (e.g. an order-5 hexahedron is BasisTopology::Hexahedron with
+     * order 5). The constructor builds the reference node coordinates and the
+     * topology-specific lookup data used by evaluation. Tensor-product bases
+     * store per-axis node indices, simplex bases store barycentric exponent
+     * tuples, and wedge bases store the triangle-node/axis-node decomposition.
      *
-     * @param type Element type used to determine topology and reference-node layout.
-     * @param order Requested polynomial order.
-     * @throws BasisConfigurationException If the effective order is negative.
+     * @param topology Reference topology; Point through the volume topologies.
+     * @param order Polynomial order; must be non-negative. Point is order 0.
+     * @throws BasisConfigurationException If the order is negative.
+     * @throws BasisElementCompatibilityException If the topology is Unknown.
+     */
+    LagrangeBasis(BasisTopology topology, int order);
+
+    /**
+     * @brief Construct a Lagrange basis from a named element layout.
+     *
+     * @details Convenience overload for a named mesh element. The order is baked
+     * into the layout (0 for Point1, 1 for the linear elements, 2 for the
+     * complete-quadratic aliases such as Hex27/Tetra10) and the requested
+     * @p order must match it; arbitrary orders must be requested through the
+     * BasisTopology overload. Serendipity and pyramid layouts are rejected.
+     *
+     * @param type Named element type used to determine topology and baked-in order.
+     * @param order Requested order; must equal the element's baked-in order.
+     * @throws BasisConfigurationException If @p order does not match the element's baked-in order.
      * @throws BasisElementCompatibilityException If the element type is unsupported.
      */
     LagrangeBasis(ElementType type, int order);
@@ -96,8 +117,13 @@ class LagrangeBasis final : public BasisFunction {
     /** @copydoc BasisFunction::basis_type() */
     BasisType basis_type() const noexcept final { return BasisType::Lagrange; }
 
+    /** @copydoc BasisFunction::topology() */
+    BasisTopology topology() const noexcept final { return topology_; }
+
     /** @copydoc BasisFunction::element_type() */
-    ElementType element_type() const noexcept final { return element_type_; }
+    ElementType element_type() const noexcept final {
+        return named_element_for(topology_, order_, BasisType::Lagrange);
+    }
 
     /** @copydoc BasisFunction::dimension() */
     int dimension() const noexcept final { return dimension_; }
@@ -226,7 +252,6 @@ class LagrangeBasis final : public BasisFunction {
                               std::span<Hessian> hessians_out) const final;
 
 private:
-    ElementType element_type_;
     BasisTopology topology_{BasisTopology::Unknown};
     int dimension_{0};
     int order_{0};
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index d87a4ab7e..18391fb5d 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -309,89 +309,120 @@ void eval_monomial_basis(double r, double s, double t,
 }
 
 struct NormalizedSerendipityRequest {
+    BasisTopology topology;
     int dimension;
     int order;
 };
 
-// Validate the element/order pairing and return the normalized reference
-// dimension and effective order. The fixed-layout serendipity elements (Quad8,
-// Hex8, Hex20, Wedge15) are each pinned to a single polynomial order by their
-// node count, so a mismatched explicit order is rejected uniformly rather than
-// silently reinterpreted. Quad4 is the arbitrary-order quadrilateral
-// serendipity entry point and only floors sub-linear requests to order 1.
+// Validate a named serendipity element/order pairing and return its topology,
+// reference dimension, and order. The named serendipity layouts (Quad8, Hex8,
+// Hex20, Wedge15) are each pinned to a single polynomial order by their node
+// count, so a mismatched explicit order is rejected. Arbitrary-order
+// quadrilateral serendipity is not a named element: it is requested through the
+// BasisTopology::Quadrilateral constructor.
 NormalizedSerendipityRequest normalize_serendipity_request(ElementType type, int order) {
     const int floored_order = std::max(order, 1);
     switch (type) {
-        case ElementType::Quad4:
-            return {2, floored_order};
         case ElementType::Quad8:
             svmp::throw_if<BasisConfigurationException>(floored_order != 2, SVMP_HERE,
-                "SerendipityBasis: Quad8 is only valid for quadratic order 2; use Quad4 for higher-order quadrilateral serendipity");
-            return {2, 2};
+                "SerendipityBasis: Quad8 is the quadratic 8-node serendipity layout (order 2 only); "
+                "use BasisTopology::Quadrilateral for higher-order quadrilateral serendipity");
+            return {BasisTopology::Quadrilateral, 2, 2};
         case ElementType::Hex8:
             svmp::throw_if<BasisConfigurationException>(floored_order != 1, SVMP_HERE,
                 "SerendipityBasis: Hex8 is the trilinear 8-node basis (order 1 only); use Hex20 for quadratic serendipity");
-            return {3, 1};
+            return {BasisTopology::Hexahedron, 3, 1};
         case ElementType::Hex20:
             svmp::throw_if<BasisConfigurationException>(floored_order != 2, SVMP_HERE,
                 "SerendipityBasis: Hex20 is the 20-node quadratic serendipity layout (order 2 only)");
-            return {3, 2};
+            return {BasisTopology::Hexahedron, 3, 2};
         case ElementType::Wedge15:
             svmp::throw_if<BasisConfigurationException>(floored_order != 2, SVMP_HERE,
                 "SerendipityBasis: Wedge15 is the 15-node quadratic serendipity layout (order 2 only)");
-            return {3, 2};
+            return {BasisTopology::Wedge, 3, 2};
         default:
             svmp::raise<BasisElementCompatibilityException>(SVMP_HERE,
-                "SerendipityBasis supports Quad4/Quad8, Hex8/Hex20, and Wedge15 elements");
+                "SerendipityBasis named elements are Quad8, Hex8, Hex20, and Wedge15; "
+                "use BasisTopology::Quadrilateral for arbitrary-order quadrilateral serendipity");
     }
 }
 
 } // namespace
 
+SerendipityBasis::SerendipityBasis(BasisTopology topology, int order)
+    : topology_(topology), dimension_(0), order_(0), size_(0) {
+    svmp::throw_if<BasisElementCompatibilityException>(
+        topology_ != BasisTopology::Quadrilateral, SVMP_HERE,
+        "SerendipityBasis: arbitrary-order topology construction is only supported for "
+        "Quadrilateral; use the named ElementType (Hex8/Hex20/Wedge15) for hex/wedge serendipity");
+    dimension_ = 2;
+    order_ = std::max(order, 1);
+    init_quadrilateral(order_, /*nodes_from_reference_layout=*/false);
+}
+
 SerendipityBasis::SerendipityBasis(ElementType type, int order)
-    : element_type_(type), dimension_(0), order_(order), size_(0) {
+    : topology_(BasisTopology::Unknown), dimension_(0), order_(0), size_(0) {
     const NormalizedSerendipityRequest normalized = normalize_serendipity_request(type, order);
+    topology_ = normalized.topology;
     dimension_ = normalized.dimension;
     order_ = normalized.order;
 
-    if (type == ElementType::Quad4 || type == ElementType::Quad8) {
-        // Quadrilateral serendipity is generated from its monomial space: the
-        // basis size, monomial exponents, and nodal coefficient table are built
-        // here from the effective order, and the coefficient table is the inverse
-        // Vandermonde of those monomials at the reference nodes.
-        const auto quad_exponents = quad_serendipity_exponents(order_);
-        monomial_exponents_.clear();
-        monomial_exponents_.reserve(quad_exponents.size());
-        for (const auto& e : quad_exponents) {
-            monomial_exponents_.push_back({e[0], e[1], 0});
-        }
-        size_ = monomial_exponents_.size();
-        nodes_ = (type == ElementType::Quad8)
-                     ? ReferenceNodeLayout::node_coords(element_type_)
-                     : quad_serendipity_nodes(order_, size_);
-        svmp::throw_if<BasisConstructionException>(
-            nodes_.size() != size_, SVMP_HERE,
-            "SerendipityBasis: quadrilateral serendipity setup produced inconsistent sizes");
-        inv_vandermonde_ = build_inverse_vandermonde(
-            nodes_, monomial_exponents_, "Quad order " + std::to_string(order_));
-        return;
+    switch (type) {
+        case ElementType::Quad8:
+            // Quad8 is the named quadratic layout; its nodes come from
+            // ReferenceNodeLayout so the basis shares the single public Quad8
+            // ordering (the same source Hex20/Wedge15 use).
+            init_quadrilateral(order_, /*nodes_from_reference_layout=*/true);
+            return;
+        case ElementType::Hex8:
+            // Hex8 is the standard trilinear corner basis, evaluated directly
+            // rather than through a generated coefficient table.
+            size_ = 8u;
+            nodes_ = ReferenceNodeLayout::node_coords(type);
+            svmp::throw_if<BasisConstructionException>(
+                nodes_.size() != size_, SVMP_HERE,
+                "SerendipityBasis: Hex8 layout node count does not match basis size");
+            return;
+        case ElementType::Hex20:
+        case ElementType::Wedge15:
+            init_fixed_named(type);
+            return;
+        default:
+            // normalize_serendipity_request already rejected anything else.
+            svmp::raise<BasisElementCompatibilityException>(SVMP_HERE,
+                "SerendipityBasis: unsupported named serendipity element");
     }
+}
 
-    if (type == ElementType::Hex8) {
-        // Hex8 is the standard trilinear corner basis, evaluated directly rather
-        // than through a generated coefficient table.
-        size_ = 8u;
-        nodes_ = ReferenceNodeLayout::node_coords(element_type_);
-        svmp::throw_if<BasisConstructionException>(
-            nodes_.size() != size_, SVMP_HERE,
-            "SerendipityBasis: Hex8 layout node count does not match basis size");
-        return;
+// Build the quadrilateral serendipity monomial space, reference nodes, and nodal
+// coefficient table for the given order. The coefficient table is the inverse
+// Vandermonde of those monomials at the reference nodes; because the nodes are
+// in public order, evaluation needs no output permutation. The named Quad8
+// layout sources its nodes from ReferenceNodeLayout; the arbitrary-order
+// topology path generates them.
+void SerendipityBasis::init_quadrilateral(int order, bool nodes_from_reference_layout) {
+    const auto quad_exponents = quad_serendipity_exponents(order);
+    monomial_exponents_.clear();
+    monomial_exponents_.reserve(quad_exponents.size());
+    for (const auto& e : quad_exponents) {
+        monomial_exponents_.push_back({e[0], e[1], 0});
     }
+    size_ = monomial_exponents_.size();
+    nodes_ = nodes_from_reference_layout
+                 ? ReferenceNodeLayout::node_coords(ElementType::Quad8)
+                 : quad_serendipity_nodes(order, size_);
+    svmp::throw_if<BasisConstructionException>(
+        nodes_.size() != size_, SVMP_HERE,
+        "SerendipityBasis: quadrilateral serendipity setup produced inconsistent sizes");
+    inv_vandermonde_ = build_inverse_vandermonde(
+        nodes_, monomial_exponents_, "Quad order " + std::to_string(order));
+}
 
-    // Hex20 and Wedge15 have fixed monomial spaces and reference layouts. Their
-    // nodal coefficient tables are generated by inverting the Vandermonde built
-    // from the public-order reference nodes, exactly like the quadrilateral, so
-    // no transcribed tables or output permutation are needed.
+// Build a fixed named volume serendipity layout (Hex20 or Wedge15). The nodal
+// coefficient table is generated by inverting the Vandermonde built from the
+// public-order ReferenceNodeLayout nodes, exactly like the quadrilateral, so no
+// transcribed tables or output permutation are needed.
+void SerendipityBasis::init_fixed_named(ElementType type) {
     std::span<const std::array<int, 3>> family_exponents;
     std::string label;
     if (type == ElementType::Hex20) {
@@ -405,7 +436,7 @@ SerendipityBasis::SerendipityBasis(ElementType type, int order)
             kWedge15MonomialExponents.data(), kWedge15MonomialExponents.size());
         label = "Wedge15";
     }
-    nodes_ = ReferenceNodeLayout::node_coords(element_type_);
+    nodes_ = ReferenceNodeLayout::node_coords(type);
     svmp::throw_if<BasisConstructionException>(
         nodes_.size() != size_, SVMP_HERE,
         "SerendipityBasis: fixed serendipity layout node count does not match basis size");
@@ -442,7 +473,9 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<double, 3>& xi,
     const double y = xi[1];
     const double z = xi[2];
 
-    if (element_type_ == ElementType::Hex8) {
+    // Hex8 (Hexahedron at order 1) is the only serendipity basis evaluated
+    // directly from the trilinear corner products rather than a coefficient table.
+    if (topology_ == BasisTopology::Hexahedron && order_ == 1) {
         evaluate_hex8_reference(x, y, z, values_out, gradients_out, hessians_out);
         return;
     }
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.h b/Code/Source/solver/FE/Basis/SerendipityBasis.h
index 14c0b151f..85ea55425 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.h
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.h
@@ -74,17 +74,19 @@ namespace basis {
  * linear term in @f$y@f$. The interpolation Vandermonde is therefore
  * nonsingular for the implemented quadrilateral serendipity space.
  *
- * `SerendipityBasis(ElementType::Quad4, p)` supports explicit
- * arbitrary-order quadrilateral serendipity requests for @f$p \ge 1@f$
- * (requests below one are normalized to one) and generates its own reference
- * nodes, since the higher-order interior ordering is an implementation
- * convention rather than a public layout. `ElementType::Quad8` is the named
- * quadratic eight-node layout (valid only with order 2) and, like Hex20 and
- * Wedge15, takes its reference nodes from ReferenceNodeLayout so that all named
- * fixed layouts share the single public node ordering. Solver-default basis
- * selection remains separate: `basis_factory` maps the complete Quad4 layout to
- * the default linear Lagrange basis and maps Quad8 to quadratic serendipity
- * unless a caller explicitly requests a different supported basis.
+ * `SerendipityBasis(BasisTopology::Quadrilateral, p)` is the arbitrary-order
+ * entry point for quadrilateral serendipity (@f$p \ge 1@f$, requests below one
+ * normalized to one); it generates its own reference nodes, since the
+ * higher-order interior ordering is an implementation convention rather than a
+ * public layout. `ElementType::Quad8` is the named quadratic eight-node layout
+ * (valid only with order 2) and, like Hex20 and Wedge15, takes its reference
+ * nodes from ReferenceNodeLayout so that all named fixed layouts share the
+ * single public node ordering. Hex and wedge serendipity are single fixed
+ * layouts with no arbitrary-order form, so they are constructed only from their
+ * named ElementType. Solver-default basis selection remains separate:
+ * `basis_factory` maps the complete Quad4 layout to the default linear Lagrange
+ * basis and maps Quad8 to quadratic serendipity unless a caller explicitly
+ * requests a different supported basis.
  *
  * Hex8 uses the standard trilinear corner basis
  * @f$(1 \pm r)(1 \pm s)(1 \pm t)/8@f$. Quad8, Hex20, and Wedge15 use fixed
@@ -97,21 +99,36 @@ namespace basis {
 class SerendipityBasis final : public BasisFunction {
 public:
     /**
-     * @brief Construct a serendipity basis for an element type and polynomial order.
+     * @brief Construct an arbitrary-order quadrilateral serendipity basis.
      *
-     * @details The constructor selects the topology-specific interpolation
-     * space, computes the reference node coordinates, and builds the nodal
-     * coefficient table needed for evaluation. Quadrilateral, Hex20, and Wedge15
-     * bases build and invert a Vandermonde matrix for their serendipity
-     * monomials; Hex8 uses the trilinear corner basis directly. For hexahedra,
-     * only linear Hex8 and quadratic Hex20 serendipity spaces are supported. For
-     * wedges, only quadratic Wedge15 is supported. Quad4 supports explicit
-     * quadrilateral serendipity requests of any order @f$p \ge 1@f$; Quad8 is
-     * restricted to order 2.
+     * @details This is the arbitrary-order entry point for the only serendipity
+     * family with a free order: the quadrilateral. The topology carries no
+     * node-count assumption; the monomial space, reference nodes (generated
+     * here), and nodal coefficient table are built from the requested order
+     * (orders below 1 are normalized to 1). Hex and wedge serendipity are single
+     * fixed layouts and are not constructed this way -- use the named ElementType
+     * overload (Hex8/Hex20/Wedge15) for them.
      *
-     * @param type Element type used to determine topology and reference-node layout.
-     * @param order Requested polynomial order.
-     * @throws BasisConfigurationException If the requested order is invalid.
+     * @param topology Must be BasisTopology::Quadrilateral.
+     * @param order Polynomial order @f$p \ge 1@f$ (lower values normalized to 1).
+     * @throws BasisElementCompatibilityException If @p topology is not Quadrilateral.
+     */
+    SerendipityBasis(BasisTopology topology, int order);
+
+    /**
+     * @brief Construct a serendipity basis from a named element layout.
+     *
+     * @details Convenience overload for the named, fixed serendipity layouts.
+     * Quad8 builds the quadratic quadrilateral serendipity space from its
+     * ReferenceNodeLayout nodes; Hex8 uses the trilinear corner basis directly;
+     * Hex20 and Wedge15 build and invert a Vandermonde over their fixed monomial
+     * spaces. Each layout is pinned to a single order (Hex8 to 1; Quad8, Hex20,
+     * and Wedge15 to 2), so the requested @p order must match it; arbitrary-order
+     * quadrilateral serendipity is requested through the BasisTopology overload.
+     *
+     * @param type Named serendipity element type (Quad8, Hex8, Hex20, or Wedge15).
+     * @param order Requested order; must equal the layout's fixed order.
+     * @throws BasisConfigurationException If @p order does not match the layout's fixed order.
      * @throws BasisElementCompatibilityException If the element type is unsupported.
      */
     SerendipityBasis(ElementType type, int order);
@@ -119,8 +136,13 @@ class SerendipityBasis final : public BasisFunction {
     /** @copydoc BasisFunction::basis_type() */
     BasisType basis_type() const noexcept final { return BasisType::Serendipity; }
 
+    /** @copydoc BasisFunction::topology() */
+    BasisTopology topology() const noexcept final { return topology_; }
+
     /** @copydoc BasisFunction::element_type() */
-    ElementType element_type() const noexcept final { return element_type_; }
+    ElementType element_type() const noexcept final {
+        return named_element_for(topology_, order_, BasisType::Serendipity);
+    }
 
     /** @copydoc BasisFunction::dimension() */
     int dimension() const noexcept final { return dimension_; }
@@ -135,16 +157,16 @@ class SerendipityBasis final : public BasisFunction {
      * @brief Return the reference interpolation nodes in basis ordering.
      *
      * @details Node coordinates are the points at which the serendipity basis
-     * satisfies the nodal interpolation property. quadrilateral, hexahedral, and wedge
-     * nodes are taken from ReferenceNodeLayout, the public node-ordering source
-     * the solver adapter permutes against. Arbitrary-order Quad4 nodes are
-     * generated here instead: boundary nodes first and then, for higher order
-     * requests, the selected interior points needed to make the reduced monomial
-     * space unisolvent. For high-order Quad4 serendipity, the deterministic
-     * interior row ordering is an implementation convention; callers should
-     * pair it with basis values from the same object rather than assume an
-     * external mesh ordering contract beyond the supported Quad4/Quad8
-     * production layouts.
+     * satisfies the nodal interpolation property. The named fixed layouts (Quad8,
+     * Hex8, Hex20, Wedge15) take their nodes from ReferenceNodeLayout, the public
+     * node-ordering source the solver adapter permutes against. Arbitrary-order
+     * quadrilateral serendipity (constructed from BasisTopology::Quadrilateral)
+     * generates its nodes here instead: boundary nodes first and then, for higher
+     * order requests, the selected interior points needed to make the reduced
+     * monomial space unisolvent. That deterministic interior row ordering is an
+     * implementation convention; callers should pair it with basis values from
+     * the same object rather than assume an external mesh ordering contract
+     * beyond the supported Quad8 production layout.
      *
      * @return Reference node coordinates, one per basis function.
      */
@@ -239,7 +261,7 @@ class SerendipityBasis final : public BasisFunction {
                               std::span<Hessian> hessians_out) const final;
 
 private:
-    ElementType element_type_;
+    BasisTopology topology_;
     int dimension_;
     int order_;
     std::size_t size_;
@@ -249,6 +271,15 @@ class SerendipityBasis final : public BasisFunction {
     // Row-major inverse Vandermonde, indexed as [monomial, basis].
     std::vector<double> inv_vandermonde_;
 
+    // Build the quadrilateral serendipity monomial space, reference nodes, and
+    // nodal coefficient table for the given order. The named Quad8 layout takes
+    // its nodes from ReferenceNodeLayout; the arbitrary-order topology path
+    // generates them.
+    void init_quadrilateral(int order, bool nodes_from_reference_layout);
+    // Build a fixed named volume serendipity layout (Hex20 or Wedge15) from its
+    // tabulated monomial space and ReferenceNodeLayout nodes.
+    void init_fixed_named(ElementType type);
+
     void evaluate_all_to(const math::Vector<double, 3>& xi,
                          std::span<double> values_out,
                          std::span<Gradient> gradients_out,
diff --git a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
index f4f1446da..a6f0da44a 100644
--- a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
@@ -41,6 +41,7 @@ namespace {
 class MinimalScalarBasis : public BasisFunction {
 public:
     BasisType basis_type() const noexcept override { return BasisType::Lagrange; }
+    BasisTopology topology() const noexcept override { return BasisTopology::Line; }
     ElementType element_type() const noexcept override { return ElementType::Line2; }
     int dimension() const noexcept override { return 1; }
     int order() const noexcept override { return 1; }
@@ -63,6 +64,7 @@ class ExactQuadraticBasis : public BasisFunction {
     using BasisFunction::numerical_hessian;
 
     BasisType basis_type() const noexcept override { return BasisType::Custom; }
+    BasisTopology topology() const noexcept override { return BasisTopology::Hexahedron; }
     ElementType element_type() const noexcept override { return ElementType::Hex8; }
     int dimension() const noexcept override { return 3; }
     int order() const noexcept override { return 2; }
@@ -111,6 +113,7 @@ class ExactQuadraticBasis : public BasisFunction {
 class CompleteFallbackBasis : public BasisFunction {
 public:
     BasisType basis_type() const noexcept override { return BasisType::Lagrange; }
+    BasisTopology topology() const noexcept override { return BasisTopology::Triangle; }
     ElementType element_type() const noexcept override { return ElementType::Triangle3; }
     int dimension() const noexcept override { return 2; }
     int order() const noexcept override { return 1; }
@@ -177,6 +180,33 @@ TEST(BasisErrorPaths, LagrangeInvalidRequestsThrowBasisExceptions) {
                  BasisElementCompatibilityException);
 }
 
+// A named Lagrange element layout fixes its polynomial order: the matching order
+// is accepted and any other order is rejected. Arbitrary orders must be
+// requested through the BasisTopology overload, never by over-/under-specifying
+// a node-count-named element.
+TEST(BasisErrorPaths, NamedLagrangeElementsRejectNonBakedOrders) {
+    const std::vector<std::pair<ElementType, int>> named = {
+        {ElementType::Point1, 0},
+        {ElementType::Line2, 1},     {ElementType::Line3, 2},
+        {ElementType::Triangle3, 1}, {ElementType::Triangle6, 2},
+        {ElementType::Quad4, 1},     {ElementType::Quad9, 2},
+        {ElementType::Tetra4, 1},    {ElementType::Tetra10, 2},
+        {ElementType::Hex8, 1},      {ElementType::Hex27, 2},
+        {ElementType::Wedge6, 1},    {ElementType::Wedge18, 2},
+    };
+
+    for (const auto& [type, baked] : named) {
+        EXPECT_NO_THROW((void)LagrangeBasis(type, baked))
+            << "element=" << static_cast<int>(type);
+        EXPECT_THROW((void)LagrangeBasis(type, baked + 1), BasisConfigurationException)
+            << "element=" << static_cast<int>(type);
+        if (baked > 0) {
+            EXPECT_THROW((void)LagrangeBasis(type, baked - 1), BasisConfigurationException)
+                << "element=" << static_cast<int>(type);
+        }
+    }
+}
+
 TEST(BasisErrorPaths, SerendipityInvalidRequestsThrowBasisExceptions) {
     EXPECT_THROW(SerendipityBasis(ElementType::Unknown, 2),
                  BasisElementCompatibilityException);
diff --git a/tests/unitTests/FE/Basis/test_BasisHessians.cpp b/tests/unitTests/FE/Basis/test_BasisHessians.cpp
index bc0fea554..dbbc33f9b 100644
--- a/tests/unitTests/FE/Basis/test_BasisHessians.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisHessians.cpp
@@ -74,19 +74,19 @@ void numerical_hessian_helper(const BasisFunction& basis,
     }
 }
 
-std::vector<math::Vector<double, 3>> sample_points_for(ElementType type) {
-    switch (type) {
-        case ElementType::Line2:
+std::vector<math::Vector<double, 3>> sample_points_for(BasisTopology topology) {
+    switch (topology) {
+        case BasisTopology::Line:
             return {{double(-0.35), double(0), double(0)}, {double(0.2), double(0), double(0)}};
-        case ElementType::Triangle3:
+        case BasisTopology::Triangle:
             return {{double(0.15), double(0.2), double(0)}, {double(0.25), double(0.1), double(0)}};
-        case ElementType::Quad4:
+        case BasisTopology::Quadrilateral:
             return {{double(0.2), double(-0.3), double(0)}, {double(-0.45), double(0.25), double(0)}};
-        case ElementType::Tetra4:
+        case BasisTopology::Tetrahedron:
             return {{double(0.12), double(0.18), double(0.16)}, {double(0.2), double(0.1), double(0.18)}};
-        case ElementType::Hex8:
+        case BasisTopology::Hexahedron:
             return {{double(0.1), double(-0.2), double(0.3)}, {double(-0.35), double(0.25), double(-0.15)}};
-        case ElementType::Wedge6:
+        case BasisTopology::Wedge:
             return {{double(0.18), double(0.22), double(-0.2)}, {double(0.12), double(0.16), double(0.1)}};
         default:
             return {{double(0), double(0), double(0)}};
@@ -221,56 +221,56 @@ void expect_inactive_z_derivatives_zero(const BasisFunction& basis,
     }
 }
 
-std::vector<math::Vector<double, 3>> serendipity_sample_points(ElementType type) {
-    if (type == ElementType::Quad4 || type == ElementType::Quad8) {
+std::vector<math::Vector<double, 3>> serendipity_sample_points(BasisTopology topology) {
+    if (topology == BasisTopology::Quadrilateral) {
         return {{double(0.17), double(-0.31), double(0)}, {double(-0.45), double(0.25), double(0)}};
     }
-    if (type == ElementType::Hex8 || type == ElementType::Hex20) {
+    if (topology == BasisTopology::Hexahedron) {
         return {{double(0.2), double(-0.1), double(0.3)}, {double(-0.35), double(0.25), double(-0.15)}};
     }
-    return {{double(0.2), double(0.3), double(0.1)}, {double(0.12), double(0.16), double(-0.2)}};
+    return {{double(0.2), double(0.3), double(0.1)}, {double(0.12), double(0.16), double(-0.2)}};  // wedge
 }
 
 } // namespace
 
 TEST(BasisHessians, LagrangeCanonicalTopologiesMatchNumericalHessians) {
     const struct Case {
-        ElementType type;
+        BasisTopology topology;
         int order;
         double tol;
         double eps;
     } cases[] = {
-        {ElementType::Line2, 3, double(1e-7), double(1e-5)},
-        {ElementType::Triangle3, 3, double(2e-6), double(1e-5)},
-        {ElementType::Quad4, 3, double(1e-6), double(1e-5)},
-        {ElementType::Tetra4, 2, double(1e-6), double(1e-5)},
-        {ElementType::Hex8, 2, double(1e-6), double(1e-5)},
-        {ElementType::Wedge6, 2, double(1e-5), double(1e-5)},
+        {BasisTopology::Line, 3, double(1e-7), double(1e-5)},
+        {BasisTopology::Triangle, 3, double(2e-6), double(1e-5)},
+        {BasisTopology::Quadrilateral, 3, double(1e-6), double(1e-5)},
+        {BasisTopology::Tetrahedron, 2, double(1e-6), double(1e-5)},
+        {BasisTopology::Hexahedron, 2, double(1e-6), double(1e-5)},
+        {BasisTopology::Wedge, 2, double(1e-5), double(1e-5)},
     };
 
     for (const auto& c : cases) {
-        LagrangeBasis basis(c.type, c.order);
-        expect_hessians_match_numerical(basis, sample_points_for(c.type), c.tol, c.eps);
+        LagrangeBasis basis(c.topology, c.order);
+        expect_hessians_match_numerical(basis, sample_points_for(c.topology), c.tol, c.eps);
     }
 }
 
 TEST(BasisHessians, LagrangeHessiansSumToZeroAndAreSymmetric) {
     const struct Case {
-        ElementType type;
+        BasisTopology topology;
         int order;
         math::Vector<double, 3> xi;
         double tol;
     } cases[] = {
-        {ElementType::Line2, 3, {double(0.15), double(0), double(0)}, double(1e-12)},
-        {ElementType::Triangle3, 3, {double(0.2), double(0.25), double(0)}, double(1e-10)},
-        {ElementType::Quad4, 3, {double(0.3), double(-0.2), double(0)}, double(1e-12)},
-        {ElementType::Tetra4, 2, {double(0.15), double(0.2), double(0.1)}, double(1e-10)},
-        {ElementType::Hex8, 2, {double(0.1), double(-0.2), double(0.3)}, double(1e-12)},
-        {ElementType::Wedge6, 2, {double(0.2), double(0.15), double(-0.3)}, double(1e-10)},
+        {BasisTopology::Line, 3, {double(0.15), double(0), double(0)}, double(1e-12)},
+        {BasisTopology::Triangle, 3, {double(0.2), double(0.25), double(0)}, double(1e-10)},
+        {BasisTopology::Quadrilateral, 3, {double(0.3), double(-0.2), double(0)}, double(1e-12)},
+        {BasisTopology::Tetrahedron, 2, {double(0.15), double(0.2), double(0.1)}, double(1e-10)},
+        {BasisTopology::Hexahedron, 2, {double(0.1), double(-0.2), double(0.3)}, double(1e-12)},
+        {BasisTopology::Wedge, 2, {double(0.2), double(0.15), double(-0.3)}, double(1e-10)},
     };
 
     for (const auto& c : cases) {
-        LagrangeBasis basis(c.type, c.order);
+        LagrangeBasis basis(c.topology, c.order);
         expect_partition_hessian_sum_zero(basis, c.xi, double(10) * c.tol);
         expect_hessians_symmetric(basis, c.xi, c.tol);
     }
@@ -331,21 +331,21 @@ TEST(BasisHessians, SolverMappedVolumeSelectionsSatisfyInvariants) {
 // shared by the first- and second-derivative recurrences.
 TEST(BasisGradients, LagrangeCanonicalTopologiesMatchNumericalGradients) {
     const struct Case {
-        ElementType type;
+        BasisTopology topology;
         int order;
         double tol;
     } cases[] = {
-        {ElementType::Line2, 3, double(1e-8)},
-        {ElementType::Triangle3, 3, double(1e-7)},
-        {ElementType::Quad4, 3, double(1e-7)},
-        {ElementType::Tetra4, 2, double(1e-7)},
-        {ElementType::Hex8, 2, double(1e-7)},
-        {ElementType::Wedge6, 2, double(1e-7)},
+        {BasisTopology::Line, 3, double(1e-8)},
+        {BasisTopology::Triangle, 3, double(1e-7)},
+        {BasisTopology::Quadrilateral, 3, double(1e-7)},
+        {BasisTopology::Tetrahedron, 2, double(1e-7)},
+        {BasisTopology::Hexahedron, 2, double(1e-7)},
+        {BasisTopology::Wedge, 2, double(1e-7)},
     };
 
     for (const auto& c : cases) {
-        LagrangeBasis basis(c.type, c.order);
-        expect_gradients_match_numerical(basis, sample_points_for(c.type), c.tol);
+        LagrangeBasis basis(c.topology, c.order);
+        expect_gradients_match_numerical(basis, sample_points_for(c.topology), c.tol);
     }
 }
 
@@ -356,66 +356,63 @@ TEST(BasisGradients, LagrangeCanonicalTopologiesMatchNumericalGradients) {
 // structurally, so neither can detect a wrong derivative formula. Finite
 // differences of values are the authoritative check.
 TEST(BasisGradients, SerendipityFamiliesMatchNumericalGradients) {
-    const struct Case {
-        ElementType type;
-        int order;
-        double tol;
-    } cases[] = {
-        {ElementType::Quad4, 1, double(1e-8)},
+    // Arbitrary-order quadrilateral serendipity (topology path).
+    const struct QuadCase { int order; double tol; } quad_cases[] = {
+        {1, double(1e-8)}, {3, double(1e-7)}, {4, double(5e-7)}, {6, double(2e-6)},
+    };
+    for (const auto& c : quad_cases) {
+        SerendipityBasis basis(BasisTopology::Quadrilateral, c.order);
+        expect_gradients_match_numerical(
+            basis, serendipity_sample_points(BasisTopology::Quadrilateral), c.tol);
+    }
+
+    // Named fixed serendipity layouts.
+    const struct NamedCase { ElementType type; int order; double tol; } named_cases[] = {
         {ElementType::Quad8, 2, double(1e-7)},
-        {ElementType::Quad4, 3, double(1e-7)},
-        {ElementType::Quad4, 4, double(5e-7)},
-        {ElementType::Quad4, 6, double(2e-6)},
         {ElementType::Hex8, 1, double(1e-8)},
         {ElementType::Hex20, 2, double(1e-7)},
         {ElementType::Wedge15, 2, double(1e-7)},
     };
-
-    for (const auto& c : cases) {
+    for (const auto& c : named_cases) {
         SerendipityBasis basis(c.type, c.order);
-        expect_gradients_match_numerical(basis, serendipity_sample_points(c.type), c.tol);
+        expect_gradients_match_numerical(
+            basis, serendipity_sample_points(basis.topology()), c.tol);
     }
 }
 
 TEST(BasisGradients, QuadrilateralSerendipityInactiveZDerivativesRemainZero) {
-    const struct Case {
-        ElementType type;
-        int order;
-    } cases[] = {
-        {ElementType::Quad4, 1},
-        {ElementType::Quad8, 2},
-        {ElementType::Quad4, 4},
-        {ElementType::Quad4, 6},
-        {ElementType::Quad4, 10},
-    };
-
-    for (const auto& c : cases) {
-        SerendipityBasis basis(c.type, c.order);
+    // All quadrilateral serendipity, including the production order 2, exercised
+    // through the arbitrary-order topology path.
+    for (const int order : {1, 2, 4, 6, 10}) {
+        SerendipityBasis basis(BasisTopology::Quadrilateral, order);
         expect_inactive_z_derivatives_zero(
             basis,
-            serendipity_sample_points(c.type),
+            serendipity_sample_points(BasisTopology::Quadrilateral),
             double(1e-12));
     }
 }
 
 TEST(BasisHessians, SerendipityFamiliesMatchNumericalHessians) {
-    const struct Case {
-        ElementType type;
-        int order;
-        double tol;
-    } cases[] = {
-        {ElementType::Quad4, 1, double(1e-6)},
+    // Arbitrary-order quadrilateral serendipity (topology path).
+    const struct QuadCase { int order; double tol; } quad_cases[] = {
+        {1, double(1e-6)}, {3, double(1e-6)}, {4, double(5e-6)}, {6, double(2e-5)},
+    };
+    for (const auto& c : quad_cases) {
+        SerendipityBasis basis(BasisTopology::Quadrilateral, c.order);
+        expect_hessians_match_numerical(
+            basis, serendipity_sample_points(BasisTopology::Quadrilateral), c.tol);
+    }
+
+    // Named fixed serendipity layouts.
+    const struct NamedCase { ElementType type; int order; double tol; } named_cases[] = {
         {ElementType::Quad8, 2, double(1e-6)},
-        {ElementType::Quad4, 3, double(1e-6)},
-        {ElementType::Quad4, 4, double(5e-6)},
-        {ElementType::Quad4, 6, double(2e-5)},
         {ElementType::Hex8, 1, double(1e-6)},
         {ElementType::Hex20, 2, double(1e-6)},
         {ElementType::Wedge15, 2, double(1e-6)},
     };
-
-    for (const auto& c : cases) {
+    for (const auto& c : named_cases) {
         SerendipityBasis basis(c.type, c.order);
-        expect_hessians_match_numerical(basis, serendipity_sample_points(c.type), c.tol);
+        expect_hessians_match_numerical(
+            basis, serendipity_sample_points(basis.topology()), c.tol);
     }
 }
diff --git a/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp b/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
index b1215c354..209d15c48 100644
--- a/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
@@ -37,6 +37,22 @@ static_assert(detail::basis_nearly_equal(
     double(1),
     double(1) + detail::basis_scaled_tolerance() * double(0.5)));
 
+// Topology/order helpers backing the BasisTopology construction path.
+static_assert(topology_dimension(BasisTopology::Line) == 1);
+static_assert(topology_dimension(BasisTopology::Hexahedron) == 3);
+static_assert(lagrange_topology_representative(BasisTopology::Hexahedron) == ElementType::Hex8);
+static_assert(lagrange_topology_representative(BasisTopology::Point) == ElementType::Point1);
+static_assert(named_lagrange_order(ElementType::Hex8) == 1);
+static_assert(named_lagrange_order(ElementType::Hex27) == 2);
+static_assert(named_lagrange_order(ElementType::Point1) == 0);
+static_assert(named_element_for(BasisTopology::Hexahedron, 1, BasisType::Lagrange) == ElementType::Hex8);
+static_assert(named_element_for(BasisTopology::Hexahedron, 2, BasisType::Lagrange) == ElementType::Hex27);
+static_assert(named_element_for(BasisTopology::Hexahedron, 5, BasisType::Lagrange) == ElementType::Unknown);
+static_assert(named_element_for(BasisTopology::Point, 0, BasisType::Lagrange) == ElementType::Point1);
+static_assert(named_element_for(BasisTopology::Quadrilateral, 2, BasisType::Serendipity) == ElementType::Quad8);
+static_assert(named_element_for(BasisTopology::Hexahedron, 2, BasisType::Serendipity) == ElementType::Hex20);
+static_assert(named_element_for(BasisTopology::Hexahedron, 1, BasisType::Serendipity) == ElementType::Hex8);
+
 TEST(ConstexprBasis, FixedNodeTableSizesForSupportedLayouts) {
     const std::vector<std::pair<ElementType, std::size_t>> expected = {
         {ElementType::Line2, 2u},
diff --git a/tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp b/tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp
index 0994d6e42..c5db1ebac 100644
--- a/tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp
+++ b/tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp
@@ -106,18 +106,19 @@ void expect_all_entries_finite(const LagrangeBasis& basis,
 } // namespace
 
 TEST(HigherOrderWedge, CompleteAliasMatchesGeneratedNodeLayout) {
-    LagrangeBasis alias_basis(ElementType::Wedge18, 1);
+    LagrangeBasis alias_basis(ElementType::Wedge18, 2);
     const auto generated =
         ReferenceNodeLayout::get_lagrange_node_coords(ElementType::Wedge6, 2);
 
     ASSERT_EQ(generated.size(), ReferenceNodeLayout::num_nodes(ElementType::Wedge18));
-    EXPECT_EQ(alias_basis.element_type(), ElementType::Wedge6);
+    EXPECT_EQ(alias_basis.topology(), BasisTopology::Wedge);
+    EXPECT_EQ(alias_basis.element_type(), ElementType::Wedge18);  // faithful round-trip
     EXPECT_EQ(alias_basis.order(), 2);
     expect_nodes_close(alias_basis.nodes(), generated, double(1e-14));
 }
 
 TEST(HigherOrderWedge, OrderThreeIsNodalAndPartitionsUnity) {
-    LagrangeBasis wedge(ElementType::Wedge6, 3);
+    LagrangeBasis wedge(BasisTopology::Wedge, 3);
 
     expect_kronecker_at_nodes(wedge, double(2e-10));
     expect_partition_gradient_hessian_sums(
@@ -132,7 +133,7 @@ TEST(HigherOrderWedge, OrderThreeIsNodalAndPartitionsUnity) {
 }
 
 TEST(HigherOrderWedge, OrderFourEvaluationsRemainFinite) {
-    LagrangeBasis wedge(ElementType::Wedge6, 4);
+    LagrangeBasis wedge(BasisTopology::Wedge, 4);
 
     expect_all_entries_finite(wedge, {double(0.2), double(0.1), double(-0.6)});
     expect_all_entries_finite(wedge, {double(0.05), double(0.8), double(0.3)});
@@ -142,7 +143,7 @@ TEST(HigherOrderWedge, OrderFourEvaluationsRemainFinite) {
 // the Kronecker property validates the order-four node lattice and its inverse
 // index mapping end to end.
 TEST(HigherOrderWedge, OrderFourIsNodalAndPartitionsUnity) {
-    LagrangeBasis wedge(ElementType::Wedge6, 4);
+    LagrangeBasis wedge(BasisTopology::Wedge, 4);
 
     // Order-4 wedge = triangle(order 4) x line(order 4) = 15 x 5 nodes.
     EXPECT_EQ(wedge.size(), 15u * 5u);
diff --git a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
index 744c46c5d..e2099dea4 100644
--- a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
@@ -24,7 +24,8 @@ namespace {
 using Point = math::Vector<double, 3>;
 
 struct CanonicalCase {
-    ElementType type;
+    BasisTopology topology;
+    ElementType representative;  // linear element for sample-point lookup and labeling
     int order;
     std::size_t size;
     int dimension;
@@ -34,31 +35,31 @@ struct CanonicalCase {
 
 const std::vector<CanonicalCase>& canonical_cases() {
     static const std::vector<CanonicalCase> cases = {
-        {ElementType::Line2, 3, 4u, 1,
+        {BasisTopology::Line, ElementType::Line2, 3, 4u, 1,
          {{double(-0.35), double(0), double(0)}, {double(0.2), double(0), double(0)}},
          double(1e-11)},
-        {ElementType::Triangle3, 3, 10u, 2,
+        {BasisTopology::Triangle, ElementType::Triangle3, 3, 10u, 2,
          {{double(0.15), double(0.2), double(0)}, {double(0.25), double(0.1), double(0)}},
          double(1e-9)},
-        {ElementType::Quad4, 3, 16u, 2,
+        {BasisTopology::Quadrilateral, ElementType::Quad4, 3, 16u, 2,
          {{double(0.2), double(-0.3), double(0)}, {double(-0.45), double(0.25), double(0)}},
          double(1e-11)},
-        {ElementType::Tetra4, 2, 10u, 3,
+        {BasisTopology::Tetrahedron, ElementType::Tetra4, 2, 10u, 3,
          {{double(0.12), double(0.18), double(0.16)}, {double(0.2), double(0.1), double(0.18)}},
          double(1e-9)},
-        {ElementType::Hex8, 2, 27u, 3,
+        {BasisTopology::Hexahedron, ElementType::Hex8, 2, 27u, 3,
          {{double(0.1), double(-0.2), double(0.3)}, {double(-0.35), double(0.25), double(-0.15)}},
          double(1e-10)},
-        {ElementType::Wedge6, 2, 18u, 3,
+        {BasisTopology::Wedge, ElementType::Wedge6, 2, 18u, 3,
          {{double(0.18), double(0.22), double(-0.2)}, {double(0.12), double(0.16), double(0.1)}},
          double(1e-9)},
     };
     return cases;
 }
 
-std::vector<Point> sample_points_for(ElementType type) {
+std::vector<Point> sample_points_for(ElementType representative) {
     for (const auto& c : canonical_cases()) {
-        if (c.type == type) {
+        if (c.representative == representative) {
             return c.points;
         }
     }
@@ -235,9 +236,9 @@ double interpolate_value(const LagrangeBasis& basis,
 
 TEST(LagrangeBasis, CanonicalTopologiesHaveExpectedSizesAndDimensions) {
     for (const auto& c : canonical_cases()) {
-        LagrangeBasis basis(c.type, c.order);
+        LagrangeBasis basis(c.topology, c.order);
         EXPECT_EQ(basis.basis_type(), BasisType::Lagrange);
-        EXPECT_EQ(basis.element_type(), c.type);
+        EXPECT_EQ(basis.topology(), c.topology);
         EXPECT_EQ(basis.order(), c.order);
         EXPECT_EQ(basis.size(), c.size);
         EXPECT_EQ(basis.dimension(), c.dimension);
@@ -246,7 +247,7 @@ TEST(LagrangeBasis, CanonicalTopologiesHaveExpectedSizesAndDimensions) {
 
 TEST(LagrangeBasis, CanonicalTopologiesAreNodalAndPartitionUnity) {
     for (const auto& c : canonical_cases()) {
-        LagrangeBasis basis(c.type, c.order);
+        LagrangeBasis basis(c.topology, c.order);
         expect_kronecker_at_nodes(basis, double(2e-10));
         expect_partition_gradient_hessian_sums(basis, c.points, c.derivative_tol);
     }
@@ -254,45 +255,68 @@ TEST(LagrangeBasis, CanonicalTopologiesAreNodalAndPartitionUnity) {
 
 TEST(LagrangeBasis, SpanOutputSinksMatchVectorEvaluationAcrossTopologies) {
     for (const auto& c : canonical_cases()) {
-        LagrangeBasis basis(c.type, c.order);
+        LagrangeBasis basis(c.topology, c.order);
         expect_span_sinks_match_vector_evaluation(basis, c.points.front());
     }
 }
 
-TEST(LagrangeBasis, CompleteAliasesNormalizeToCanonicalBases) {
-    const std::vector<std::tuple<ElementType, ElementType, int>> aliases = {
-        {ElementType::Line3, ElementType::Line2, 2},
-        {ElementType::Triangle6, ElementType::Triangle3, 2},
-        {ElementType::Quad9, ElementType::Quad4, 2},
-        {ElementType::Tetra10, ElementType::Tetra4, 2},
-        {ElementType::Hex27, ElementType::Hex8, 2},
-        {ElementType::Wedge18, ElementType::Wedge6, 2},
+// A named quadratic alias is a fixed-order shorthand for the same topology at
+// order 2: it builds the identical basis as the BasisTopology overload, reports
+// that topology, and round-trips faithfully through element_type() (Hex27 stays
+// Hex27 rather than collapsing to a canonical linear type).
+TEST(LagrangeBasis, CompleteAliasesMatchTopologyConstruction) {
+    const std::vector<std::tuple<ElementType, BasisTopology, ElementType>> aliases = {
+        {ElementType::Line3, BasisTopology::Line, ElementType::Line2},
+        {ElementType::Triangle6, BasisTopology::Triangle, ElementType::Triangle3},
+        {ElementType::Quad9, BasisTopology::Quadrilateral, ElementType::Quad4},
+        {ElementType::Tetra10, BasisTopology::Tetrahedron, ElementType::Tetra4},
+        {ElementType::Hex27, BasisTopology::Hexahedron, ElementType::Hex8},
+        {ElementType::Wedge18, BasisTopology::Wedge, ElementType::Wedge6},
     };
 
-    for (const auto& [alias, canonical, order] : aliases) {
-        LagrangeBasis alias_basis(alias, 1);
-        LagrangeBasis canonical_basis(canonical, order);
-        const auto generated = ReferenceNodeLayout::get_lagrange_node_coords(canonical, order);
+    for (const auto& [alias, topo, representative] : aliases) {
+        LagrangeBasis alias_basis(alias, 2);
+        LagrangeBasis topo_basis(topo, 2);
+        const auto generated = ReferenceNodeLayout::get_lagrange_node_coords(representative, 2);
 
-        EXPECT_EQ(alias_basis.element_type(), canonical);
-        EXPECT_EQ(alias_basis.order(), order);
+        EXPECT_EQ(alias_basis.topology(), topo);
+        EXPECT_EQ(alias_basis.element_type(), alias);
+        EXPECT_EQ(alias_basis.order(), 2);
         expect_nodes_close(alias_basis.nodes(), generated, double(1e-14));
-        expect_nodes_close(alias_basis.nodes(), canonical_basis.nodes(), double(1e-14));
+        expect_nodes_close(alias_basis.nodes(), topo_basis.nodes(), double(1e-14));
         expect_evaluations_match(alias_basis,
-                                 canonical_basis,
-                                 sample_points_for(canonical),
+                                 topo_basis,
+                                 sample_points_for(representative),
                                  double(1e-12));
     }
 }
 
-// CompleteAliasesNormalizeToCanonicalBases pins the alias floor (a named
-// quadratic alias requested below order 2 is raised to 2). This pins the
-// complementary direction documented in normalize_lagrange_request: a higher
-// requested order on an alias is honored, not clamped to the alias order.
-TEST(LagrangeBasis, QuadraticAliasHonorsHigherRequestedOrder) {
-    const LagrangeBasis basis(ElementType::Hex27, 3);
-    EXPECT_EQ(basis.element_type(), ElementType::Hex8);
-    EXPECT_EQ(basis.order(), 3);
+// The arbitrary order a named alias rejects is exactly what the BasisTopology
+// overload is for: a node-count-named element cannot carry a conflicting order,
+// and an order with no named layout reports element_type() == Unknown.
+TEST(LagrangeBasis, ArbitraryOrderRequiresTopologyNotNamedAlias) {
+    EXPECT_THROW((void)LagrangeBasis(ElementType::Hex27, 3), BasisConfigurationException);
+
+    const LagrangeBasis basis(BasisTopology::Hexahedron, 5);
+    EXPECT_EQ(basis.topology(), BasisTopology::Hexahedron);
+    EXPECT_EQ(basis.order(), 5);
+    EXPECT_EQ(basis.size(), 216u);  // (5 + 1)^3
+    EXPECT_EQ(basis.element_type(), ElementType::Unknown);  // no named order-5 hex
+}
+
+// element_type() is the inverse of (topology, order): a named layout at orders
+// 0-2, Unknown where no named element exists (order 0 on a volume topology, or
+// any order >= 3). topology() + order() remain the authoritative identity.
+TEST(LagrangeBasis, ElementTypeAccessorReflectsTopologyAndOrder) {
+    EXPECT_EQ(LagrangeBasis(BasisTopology::Point, 0).element_type(), ElementType::Point1);
+    EXPECT_EQ(LagrangeBasis(BasisTopology::Hexahedron, 1).element_type(), ElementType::Hex8);
+    EXPECT_EQ(LagrangeBasis(BasisTopology::Hexahedron, 2).element_type(), ElementType::Hex27);
+    EXPECT_EQ(LagrangeBasis(BasisTopology::Hexahedron, 3).element_type(), ElementType::Unknown);
+    EXPECT_EQ(LagrangeBasis(BasisTopology::Tetrahedron, 1).element_type(), ElementType::Tetra4);
+    EXPECT_EQ(LagrangeBasis(BasisTopology::Tetrahedron, 2).element_type(), ElementType::Tetra10);
+    EXPECT_EQ(LagrangeBasis(BasisTopology::Quadrilateral, 2).element_type(), ElementType::Quad9);
+    // An order-0 P0 basis on a volume topology has no named element.
+    EXPECT_EQ(LagrangeBasis(BasisTopology::Hexahedron, 0).element_type(), ElementType::Unknown);
 }
 
 TEST(LagrangeBasis, NodeOrderingMatchesPublicAliasLayouts) {
@@ -480,23 +504,23 @@ TEST(LagrangeBasis, QuadraticPolynomialReproductionAcrossQuadraticAliases) {
 // node makes the basis non-nodal here).
 TEST(LagrangeBasis, HigherOrderLatticesAreNodalAndPartitionUnity) {
     const struct Case {
-        ElementType type;
+        BasisTopology topology;
         int order;
         std::size_t size;
         double kronecker_tol;
         double derivative_tol;
         std::vector<Point> points;
     } cases[] = {
-        {ElementType::Tetra4, 3, 20u, double(5e-10), double(1e-8),
+        {BasisTopology::Tetrahedron, 3, 20u, double(5e-10), double(1e-8),
          {{double(0.12), double(0.18), double(0.16)}, {double(0.3), double(0.2), double(0.25)}}},
-        {ElementType::Tetra4, 4, 35u, double(1e-9), double(1e-7),
+        {BasisTopology::Tetrahedron, 4, 35u, double(1e-9), double(1e-7),
          {{double(0.12), double(0.18), double(0.16)}, {double(0.2), double(0.1), double(0.18)}}},
-        {ElementType::Hex8, 3, 64u, double(5e-10), double(1e-8),
+        {BasisTopology::Hexahedron, 3, 64u, double(5e-10), double(1e-8),
          {{double(0.1), double(-0.2), double(0.3)}, {double(-0.35), double(0.25), double(-0.15)}}},
     };
 
     for (const auto& c : cases) {
-        LagrangeBasis basis(c.type, c.order);
+        LagrangeBasis basis(c.topology, c.order);
         EXPECT_EQ(basis.size(), c.size);
         expect_kronecker_at_nodes(basis, c.kronecker_tol);
         expect_partition_gradient_hessian_sums(basis, c.points, c.derivative_tol);
@@ -540,19 +564,19 @@ TEST(LagrangeBasis, HigherOrderHexFaceInteriorFollowsVtkFaceOrder) {
 }
 
 TEST(LagrangeBasis, CubicPolynomialReproductionAtOrderThree) {
-    const std::vector<std::pair<ElementType, Point>> cases = {
-        {ElementType::Tetra4, {double(0.15), double(0.2), double(0.25)}},
-        {ElementType::Hex8, {double(0.15), double(-0.2), double(0.25)}},
+    const std::vector<std::pair<BasisTopology, Point>> cases = {
+        {BasisTopology::Tetrahedron, {double(0.15), double(0.2), double(0.25)}},
+        {BasisTopology::Hexahedron, {double(0.15), double(-0.2), double(0.25)}},
     };
 
-    for (const auto& [type, point] : cases) {
-        LagrangeBasis basis(type, 3);
+    for (const auto& [topo, point] : cases) {
+        LagrangeBasis basis(topo, 3);
         std::vector<double> values;
         basis.evaluate_values(point, values);
 
         const double interpolated = interpolate_value(basis, values, cubic_function);
         EXPECT_NEAR(interpolated, cubic_function(point), double(1e-10))
-            << "element=" << static_cast<int>(type);
+            << "topology=" << static_cast<int>(topo);
     }
 }
 
@@ -606,21 +630,21 @@ TEST(LagrangeBasis, PointTopologyEvaluatesConstantUnity) {
 // the order-zero branches in node generation and the simplex/tensor/wedge
 // evaluators have no other coverage.
 TEST(LagrangeBasis, OrderZeroBasesAreConstantUnity) {
-    const std::array<ElementType, 6> types = {
-        ElementType::Line2,
-        ElementType::Triangle3,
-        ElementType::Quad4,
-        ElementType::Tetra4,
-        ElementType::Hex8,
-        ElementType::Wedge6,
-    };
-
-    for (const auto type : types) {
-        LagrangeBasis basis(type, 0);
-        EXPECT_EQ(basis.order(), 0) << "element=" << static_cast<int>(type);
-        EXPECT_EQ(basis.size(), 1u) << "element=" << static_cast<int>(type);
-
-        for (const auto& xi : sample_points_for(type)) {
+    const std::array<std::pair<BasisTopology, ElementType>, 6> cases = {{
+        {BasisTopology::Line, ElementType::Line2},
+        {BasisTopology::Triangle, ElementType::Triangle3},
+        {BasisTopology::Quadrilateral, ElementType::Quad4},
+        {BasisTopology::Tetrahedron, ElementType::Tetra4},
+        {BasisTopology::Hexahedron, ElementType::Hex8},
+        {BasisTopology::Wedge, ElementType::Wedge6},
+    }};
+
+    for (const auto& [topo, representative] : cases) {
+        LagrangeBasis basis(topo, 0);
+        EXPECT_EQ(basis.order(), 0) << "topology=" << static_cast<int>(topo);
+        EXPECT_EQ(basis.size(), 1u) << "topology=" << static_cast<int>(topo);
+
+        for (const auto& xi : sample_points_for(representative)) {
             std::vector<double> values;
             std::vector<Gradient> gradients;
             std::vector<Hessian> hessians;
@@ -628,7 +652,7 @@ TEST(LagrangeBasis, OrderZeroBasesAreConstantUnity) {
 
             ASSERT_EQ(values.size(), 1u);
             EXPECT_NEAR(values[0], double(1), double(1e-14))
-                << "element=" << static_cast<int>(type);
+                << "topology=" << static_cast<int>(topo);
             for (std::size_t d = 0; d < 3u; ++d) {
                 EXPECT_NEAR(gradients[0][d], double(0), double(1e-14));
                 for (std::size_t e = 0; e < 3u; ++e) {
@@ -702,12 +726,18 @@ TEST(BasisFactoryDefaults, RejectsElementsWithoutDefaultBasis) {
 
 TEST(LagrangeBasis, FactoryCreatesReducedScalarBasisFamilies) {
     auto lagrange =
-        basis_factory::create(BasisRequest{ElementType::Hex27, BasisType::Lagrange, 1});
+        basis_factory::create(BasisRequest{ElementType::Hex27, BasisType::Lagrange, 2});
     ASSERT_NE(lagrange, nullptr);
     EXPECT_EQ(lagrange->basis_type(), BasisType::Lagrange);
-    EXPECT_EQ(lagrange->element_type(), ElementType::Hex8);
+    EXPECT_EQ(lagrange->topology(), BasisTopology::Hexahedron);
+    EXPECT_EQ(lagrange->element_type(), ElementType::Hex27);
     EXPECT_EQ(lagrange->order(), 2);
 
+    // The factory inherits the named-element order validation: Hex27 is order 2.
+    EXPECT_THROW((void)basis_factory::create(
+                     BasisRequest{ElementType::Hex27, BasisType::Lagrange, 1}),
+                 BasisConfigurationException);
+
     auto serendipity =
         basis_factory::create(BasisRequest{ElementType::Quad8, BasisType::Serendipity, 2});
     ASSERT_NE(serendipity, nullptr);
diff --git a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
index e500a803a..b0b95edd7 100644
--- a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
@@ -340,13 +340,14 @@ std::vector<math::Vector<double, 3>> quad8_reference_nodes_for_test() {
 
 TEST(SerendipityBasis, Quad8IsNodalAndPartitionsUnity) {
     SerendipityBasis basis(ElementType::Quad8, 2);
-    SerendipityBasis explicit_quad4_basis(ElementType::Quad4, 2);
+    SerendipityBasis topology_quad_basis(BasisTopology::Quadrilateral, 2);
 
     EXPECT_EQ(basis.size(), 8u);
-    // Quad8 sources its nodes from ReferenceNodeLayout while explicit Quad4 order
-    // 2 uses the local arbitrary-order generator, so this also pins the two
-    // independent quadrilateral node sources to agree at the production order.
-    expect_nodes_near(basis.nodes(), explicit_quad4_basis.nodes(), double(1e-14));
+    // Quad8 sources its nodes from ReferenceNodeLayout while the arbitrary-order
+    // Quadrilateral path at order 2 uses the local generator, so this also pins
+    // the two independent quadrilateral node sources to agree at the production
+    // order.
+    expect_nodes_near(basis.nodes(), topology_quad_basis.nodes(), double(1e-14));
     expect_nodal_delta(basis, basis.nodes(), double(1e-10));
     expect_partition_of_unity(basis, {double(0.17), double(-0.31), double(0)});
 }
@@ -396,19 +397,41 @@ TEST(SerendipityBasis, RejectsUnsupportedSerendipityAliases) {
     EXPECT_THROW(SerendipityBasis(ElementType::Pyramid14, 2), FEException);
     EXPECT_THROW(SerendipityBasis(ElementType::Quad8, 3), FEException);
     EXPECT_THROW(SerendipityBasis(ElementType::Quad8, 1), FEException);
+    // Quad4 is the linear Lagrange quad, not a named serendipity layout; arbitrary
+    // quadrilateral serendipity is requested through BasisTopology::Quadrilateral.
+    EXPECT_THROW(SerendipityBasis(ElementType::Quad4, 2), FEException);
+}
+
+// Topology construction is the arbitrary-order entry point and exists only for
+// the quadrilateral, the single serendipity family with a free order. Hex and
+// wedge serendipity are fixed layouts requested through their named ElementType.
+TEST(SerendipityBasis, TopologyConstructionOnlySupportsQuadrilateral) {
+    EXPECT_NO_THROW((void)SerendipityBasis(BasisTopology::Quadrilateral, 3));
+    EXPECT_THROW(SerendipityBasis(BasisTopology::Hexahedron, 2),
+                 BasisElementCompatibilityException);
+    EXPECT_THROW(SerendipityBasis(BasisTopology::Wedge, 2),
+                 BasisElementCompatibilityException);
+    EXPECT_THROW(SerendipityBasis(BasisTopology::Triangle, 2),
+                 BasisElementCompatibilityException);
+
+    // Topology and named construction agree at the production order.
+    SerendipityBasis topo(BasisTopology::Quadrilateral, 2);
+    EXPECT_EQ(topo.topology(), BasisTopology::Quadrilateral);
+    EXPECT_EQ(topo.order(), 2);
+    EXPECT_EQ(topo.element_type(), ElementType::Quad8);
 }
 
 TEST(SerendipityBasis, QuadrilateralOrderZeroNormalizesToLinear) {
-    SerendipityBasis basis(ElementType::Quad4, 0);
+    SerendipityBasis basis(BasisTopology::Quadrilateral, 0);
 
     EXPECT_EQ(basis.order(), 1);
     EXPECT_EQ(basis.size(), 4u);
     expect_nodal_delta(basis, basis.nodes(), double(1e-12));
 }
 
-// Explicit Quad4 serendipity orders run the documented monomial selection,
-// boundary plus triangular interior node placement, and runtime Vandermonde
-// inversion. Order four is the first order with an interior residual
+// Explicit quadrilateral-topology serendipity orders run the documented monomial
+// selection, boundary plus triangular interior node placement, and runtime
+// Vandermonde inversion. Order four is the first order with an interior residual
 // polynomial, so it is the first order that appends an interior node.
 TEST(SerendipityBasis, QuadrilateralOrdersOneThreeFourAreNodalAndPartitionUnity) {
     const struct Case {
@@ -421,7 +444,7 @@ TEST(SerendipityBasis, QuadrilateralOrdersOneThreeFourAreNodalAndPartitionUnity)
     };
 
     for (const auto& c : cases) {
-        SerendipityBasis basis(ElementType::Quad4, c.order);
+        SerendipityBasis basis(BasisTopology::Quadrilateral, c.order);
         EXPECT_EQ(basis.size(), c.size) << "order=" << c.order;
         EXPECT_EQ(basis.order(), c.order);
         EXPECT_EQ(basis.dimension(), 2);
@@ -442,7 +465,7 @@ TEST(SerendipityBasis, QuadrilateralNodesFollowDocumentedConstructionThroughOrde
     constexpr double kTol = double(1e-14);
 
     for (int order = 1; order <= 10; ++order) {
-        SerendipityBasis basis(ElementType::Quad4, order);
+        SerendipityBasis basis(BasisTopology::Quadrilateral, order);
         const auto& nodes = basis.nodes();
         const std::size_t expected_size = expected_quad_serendipity_size(order);
         const std::size_t boundary_count = static_cast<std::size_t>(4 * order);
@@ -501,7 +524,7 @@ TEST(SerendipityBasis, QuadrilateralNodesFollowDocumentedConstructionThroughOrde
 }
 
 TEST(SerendipityBasis, QuadrilateralOrderOneReproducesBilinearFunctions) {
-    SerendipityBasis basis(ElementType::Quad4, 1);
+    SerendipityBasis basis(BasisTopology::Quadrilateral, 1);
 
     const std::vector<math::Vector<double, 3>> points = {
         {double(0.25), double(-0.4), double(0)},
@@ -515,7 +538,7 @@ TEST(SerendipityBasis, QuadrilateralOrderOneReproducesBilinearFunctions) {
 }
 
 TEST(SerendipityBasis, QuadrilateralOrderThreeReproducesSerendipityCubics) {
-    SerendipityBasis basis(ElementType::Quad4, 3);
+    SerendipityBasis basis(BasisTopology::Quadrilateral, 3);
 
     const std::vector<math::Vector<double, 3>> points = {
         {double(0.25), double(-0.4), double(0)},
@@ -536,7 +559,7 @@ TEST(SerendipityBasis, QuadrilateralOrdersReproduceEverySerendipityMonomial) {
     };
 
     for (int order = 1; order <= 10; ++order) {
-        SerendipityBasis basis(ElementType::Quad4, order);
+        SerendipityBasis basis(BasisTopology::Quadrilateral, order);
         const auto exponents = quad_serendipity_exponents_for_test(order);
         ASSERT_EQ(exponents.size(), basis.size()) << "order=" << order;
 
@@ -562,7 +585,7 @@ TEST(SerendipityBasis, QuadrilateralOrdersReproduceEverySerendipityMonomial) {
 
 TEST(SerendipityBasis, QuadrilateralVandermondeHasFullRankThroughOrderTen) {
     for (int order = 1; order <= 10; ++order) {
-        SerendipityBasis basis(ElementType::Quad4, order);
+        SerendipityBasis basis(BasisTopology::Quadrilateral, order);
         const auto exponents = quad_serendipity_exponents_for_test(order);
         const auto vandermonde =
             quadrilateral_vandermonde_for_test(basis.nodes(), exponents);

From b19e6f3a3639e1552e13b293027854a9d16bf91a Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 24 Jun 2026 13:39:51 -0700
Subject: [PATCH 59/91] NodeOrderingConventions.cpp:443, the negative-order
 guard in complete_lagrange_nodes

---
 Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
index b0c46cc93..6f5154961 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
@@ -440,8 +440,8 @@ LagrangeNodeLayout generate_wedge_nodes(int order) {
 }
 
 LagrangeNodeLayout complete_lagrange_nodes(ElementType canonical_type, int order) {
-    svmp::throw_if<BasisNodeOrderingException>(order < 0, SVMP_HERE,
-                                             "ReferenceNodeLayout requires non-negative Lagrange order");
+    svmp::throw_if<BasisConfigurationException>(order < 0, SVMP_HERE,
+                                              "ReferenceNodeLayout requires non-negative Lagrange order");
     const ElementType type = canonical_lagrange_type(canonical_type);
     switch (type) {
         case ElementType::Point1: {

From 93ad4d02ee4b6fdc08bb26319a11ba119db67ad4 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 24 Jun 2026 14:24:16 -0700
Subject: [PATCH 60/91] fixing the construction of serendipity basis functions
 with orders less than 1 to be explicitly rejected and not silently floored

---
 .../solver/FE/Basis/SerendipityBasis.cpp      | 18 ++++++++++-----
 .../Source/solver/FE/Basis/SerendipityBasis.h | 22 +++++++++++--------
 .../FE/Basis/test_BasisErrorPaths.cpp         | 13 +++++++++++
 .../FE/Basis/test_SerendipityBasis.cpp        | 14 +++++++++---
 4 files changed, 49 insertions(+), 18 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index 18391fb5d..f93410a2e 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -321,23 +321,26 @@ struct NormalizedSerendipityRequest {
 // quadrilateral serendipity is not a named element: it is requested through the
 // BasisTopology::Quadrilateral constructor.
 NormalizedSerendipityRequest normalize_serendipity_request(ElementType type, int order) {
-    const int floored_order = std::max(order, 1);
+    // The named layouts carry an inferred fixed order (Hex8 -> 1; Quad8, Hex20,
+    // and Wedge15 -> 2). The request must supply that exact order: it is never
+    // floored or otherwise adjusted to fit, so order 0 and negative orders are
+    // rejected rather than promoted to a valid layout.
     switch (type) {
         case ElementType::Quad8:
-            svmp::throw_if<BasisConfigurationException>(floored_order != 2, SVMP_HERE,
+            svmp::throw_if<BasisConfigurationException>(order != 2, SVMP_HERE,
                 "SerendipityBasis: Quad8 is the quadratic 8-node serendipity layout (order 2 only); "
                 "use BasisTopology::Quadrilateral for higher-order quadrilateral serendipity");
             return {BasisTopology::Quadrilateral, 2, 2};
         case ElementType::Hex8:
-            svmp::throw_if<BasisConfigurationException>(floored_order != 1, SVMP_HERE,
+            svmp::throw_if<BasisConfigurationException>(order != 1, SVMP_HERE,
                 "SerendipityBasis: Hex8 is the trilinear 8-node basis (order 1 only); use Hex20 for quadratic serendipity");
             return {BasisTopology::Hexahedron, 3, 1};
         case ElementType::Hex20:
-            svmp::throw_if<BasisConfigurationException>(floored_order != 2, SVMP_HERE,
+            svmp::throw_if<BasisConfigurationException>(order != 2, SVMP_HERE,
                 "SerendipityBasis: Hex20 is the 20-node quadratic serendipity layout (order 2 only)");
             return {BasisTopology::Hexahedron, 3, 2};
         case ElementType::Wedge15:
-            svmp::throw_if<BasisConfigurationException>(floored_order != 2, SVMP_HERE,
+            svmp::throw_if<BasisConfigurationException>(order != 2, SVMP_HERE,
                 "SerendipityBasis: Wedge15 is the 15-node quadratic serendipity layout (order 2 only)");
             return {BasisTopology::Wedge, 3, 2};
         default:
@@ -355,8 +358,11 @@ SerendipityBasis::SerendipityBasis(BasisTopology topology, int order)
         topology_ != BasisTopology::Quadrilateral, SVMP_HERE,
         "SerendipityBasis: arbitrary-order topology construction is only supported for "
         "Quadrilateral; use the named ElementType (Hex8/Hex20/Wedge15) for hex/wedge serendipity");
+    svmp::throw_if<BasisConfigurationException>(
+        order < 1, SVMP_HERE,
+        "SerendipityBasis: quadrilateral serendipity requires a polynomial order >= 1");
     dimension_ = 2;
-    order_ = std::max(order, 1);
+    order_ = order;
     init_quadrilateral(order_, /*nodes_from_reference_layout=*/false);
 }
 
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.h b/Code/Source/solver/FE/Basis/SerendipityBasis.h
index 85ea55425..630243838 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.h
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.h
@@ -75,8 +75,8 @@ namespace basis {
  * nonsingular for the implemented quadrilateral serendipity space.
  *
  * `SerendipityBasis(BasisTopology::Quadrilateral, p)` is the arbitrary-order
- * entry point for quadrilateral serendipity (@f$p \ge 1@f$, requests below one
- * normalized to one); it generates its own reference nodes, since the
+ * entry point for quadrilateral serendipity (@f$p \ge 1@f$; orders below one are
+ * rejected); it generates its own reference nodes, since the
  * higher-order interior ordering is an implementation convention rather than a
  * public layout. `ElementType::Quad8` is the named quadratic eight-node layout
  * (valid only with order 2) and, like Hex20 and Wedge15, takes its reference
@@ -105,12 +105,13 @@ class SerendipityBasis final : public BasisFunction {
      * family with a free order: the quadrilateral. The topology carries no
      * node-count assumption; the monomial space, reference nodes (generated
      * here), and nodal coefficient table are built from the requested order
-     * (orders below 1 are normalized to 1). Hex and wedge serendipity are single
+     * (which must be @f$p \ge 1@f$). Hex and wedge serendipity are single
      * fixed layouts and are not constructed this way -- use the named ElementType
      * overload (Hex8/Hex20/Wedge15) for them.
      *
      * @param topology Must be BasisTopology::Quadrilateral.
-     * @param order Polynomial order @f$p \ge 1@f$ (lower values normalized to 1).
+     * @param order Polynomial order @f$p \ge 1@f$; orders below 1 are rejected.
+     * @throws BasisConfigurationException If @p order is less than 1.
      * @throws BasisElementCompatibilityException If @p topology is not Quadrilateral.
      */
     SerendipityBasis(BasisTopology topology, int order);
@@ -122,13 +123,16 @@ class SerendipityBasis final : public BasisFunction {
      * Quad8 builds the quadratic quadrilateral serendipity space from its
      * ReferenceNodeLayout nodes; Hex8 uses the trilinear corner basis directly;
      * Hex20 and Wedge15 build and invert a Vandermonde over their fixed monomial
-     * spaces. Each layout is pinned to a single order (Hex8 to 1; Quad8, Hex20,
-     * and Wedge15 to 2), so the requested @p order must match it; arbitrary-order
-     * quadrilateral serendipity is requested through the BasisTopology overload.
+     * spaces. Each layout carries an inferred fixed order (Hex8 to 1; Quad8,
+     * Hex20, and Wedge15 to 2); the requested @p order must equal that inferred
+     * order and is never adjusted to fit, so a mismatched request (including
+     * order 0 or negative) is rejected. Arbitrary-order quadrilateral serendipity
+     * is requested through the BasisTopology overload.
      *
      * @param type Named serendipity element type (Quad8, Hex8, Hex20, or Wedge15).
-     * @param order Requested order; must equal the layout's fixed order.
-     * @throws BasisConfigurationException If @p order does not match the layout's fixed order.
+     * @param order Requested order; must equal the layout's inferred fixed order
+     *        (1 for Hex8; 2 for Quad8, Hex20, and Wedge15).
+     * @throws BasisConfigurationException If @p order does not match the layout's inferred order.
      * @throws BasisElementCompatibilityException If the element type is unsupported.
      */
     SerendipityBasis(ElementType type, int order);
diff --git a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
index a6f0da44a..6ddb7917a 100644
--- a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
@@ -226,6 +226,19 @@ TEST(BasisErrorPaths, SerendipityInvalidRequestsThrowBasisExceptions) {
                  BasisConfigurationException);
     EXPECT_THROW(SerendipityBasis(ElementType::Wedge15, 3),
                  BasisConfigurationException);
+
+    // Order 0 and negative orders are rejected for every serendipity layout; a
+    // named element is pinned to its inferred order and is never floored up to it.
+    EXPECT_THROW(SerendipityBasis(ElementType::Quad8, 0),
+                 BasisConfigurationException);
+    EXPECT_THROW(SerendipityBasis(ElementType::Hex8, 0),
+                 BasisConfigurationException);
+    EXPECT_THROW(SerendipityBasis(ElementType::Hex20, 0),
+                 BasisConfigurationException);
+    EXPECT_THROW(SerendipityBasis(ElementType::Wedge15, 0),
+                 BasisConfigurationException);
+    EXPECT_THROW(SerendipityBasis(ElementType::Hex8, -1),
+                 BasisConfigurationException);
 }
 
 TEST(BasisErrorPaths, BasisFactoryRejectsNonC0Continuity) {
diff --git a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
index b0b95edd7..2edc6618d 100644
--- a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
@@ -421,9 +421,17 @@ TEST(SerendipityBasis, TopologyConstructionOnlySupportsQuadrilateral) {
     EXPECT_EQ(topo.element_type(), ElementType::Quad8);
 }
 
-TEST(SerendipityBasis, QuadrilateralOrderZeroNormalizesToLinear) {
-    SerendipityBasis basis(BasisTopology::Quadrilateral, 0);
-
+TEST(SerendipityBasis, QuadrilateralRejectsOrdersBelowOne) {
+    // Serendipity bases require a positive polynomial order; orders <= 0 are
+    // rejected rather than normalized up to the linear space.
+    EXPECT_THROW(SerendipityBasis(BasisTopology::Quadrilateral, 0),
+                 BasisConfigurationException);
+    EXPECT_THROW(SerendipityBasis(BasisTopology::Quadrilateral, -1),
+                 BasisConfigurationException);
+
+    // Order 1 is the smallest valid quadrilateral serendipity (the bilinear Q1
+    // space): four corner nodes and the nodal-interpolation property.
+    SerendipityBasis basis(BasisTopology::Quadrilateral, 1);
     EXPECT_EQ(basis.order(), 1);
     EXPECT_EQ(basis.size(), 4u);
     expect_nodal_delta(basis, basis.nodes(), double(1e-12));

From 736fba3ce437c773777d632f9cd2a9c1e08aea1c Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 24 Jun 2026 14:58:49 -0700
Subject: [PATCH 61/91] support arbitrary-order hexahedral serendipity bases,
 with Hex8/Hex20 as the order-1/order-2 special cases of one generated
 construction

---
 .../solver/FE/Basis/SerendipityBasis.cpp      | 310 ++++++++++++------
 .../Source/solver/FE/Basis/SerendipityBasis.h | 157 +++++----
 .../unitTests/FE/Basis/test_BasisHessians.cpp |  22 ++
 .../FE/Basis/test_SerendipityBasis.cpp        | 221 ++++++++++++-
 4 files changed, 533 insertions(+), 177 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index f93410a2e..1798920e0 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -17,55 +17,6 @@ namespace basis {
 namespace {
 using Vec3 = math::Vector<double, 3>;
 
-void evaluate_hex8_reference(double r,
-                             double s,
-                             double t,
-                             std::span<double> values,
-                             std::span<Gradient> gradients,
-                             std::span<Hessian> hessians) {
-    static constexpr int signs[8][3] = {
-        {-1, -1, -1},
-        { 1, -1, -1},
-        { 1,  1, -1},
-        {-1,  1, -1},
-        {-1, -1,  1},
-        { 1, -1,  1},
-        { 1,  1,  1},
-        {-1,  1,  1},
-    };
-
-    for (std::size_t i = 0; i < 8u; ++i) {
-        const double a = double(signs[i][0]);
-        const double b = double(signs[i][1]);
-        const double c = double(signs[i][2]);
-        const double ar = double(1) + a * r;
-        const double bs = double(1) + b * s;
-        const double ct = double(1) + c * t;
-
-        if (!values.empty()) {
-            values[i] = double(0.125) * ar * bs * ct;
-        }
-        if (!gradients.empty()) {
-            Gradient& g = gradients[i];
-            g[0] = double(0.125) * a * bs * ct;
-            g[1] = double(0.125) * b * ar * ct;
-            g[2] = double(0.125) * c * ar * bs;
-        }
-        if (!hessians.empty()) {
-            Hessian& h = hessians[i];
-            h(0, 0) = double(0);
-            h(0, 1) = double(0.125) * a * b * ct;
-            h(0, 2) = double(0.125) * a * c * bs;
-            h(1, 0) = h(0, 1);
-            h(1, 1) = double(0);
-            h(1, 2) = double(0.125) * b * c * ar;
-            h(2, 0) = h(0, 2);
-            h(2, 1) = h(1, 2);
-            h(2, 2) = double(0);
-        }
-    }
-}
-
 int quad_serendipity_superlinear_degree(int ax, int ay) {
     return (ax > 1 ? ax : 0) + (ay > 1 ? ay : 0);
 }
@@ -163,6 +114,162 @@ std::vector<Vec3> quad_serendipity_nodes(int order, std::size_t total_size) {
     return nodes;
 }
 
+int hex_serendipity_superlinear_degree(int ax, int ay, int az) {
+    return (ax > 1 ? ax : 0) + (ay > 1 ? ay : 0) + (az > 1 ? az : 0);
+}
+
+// Hexahedral serendipity monomial space: every r^a s^b t^c whose superlinear
+// degree is at most `order`. This is the three-axis generalization of
+// quad_serendipity_exponents; at order 1 it is the eight multilinear monomials
+// (the Hex8 space) and at order 2 it is the twenty-monomial Hex20 space. The
+// enumeration order is internal -- evaluation sums over the monomials, so only
+// the node order, not the monomial order, is observable to a caller.
+std::vector<std::array<int, 3>> hex_serendipity_exponents(int order) {
+    std::vector<std::array<int, 3>> exponents;
+    for (int az = 0; az <= order; ++az) {
+        for (int ay = 0; ay <= order; ++ay) {
+            for (int ax = 0; ax <= order; ++ax) {
+                if (hex_serendipity_superlinear_degree(ax, ay, az) <= order) {
+                    exponents.push_back({ax, ay, az});
+                }
+            }
+        }
+    }
+    return exponents;
+}
+
+// Volume-interior node count for hexahedral serendipity. Once the boundary trace
+// is fixed, an interior serendipity function factors as the cube bubble
+// (1 - r^2)(1 - s^2)(1 - t^2) times a quotient of total degree at most order - 6,
+// so the interior space is P_{order-6} in three variables: empty until order 6,
+// then dim P_{order-6} = (m+1)(m+2)(m+3)/6 with m = order - 6.
+std::size_t hex_serendipity_volume_interior_count(int order) {
+    if (order < 6) {
+        return 0u;
+    }
+    const auto m = static_cast<std::size_t>(order - 6);
+    return (m + 1u) * (m + 2u) * (m + 3u) / 6u;
+}
+
+// Append the face-interior nodes. The restriction of the order-`order` cube
+// serendipity space to a face is the order-`order` quadrilateral serendipity
+// space, so every face carries the same 2D quad-serendipity interior set,
+// embedded into the face plane. Faces are visited in VTK face order
+// (-X, +X, -Y, +Y, -Z, +Z); the in-plane (u, v) point maps to the two free axes
+// of each face. Empty until order 4 (when the quad interior first appears).
+void append_hex_serendipity_face_interior_nodes(std::vector<Vec3>& nodes, int order) {
+    std::vector<Vec3> face_interior;  // (u, v, 0) interior points of one quad face
+    append_quad_serendipity_interior_nodes(face_interior, order);
+    if (face_interior.empty()) {
+        return;
+    }
+
+    // Each face: the fixed axis (0 = r, 1 = s, 2 = t), its +/-1 value, and the two
+    // in-plane axes that carry the 2D interior point (u, v).
+    struct Face {
+        int fixed_axis;
+        double fixed_value;
+        int u_axis;
+        int v_axis;
+    };
+    static constexpr Face faces[6] = {
+        {0, double(-1), 1, 2},  // -X: (s, t) in plane
+        {0, double(1),  1, 2},  // +X
+        {1, double(-1), 0, 2},  // -Y: (r, t) in plane
+        {1, double(1),  0, 2},  // +Y
+        {2, double(-1), 0, 1},  // -Z: (r, s) in plane
+        {2, double(1),  0, 1},  // +Z
+    };
+
+    for (const auto& face : faces) {
+        for (const auto& p : face_interior) {
+            Vec3 node = Vec3::Zero();
+            node[static_cast<std::size_t>(face.fixed_axis)] = face.fixed_value;
+            node[static_cast<std::size_t>(face.u_axis)] = p[0];
+            node[static_cast<std::size_t>(face.v_axis)] = p[1];
+            nodes.push_back(node);
+        }
+    }
+}
+
+// Append the volume-interior nodes: a tetrahedral staircase unisolvent for the
+// interior residual P_{order-6}. Each t-layer is a triangular staircase (the 2D
+// construction reused) whose total degree decreases by one per layer, so the
+// layers consume P_{order-6} by induction in t exactly as the quad interior
+// consumes P_{order-4} by induction in s. Empty until order 6.
+void append_hex_serendipity_volume_interior_nodes(std::vector<Vec3>& nodes, int order) {
+    if (order < 6) {
+        return;
+    }
+    const int m = order - 6;
+    for (int layer = 0; layer <= m; ++layer) {
+        const int tri_order = m - layer;
+        const double t = double(-1) + double(2) * double(layer + 1) / double(m + 2);
+        for (int row = 0; row <= tri_order; ++row) {
+            const int row_count = tri_order + 1 - row;
+            const double s = double(-1) + double(2) * double(row + 1) / double(tri_order + 2);
+            for (int col = 0; col < row_count; ++col) {
+                const double r = double(-1) + double(2) * double(col + 1) / double(row_count + 1);
+                nodes.push_back(Vec3{r, s, t});
+            }
+        }
+    }
+}
+
+// Generate the hexahedral serendipity reference nodes in the generalized
+// right-hand-rule / VTK-consistent stratified order: 8 corners, then 12 edges in
+// VTK quadratic-hex edge order, then the 6 face interiors in VTK face order, then
+// the volume interior. The corner and edge strata reuse the VTK ordering of
+// generate_hex_nodes verbatim, so at order 1 (corners only) and order 2 (corners
+// plus edge midpoints) the layout is exactly the public Hex8 / Hex20 ordering;
+// for higher order the reduced face/volume sets are this module's own convention.
+std::vector<Vec3> hex_serendipity_nodes(int order, std::size_t total_size) {
+    static constexpr double corner_coords[8][3] = {
+        {-1, -1, -1}, {1, -1, -1}, {1, 1, -1}, {-1, 1, -1},
+        {-1, -1, 1},  {1, -1, 1},  {1, 1, 1},  {-1, 1, 1},
+    };
+    static constexpr int edges[12][2] = {
+        {0, 1}, {1, 2}, {2, 3}, {3, 0},
+        {4, 5}, {5, 6}, {6, 7}, {7, 4},
+        {0, 4}, {1, 5}, {2, 6}, {3, 7},
+    };
+
+    std::vector<Vec3> nodes;
+    nodes.reserve(total_size);
+
+    for (const auto& c : corner_coords) {
+        nodes.push_back(Vec3{c[0], c[1], c[2]});
+    }
+
+    for (const auto& edge : edges) {
+        const auto& ca = corner_coords[edge[0]];
+        const auto& cb = corner_coords[edge[1]];
+        const Vec3 a{ca[0], ca[1], ca[2]};
+        const Vec3 b{cb[0], cb[1], cb[2]};
+        for (int m = 1; m < order; ++m) {
+            const double t = double(m) / double(order);
+            nodes.push_back(a * (double(1) - t) + b * t);
+        }
+    }
+
+    const std::size_t skeleton = nodes.size();
+    append_hex_serendipity_face_interior_nodes(nodes, order);
+    svmp::throw_if<BasisConstructionException>(
+        nodes.size() - skeleton != 6u * quad_serendipity_interior_count(order), SVMP_HERE,
+        "SerendipityBasis: hexahedral serendipity face-interior node count mismatch");
+
+    const std::size_t before_volume = nodes.size();
+    append_hex_serendipity_volume_interior_nodes(nodes, order);
+    svmp::throw_if<BasisConstructionException>(
+        nodes.size() - before_volume != hex_serendipity_volume_interior_count(order), SVMP_HERE,
+        "SerendipityBasis: hexahedral serendipity volume-interior node count mismatch");
+
+    svmp::throw_if<BasisConstructionException>(
+        nodes.size() != total_size, SVMP_HERE,
+        "SerendipityBasis: hexahedral serendipity node count does not match the monomial count");
+    return nodes;
+}
+
 // Build the nodal coefficient table for a monomial-generated serendipity family:
 // assemble V[node][monomial] = r^a s^b t^c at the public-order reference nodes and
 // invert it. Because the nodes are in public order, the inverse is already in
@@ -211,13 +318,6 @@ constexpr std::array<std::array<int, 3>, 15> kWedge15MonomialExponents = {{
     {{2, 0, 1}}
 }};
 
-constexpr std::array<std::array<int, 3>, 20> kHex20MonomialExponents = {{
-    {{0, 0, 0}}, {{0, 0, 1}}, {{0, 0, 2}}, {{0, 1, 0}}, {{0, 1, 1}},
-    {{0, 1, 2}}, {{0, 2, 0}}, {{0, 2, 1}}, {{1, 0, 0}}, {{1, 0, 1}},
-    {{1, 0, 2}}, {{1, 1, 0}}, {{1, 1, 1}}, {{1, 1, 2}}, {{1, 2, 0}},
-    {{1, 2, 1}}, {{2, 0, 0}}, {{2, 0, 1}}, {{2, 1, 0}}, {{2, 1, 1}}
-}};
-
 // Value and first/second derivatives of the 1D monomial x^a. The derivative of
 // a constant or linear term collapses to zero, so negative powers never arise.
 struct MonomialAxis {
@@ -354,16 +454,22 @@ NormalizedSerendipityRequest normalize_serendipity_request(ElementType type, int
 
 SerendipityBasis::SerendipityBasis(BasisTopology topology, int order)
     : topology_(topology), dimension_(0), order_(0), size_(0) {
+    const bool supported_topology = topology_ == BasisTopology::Quadrilateral ||
+                                    topology_ == BasisTopology::Hexahedron;
     svmp::throw_if<BasisElementCompatibilityException>(
-        topology_ != BasisTopology::Quadrilateral, SVMP_HERE,
-        "SerendipityBasis: arbitrary-order topology construction is only supported for "
-        "Quadrilateral; use the named ElementType (Hex8/Hex20/Wedge15) for hex/wedge serendipity");
+        !supported_topology, SVMP_HERE,
+        "SerendipityBasis: arbitrary-order topology construction is supported for "
+        "Quadrilateral and Hexahedron; use the named ElementType (Wedge15) for wedge serendipity");
     svmp::throw_if<BasisConfigurationException>(
         order < 1, SVMP_HERE,
-        "SerendipityBasis: quadrilateral serendipity requires a polynomial order >= 1");
-    dimension_ = 2;
+        "SerendipityBasis: serendipity requires a polynomial order >= 1");
+    dimension_ = topology_ == BasisTopology::Hexahedron ? 3 : 2;
     order_ = order;
-    init_quadrilateral(order_, /*nodes_from_reference_layout=*/false);
+    if (topology_ == BasisTopology::Hexahedron) {
+        init_hexahedron(order_, /*nodes_from_reference_layout=*/false);
+    } else {
+        init_quadrilateral(order_, /*nodes_from_reference_layout=*/false);
+    }
 }
 
 SerendipityBasis::SerendipityBasis(ElementType type, int order)
@@ -377,19 +483,17 @@ SerendipityBasis::SerendipityBasis(ElementType type, int order)
         case ElementType::Quad8:
             // Quad8 is the named quadratic layout; its nodes come from
             // ReferenceNodeLayout so the basis shares the single public Quad8
-            // ordering (the same source Hex20/Wedge15 use).
+            // ordering (the same source Hex8/Hex20/Wedge15 use).
             init_quadrilateral(order_, /*nodes_from_reference_layout=*/true);
             return;
         case ElementType::Hex8:
-            // Hex8 is the standard trilinear corner basis, evaluated directly
-            // rather than through a generated coefficient table.
-            size_ = 8u;
-            nodes_ = ReferenceNodeLayout::node_coords(type);
-            svmp::throw_if<BasisConstructionException>(
-                nodes_.size() != size_, SVMP_HERE,
-                "SerendipityBasis: Hex8 layout node count does not match basis size");
+            // Hex8 is the order-1 instance of the hexahedral serendipity space.
+            init_hexahedron(1, /*nodes_from_reference_layout=*/true);
             return;
         case ElementType::Hex20:
+            // Hex20 is the order-2 instance of the hexahedral serendipity space.
+            init_hexahedron(2, /*nodes_from_reference_layout=*/true);
+            return;
         case ElementType::Wedge15:
             init_fixed_named(type);
             return;
@@ -424,33 +528,48 @@ void SerendipityBasis::init_quadrilateral(int order, bool nodes_from_reference_l
         nodes_, monomial_exponents_, "Quad order " + std::to_string(order));
 }
 
-// Build a fixed named volume serendipity layout (Hex20 or Wedge15). The nodal
-// coefficient table is generated by inverting the Vandermonde built from the
-// public-order ReferenceNodeLayout nodes, exactly like the quadrilateral, so no
-// transcribed tables or output permutation are needed.
-void SerendipityBasis::init_fixed_named(ElementType type) {
-    std::span<const std::array<int, 3>> family_exponents;
-    std::string label;
-    if (type == ElementType::Hex20) {
-        size_ = 20u;
-        family_exponents = std::span<const std::array<int, 3>>(
-            kHex20MonomialExponents.data(), kHex20MonomialExponents.size());
-        label = "Hex20";
-    } else {  // Wedge15
-        size_ = 15u;
-        family_exponents = std::span<const std::array<int, 3>>(
-            kWedge15MonomialExponents.data(), kWedge15MonomialExponents.size());
-        label = "Wedge15";
+// Build the hexahedral serendipity monomial space, reference nodes, and nodal
+// coefficient table for the given order, mirroring init_quadrilateral. The
+// arbitrary-order topology path generates its own VTK-consistent nodes; the named
+// Hex8 (order 1) and Hex20 (order 2) layouts source their public-order nodes from
+// ReferenceNodeLayout so the generated layout matches the public ordering exactly.
+void SerendipityBasis::init_hexahedron(int order, bool nodes_from_reference_layout) {
+    monomial_exponents_ = hex_serendipity_exponents(order);
+    size_ = monomial_exponents_.size();
+    if (nodes_from_reference_layout) {
+        const ElementType named =
+            (order == 1) ? ElementType::Hex8 : ElementType::Hex20;
+        nodes_ = ReferenceNodeLayout::node_coords(named);
+    } else {
+        nodes_ = hex_serendipity_nodes(order, size_);
     }
+    svmp::throw_if<BasisConstructionException>(
+        nodes_.size() != size_, SVMP_HERE,
+        "SerendipityBasis: hexahedral serendipity setup produced inconsistent sizes");
+    inv_vandermonde_ = build_inverse_vandermonde(
+        nodes_, monomial_exponents_, "Hex order " + std::to_string(order));
+}
+
+// Build the Wedge15 serendipity layout from its tabulated monomial space and
+// public-order ReferenceNodeLayout nodes. Hexahedral serendipity (Hex8 and Hex20
+// included) is generated by init_hexahedron, so the prism is the only named
+// layout that still carries a fixed monomial table.
+void SerendipityBasis::init_fixed_named(ElementType type) {
+    svmp::throw_if<BasisConstructionException>(
+        type != ElementType::Wedge15, SVMP_HERE,
+        "SerendipityBasis: init_fixed_named builds only the Wedge15 layout");
+    size_ = 15u;
+    const std::span<const std::array<int, 3>> family_exponents(
+        kWedge15MonomialExponents.data(), kWedge15MonomialExponents.size());
     nodes_ = ReferenceNodeLayout::node_coords(type);
     svmp::throw_if<BasisConstructionException>(
         nodes_.size() != size_, SVMP_HERE,
-        "SerendipityBasis: fixed serendipity layout node count does not match basis size");
+        "SerendipityBasis: Wedge15 layout node count does not match basis size");
     svmp::throw_if<BasisConstructionException>(
         family_exponents.size() != size_, SVMP_HERE,
-        "SerendipityBasis: serendipity monomial count does not match basis size");
+        "SerendipityBasis: Wedge15 monomial count does not match basis size");
     monomial_exponents_.assign(family_exponents.begin(), family_exponents.end());
-    inv_vandermonde_ = build_inverse_vandermonde(nodes_, monomial_exponents_, label);
+    inv_vandermonde_ = build_inverse_vandermonde(nodes_, monomial_exponents_, "Wedge15");
 }
 
 void SerendipityBasis::evaluate_all_to(const math::Vector<double, 3>& xi,
@@ -479,15 +598,8 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<double, 3>& xi,
     const double y = xi[1];
     const double z = xi[2];
 
-    // Hex8 (Hexahedron at order 1) is the only serendipity basis evaluated
-    // directly from the trilinear corner products rather than a coefficient table.
-    if (topology_ == BasisTopology::Hexahedron && order_ == 1) {
-        evaluate_hex8_reference(x, y, z, values_out, gradients_out, hessians_out);
-        return;
-    }
-
-    // Quad, Hex20, and Wedge15 evaluate through their generated coefficient
-    // table, which is already in public basis order.
+    // Every serendipity family evaluates through its generated coefficient table,
+    // which is already in public basis order.
     svmp::throw_if<BasisEvaluationException>(
         monomial_exponents_.size() != size_ ||
             inv_vandermonde_.size() != size_ * size_,
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.h b/Code/Source/solver/FE/Basis/SerendipityBasis.h
index 630243838..d6640ed07 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.h
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.h
@@ -34,11 +34,13 @@ namespace basis {
 /**
  * @brief Reduced-degree-of-freedom serendipity basis on supported reference elements.
  *
- * @details SerendipityBasis implements nodal bases for Quad4/Quad8,
- * Hex8/Hex20, and Wedge15. Compared with a complete tensor-product Lagrange
- * basis of the same nominal order, a serendipity basis removes selected
- * interior modes while retaining nodal interpolation on the supported node
- * layout.
+ * @details SerendipityBasis implements nodal bases for the quadrilateral and
+ * hexahedral serendipity families at arbitrary order, plus the Wedge15 prism
+ * layout. Compared with a complete tensor-product Lagrange basis of the same
+ * nominal order, a serendipity basis removes selected interior modes while
+ * retaining nodal interpolation on the supported node layout. The named layouts
+ * Quad8, Hex8, and Hex20 are the fixed-order instances of these families
+ * (quadrilateral order 2, hexahedron orders 1 and 2).
  *
  * Quadrilateral serendipity bases are built from monomials
  * @f$x^{a_x}y^{a_y}@f$ whose superlinear degree is at most the requested
@@ -74,45 +76,64 @@ namespace basis {
  * linear term in @f$y@f$. The interpolation Vandermonde is therefore
  * nonsingular for the implemented quadrilateral serendipity space.
  *
- * `SerendipityBasis(BasisTopology::Quadrilateral, p)` is the arbitrary-order
- * entry point for quadrilateral serendipity (@f$p \ge 1@f$; orders below one are
- * rejected); it generates its own reference nodes, since the
- * higher-order interior ordering is an implementation convention rather than a
- * public layout. `ElementType::Quad8` is the named quadratic eight-node layout
- * (valid only with order 2) and, like Hex20 and Wedge15, takes its reference
- * nodes from ReferenceNodeLayout so that all named fixed layouts share the
- * single public node ordering. Hex and wedge serendipity are single fixed
- * layouts with no arbitrary-order form, so they are constructed only from their
- * named ElementType. Solver-default basis selection remains separate:
- * `basis_factory` maps the complete Quad4 layout to the default linear Lagrange
- * basis and maps Quad8 to quadratic serendipity unless a caller explicitly
- * requests a different supported basis.
+ * Hexahedral serendipity generalizes the same construction to the cube. The
+ * monomial space is every @f$r^{a_r}s^{a_s}t^{a_t}@f$ whose superlinear degree
+ * (the three-axis form of the rule above) is at most @f$p@f$, and the nodal
+ * basis is again the inverse Vandermonde at the reference nodes. Those nodes are
+ * distributed by boundary stratum: 8 corners, @f$12(p-1)@f$ edge nodes,
+ * @f$6\,q(p)@f$ face-interior nodes -- each face carries the 2D quadrilateral
+ * serendipity interior, since the trace of the cube space on a face is the
+ * square space -- and a volume interior that is empty until @f$p \ge 6@f$.
+ * Unisolvence follows the same factorization: a function vanishing on every
+ * boundary node vanishes on each face by the quadrilateral result above, hence
+ * is divisible by the cube bubble @f$(1 - r^2)(1 - s^2)(1 - t^2)@f$ with quotient
+ * in @f$P_{p-6}@f$; the volume-interior nodes form a tetrahedral staircase that
+ * is unisolvent for @f$P_{p-6}@f$ by induction over @f$t@f$-layers, so the cube
+ * Vandermonde is nonsingular.
  *
- * Hex8 uses the standard trilinear corner basis
- * @f$(1 \pm r)(1 \pm s)(1 \pm t)/8@f$. Quad8, Hex20, and Wedge15 use fixed
- * monomial spaces whose nodal coefficient tables are generated at construction
- * by inverting the Vandermonde built from their public-order ReferenceNodeLayout
- * nodes; analytical gradients and Hessians are obtained by differentiating those
- * monomials. Because the tables are generated in public node order, evaluation
- * needs no output reordering.
+ * `SerendipityBasis(BasisTopology::Quadrilateral, p)` and
+ * `SerendipityBasis(BasisTopology::Hexahedron, p)` are the arbitrary-order entry
+ * points (@f$p \ge 1@f$; orders below one are rejected). They generate their own
+ * reference nodes in a VTK-consistent stratified order; for @f$p \ge 3@f$ the
+ * interior ordering is an implementation convention rather than a public layout.
+ * The named fixed layouts -- `ElementType::Quad8` (order 2), `Hex8` (order 1),
+ * and `Hex20` (order 2) -- are the same construction at those orders but take
+ * their nodes from ReferenceNodeLayout so they carry the single public node
+ * ordering the solver permutes against. Because the generator reuses the VTK
+ * corner/edge ordering, its order-1 and order-2 hexahedral layouts match the
+ * public Hex8/Hex20 ordering exactly, so the named and topology constructions
+ * produce identical objects. Wedge serendipity remains a single fixed layout
+ * (Wedge15), constructed only from its named ElementType. Solver-default basis
+ * selection is separate: `basis_factory` maps the complete Quad4 layout to the
+ * default linear Lagrange basis and maps Quad8/Hex20 to serendipity unless a
+ * caller explicitly requests a different supported basis.
+ *
+ * Every supported family -- quadrilateral, hexahedral, and Wedge15 -- is built by
+ * inverting the Vandermonde of its monomial space at the public-order reference
+ * nodes; values, gradients, and Hessians are evaluated by differentiating the
+ * monomial vector and applying the inverse-Vandermonde coefficients. Because the
+ * tables are generated in public node order, evaluation needs no output
+ * reordering, and there is no hand-written special case -- the Hex8 basis is the
+ * order-1 instance of the generated hexahedral space, not a separate trilinear
+ * evaluator.
  */
 class SerendipityBasis final : public BasisFunction {
 public:
     /**
-     * @brief Construct an arbitrary-order quadrilateral serendipity basis.
+     * @brief Construct an arbitrary-order quadrilateral or hexahedral serendipity basis.
      *
-     * @details This is the arbitrary-order entry point for the only serendipity
-     * family with a free order: the quadrilateral. The topology carries no
-     * node-count assumption; the monomial space, reference nodes (generated
-     * here), and nodal coefficient table are built from the requested order
-     * (which must be @f$p \ge 1@f$). Hex and wedge serendipity are single
-     * fixed layouts and are not constructed this way -- use the named ElementType
-     * overload (Hex8/Hex20/Wedge15) for them.
+     * @details This is the arbitrary-order entry point for the serendipity
+     * families with a free order: the quadrilateral and the hexahedron. The
+     * topology carries no node-count assumption; the monomial space, reference
+     * nodes (generated here in VTK-consistent stratified order), and nodal
+     * coefficient table are built from the requested order (which must be
+     * @f$p \ge 1@f$). Wedge serendipity is a single fixed layout and is not
+     * constructed this way -- use the named ElementType overload (Wedge15).
      *
-     * @param topology Must be BasisTopology::Quadrilateral.
+     * @param topology Must be BasisTopology::Quadrilateral or BasisTopology::Hexahedron.
      * @param order Polynomial order @f$p \ge 1@f$; orders below 1 are rejected.
      * @throws BasisConfigurationException If @p order is less than 1.
-     * @throws BasisElementCompatibilityException If @p topology is not Quadrilateral.
+     * @throws BasisElementCompatibilityException If @p topology is not Quadrilateral or Hexahedron.
      */
     SerendipityBasis(BasisTopology topology, int order);
 
@@ -120,14 +141,15 @@ class SerendipityBasis final : public BasisFunction {
      * @brief Construct a serendipity basis from a named element layout.
      *
      * @details Convenience overload for the named, fixed serendipity layouts.
-     * Quad8 builds the quadratic quadrilateral serendipity space from its
-     * ReferenceNodeLayout nodes; Hex8 uses the trilinear corner basis directly;
-     * Hex20 and Wedge15 build and invert a Vandermonde over their fixed monomial
-     * spaces. Each layout carries an inferred fixed order (Hex8 to 1; Quad8,
+     * Each layout is the fixed-order instance of its family, built through the
+     * same generated construction as the arbitrary-order path and taking its
+     * nodes from ReferenceNodeLayout: Quad8 is the quadrilateral at order 2, Hex8
+     * and Hex20 are the hexahedron at orders 1 and 2, and Wedge15 is the prism
+     * layout. Each layout carries an inferred fixed order (Hex8 to 1; Quad8,
      * Hex20, and Wedge15 to 2); the requested @p order must equal that inferred
      * order and is never adjusted to fit, so a mismatched request (including
-     * order 0 or negative) is rejected. Arbitrary-order quadrilateral serendipity
-     * is requested through the BasisTopology overload.
+     * order 0 or negative) is rejected. Arbitrary-order quadrilateral and
+     * hexahedral serendipity is requested through the BasisTopology overload.
      *
      * @param type Named serendipity element type (Quad8, Hex8, Hex20, or Wedge15).
      * @param order Requested order; must equal the layout's inferred fixed order
@@ -164,13 +186,14 @@ class SerendipityBasis final : public BasisFunction {
      * satisfies the nodal interpolation property. The named fixed layouts (Quad8,
      * Hex8, Hex20, Wedge15) take their nodes from ReferenceNodeLayout, the public
      * node-ordering source the solver adapter permutes against. Arbitrary-order
-     * quadrilateral serendipity (constructed from BasisTopology::Quadrilateral)
-     * generates its nodes here instead: boundary nodes first and then, for higher
-     * order requests, the selected interior points needed to make the reduced
-     * monomial space unisolvent. That deterministic interior row ordering is an
-     * implementation convention; callers should pair it with basis values from
-     * the same object rather than assume an external mesh ordering contract
-     * beyond the supported Quad8 production layout.
+     * quadrilateral and hexahedral serendipity (constructed from a BasisTopology)
+     * generates its nodes here instead, in VTK-consistent stratified order:
+     * corners and edges first (matching the public Quad8/Hex8/Hex20 ordering at
+     * the named orders), then the face and volume interior points needed to make
+     * the reduced monomial space unisolvent. For @f$p \ge 3@f$ that interior
+     * ordering is an implementation convention; callers should pair it with basis
+     * values from the same object rather than assume an external mesh ordering
+     * contract beyond the supported named production layouts.
      *
      * @return Reference node coordinates, one per basis function.
      */
@@ -179,11 +202,10 @@ class SerendipityBasis final : public BasisFunction {
     /**
      * @brief Evaluate serendipity basis function values at a reference coordinate.
      *
-     * @details For quadrilateral bases, this evaluates the serendipity
-     * monomial vector and multiplies by the inverse Vandermonde matrix to
-     * obtain nodal shape-function values. For Hex8, values are the standard
-     * trilinear corner products. For Hex20 and Wedge15, values are evaluated
-     * from their generated nodal coefficient tables.
+     * @details Every family evaluates the serendipity monomial vector and
+     * multiplies by the generated inverse Vandermonde matrix to obtain nodal
+     * shape-function values; the coefficient table is already in public basis
+     * order, so no output reordering is needed.
      *
      * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
      * @param values Receives one value per basis function.
@@ -194,12 +216,9 @@ class SerendipityBasis final : public BasisFunction {
     /**
      * @brief Evaluate analytical serendipity basis gradients at a reference coordinate.
      *
-     * @details Gradients are derivatives with respect to reference
-     * coordinates. Quadrilateral gradients differentiate the monomial vector
-     * before applying the inverse Vandermonde coefficients. Hex8 gradients are
-     * direct derivatives of the trilinear corner products. Hex20 and Wedge15
-     * gradients are computed by differentiating their generated monomial
-     * expansions.
+     * @details Gradients are derivatives with respect to reference coordinates.
+     * Every family differentiates the monomial vector and applies the same
+     * generated inverse Vandermonde coefficients used for the values.
      *
      * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
      * @param gradients Receives one three-component gradient per basis function.
@@ -210,12 +229,10 @@ class SerendipityBasis final : public BasisFunction {
     /**
      * @brief Evaluate analytical serendipity basis Hessians at a reference coordinate.
      *
-     * @details Hessians are second derivatives in reference coordinates and
-     * are stored as 3-by-3 matrices. Quadrilateral Hessians use second
-     * derivatives of the monomial vector and inverse Vandermonde coefficients.
-     * Hex8 Hessians are computed directly from the trilinear corner products.
-     * Hex20 and Wedge15 Hessians are computed by differentiating their
-     * generated monomial expansions twice.
+     * @details Hessians are second derivatives in reference coordinates and are
+     * stored as 3-by-3 matrices. Every family uses the second derivatives of the
+     * monomial vector together with the same generated inverse Vandermonde
+     * coefficients used for the values.
      *
      * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
      * @param hessians Receives one 3-by-3 Hessian per basis function.
@@ -280,8 +297,14 @@ class SerendipityBasis final : public BasisFunction {
     // its nodes from ReferenceNodeLayout; the arbitrary-order topology path
     // generates them.
     void init_quadrilateral(int order, bool nodes_from_reference_layout);
-    // Build a fixed named volume serendipity layout (Hex20 or Wedge15) from its
-    // tabulated monomial space and ReferenceNodeLayout nodes.
+    // Build the hexahedral serendipity monomial space, reference nodes, and nodal
+    // coefficient table for the given order. The arbitrary-order topology path
+    // generates VTK-consistent nodes; the named Hex8 (order 1) and Hex20 (order 2)
+    // layouts take their public-order nodes from ReferenceNodeLayout.
+    void init_hexahedron(int order, bool nodes_from_reference_layout);
+    // Build the Wedge15 serendipity layout from its tabulated monomial space and
+    // ReferenceNodeLayout nodes. Hexahedral serendipity (Hex8/Hex20) is generated
+    // by init_hexahedron, so the prism is the only remaining fixed named layout.
     void init_fixed_named(ElementType type);
 
     void evaluate_all_to(const math::Vector<double, 3>& xi,
diff --git a/tests/unitTests/FE/Basis/test_BasisHessians.cpp b/tests/unitTests/FE/Basis/test_BasisHessians.cpp
index dbbc33f9b..cce73a39a 100644
--- a/tests/unitTests/FE/Basis/test_BasisHessians.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisHessians.cpp
@@ -366,6 +366,17 @@ TEST(BasisGradients, SerendipityFamiliesMatchNumericalGradients) {
             basis, serendipity_sample_points(BasisTopology::Quadrilateral), c.tol);
     }
 
+    // Arbitrary-order hexahedral serendipity (topology path).
+    const struct HexCase { int order; double tol; } hex_cases[] = {
+        {1, double(1e-8)}, {2, double(1e-7)}, {3, double(5e-7)},
+        {4, double(1e-6)}, {5, double(5e-6)},
+    };
+    for (const auto& c : hex_cases) {
+        SerendipityBasis basis(BasisTopology::Hexahedron, c.order);
+        expect_gradients_match_numerical(
+            basis, serendipity_sample_points(BasisTopology::Hexahedron), c.tol);
+    }
+
     // Named fixed serendipity layouts.
     const struct NamedCase { ElementType type; int order; double tol; } named_cases[] = {
         {ElementType::Quad8, 2, double(1e-7)},
@@ -403,6 +414,17 @@ TEST(BasisHessians, SerendipityFamiliesMatchNumericalHessians) {
             basis, serendipity_sample_points(BasisTopology::Quadrilateral), c.tol);
     }
 
+    // Arbitrary-order hexahedral serendipity (topology path).
+    const struct HexCase { int order; double tol; } hex_cases[] = {
+        {1, double(1e-6)}, {2, double(1e-6)}, {3, double(5e-6)},
+        {4, double(1e-5)}, {5, double(5e-5)},
+    };
+    for (const auto& c : hex_cases) {
+        SerendipityBasis basis(BasisTopology::Hexahedron, c.order);
+        expect_hessians_match_numerical(
+            basis, serendipity_sample_points(BasisTopology::Hexahedron), c.tol);
+    }
+
     // Named fixed serendipity layouts.
     const struct NamedCase { ElementType type; int order; double tol; } named_cases[] = {
         {ElementType::Quad8, 2, double(1e-6)},
diff --git a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
index 2edc6618d..b6cfd06ab 100644
--- a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
@@ -240,6 +240,46 @@ std::vector<std::array<int, 3>> hex20_serendipity_exponents_for_test() {
     return exponents;
 }
 
+// Arbitrary-order hexahedral serendipity verification
+std::vector<std::array<int, 3>> hex_serendipity_exponents_for_test(int order) {
+    std::vector<std::array<int, 3>> exponents;
+    for (int az = 0; az <= order; ++az) {
+        for (int ay = 0; ay <= order; ++ay) {
+            for (int ax = 0; ax <= order; ++ax) {
+                if (superlinear_degree_3d_for_test(ax, ay, az) <= order) {
+                    exponents.push_back({ax, ay, az});
+                }
+            }
+        }
+    }
+    return exponents;
+}
+
+std::size_t quad_serendipity_interior_count_for_test(int order) {
+    if (order < 4) {
+        return 0u;
+    }
+    const auto m = static_cast<std::size_t>(order - 4);
+    return (m + 1u) * (m + 2u) / 2u;
+}
+
+std::size_t hex_serendipity_volume_interior_count_for_test(int order) {
+    if (order < 6) {
+        return 0u;
+    }
+    const auto m = static_cast<std::size_t>(order - 6);
+    return (m + 1u) * (m + 2u) * (m + 3u) / 6u;
+}
+
+// dim S_p from the node strata: 8 corners, 12 (p - 1) edge nodes, 6 q(p) face
+// interiors, and the P_{p-6} volume residual.
+std::size_t expected_hex_serendipity_size(int order) {
+    const auto p = static_cast<std::size_t>(order);
+    return 8u + 12u * (p - 1u) +
+           6u * quad_serendipity_interior_count_for_test(order) +
+           hex_serendipity_volume_interior_count_for_test(order);
+}
+
 // Wedge15 serendipity span: triangle monomials (ax, ay) with ax + ay <= 2,
 // tensored with the through-axis. Linear triangle monomials (ax + ay <= 1) carry
 // t-degree up to two; quadratic triangle monomials (ax + ay == 2) carry t-degree
@@ -405,20 +445,25 @@ TEST(SerendipityBasis, RejectsUnsupportedSerendipityAliases) {
 // Topology construction is the arbitrary-order entry point and exists only for
 // the quadrilateral, the single serendipity family with a free order. Hex and
 // wedge serendipity are fixed layouts requested through their named ElementType.
-TEST(SerendipityBasis, TopologyConstructionOnlySupportsQuadrilateral) {
+TEST(SerendipityBasis, TopologyConstructionSupportsQuadrilateralAndHexahedron) {
     EXPECT_NO_THROW((void)SerendipityBasis(BasisTopology::Quadrilateral, 3));
-    EXPECT_THROW(SerendipityBasis(BasisTopology::Hexahedron, 2),
-                 BasisElementCompatibilityException);
+    EXPECT_NO_THROW((void)SerendipityBasis(BasisTopology::Hexahedron, 3));
     EXPECT_THROW(SerendipityBasis(BasisTopology::Wedge, 2),
                  BasisElementCompatibilityException);
     EXPECT_THROW(SerendipityBasis(BasisTopology::Triangle, 2),
                  BasisElementCompatibilityException);
 
-    // Topology and named construction agree at the production order.
-    SerendipityBasis topo(BasisTopology::Quadrilateral, 2);
-    EXPECT_EQ(topo.topology(), BasisTopology::Quadrilateral);
-    EXPECT_EQ(topo.order(), 2);
-    EXPECT_EQ(topo.element_type(), ElementType::Quad8);
+    // Topology and named construction agree at the production order for both the
+    // quadrilateral and the hexahedron.
+    SerendipityBasis quad(BasisTopology::Quadrilateral, 2);
+    EXPECT_EQ(quad.topology(), BasisTopology::Quadrilateral);
+    EXPECT_EQ(quad.order(), 2);
+    EXPECT_EQ(quad.element_type(), ElementType::Quad8);
+
+    SerendipityBasis hex(BasisTopology::Hexahedron, 2);
+    EXPECT_EQ(hex.topology(), BasisTopology::Hexahedron);
+    EXPECT_EQ(hex.order(), 2);
+    EXPECT_EQ(hex.element_type(), ElementType::Hex20);
 }
 
 TEST(SerendipityBasis, QuadrilateralRejectsOrdersBelowOne) {
@@ -606,9 +651,10 @@ TEST(SerendipityBasis, QuadrilateralVandermondeHasFullRankThroughOrderTen) {
     }
 }
 
-// SerendipityBasis(Hex8, 1) is the only route to the hand-written trilinear
-// corner evaluator (values, gradients, and Hessians); it must agree with the
-// trilinear Lagrange basis on the same element.
+// Hex8 serendipity is the order-1 instance of the generated hexahedral
+// serendipity space (the eight multilinear monomials). It must still reproduce
+// the trilinear Lagrange basis -- values, gradients, and Hessians -- which guards
+// the generated order-1 coefficient table against the closed-form trilinear basis.
 TEST(SerendipityBasis, TrilinearHexMatchesLagrangeHex8) {
     SerendipityBasis serendipity(ElementType::Hex8, 1);
     LagrangeBasis lagrange(ElementType::Hex8, 1);
@@ -796,3 +842,156 @@ TEST(SerendipityBasis, Wedge15ReferenceNodesMatchIndependentConstruction) {
     SerendipityBasis basis(ElementType::Wedge15, 2);
     expect_nodes_near(basis.nodes(), wedge15_reference_nodes_for_test(), double(1e-14));
 }
+
+// --- Arbitrary-order hexahedral serendipity (BasisTopology::Hexahedron) -------
+
+// dim S_p of the cube serendipity space for p = 1..6 (Hex8 = 8, Hex20 = 20),
+// checked against both the re-derived monomial enumeration and the node-strata
+// decomposition.
+TEST(SerendipityBasis, HexahedralSerendipitySpaceHasExpectedDimensions) {
+    const std::array<std::size_t, 7> expected = {0u, 8u, 20u, 32u, 50u, 74u, 105u};
+    for (int order = 1; order <= 6; ++order) {
+        const auto exponents = hex_serendipity_exponents_for_test(order);
+        const auto p = static_cast<std::size_t>(order);
+        EXPECT_EQ(exponents.size(), expected[p]) << "order=" << order;
+        EXPECT_EQ(expected_hex_serendipity_size(order), expected[p]) << "order=" << order;
+        for (const auto& e : exponents) {
+            EXPECT_LE(superlinear_degree_3d_for_test(e[0], e[1], e[2]), order);
+            for (int d = 0; d < 3; ++d) {
+                EXPECT_GE(e[d], 0);
+                EXPECT_LE(e[d], order);
+            }
+        }
+    }
+
+    // The order-2 hex serendipity span is exactly the Hex20 span (as a set).
+    auto order_two = hex_serendipity_exponents_for_test(2);
+    auto hex20 = hex20_serendipity_exponents_for_test();
+    std::sort(order_two.begin(), order_two.end());
+    std::sort(hex20.begin(), hex20.end());
+    EXPECT_EQ(order_two, hex20);
+}
+
+// VTK conformance: the generated arbitrary-order layout reproduces the public
+// Hex8 (order 1) and Hex20 (order 2) node ordering coordinate-for-coordinate.
+TEST(SerendipityBasis, HexahedralTopologyNodesMatchPublicHex8AndHex20Layouts) {
+    SerendipityBasis hex8(BasisTopology::Hexahedron, 1);
+    EXPECT_EQ(hex8.size(), 8u);
+    expect_nodes_near(hex8.nodes(),
+                      ReferenceNodeLayout::node_coords(ElementType::Hex8),
+                      double(1e-14));
+
+    SerendipityBasis hex20(BasisTopology::Hexahedron, 2);
+    EXPECT_EQ(hex20.size(), 20u);
+    expect_nodes_near(hex20.nodes(),
+                      ReferenceNodeLayout::node_coords(ElementType::Hex20),
+                      double(1e-14));
+}
+
+// The generated node set is unisolvent for the hex serendipity span at every
+// supported order: the Vandermonde of the re-derived monomials at the generated
+// nodes has full rank.
+TEST(SerendipityBasis, HexahedralSerendipityVandermondeHasFullRankThroughOrderSix) {
+    for (int order = 1; order <= 6; ++order) {
+        SerendipityBasis basis(BasisTopology::Hexahedron, order);
+        const auto exponents = hex_serendipity_exponents_for_test(order);
+        const std::size_t n = basis.size();
+        ASSERT_EQ(exponents.size(), n) << "order=" << order;
+        const auto vandermonde = vandermonde_3d_for_test(basis.nodes(), exponents);
+        ASSERT_EQ(vandermonde.size(), n * n) << "order=" << order;
+        EXPECT_EQ(math::dense_matrix_rank(vandermonde, n, n), n) << "order=" << order;
+    }
+}
+
+TEST(SerendipityBasis, HexahedralTopologyIsNodalAndPartitionsUnity) {
+    const std::vector<math::Vector<double, 3>> points = {
+        {double(0.2), double(-0.1), double(0.3)},
+        {double(-0.35), double(0.25), double(-0.15)},
+    };
+    for (int order = 1; order <= 5; ++order) {
+        SerendipityBasis basis(BasisTopology::Hexahedron, order);
+        EXPECT_EQ(basis.dimension(), 3);
+        EXPECT_EQ(basis.order(), order);
+        EXPECT_EQ(basis.size(), expected_hex_serendipity_size(order)) << "order=" << order;
+        ASSERT_EQ(basis.nodes().size(), basis.size());
+
+        expect_nodal_delta(basis, basis.nodes(), double(1e-7));
+        for (const auto& xi : points) {
+            expect_partition_of_unity(basis, xi, double(1e-7));
+        }
+    }
+}
+
+// Non-nodal polynomial reproduction across orders: the basis reproduces every
+// monomial in its span at interior points, pinning the production monomial space
+// against the re-derived verification.
+TEST(SerendipityBasis, HexahedralTopologyReproducesEverySerendipityMonomial) {
+    const std::vector<math::Vector<double, 3>> points = {
+        {double(0.2), double(-0.1), double(0.3)},
+        {double(-0.35), double(0.25), double(-0.15)},
+        {double(0.11), double(0.23), double(-0.42)},
+    };
+    for (int order = 1; order <= 5; ++order) {
+        SerendipityBasis basis(BasisTopology::Hexahedron, order);
+        const auto exponents = hex_serendipity_exponents_for_test(order);
+        ASSERT_EQ(exponents.size(), basis.size()) << "order=" << order;
+        for (const auto& exponent : exponents) {
+            for (const auto& xi : points) {
+                const double interpolated = interpolate_nodal_function(
+                    basis, xi,
+                    [&exponent](const math::Vector<double, 3>& node) {
+                        return monomial_value_3d_for_test(node, exponent);
+                    });
+                EXPECT_NEAR(interpolated, monomial_value_3d_for_test(xi, exponent),
+                            double(1e-6))
+                    << "order=" << order << " ax=" << exponent[0]
+                    << " ay=" << exponent[1] << " az=" << exponent[2];
+            }
+        }
+    }
+}
+
+
+TEST(SerendipityBasis, NamedHexLayoutsMatchTopologyConstruction) {
+    const struct Case { ElementType type; int order; } cases[] = {
+        {ElementType::Hex8, 1},
+        {ElementType::Hex20, 2},
+    };
+    const std::vector<math::Vector<double, 3>> points = {
+        {double(0.2), double(-0.1), double(0.3)},
+        {double(-0.35), double(0.25), double(-0.15)},
+        {double(0.11), double(0.23), double(-0.42)},
+    };
+    for (const auto& c : cases) {
+        SerendipityBasis named(c.type, c.order);
+        SerendipityBasis topo(BasisTopology::Hexahedron, c.order);
+
+        ASSERT_EQ(named.size(), topo.size());
+        ASSERT_EQ(named.nodes().size(), topo.nodes().size());
+        for (std::size_t i = 0; i < named.nodes().size(); ++i) {
+            for (std::size_t d = 0; d < 3u; ++d) {
+                EXPECT_EQ(named.nodes()[i][d], topo.nodes()[i][d])
+                    << "node=" << i << " d=" << d;
+            }
+        }
+
+        for (const auto& xi : points) {
+            std::vector<double> nv, tv;
+            std::vector<Gradient> ng, tg;
+            std::vector<Hessian> nh, th;
+            named.evaluate_all(xi, nv, ng, nh);
+            topo.evaluate_all(xi, tv, tg, th);
+            ASSERT_EQ(nv.size(), tv.size());
+            for (std::size_t i = 0; i < nv.size(); ++i) {
+                EXPECT_EQ(nv[i], tv[i]) << "value i=" << i;
+                for (std::size_t d = 0; d < 3u; ++d) {
+                    EXPECT_EQ(ng[i][d], tg[i][d]) << "grad i=" << i << " d=" << d;
+                    for (std::size_t e = 0; e < 3u; ++e) {
+                        EXPECT_EQ(nh[i](d, e), th[i](d, e))
+                            << "hess i=" << i << " (" << d << "," << e << ")";
+                    }
+                }
+            }
+        }
+    }
+}

From 0a4788e6b97adae55c3854bf42cb00b8f561a8e3 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 24 Jun 2026 15:02:59 -0700
Subject: [PATCH 62/91] fixing point lagrange order to zero since nonzero
 support does not make sense and should not be silently changed

---
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp     | 3 +++
 Code/Source/solver/FE/Basis/LagrangeBasis.h       | 3 ++-
 tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp | 3 +++
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index ae3d33829..ee36db2ea 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -261,6 +261,9 @@ LagrangeBasis::LagrangeBasis(BasisTopology topology, int order)
                                                      "LagrangeBasis: unknown reference topology");
     svmp::throw_if<BasisConfigurationException>(order_ < 0, SVMP_HERE,
                                               "LagrangeBasis requires non-negative polynomial order");
+    svmp::throw_if<BasisConfigurationException>(
+        topology_ == BasisTopology::Point && order_ != 0, SVMP_HERE,
+        "LagrangeBasis: Point topology supports order 0 only");
     dimension_ = topology_dimension(topology_);
     init_nodes();
 }
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.h b/Code/Source/solver/FE/Basis/LagrangeBasis.h
index 79e4a25b0..0e2588364 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.h
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.h
@@ -93,7 +93,8 @@ class LagrangeBasis final : public BasisFunction {
      *
      * @param topology Reference topology; Point through the volume topologies.
      * @param order Polynomial order; must be non-negative. Point is order 0.
-     * @throws BasisConfigurationException If the order is negative.
+     * @throws BasisConfigurationException If the order is negative, or if Point
+     *         is requested with a nonzero order.
      * @throws BasisElementCompatibilityException If the topology is Unknown.
      */
     LagrangeBasis(BasisTopology topology, int order);
diff --git a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
index 6ddb7917a..beb3028bf 100644
--- a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
@@ -178,6 +178,9 @@ TEST(BasisErrorPaths, LagrangeInvalidRequestsThrowBasisExceptions) {
                  BasisConfigurationException);
     EXPECT_THROW(LagrangeBasis(ElementType::Quad8, 2),
                  BasisElementCompatibilityException);
+    EXPECT_NO_THROW((void)LagrangeBasis(BasisTopology::Point, 0));
+    EXPECT_THROW((void)LagrangeBasis(BasisTopology::Point, 1),
+                 BasisConfigurationException);
 }
 
 // A named Lagrange element layout fixes its polynomial order: the matching order

From 142419d9d273ae00c30be3bebc509ede6d79997a Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 24 Jun 2026 15:21:55 -0700
Subject: [PATCH 63/91] two arbitrary-order generators now source their
 corner+edge skeleton from the single ReferenceNodeLayout generator

---
 .../solver/FE/Basis/SerendipityBasis.cpp      | 80 +++++++------------
 .../FE/Basis/test_SerendipityBasis.cpp        | 31 +++++++
 2 files changed, 59 insertions(+), 52 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index 1798920e0..5e6955e84 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -76,30 +76,23 @@ void append_quad_serendipity_interior_nodes(std::vector<Vec3>& nodes, int order)
 }
 
 std::vector<Vec3> quad_serendipity_nodes(int order, std::size_t total_size) {
-    std::vector<Vec3> nodes;
     if (order <= 0) {
-        return nodes;
+        return {};
     }
 
-    const double inv_order = double(1) / double(order);
-
-    nodes.push_back(Vec3{double(-1), double(-1), double(0)});
-    nodes.push_back(Vec3{double(1),  double(-1), double(0)});
-    nodes.push_back(Vec3{double(1),  double(1),  double(0)});
-    nodes.push_back(Vec3{double(-1), double(1),  double(0)});
-
-    for (int i = 1; i < order; ++i) {
-        nodes.push_back(Vec3{double(-1) + double(2 * i) * inv_order, double(-1), double(0)});
-    }
-    for (int i = 1; i < order; ++i) {
-        nodes.push_back(Vec3{double(1), double(-1) + double(2 * i) * inv_order, double(0)});
-    }
-    for (int i = 1; i < order; ++i) {
-        nodes.push_back(Vec3{double(1) - double(2 * i) * inv_order, double(1), double(0)});
-    }
-    for (int i = 1; i < order; ++i) {
-        nodes.push_back(Vec3{double(-1), double(1) - double(2 * i) * inv_order, double(0)});
-    }
+    // The corner+edge skeleton is the leading prefix of the complete quadrilateral
+    // Lagrange layout of the same order: 4 corners followed by 4(order-1) edge
+    // nodes, in the same VTK boundary order. Source it from the single
+    // ReferenceNodeLayout generator and drop that layout's interior, so the
+    // reference-cell corner/edge geometry has one owner; only the reduced interior
+    // appended below is serendipity-specific.
+    std::vector<Vec3> nodes =
+        ReferenceNodeLayout::get_lagrange_node_coords(ElementType::Quad4, order);
+    const std::size_t boundary_count = static_cast<std::size_t>(4 * order);
+    svmp::throw_if<BasisConstructionException>(
+        boundary_count > nodes.size(), SVMP_HERE,
+        "SerendipityBasis: quadrilateral skeleton exceeds the complete Lagrange layout");
+    nodes.resize(boundary_count);
 
     svmp::throw_if<BasisConstructionException>(
         nodes.size() > total_size, SVMP_HERE,
@@ -219,38 +212,21 @@ void append_hex_serendipity_volume_interior_nodes(std::vector<Vec3>& nodes, int
 // Generate the hexahedral serendipity reference nodes in the generalized
 // right-hand-rule / VTK-consistent stratified order: 8 corners, then 12 edges in
 // VTK quadratic-hex edge order, then the 6 face interiors in VTK face order, then
-// the volume interior. The corner and edge strata reuse the VTK ordering of
-// generate_hex_nodes verbatim, so at order 1 (corners only) and order 2 (corners
-// plus edge midpoints) the layout is exactly the public Hex8 / Hex20 ordering;
-// for higher order the reduced face/volume sets are this module's own convention.
+// the volume interior. The corner and edge strata are taken directly from the
+// complete hexahedral Lagrange layout (generate_hex_nodes, via ReferenceNodeLayout),
+// so they share that single generator's VTK ordering: at order 1 (corners only)
+// and order 2 (corners plus edge midpoints) the layout is exactly the public
+// Hex8 / Hex20 ordering, and for higher order the reduced face/volume sets are
+// this module's own convention.
 std::vector<Vec3> hex_serendipity_nodes(int order, std::size_t total_size) {
-    static constexpr double corner_coords[8][3] = {
-        {-1, -1, -1}, {1, -1, -1}, {1, 1, -1}, {-1, 1, -1},
-        {-1, -1, 1},  {1, -1, 1},  {1, 1, 1},  {-1, 1, 1},
-    };
-    static constexpr int edges[12][2] = {
-        {0, 1}, {1, 2}, {2, 3}, {3, 0},
-        {4, 5}, {5, 6}, {6, 7}, {7, 4},
-        {0, 4}, {1, 5}, {2, 6}, {3, 7},
-    };
-
-    std::vector<Vec3> nodes;
-    nodes.reserve(total_size);
-
-    for (const auto& c : corner_coords) {
-        nodes.push_back(Vec3{c[0], c[1], c[2]});
-    }
-
-    for (const auto& edge : edges) {
-        const auto& ca = corner_coords[edge[0]];
-        const auto& cb = corner_coords[edge[1]];
-        const Vec3 a{ca[0], ca[1], ca[2]};
-        const Vec3 b{cb[0], cb[1], cb[2]};
-        for (int m = 1; m < order; ++m) {
-            const double t = double(m) / double(order);
-            nodes.push_back(a * (double(1) - t) + b * t);
-        }
-    }
+    std::vector<Vec3> nodes =
+        ReferenceNodeLayout::get_lagrange_node_coords(ElementType::Hex8, order);
+    const std::size_t skeleton_count =
+        8u + 12u * static_cast<std::size_t>(order - 1);
+    svmp::throw_if<BasisConstructionException>(
+        skeleton_count > nodes.size(), SVMP_HERE,
+        "SerendipityBasis: hexahedral skeleton exceeds the complete Lagrange layout");
+    nodes.resize(skeleton_count);
 
     const std::size_t skeleton = nodes.size();
     append_hex_serendipity_face_interior_nodes(nodes, order);
diff --git a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
index b6cfd06ab..d62b8c441 100644
--- a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
@@ -888,6 +888,37 @@ TEST(SerendipityBasis, HexahedralTopologyNodesMatchPublicHex8AndHex20Layouts) {
                       double(1e-14));
 }
 
+TEST(SerendipityBasis, SkeletonMatchesCompleteLagrangePrefix) {
+    for (int order = 1; order <= 8; ++order) {
+        SerendipityBasis quad(BasisTopology::Quadrilateral, order);
+        const auto quad_complete =
+            ReferenceNodeLayout::get_lagrange_node_coords(ElementType::Quad4, order);
+        const std::size_t quad_skeleton = static_cast<std::size_t>(4 * order);
+        ASSERT_LE(quad_skeleton, quad.nodes().size()) << "quad order=" << order;
+        ASSERT_LE(quad_skeleton, quad_complete.size()) << "quad order=" << order;
+        for (std::size_t i = 0; i < quad_skeleton; ++i) {
+            for (std::size_t d = 0; d < 3u; ++d) {
+                EXPECT_EQ(quad.nodes()[i][d], quad_complete[i][d])
+                    << "quad order=" << order << " node=" << i << " d=" << d;
+            }
+        }
+
+        SerendipityBasis hex(BasisTopology::Hexahedron, order);
+        const auto hex_complete =
+            ReferenceNodeLayout::get_lagrange_node_coords(ElementType::Hex8, order);
+        const std::size_t hex_skeleton =
+            8u + 12u * static_cast<std::size_t>(order - 1);
+        ASSERT_LE(hex_skeleton, hex.nodes().size()) << "hex order=" << order;
+        ASSERT_LE(hex_skeleton, hex_complete.size()) << "hex order=" << order;
+        for (std::size_t i = 0; i < hex_skeleton; ++i) {
+            for (std::size_t d = 0; d < 3u; ++d) {
+                EXPECT_EQ(hex.nodes()[i][d], hex_complete[i][d])
+                    << "hex order=" << order << " node=" << i << " d=" << d;
+            }
+        }
+    }
+}
+
 // The generated node set is unisolvent for the hex serendipity span at every
 // supported order: the Vandermonde of the re-derived monomials at the generated
 // nodes has full rank.

From f0c7f48ada8eddd445b70ae30af38e65c4de4dab Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 24 Jun 2026 15:40:45 -0700
Subject: [PATCH 64/91] Added topology-based BasisRequest factory creation for
 arbitrary-order bases while preserving named-element request behavior and
 covering both paths with FE basis tests

---
 Code/Source/solver/FE/Basis/BasisFactory.cpp  | 23 ++++++++
 Code/Source/solver/FE/Basis/BasisFactory.h    | 19 +++++++
 Code/Source/solver/FE/Basis/BasisFunction.h   |  2 +-
 .../unitTests/FE/Basis/test_LagrangeBasis.cpp | 55 +++++++++++++++++++
 4 files changed, 98 insertions(+), 1 deletion(-)

diff --git a/Code/Source/solver/FE/Basis/BasisFactory.cpp b/Code/Source/solver/FE/Basis/BasisFactory.cpp
index 22ae7183e..dddf549d9 100644
--- a/Code/Source/solver/FE/Basis/BasisFactory.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFactory.cpp
@@ -13,6 +13,11 @@ namespace basis {
 
 namespace {
 
+enum class RequestTarget {
+    NamedElement,
+    Topology,
+};
+
 int require_basis_order(const BasisRequest& req,
                         const char* missing_message,
                         const char* negative_message) {
@@ -23,6 +28,18 @@ int require_basis_order(const BasisRequest& req,
     return *req.order;
 }
 
+RequestTarget require_single_request_target(const BasisRequest& req) {
+    const bool has_named_element = req.element_type != ElementType::Unknown;
+    const bool has_topology = req.topology != BasisTopology::Unknown;
+    svmp::throw_if<BasisConfigurationException>(
+        !has_named_element && !has_topology, SVMP_HERE,
+        "BasisFactory: request must specify either a named element_type or a reference topology");
+    svmp::throw_if<BasisConfigurationException>(
+        has_named_element && has_topology, SVMP_HERE,
+        "BasisFactory: request must specify element_type or topology, not both");
+    return has_topology ? RequestTarget::Topology : RequestTarget::NamedElement;
+}
+
 void require_scalar_c0_request(const BasisRequest& req) {
     svmp::throw_if<BasisConfigurationException>(
         req.field_type != FieldType::Scalar, SVMP_HERE,
@@ -38,6 +55,9 @@ std::shared_ptr<BasisFunction> create_lagrange(const BasisRequest& req) {
         req,
         "BasisFactory: Lagrange creation requires an explicit order",
         "BasisFactory: Lagrange requires non-negative order");
+    if (require_single_request_target(req) == RequestTarget::Topology) {
+        return std::make_shared<LagrangeBasis>(req.topology, order);
+    }
     return std::make_shared<LagrangeBasis>(req.element_type, order);
 }
 
@@ -47,6 +67,9 @@ std::shared_ptr<BasisFunction> create_serendipity(const BasisRequest& req) {
         req,
         "BasisFactory: Serendipity creation requires an explicit order",
         "BasisFactory: Serendipity requires non-negative order");
+    if (require_single_request_target(req) == RequestTarget::Topology) {
+        return std::make_shared<SerendipityBasis>(req.topology, order);
+    }
     return std::make_shared<SerendipityBasis>(req.element_type, order);
 }
 
diff --git a/Code/Source/solver/FE/Basis/BasisFactory.h b/Code/Source/solver/FE/Basis/BasisFactory.h
index 252a76226..0bc9c5c9e 100644
--- a/Code/Source/solver/FE/Basis/BasisFactory.h
+++ b/Code/Source/solver/FE/Basis/BasisFactory.h
@@ -20,6 +20,8 @@ namespace FE {
 namespace basis {
 
 struct BasisRequest {
+    // Named mesh element layout for default/mesh-compatible bases. Leave Unknown
+    // when requesting an arbitrary-order basis by reference topology.
     ElementType element_type{ElementType::Unknown};
     BasisType basis_type{BasisType::Lagrange};
     std::optional<int> order{};
@@ -32,10 +34,27 @@ struct BasisRequest {
     std::vector<std::vector<double>> axis_weights{};
     std::vector<int> tensor_extents{};
     std::string custom_id{};
+    // Reference topology for arbitrary-order bases. This field is intentionally
+    // last so existing aggregate initializers for named elements keep their
+    // positional meaning.
+    BasisTopology topology{BasisTopology::Unknown};
 };
 
 namespace basis_factory {
 
+/**
+ * @brief Create a basis from a runtime request.
+ *
+ * @details A request must identify exactly one construction target: set
+ * BasisRequest::element_type for a named mesh-node layout, or set
+ * BasisRequest::topology for an arbitrary-order reference-topology basis.
+ * Setting neither target, or setting both, is rejected. Named element requests
+ * keep the element's fixed polynomial order contract; topology requests are the
+ * arbitrary-order path.
+ *
+ * @param req Basis family, target, and order request.
+ * @return Shared basis instance.
+ */
 [[nodiscard]] std::shared_ptr<BasisFunction> create(const BasisRequest& req);
 
 /**
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index c2c61ed6e..584b10785 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -93,7 +93,7 @@
  *   enrichment, and convergence studies may use different families or orders
  *   for different fields on the same mesh topology. Those bases should be
  *   requested explicitly with basis_factory::create() and a BasisRequest
- *   naming the desired family and order.
+ *   naming the desired family, topology, and order.
  * - **Evaluation points come from the caller.** Quadrature rules, probe
  *   points, interpolation targets, and error-sampling locations are outside
  *   this module. The basis only evaluates at the reference coordinates it is
diff --git a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
index e2099dea4..d971e4b2c 100644
--- a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
@@ -738,15 +738,70 @@ TEST(LagrangeBasis, FactoryCreatesReducedScalarBasisFamilies) {
                      BasisRequest{ElementType::Hex27, BasisType::Lagrange, 1}),
                  BasisConfigurationException);
 
+    BasisRequest arbitrary_lagrange;
+    arbitrary_lagrange.basis_type = BasisType::Lagrange;
+    arbitrary_lagrange.order = 5;
+    arbitrary_lagrange.topology = BasisTopology::Hexahedron;
+    auto high_order_lagrange = basis_factory::create(arbitrary_lagrange);
+    ASSERT_NE(high_order_lagrange, nullptr);
+    EXPECT_EQ(high_order_lagrange->basis_type(), BasisType::Lagrange);
+    EXPECT_EQ(high_order_lagrange->topology(), BasisTopology::Hexahedron);
+    EXPECT_EQ(high_order_lagrange->element_type(), ElementType::Unknown);
+    EXPECT_EQ(high_order_lagrange->order(), 5);
+    EXPECT_EQ(high_order_lagrange->size(), 216u);
+
     auto serendipity =
         basis_factory::create(BasisRequest{ElementType::Quad8, BasisType::Serendipity, 2});
     ASSERT_NE(serendipity, nullptr);
     EXPECT_EQ(serendipity->basis_type(), BasisType::Serendipity);
 
+    BasisRequest arbitrary_quad_serendipity;
+    arbitrary_quad_serendipity.basis_type = BasisType::Serendipity;
+    arbitrary_quad_serendipity.order = 4;
+    arbitrary_quad_serendipity.topology = BasisTopology::Quadrilateral;
+    auto high_order_quad_serendipity = basis_factory::create(arbitrary_quad_serendipity);
+    ASSERT_NE(high_order_quad_serendipity, nullptr);
+    EXPECT_EQ(high_order_quad_serendipity->basis_type(), BasisType::Serendipity);
+    EXPECT_EQ(high_order_quad_serendipity->topology(), BasisTopology::Quadrilateral);
+    EXPECT_EQ(high_order_quad_serendipity->element_type(), ElementType::Unknown);
+    EXPECT_EQ(high_order_quad_serendipity->order(), 4);
+    EXPECT_EQ(high_order_quad_serendipity->size(), 17u);
+
+    BasisRequest arbitrary_hex_serendipity;
+    arbitrary_hex_serendipity.basis_type = BasisType::Serendipity;
+    arbitrary_hex_serendipity.order = 3;
+    arbitrary_hex_serendipity.topology = BasisTopology::Hexahedron;
+    auto high_order_hex_serendipity = basis_factory::create(arbitrary_hex_serendipity);
+    ASSERT_NE(high_order_hex_serendipity, nullptr);
+    EXPECT_EQ(high_order_hex_serendipity->basis_type(), BasisType::Serendipity);
+    EXPECT_EQ(high_order_hex_serendipity->topology(), BasisTopology::Hexahedron);
+    EXPECT_EQ(high_order_hex_serendipity->element_type(), ElementType::Unknown);
+    EXPECT_EQ(high_order_hex_serendipity->order(), 3);
+    EXPECT_EQ(high_order_hex_serendipity->size(), 32u);
+
     EXPECT_THROW((void)basis_factory::create(
                      BasisRequest{ElementType::Pyramid5, BasisType::Lagrange, 1}),
                  BasisElementCompatibilityException);
     EXPECT_THROW((void)basis_factory::create(
                      BasisRequest{ElementType::Pyramid13, BasisType::Serendipity, 2}),
                  BasisElementCompatibilityException);
+
+    BasisRequest ambiguous;
+    ambiguous.element_type = ElementType::Hex27;
+    ambiguous.basis_type = BasisType::Lagrange;
+    ambiguous.order = 2;
+    ambiguous.topology = BasisTopology::Hexahedron;
+    EXPECT_THROW((void)basis_factory::create(ambiguous), BasisConfigurationException);
+
+    BasisRequest missing_target;
+    missing_target.basis_type = BasisType::Lagrange;
+    missing_target.order = 2;
+    EXPECT_THROW((void)basis_factory::create(missing_target), BasisConfigurationException);
+
+    BasisRequest unsupported_serendipity_topology;
+    unsupported_serendipity_topology.basis_type = BasisType::Serendipity;
+    unsupported_serendipity_topology.order = 2;
+    unsupported_serendipity_topology.topology = BasisTopology::Wedge;
+    EXPECT_THROW((void)basis_factory::create(unsupported_serendipity_topology),
+                 BasisElementCompatibilityException);
 }

From fa3b753a48246681c9137c3161aa607c538c76de Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 24 Jun 2026 16:36:36 -0700
Subject: [PATCH 65/91] arbitrary-order quadrilateral and hexahedral
 serendipity bases (and the shared tensor-Lagrange node distribution)
 well-conditioned at high order

---
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp |  10 +-
 Code/Source/solver/FE/Basis/LagrangeBasis.h   |   2 +-
 .../FE/Basis/NodeOrderingConventions.cpp      |  99 ++++++-
 .../solver/FE/Basis/NodeOrderingConventions.h |  24 +-
 .../solver/FE/Basis/SerendipityBasis.cpp      | 267 +++++++++++++-----
 .../Source/solver/FE/Basis/SerendipityBasis.h |  45 ++-
 .../unitTests/FE/Basis/test_BasisHessians.cpp |   6 +-
 .../unitTests/FE/Basis/test_LagrangeBasis.cpp |  33 +++
 .../FE/Basis/test_SerendipityBasis.cpp        | 150 +++++++++-
 9 files changed, 529 insertions(+), 107 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index ee36db2ea..97f5cad5f 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -271,9 +271,9 @@ LagrangeBasis::LagrangeBasis(BasisTopology topology, int order)
 LagrangeBasis::LagrangeBasis(ElementType type, int order)
     : LagrangeBasis(validated_lagrange_topology(type, order), order) {}
 
-// Initialize equispaced 1D interpolation nodes and their barycentric weights for
-// tensor-product axes.
-void LagrangeBasis::init_equispaced_1d_nodes() {
+// Initialize the 1D tensor-axis interpolation nodes (Gauss-Lobatto-Legendre, via
+// line_coord_pm_one) and their barycentric weights for tensor-product axes.
+void LagrangeBasis::init_tensor_axis_nodes() {
     const std::size_t n = static_cast<std::size_t>(order_ + 1);
     nodes_1d_.resize(n);
     for (int i = 0; i <= order_; ++i) {
@@ -337,7 +337,7 @@ void LagrangeBasis::build_point_nodes() {
 
 // Build nodes and axis indices for tensor-product elements.
 void LagrangeBasis::build_tensor_product_nodes() {
-    init_equispaced_1d_nodes();
+    init_tensor_axis_nodes();
     const auto layout =
         ReferenceNodeLayout::get_lagrange_lattice(lagrange_topology_representative(topology_), order_);
     nodes_ = layout.coords;
@@ -365,7 +365,7 @@ void LagrangeBasis::build_simplex_nodes() {
 
 // Build nodes and mixed triangle-axis lookup data for wedge elements.
 void LagrangeBasis::build_wedge_nodes() {
-    init_equispaced_1d_nodes();
+    init_tensor_axis_nodes();
     const auto layout =
         ReferenceNodeLayout::get_lagrange_lattice(lagrange_topology_representative(topology_), order_);
     nodes_ = layout.coords;
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.h b/Code/Source/solver/FE/Basis/LagrangeBasis.h
index 0e2588364..fa78d9de7 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.h
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.h
@@ -269,7 +269,7 @@ class LagrangeBasis final : public BasisFunction {
     void build_tensor_product_nodes();
     void build_simplex_nodes();
     void build_wedge_nodes();
-    void init_equispaced_1d_nodes();
+    void init_tensor_axis_nodes();
 
     void evaluate_all_to(const math::Vector<double, 3>& xi,
                          std::span<double> values_out,
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
index 6f5154961..d70ea4293 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
@@ -6,6 +6,8 @@
 #include "BasisTraits.h"
 
 #include <array>
+#include <cmath>
+#include <map>
 #include <utility>
 
 namespace svmp {
@@ -17,6 +19,69 @@ namespace {
 using Point = math::Vector<double, 3>;
 using Lattice = std::array<int, 3>;
 
+// Gauss-Lobatto-Legendre nodes on [-1, 1] for a degree-`order` distribution
+// (order + 1 nodes). The endpoints are -1 and +1; the interior nodes are the
+// roots of P'_order, found by Newton iteration on f(x) = x P_order(x) -
+// P_{order-1}(x) -- whose roots are exactly the GLL nodes -- from the
+// Chebyshev-Gauss-Lobatto seed.
+const std::vector<double>& gll_points(int order) {
+    thread_local std::map<int, std::vector<double>> cache;
+    const auto found = cache.find(order);
+    if (found != cache.end()) {
+        return found->second;
+    }
+
+    std::vector<double> pts(static_cast<std::size_t>(order + 1), double(0));
+    if (order >= 1) {
+        pts.front() = double(-1);
+        pts.back() = double(1);
+    }
+    const double pi = std::acos(double(-1));
+    const int half = order / 2;
+    for (int j = 1; j <= half; ++j) {
+        if (2 * j == order) {
+            pts[static_cast<std::size_t>(j)] = double(0);  // exact center, even order
+            continue;
+        }
+        double x = -std::cos(pi * static_cast<double>(j) / static_cast<double>(order));
+        for (int iter = 0; iter < 100; ++iter) {
+            // Legendre P_k and P'_k up to k = order at x, by the three-term
+            // recurrences (regular at x = +/-1).
+            double p_km1 = double(1);   // P_0
+            double p_k = x;             // P_1
+            double d_km1 = double(0);   // P'_0
+            double d_k = double(1);     // P'_1
+            for (int k = 1; k < order; ++k) {
+                const double kk = static_cast<double>(k);
+                const double inv = double(1) / (kk + double(1));
+                const double p_kp1 =
+                    ((double(2) * kk + double(1)) * x * p_k - kk * p_km1) * inv;
+                const double d_kp1 =
+                    ((double(2) * kk + double(1)) * (p_k + x * d_k) - kk * d_km1) * inv;
+                p_km1 = p_k;
+                p_k = p_kp1;
+                d_km1 = d_k;
+                d_k = d_kp1;
+            }
+            // p_k = P_order, p_km1 = P_{order-1}, d_k = P'_order, d_km1 = P'_{order-1}.
+            const double f = x * p_k - p_km1;
+            const double f_prime = p_k + x * d_k - d_km1;
+            const double dx = f / f_prime;
+            x -= dx;
+            if (std::abs(dx) <= double(1e-15)) {
+                break;
+            }
+        }
+        pts[static_cast<std::size_t>(j)] = x;
+    }
+    for (int j = half + 1; j < order; ++j) {
+        pts[static_cast<std::size_t>(j)] = -pts[static_cast<std::size_t>(order - j)];
+    }
+
+    auto inserted = cache.emplace(order, std::move(pts));
+    return inserted.first->second;
+}
+
 double line_coord_zero_one(int i, int order) {
     if (order <= 0) {
         return double(0);
@@ -287,11 +352,19 @@ LagrangeNodeLayout generate_hex_nodes(int order) {
         {4, 5}, {5, 6}, {6, 7}, {7, 4},
         {0, 4}, {1, 5}, {2, 6}, {3, 7},
     };
+    // Edge-interior nodes at the Gauss-Lobatto-Legendre position of their lattice
+    // index on each axis (line_coord_pm_one), consistent with the corner, face, and
+    // interior strata and with the 1D tensor-axis nodes the evaluator uses. (A plain
+    // equispaced interpolation along the edge would disagree with the GLL faces and
+    // interior at order >= 3.)
     for (const auto& edge : edges) {
         for (int m = 1; m < order; ++m) {
-            const double t = static_cast<double>(m) / static_cast<double>(order);
-            out.coords.push_back(verts[edge[0]] * (double(1) - t) + verts[edge[1]] * t);
-            out.lattice.push_back(lerp_lattice(vert_lattice[edge[0]], vert_lattice[edge[1]], m, order));
+            const Lattice idx =
+                lerp_lattice(vert_lattice[edge[0]], vert_lattice[edge[1]], m, order);
+            out.coords.push_back(Point{line_coord_pm_one(idx[0], order),
+                                       line_coord_pm_one(idx[1], order),
+                                       line_coord_pm_one(idx[2], order)});
+            out.lattice.push_back(idx);
         }
     }
 
@@ -393,11 +466,20 @@ LagrangeNodeLayout generate_wedge_nodes(int order) {
         {3, 4}, {4, 5}, {5, 3},
         {0, 3}, {1, 4}, {2, 5},
     };
+    // The triangle cross-section (x, y) keeps its equispaced simplex placement; the
+    // through-axis (z) uses the Gauss-Lobatto-Legendre node of the lattice index, so
+    // the prism's tensor axis matches the 1D nodes the evaluator uses. (Triangle
+    // edges have z lattice 0 or `order`, for which line_coord_pm_one is -1 / +1, so
+    // their z is unchanged.)
     for (const auto& edge : edges) {
         for (int m = 1; m < order; ++m) {
             const double t = static_cast<double>(m) / static_cast<double>(order);
-            out.coords.push_back(verts[edge[0]] * (double(1) - t) + verts[edge[1]] * t);
-            out.lattice.push_back(lerp_lattice(vert_lattice[edge[0]], vert_lattice[edge[1]], m, order));
+            const Lattice idx =
+                lerp_lattice(vert_lattice[edge[0]], vert_lattice[edge[1]], m, order);
+            Point coord = verts[edge[0]] * (double(1) - t) + verts[edge[1]] * t;
+            coord[2] = line_coord_pm_one(idx[2], order);
+            out.coords.push_back(coord);
+            out.lattice.push_back(idx);
         }
     }
 
@@ -587,6 +669,13 @@ void validate_lattice(const LagrangeNodeLayout& layout, ElementType type, int or
 
 } // namespace
 
+double line_coord_pm_one(int i, int order) {
+    if (order <= 0) {
+        return double(0);
+    }
+    return gll_points(order)[static_cast<std::size_t>(i)];
+}
+
 math::Vector<double, 3> ReferenceNodeLayout::get_node_coords(ElementType elem_type,
                                                            std::size_t local_node) {
     const auto nodes = element_nodes(elem_type);
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
index f8251c866..62658d4a3 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
@@ -17,17 +17,25 @@ namespace FE {
 namespace basis {
 
 /**
- * @brief Equispaced 1D reference coordinate on [-1, 1]: -1 + 2 i / order.
+ * @brief The i-th 1D tensor-axis reference node on [-1, 1] at the given order.
+ *
+ * @details Returns the Gauss-Lobatto-Legendre (GLL) node of index @p i for a
+ * degree-@p order distribution: the endpoints are -1 and +1 and the interior
+ * nodes are the roots of @f$P'_{order}@f$, so high-order tensor interpolation
+ * stays well-conditioned (a logarithmic Lebesgue constant instead of the
+ * exponential growth of equispaced nodes). At order 1 the nodes are
+ * @f$\{-1, +1\}@f$ and at order 2 @f$\{-1, 0, +1\}@f$, so they coincide with the
+ * equispaced layout for the production orders and differ only for order >= 3.
+ * Returns 0 for order <= 0.
  *
  * Shared by the reference-node layout generators and the Lagrange tensor-axis
- * node initialization so the lattice formula lives in a single place.
+ * node initialization so the 1D distribution lives in a single place.
+ *
+ * @param i Node index in [0, order].
+ * @param order Polynomial order of the 1D distribution.
+ * @return GLL node coordinate on [-1, 1].
  */
-[[nodiscard]] inline constexpr double line_coord_pm_one(int i, int order) noexcept {
-    if (order <= 0) {
-        return double(0);
-    }
-    return double(-1) + double(2) * static_cast<double>(i) / static_cast<double>(order);
-}
+[[nodiscard]] double line_coord_pm_one(int i, int order);
 
 /**
  * @brief Reference Lagrange node coordinates paired with their integer lattice
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index 5e6955e84..25ba94bb4 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -7,6 +7,7 @@
 
 #include <algorithm>
 #include <array>
+#include <cmath>
 #include <span>
 #include <string>
 
@@ -29,6 +30,104 @@ inline double integer_power(double base, int exponent) {
     return result;
 }
 
+// Which 1D polynomial family the tensor modes are written in. Monomials x^k are
+// simple but make the interpolation (Vandermonde) matrix exponentially
+// ill-conditioned as the order grows; tensor products of Legendre polynomials
+// P_k span exactly the same polynomial space (the serendipity exponent set is
+// downward-closed, so the change of basis is triangular) while keeping the
+// Vandermonde well-conditioned. The quadrilateral and hexahedral families use
+// Legendre; the fixed Wedge15 layout (order 2, trivially well-conditioned) keeps
+// the monomial form.
+enum class ModalAxisKind { Monomial, Legendre };
+
+// Value and first/second derivative of every 1D mode phi_0..phi_{max_degree} at a
+// fixed coordinate, indexed by per-axis degree.
+struct AxisTable {
+    std::vector<double> value;
+    std::vector<double> first;
+    std::vector<double> second;
+};
+
+// phi_k(x) = x^k and its derivatives. Matches the previous per-mode evaluation
+// exactly, so the Wedge15 monomial path is unchanged.
+void fill_monomial_table(double x, int max_degree, AxisTable& out) {
+    const std::size_t n = static_cast<std::size_t>(max_degree) + 1u;
+    out.value.assign(n, double(0));
+    out.first.assign(n, double(0));
+    out.second.assign(n, double(0));
+    for (int k = 0; k <= max_degree; ++k) {
+        const std::size_t kp = static_cast<std::size_t>(k);
+        out.value[kp] = integer_power(x, k);
+        out.first[kp] = (k > 0) ? double(k) * integer_power(x, k - 1) : double(0);
+        out.second[kp] =
+            (k > 1) ? double(k * (k - 1)) * integer_power(x, k - 2) : double(0);
+    }
+}
+
+// phi_k(x) = P_k(x), the degree-k Legendre polynomial on [-1, 1], with first and
+// second derivatives. Built from the three-term recurrences
+//   (k+1) P_{k+1}   = (2k+1) x P_k          - k P_{k-1}
+//   (k+1) P'_{k+1}  = (2k+1)(P_k + x P'_k)  - k P'_{k-1}
+//   (k+1) P''_{k+1} = (2k+1)(2 P'_k + x P''_k) - k P''_{k-1}
+// all regular at x = +/-1 (no division by 1 - x^2).
+void fill_legendre_table(double x, int max_degree, AxisTable& out) {
+    const std::size_t n = static_cast<std::size_t>(max_degree) + 1u;
+    out.value.assign(n, double(0));
+    out.first.assign(n, double(0));
+    out.second.assign(n, double(0));
+    out.value[0] = double(1);
+    if (max_degree >= 1) {
+        out.value[1] = x;
+        out.first[1] = double(1);
+    }
+    for (int k = 1; k < max_degree; ++k) {
+        const std::size_t kp = static_cast<std::size_t>(k);
+        const double kk = static_cast<double>(k);
+        const double two_k_plus_one = double(2) * kk + double(1);
+        const double inv = double(1) / (kk + double(1));
+        out.value[kp + 1] =
+            (two_k_plus_one * x * out.value[kp] - kk * out.value[kp - 1]) * inv;
+        out.first[kp + 1] =
+            (two_k_plus_one * (out.value[kp] + x * out.first[kp]) -
+             kk * out.first[kp - 1]) * inv;
+        out.second[kp + 1] =
+            (two_k_plus_one * (double(2) * out.first[kp] + x * out.second[kp]) -
+             kk * out.second[kp - 1]) * inv;
+    }
+}
+
+void fill_axis_table(ModalAxisKind kind, double x, int max_degree, AxisTable& out) {
+    if (kind == ModalAxisKind::Legendre) {
+        fill_legendre_table(x, max_degree, out);
+    } else {
+        fill_monomial_table(x, max_degree, out);
+    }
+}
+
+// Maximum tolerated infinity-norm condition number of a serendipity interpolation
+// (Vandermonde) matrix. Above this the inverse loses more than about half of
+// double precision (~1/sqrt(epsilon)), so construction throws rather than return
+// silently-degraded functions. With the Legendre modal basis and
+// Gauss-Lobatto-Legendre nodes the condition number stays far below this across
+// the recommended range (~1.7e4 at quadrilateral order 10, ~1.3e4 at hexahedral
+// order 8); the bound is the numerical-soundness backstop for orders pushed well
+// past it. The shape-function quality limit (Lebesgue constant) is the tighter,
+// inherent constraint and is documented/tested separately.
+constexpr double kSerendipityVandermondeMaxCond = double(1e8);
+
+// Infinity norm (maximum absolute row sum) of a row-major n-by-n matrix.
+double matrix_norm_inf(const std::vector<double>& matrix, std::size_t n) {
+    double max_row = double(0);
+    for (std::size_t row = 0; row < n; ++row) {
+        double sum = double(0);
+        for (std::size_t col = 0; col < n; ++col) {
+            sum += std::abs(matrix[row * n + col]);
+        }
+        max_row = std::max(max_row, sum);
+    }
+    return max_row;
+}
+
 std::vector<std::array<int, 2>> quad_serendipity_exponents(int order) {
     std::vector<std::array<int, 2>> exponents;
     for (int ay = 0; ay <= order; ++ay) {
@@ -62,14 +161,17 @@ void append_quad_serendipity_interior_nodes(std::vector<Vec3>& nodes, int order)
         return;
     }
 
+    // Interior staircase placed on Gauss-Lobatto-Legendre interior nodes (the same
+    // 1D distribution as the boundary), so the reduced space stays well-conditioned
+    // at high order. The unisolvence argument above needs only a distinct y per row
+    // and distinct x within each row; GLL changes where those distinct points sit,
+    // not the staircase structure.
     const int m = order - 4;
-    const double y_denominator = double(m + 2);
     for (int row = 0; row <= m; ++row) {
         const int row_count = m + 1 - row;
-        const double y = double(-1) + double(2) * double(row + 1) / y_denominator;
-        const double x_denominator = double(row_count + 1);
+        const double y = line_coord_pm_one(row + 1, m + 2);
         for (int col = 0; col < row_count; ++col) {
-            const double x = double(-1) + double(2) * double(col + 1) / x_denominator;
+            const double x = line_coord_pm_one(col + 1, row_count + 1);
             nodes.push_back(Vec3{x, y, double(0)});
         }
     }
@@ -194,15 +296,18 @@ void append_hex_serendipity_volume_interior_nodes(std::vector<Vec3>& nodes, int
     if (order < 6) {
         return;
     }
+    // Tetrahedral staircase on Gauss-Lobatto-Legendre interior nodes, mirroring the
+    // 2D quad interior: distinct t per layer, distinct s per row, distinct r within
+    // a row keep the residual unisolvent while staying well-conditioned at order.
     const int m = order - 6;
     for (int layer = 0; layer <= m; ++layer) {
         const int tri_order = m - layer;
-        const double t = double(-1) + double(2) * double(layer + 1) / double(m + 2);
+        const double t = line_coord_pm_one(layer + 1, m + 2);
         for (int row = 0; row <= tri_order; ++row) {
             const int row_count = tri_order + 1 - row;
-            const double s = double(-1) + double(2) * double(row + 1) / double(tri_order + 2);
+            const double s = line_coord_pm_one(row + 1, tri_order + 2);
             for (int col = 0; col < row_count; ++col) {
-                const double r = double(-1) + double(2) * double(col + 1) / double(row_count + 1);
+                const double r = line_coord_pm_one(col + 1, row_count + 1);
                 nodes.push_back(Vec3{r, s, t});
             }
         }
@@ -246,34 +351,58 @@ std::vector<Vec3> hex_serendipity_nodes(int order, std::size_t total_size) {
     return nodes;
 }
 
-// Build the nodal coefficient table for a monomial-generated serendipity family:
-// assemble V[node][monomial] = r^a s^b t^c at the public-order reference nodes and
-// invert it. Because the nodes are in public order, the inverse is already in
-// public basis order and needs no output permutation. The same routine serves the
-// quadrilateral, Hex20, and Wedge15 spaces.
+// Build the nodal coefficient table for a serendipity family: assemble the
+// generalized Vandermonde V[node][mode] = phi_a(r) phi_b(s) phi_c(t) at the
+// public-order reference nodes -- with phi the monomial or Legendre 1D modes per
+// `kind` -- and invert it. Because the nodes are in public order, the inverse is
+// already in public basis order and needs no output permutation. The same routine
+// serves the quadrilateral, hexahedral, and Wedge15 spaces. `max_degree` bounds
+// the per-axis mode degree (the family's order). Construction throws if the matrix
+// is too ill-conditioned to trust (see kSerendipityVandermondeMaxCond).
 std::vector<double> build_inverse_vandermonde(
     std::span<const Vec3> nodes,
     std::span<const std::array<int, 3>> exponents,
-    const std::string& label) {
+    const std::string& label,
+    ModalAxisKind kind,
+    int max_degree) {
     const std::size_t n = nodes.size();
     svmp::throw_if<BasisConstructionException>(
         n == 0 || exponents.size() != n, SVMP_HERE,
         "SerendipityBasis: invalid serendipity interpolation setup");
 
     std::vector<double> vandermonde(n * n, double(0));
+    AxisTable tx;
+    AxisTable ty;
+    AxisTable tz;
     for (std::size_t row = 0; row < n; ++row) {
         const Vec3& p = nodes[row];
+        fill_axis_table(kind, p[0], max_degree, tx);
+        fill_axis_table(kind, p[1], max_degree, ty);
+        fill_axis_table(kind, p[2], max_degree, tz);
         for (std::size_t col = 0; col < n; ++col) {
             const auto& e = exponents[col];
             vandermonde[row * n + col] =
-                integer_power(p[0], e[0]) * integer_power(p[1], e[1]) *
-                integer_power(p[2], e[2]);
+                tx.value[static_cast<std::size_t>(e[0])] *
+                ty.value[static_cast<std::size_t>(e[1])] *
+                tz.value[static_cast<std::size_t>(e[2])];
         }
     }
 
-    return math::invert_dense_matrix(
+    // Condition-number backstop: estimate cond_inf = ||V||_inf * ||V^{-1}||_inf
+    // and reject orders where the inverse can no longer be trusted, rather than
+    // returning silently-degraded shape functions.
+    const double norm_v = matrix_norm_inf(vandermonde, n);
+    std::vector<double> inverse = math::invert_dense_matrix(
         std::move(vandermonde), n,
         "SerendipityBasis interpolation matrix for " + label);
+    const double cond_estimate = norm_v * matrix_norm_inf(inverse, n);
+    svmp::throw_if<BasisConstructionException>(
+        !(cond_estimate <= kSerendipityVandermondeMaxCond), SVMP_HERE,
+        "SerendipityBasis: " + label +
+            " interpolation matrix is too ill-conditioned (condition number ~ " +
+            std::to_string(cond_estimate) +
+            "); the requested order exceeds the well-conditioned range");
+    return inverse;
 }
 
 constexpr std::array<std::array<int, 3>, 15> kWedge15MonomialExponents = {{
@@ -294,67 +423,54 @@ constexpr std::array<std::array<int, 3>, 15> kWedge15MonomialExponents = {{
     {{2, 0, 1}}
 }};
 
-// Value and first/second derivatives of the 1D monomial x^a. The derivative of
-// a constant or linear term collapses to zero, so negative powers never arise.
-struct MonomialAxis {
-    double value;   ///< x^a
-    double first;   ///< d/dx (x^a)     = a x^(a-1)
-    double second;  ///< d^2/dx^2 (x^a) = a (a-1) x^(a-2)
-};
-
-inline MonomialAxis monomial_axis(double x, int exponent) {
-    MonomialAxis axis;
-    axis.value = integer_power(x, exponent);
-    axis.first = (exponent > 0) ? double(exponent) * integer_power(x, exponent - 1) : double(0);
-    axis.second = (exponent > 1)
-                      ? double(exponent * (exponent - 1)) * integer_power(x, exponent - 2)
-                      : double(0);
-    return axis;
-}
-
-// Evaluate a nodal basis defined by a monomial coefficient table at one
-// reference point. For each monomial j the routine forms x^a y^b z^c and the
-// requested derivatives, then accumulates the coefficient-weighted sum into the
-// output slots. `count` is both the number of monomials and the number of basis
-// functions (the coefficient table is square). The table is in public basis
-// order, so output slot i reads coefficient column i directly. Outputs are
-// assumed pre-zeroed by the caller; an empty span skips that quantity.
+// Evaluate a nodal basis defined by a modal coefficient table at one reference
+// point. The three axis tables (tx, ty, tz) already hold phi and its derivatives
+// for every per-axis degree at this point. For each mode j the routine forms
+// phi_a(r) phi_b(s) phi_c(t) and the requested derivatives, then accumulates the
+// coefficient-weighted sum into the output slots. `count` is both the number of
+// modes and the number of basis functions (the coefficient table is square). The
+// table is in public basis order, so output slot i reads coefficient column i
+// directly. Outputs are assumed pre-zeroed by the caller; an empty span skips that
+// quantity.
 template <typename ExponentFn, typename CoeffFn>
-void eval_monomial_basis(double r, double s, double t,
-                         std::size_t count,
-                         ExponentFn&& exponent,
-                         CoeffFn&& coeff,
-                         std::span<double> values,
-                         std::span<Gradient> gradients,
-                         std::span<Hessian> hessians) {
+void eval_modal_basis(const AxisTable& tx, const AxisTable& ty, const AxisTable& tz,
+                      std::size_t count,
+                      ExponentFn&& exponent,
+                      CoeffFn&& coeff,
+                      std::span<double> values,
+                      std::span<Gradient> gradients,
+                      std::span<Hessian> hessians) {
     const bool want_values = !values.empty();
     const bool want_gradients = !gradients.empty();
     const bool want_hessians = !hessians.empty();
 
     for (std::size_t j = 0; j < count; ++j) {
         const std::array<int, 3> e = exponent(j);
-        const MonomialAxis ax = monomial_axis(r, e[0]);
-        const MonomialAxis ay = monomial_axis(s, e[1]);
-        const MonomialAxis az = monomial_axis(t, e[2]);
+        const std::size_t ex = static_cast<std::size_t>(e[0]);
+        const std::size_t ey = static_cast<std::size_t>(e[1]);
+        const std::size_t ez = static_cast<std::size_t>(e[2]);
 
-        const double phi = ax.value * ay.value * az.value;
+        const double vx = tx.value[ex];
+        const double vy = ty.value[ey];
+        const double vz = tz.value[ez];
+        const double phi = vx * vy * vz;
 
         double d_dr = double(0), d_ds = double(0), d_dt = double(0);
         if (want_gradients || want_hessians) {
-            d_dr = ax.first * ay.value * az.value;
-            d_ds = ax.value * ay.first * az.value;
-            d_dt = ax.value * ay.value * az.first;
+            d_dr = tx.first[ex] * vy * vz;
+            d_ds = vx * ty.first[ey] * vz;
+            d_dt = vx * vy * tz.first[ez];
         }
 
         double d_drr = double(0), d_dss = double(0), d_dtt = double(0);
         double d_drs = double(0), d_drt = double(0), d_dst = double(0);
         if (want_hessians) {
-            d_drr = ax.second * ay.value * az.value;
-            d_dss = ax.value * ay.second * az.value;
-            d_dtt = ax.value * ay.value * az.second;
-            d_drs = ax.first * ay.first * az.value;
-            d_drt = ax.first * ay.value * az.first;
-            d_dst = ax.value * ay.first * az.first;
+            d_drr = tx.second[ex] * vy * vz;
+            d_dss = vx * ty.second[ey] * vz;
+            d_dtt = vx * vy * tz.second[ez];
+            d_drs = tx.first[ex] * ty.first[ey] * vz;
+            d_drt = tx.first[ex] * vy * tz.first[ez];
+            d_dst = vx * ty.first[ey] * tz.first[ez];
         }
 
         for (std::size_t slot = 0; slot < count; ++slot) {
@@ -500,8 +616,10 @@ void SerendipityBasis::init_quadrilateral(int order, bool nodes_from_reference_l
     svmp::throw_if<BasisConstructionException>(
         nodes_.size() != size_, SVMP_HERE,
         "SerendipityBasis: quadrilateral serendipity setup produced inconsistent sizes");
+    uses_legendre_modes_ = true;
     inv_vandermonde_ = build_inverse_vandermonde(
-        nodes_, monomial_exponents_, "Quad order " + std::to_string(order));
+        nodes_, monomial_exponents_, "Quad order " + std::to_string(order),
+        ModalAxisKind::Legendre, order);
 }
 
 // Build the hexahedral serendipity monomial space, reference nodes, and nodal
@@ -522,8 +640,10 @@ void SerendipityBasis::init_hexahedron(int order, bool nodes_from_reference_layo
     svmp::throw_if<BasisConstructionException>(
         nodes_.size() != size_, SVMP_HERE,
         "SerendipityBasis: hexahedral serendipity setup produced inconsistent sizes");
+    uses_legendre_modes_ = true;
     inv_vandermonde_ = build_inverse_vandermonde(
-        nodes_, monomial_exponents_, "Hex order " + std::to_string(order));
+        nodes_, monomial_exponents_, "Hex order " + std::to_string(order),
+        ModalAxisKind::Legendre, order);
 }
 
 // Build the Wedge15 serendipity layout from its tabulated monomial space and
@@ -545,7 +665,11 @@ void SerendipityBasis::init_fixed_named(ElementType type) {
         family_exponents.size() != size_, SVMP_HERE,
         "SerendipityBasis: Wedge15 monomial count does not match basis size");
     monomial_exponents_.assign(family_exponents.begin(), family_exponents.end());
-    inv_vandermonde_ = build_inverse_vandermonde(nodes_, monomial_exponents_, "Wedge15");
+    // Wedge15 is the fixed order-2 layout; its 15x15 system is trivially
+    // well-conditioned, so it keeps the monomial modal basis.
+    uses_legendre_modes_ = false;
+    inv_vandermonde_ = build_inverse_vandermonde(
+        nodes_, monomial_exponents_, "Wedge15", ModalAxisKind::Monomial, order_);
 }
 
 void SerendipityBasis::evaluate_all_to(const math::Vector<double, 3>& xi,
@@ -582,8 +706,19 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<double, 3>& xi,
         SVMP_HERE,
         "SerendipityBasis: interpolation tables are not initialized for evaluation");
 
-    eval_monomial_basis(
-        x, y, z, size_,
+    // Build the per-axis modal tables once, then accumulate over the modes. The
+    // mode family must match the one the coefficient table was built with.
+    const ModalAxisKind kind =
+        uses_legendre_modes_ ? ModalAxisKind::Legendre : ModalAxisKind::Monomial;
+    AxisTable tx;
+    AxisTable ty;
+    AxisTable tz;
+    fill_axis_table(kind, x, order_, tx);
+    fill_axis_table(kind, y, order_, ty);
+    fill_axis_table(kind, z, order_, tz);
+
+    eval_modal_basis(
+        tx, ty, tz, size_,
         [this](std::size_t j) { return monomial_exponents_[j]; },
         [this](std::size_t j, std::size_t i) {
             return inv_vandermonde_[j * size_ + i];
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.h b/Code/Source/solver/FE/Basis/SerendipityBasis.h
index d6640ed07..c9aa0974a 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.h
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.h
@@ -109,13 +109,33 @@ namespace basis {
  * caller explicitly requests a different supported basis.
  *
  * Every supported family -- quadrilateral, hexahedral, and Wedge15 -- is built by
- * inverting the Vandermonde of its monomial space at the public-order reference
- * nodes; values, gradients, and Hessians are evaluated by differentiating the
- * monomial vector and applying the inverse-Vandermonde coefficients. Because the
- * tables are generated in public node order, evaluation needs no output
- * reordering, and there is no hand-written special case -- the Hex8 basis is the
- * order-1 instance of the generated hexahedral space, not a separate trilinear
- * evaluator.
+ * inverting the generalized Vandermonde of its mode space at the public-order
+ * reference nodes; values, gradients, and Hessians are evaluated by
+ * differentiating the mode vector and applying the inverse-Vandermonde
+ * coefficients. Because the tables are generated in public node order, evaluation
+ * needs no output reordering, and there is no hand-written special case -- the
+ * Hex8 basis is the order-1 instance of the generated hexahedral space, not a
+ * separate trilinear evaluator.
+ *
+ * ## Conditioning and the well-conditioned order range
+ *
+ * High-order nodal interpolation is governed by two conditioning factors, both
+ * addressed so that arbitrary orders produce trustworthy shape functions:
+ * - **Node distribution.** The quadrilateral and hexahedral families place their
+ *   nodes on the Gauss-Lobatto-Legendre (GLL) distribution (edges, faces, and the
+ *   interior staircase all use the GLL 1D nodes). GLL has a logarithmic Lebesgue
+ *   constant, where an equispaced layout grows exponentially (the Runge
+ *   phenomenon). The named production layouts are unaffected: GLL coincides with
+ *   the equispaced layout at orders 1 and 2, so Quad8/Hex8/Hex20 keep their exact
+ *   public coordinates; GLL differs only for order >= 3, where the layout is this
+ *   module's own convention.
+ * - **Modal basis.** The quadrilateral and hexahedral Vandermondes are assembled
+ *   in a tensor **Legendre** basis rather than raw monomials. The serendipity
+ *   exponent set is downward-closed, so the Legendre and monomial spans are
+ *   identical (the change of basis is triangular) -- the nodal shape functions are
+ *   unchanged -- but the Legendre Vandermonde is far better conditioned. (The
+ *   fixed Wedge15 layout, order 2, keeps the monomial form; it is trivially
+ *   well-conditioned.)
  */
 class SerendipityBasis final : public BasisFunction {
 public:
@@ -287,10 +307,17 @@ class SerendipityBasis final : public BasisFunction {
     int order_;
     std::size_t size_;
     std::vector<math::Vector<double, 3>> nodes_;
-    // Monomial exponents (r^a s^b t^c) spanning the family's polynomial space.
+    // Per-axis degrees (a, b, c) of the tensor modes spanning the family's
+    // polynomial space. Interpreted as monomial powers r^a s^b t^c or, when
+    // uses_legendre_modes_ is set, as tensor Legendre degrees P_a(r) P_b(s) P_c(t)
+    // (the same space; see ModalAxisKind in SerendipityBasis.cpp).
     std::vector<std::array<int, 3>> monomial_exponents_;
-    // Row-major inverse Vandermonde, indexed as [monomial, basis].
+    // Row-major inverse (generalized) Vandermonde, indexed as [mode, basis].
     std::vector<double> inv_vandermonde_;
+    // Whether the tensor modes are Legendre polynomials (quadrilateral/hexahedral
+    // families) or plain monomials (the fixed Wedge15 layout). Evaluation must use
+    // the same family the coefficient table was built with.
+    bool uses_legendre_modes_{false};
 
     // Build the quadrilateral serendipity monomial space, reference nodes, and
     // nodal coefficient table for the given order. The named Quad8 layout takes
diff --git a/tests/unitTests/FE/Basis/test_BasisHessians.cpp b/tests/unitTests/FE/Basis/test_BasisHessians.cpp
index cce73a39a..d80955eb3 100644
--- a/tests/unitTests/FE/Basis/test_BasisHessians.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisHessians.cpp
@@ -406,7 +406,7 @@ TEST(BasisGradients, QuadrilateralSerendipityInactiveZDerivativesRemainZero) {
 TEST(BasisHessians, SerendipityFamiliesMatchNumericalHessians) {
     // Arbitrary-order quadrilateral serendipity (topology path).
     const struct QuadCase { int order; double tol; } quad_cases[] = {
-        {1, double(1e-6)}, {3, double(1e-6)}, {4, double(5e-6)}, {6, double(2e-5)},
+        {1, double(1e-6)}, {3, double(1e-6)}, {4, double(2e-6)}, {6, double(5e-6)},
     };
     for (const auto& c : quad_cases) {
         SerendipityBasis basis(BasisTopology::Quadrilateral, c.order);
@@ -416,8 +416,8 @@ TEST(BasisHessians, SerendipityFamiliesMatchNumericalHessians) {
 
     // Arbitrary-order hexahedral serendipity (topology path).
     const struct HexCase { int order; double tol; } hex_cases[] = {
-        {1, double(1e-6)}, {2, double(1e-6)}, {3, double(5e-6)},
-        {4, double(1e-5)}, {5, double(5e-5)},
+        {1, double(1e-6)}, {2, double(1e-6)}, {3, double(2e-6)},
+        {4, double(5e-6)}, {5, double(1e-5)},
     };
     for (const auto& c : hex_cases) {
         SerendipityBasis basis(BasisTopology::Hexahedron, c.order);
diff --git a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
index d971e4b2c..efd733a7a 100644
--- a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
@@ -805,3 +805,36 @@ TEST(LagrangeBasis, FactoryCreatesReducedScalarBasisFamilies) {
     EXPECT_THROW((void)basis_factory::create(unsupported_serendipity_topology),
                  BasisElementCompatibilityException);
 }
+
+// The shared 1D tensor-axis distribution (line_coord_pm_one) is Gauss-Lobatto-
+// Legendre: endpoints exactly +/-1, symmetric about 0, strictly increasing, and
+// coincident with the equispaced layout at the production orders 1 and 2.
+TEST(LagrangeBasis, TensorAxisNodesAreGaussLobattoLegendre) {
+    // Orders 1 and 2 coincide with equispaced (the production layouts).
+    EXPECT_EQ(line_coord_pm_one(0, 1), double(-1));
+    EXPECT_EQ(line_coord_pm_one(1, 1), double(1));
+    EXPECT_EQ(line_coord_pm_one(0, 2), double(-1));
+    EXPECT_EQ(line_coord_pm_one(1, 2), double(0));
+    EXPECT_EQ(line_coord_pm_one(2, 2), double(1));
+
+    for (int order = 1; order <= 12; ++order) {
+        EXPECT_EQ(line_coord_pm_one(0, order), double(-1)) << "order=" << order;
+        EXPECT_EQ(line_coord_pm_one(order, order), double(1)) << "order=" << order;
+
+        double previous = line_coord_pm_one(0, order);
+        for (int i = 1; i <= order; ++i) {
+            const double x = line_coord_pm_one(i, order);
+            EXPECT_GT(x, previous) << "order=" << order << " i=" << i;  // strictly increasing
+            EXPECT_LE(x, double(1)) << "order=" << order << " i=" << i;
+            EXPECT_GE(x, double(-1)) << "order=" << order << " i=" << i;
+            EXPECT_NEAR(x, -line_coord_pm_one(order - i, order), double(1e-14))
+                << "order=" << order << " i=" << i;  // symmetric about 0
+            previous = x;
+        }
+    }
+
+    // For order >= 3 GLL differs from equispaced: the interior nodes cluster toward
+    // the endpoints. At order 4 the first interior node is -sqrt(3/7) ~ -0.6547,
+    // past the equispaced position -0.5.
+    EXPECT_LT(line_coord_pm_one(1, 4), double(-0.5) - double(1e-6));
+}
diff --git a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
index d62b8c441..668eaa73a 100644
--- a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
@@ -376,6 +376,96 @@ std::vector<math::Vector<double, 3>> quad8_reference_nodes_for_test() {
     return nodes;
 }
 
+// --- Conditioning oracles (Legendre Vandermonde + Lebesgue constant) ----------
+
+double legendre_value_for_test(double x, int degree) {
+    if (degree <= 0) {
+        return double(1);
+    }
+    double p_km1 = double(1);
+    double p_k = x;
+    for (int k = 1; k < degree; ++k) {
+        const double p_kp1 =
+            ((double(2) * double(k) + double(1)) * x * p_k - double(k) * p_km1) /
+            double(k + 1);
+        p_km1 = p_k;
+        p_k = p_kp1;
+    }
+    return p_k;
+}
+
+double legendre_mode_for_test(const math::Vector<double, 3>& p,
+                              const std::array<int, 3>& mode) {
+    return legendre_value_for_test(p[0], mode[0]) *
+           legendre_value_for_test(p[1], mode[1]) *
+           legendre_value_for_test(p[2], mode[2]);
+}
+
+double matrix_norm_inf_for_test(const std::vector<double>& matrix, std::size_t n) {
+    double max_row = double(0);
+    for (std::size_t row = 0; row < n; ++row) {
+        double sum = double(0);
+        for (std::size_t col = 0; col < n; ++col) {
+            sum += std::abs(matrix[row * n + col]);
+        }
+        max_row = std::max(max_row, sum);
+    }
+    return max_row;
+}
+
+// Infinity-norm condition number of the Legendre generalized Vandermonde the
+// production basis inverts, rebuilt from the basis nodes and the re-derived
+// serendipity modes (an independent check that Source B is fixed).
+double legendre_vandermonde_condition(const std::vector<math::Vector<double, 3>>& nodes,
+                                      const std::vector<std::array<int, 3>>& modes) {
+    const std::size_t n = nodes.size();
+    std::vector<double> v(n * n, double(0));
+    for (std::size_t row = 0; row < n; ++row) {
+        for (std::size_t col = 0; col < n; ++col) {
+            v[row * n + col] = legendre_mode_for_test(nodes[row], modes[col]);
+        }
+    }
+    const double norm_v = matrix_norm_inf_for_test(v, n);
+    const auto inverse = math::invert_dense_matrix(v, n, "test Legendre Vandermonde");
+    return norm_v * matrix_norm_inf_for_test(inverse, n);
+}
+
+std::vector<std::array<int, 3>> quad_serendipity_modes_3d_for_test(int order) {
+    std::vector<std::array<int, 3>> modes;
+    for (const auto& e : quad_serendipity_exponents_for_test(order)) {
+        modes.push_back({e[0], e[1], 0});
+    }
+    return modes;
+}
+
+// Lebesgue constant of the nodal basis: the maximum over a dense reference-cell
+// sample of sum_i |N_i(xi)|. Bounded and slowly growing for GLL nodes; it is the
+// "are the shape functions good" metric (equispaced nodes make it blow up).
+double serendipity_lebesgue_constant(const SerendipityBasis& basis, int samples) {
+    const int dim = basis.dimension();
+    const auto axis = [samples](int idx) {
+        return double(-1) + double(2) * double(idx) / double(samples);
+    };
+    double max_sum = double(0);
+    std::vector<double> values;
+    for (int i = 0; i <= samples; ++i) {
+        for (int j = 0; j <= samples; ++j) {
+            const int kmax = (dim >= 3) ? samples : 0;
+            for (int k = 0; k <= kmax; ++k) {
+                const math::Vector<double, 3> xi{
+                    axis(i), axis(j), dim >= 3 ? axis(k) : double(0)};
+                basis.evaluate_values(xi, values);
+                double sum = double(0);
+                for (const double v : values) {
+                    sum += std::abs(v);
+                }
+                max_sum = std::max(max_sum, sum);
+            }
+        }
+    }
+    return max_sum;
+}
+
 } // namespace
 
 TEST(SerendipityBasis, Quad8IsNodalAndPartitionsUnity) {
@@ -553,17 +643,17 @@ TEST(SerendipityBasis, QuadrilateralNodesFollowDocumentedConstructionThroughOrde
 
         std::size_t index = boundary_count;
         if (order >= 4) {
+            // The interior staircase sits on Gauss-Lobatto-Legendre interior nodes:
+            // row r at the (r+1)-th GLL node of order m+2, each row's columns at the
+            // GLL interior of order row_count+1 (same line_coord_pm_one the basis
+            // uses), re-derived here as an independent placement oracle.
             const int m = order - 4;
-            const double y_denominator = double(m + 2);
             for (int row = 0; row <= m; ++row) {
                 const int row_count = m + 1 - row;
-                const double expected_y =
-                    double(-1) + double(2) * double(row + 1) / y_denominator;
-                const double x_denominator = double(row_count + 1);
+                const double expected_y = line_coord_pm_one(row + 1, m + 2);
                 for (int col = 0; col < row_count; ++col) {
                     ASSERT_LT(index, nodes.size());
-                    const double expected_x =
-                        double(-1) + double(2) * double(col + 1) / x_denominator;
+                    const double expected_x = line_coord_pm_one(col + 1, row_count + 1);
                     EXPECT_NEAR(nodes[index][0], expected_x, kTol)
                         << "order=" << order << " row=" << row << " col=" << col;
                     EXPECT_NEAR(nodes[index][1], expected_y, kTol)
@@ -616,7 +706,10 @@ TEST(SerendipityBasis, QuadrilateralOrdersReproduceEverySerendipityMonomial) {
         const auto exponents = quad_serendipity_exponents_for_test(order);
         ASSERT_EQ(exponents.size(), basis.size()) << "order=" << order;
 
-        const double tolerance = (order <= 7) ? double(1e-10) : double(2e-8);
+        // Uniformly tight across the whole range: GLL nodes and the Legendre modal
+        // basis keep the reproduction accurate even at order 10 (the equispaced/
+        // monomial construction needed 2e-8 here).
+        const double tolerance = double(1e-10);
         for (const auto& exponent : exponents) {
             for (const auto& xi : points) {
                 const double interpolated =
@@ -946,9 +1039,9 @@ TEST(SerendipityBasis, HexahedralTopologyIsNodalAndPartitionsUnity) {
         EXPECT_EQ(basis.size(), expected_hex_serendipity_size(order)) << "order=" << order;
         ASSERT_EQ(basis.nodes().size(), basis.size());
 
-        expect_nodal_delta(basis, basis.nodes(), double(1e-7));
+        expect_nodal_delta(basis, basis.nodes(), double(1e-9));
         for (const auto& xi : points) {
-            expect_partition_of_unity(basis, xi, double(1e-7));
+            expect_partition_of_unity(basis, xi, double(1e-9));
         }
     }
 }
@@ -974,7 +1067,7 @@ TEST(SerendipityBasis, HexahedralTopologyReproducesEverySerendipityMonomial) {
                         return monomial_value_3d_for_test(node, exponent);
                     });
                 EXPECT_NEAR(interpolated, monomial_value_3d_for_test(xi, exponent),
-                            double(1e-6))
+                            double(1e-9))
                     << "order=" << order << " ax=" << exponent[0]
                     << " ay=" << exponent[1] << " az=" << exponent[2];
             }
@@ -1026,3 +1119,40 @@ TEST(SerendipityBasis, NamedHexLayoutsMatchTopologyConstruction) {
         }
     }
 }
+
+// Conditioning is a tested quantity, not a tolerance that quietly loosens. With
+// the Legendre modal basis and Gauss-Lobatto-Legendre nodes, both the Vandermonde
+// condition number and the Lebesgue constant stay small across the recommended
+// range -- a logarithmic-style growth instead of the exponential blow-up of the
+// previous equispaced/monomial construction (which lost ~8 digits by order 10).
+TEST(SerendipityBasis, SerendipityStaysWellConditionedAcrossRecommendedRange) {
+    for (int order = 1; order <= 10; ++order) {
+        SerendipityBasis basis(BasisTopology::Quadrilateral, order);
+        const double cond = legendre_vandermonde_condition(
+            basis.nodes(), quad_serendipity_modes_3d_for_test(order));
+        const double lebesgue = serendipity_lebesgue_constant(basis, 24);
+        EXPECT_LT(cond, double(5e4)) << "quad order=" << order;
+        EXPECT_LT(lebesgue, double(1e3)) << "quad order=" << order;
+    }
+    for (int order = 1; order <= 8; ++order) {
+        SerendipityBasis basis(BasisTopology::Hexahedron, order);
+        const double cond = legendre_vandermonde_condition(
+            basis.nodes(), hex_serendipity_exponents_for_test(order));
+        const double lebesgue = serendipity_lebesgue_constant(basis, 12);
+        EXPECT_LT(cond, double(5e4)) << "hex order=" << order;
+        EXPECT_LT(lebesgue, double(1e3)) << "hex order=" << order;
+    }
+}
+
+// The condition-number guard is the numerical-soundness backstop: orders pushed
+// far past the well-conditioned range throw rather than return shape functions
+// whose coefficients have lost all precision. The recommended orders construct
+// without complaint.
+TEST(SerendipityBasis, RejectsOrdersBeyondTheWellConditionedRange) {
+    EXPECT_NO_THROW((void)SerendipityBasis(BasisTopology::Quadrilateral, 10));
+    EXPECT_NO_THROW((void)SerendipityBasis(BasisTopology::Hexahedron, 8));
+    EXPECT_THROW((void)SerendipityBasis(BasisTopology::Quadrilateral, 20),
+                 BasisConstructionException);
+    EXPECT_THROW((void)SerendipityBasis(BasisTopology::Hexahedron, 16),
+                 BasisConstructionException);
+}

From 523a9833328a7b02d7632243d9d69c48b4a0b287 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 24 Jun 2026 17:02:19 -0700
Subject: [PATCH 66/91] adding proper documentation to conditioning of the
 lagrange basis

---
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp |  5 +-
 Code/Source/solver/FE/Basis/LagrangeBasis.h   | 60 ++++++++++++++++---
 2 files changed, 55 insertions(+), 10 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index 97f5cad5f..587ef4886 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -343,8 +343,9 @@ void LagrangeBasis::build_tensor_product_nodes() {
     nodes_ = layout.coords;
     tensor_indices_.reserve(layout.lattice.size());
     for (const auto& idx : layout.lattice) {
-        // The lattice already holds the per-axis equispaced node index (unused
-        // axes are zero), so no coordinate-to-index inversion is needed.
+        // The lattice already holds the per-axis node index 0..order along each
+        // axis (unused axes are zero; the coordinate itself is the GLL node for
+        // that index), so no coordinate-to-index inversion is needed.
         tensor_indices_.push_back(TensorNodeIndex{
             static_cast<std::size_t>(idx[0]),
             static_cast<std::size_t>(idx[1]),
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.h b/Code/Source/solver/FE/Basis/LagrangeBasis.h
index fa78d9de7..259af1db6 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.h
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.h
@@ -30,8 +30,8 @@ namespace basis {
 /**
  * @brief Nodal Lagrange basis on supported reference finite elements.
  *
- * @details LagrangeBasis represents the nodal interpolation basis associated
- * with an equispaced reference-node lattice. It supports point, line,
+ * @details LagrangeBasis represents the complete (full-degree) nodal
+ * interpolation basis on a reference topology. It supports point, line,
  * quadrilateral, hexahedron, triangle, tetrahedron, and wedge reference
  * topologies. The primary constructor takes a BasisTopology and an explicit
  * polynomial order, so an arbitrary order carries no node-count assumption
@@ -41,16 +41,39 @@ namespace basis {
  * must equal the order baked into that layout (1 for the linear elements, 2 for
  * the complete-quadratic aliases, 0 for Point1).
  *
+ * ## Reference-node distribution
+ *
+ * The interpolation nodes are not a single distribution across topologies; each
+ * family uses the node set its evaluator is built for (see ReferenceNodeLayout
+ * and line_coord_pm_one):
+ * - **Tensor-product (line, quadrilateral, hexahedron):** Gauss-Lobatto-Legendre
+ *   (GLL) nodes on each axis -- endpoints at @f$\pm 1@f$ and interior nodes at
+ *   the roots of @f$P'_p@f$ -- not an equispaced layout.
+ * - **Simplex (triangle, tetrahedron):** the equispaced barycentric lattice
+ *   (each barycentric coordinate at @f$i/p@f$). The closed-form evaluator below
+ *   is specific to this equispaced lattice.
+ * - **Wedge:** the tensor product of an equispaced triangle cross-section with a
+ *   GLL through-axis.
+ *
+ * GLL coincides with the equispaced layout at orders 1 and 2, so every named
+ * production element (Line2/Line3, Triangle3/Triangle6, Quad4/Quad9,
+ * Tetra4/Tetra10, Hex8/Hex27, Wedge6/Wedge18) keeps its standard coordinates;
+ * the GLL/equispaced distinction appears only for order >= 3.
+ *
+ * ## Evaluation
+ *
  * Tensor-product elements use the one-dimensional nodal polynomials
  * @f[
  *   l_i(x) = \prod_{j \ne i} \frac{x - x_j}{x_i - x_j}
  * @f]
- * on equispaced coordinates in @f$[-1, 1]@f$. Multi-dimensional basis
- * functions are products of the active axis polynomials, for example
+ * on the per-axis GLL coordinates @f$x_j \in [-1, 1]@f$ (the barycentric-weight
+ * form, valid for any distinct node set). Multi-dimensional basis functions are
+ * products of the active axis polynomials, for example
  * @f$N_{ijk}(r,s,t) = l_i(r)l_j(s)l_k(t)@f$ on a hexahedron.
  *
  * Simplex elements use barycentric coordinates and integer lattice
- * exponents. For a node with exponent tuple @f$\alpha@f$, where
+ * exponents on the equispaced lattice. For a node with exponent tuple
+ * @f$\alpha@f$, where
  * @f$\sum_a \alpha_a = p@f$, the basis is assembled from scaled
  * falling-factorial factors,
  * @f[
@@ -65,6 +88,19 @@ namespace basis {
  * basis and a one-dimensional through-axis basis:
  * @f$N_{a k}(r,s,t) = T_a(r,s)l_k(t)@f$.
  *
+ * ## Conditioning and the supported order range
+ *
+ * Interpolation conditioning is governed by the node distribution and so differs
+ * by topology:
+ * - **Tensor-product topologies stay well-conditioned at high order.** GLL nodes
+ *   have a logarithmic Lebesgue constant, so line/quadrilateral/hexahedron bases
+ *   remain trustworthy well beyond the production orders.
+ * - **Simplex topologies degrade at high order.** The equispaced barycentric
+ *   lattice has a Lebesgue constant that grows roughly exponentially with order
+ *   (the Runge phenomenon), so triangle and tetrahedron bases are reliable
+ *   through low orders but become increasingly ill-conditioned beyond them. The
+ *   wedge inherits this through its equispaced triangle cross-section.
+ *
  * The vector-returning evaluators are convenient API wrappers. The `*_to`
  * methods write to caller-provided spans and are intended for assembly paths
  * that avoid temporary allocations.
@@ -91,6 +127,13 @@ class LagrangeBasis final : public BasisFunction {
      * store per-axis node indices, simplex bases store barycentric exponent
      * tuples, and wedge bases store the triangle-node/axis-node decomposition.
      *
+     * Reference nodes are Gauss-Lobatto-Legendre on tensor-product axes and the
+     * equispaced barycentric lattice on simplex axes (see the class description).
+     * High order stays well-conditioned on tensor-product topologies but degrades
+     * on simplex and wedge topologies, and -- unlike SerendipityBasis -- this
+     * constructor does not reject ill-conditioned high-order simplex/wedge
+     * requests; that choice is the caller's.
+     *
      * @param topology Reference topology; Point through the volume topologies.
      * @param order Polynomial order; must be non-negative. Point is order 0.
      * @throws BasisConfigurationException If the order is negative, or if Point
@@ -140,9 +183,10 @@ class LagrangeBasis final : public BasisFunction {
      *
      * @details The returned node order matches the basis-function order used
      * by all evaluators. Coordinates are reference-element coordinates:
-     * tensor-product axes use @f$[-1,1]@f$, triangles and tetrahedra use the
-     * repository's simplex reference coordinates, and wedges combine triangle
-     * reference coordinates with a @f$[-1,1]@f$ through-axis coordinate.
+     * tensor-product axes use the Gauss-Lobatto-Legendre nodes on @f$[-1,1]@f$,
+     * triangles and tetrahedra use the equispaced barycentric simplex lattice,
+     * and wedges combine the equispaced triangle lattice with a GLL @f$[-1,1]@f$
+     * through-axis coordinate.
      *
      * @return Reference node coordinates, one per basis function.
      */

From 504a416a3ca410ad610db21b2c527a92e36b952d Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 24 Jun 2026 17:27:59 -0700
Subject: [PATCH 67/91] adding single arg constructor overload and removing
 `element_type()`

---
 Code/Source/solver/FE/Basis/BasisFunction.h   | 15 ----
 Code/Source/solver/FE/Basis/BasisTraits.h     |  3 +-
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp |  7 +-
 Code/Source/solver/FE/Basis/LagrangeBasis.h   | 20 +++--
 .../solver/FE/Basis/SerendipityBasis.cpp      | 63 ++++++++------
 .../Source/solver/FE/Basis/SerendipityBasis.h | 85 ++++++++++--------
 .../FE/Basis/test_BasisErrorPaths.cpp         |  3 -
 .../unitTests/FE/Basis/test_BasisHessians.cpp | 12 +--
 .../FE/Basis/test_HigherOrderWedge.cpp        |  3 +-
 .../unitTests/FE/Basis/test_LagrangeBasis.cpp | 86 +++++++++++++------
 .../FE/Basis/test_SerendipityBasis.cpp        | 25 +++++-
 11 files changed, 202 insertions(+), 120 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index 584b10785..16b60f4e4 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -197,21 +197,6 @@ class BasisFunction {
      */
     virtual BasisTopology topology() const noexcept = 0;
 
-    /**
-     * @brief Return the named element type for this basis, if one exists.
-     *
-     * @details Convenience accessor that round-trips to a named mesh element
-     * when one is defined for this (topology(), order(), basis_type()) triple
-     * (orders 0-2), and returns ElementType::Unknown otherwise (for example an
-     * order-0 P0 basis on a volume topology, or any order >= 3 that has no named
-     * layout). topology() + order() are the authoritative identity; this should
-     * not be used as a discriminator for high-order or topology-constructed
-     * bases.
-     *
-     * @return Named element type, or ElementType::Unknown when none applies.
-     */
-    virtual ElementType element_type() const noexcept = 0;
-
     /**
      * @brief Return the reference-space dimension of the basis.
      * @return Reference dimension, from zero for points through three for volume elements.
diff --git a/Code/Source/solver/FE/Basis/BasisTraits.h b/Code/Source/solver/FE/Basis/BasisTraits.h
index aeb79498e..1743cdb85 100644
--- a/Code/Source/solver/FE/Basis/BasisTraits.h
+++ b/Code/Source/solver/FE/Basis/BasisTraits.h
@@ -205,7 +205,8 @@ namespace detail {
 // (topology, order, family) triple denotes, or Unknown when no named layout
 // exists (order 0 on a non-point topology, any order >= 3, or a reduced family
 // at an unsupported order). topology() + order() remain the authoritative
-// identity; this only backs the element_type() convenience accessor.
+// identity; callers that want a named ElementType for a basis pass its
+// topology(), order(), and basis_type() to this free helper directly.
 [[nodiscard]] constexpr ElementType named_element_for(BasisTopology top, int order,
                                                       BasisType family) noexcept {
     if (family == BasisType::Serendipity) {
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index 587ef4886..a2188d4e9 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -271,6 +271,9 @@ LagrangeBasis::LagrangeBasis(BasisTopology topology, int order)
 LagrangeBasis::LagrangeBasis(ElementType type, int order)
     : LagrangeBasis(validated_lagrange_topology(type, order), order) {}
 
+LagrangeBasis::LagrangeBasis(ElementType type)
+    : LagrangeBasis(type, named_lagrange_order(type)) {}
+
 // Initialize the 1D tensor-axis interpolation nodes (Gauss-Lobatto-Legendre, via
 // line_coord_pm_one) and their barycentric weights for tensor-product axes.
 void LagrangeBasis::init_tensor_axis_nodes() {
@@ -282,9 +285,7 @@ void LagrangeBasis::init_tensor_axis_nodes() {
     }
 
     // Barycentric weights w_i = 1 / prod_{j!=i}(x_i - x_j); the nodes are
-    // distinct so every denominator is nonzero. Precomputing here keeps the
-    // per-evaluation 1D Lagrange work at O(n^2) without recomputing the weights
-    // on every call.
+    // distinct so every denominator is nonzero.
     nodes_1d_weights_.assign(n, double(1));
     for (std::size_t i = 0; i < n; ++i) {
         double denom = double(1);
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.h b/Code/Source/solver/FE/Basis/LagrangeBasis.h
index 259af1db6..9cf81fa4f 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.h
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.h
@@ -158,17 +158,27 @@ class LagrangeBasis final : public BasisFunction {
      */
     LagrangeBasis(ElementType type, int order);
 
+    /**
+     * @brief Construct a Lagrange basis from a named element layout at its baked-in order.
+     *
+     * @details Single-argument convenience overload: the polynomial order is the
+     * one baked into the layout (0 for Point1, 1 for the linear elements, 2 for
+     * the complete-quadratic aliases such as Hex27/Tetra10), so the caller does
+     * not repeat it. Equivalent to LagrangeBasis(type, <baked-in order>).
+     * Serendipity and pyramid layouts are rejected, as for the two-argument
+     * overload.
+     *
+     * @param type Named element type; determines both topology and order.
+     * @throws BasisElementCompatibilityException If the element type is unsupported.
+     */
+    explicit LagrangeBasis(ElementType type);
+
     /** @copydoc BasisFunction::basis_type() */
     BasisType basis_type() const noexcept final { return BasisType::Lagrange; }
 
     /** @copydoc BasisFunction::topology() */
     BasisTopology topology() const noexcept final { return topology_; }
 
-    /** @copydoc BasisFunction::element_type() */
-    ElementType element_type() const noexcept final {
-        return named_element_for(topology_, order_, BasisType::Lagrange);
-    }
-
     /** @copydoc BasisFunction::dimension() */
     int dimension() const noexcept final { return dimension_; }
 
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index 25ba94bb4..b52255ab3 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -213,12 +213,11 @@ int hex_serendipity_superlinear_degree(int ax, int ay, int az) {
     return (ax > 1 ? ax : 0) + (ay > 1 ? ay : 0) + (az > 1 ? az : 0);
 }
 
-// Hexahedral serendipity monomial space: every r^a s^b t^c whose superlinear
-// degree is at most `order`. This is the three-axis generalization of
-// quad_serendipity_exponents; at order 1 it is the eight multilinear monomials
-// (the Hex8 space) and at order 2 it is the twenty-monomial Hex20 space. The
-// enumeration order is internal -- evaluation sums over the monomials, so only
-// the node order, not the monomial order, is observable to a caller.
+// Hexahedral serendipity polynomial space: every monomial r^a s^b t^c whose
+// superlinear degree is at most `order`. This is the three-axis generalization
+// of quad_serendipity_exponents; at order 1 it is the eight multilinear
+// monomials (the Hex8 space) and at order 2 it is the twenty-monomial Hex20
+// space.
 std::vector<std::array<int, 3>> hex_serendipity_exponents(int order) {
     std::vector<std::array<int, 3>> exponents;
     for (int az = 0; az <= order; ++az) {
@@ -506,6 +505,19 @@ struct NormalizedSerendipityRequest {
     int order;
 };
 
+int serendipity_named_order(ElementType type) {
+    switch (type) {
+        case ElementType::Hex8:
+            return 1;
+        case ElementType::Quad8:
+        case ElementType::Hex20:
+        case ElementType::Wedge15:
+            return 2;
+        default:
+            return -1;
+    }
+}
+
 // Validate a named serendipity element/order pairing and return its topology,
 // reference dimension, and order. The named serendipity layouts (Quad8, Hex8,
 // Hex20, Wedge15) are each pinned to a single polynomial order by their node
@@ -513,28 +525,28 @@ struct NormalizedSerendipityRequest {
 // quadrilateral serendipity is not a named element: it is requested through the
 // BasisTopology::Quadrilateral constructor.
 NormalizedSerendipityRequest normalize_serendipity_request(ElementType type, int order) {
-    // The named layouts carry an inferred fixed order (Hex8 -> 1; Quad8, Hex20,
-    // and Wedge15 -> 2). The request must supply that exact order: it is never
-    // floored or otherwise adjusted to fit, so order 0 and negative orders are
-    // rejected rather than promoted to a valid layout.
+    // The request must supply the layout's fixed order (serendipity_named_order):
+    // it is never floored or otherwise adjusted to fit, so order 0 and negative
+    // orders are rejected rather than promoted to a valid layout.
+    const int expected_order = serendipity_named_order(type);
     switch (type) {
         case ElementType::Quad8:
-            svmp::throw_if<BasisConfigurationException>(order != 2, SVMP_HERE,
+            svmp::throw_if<BasisConfigurationException>(order != expected_order, SVMP_HERE,
                 "SerendipityBasis: Quad8 is the quadratic 8-node serendipity layout (order 2 only); "
                 "use BasisTopology::Quadrilateral for higher-order quadrilateral serendipity");
-            return {BasisTopology::Quadrilateral, 2, 2};
+            return {BasisTopology::Quadrilateral, 2, expected_order};
         case ElementType::Hex8:
-            svmp::throw_if<BasisConfigurationException>(order != 1, SVMP_HERE,
+            svmp::throw_if<BasisConfigurationException>(order != expected_order, SVMP_HERE,
                 "SerendipityBasis: Hex8 is the trilinear 8-node basis (order 1 only); use Hex20 for quadratic serendipity");
-            return {BasisTopology::Hexahedron, 3, 1};
+            return {BasisTopology::Hexahedron, 3, expected_order};
         case ElementType::Hex20:
-            svmp::throw_if<BasisConfigurationException>(order != 2, SVMP_HERE,
+            svmp::throw_if<BasisConfigurationException>(order != expected_order, SVMP_HERE,
                 "SerendipityBasis: Hex20 is the 20-node quadratic serendipity layout (order 2 only)");
-            return {BasisTopology::Hexahedron, 3, 2};
+            return {BasisTopology::Hexahedron, 3, expected_order};
         case ElementType::Wedge15:
-            svmp::throw_if<BasisConfigurationException>(order != 2, SVMP_HERE,
+            svmp::throw_if<BasisConfigurationException>(order != expected_order, SVMP_HERE,
                 "SerendipityBasis: Wedge15 is the 15-node quadratic serendipity layout (order 2 only)");
-            return {BasisTopology::Wedge, 3, 2};
+            return {BasisTopology::Wedge, 3, expected_order};
         default:
             svmp::raise<BasisElementCompatibilityException>(SVMP_HERE,
                 "SerendipityBasis named elements are Quad8, Hex8, Hex20, and Wedge15; "
@@ -596,12 +608,15 @@ SerendipityBasis::SerendipityBasis(ElementType type, int order)
     }
 }
 
-// Build the quadrilateral serendipity monomial space, reference nodes, and nodal
+SerendipityBasis::SerendipityBasis(ElementType type)
+    : SerendipityBasis(type, serendipity_named_order(type)) {}
+
+// Build the quadrilateral serendipity mode set, reference nodes, and nodal
 // coefficient table for the given order. The coefficient table is the inverse
-// Vandermonde of those monomials at the reference nodes; because the nodes are
-// in public order, evaluation needs no output permutation. The named Quad8
-// layout sources its nodes from ReferenceNodeLayout; the arbitrary-order
-// topology path generates them.
+// Vandermonde of tensor Legendre modes spanning the same polynomial space as the
+// monomial degree triples; because the nodes are in public order, evaluation
+// needs no output permutation. The named Quad8 layout sources its nodes from
+// ReferenceNodeLayout; the arbitrary-order topology path generates them.
 void SerendipityBasis::init_quadrilateral(int order, bool nodes_from_reference_layout) {
     const auto quad_exponents = quad_serendipity_exponents(order);
     monomial_exponents_.clear();
@@ -622,7 +637,7 @@ void SerendipityBasis::init_quadrilateral(int order, bool nodes_from_reference_l
         ModalAxisKind::Legendre, order);
 }
 
-// Build the hexahedral serendipity monomial space, reference nodes, and nodal
+// Build the hexahedral serendipity mode set, reference nodes, and nodal
 // coefficient table for the given order, mirroring init_quadrilateral. The
 // arbitrary-order topology path generates its own VTK-consistent nodes; the named
 // Hex8 (order 1) and Hex20 (order 2) layouts source their public-order nodes from
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.h b/Code/Source/solver/FE/Basis/SerendipityBasis.h
index c9aa0974a..d310461e2 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.h
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.h
@@ -42,19 +42,21 @@ namespace basis {
  * Quad8, Hex8, and Hex20 are the fixed-order instances of these families
  * (quadrilateral order 2, hexahedron orders 1 and 2).
  *
- * Quadrilateral serendipity bases are built from monomials
+ * The quadrilateral serendipity polynomial space is described by monomials
  * @f$x^{a_x}y^{a_y}@f$ whose superlinear degree is at most the requested
- * order. In this implementation the superlinear degree is
+ * order. The implementation evaluates this space through tensor Legendre
+ * modes, which span the same polynomial space but give a better-conditioned
+ * Vandermonde. The superlinear degree is
  * @f[
  *   sldeg(x^{a_x}y^{a_y}) =
  *   \begin{cases} a_x, & a_x > 1 \\ 0, & a_x \le 1 \end{cases}
  *   +
  *   \begin{cases} a_y, & a_y > 1 \\ 0, & a_y \le 1 \end{cases}.
  * @f]
- * The nodal basis is recovered by inverting the Vandermonde interpolation
- * matrix at the selected reference nodes. Values, gradients, and Hessians are
- * then evaluated by differentiating the monomial vector and applying the
- * inverse Vandermonde coefficients.
+ * The nodal basis is recovered by inverting the generalized Vandermonde
+ * interpolation matrix at the selected reference nodes. Values, gradients, and
+ * Hessians are then evaluated by differentiating the modal vector and applying
+ * the inverse Vandermonde coefficients.
  * For order @f$p \ge 1@f$, this space has @f$4p@f$ boundary modes for
  * @f$p \le 3@f$ and
  * @f[
@@ -77,9 +79,10 @@ namespace basis {
  * nonsingular for the implemented quadrilateral serendipity space.
  *
  * Hexahedral serendipity generalizes the same construction to the cube. The
- * monomial space is every @f$r^{a_r}s^{a_s}t^{a_t}@f$ whose superlinear degree
- * (the three-axis form of the rule above) is at most @f$p@f$, and the nodal
- * basis is again the inverse Vandermonde at the reference nodes. Those nodes are
+ * polynomial space is described by every monomial
+ * @f$r^{a_r}s^{a_s}t^{a_t}@f$ whose superlinear degree (the three-axis form of
+ * the rule above) is at most @f$p@f$, and the nodal basis is again the inverse
+ * Vandermonde at the reference nodes. Those nodes are
  * distributed by boundary stratum: 8 corners, @f$12(p-1)@f$ edge nodes,
  * @f$6\,q(p)@f$ face-interior nodes -- each face carries the 2D quadrilateral
  * serendipity interior, since the trace of the cube space on a face is the
@@ -110,12 +113,13 @@ namespace basis {
  *
  * Every supported family -- quadrilateral, hexahedral, and Wedge15 -- is built by
  * inverting the generalized Vandermonde of its mode space at the public-order
- * reference nodes; values, gradients, and Hessians are evaluated by
- * differentiating the mode vector and applying the inverse-Vandermonde
- * coefficients. Because the tables are generated in public node order, evaluation
- * needs no output reordering, and there is no hand-written special case -- the
- * Hex8 basis is the order-1 instance of the generated hexahedral space, not a
- * separate trilinear evaluator.
+ * reference nodes. Quadrilateral and hexahedral bases use tensor Legendre modes;
+ * the fixed Wedge15 table uses monomial modes. Values, gradients, and Hessians
+ * are evaluated by differentiating the matching mode vector and applying the
+ * inverse-Vandermonde coefficients. Because the tables are generated in public
+ * node order, evaluation needs no output reordering, and there is no hand-written
+ * special case -- the Hex8 basis is the order-1 instance of the generated
+ * hexahedral space, not a separate trilinear evaluator.
  *
  * ## Conditioning and the well-conditioned order range
  *
@@ -144,10 +148,10 @@ class SerendipityBasis final : public BasisFunction {
      *
      * @details This is the arbitrary-order entry point for the serendipity
      * families with a free order: the quadrilateral and the hexahedron. The
-     * topology carries no node-count assumption; the monomial space, reference
-     * nodes (generated here in VTK-consistent stratified order), and nodal
-     * coefficient table are built from the requested order (which must be
-     * @f$p \ge 1@f$). Wedge serendipity is a single fixed layout and is not
+     * topology carries no node-count assumption; the serendipity polynomial
+     * space, reference nodes (generated here in VTK-consistent stratified order),
+     * and nodal coefficient table are built from the requested order (which must
+     * be @f$p \ge 1@f$). Wedge serendipity is a single fixed layout and is not
      * constructed this way -- use the named ElementType overload (Wedge15).
      *
      * @param topology Must be BasisTopology::Quadrilateral or BasisTopology::Hexahedron.
@@ -179,17 +183,25 @@ class SerendipityBasis final : public BasisFunction {
      */
     SerendipityBasis(ElementType type, int order);
 
+    /**
+     * @brief Construct a serendipity basis from a named layout at its fixed order.
+     *
+     * @details Single-argument convenience overload for the named serendipity
+     * layouts: the order is the one fixed by the layout (1 for Hex8; 2 for Quad8,
+     * Hex20, and Wedge15), so the caller does not repeat it. Equivalent to
+     * SerendipityBasis(type, <fixed order>).
+     *
+     * @param type Named serendipity element type (Quad8, Hex8, Hex20, or Wedge15).
+     * @throws BasisElementCompatibilityException If the element type is unsupported.
+     */
+    explicit SerendipityBasis(ElementType type);
+
     /** @copydoc BasisFunction::basis_type() */
     BasisType basis_type() const noexcept final { return BasisType::Serendipity; }
 
     /** @copydoc BasisFunction::topology() */
     BasisTopology topology() const noexcept final { return topology_; }
 
-    /** @copydoc BasisFunction::element_type() */
-    ElementType element_type() const noexcept final {
-        return named_element_for(topology_, order_, BasisType::Serendipity);
-    }
-
     /** @copydoc BasisFunction::dimension() */
     int dimension() const noexcept final { return dimension_; }
 
@@ -210,7 +222,7 @@ class SerendipityBasis final : public BasisFunction {
      * generates its nodes here instead, in VTK-consistent stratified order:
      * corners and edges first (matching the public Quad8/Hex8/Hex20 ordering at
      * the named orders), then the face and volume interior points needed to make
-     * the reduced monomial space unisolvent. For @f$p \ge 3@f$ that interior
+     * the reduced polynomial space unisolvent. For @f$p \ge 3@f$ that interior
      * ordering is an implementation convention; callers should pair it with basis
      * values from the same object rather than assume an external mesh ordering
      * contract beyond the supported named production layouts.
@@ -222,10 +234,11 @@ class SerendipityBasis final : public BasisFunction {
     /**
      * @brief Evaluate serendipity basis function values at a reference coordinate.
      *
-     * @details Every family evaluates the serendipity monomial vector and
-     * multiplies by the generated inverse Vandermonde matrix to obtain nodal
-     * shape-function values; the coefficient table is already in public basis
-     * order, so no output reordering is needed.
+     * @details Every family evaluates the serendipity modal vector and multiplies
+     * by the generated inverse Vandermonde matrix to obtain nodal shape-function
+     * values. Quadrilateral and hexahedral bases use tensor Legendre modes; the
+     * fixed Wedge15 layout uses monomial modes. The coefficient table is already
+     * in public basis order, so no output reordering is needed.
      *
      * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
      * @param values Receives one value per basis function.
@@ -237,8 +250,8 @@ class SerendipityBasis final : public BasisFunction {
      * @brief Evaluate analytical serendipity basis gradients at a reference coordinate.
      *
      * @details Gradients are derivatives with respect to reference coordinates.
-     * Every family differentiates the monomial vector and applies the same
-     * generated inverse Vandermonde coefficients used for the values.
+     * Every family differentiates the same modal vector used for values and
+     * applies the generated inverse Vandermonde coefficients.
      *
      * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
      * @param gradients Receives one three-component gradient per basis function.
@@ -251,8 +264,8 @@ class SerendipityBasis final : public BasisFunction {
      *
      * @details Hessians are second derivatives in reference coordinates and are
      * stored as 3-by-3 matrices. Every family uses the second derivatives of the
-     * monomial vector together with the same generated inverse Vandermonde
-     * coefficients used for the values.
+     * same modal vector used for values together with the generated inverse
+     * Vandermonde coefficients.
      *
      * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
      * @param hessians Receives one 3-by-3 Hessian per basis function.
@@ -319,12 +332,12 @@ class SerendipityBasis final : public BasisFunction {
     // the same family the coefficient table was built with.
     bool uses_legendre_modes_{false};
 
-    // Build the quadrilateral serendipity monomial space, reference nodes, and
-    // nodal coefficient table for the given order. The named Quad8 layout takes
+    // Build the quadrilateral serendipity mode set, reference nodes, and nodal
+    // coefficient table for the given order. The named Quad8 layout takes
     // its nodes from ReferenceNodeLayout; the arbitrary-order topology path
     // generates them.
     void init_quadrilateral(int order, bool nodes_from_reference_layout);
-    // Build the hexahedral serendipity monomial space, reference nodes, and nodal
+    // Build the hexahedral serendipity mode set, reference nodes, and nodal
     // coefficient table for the given order. The arbitrary-order topology path
     // generates VTK-consistent nodes; the named Hex8 (order 1) and Hex20 (order 2)
     // layouts take their public-order nodes from ReferenceNodeLayout.
diff --git a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
index beb3028bf..e2bb1b9bd 100644
--- a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
@@ -42,7 +42,6 @@ class MinimalScalarBasis : public BasisFunction {
 public:
     BasisType basis_type() const noexcept override { return BasisType::Lagrange; }
     BasisTopology topology() const noexcept override { return BasisTopology::Line; }
-    ElementType element_type() const noexcept override { return ElementType::Line2; }
     int dimension() const noexcept override { return 1; }
     int order() const noexcept override { return 1; }
     std::size_t size() const noexcept override { return 2u; }
@@ -65,7 +64,6 @@ class ExactQuadraticBasis : public BasisFunction {
 
     BasisType basis_type() const noexcept override { return BasisType::Custom; }
     BasisTopology topology() const noexcept override { return BasisTopology::Hexahedron; }
-    ElementType element_type() const noexcept override { return ElementType::Hex8; }
     int dimension() const noexcept override { return 3; }
     int order() const noexcept override { return 2; }
     std::size_t size() const noexcept override { return 2u; }
@@ -114,7 +112,6 @@ class CompleteFallbackBasis : public BasisFunction {
 public:
     BasisType basis_type() const noexcept override { return BasisType::Lagrange; }
     BasisTopology topology() const noexcept override { return BasisTopology::Triangle; }
-    ElementType element_type() const noexcept override { return ElementType::Triangle3; }
     int dimension() const noexcept override { return 2; }
     int order() const noexcept override { return 1; }
     std::size_t size() const noexcept override { return 2u; }
diff --git a/tests/unitTests/FE/Basis/test_BasisHessians.cpp b/tests/unitTests/FE/Basis/test_BasisHessians.cpp
index d80955eb3..858e7c13b 100644
--- a/tests/unitTests/FE/Basis/test_BasisHessians.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisHessians.cpp
@@ -110,7 +110,7 @@ void expect_gradients_match_numerical(const BasisFunction& basis,
                 const std::size_t sd = static_cast<std::size_t>(d);
                 EXPECT_NEAR(analytical[n][sd], numerical[n][sd], tol)
                     << "basis " << n << ", component " << d
-                    << ", element " << static_cast<int>(basis.element_type())
+                    << ", element " << static_cast<int>(named_element_for(basis.topology(), basis.order(), basis.basis_type()))
                     << ", order " << basis.order();
             }
         }
@@ -136,7 +136,7 @@ void expect_hessians_match_numerical(const BasisFunction& basis,
                     const std::size_t sj = static_cast<std::size_t>(j);
                     EXPECT_NEAR(analytical[n](si, sj), numerical[n](si, sj), tol)
                         << "basis " << n << ", component (" << i << "," << j
-                        << "), element " << static_cast<int>(basis.element_type())
+                        << "), element " << static_cast<int>(named_element_for(basis.topology(), basis.order(), basis.basis_type()))
                         << ", order " << basis.order();
                 }
             }
@@ -165,7 +165,7 @@ void expect_partition_hessian_sum_zero(const BasisFunction& basis,
             EXPECT_NEAR(sum(static_cast<std::size_t>(r), static_cast<std::size_t>(c)),
                         double(0),
                         tol)
-                << "element " << static_cast<int>(basis.element_type())
+                << "element " << static_cast<int>(named_element_for(basis.topology(), basis.order(), basis.basis_type()))
                 << ", order " << basis.order();
         }
     }
@@ -205,16 +205,16 @@ void expect_inactive_z_derivatives_zero(const BasisFunction& basis,
         for (std::size_t n = 0; n < basis.size(); ++n) {
             EXPECT_NEAR(gradients[n][2], double(0), tol)
                 << "basis " << n << ", element "
-                << static_cast<int>(basis.element_type())
+                << static_cast<int>(named_element_for(basis.topology(), basis.order(), basis.basis_type()))
                 << ", order " << basis.order();
             for (std::size_t d = 0; d < 3u; ++d) {
                 EXPECT_NEAR(hessians[n](2, d), double(0), tol)
                     << "basis " << n << ", component (2," << d
-                    << "), element " << static_cast<int>(basis.element_type())
+                    << "), element " << static_cast<int>(named_element_for(basis.topology(), basis.order(), basis.basis_type()))
                     << ", order " << basis.order();
                 EXPECT_NEAR(hessians[n](d, 2), double(0), tol)
                     << "basis " << n << ", component (" << d
-                    << ",2), element " << static_cast<int>(basis.element_type())
+                    << ",2), element " << static_cast<int>(named_element_for(basis.topology(), basis.order(), basis.basis_type()))
                     << ", order " << basis.order();
             }
         }
diff --git a/tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp b/tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp
index c5db1ebac..dd4511b18 100644
--- a/tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp
+++ b/tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp
@@ -112,7 +112,8 @@ TEST(HigherOrderWedge, CompleteAliasMatchesGeneratedNodeLayout) {
 
     ASSERT_EQ(generated.size(), ReferenceNodeLayout::num_nodes(ElementType::Wedge18));
     EXPECT_EQ(alias_basis.topology(), BasisTopology::Wedge);
-    EXPECT_EQ(alias_basis.element_type(), ElementType::Wedge18);  // faithful round-trip
+    EXPECT_EQ(named_element_for(alias_basis.topology(), alias_basis.order(), alias_basis.basis_type()),
+              ElementType::Wedge18);  // faithful round-trip
     EXPECT_EQ(alias_basis.order(), 2);
     expect_nodes_close(alias_basis.nodes(), generated, double(1e-14));
 }
diff --git a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
index efd733a7a..a171ca6d4 100644
--- a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
@@ -262,8 +262,8 @@ TEST(LagrangeBasis, SpanOutputSinksMatchVectorEvaluationAcrossTopologies) {
 
 // A named quadratic alias is a fixed-order shorthand for the same topology at
 // order 2: it builds the identical basis as the BasisTopology overload, reports
-// that topology, and round-trips faithfully through element_type() (Hex27 stays
-// Hex27 rather than collapsing to a canonical linear type).
+// that topology, and round-trips faithfully through named_element_for() (Hex27
+// stays Hex27 rather than collapsing to a canonical linear type).
 TEST(LagrangeBasis, CompleteAliasesMatchTopologyConstruction) {
     const std::vector<std::tuple<ElementType, BasisTopology, ElementType>> aliases = {
         {ElementType::Line3, BasisTopology::Line, ElementType::Line2},
@@ -280,7 +280,8 @@ TEST(LagrangeBasis, CompleteAliasesMatchTopologyConstruction) {
         const auto generated = ReferenceNodeLayout::get_lagrange_node_coords(representative, 2);
 
         EXPECT_EQ(alias_basis.topology(), topo);
-        EXPECT_EQ(alias_basis.element_type(), alias);
+        EXPECT_EQ(named_element_for(alias_basis.topology(), alias_basis.order(), alias_basis.basis_type()),
+                  alias);
         EXPECT_EQ(alias_basis.order(), 2);
         expect_nodes_close(alias_basis.nodes(), generated, double(1e-14));
         expect_nodes_close(alias_basis.nodes(), topo_basis.nodes(), double(1e-14));
@@ -293,7 +294,7 @@ TEST(LagrangeBasis, CompleteAliasesMatchTopologyConstruction) {
 
 // The arbitrary order a named alias rejects is exactly what the BasisTopology
 // overload is for: a node-count-named element cannot carry a conflicting order,
-// and an order with no named layout reports element_type() == Unknown.
+// and an order with no named layout maps to named_element_for() == Unknown.
 TEST(LagrangeBasis, ArbitraryOrderRequiresTopologyNotNamedAlias) {
     EXPECT_THROW((void)LagrangeBasis(ElementType::Hex27, 3), BasisConfigurationException);
 
@@ -301,22 +302,49 @@ TEST(LagrangeBasis, ArbitraryOrderRequiresTopologyNotNamedAlias) {
     EXPECT_EQ(basis.topology(), BasisTopology::Hexahedron);
     EXPECT_EQ(basis.order(), 5);
     EXPECT_EQ(basis.size(), 216u);  // (5 + 1)^3
-    EXPECT_EQ(basis.element_type(), ElementType::Unknown);  // no named order-5 hex
-}
-
-// element_type() is the inverse of (topology, order): a named layout at orders
-// 0-2, Unknown where no named element exists (order 0 on a volume topology, or
-// any order >= 3). topology() + order() remain the authoritative identity.
-TEST(LagrangeBasis, ElementTypeAccessorReflectsTopologyAndOrder) {
-    EXPECT_EQ(LagrangeBasis(BasisTopology::Point, 0).element_type(), ElementType::Point1);
-    EXPECT_EQ(LagrangeBasis(BasisTopology::Hexahedron, 1).element_type(), ElementType::Hex8);
-    EXPECT_EQ(LagrangeBasis(BasisTopology::Hexahedron, 2).element_type(), ElementType::Hex27);
-    EXPECT_EQ(LagrangeBasis(BasisTopology::Hexahedron, 3).element_type(), ElementType::Unknown);
-    EXPECT_EQ(LagrangeBasis(BasisTopology::Tetrahedron, 1).element_type(), ElementType::Tetra4);
-    EXPECT_EQ(LagrangeBasis(BasisTopology::Tetrahedron, 2).element_type(), ElementType::Tetra10);
-    EXPECT_EQ(LagrangeBasis(BasisTopology::Quadrilateral, 2).element_type(), ElementType::Quad9);
+    EXPECT_EQ(named_element_for(basis.topology(), basis.order(), basis.basis_type()),
+              ElementType::Unknown);  // no named order-5 hex
+}
+
+// named_element_for() is the inverse of (topology, order): a named layout at
+// orders 0-2, Unknown where no named element exists (order 0 on a volume
+// topology, or any order >= 3). topology() + order() remain the authoritative
+// identity; callers recover a named ElementType through this free helper.
+TEST(LagrangeBasis, NamedElementForReflectsTopologyAndOrder) {
+    EXPECT_EQ(named_element_for(BasisTopology::Point, 0, BasisType::Lagrange), ElementType::Point1);
+    EXPECT_EQ(named_element_for(BasisTopology::Hexahedron, 1, BasisType::Lagrange), ElementType::Hex8);
+    EXPECT_EQ(named_element_for(BasisTopology::Hexahedron, 2, BasisType::Lagrange), ElementType::Hex27);
+    EXPECT_EQ(named_element_for(BasisTopology::Hexahedron, 3, BasisType::Lagrange), ElementType::Unknown);
+    EXPECT_EQ(named_element_for(BasisTopology::Tetrahedron, 1, BasisType::Lagrange), ElementType::Tetra4);
+    EXPECT_EQ(named_element_for(BasisTopology::Tetrahedron, 2, BasisType::Lagrange), ElementType::Tetra10);
+    EXPECT_EQ(named_element_for(BasisTopology::Quadrilateral, 2, BasisType::Lagrange), ElementType::Quad9);
     // An order-0 P0 basis on a volume topology has no named element.
-    EXPECT_EQ(LagrangeBasis(BasisTopology::Hexahedron, 0).element_type(), ElementType::Unknown);
+    EXPECT_EQ(named_element_for(BasisTopology::Hexahedron, 0, BasisType::Lagrange), ElementType::Unknown);
+}
+
+// The single-argument named overload infers the layout's baked-in order, so it
+// builds the same basis as passing that order explicitly, and still rejects the
+// non-Lagrange (serendipity/pyramid) layouts the two-argument overload rejects.
+TEST(LagrangeBasis, SingleArgumentNamedOverloadInfersBakedOrder) {
+    const std::vector<std::pair<ElementType, int>> named = {
+        {ElementType::Point1, 0},
+        {ElementType::Line2, 1},     {ElementType::Line3, 2},
+        {ElementType::Triangle3, 1}, {ElementType::Triangle6, 2},
+        {ElementType::Quad4, 1},     {ElementType::Quad9, 2},
+        {ElementType::Tetra4, 1},    {ElementType::Tetra10, 2},
+        {ElementType::Hex8, 1},      {ElementType::Hex27, 2},
+        {ElementType::Wedge6, 1},    {ElementType::Wedge18, 2},
+    };
+    for (const auto& [type, baked_order] : named) {
+        const LagrangeBasis inferred(type);
+        const LagrangeBasis explicit_order(type, baked_order);
+        EXPECT_EQ(inferred.order(), baked_order) << "type=" << static_cast<int>(type);
+        EXPECT_EQ(inferred.topology(), explicit_order.topology()) << "type=" << static_cast<int>(type);
+        EXPECT_EQ(inferred.size(), explicit_order.size()) << "type=" << static_cast<int>(type);
+    }
+
+    EXPECT_THROW((void)LagrangeBasis(ElementType::Quad8), BasisElementCompatibilityException);
+    EXPECT_THROW((void)LagrangeBasis(ElementType::Pyramid5), BasisElementCompatibilityException);
 }
 
 TEST(LagrangeBasis, NodeOrderingMatchesPublicAliasLayouts) {
@@ -583,7 +611,8 @@ TEST(LagrangeBasis, CubicPolynomialReproductionAtOrderThree) {
 TEST(LagrangeBasis, PointTopologyEvaluatesConstantUnity) {
     LagrangeBasis basis(ElementType::Point1, 0);
 
-    EXPECT_EQ(basis.element_type(), ElementType::Point1);
+    EXPECT_EQ(named_element_for(basis.topology(), basis.order(), basis.basis_type()),
+              ElementType::Point1);
     EXPECT_EQ(basis.size(), 1u);
     EXPECT_EQ(basis.dimension(), 0);
     ASSERT_EQ(basis.nodes().size(), 1u);
@@ -730,7 +759,8 @@ TEST(LagrangeBasis, FactoryCreatesReducedScalarBasisFamilies) {
     ASSERT_NE(lagrange, nullptr);
     EXPECT_EQ(lagrange->basis_type(), BasisType::Lagrange);
     EXPECT_EQ(lagrange->topology(), BasisTopology::Hexahedron);
-    EXPECT_EQ(lagrange->element_type(), ElementType::Hex27);
+    EXPECT_EQ(named_element_for(lagrange->topology(), lagrange->order(), lagrange->basis_type()),
+              ElementType::Hex27);
     EXPECT_EQ(lagrange->order(), 2);
 
     // The factory inherits the named-element order validation: Hex27 is order 2.
@@ -746,7 +776,9 @@ TEST(LagrangeBasis, FactoryCreatesReducedScalarBasisFamilies) {
     ASSERT_NE(high_order_lagrange, nullptr);
     EXPECT_EQ(high_order_lagrange->basis_type(), BasisType::Lagrange);
     EXPECT_EQ(high_order_lagrange->topology(), BasisTopology::Hexahedron);
-    EXPECT_EQ(high_order_lagrange->element_type(), ElementType::Unknown);
+    EXPECT_EQ(named_element_for(high_order_lagrange->topology(), high_order_lagrange->order(),
+                               high_order_lagrange->basis_type()),
+              ElementType::Unknown);
     EXPECT_EQ(high_order_lagrange->order(), 5);
     EXPECT_EQ(high_order_lagrange->size(), 216u);
 
@@ -763,7 +795,10 @@ TEST(LagrangeBasis, FactoryCreatesReducedScalarBasisFamilies) {
     ASSERT_NE(high_order_quad_serendipity, nullptr);
     EXPECT_EQ(high_order_quad_serendipity->basis_type(), BasisType::Serendipity);
     EXPECT_EQ(high_order_quad_serendipity->topology(), BasisTopology::Quadrilateral);
-    EXPECT_EQ(high_order_quad_serendipity->element_type(), ElementType::Unknown);
+    EXPECT_EQ(named_element_for(high_order_quad_serendipity->topology(),
+                               high_order_quad_serendipity->order(),
+                               high_order_quad_serendipity->basis_type()),
+              ElementType::Unknown);
     EXPECT_EQ(high_order_quad_serendipity->order(), 4);
     EXPECT_EQ(high_order_quad_serendipity->size(), 17u);
 
@@ -775,7 +810,10 @@ TEST(LagrangeBasis, FactoryCreatesReducedScalarBasisFamilies) {
     ASSERT_NE(high_order_hex_serendipity, nullptr);
     EXPECT_EQ(high_order_hex_serendipity->basis_type(), BasisType::Serendipity);
     EXPECT_EQ(high_order_hex_serendipity->topology(), BasisTopology::Hexahedron);
-    EXPECT_EQ(high_order_hex_serendipity->element_type(), ElementType::Unknown);
+    EXPECT_EQ(named_element_for(high_order_hex_serendipity->topology(),
+                               high_order_hex_serendipity->order(),
+                               high_order_hex_serendipity->basis_type()),
+              ElementType::Unknown);
     EXPECT_EQ(high_order_hex_serendipity->order(), 3);
     EXPECT_EQ(high_order_hex_serendipity->size(), 32u);
 
diff --git a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
index 668eaa73a..605a1bacd 100644
--- a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
@@ -548,12 +548,33 @@ TEST(SerendipityBasis, TopologyConstructionSupportsQuadrilateralAndHexahedron) {
     SerendipityBasis quad(BasisTopology::Quadrilateral, 2);
     EXPECT_EQ(quad.topology(), BasisTopology::Quadrilateral);
     EXPECT_EQ(quad.order(), 2);
-    EXPECT_EQ(quad.element_type(), ElementType::Quad8);
+    EXPECT_EQ(named_element_for(quad.topology(), quad.order(), quad.basis_type()),
+              ElementType::Quad8);
 
     SerendipityBasis hex(BasisTopology::Hexahedron, 2);
     EXPECT_EQ(hex.topology(), BasisTopology::Hexahedron);
     EXPECT_EQ(hex.order(), 2);
-    EXPECT_EQ(hex.element_type(), ElementType::Hex20);
+    EXPECT_EQ(named_element_for(hex.topology(), hex.order(), hex.basis_type()),
+              ElementType::Hex20);
+}
+
+TEST(SerendipityBasis, SingleArgumentNamedOverloadInfersFixedOrder) {
+    const std::vector<std::pair<ElementType, int>> named = {
+        {ElementType::Quad8, 2},
+        {ElementType::Hex8, 1},
+        {ElementType::Hex20, 2},
+        {ElementType::Wedge15, 2},
+    };
+    for (const auto& [type, fixed_order] : named) {
+        const SerendipityBasis inferred(type);
+        const SerendipityBasis explicit_order(type, fixed_order);
+        EXPECT_EQ(inferred.order(), fixed_order) << "type=" << static_cast<int>(type);
+        EXPECT_EQ(inferred.topology(), explicit_order.topology()) << "type=" << static_cast<int>(type);
+        EXPECT_EQ(inferred.size(), explicit_order.size()) << "type=" << static_cast<int>(type);
+    }
+
+    EXPECT_THROW((void)SerendipityBasis(ElementType::Quad4), BasisElementCompatibilityException);
+    EXPECT_THROW((void)SerendipityBasis(ElementType::Tetra4), BasisElementCompatibilityException);
 }
 
 TEST(SerendipityBasis, QuadrilateralRejectsOrdersBelowOne) {

From 2376874773f7c2afb3d5793afea3a819d288f550 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 24 Jun 2026 17:54:59 -0700
Subject: [PATCH 68/91] removing un-used code and adding covergence criteria
 for high-order basis functions

---
 Code/Source/solver/FE/Basis/BasisTraits.h     | 85 +++++--------------
 .../FE/Basis/NodeOrderingConventions.cpp      | 16 +++-
 .../FE/Basis/test_ConstexprBasis.cpp          | 26 ------
 3 files changed, 35 insertions(+), 92 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisTraits.h b/Code/Source/solver/FE/Basis/BasisTraits.h
index 1743cdb85..a9235d32c 100644
--- a/Code/Source/solver/FE/Basis/BasisTraits.h
+++ b/Code/Source/solver/FE/Basis/BasisTraits.h
@@ -7,7 +7,6 @@
 #include "Types.h"
 
 #include <cstddef>
-#include <limits>
 
 namespace svmp {
 namespace FE {
@@ -24,36 +23,27 @@ enum class BasisTopology {
     Wedge,
 };
 
-namespace detail {
-
-[[nodiscard]] constexpr double basis_abs(double value) noexcept {
-    return value < double(0) ? -value : value;
-}
-
-[[nodiscard]] constexpr double basis_max(double lhs, double rhs) noexcept {
-    return lhs < rhs ? rhs : lhs;
-}
-
-[[nodiscard]] constexpr double basis_scaled_tolerance(double scale = double(1),
-                                                    double multiplier = double(64)) noexcept {
-    return multiplier * std::numeric_limits<double>::epsilon() *
-           basis_max(double(1), basis_abs(scale));
-}
-
-[[nodiscard]] constexpr bool basis_near_zero(double value,
-                                             double scale = double(1),
-                                             double multiplier = double(64)) noexcept {
-    return basis_abs(value) <= basis_scaled_tolerance(scale, multiplier);
-}
-
-[[nodiscard]] constexpr bool basis_nearly_equal(double a,
-                                                double b,
-                                                double multiplier = double(64)) noexcept {
-    const double scale = basis_max(double(1), basis_max(basis_abs(a), basis_abs(b)));
-    return basis_abs(a - b) <= basis_scaled_tolerance(scale, multiplier);
-}
-
-} // namespace detail
+// ---------------------------------------------------------------------------
+// ElementType / BasisTopology / order mapping helpers.
+//
+// A basis identity is expressed three ways -- a named ElementType, a
+// (BasisTopology, order) pair, and a reference dimension -- and the constexpr
+// maps below convert between them. They are grouped here so the relationships
+// stay in one place:
+//
+//   ElementType   -> BasisTopology    topology()
+//   ElementType   -> ElementType      canonical_lagrange_type()   (alias -> linear representative)
+//   ElementType   -> order            complete_lagrange_alias_order(), named_lagrange_order()
+//   BasisTopology -> int (dimension)  topology_dimension()
+//   BasisTopology -> ElementType      lagrange_topology_representative() (lowest-order representative)
+//   (BasisTopology, order, family) -> ElementType   named_element_for() (inverse of topology() + order())
+//
+// The two ElementType -> order maps differ only at Point1:
+// complete_lagrange_alias_order() returns -1 (a point is not a complete-Lagrange
+// alias) while named_lagrange_order() returns 0 (the point layout's order).
+// named_lagrange_order() is defined in terms of complete_lagrange_alias_order(),
+// so the order-1 / order-2 alias values have a single source of truth.
+// ---------------------------------------------------------------------------
 
 // Reference-cell topology is derived from the single mesh cell-family
 // classification (to_mesh_family) so the basis layer never maintains a parallel
@@ -77,39 +67,6 @@ namespace detail {
     }
 }
 
-// The shape predicates derive from topology() so they share its single source.
-[[nodiscard]] constexpr bool is_point(ElementType type) noexcept {
-    return topology(type) == BasisTopology::Point;
-}
-
-[[nodiscard]] constexpr bool is_line(ElementType type) noexcept {
-    return topology(type) == BasisTopology::Line;
-}
-
-[[nodiscard]] constexpr bool is_triangle(ElementType type) noexcept {
-    return topology(type) == BasisTopology::Triangle;
-}
-
-[[nodiscard]] constexpr bool is_quadrilateral(ElementType type) noexcept {
-    return topology(type) == BasisTopology::Quadrilateral;
-}
-
-[[nodiscard]] constexpr bool is_tetrahedron(ElementType type) noexcept {
-    return topology(type) == BasisTopology::Tetrahedron;
-}
-
-[[nodiscard]] constexpr bool is_hexahedron(ElementType type) noexcept {
-    return topology(type) == BasisTopology::Hexahedron;
-}
-
-[[nodiscard]] constexpr bool is_wedge(ElementType type) noexcept {
-    return topology(type) == BasisTopology::Wedge;
-}
-
-[[nodiscard]] constexpr int reference_dimension(ElementType type) noexcept {
-    return element_dimension(type);
-}
-
 [[nodiscard]] constexpr ElementType canonical_lagrange_type(ElementType type) noexcept {
     switch (type) {
         case ElementType::Line2:
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
index d70ea4293..571ba0476 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
@@ -31,6 +31,12 @@ const std::vector<double>& gll_points(int order) {
         return found->second;
     }
 
+    // Newton converges quadratically from the Chebyshev-Gauss-Lobatto seed, so a
+    // few iterations suffice for any practical order; the cap is only a safety
+    // bound. Reaching it without meeting the tolerance signals a real failure
+    constexpr int kMaxNewtonIterations = 100;
+    constexpr double kNewtonTolerance = double(1e-15);
+
     std::vector<double> pts(static_cast<std::size_t>(order + 1), double(0));
     if (order >= 1) {
         pts.front() = double(-1);
@@ -44,7 +50,8 @@ const std::vector<double>& gll_points(int order) {
             continue;
         }
         double x = -std::cos(pi * static_cast<double>(j) / static_cast<double>(order));
-        for (int iter = 0; iter < 100; ++iter) {
+        bool converged = false;
+        for (int iter = 0; iter < kMaxNewtonIterations; ++iter) {
             // Legendre P_k and P'_k up to k = order at x, by the three-term
             // recurrences (regular at x = +/-1).
             double p_km1 = double(1);   // P_0
@@ -68,10 +75,15 @@ const std::vector<double>& gll_points(int order) {
             const double f_prime = p_k + x * d_k - d_km1;
             const double dx = f / f_prime;
             x -= dx;
-            if (std::abs(dx) <= double(1e-15)) {
+            if (std::abs(dx) <= kNewtonTolerance) {
+                converged = true;
                 break;
             }
         }
+        svmp::throw_if<BasisConstructionException>(
+            !converged, SVMP_HERE,
+            "ReferenceNodeLayout: Gauss-Lobatto-Legendre Newton iteration did not converge "
+            "(order outside the trustworthy range)");
         pts[static_cast<std::size_t>(j)] = x;
     }
     for (int j = half + 1; j < order; ++j) {
diff --git a/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp b/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
index 209d15c48..39e1ddece 100644
--- a/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
@@ -9,7 +9,6 @@
 
 #include <gtest/gtest.h>
 
-#include <limits>
 #include <tuple>
 #include <vector>
 
@@ -18,24 +17,11 @@ namespace FE {
 namespace basis {
 namespace {
 
-static_assert(is_line(ElementType::Line2));
-static_assert(is_line(ElementType::Line3));
-static_assert(is_triangle(ElementType::Triangle6));
-static_assert(is_quadrilateral(ElementType::Quad8));
-static_assert(is_tetrahedron(ElementType::Tetra10));
-static_assert(is_hexahedron(ElementType::Hex20));
-static_assert(is_wedge(ElementType::Wedge18));
 static_assert(topology(ElementType::Pyramid5) == BasisTopology::Unknown);
 static_assert(canonical_lagrange_type(ElementType::Hex27) == ElementType::Hex8);
 static_assert(canonical_lagrange_type(ElementType::Pyramid13) == ElementType::Pyramid13);
 static_assert(complete_lagrange_alias_order(ElementType::Wedge18) == 2);
 static_assert(complete_lagrange_alias_order(ElementType::Pyramid14) == -1);
-static_assert(detail::basis_abs(double(-2)) == double(2));
-static_assert(detail::basis_max(double(2), double(3)) == double(3));
-static_assert(detail::basis_near_zero(detail::basis_scaled_tolerance() * double(0.5)));
-static_assert(detail::basis_nearly_equal(
-    double(1),
-    double(1) + detail::basis_scaled_tolerance() * double(0.5)));
 
 // Topology/order helpers backing the BasisTopology construction path.
 static_assert(topology_dimension(BasisTopology::Line) == 1);
@@ -77,18 +63,6 @@ TEST(ConstexprBasis, FixedNodeTableSizesForSupportedLayouts) {
     }
 }
 
-TEST(ConstexprBasis, TraitToleranceScalesWithDoublePrecision) {
-    const double eps = std::numeric_limits<double>::epsilon();
-    const double tol = detail::basis_scaled_tolerance();
-    // Probes straddle the tolerance itself rather than hardcoding the multiplier,
-    // so retuning basis_scaled_tolerance cannot silently invalidate them.
-    EXPECT_GT(tol, eps);
-    EXPECT_TRUE(detail::basis_near_zero(tol * double(0.5)));
-    EXPECT_FALSE(detail::basis_near_zero(tol * double(2)));
-    EXPECT_TRUE(detail::basis_nearly_equal(double(1), double(1) + tol * double(0.5)));
-    EXPECT_FALSE(detail::basis_nearly_equal(double(1), double(1) + tol * double(2)));
-}
-
 TEST(ConstexprBasis, CompleteAliasTablesMatchGeneratedLagrangeNodes) {
     const std::vector<std::tuple<ElementType, ElementType, int>> aliases = {
         {ElementType::Line2, ElementType::Line2, 1},

From ff8c289ee2899b7416a9178e1b184886ac549e31 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 24 Jun 2026 18:14:22 -0700
Subject: [PATCH 69/91] BasisNodeOrderingException for negative or oversized i,
 while preserving order <= 0

---
 Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp | 6 ++++++
 Code/Source/solver/FE/Basis/NodeOrderingConventions.h   | 5 +++--
 tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp       | 8 ++++++++
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
index 571ba0476..7f0dd09fe 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
@@ -683,8 +683,14 @@ void validate_lattice(const LagrangeNodeLayout& layout, ElementType type, int or
 
 double line_coord_pm_one(int i, int order) {
     if (order <= 0) {
+        svmp::throw_if<BasisNodeOrderingException>(
+            i != 0, SVMP_HERE,
+            "ReferenceNodeLayout::line_coord_pm_one: node index out of range");
         return double(0);
     }
+    svmp::throw_if<BasisNodeOrderingException>(
+        i < 0 || i > order, SVMP_HERE,
+        "ReferenceNodeLayout::line_coord_pm_one: node index out of range");
     return gll_points(order)[static_cast<std::size_t>(i)];
 }
 
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
index 62658d4a3..4545c581f 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
@@ -26,14 +26,15 @@ namespace basis {
  * exponential growth of equispaced nodes). At order 1 the nodes are
  * @f$\{-1, +1\}@f$ and at order 2 @f$\{-1, 0, +1\}@f$, so they coincide with the
  * equispaced layout for the production orders and differ only for order >= 3.
- * Returns 0 for order <= 0.
+ * Returns 0 for order <= 0 when @p i is 0. Invalid indices throw.
  *
  * Shared by the reference-node layout generators and the Lagrange tensor-axis
  * node initialization so the 1D distribution lives in a single place.
  *
- * @param i Node index in [0, order].
+ * @param i Node index in [0, order] for positive orders, or 0 for order <= 0.
  * @param order Polynomial order of the 1D distribution.
  * @return GLL node coordinate on [-1, 1].
+ * @throws BasisNodeOrderingException If @p i is outside the valid range.
  */
 [[nodiscard]] double line_coord_pm_one(int i, int order);
 
diff --git a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
index e2bb1b9bd..a9d6b864b 100644
--- a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
@@ -321,6 +321,14 @@ TEST(BasisErrorPaths, CoreHelpersPreserveSourceLocation) {
 }
 
 TEST(BasisErrorPaths, NodeOrderingInvalidNodeThrows) {
+    EXPECT_THROW((void)line_coord_pm_one(-1, 1),
+                 BasisNodeOrderingException);
+    EXPECT_THROW((void)line_coord_pm_one(2, 1),
+                 BasisNodeOrderingException);
+    EXPECT_THROW((void)line_coord_pm_one(-1, 0),
+                 BasisNodeOrderingException);
+    EXPECT_THROW((void)line_coord_pm_one(1, 0),
+                 BasisNodeOrderingException);
     EXPECT_THROW((void)ReferenceNodeLayout::get_node_coords(ElementType::Quad8, 99u),
                  BasisNodeOrderingException);
     EXPECT_THROW((void)ReferenceNodeLayout::get_lagrange_node_coords(ElementType::Quad8, 2),

From 7544c4d606ab836f9e64943aa5e91d66efe79e1f Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 24 Jun 2026 18:38:28 -0700
Subject: [PATCH 70/91] renamed misleading identifiers

---
 Code/Source/solver/FE/Basis/LagrangeBasis.h   |   9 ++
 .../FE/Basis/NodeOrderingConventions.cpp      |  71 ++++++------
 .../solver/FE/Basis/NodeOrderingConventions.h |  10 +-
 .../solver/FE/Basis/SerendipityBasis.cpp      | 106 +++++++++---------
 .../Source/solver/FE/Basis/SerendipityBasis.h |  26 ++---
 .../FE/Basis/test_BasisErrorPaths.cpp         |   2 +-
 .../FE/Basis/test_ConstexprBasis.cpp          |   2 +-
 .../unitTests/FE/Basis/test_LagrangeBasis.cpp |   2 +-
 .../FE/Basis/test_SerendipityBasis.cpp        |   2 +-
 9 files changed, 120 insertions(+), 110 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.h b/Code/Source/solver/FE/Basis/LagrangeBasis.h
index 9cf81fa4f..76cc92590 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.h
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.h
@@ -311,6 +311,15 @@ class LagrangeBasis final : public BasisFunction {
     int dimension_{0};
     int order_{0};
 
+    // Topology-specific construction data. nodes_ (the reference nodes in basis
+    // order) is populated for every topology and backs size(); each remaining
+    // vector is filled only for the topologies that use it and stays empty
+    // otherwise:
+    //   line/quad/hex : nodes_1d_, nodes_1d_weights_, tensor_indices_
+    //   triangle/tetra: simplex_exponents_
+    //   wedge         : nodes_1d_, nodes_1d_weights_, wedge_indices_, and
+    //                   simplex_exponents_ (the triangle cross-section exponents)
+    //   point         : nodes_ only
     std::vector<double> nodes_1d_;
     std::vector<double> nodes_1d_weights_;
     std::vector<math::Vector<double, 3>> nodes_;
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
index 7f0dd09fe..aabd861c9 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
@@ -8,6 +8,7 @@
 #include <array>
 #include <cmath>
 #include <map>
+#include <span>
 #include <utility>
 
 namespace svmp {
@@ -157,12 +158,31 @@ void append_triangle_face_interior(LagrangeNodeLayout& out,
     }
 }
 
+// One-node layout for the order-0 (constant) basis: the element centroid carried
+// with the origin lattice index. Shared by every generator's order-0 branch.
+LagrangeNodeLayout single_node_layout(const Point& centroid) {
+    LagrangeNodeLayout out;
+    out.coords.push_back(centroid);
+    out.lattice.push_back(Lattice{0, 0, 0});
+    return out;
+}
+
+// Append the element corner vertices (coordinate paired with lattice index) in
+// the given order. Shared by the volume generators, which all open with the same
+// corner loop.
+void append_vertices(LagrangeNodeLayout& out,
+                     std::span<const Point> verts,
+                     std::span<const Lattice> vert_lattice) {
+    for (std::size_t v = 0; v < verts.size(); ++v) {
+        out.coords.push_back(verts[v]);
+        out.lattice.push_back(vert_lattice[v]);
+    }
+}
+
 LagrangeNodeLayout generate_line_nodes(int order) {
     LagrangeNodeLayout out;
     if (order == 0) {
-        out.coords.push_back(Point{double(0), double(0), double(0)});
-        out.lattice.push_back(Lattice{0, 0, 0});
-        return out;
+        return single_node_layout(Point{double(0), double(0), double(0)});
     }
 
     out.coords.reserve(static_cast<std::size_t>(order + 1));
@@ -181,9 +201,7 @@ LagrangeNodeLayout generate_line_nodes(int order) {
 LagrangeNodeLayout generate_triangle_nodes(int order) {
     LagrangeNodeLayout out;
     if (order == 0) {
-        out.coords.push_back(Point{double(1) / double(3), double(1) / double(3), double(0)});
-        out.lattice.push_back(Lattice{0, 0, 0});
-        return out;
+        return single_node_layout(Point{double(1) / double(3), double(1) / double(3), double(0)});
     }
 
     out.coords.reserve(static_cast<std::size_t>((order + 1) * (order + 2) / 2));
@@ -223,9 +241,7 @@ LagrangeNodeLayout generate_triangle_nodes(int order) {
 LagrangeNodeLayout generate_quad_nodes(int order) {
     LagrangeNodeLayout out;
     if (order == 0) {
-        out.coords.push_back(Point{double(0), double(0), double(0)});
-        out.lattice.push_back(Lattice{0, 0, 0});
-        return out;
+        return single_node_layout(Point{double(0), double(0), double(0)});
     }
 
     out.coords.reserve(static_cast<std::size_t>((order + 1) * (order + 1)));
@@ -268,9 +284,7 @@ LagrangeNodeLayout generate_quad_nodes(int order) {
 LagrangeNodeLayout generate_tetra_nodes(int order) {
     LagrangeNodeLayout out;
     if (order == 0) {
-        out.coords.push_back(Point{double(0.25), double(0.25), double(0.25)});
-        out.lattice.push_back(Lattice{0, 0, 0});
-        return out;
+        return single_node_layout(Point{double(0.25), double(0.25), double(0.25)});
     }
 
     const Point verts[] = {
@@ -288,11 +302,9 @@ LagrangeNodeLayout generate_tetra_nodes(int order) {
 
     out.coords.reserve(static_cast<std::size_t>((order + 1) * (order + 2) * (order + 3) / 6));
     out.lattice.reserve(static_cast<std::size_t>((order + 1) * (order + 2) * (order + 3) / 6));
-    for (std::size_t v = 0; v < 4u; ++v) {
-        out.coords.push_back(verts[v]);
-        out.lattice.push_back(vert_lattice[v]);
-    }
+    append_vertices(out, verts, vert_lattice);
 
+    // Edge vertex pairs in VTK quadratic-tetra edge order.
     const int edges[6][2] = {{0, 1}, {1, 2}, {2, 0}, {0, 3}, {1, 3}, {2, 3}};
     for (const auto& edge : edges) {
         for (int m = 1; m < order; ++m) {
@@ -302,6 +314,7 @@ LagrangeNodeLayout generate_tetra_nodes(int order) {
         }
     }
 
+    // Triangular faces in VTK tetra face order (vertex triples).
     const int faces[4][3] = {{0, 1, 2}, {0, 1, 3}, {1, 2, 3}, {0, 2, 3}};
     for (const auto& face : faces) {
         append_triangle_face_interior(out,
@@ -326,9 +339,7 @@ LagrangeNodeLayout generate_tetra_nodes(int order) {
 LagrangeNodeLayout generate_hex_nodes(int order) {
     LagrangeNodeLayout out;
     if (order == 0) {
-        out.coords.push_back(Point{double(0), double(0), double(0)});
-        out.lattice.push_back(Lattice{0, 0, 0});
-        return out;
+        return single_node_layout(Point{double(0), double(0), double(0)});
     }
 
     const Point verts[] = {
@@ -354,11 +365,10 @@ LagrangeNodeLayout generate_hex_nodes(int order) {
 
     out.coords.reserve(static_cast<std::size_t>((order + 1) * (order + 1) * (order + 1)));
     out.lattice.reserve(static_cast<std::size_t>((order + 1) * (order + 1) * (order + 1)));
-    for (std::size_t v = 0; v < 8u; ++v) {
-        out.coords.push_back(verts[v]);
-        out.lattice.push_back(vert_lattice[v]);
-    }
+    append_vertices(out, verts, vert_lattice);
 
+    // Edge vertex pairs in VTK quadratic-hex edge order (bottom ring, top ring,
+    // then the four vertical edges).
     const int edges[12][2] = {
         {0, 1}, {1, 2}, {2, 3}, {3, 0},
         {4, 5}, {5, 6}, {6, 7}, {7, 4},
@@ -444,9 +454,7 @@ LagrangeNodeLayout generate_hex_nodes(int order) {
 LagrangeNodeLayout generate_wedge_nodes(int order) {
     LagrangeNodeLayout out;
     if (order == 0) {
-        out.coords.push_back(Point{double(1) / double(3), double(1) / double(3), double(0)});
-        out.lattice.push_back(Lattice{0, 0, 0});
-        return out;
+        return single_node_layout(Point{double(1) / double(3), double(1) / double(3), double(0)});
     }
 
     const Point verts[] = {
@@ -468,11 +476,10 @@ LagrangeNodeLayout generate_wedge_nodes(int order) {
 
     out.coords.reserve(static_cast<std::size_t>((order + 1) * (order + 1) * (order + 2) / 2));
     out.lattice.reserve(static_cast<std::size_t>((order + 1) * (order + 1) * (order + 2) / 2));
-    for (std::size_t v = 0; v < 6u; ++v) {
-        out.coords.push_back(verts[v]);
-        out.lattice.push_back(vert_lattice[v]);
-    }
+    append_vertices(out, verts, vert_lattice);
 
+    // Edge vertex pairs in VTK quadratic-wedge edge order (bottom triangle, top
+    // triangle, then the three vertical edges).
     const int edges[9][2] = {
         {0, 1}, {1, 2}, {2, 0},
         {3, 4}, {4, 5}, {5, 3},
@@ -694,11 +701,11 @@ double line_coord_pm_one(int i, int order) {
     return gll_points(order)[static_cast<std::size_t>(i)];
 }
 
-math::Vector<double, 3> ReferenceNodeLayout::get_node_coords(ElementType elem_type,
+math::Vector<double, 3> ReferenceNodeLayout::node_coord_at(ElementType elem_type,
                                                            std::size_t local_node) {
     const auto nodes = element_nodes(elem_type);
     svmp::throw_if<BasisNodeOrderingException>(local_node >= nodes.size(), SVMP_HERE,
-                                             "ReferenceNodeLayout::get_node_coords: node index out of range");
+                                             "ReferenceNodeLayout::node_coord_at: node index out of range");
     return nodes[local_node];
 }
 
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
index 4545c581f..71e4f6ad3 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
@@ -60,7 +60,11 @@ struct LagrangeNodeLayout {
 
 class ReferenceNodeLayout {
 public:
-    static math::Vector<double, 3> get_node_coords(ElementType elem_type,
+    /**
+     * @brief One reference node coordinate by local index. Regenerates the full
+     * layout per call; prefer node_coords() when more than one node is needed.
+     */
+    static math::Vector<double, 3> node_coord_at(ElementType elem_type,
                                                  std::size_t local_node);
     static std::size_t num_nodes(ElementType elem_type);
 
@@ -68,9 +72,9 @@ class ReferenceNodeLayout {
      * @brief All reference node coordinates for an element type, in public layout order.
      *
      * @details Returns the complete public reference layout for @p elem_type
-     * (the same coordinates get_node_coords() returns one at a time), including
+     * (the same coordinates node_coord_at() returns one at a time), including
      * the serendipity layouts. Prefer this single call when the whole layout is
-     * needed: get_node_coords() regenerates the full list on every call.
+     * needed: node_coord_at() regenerates the full list on every call.
      *
      * @param elem_type Element type to look up.
      * @return Reference node coordinates, one per node.
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index b52255ab3..1ff37f7a4 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -18,8 +18,12 @@ namespace basis {
 namespace {
 using Vec3 = math::Vector<double, 3>;
 
-int quad_serendipity_superlinear_degree(int ax, int ay) {
-    return (ax > 1 ? ax : 0) + (ay > 1 ? ay : 0);
+// Superlinear contribution of one axis exponent: degrees 0 and 1 are free (they
+// do not raise the superlinear degree), every higher degree contributes its full
+// value. Summed over the active axes this gives the serendipity superlinear
+// degree that bounds the mode set.
+int superlinear_term(int a) {
+    return a > 1 ? a : 0;
 }
 
 inline double integer_power(double base, int exponent) {
@@ -128,12 +132,23 @@ double matrix_norm_inf(const std::vector<double>& matrix, std::size_t n) {
     return max_row;
 }
 
-std::vector<std::array<int, 2>> quad_serendipity_exponents(int order) {
-    std::vector<std::array<int, 2>> exponents;
-    for (int ay = 0; ay <= order; ++ay) {
-        for (int ax = 0; ax <= order; ++ax) {
-            if (quad_serendipity_superlinear_degree(ax, ay) <= order) {
-                exponents.push_back({ax, ay});
+// Per-axis degree triples (ax, ay, az) of the serendipity mode space: every
+// combination whose superlinear degree (the sum of superlinear_term over the
+// axes) is at most `order`. `active_axes` is 2 for the quadrilateral (az pinned
+// to 0) and 3 for the hexahedron, so the quad space is exactly the hex space
+// restricted to az = 0. The same downward-closed set spans both the monomial and
+// the tensor Legendre basis (see ModalAxisKind), and the resulting nodal basis is
+// independent of how this set is ordered.
+std::vector<std::array<int, 3>> serendipity_exponents(int order, int active_axes) {
+    const int max_y = active_axes >= 2 ? order : 0;
+    const int max_z = active_axes >= 3 ? order : 0;
+    std::vector<std::array<int, 3>> exponents;
+    for (int az = 0; az <= max_z; ++az) {
+        for (int ay = 0; ay <= max_y; ++ay) {
+            for (int ax = 0; ax <= order; ++ax) {
+                if (superlinear_term(ax) + superlinear_term(ay) + superlinear_term(az) <= order) {
+                    exponents.push_back({ax, ay, az});
+                }
             }
         }
     }
@@ -209,29 +224,6 @@ std::vector<Vec3> quad_serendipity_nodes(int order, std::size_t total_size) {
     return nodes;
 }
 
-int hex_serendipity_superlinear_degree(int ax, int ay, int az) {
-    return (ax > 1 ? ax : 0) + (ay > 1 ? ay : 0) + (az > 1 ? az : 0);
-}
-
-// Hexahedral serendipity polynomial space: every monomial r^a s^b t^c whose
-// superlinear degree is at most `order`. This is the three-axis generalization
-// of quad_serendipity_exponents; at order 1 it is the eight multilinear
-// monomials (the Hex8 space) and at order 2 it is the twenty-monomial Hex20
-// space.
-std::vector<std::array<int, 3>> hex_serendipity_exponents(int order) {
-    std::vector<std::array<int, 3>> exponents;
-    for (int az = 0; az <= order; ++az) {
-        for (int ay = 0; ay <= order; ++ay) {
-            for (int ax = 0; ax <= order; ++ax) {
-                if (hex_serendipity_superlinear_degree(ax, ay, az) <= order) {
-                    exponents.push_back({ax, ay, az});
-                }
-            }
-        }
-    }
-    return exponents;
-}
-
 // Volume-interior node count for hexahedral serendipity. Once the boundary trace
 // is fixed, an interior serendipity function factors as the cube bubble
 // (1 - r^2)(1 - s^2)(1 - t^2) times a quotient of total degree at most order - 6,
@@ -387,23 +379,33 @@ std::vector<double> build_inverse_vandermonde(
         }
     }
 
-    // Condition-number backstop: estimate cond_inf = ||V||_inf * ||V^{-1}||_inf
-    // and reject orders where the inverse can no longer be trusted, rather than
-    // returning silently-degraded shape functions.
+    // Condition-number backstop: the inverse is explicitly formed just above, so
+    // this is the true infinity-norm condition number
+    // cond_inf = ||V||_inf * ||V^{-1}||_inf, not an estimate. Reject orders where
+    // the inverse can no longer be trusted rather than returning silently-degraded
+    // shape functions; the comparison is negated so a non-finite value is rejected
+    // too.
     const double norm_v = matrix_norm_inf(vandermonde, n);
     std::vector<double> inverse = math::invert_dense_matrix(
         std::move(vandermonde), n,
         "SerendipityBasis interpolation matrix for " + label);
-    const double cond_estimate = norm_v * matrix_norm_inf(inverse, n);
+    const double condition_number = norm_v * matrix_norm_inf(inverse, n);
     svmp::throw_if<BasisConstructionException>(
-        !(cond_estimate <= kSerendipityVandermondeMaxCond), SVMP_HERE,
+        !(condition_number <= kSerendipityVandermondeMaxCond), SVMP_HERE,
         "SerendipityBasis: " + label +
             " interpolation matrix is too ill-conditioned (condition number ~ " +
-            std::to_string(cond_estimate) +
+            std::to_string(condition_number) +
             "); the requested order exceeds the well-conditioned range");
     return inverse;
 }
 
+// Wedge15 serendipity monomial space, as (x, y, z) exponent triples. The prism is
+// the triangle cross-section (x, y) crossed with the through-axis (z): the 6
+// triangle monomials x^a y^b with a + b <= 2 times the 3 line monomials z^c with
+// c <= 2 form the complete 18-mode Wedge18 space. Wedge15 serendipity drops the 3
+// superlinear modes -- a quadratic triangle monomial (a + b == 2) times z^2,
+// i.e. (2,0,2), (1,1,2), (0,2,2) -- leaving 6*3 - 3 = 15. The set (not its order)
+// fixes the space; the nodal basis is the inverse Vandermonde at the Wedge15 nodes.
 constexpr std::array<std::array<int, 3>, 15> kWedge15MonomialExponents = {{
     {{0, 0, 0}},
     {{0, 0, 1}},
@@ -557,7 +559,7 @@ NormalizedSerendipityRequest normalize_serendipity_request(ElementType type, int
 } // namespace
 
 SerendipityBasis::SerendipityBasis(BasisTopology topology, int order)
-    : topology_(topology), dimension_(0), order_(0), size_(0) {
+    : topology_(topology) {
     const bool supported_topology = topology_ == BasisTopology::Quadrilateral ||
                                     topology_ == BasisTopology::Hexahedron;
     svmp::throw_if<BasisElementCompatibilityException>(
@@ -576,8 +578,7 @@ SerendipityBasis::SerendipityBasis(BasisTopology topology, int order)
     }
 }
 
-SerendipityBasis::SerendipityBasis(ElementType type, int order)
-    : topology_(BasisTopology::Unknown), dimension_(0), order_(0), size_(0) {
+SerendipityBasis::SerendipityBasis(ElementType type, int order) {
     const NormalizedSerendipityRequest normalized = normalize_serendipity_request(type, order);
     topology_ = normalized.topology;
     dimension_ = normalized.dimension;
@@ -618,13 +619,8 @@ SerendipityBasis::SerendipityBasis(ElementType type)
 // needs no output permutation. The named Quad8 layout sources its nodes from
 // ReferenceNodeLayout; the arbitrary-order topology path generates them.
 void SerendipityBasis::init_quadrilateral(int order, bool nodes_from_reference_layout) {
-    const auto quad_exponents = quad_serendipity_exponents(order);
-    monomial_exponents_.clear();
-    monomial_exponents_.reserve(quad_exponents.size());
-    for (const auto& e : quad_exponents) {
-        monomial_exponents_.push_back({e[0], e[1], 0});
-    }
-    size_ = monomial_exponents_.size();
+    mode_exponents_ = serendipity_exponents(order, /*active_axes=*/2);
+    size_ = mode_exponents_.size();
     nodes_ = nodes_from_reference_layout
                  ? ReferenceNodeLayout::node_coords(ElementType::Quad8)
                  : quad_serendipity_nodes(order, size_);
@@ -633,7 +629,7 @@ void SerendipityBasis::init_quadrilateral(int order, bool nodes_from_reference_l
         "SerendipityBasis: quadrilateral serendipity setup produced inconsistent sizes");
     uses_legendre_modes_ = true;
     inv_vandermonde_ = build_inverse_vandermonde(
-        nodes_, monomial_exponents_, "Quad order " + std::to_string(order),
+        nodes_, mode_exponents_, "Quad order " + std::to_string(order),
         ModalAxisKind::Legendre, order);
 }
 
@@ -643,8 +639,8 @@ void SerendipityBasis::init_quadrilateral(int order, bool nodes_from_reference_l
 // Hex8 (order 1) and Hex20 (order 2) layouts source their public-order nodes from
 // ReferenceNodeLayout so the generated layout matches the public ordering exactly.
 void SerendipityBasis::init_hexahedron(int order, bool nodes_from_reference_layout) {
-    monomial_exponents_ = hex_serendipity_exponents(order);
-    size_ = monomial_exponents_.size();
+    mode_exponents_ = serendipity_exponents(order, /*active_axes=*/3);
+    size_ = mode_exponents_.size();
     if (nodes_from_reference_layout) {
         const ElementType named =
             (order == 1) ? ElementType::Hex8 : ElementType::Hex20;
@@ -657,7 +653,7 @@ void SerendipityBasis::init_hexahedron(int order, bool nodes_from_reference_layo
         "SerendipityBasis: hexahedral serendipity setup produced inconsistent sizes");
     uses_legendre_modes_ = true;
     inv_vandermonde_ = build_inverse_vandermonde(
-        nodes_, monomial_exponents_, "Hex order " + std::to_string(order),
+        nodes_, mode_exponents_, "Hex order " + std::to_string(order),
         ModalAxisKind::Legendre, order);
 }
 
@@ -679,12 +675,12 @@ void SerendipityBasis::init_fixed_named(ElementType type) {
     svmp::throw_if<BasisConstructionException>(
         family_exponents.size() != size_, SVMP_HERE,
         "SerendipityBasis: Wedge15 monomial count does not match basis size");
-    monomial_exponents_.assign(family_exponents.begin(), family_exponents.end());
+    mode_exponents_.assign(family_exponents.begin(), family_exponents.end());
     // Wedge15 is the fixed order-2 layout; its 15x15 system is trivially
     // well-conditioned, so it keeps the monomial modal basis.
     uses_legendre_modes_ = false;
     inv_vandermonde_ = build_inverse_vandermonde(
-        nodes_, monomial_exponents_, "Wedge15", ModalAxisKind::Monomial, order_);
+        nodes_, mode_exponents_, "Wedge15", ModalAxisKind::Monomial, order_);
 }
 
 void SerendipityBasis::evaluate_all_to(const math::Vector<double, 3>& xi,
@@ -716,7 +712,7 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<double, 3>& xi,
     // Every serendipity family evaluates through its generated coefficient table,
     // which is already in public basis order.
     svmp::throw_if<BasisEvaluationException>(
-        monomial_exponents_.size() != size_ ||
+        mode_exponents_.size() != size_ ||
             inv_vandermonde_.size() != size_ * size_,
         SVMP_HERE,
         "SerendipityBasis: interpolation tables are not initialized for evaluation");
@@ -734,7 +730,7 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<double, 3>& xi,
 
     eval_modal_basis(
         tx, ty, tz, size_,
-        [this](std::size_t j) { return monomial_exponents_[j]; },
+        [this](std::size_t j) { return mode_exponents_[j]; },
         [this](std::size_t j, std::size_t i) {
             return inv_vandermonde_[j * size_ + i];
         },
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.h b/Code/Source/solver/FE/Basis/SerendipityBasis.h
index d310461e2..09b2ced50 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.h
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.h
@@ -315,16 +315,16 @@ class SerendipityBasis final : public BasisFunction {
                               std::span<Hessian> hessians_out) const final;
 
 private:
-    BasisTopology topology_;
-    int dimension_;
-    int order_;
-    std::size_t size_;
+    BasisTopology topology_{BasisTopology::Unknown};
+    int dimension_{0};
+    int order_{0};
+    std::size_t size_{0};
     std::vector<math::Vector<double, 3>> nodes_;
     // Per-axis degrees (a, b, c) of the tensor modes spanning the family's
     // polynomial space. Interpreted as monomial powers r^a s^b t^c or, when
     // uses_legendre_modes_ is set, as tensor Legendre degrees P_a(r) P_b(s) P_c(t)
     // (the same space; see ModalAxisKind in SerendipityBasis.cpp).
-    std::vector<std::array<int, 3>> monomial_exponents_;
+    std::vector<std::array<int, 3>> mode_exponents_;
     // Row-major inverse (generalized) Vandermonde, indexed as [mode, basis].
     std::vector<double> inv_vandermonde_;
     // Whether the tensor modes are Legendre polynomials (quadrilateral/hexahedral
@@ -332,19 +332,13 @@ class SerendipityBasis final : public BasisFunction {
     // the same family the coefficient table was built with.
     bool uses_legendre_modes_{false};
 
-    // Build the quadrilateral serendipity mode set, reference nodes, and nodal
-    // coefficient table for the given order. The named Quad8 layout takes
-    // its nodes from ReferenceNodeLayout; the arbitrary-order topology path
-    // generates them.
+    // Build the quadrilateral serendipity mode set, nodes, and Legendre
+    // coefficient table for the given order. (Details at the definition.)
     void init_quadrilateral(int order, bool nodes_from_reference_layout);
-    // Build the hexahedral serendipity mode set, reference nodes, and nodal
-    // coefficient table for the given order. The arbitrary-order topology path
-    // generates VTK-consistent nodes; the named Hex8 (order 1) and Hex20 (order 2)
-    // layouts take their public-order nodes from ReferenceNodeLayout.
+    // Build the hexahedral serendipity mode set, nodes, and Legendre coefficient
+    // table for the given order; Hex8/Hex20 are its order-1/order-2 instances.
     void init_hexahedron(int order, bool nodes_from_reference_layout);
-    // Build the Wedge15 serendipity layout from its tabulated monomial space and
-    // ReferenceNodeLayout nodes. Hexahedral serendipity (Hex8/Hex20) is generated
-    // by init_hexahedron, so the prism is the only remaining fixed named layout.
+    // Build the fixed Wedge15 layout from its tabulated monomial mode space.
     void init_fixed_named(ElementType type);
 
     void evaluate_all_to(const math::Vector<double, 3>& xi,
diff --git a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
index a9d6b864b..41ffb3c3a 100644
--- a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
@@ -329,7 +329,7 @@ TEST(BasisErrorPaths, NodeOrderingInvalidNodeThrows) {
                  BasisNodeOrderingException);
     EXPECT_THROW((void)line_coord_pm_one(1, 0),
                  BasisNodeOrderingException);
-    EXPECT_THROW((void)ReferenceNodeLayout::get_node_coords(ElementType::Quad8, 99u),
+    EXPECT_THROW((void)ReferenceNodeLayout::node_coord_at(ElementType::Quad8, 99u),
                  BasisNodeOrderingException);
     EXPECT_THROW((void)ReferenceNodeLayout::get_lagrange_node_coords(ElementType::Quad8, 2),
                  BasisNodeOrderingException);
diff --git a/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp b/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
index 39e1ddece..f1d50d83f 100644
--- a/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
@@ -83,7 +83,7 @@ TEST(ConstexprBasis, CompleteAliasTablesMatchGeneratedLagrangeNodes) {
         const auto nodes = ReferenceNodeLayout::get_lagrange_node_coords(canonical_type, order);
         ASSERT_EQ(nodes.size(), ReferenceNodeLayout::num_nodes(alias));
         for (std::size_t i = 0; i < nodes.size(); ++i) {
-            const auto direct = ReferenceNodeLayout::get_node_coords(alias, i);
+            const auto direct = ReferenceNodeLayout::node_coord_at(alias, i);
             EXPECT_EQ(nodes[i][0], direct[0]);
             EXPECT_EQ(nodes[i][1], direct[1]);
             EXPECT_EQ(nodes[i][2], direct[2]);
diff --git a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
index a171ca6d4..c749d6acf 100644
--- a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
@@ -368,7 +368,7 @@ TEST(LagrangeBasis, NodeOrderingMatchesPublicAliasLayouts) {
         ASSERT_EQ(generated.size(), ReferenceNodeLayout::num_nodes(alias));
 
         for (std::size_t i = 0; i < generated.size(); ++i) {
-            const auto public_node = ReferenceNodeLayout::get_node_coords(alias, i);
+            const auto public_node = ReferenceNodeLayout::node_coord_at(alias, i);
             EXPECT_NEAR(public_node[0], generated[i][0], double(1e-14)) << "node=" << i;
             EXPECT_NEAR(public_node[1], generated[i][1], double(1e-14)) << "node=" << i;
             EXPECT_NEAR(public_node[2], generated[i][2], double(1e-14)) << "node=" << i;
diff --git a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
index 605a1bacd..7b932cf15 100644
--- a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
@@ -68,7 +68,7 @@ std::vector<math::Vector<double, 3>> reference_nodes(ElementType type,
     std::vector<math::Vector<double, 3>> nodes;
     nodes.reserve(count);
     for (std::size_t i = 0; i < count; ++i) {
-        nodes.push_back(ReferenceNodeLayout::get_node_coords(type, i));
+        nodes.push_back(ReferenceNodeLayout::node_coord_at(type, i));
     }
     return nodes;
 }

From f1c0c316c887cb56ae7520a6bb4c870c16f549fe Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 24 Jun 2026 22:57:47 -0700
Subject: [PATCH 71/91] adding documentation for tolerance measurements

---
 .../FE/Basis/test_BasisErrorPaths.cpp         |  8 +-
 .../unitTests/FE/Basis/test_BasisHessians.cpp | 73 +++++++++++--------
 .../unitTests/FE/Basis/test_LagrangeBasis.cpp | 70 +++++++++++-------
 .../FE/Basis/test_SerendipityBasis.cpp        |  8 +-
 4 files changed, 94 insertions(+), 65 deletions(-)

diff --git a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
index 41ffb3c3a..0fcc25e0b 100644
--- a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
@@ -351,6 +351,10 @@ TEST(BasisErrorPaths, NumericalDerivativeHelpersMatchAnalyticDerivatives) {
     ExactQuadraticBasis basis;
     const math::Vector<double, 3> xi{double(0.2), double(-0.35), double(0.4)};
 
+    // On a quadratic, centered differences are exact except for the round-off
+    // floor ~ eps_machine/step. The tolerances below are a few times
+    // those floors -- tight enough that a wrong difference or analytic formula
+    // (which would give an O(step) or O(1) error) cannot slip through.
     std::vector<Gradient> exact_gradients;
     basis.evaluate_gradients(xi, exact_gradients);
 
@@ -360,7 +364,7 @@ TEST(BasisErrorPaths, NumericalDerivativeHelpersMatchAnalyticDerivatives) {
     for (std::size_t n = 0; n < basis.size(); ++n) {
         for (int d = 0; d < basis.dimension(); ++d) {
             const std::size_t sd = static_cast<std::size_t>(d);
-            EXPECT_NEAR(approx_gradients[n][sd], exact_gradients[n][sd], double(1e-8))
+            EXPECT_NEAR(approx_gradients[n][sd], exact_gradients[n][sd], double(3e-9))
                 << "basis=" << n << " component=" << d;
         }
     }
@@ -377,7 +381,7 @@ TEST(BasisErrorPaths, NumericalDerivativeHelpersMatchAnalyticDerivatives) {
                 const std::size_t sr = static_cast<std::size_t>(r);
                 const std::size_t sc = static_cast<std::size_t>(c);
                 EXPECT_NEAR(approx_hessians[n](sr, sc), exact_hessians[n](sr, sc),
-                            double(1e-8))
+                            double(2e-10))
                     << "basis=" << n << " component=(" << r << "," << c << ")";
             }
         }
diff --git a/tests/unitTests/FE/Basis/test_BasisHessians.cpp b/tests/unitTests/FE/Basis/test_BasisHessians.cpp
index 858e7c13b..fd66a9e74 100644
--- a/tests/unitTests/FE/Basis/test_BasisHessians.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisHessians.cpp
@@ -18,6 +18,13 @@ using namespace svmp::FE::basis;
 
 namespace {
 
+// The exact Hessian identities below -- the partition sum (sum_i Hess N_i = 0) and
+// symmetry (Hess N_i = Hess N_i^T) -- have a floating-point round-off residual at
+// every order and family, so they share one tolerance. The finite-difference-vs-
+// analytic comparisons keep their own, larger, order-dependent tolerances because
+// finite-difference error grows with order.
+constexpr double kHessianInvariantTol = double(1e-12);
+
 void numerical_gradient_helper(const BasisFunction& basis,
                                const math::Vector<double, 3>& xi,
                                std::vector<Gradient>& gradients,
@@ -259,20 +266,19 @@ TEST(BasisHessians, LagrangeHessiansSumToZeroAndAreSymmetric) {
         BasisTopology topology;
         int order;
         math::Vector<double, 3> xi;
-        double tol;
     } cases[] = {
-        {BasisTopology::Line, 3, {double(0.15), double(0), double(0)}, double(1e-12)},
-        {BasisTopology::Triangle, 3, {double(0.2), double(0.25), double(0)}, double(1e-10)},
-        {BasisTopology::Quadrilateral, 3, {double(0.3), double(-0.2), double(0)}, double(1e-12)},
-        {BasisTopology::Tetrahedron, 2, {double(0.15), double(0.2), double(0.1)}, double(1e-10)},
-        {BasisTopology::Hexahedron, 2, {double(0.1), double(-0.2), double(0.3)}, double(1e-12)},
-        {BasisTopology::Wedge, 2, {double(0.2), double(0.15), double(-0.3)}, double(1e-10)},
+        {BasisTopology::Line, 3, {double(0.15), double(0), double(0)}},
+        {BasisTopology::Triangle, 3, {double(0.2), double(0.25), double(0)}},
+        {BasisTopology::Quadrilateral, 3, {double(0.3), double(-0.2), double(0)}},
+        {BasisTopology::Tetrahedron, 2, {double(0.15), double(0.2), double(0.1)}},
+        {BasisTopology::Hexahedron, 2, {double(0.1), double(-0.2), double(0.3)}},
+        {BasisTopology::Wedge, 2, {double(0.2), double(0.15), double(-0.3)}},
     };
 
     for (const auto& c : cases) {
         LagrangeBasis basis(c.topology, c.order);
-        expect_partition_hessian_sum_zero(basis, c.xi, double(10) * c.tol);
-        expect_hessians_symmetric(basis, c.xi, c.tol);
+        expect_partition_hessian_sum_zero(basis, c.xi, kHessianInvariantTol);
+        expect_hessians_symmetric(basis, c.xi, kHessianInvariantTol);
     }
 }
 
@@ -281,47 +287,52 @@ TEST(BasisHessians, SerendipityHessiansSumToZeroAndAreSymmetric) {
         ElementType type;
         int order;
         math::Vector<double, 3> xi;
-        double tol;
     } cases[] = {
-        {ElementType::Quad8, 2, {double(0.17), double(-0.31), double(0)}, double(1e-10)},
-        {ElementType::Hex20, 2, {double(0.2), double(-0.1), double(0.3)}, double(1e-10)},
-        {ElementType::Wedge15, 2, {double(0.2), double(0.3), double(0.1)}, double(1e-10)},
+        {ElementType::Quad8, 2, {double(0.17), double(-0.31), double(0)}},
+        {ElementType::Hex20, 2, {double(0.2), double(-0.1), double(0.3)}},
+        {ElementType::Wedge15, 2, {double(0.2), double(0.3), double(0.1)}},
     };
 
     for (const auto& c : cases) {
         SerendipityBasis basis(c.type, c.order);
-        expect_partition_hessian_sum_zero(basis, c.xi, c.tol);
-        expect_hessians_symmetric(basis, c.xi, c.tol);
+        expect_partition_hessian_sum_zero(basis, c.xi, kHessianInvariantTol);
+        expect_hessians_symmetric(basis, c.xi, kHessianInvariantTol);
     }
 }
 
 TEST(BasisHessians, SolverMappedVolumeSelectionsSatisfyInvariants) {
+    // Mirrors the full default element set pinned in
+    // BasisFactoryDefaults.SelectionsArePinnedForAllSupportedElements (including
+    // both wedge defaults: Wedge15 serendipity and Wedge18 Lagrange), so every
+    // family the solver adapter can map is exercised for the Hessian invariants.
     const struct Case {
         ElementType type;
         BasisType basis_type;
         int order;
         math::Vector<double, 3> xi;
-        double tol;
     } cases[] = {
-        {ElementType::Line2, BasisType::Lagrange, 1, {double(0.15), double(0), double(0)}, double(1e-12)},
-        {ElementType::Line3, BasisType::Lagrange, 2, {double(-0.25), double(0), double(0)}, double(1e-12)},
-        {ElementType::Triangle3, BasisType::Lagrange, 1, {double(0.2), double(0.25), double(0)}, double(1e-12)},
-        {ElementType::Triangle6, BasisType::Lagrange, 2, {double(0.2), double(0.25), double(0)}, double(1e-12)},
-        {ElementType::Quad4, BasisType::Lagrange, 1, {double(0.3), double(-0.2), double(0)}, double(1e-12)},
-        {ElementType::Quad8, BasisType::Serendipity, 2, {double(0.17), double(-0.31), double(0)}, double(1e-10)},
-        {ElementType::Quad9, BasisType::Lagrange, 2, {double(0.3), double(-0.2), double(0)}, double(1e-12)},
-        {ElementType::Tetra4, BasisType::Lagrange, 1, {double(0.15), double(0.2), double(0.1)}, double(1e-12)},
-        {ElementType::Tetra10, BasisType::Lagrange, 2, {double(0.15), double(0.2), double(0.1)}, double(1e-10)},
-        {ElementType::Hex8, BasisType::Lagrange, 1, {double(0.1), double(-0.2), double(0.3)}, double(1e-12)},
-        {ElementType::Hex20, BasisType::Serendipity, 2, {double(0.2), double(-0.1), double(0.3)}, double(1e-10)},
-        {ElementType::Hex27, BasisType::Lagrange, 2, {double(0.1), double(-0.2), double(0.3)}, double(1e-12)},
-        {ElementType::Wedge6, BasisType::Lagrange, 1, {double(0.2), double(0.15), double(-0.3)}, double(1e-12)},
+        {ElementType::Line2, BasisType::Lagrange, 1, {double(0.15), double(0), double(0)}},
+        {ElementType::Line3, BasisType::Lagrange, 2, {double(-0.25), double(0), double(0)}},
+        {ElementType::Triangle3, BasisType::Lagrange, 1, {double(0.2), double(0.25), double(0)}},
+        {ElementType::Triangle6, BasisType::Lagrange, 2, {double(0.2), double(0.25), double(0)}},
+        {ElementType::Quad4, BasisType::Lagrange, 1, {double(0.3), double(-0.2), double(0)}},
+        {ElementType::Quad8, BasisType::Serendipity, 2, {double(0.17), double(-0.31), double(0)}},
+        {ElementType::Quad9, BasisType::Lagrange, 2, {double(0.3), double(-0.2), double(0)}},
+        {ElementType::Tetra4, BasisType::Lagrange, 1, {double(0.15), double(0.2), double(0.1)}},
+        {ElementType::Tetra10, BasisType::Lagrange, 2, {double(0.15), double(0.2), double(0.1)}},
+        {ElementType::Hex8, BasisType::Lagrange, 1, {double(0.1), double(-0.2), double(0.3)}},
+        {ElementType::Hex20, BasisType::Serendipity, 2, {double(0.2), double(-0.1), double(0.3)}},
+        {ElementType::Hex27, BasisType::Lagrange, 2, {double(0.1), double(-0.2), double(0.3)}},
+        {ElementType::Wedge6, BasisType::Lagrange, 1, {double(0.2), double(0.15), double(-0.3)}},
+        {ElementType::Wedge15, BasisType::Serendipity, 2, {double(0.2), double(0.3), double(0.1)}},
+        {ElementType::Wedge18, BasisType::Lagrange, 2, {double(0.2), double(0.15), double(-0.3)}},
     };
 
     for (const auto& c : cases) {
         auto basis = basis_factory::create(BasisRequest{c.type, c.basis_type, c.order});
-        expect_partition_hessian_sum_zero(*basis, c.xi, c.tol);
-        expect_hessians_symmetric(*basis, c.xi, c.tol);
+        ASSERT_NE(basis, nullptr) << "element=" << static_cast<int>(c.type);
+        expect_partition_hessian_sum_zero(*basis, c.xi, kHessianInvariantTol);
+        expect_hessians_symmetric(*basis, c.xi, kHessianInvariantTol);
     }
 }
 
diff --git a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
index c749d6acf..48c5897a1 100644
--- a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
@@ -23,6 +23,17 @@ namespace {
 
 using Point = math::Vector<double, 3>;
 
+// Tolerance convention (set from measured residuals, not guessed). Identities that
+// share a generator -- node coordinates, the lattice forward-image, span vs vector
+// evaluation, P0 constants -- are exact up to round-off and use 1e-14. The
+// nodal/partition identities here -- the Kronecker delta at the nodes and the
+// value/gradient/Hessian partition sums (sum_i N_i = 1, sum_i grad N_i = 0,
+// sum_i Hess N_i = 0) -- are exact at every order (at a node an off-diagonal shape
+// function has an exactly-zero factor, and the partition sums differentiate the
+// constant 1), so one tolerance covers them all instead of an order-dependent
+// ladder. Measured residuals stay near round-off (~7e-15 through the orders here).
+constexpr double kPartitionTol = double(1e-12);
+
 struct CanonicalCase {
     BasisTopology topology;
     ElementType representative;  // linear element for sample-point lookup and labeling
@@ -30,29 +41,22 @@ struct CanonicalCase {
     std::size_t size;
     int dimension;
     std::vector<Point> points;
-    double derivative_tol;
 };
 
 const std::vector<CanonicalCase>& canonical_cases() {
     static const std::vector<CanonicalCase> cases = {
         {BasisTopology::Line, ElementType::Line2, 3, 4u, 1,
-         {{double(-0.35), double(0), double(0)}, {double(0.2), double(0), double(0)}},
-         double(1e-11)},
+         {{double(-0.35), double(0), double(0)}, {double(0.2), double(0), double(0)}}},
         {BasisTopology::Triangle, ElementType::Triangle3, 3, 10u, 2,
-         {{double(0.15), double(0.2), double(0)}, {double(0.25), double(0.1), double(0)}},
-         double(1e-9)},
+         {{double(0.15), double(0.2), double(0)}, {double(0.25), double(0.1), double(0)}}},
         {BasisTopology::Quadrilateral, ElementType::Quad4, 3, 16u, 2,
-         {{double(0.2), double(-0.3), double(0)}, {double(-0.45), double(0.25), double(0)}},
-         double(1e-11)},
+         {{double(0.2), double(-0.3), double(0)}, {double(-0.45), double(0.25), double(0)}}},
         {BasisTopology::Tetrahedron, ElementType::Tetra4, 2, 10u, 3,
-         {{double(0.12), double(0.18), double(0.16)}, {double(0.2), double(0.1), double(0.18)}},
-         double(1e-9)},
+         {{double(0.12), double(0.18), double(0.16)}, {double(0.2), double(0.1), double(0.18)}}},
         {BasisTopology::Hexahedron, ElementType::Hex8, 2, 27u, 3,
-         {{double(0.1), double(-0.2), double(0.3)}, {double(-0.35), double(0.25), double(-0.15)}},
-         double(1e-10)},
+         {{double(0.1), double(-0.2), double(0.3)}, {double(-0.35), double(0.25), double(-0.15)}}},
         {BasisTopology::Wedge, ElementType::Wedge6, 2, 18u, 3,
-         {{double(0.18), double(0.22), double(-0.2)}, {double(0.12), double(0.16), double(0.1)}},
-         double(1e-9)},
+         {{double(0.18), double(0.22), double(-0.2)}, {double(0.12), double(0.16), double(0.1)}}},
     };
     return cases;
 }
@@ -83,8 +87,7 @@ void expect_kronecker_at_nodes(const LagrangeBasis& basis, double tol)
 }
 
 void expect_partition_gradient_hessian_sums(const LagrangeBasis& basis,
-                                            const std::vector<Point>& points,
-                                            double derivative_tol)
+                                            const std::vector<Point>& points)
 {
     for (const auto& xi : points) {
         std::vector<double> values;
@@ -105,14 +108,14 @@ void expect_partition_gradient_hessian_sums(const LagrangeBasis& basis,
             }
         }
 
-        EXPECT_NEAR(value_sum, double(1), double(1e-12));
+        EXPECT_NEAR(value_sum, double(1), kPartitionTol);
         for (int d = 0; d < basis.dimension(); ++d) {
-            EXPECT_NEAR(gradient_sum[static_cast<std::size_t>(d)], double(0), derivative_tol);
+            EXPECT_NEAR(gradient_sum[static_cast<std::size_t>(d)], double(0), kPartitionTol);
             for (int e = 0; e < basis.dimension(); ++e) {
                 EXPECT_NEAR(hessian_sum(static_cast<std::size_t>(d),
                                         static_cast<std::size_t>(e)),
                             double(0),
-                            derivative_tol);
+                            kPartitionTol);
             }
         }
     }
@@ -248,8 +251,8 @@ TEST(LagrangeBasis, CanonicalTopologiesHaveExpectedSizesAndDimensions) {
 TEST(LagrangeBasis, CanonicalTopologiesAreNodalAndPartitionUnity) {
     for (const auto& c : canonical_cases()) {
         LagrangeBasis basis(c.topology, c.order);
-        expect_kronecker_at_nodes(basis, double(2e-10));
-        expect_partition_gradient_hessian_sums(basis, c.points, c.derivative_tol);
+        expect_kronecker_at_nodes(basis, kPartitionTol);
+        expect_partition_gradient_hessian_sums(basis, c.points);
     }
 }
 
@@ -535,23 +538,21 @@ TEST(LagrangeBasis, HigherOrderLatticesAreNodalAndPartitionUnity) {
         BasisTopology topology;
         int order;
         std::size_t size;
-        double kronecker_tol;
-        double derivative_tol;
         std::vector<Point> points;
     } cases[] = {
-        {BasisTopology::Tetrahedron, 3, 20u, double(5e-10), double(1e-8),
+        {BasisTopology::Tetrahedron, 3, 20u,
          {{double(0.12), double(0.18), double(0.16)}, {double(0.3), double(0.2), double(0.25)}}},
-        {BasisTopology::Tetrahedron, 4, 35u, double(1e-9), double(1e-7),
+        {BasisTopology::Tetrahedron, 4, 35u,
          {{double(0.12), double(0.18), double(0.16)}, {double(0.2), double(0.1), double(0.18)}}},
-        {BasisTopology::Hexahedron, 3, 64u, double(5e-10), double(1e-8),
+        {BasisTopology::Hexahedron, 3, 64u,
          {{double(0.1), double(-0.2), double(0.3)}, {double(-0.35), double(0.25), double(-0.15)}}},
     };
 
     for (const auto& c : cases) {
         LagrangeBasis basis(c.topology, c.order);
         EXPECT_EQ(basis.size(), c.size);
-        expect_kronecker_at_nodes(basis, c.kronecker_tol);
-        expect_partition_gradient_hessian_sums(basis, c.points, c.derivative_tol);
+        expect_kronecker_at_nodes(basis, kPartitionTol);
+        expect_partition_gradient_hessian_sums(basis, c.points);
     }
 }
 
@@ -753,7 +754,10 @@ TEST(BasisFactoryDefaults, RejectsElementsWithoutDefaultBasis) {
                  BasisElementCompatibilityException);
 }
 
-TEST(LagrangeBasis, FactoryCreatesReducedScalarBasisFamilies) {
+// The factory-creation paths below were one 90-line test; they are split by the
+// path under test so an early ASSERT_NE in one cannot mask the others, and each
+// name says what it covers.
+TEST(LagrangeBasis, FactoryCreatesNamedLagrangeBasis) {
     auto lagrange =
         basis_factory::create(BasisRequest{ElementType::Hex27, BasisType::Lagrange, 2});
     ASSERT_NE(lagrange, nullptr);
@@ -767,7 +771,9 @@ TEST(LagrangeBasis, FactoryCreatesReducedScalarBasisFamilies) {
     EXPECT_THROW((void)basis_factory::create(
                      BasisRequest{ElementType::Hex27, BasisType::Lagrange, 1}),
                  BasisConfigurationException);
+}
 
+TEST(LagrangeBasis, FactoryCreatesArbitraryOrderLagrangeBasis) {
     BasisRequest arbitrary_lagrange;
     arbitrary_lagrange.basis_type = BasisType::Lagrange;
     arbitrary_lagrange.order = 5;
@@ -781,12 +787,16 @@ TEST(LagrangeBasis, FactoryCreatesReducedScalarBasisFamilies) {
               ElementType::Unknown);
     EXPECT_EQ(high_order_lagrange->order(), 5);
     EXPECT_EQ(high_order_lagrange->size(), 216u);
+}
 
+TEST(LagrangeBasis, FactoryCreatesNamedSerendipityBasis) {
     auto serendipity =
         basis_factory::create(BasisRequest{ElementType::Quad8, BasisType::Serendipity, 2});
     ASSERT_NE(serendipity, nullptr);
     EXPECT_EQ(serendipity->basis_type(), BasisType::Serendipity);
+}
 
+TEST(LagrangeBasis, FactoryCreatesArbitraryOrderQuadSerendipityBasis) {
     BasisRequest arbitrary_quad_serendipity;
     arbitrary_quad_serendipity.basis_type = BasisType::Serendipity;
     arbitrary_quad_serendipity.order = 4;
@@ -801,7 +811,9 @@ TEST(LagrangeBasis, FactoryCreatesReducedScalarBasisFamilies) {
               ElementType::Unknown);
     EXPECT_EQ(high_order_quad_serendipity->order(), 4);
     EXPECT_EQ(high_order_quad_serendipity->size(), 17u);
+}
 
+TEST(LagrangeBasis, FactoryCreatesArbitraryOrderHexSerendipityBasis) {
     BasisRequest arbitrary_hex_serendipity;
     arbitrary_hex_serendipity.basis_type = BasisType::Serendipity;
     arbitrary_hex_serendipity.order = 3;
@@ -816,7 +828,9 @@ TEST(LagrangeBasis, FactoryCreatesReducedScalarBasisFamilies) {
               ElementType::Unknown);
     EXPECT_EQ(high_order_hex_serendipity->order(), 3);
     EXPECT_EQ(high_order_hex_serendipity->size(), 32u);
+}
 
+TEST(LagrangeBasis, FactoryRejectsInvalidScalarRequests) {
     EXPECT_THROW((void)basis_factory::create(
                      BasisRequest{ElementType::Pyramid5, BasisType::Lagrange, 1}),
                  BasisElementCompatibilityException);
diff --git a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
index 7b932cf15..62122676c 100644
--- a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
@@ -1152,16 +1152,16 @@ TEST(SerendipityBasis, SerendipityStaysWellConditionedAcrossRecommendedRange) {
         const double cond = legendre_vandermonde_condition(
             basis.nodes(), quad_serendipity_modes_3d_for_test(order));
         const double lebesgue = serendipity_lebesgue_constant(basis, 24);
-        EXPECT_LT(cond, double(5e4)) << "quad order=" << order;
-        EXPECT_LT(lebesgue, double(1e3)) << "quad order=" << order;
+        EXPECT_LT(cond, double(2.5e4)) << "quad order=" << order;
+        EXPECT_LT(lebesgue, double(9e2)) << "quad order=" << order;
     }
     for (int order = 1; order <= 8; ++order) {
         SerendipityBasis basis(BasisTopology::Hexahedron, order);
         const double cond = legendre_vandermonde_condition(
             basis.nodes(), hex_serendipity_exponents_for_test(order));
         const double lebesgue = serendipity_lebesgue_constant(basis, 12);
-        EXPECT_LT(cond, double(5e4)) << "hex order=" << order;
-        EXPECT_LT(lebesgue, double(1e3)) << "hex order=" << order;
+        EXPECT_LT(cond, double(2e4)) << "hex order=" << order;
+        EXPECT_LT(lebesgue, double(3.5e2)) << "hex order=" << order;
     }
 }
 

From ca68c724b444549550c0620b5a831450a911c5fc Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 24 Jun 2026 23:59:42 -0700
Subject: [PATCH 72/91] Consolidated all serendipity reference-node generation
 into a single   ReferenceNodeLayout::serendipity_node_coords owner

---
 .../FE/Basis/NodeOrderingConventions.cpp      | 185 ++++++++++++-
 .../solver/FE/Basis/NodeOrderingConventions.h |  23 ++
 .../solver/FE/Basis/SerendipityBasis.cpp      | 255 +++---------------
 .../Source/solver/FE/Basis/SerendipityBasis.h |  45 ++--
 .../FE/Basis/test_SerendipityBasis.cpp        |   8 +-
 5 files changed, 269 insertions(+), 247 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
index aabd861c9..58a090b42 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
@@ -634,6 +634,168 @@ std::vector<Point> serendipity_subset_nodes(BasisTopology top,
     return nodes;
 }
 
+// ---------------------------------------------------------------------------
+// Arbitrary-order serendipity node geometry (quadrilateral and hexahedral).
+//
+// The corner+edge skeleton is the leading prefix of the complete Lagrange layout
+// of the same order (the complete generators above); only the reduced face/volume
+// interior below is serendipity-specific. These back
+// ReferenceNodeLayout::serendipity_node_coords and the named Quad8/Hex20 layouts,
+// so serendipity node geometry has a single owner. (Wedge15 is a fixed named
+// layout, handled by serendipity_subset_nodes above.)
+// ---------------------------------------------------------------------------
+
+std::size_t quad_serendipity_interior_count(int order) {
+    if (order < 4) {
+        return 0u;
+    }
+    const auto m = static_cast<std::size_t>(order - 4);
+    return (m + 1u) * (m + 2u) / 2u;
+}
+
+// Interior nodes are a triangular row set for P_m, m = order - 4: a serendipity
+// polynomial vanishing at the p + 1 boundary nodes on every edge factors as
+// (1 - x^2)(1 - y^2) q with q in P_m, and the staircase below is unisolvent for q
+// by induction over rows. It sits on Gauss-Lobatto-Legendre interior nodes (the
+// same 1D distribution as the boundary) so the reduced space stays well-conditioned
+// at high order; GLL only moves where the distinct points sit, not the staircase
+// structure.
+void append_quad_serendipity_interior_nodes(std::vector<Point>& nodes, int order) {
+    if (order < 4) {
+        return;
+    }
+    const int m = order - 4;
+    for (int row = 0; row <= m; ++row) {
+        const int row_count = m + 1 - row;
+        const double y = line_coord_pm_one(row + 1, m + 2);
+        for (int col = 0; col < row_count; ++col) {
+            const double x = line_coord_pm_one(col + 1, row_count + 1);
+            nodes.push_back(Point{x, y, double(0)});
+        }
+    }
+}
+
+// Quadrilateral serendipity reference nodes at the given order: the 4 corners and
+// 4(order-1) edge nodes (the leading prefix of the complete Quad layout, in VTK
+// boundary order) followed by the reduced triangular interior.
+std::vector<Point> quad_serendipity_nodes(int order) {
+    std::vector<Point> nodes = generate_quad_nodes(order).coords;
+    const std::size_t boundary_count = static_cast<std::size_t>(4 * order);
+    svmp::throw_if<BasisConstructionException>(
+        boundary_count > nodes.size(), SVMP_HERE,
+        "ReferenceNodeLayout: quadrilateral serendipity skeleton exceeds the complete Lagrange layout");
+    nodes.resize(boundary_count);
+    append_quad_serendipity_interior_nodes(nodes, order);
+    return nodes;
+}
+
+// Volume-interior node count for hexahedral serendipity. Once the boundary trace
+// is fixed, an interior serendipity function factors as the cube bubble
+// (1 - r^2)(1 - s^2)(1 - t^2) times a quotient of total degree at most order - 6,
+// so the interior space is P_{order-6} in three variables: empty until order 6,
+// then dim P_{order-6} = (m+1)(m+2)(m+3)/6 with m = order - 6.
+std::size_t hex_serendipity_volume_interior_count(int order) {
+    if (order < 6) {
+        return 0u;
+    }
+    const auto m = static_cast<std::size_t>(order - 6);
+    return (m + 1u) * (m + 2u) * (m + 3u) / 6u;
+}
+
+// Append the face-interior nodes. The restriction of the order-`order` cube
+// serendipity space to a face is the order-`order` quadrilateral serendipity
+// space, so every face carries the same 2D quad-serendipity interior set,
+// embedded into the face plane. Faces are visited in VTK face order
+// (-X, +X, -Y, +Y, -Z, +Z); the in-plane (u, v) point maps to the two free axes
+// of each face. Empty until order 4 (when the quad interior first appears).
+void append_hex_serendipity_face_interior_nodes(std::vector<Point>& nodes, int order) {
+    std::vector<Point> face_interior;  // (u, v, 0) interior points of one quad face
+    append_quad_serendipity_interior_nodes(face_interior, order);
+    if (face_interior.empty()) {
+        return;
+    }
+
+    // Each face: the fixed axis (0 = r, 1 = s, 2 = t), its +/-1 value, and the two
+    // in-plane axes that carry the 2D interior point (u, v).
+    struct Face {
+        int fixed_axis;
+        double fixed_value;
+        int u_axis;
+        int v_axis;
+    };
+    static constexpr Face faces[6] = {
+        {0, double(-1), 1, 2},  // -X: (s, t) in plane
+        {0, double(1),  1, 2},  // +X
+        {1, double(-1), 0, 2},  // -Y: (r, t) in plane
+        {1, double(1),  0, 2},  // +Y
+        {2, double(-1), 0, 1},  // -Z: (r, s) in plane
+        {2, double(1),  0, 1},  // +Z
+    };
+
+    for (const auto& face : faces) {
+        for (const auto& p : face_interior) {
+            Point node = Point::Zero();
+            node[static_cast<std::size_t>(face.fixed_axis)] = face.fixed_value;
+            node[static_cast<std::size_t>(face.u_axis)] = p[0];
+            node[static_cast<std::size_t>(face.v_axis)] = p[1];
+            nodes.push_back(node);
+        }
+    }
+}
+
+// Append the volume-interior nodes: a tetrahedral staircase unisolvent for the
+// interior residual P_{order-6}, on Gauss-Lobatto-Legendre interior nodes. Each
+// t-layer is a triangular staircase whose total degree decreases by one per layer,
+// so the layers consume P_{order-6} by induction in t exactly as the quad interior
+// consumes P_{order-4} by induction in s. Empty until order 6.
+void append_hex_serendipity_volume_interior_nodes(std::vector<Point>& nodes, int order) {
+    if (order < 6) {
+        return;
+    }
+    const int m = order - 6;
+    for (int layer = 0; layer <= m; ++layer) {
+        const int tri_order = m - layer;
+        const double t = line_coord_pm_one(layer + 1, m + 2);
+        for (int row = 0; row <= tri_order; ++row) {
+            const int row_count = tri_order + 1 - row;
+            const double s = line_coord_pm_one(row + 1, tri_order + 2);
+            for (int col = 0; col < row_count; ++col) {
+                const double r = line_coord_pm_one(col + 1, row_count + 1);
+                nodes.push_back(Point{r, s, t});
+            }
+        }
+    }
+}
+
+// Hexahedral serendipity reference nodes in VTK-consistent stratified order: 8
+// corners, 12(order-1) edge nodes (the leading prefix of the complete Hex layout),
+// then the 6 face interiors in VTK face order, then the volume interior. At order 1
+// (corners) and order 2 (corners + edge midpoints) this is exactly the public
+// Hex8 / Hex20 ordering; higher-order face/volume sets are this module's own
+// convention.
+std::vector<Point> hex_serendipity_nodes(int order) {
+    std::vector<Point> nodes = generate_hex_nodes(order).coords;
+    const std::size_t skeleton_count =
+        8u + 12u * static_cast<std::size_t>(order - 1);
+    svmp::throw_if<BasisConstructionException>(
+        skeleton_count > nodes.size(), SVMP_HERE,
+        "ReferenceNodeLayout: hexahedral serendipity skeleton exceeds the complete Lagrange layout");
+    nodes.resize(skeleton_count);
+
+    const std::size_t skeleton = nodes.size();
+    append_hex_serendipity_face_interior_nodes(nodes, order);
+    svmp::throw_if<BasisConstructionException>(
+        nodes.size() - skeleton != 6u * quad_serendipity_interior_count(order), SVMP_HERE,
+        "ReferenceNodeLayout: hexahedral serendipity face-interior node count mismatch");
+
+    const std::size_t before_volume = nodes.size();
+    append_hex_serendipity_volume_interior_nodes(nodes, order);
+    svmp::throw_if<BasisConstructionException>(
+        nodes.size() - before_volume != hex_serendipity_volume_interior_count(order), SVMP_HERE,
+        "ReferenceNodeLayout: hexahedral serendipity volume-interior node count mismatch");
+    return nodes;
+}
+
 std::vector<Point> element_nodes(ElementType elem_type) {
     const int order = complete_lagrange_alias_order(elem_type);
     if (order >= 0) {
@@ -642,11 +804,9 @@ std::vector<Point> element_nodes(ElementType elem_type) {
 
     switch (elem_type) {
         case ElementType::Quad8:
-            return serendipity_subset_nodes(BasisTopology::Quadrilateral,
-                                            generate_quad_nodes(2), 8u, 9u);
+            return quad_serendipity_nodes(2);
         case ElementType::Hex20:
-            return serendipity_subset_nodes(BasisTopology::Hexahedron,
-                                            generate_hex_nodes(2), 20u, 27u);
+            return hex_serendipity_nodes(2);
         case ElementType::Wedge15:
             return serendipity_subset_nodes(BasisTopology::Wedge,
                                             generate_wedge_nodes(2), 15u, 18u);
@@ -730,6 +890,23 @@ ReferenceNodeLayout::get_lagrange_lattice(ElementType canonical_type, int order)
     return layout;
 }
 
+std::vector<math::Vector<double, 3>>
+ReferenceNodeLayout::serendipity_node_coords(BasisTopology topology, int order) {
+    svmp::throw_if<BasisConstructionException>(
+        order < 1, SVMP_HERE,
+        "ReferenceNodeLayout::serendipity_node_coords requires a polynomial order >= 1");
+    switch (topology) {
+        case BasisTopology::Quadrilateral:
+            return quad_serendipity_nodes(order);
+        case BasisTopology::Hexahedron:
+            return hex_serendipity_nodes(order);
+        default:
+            svmp::raise<BasisElementCompatibilityException>(SVMP_HERE,
+                "ReferenceNodeLayout::serendipity_node_coords: generated serendipity layouts "
+                "exist only for Quadrilateral and Hexahedron (Wedge15 is the fixed named layout)");
+    }
+}
+
 } // namespace basis
 } // namespace FE
 } // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
index 71e4f6ad3..599febcaa 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
@@ -4,6 +4,7 @@
 #ifndef SVMP_FE_BASIS_NODEORDERINGCONVENTIONS_H
 #define SVMP_FE_BASIS_NODEORDERINGCONVENTIONS_H
 
+#include "BasisTraits.h"
 #include "Math/Vector.h"
 #include "Types.h"
 
@@ -99,6 +100,28 @@ class ReferenceNodeLayout {
      */
     static LagrangeNodeLayout
     get_lagrange_lattice(ElementType canonical_type, int order);
+
+    /**
+     * @brief Reference nodes for an arbitrary-order serendipity layout.
+     *
+     * @details Generates the stratified serendipity node set for the
+     * quadrilateral or hexahedral family at the requested order: the
+     * corner+edge skeleton (the leading prefix of the complete Lagrange layout
+     * of the same order, in VTK boundary order) followed by the reduced face
+     * and volume interior. This is the single source of serendipity node
+     * geometry -- SerendipityBasis builds its mode space and coefficient table
+     * on top of these coordinates for both the arbitrary-order path and the
+     * named Quad8/Hex20 layouts (the order-2 instances). Wedge serendipity
+     * (Wedge15) is a fixed named layout and is not generated here.
+     *
+     * @param topology BasisTopology::Quadrilateral or BasisTopology::Hexahedron.
+     * @param order Polynomial order; must be >= 1.
+     * @return Reference node coordinates in stratified (skeleton-then-interior) order.
+     * @throws BasisConstructionException If @p order is below 1.
+     * @throws BasisElementCompatibilityException If @p topology is not Quadrilateral or Hexahedron.
+     */
+    static std::vector<math::Vector<double, 3>>
+    serendipity_node_coords(BasisTopology topology, int order);
 };
 
 } // namespace basis
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index 1ff37f7a4..080ca34ce 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -155,193 +155,6 @@ std::vector<std::array<int, 3>> serendipity_exponents(int order, int active_axes
     return exponents;
 }
 
-std::size_t quad_serendipity_interior_count(int order) {
-    if (order < 4) {
-        return 0u;
-    }
-    const auto m = static_cast<std::size_t>(order - 4);
-    return (m + 1u) * (m + 2u) / 2u;
-}
-
-// Interior nodes are a triangular row set for P_m, m = order - 4. If a
-// serendipity polynomial vanishes at the p + 1 boundary nodes on each edge,
-// each edge restriction is identically zero and the polynomial factors as
-// (1 - x^2)(1 - y^2) q with q in P_m. Row 0 has m + 1 distinct x-values; if q
-// vanishes there, q(x, y_0) is the zero one-variable polynomial and
-// q = (y - y_0) q_1 with q_1 in P_{m-1}. Repeating over the remaining rows
-// proves q = 0, so the full quadrilateral serendipity Vandermonde is
-// nonsingular for this node set.
-void append_quad_serendipity_interior_nodes(std::vector<Vec3>& nodes, int order) {
-    if (order < 4) {
-        return;
-    }
-
-    // Interior staircase placed on Gauss-Lobatto-Legendre interior nodes (the same
-    // 1D distribution as the boundary), so the reduced space stays well-conditioned
-    // at high order. The unisolvence argument above needs only a distinct y per row
-    // and distinct x within each row; GLL changes where those distinct points sit,
-    // not the staircase structure.
-    const int m = order - 4;
-    for (int row = 0; row <= m; ++row) {
-        const int row_count = m + 1 - row;
-        const double y = line_coord_pm_one(row + 1, m + 2);
-        for (int col = 0; col < row_count; ++col) {
-            const double x = line_coord_pm_one(col + 1, row_count + 1);
-            nodes.push_back(Vec3{x, y, double(0)});
-        }
-    }
-}
-
-std::vector<Vec3> quad_serendipity_nodes(int order, std::size_t total_size) {
-    if (order <= 0) {
-        return {};
-    }
-
-    // The corner+edge skeleton is the leading prefix of the complete quadrilateral
-    // Lagrange layout of the same order: 4 corners followed by 4(order-1) edge
-    // nodes, in the same VTK boundary order. Source it from the single
-    // ReferenceNodeLayout generator and drop that layout's interior, so the
-    // reference-cell corner/edge geometry has one owner; only the reduced interior
-    // appended below is serendipity-specific.
-    std::vector<Vec3> nodes =
-        ReferenceNodeLayout::get_lagrange_node_coords(ElementType::Quad4, order);
-    const std::size_t boundary_count = static_cast<std::size_t>(4 * order);
-    svmp::throw_if<BasisConstructionException>(
-        boundary_count > nodes.size(), SVMP_HERE,
-        "SerendipityBasis: quadrilateral skeleton exceeds the complete Lagrange layout");
-    nodes.resize(boundary_count);
-
-    svmp::throw_if<BasisConstructionException>(
-        nodes.size() > total_size, SVMP_HERE,
-        "SerendipityBasis: quadrilateral serendipity boundary nodes exceed requested size");
-
-    const std::size_t interior_count = quad_serendipity_interior_count(order);
-    svmp::throw_if<BasisConstructionException>(
-        nodes.size() + interior_count != total_size, SVMP_HERE,
-        "SerendipityBasis: quadrilateral serendipity monomial/node count mismatch");
-
-    append_quad_serendipity_interior_nodes(nodes, order);
-    return nodes;
-}
-
-// Volume-interior node count for hexahedral serendipity. Once the boundary trace
-// is fixed, an interior serendipity function factors as the cube bubble
-// (1 - r^2)(1 - s^2)(1 - t^2) times a quotient of total degree at most order - 6,
-// so the interior space is P_{order-6} in three variables: empty until order 6,
-// then dim P_{order-6} = (m+1)(m+2)(m+3)/6 with m = order - 6.
-std::size_t hex_serendipity_volume_interior_count(int order) {
-    if (order < 6) {
-        return 0u;
-    }
-    const auto m = static_cast<std::size_t>(order - 6);
-    return (m + 1u) * (m + 2u) * (m + 3u) / 6u;
-}
-
-// Append the face-interior nodes. The restriction of the order-`order` cube
-// serendipity space to a face is the order-`order` quadrilateral serendipity
-// space, so every face carries the same 2D quad-serendipity interior set,
-// embedded into the face plane. Faces are visited in VTK face order
-// (-X, +X, -Y, +Y, -Z, +Z); the in-plane (u, v) point maps to the two free axes
-// of each face. Empty until order 4 (when the quad interior first appears).
-void append_hex_serendipity_face_interior_nodes(std::vector<Vec3>& nodes, int order) {
-    std::vector<Vec3> face_interior;  // (u, v, 0) interior points of one quad face
-    append_quad_serendipity_interior_nodes(face_interior, order);
-    if (face_interior.empty()) {
-        return;
-    }
-
-    // Each face: the fixed axis (0 = r, 1 = s, 2 = t), its +/-1 value, and the two
-    // in-plane axes that carry the 2D interior point (u, v).
-    struct Face {
-        int fixed_axis;
-        double fixed_value;
-        int u_axis;
-        int v_axis;
-    };
-    static constexpr Face faces[6] = {
-        {0, double(-1), 1, 2},  // -X: (s, t) in plane
-        {0, double(1),  1, 2},  // +X
-        {1, double(-1), 0, 2},  // -Y: (r, t) in plane
-        {1, double(1),  0, 2},  // +Y
-        {2, double(-1), 0, 1},  // -Z: (r, s) in plane
-        {2, double(1),  0, 1},  // +Z
-    };
-
-    for (const auto& face : faces) {
-        for (const auto& p : face_interior) {
-            Vec3 node = Vec3::Zero();
-            node[static_cast<std::size_t>(face.fixed_axis)] = face.fixed_value;
-            node[static_cast<std::size_t>(face.u_axis)] = p[0];
-            node[static_cast<std::size_t>(face.v_axis)] = p[1];
-            nodes.push_back(node);
-        }
-    }
-}
-
-// Append the volume-interior nodes: a tetrahedral staircase unisolvent for the
-// interior residual P_{order-6}. Each t-layer is a triangular staircase (the 2D
-// construction reused) whose total degree decreases by one per layer, so the
-// layers consume P_{order-6} by induction in t exactly as the quad interior
-// consumes P_{order-4} by induction in s. Empty until order 6.
-void append_hex_serendipity_volume_interior_nodes(std::vector<Vec3>& nodes, int order) {
-    if (order < 6) {
-        return;
-    }
-    // Tetrahedral staircase on Gauss-Lobatto-Legendre interior nodes, mirroring the
-    // 2D quad interior: distinct t per layer, distinct s per row, distinct r within
-    // a row keep the residual unisolvent while staying well-conditioned at order.
-    const int m = order - 6;
-    for (int layer = 0; layer <= m; ++layer) {
-        const int tri_order = m - layer;
-        const double t = line_coord_pm_one(layer + 1, m + 2);
-        for (int row = 0; row <= tri_order; ++row) {
-            const int row_count = tri_order + 1 - row;
-            const double s = line_coord_pm_one(row + 1, tri_order + 2);
-            for (int col = 0; col < row_count; ++col) {
-                const double r = line_coord_pm_one(col + 1, row_count + 1);
-                nodes.push_back(Vec3{r, s, t});
-            }
-        }
-    }
-}
-
-// Generate the hexahedral serendipity reference nodes in the generalized
-// right-hand-rule / VTK-consistent stratified order: 8 corners, then 12 edges in
-// VTK quadratic-hex edge order, then the 6 face interiors in VTK face order, then
-// the volume interior. The corner and edge strata are taken directly from the
-// complete hexahedral Lagrange layout (generate_hex_nodes, via ReferenceNodeLayout),
-// so they share that single generator's VTK ordering: at order 1 (corners only)
-// and order 2 (corners plus edge midpoints) the layout is exactly the public
-// Hex8 / Hex20 ordering, and for higher order the reduced face/volume sets are
-// this module's own convention.
-std::vector<Vec3> hex_serendipity_nodes(int order, std::size_t total_size) {
-    std::vector<Vec3> nodes =
-        ReferenceNodeLayout::get_lagrange_node_coords(ElementType::Hex8, order);
-    const std::size_t skeleton_count =
-        8u + 12u * static_cast<std::size_t>(order - 1);
-    svmp::throw_if<BasisConstructionException>(
-        skeleton_count > nodes.size(), SVMP_HERE,
-        "SerendipityBasis: hexahedral skeleton exceeds the complete Lagrange layout");
-    nodes.resize(skeleton_count);
-
-    const std::size_t skeleton = nodes.size();
-    append_hex_serendipity_face_interior_nodes(nodes, order);
-    svmp::throw_if<BasisConstructionException>(
-        nodes.size() - skeleton != 6u * quad_serendipity_interior_count(order), SVMP_HERE,
-        "SerendipityBasis: hexahedral serendipity face-interior node count mismatch");
-
-    const std::size_t before_volume = nodes.size();
-    append_hex_serendipity_volume_interior_nodes(nodes, order);
-    svmp::throw_if<BasisConstructionException>(
-        nodes.size() - before_volume != hex_serendipity_volume_interior_count(order), SVMP_HERE,
-        "SerendipityBasis: hexahedral serendipity volume-interior node count mismatch");
-
-    svmp::throw_if<BasisConstructionException>(
-        nodes.size() != total_size, SVMP_HERE,
-        "SerendipityBasis: hexahedral serendipity node count does not match the monomial count");
-    return nodes;
-}
-
 // Build the nodal coefficient table for a serendipity family: assemble the
 // generalized Vandermonde V[node][mode] = phi_a(r) phi_b(s) phi_c(t) at the
 // public-order reference nodes -- with phi the monomial or Legendre 1D modes per
@@ -386,9 +199,27 @@ std::vector<double> build_inverse_vandermonde(
     // shape functions; the comparison is negated so a non-finite value is rejected
     // too.
     const double norm_v = matrix_norm_inf(vandermonde, n);
-    std::vector<double> inverse = math::invert_dense_matrix(
-        std::move(vandermonde), n,
-        "SerendipityBasis interpolation matrix for " + label);
+
+    // invert_dense_matrix raises a generic FEException if the Vandermonde is
+    // exactly singular (a rank-deficient pivot). For a serendipity family that
+    // means the node set is not unisolvent at this order -- a construction failure
+    // in basis terms -- so translate it to BasisConstructionException, presenting
+    // the singular and the ill-conditioned cases (below) as one catchable type in
+    // one vocabulary. The matrix was just built n-by-n from n modes, so a
+    // size-mismatch FEException cannot originate here; rank deficiency is the only
+    // FEException this call can raise. (Defensive: the supported node sets are
+    // provably unisolvent, so this branch is not reachable for the shipped orders.)
+    std::vector<double> inverse;
+    try {
+        inverse = math::invert_dense_matrix(
+            std::move(vandermonde), n,
+            "SerendipityBasis interpolation matrix for " + label);
+    } catch (const FEException&) {
+        svmp::raise<BasisConstructionException>(SVMP_HERE,
+            "SerendipityBasis: " + label +
+                " interpolation matrix is singular; the serendipity node set is not "
+                "unisolvent at the requested order");
+    }
     const double condition_number = norm_v * matrix_norm_inf(inverse, n);
     svmp::throw_if<BasisConstructionException>(
         !(condition_number <= kSerendipityVandermondeMaxCond), SVMP_HERE,
@@ -572,9 +403,9 @@ SerendipityBasis::SerendipityBasis(BasisTopology topology, int order)
     dimension_ = topology_ == BasisTopology::Hexahedron ? 3 : 2;
     order_ = order;
     if (topology_ == BasisTopology::Hexahedron) {
-        init_hexahedron(order_, /*nodes_from_reference_layout=*/false);
+        init_hexahedron(order_);
     } else {
-        init_quadrilateral(order_, /*nodes_from_reference_layout=*/false);
+        init_quadrilateral(order_);
     }
 }
 
@@ -586,18 +417,17 @@ SerendipityBasis::SerendipityBasis(ElementType type, int order) {
 
     switch (type) {
         case ElementType::Quad8:
-            // Quad8 is the named quadratic layout; its nodes come from
-            // ReferenceNodeLayout so the basis shares the single public Quad8
-            // ordering (the same source Hex8/Hex20/Wedge15 use).
-            init_quadrilateral(order_, /*nodes_from_reference_layout=*/true);
+            // Quad8 is the order-2 instance of the quadrilateral serendipity
+            // space; the named overload only pins the order.
+            init_quadrilateral(order_);
             return;
         case ElementType::Hex8:
             // Hex8 is the order-1 instance of the hexahedral serendipity space.
-            init_hexahedron(1, /*nodes_from_reference_layout=*/true);
+            init_hexahedron(1);
             return;
         case ElementType::Hex20:
             // Hex20 is the order-2 instance of the hexahedral serendipity space.
-            init_hexahedron(2, /*nodes_from_reference_layout=*/true);
+            init_hexahedron(2);
             return;
         case ElementType::Wedge15:
             init_fixed_named(type);
@@ -616,14 +446,13 @@ SerendipityBasis::SerendipityBasis(ElementType type)
 // coefficient table for the given order. The coefficient table is the inverse
 // Vandermonde of tensor Legendre modes spanning the same polynomial space as the
 // monomial degree triples; because the nodes are in public order, evaluation
-// needs no output permutation. The named Quad8 layout sources its nodes from
-// ReferenceNodeLayout; the arbitrary-order topology path generates them.
-void SerendipityBasis::init_quadrilateral(int order, bool nodes_from_reference_layout) {
+// needs no output permutation. Reference nodes come from the single
+// ReferenceNodeLayout serendipity generator for both the named Quad8 layout and
+// the arbitrary-order path.
+void SerendipityBasis::init_quadrilateral(int order) {
     mode_exponents_ = serendipity_exponents(order, /*active_axes=*/2);
     size_ = mode_exponents_.size();
-    nodes_ = nodes_from_reference_layout
-                 ? ReferenceNodeLayout::node_coords(ElementType::Quad8)
-                 : quad_serendipity_nodes(order, size_);
+    nodes_ = ReferenceNodeLayout::serendipity_node_coords(BasisTopology::Quadrilateral, order);
     svmp::throw_if<BasisConstructionException>(
         nodes_.size() != size_, SVMP_HERE,
         "SerendipityBasis: quadrilateral serendipity setup produced inconsistent sizes");
@@ -634,20 +463,14 @@ void SerendipityBasis::init_quadrilateral(int order, bool nodes_from_reference_l
 }
 
 // Build the hexahedral serendipity mode set, reference nodes, and nodal
-// coefficient table for the given order, mirroring init_quadrilateral. The
-// arbitrary-order topology path generates its own VTK-consistent nodes; the named
-// Hex8 (order 1) and Hex20 (order 2) layouts source their public-order nodes from
-// ReferenceNodeLayout so the generated layout matches the public ordering exactly.
-void SerendipityBasis::init_hexahedron(int order, bool nodes_from_reference_layout) {
+// coefficient table for the given order, mirroring init_quadrilateral. Reference
+// nodes come from the single ReferenceNodeLayout serendipity generator; Hex8
+// (order 1) and Hex20 (order 2) are its order-1/order-2 instances and match the
+// public Hex8/Hex20 ordering exactly.
+void SerendipityBasis::init_hexahedron(int order) {
     mode_exponents_ = serendipity_exponents(order, /*active_axes=*/3);
     size_ = mode_exponents_.size();
-    if (nodes_from_reference_layout) {
-        const ElementType named =
-            (order == 1) ? ElementType::Hex8 : ElementType::Hex20;
-        nodes_ = ReferenceNodeLayout::node_coords(named);
-    } else {
-        nodes_ = hex_serendipity_nodes(order, size_);
-    }
+    nodes_ = ReferenceNodeLayout::serendipity_node_coords(BasisTopology::Hexahedron, order);
     svmp::throw_if<BasisConstructionException>(
         nodes_.size() != size_, SVMP_HERE,
         "SerendipityBasis: hexahedral serendipity setup produced inconsistent sizes");
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.h b/Code/Source/solver/FE/Basis/SerendipityBasis.h
index 09b2ced50..b4a513fd7 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.h
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.h
@@ -96,16 +96,16 @@ namespace basis {
  *
  * `SerendipityBasis(BasisTopology::Quadrilateral, p)` and
  * `SerendipityBasis(BasisTopology::Hexahedron, p)` are the arbitrary-order entry
- * points (@f$p \ge 1@f$; orders below one are rejected). They generate their own
- * reference nodes in a VTK-consistent stratified order; for @f$p \ge 3@f$ the
- * interior ordering is an implementation convention rather than a public layout.
- * The named fixed layouts -- `ElementType::Quad8` (order 2), `Hex8` (order 1),
- * and `Hex20` (order 2) -- are the same construction at those orders but take
- * their nodes from ReferenceNodeLayout so they carry the single public node
- * ordering the solver permutes against. Because the generator reuses the VTK
- * corner/edge ordering, its order-1 and order-2 hexahedral layouts match the
- * public Hex8/Hex20 ordering exactly, so the named and topology constructions
- * produce identical objects. Wedge serendipity remains a single fixed layout
+ * points (@f$p \ge 1@f$; orders below one are rejected). Reference nodes for both
+ * the arbitrary-order and the named paths come from the single
+ * ReferenceNodeLayout serendipity generator, in a VTK-consistent stratified
+ * order; for @f$p \ge 3@f$ the interior ordering is an implementation convention
+ * rather than a public layout. The named fixed layouts -- `ElementType::Quad8`
+ * (order 2), `Hex8` (order 1), and `Hex20` (order 2) -- are the same construction
+ * at those orders; the named overload only pins the order, so the named and
+ * topology constructions produce identical objects and share the single public
+ * node ordering the solver permutes against (order 1 and order 2 reuse the VTK
+ * corner/edge ordering exactly). Wedge serendipity remains a single fixed layout
  * (Wedge15), constructed only from its named ElementType. Solver-default basis
  * selection is separate: `basis_factory` maps the complete Quad4 layout to the
  * default linear Lagrange basis and maps Quad8/Hex20 to serendipity unless a
@@ -215,17 +215,16 @@ class SerendipityBasis final : public BasisFunction {
      * @brief Return the reference interpolation nodes in basis ordering.
      *
      * @details Node coordinates are the points at which the serendipity basis
-     * satisfies the nodal interpolation property. The named fixed layouts (Quad8,
-     * Hex8, Hex20, Wedge15) take their nodes from ReferenceNodeLayout, the public
-     * node-ordering source the solver adapter permutes against. Arbitrary-order
-     * quadrilateral and hexahedral serendipity (constructed from a BasisTopology)
-     * generates its nodes here instead, in VTK-consistent stratified order:
-     * corners and edges first (matching the public Quad8/Hex8/Hex20 ordering at
-     * the named orders), then the face and volume interior points needed to make
-     * the reduced polynomial space unisolvent. For @f$p \ge 3@f$ that interior
-     * ordering is an implementation convention; callers should pair it with basis
-     * values from the same object rather than assume an external mesh ordering
-     * contract beyond the supported named production layouts.
+     * satisfies the nodal interpolation property. All families take their nodes
+     * from ReferenceNodeLayout, the public node-ordering source the solver adapter
+     * permutes against: the fixed Wedge15 layout and the quadrilateral/hexahedral
+     * families (named or arbitrary-order) alike, in VTK-consistent stratified
+     * order -- corners and edges first (matching the public Quad8/Hex8/Hex20
+     * ordering at the named orders), then the face and volume interior points
+     * needed to make the reduced polynomial space unisolvent. For @f$p \ge 3@f$
+     * that interior ordering is an implementation convention; callers should pair
+     * it with basis values from the same object rather than assume an external
+     * mesh ordering contract beyond the supported named production layouts.
      *
      * @return Reference node coordinates, one per basis function.
      */
@@ -334,10 +333,10 @@ class SerendipityBasis final : public BasisFunction {
 
     // Build the quadrilateral serendipity mode set, nodes, and Legendre
     // coefficient table for the given order. (Details at the definition.)
-    void init_quadrilateral(int order, bool nodes_from_reference_layout);
+    void init_quadrilateral(int order);
     // Build the hexahedral serendipity mode set, nodes, and Legendre coefficient
     // table for the given order; Hex8/Hex20 are its order-1/order-2 instances.
-    void init_hexahedron(int order, bool nodes_from_reference_layout);
+    void init_hexahedron(int order);
     // Build the fixed Wedge15 layout from its tabulated monomial mode space.
     void init_fixed_named(ElementType type);
 
diff --git a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
index 62122676c..5ed806f5e 100644
--- a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
@@ -473,10 +473,10 @@ TEST(SerendipityBasis, Quad8IsNodalAndPartitionsUnity) {
     SerendipityBasis topology_quad_basis(BasisTopology::Quadrilateral, 2);
 
     EXPECT_EQ(basis.size(), 8u);
-    // Quad8 sources its nodes from ReferenceNodeLayout while the arbitrary-order
-    // Quadrilateral path at order 2 uses the local generator, so this also pins
-    // the two independent quadrilateral node sources to agree at the production
-    // order.
+    // The named Quad8 and the arbitrary-order Quadrilateral path at order 2 now
+    // share the single ReferenceNodeLayout serendipity generator, so this pins
+    // that the named and topology overloads build the same object. The independent
+    // node-coordinate oracle is Quad8ReferenceNodesMatchIndependentConstruction.
     expect_nodes_near(basis.nodes(), topology_quad_basis.nodes(), double(1e-14));
     expect_nodal_delta(basis, basis.nodes(), double(1e-10));
     expect_partition_of_unity(basis, {double(0.17), double(-0.31), double(0)});

From 955a7de9d5319c2a91949444b3cf2830038316fa Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Thu, 25 Jun 2026 00:23:49 -0700
Subject: [PATCH 73/91] removing un-used code for quad and hex serendipity
 basis

---
 .../FE/Basis/NodeOrderingConventions.cpp      | 67 ++++++++-----------
 1 file changed, 29 insertions(+), 38 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
index 58a090b42..52837c311 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
@@ -572,42 +572,34 @@ LagrangeNodeLayout complete_lagrange_nodes(ElementType canonical_type, int order
     }
 }
 
-// Topological interior dimension of an integer lattice node: the number of
-// independent directions in which the point sits in the relative interior of
-// the reference cell. A vertex gives 0, an edge-interior node 1, a
-// face-interior node 2, and a volume-interior node 3.
-int serendipity_interior_dim(BasisTopology top, const Lattice& idx, int order) {
+// Topological interior dimension of a wedge-prism lattice node: the number of
+// independent directions in which the point sits in the relative interior of the
+// reference cell. A vertex gives 0, an edge-interior node 1, a face-interior node
+// 2, and a volume-interior node 3. Only the wedge needs this classification -- it
+// is the one serendipity layout still built by truncating a complete layout
+// (serendipity_subset_nodes). Quadrilateral and hexahedral serendipity geometries
+// are generated directly by quad_/hex_serendipity_nodes and never go through here.
+int wedge_interior_dim(const Lattice& idx, int order) {
     const auto tensor_interior = [order](int v) { return (v > 0 && v < order) ? 1 : 0; };
-    switch (top) {
-        case BasisTopology::Quadrilateral:
-            return tensor_interior(idx[0]) + tensor_interior(idx[1]);
-        case BasisTopology::Hexahedron:
-            return tensor_interior(idx[0]) + tensor_interior(idx[1]) +
-                   tensor_interior(idx[2]);
-        case BasisTopology::Wedge: {
-            // (idx[0], idx[1]) is the triangle cross-section with implied third
-            // barycentric index k; idx[2] is the tensor through-axis. A triangle
-            // vertex contributes 0, a triangle edge 1, and the triangle interior 2.
-            const int i = idx[0];
-            const int j = idx[1];
-            const int k = order - i - j;
-            const bool tri_vertex = (i == order) || (j == order) || (i + j == 0);
-            const bool tri_interior = (i > 0) && (j > 0) && (k > 0);
-            const int tri_dim = tri_vertex ? 0 : (tri_interior ? 2 : 1);
-            return tri_dim + tensor_interior(idx[2]);
-        }
-        default:
-            return 0;
-    }
-}
-
-// Build a serendipity reference layout (Quad8, Hex20, Wedge15) from the complete
-// quadratic layout of the same topology. Serendipity layouts keep only the
-// element's vertices and edge midpoints and drop the face- and volume-interior
-// nodes; the complete-quadratic generators emit the vertex/edge nodes first, so
-// the serendipity set is exactly the leading keep_count nodes.
-std::vector<Point> serendipity_subset_nodes(BasisTopology top,
-                                            LagrangeNodeLayout complete,
+    // (idx[0], idx[1]) is the triangle cross-section with implied third
+    // barycentric index k; idx[2] is the tensor through-axis. A triangle vertex
+    // contributes 0, a triangle edge 1, and the triangle interior 2.
+    const int i = idx[0];
+    const int j = idx[1];
+    const int k = order - i - j;
+    const bool tri_vertex = (i == order) || (j == order) || (i + j == 0);
+    const bool tri_interior = (i > 0) && (j > 0) && (k > 0);
+    const int tri_dim = tri_vertex ? 0 : (tri_interior ? 2 : 1);
+    return tri_dim + tensor_interior(idx[2]);
+}
+
+// Build the Wedge15 serendipity reference layout from the complete quadratic wedge
+// layout. Serendipity layouts keep only the element's vertices and edge midpoints
+// and drop the face- and volume-interior nodes; the complete-quadratic generators
+// emit the vertex/edge nodes first, so the serendipity set is exactly the leading
+// keep_count nodes. (Quadrilateral and hexahedral serendipity geometries are
+// generated directly by quad_/hex_serendipity_nodes, not by truncation here.)
+std::vector<Point> serendipity_subset_nodes(LagrangeNodeLayout complete,
                                             std::size_t keep_count,
                                             std::size_t complete_count) {
     constexpr int kQuadraticOrder = 2;
@@ -622,7 +614,7 @@ std::vector<Point> serendipity_subset_nodes(BasisTopology top,
 
     for (std::size_t n = 0; n < complete.lattice.size(); ++n) {
         const bool on_skeleton =
-            serendipity_interior_dim(top, complete.lattice[n], kQuadraticOrder) <= 1;
+            wedge_interior_dim(complete.lattice[n], kQuadraticOrder) <= 1;
         const bool kept = n < keep_count;
         svmp::throw_if<BasisConstructionException>(
             kept != on_skeleton, SVMP_HERE,
@@ -808,8 +800,7 @@ std::vector<Point> element_nodes(ElementType elem_type) {
         case ElementType::Hex20:
             return hex_serendipity_nodes(2);
         case ElementType::Wedge15:
-            return serendipity_subset_nodes(BasisTopology::Wedge,
-                                            generate_wedge_nodes(2), 15u, 18u);
+            return serendipity_subset_nodes(generate_wedge_nodes(2), 15u, 18u);
         case ElementType::Pyramid13:
             svmp::raise<BasisNodeOrderingException>(SVMP_HERE,
                 "ReferenceNodeLayout: pyramid node ordering is disabled");

From 718649a58cf6d39706e2dd648a8c15f0775ebdd3 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Thu, 25 Jun 2026 11:02:47 -0700
Subject: [PATCH 74/91] simplifying documentation and making sure that basis
 specific exceptions are used

---
 Code/Source/solver/FE/Basis/BasisExceptions.h |  7 ++++
 Code/Source/solver/FE/Basis/LagrangeBasis.h   | 36 ++++++++-----------
 .../solver/FE/Basis/NodeOrderingConventions.h |  7 ++--
 .../solver/FE/Basis/SerendipityBasis.cpp      | 26 +++++++-------
 .../Source/solver/FE/Basis/SerendipityBasis.h | 14 ++++----
 Code/Source/solver/FE/Common/Types.h          | 11 +-----
 .../FE/Basis/test_BasisErrorPaths.cpp         |  4 ++-
 .../FE/Basis/test_SerendipityBasis.cpp        | 12 +++----
 8 files changed, 56 insertions(+), 61 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisExceptions.h b/Code/Source/solver/FE/Basis/BasisExceptions.h
index e4afc3153..3dd9cf756 100644
--- a/Code/Source/solver/FE/Basis/BasisExceptions.h
+++ b/Code/Source/solver/FE/Basis/BasisExceptions.h
@@ -12,6 +12,13 @@ namespace basis {
 
 /**
  * @brief Base exception type for errors originating in the Basis module
+ *
+ * @details The Basis module raises these basis-specific types -- rather than the
+ * generic FE exceptions in FEException.h -- so a caller can catch a basis failure
+ * precisely: an unsupported element/order pairing, a non-unisolvent node set, an
+ * out-of-range reference-node index. They all derive from FEException, so code
+ * that only wants "some FE error" can still catch the base type, and they carry
+ * the same StatusCode and source location as the rest of the hierarchy.
  */
 class BasisException : public FEException {
 public:
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.h b/Code/Source/solver/FE/Basis/LagrangeBasis.h
index 76cc92590..487af5044 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.h
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.h
@@ -44,21 +44,19 @@ namespace basis {
  * ## Reference-node distribution
  *
  * The interpolation nodes are not a single distribution across topologies; each
- * family uses the node set its evaluator is built for (see ReferenceNodeLayout
- * and line_coord_pm_one):
- * - **Tensor-product (line, quadrilateral, hexahedron):** Gauss-Lobatto-Legendre
- *   (GLL) nodes on each axis -- endpoints at @f$\pm 1@f$ and interior nodes at
- *   the roots of @f$P'_p@f$ -- not an equispaced layout.
+ * family uses the node set its evaluator is built for:
+ * - **Tensor-product (line, quadrilateral, hexahedron):** the shared
+ *   Gauss-Lobatto-Legendre (GLL) tensor-axis nodes -- see line_coord_pm_one for
+ *   the distribution and its conditioning -- not an equispaced layout.
  * - **Simplex (triangle, tetrahedron):** the equispaced barycentric lattice
  *   (each barycentric coordinate at @f$i/p@f$). The closed-form evaluator below
  *   is specific to this equispaced lattice.
  * - **Wedge:** the tensor product of an equispaced triangle cross-section with a
  *   GLL through-axis.
  *
- * GLL coincides with the equispaced layout at orders 1 and 2, so every named
- * production element (Line2/Line3, Triangle3/Triangle6, Quad4/Quad9,
- * Tetra4/Tetra10, Hex8/Hex27, Wedge6/Wedge18) keeps its standard coordinates;
- * the GLL/equispaced distinction appears only for order >= 3.
+ * Because GLL coincides with the equispaced layout at orders 1 and 2
+ * (line_coord_pm_one), every named production element keeps its standard
+ * coordinates and the GLL/equispaced distinction appears only for order >= 3.
  *
  * ## Evaluation
  *
@@ -127,12 +125,11 @@ class LagrangeBasis final : public BasisFunction {
      * store per-axis node indices, simplex bases store barycentric exponent
      * tuples, and wedge bases store the triangle-node/axis-node decomposition.
      *
-     * Reference nodes are Gauss-Lobatto-Legendre on tensor-product axes and the
-     * equispaced barycentric lattice on simplex axes (see the class description).
-     * High order stays well-conditioned on tensor-product topologies but degrades
-     * on simplex and wedge topologies, and -- unlike SerendipityBasis -- this
-     * constructor does not reject ill-conditioned high-order simplex/wedge
-     * requests; that choice is the caller's.
+     * Reference nodes follow the per-topology distribution described in the class
+     * documentation (Reference-node distribution). Unlike SerendipityBasis, this
+     * constructor does not reject ill-conditioned high-order simplex/wedge requests
+     * (where the equispaced barycentric lattice degrades); that choice is the
+     * caller's.
      *
      * @param topology Reference topology; Point through the volume topologies.
      * @param order Polynomial order; must be non-negative. Point is order 0.
@@ -191,12 +188,9 @@ class LagrangeBasis final : public BasisFunction {
     /**
      * @brief Return the reference interpolation nodes in basis ordering.
      *
-     * @details The returned node order matches the basis-function order used
-     * by all evaluators. Coordinates are reference-element coordinates:
-     * tensor-product axes use the Gauss-Lobatto-Legendre nodes on @f$[-1,1]@f$,
-     * triangles and tetrahedra use the equispaced barycentric simplex lattice,
-     * and wedges combine the equispaced triangle lattice with a GLL @f$[-1,1]@f$
-     * through-axis coordinate.
+     * @details The returned node order matches the basis-function order used by
+     * all evaluators; the coordinates follow the per-topology distribution
+     * described in the class documentation (Reference-node distribution).
      *
      * @return Reference node coordinates, one per basis function.
      */
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
index 599febcaa..cf692a8f5 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
@@ -29,8 +29,11 @@ namespace basis {
  * equispaced layout for the production orders and differ only for order >= 3.
  * Returns 0 for order <= 0 when @p i is 0. Invalid indices throw.
  *
- * Shared by the reference-node layout generators and the Lagrange tensor-axis
- * node initialization so the 1D distribution lives in a single place.
+ * This is the single definition of the tensor-axis node distribution: the
+ * reference-node layout generators, the Lagrange tensor-axis initialization, and
+ * the serendipity edge/face/interior strata all source their 1D nodes here. The
+ * LagrangeBasis and SerendipityBasis docs point back to this description of the
+ * GLL distribution and its conditioning rather than restating it.
  *
  * @param i Node index in [0, order] for positive orders, or 0 for order <= 0.
  * @param order Polynomial order of the 1D distribution.
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index 080ca34ce..fc57de402 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -34,14 +34,11 @@ inline double integer_power(double base, int exponent) {
     return result;
 }
 
-// Which 1D polynomial family the tensor modes are written in. Monomials x^k are
-// simple but make the interpolation (Vandermonde) matrix exponentially
-// ill-conditioned as the order grows; tensor products of Legendre polynomials
-// P_k span exactly the same polynomial space (the serendipity exponent set is
-// downward-closed, so the change of basis is triangular) while keeping the
-// Vandermonde well-conditioned. The quadrilateral and hexahedral families use
-// Legendre; the fixed Wedge15 layout (order 2, trivially well-conditioned) keeps
-// the monomial form.
+// Which 1D polynomial family the tensor modes are written in: the quadrilateral
+// and hexahedral families use Legendre, the fixed Wedge15 layout (order 2) keeps
+// the monomial form. Both span the same serendipity polynomial space; see the
+// SerendipityBasis class documentation (Modal basis) for why Legendre is used to
+// keep the Vandermonde well-conditioned.
 enum class ModalAxisKind { Monomial, Legendre };
 
 // Value and first/second derivative of every 1D mode phi_0..phi_{max_degree} at a
@@ -133,12 +130,13 @@ double matrix_norm_inf(const std::vector<double>& matrix, std::size_t n) {
 }
 
 // Per-axis degree triples (ax, ay, az) of the serendipity mode space: every
-// combination whose superlinear degree (the sum of superlinear_term over the
-// axes) is at most `order`. `active_axes` is 2 for the quadrilateral (az pinned
-// to 0) and 3 for the hexahedron, so the quad space is exactly the hex space
-// restricted to az = 0. The same downward-closed set spans both the monomial and
-// the tensor Legendre basis (see ModalAxisKind), and the resulting nodal basis is
-// independent of how this set is ordered.
+// combination whose superlinear degree (the sum of superlinear_term over the axes;
+// see the SerendipityBasis class documentation for the rule) is at most `order`.
+// `active_axes` is 2 for the quadrilateral (az pinned to 0) and 3 for the
+// hexahedron, so the quad space is exactly the hex space restricted to az = 0. The
+// set is downward-closed (so it spans both the monomial and the tensor Legendre
+// basis; see ModalAxisKind), and the resulting nodal basis is independent of how
+// the set is ordered.
 std::vector<std::array<int, 3>> serendipity_exponents(int order, int active_axes) {
     const int max_y = active_axes >= 2 ? order : 0;
     const int max_z = active_axes >= 3 ? order : 0;
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.h b/Code/Source/solver/FE/Basis/SerendipityBasis.h
index b4a513fd7..2efb4e788 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.h
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.h
@@ -126,13 +126,13 @@ namespace basis {
  * High-order nodal interpolation is governed by two conditioning factors, both
  * addressed so that arbitrary orders produce trustworthy shape functions:
  * - **Node distribution.** The quadrilateral and hexahedral families place their
- *   nodes on the Gauss-Lobatto-Legendre (GLL) distribution (edges, faces, and the
- *   interior staircase all use the GLL 1D nodes). GLL has a logarithmic Lebesgue
- *   constant, where an equispaced layout grows exponentially (the Runge
- *   phenomenon). The named production layouts are unaffected: GLL coincides with
- *   the equispaced layout at orders 1 and 2, so Quad8/Hex8/Hex20 keep their exact
- *   public coordinates; GLL differs only for order >= 3, where the layout is this
- *   module's own convention.
+ *   nodes on the shared Gauss-Lobatto-Legendre (GLL) distribution -- edges, faces,
+ *   and the interior staircase all use the GLL 1D nodes (line_coord_pm_one), whose
+ *   logarithmic Lebesgue constant keeps high-order interpolation well-conditioned.
+ *   The named production layouts are unaffected, since GLL coincides with the
+ *   equispaced layout at orders 1 and 2 (so Quad8/Hex8/Hex20 keep their exact
+ *   public coordinates); the layout is this module's own convention only for
+ *   order >= 3.
  * - **Modal basis.** The quadrilateral and hexahedral Vandermondes are assembled
  *   in a tensor **Legendre** basis rather than raw monomials. The serendipity
  *   exponent set is downward-closed, so the Legendre and monomial spans are
diff --git a/Code/Source/solver/FE/Common/Types.h b/Code/Source/solver/FE/Common/Types.h
index d90388114..5beefcce3 100644
--- a/Code/Source/solver/FE/Common/Types.h
+++ b/Code/Source/solver/FE/Common/Types.h
@@ -272,17 +272,8 @@ enum class QuadratureType : std::uint8_t {
  */
 enum class BasisType : std::uint8_t {
     Lagrange,          ///< Standard nodal Lagrange basis
-    Hierarchical,      ///< Hierarchical/modal basis
-    Bernstein,         ///< Bernstein polynomials
-    NURBS,             ///< Non-uniform rational B-splines
-    BSpline,           ///< Non-rational B-spline basis
-    Spectral,          ///< Spectral element basis
+    NURBS,             ///< Non-uniform rational B-splines (reserved; not yet implemented)
     Serendipity,       ///< Serendipity elements
-    Hermite,           ///< Hermite C1 continuity basis
-    RaviartThomas,     ///< H(div) Raviart-Thomas family
-    Nedelec,           ///< H(curl) Nedelec edge elements
-    BDM,               ///< H(div) Brezzi-Douglas-Marini family
-    Bubble,            ///< Interior bubble functions for enrichment
     Custom             ///< User-defined basis
 };
 
diff --git a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
index 0fcc25e0b..0c1f8d6a0 100644
--- a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
@@ -258,8 +258,10 @@ TEST(BasisErrorPaths, BasisFactoryInvalidRequestsThrowBasisExceptions) {
     EXPECT_THROW((void)basis_factory::create(
                      BasisRequest{ElementType::Line2, BasisType::Lagrange, -1}),
                  BasisConfigurationException);
+    // NURBS is a declared but unimplemented family, so the scalar factory rejects
+    // it as outside the Lagrange/Serendipity scope.
     EXPECT_THROW((void)basis_factory::create(
-                     BasisRequest{ElementType::Line2, BasisType::Bernstein, 1}),
+                     BasisRequest{ElementType::Line2, BasisType::NURBS, 1}),
                  BasisConfigurationException);
     EXPECT_THROW((void)basis_factory::create(
                      BasisRequest{ElementType::Pyramid5, BasisType::Lagrange, 1}),
diff --git a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
index 5ed806f5e..b764c8a8a 100644
--- a/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_SerendipityBasis.cpp
@@ -522,14 +522,14 @@ TEST(SerendipityBasis, Wedge15IsNodalAndPartitionsUnity) {
 }
 
 TEST(SerendipityBasis, RejectsUnsupportedSerendipityAliases) {
-    EXPECT_THROW(SerendipityBasis(ElementType::Quad9, 2), FEException);
-    EXPECT_THROW(SerendipityBasis(ElementType::Pyramid13, 2), FEException);
-    EXPECT_THROW(SerendipityBasis(ElementType::Pyramid14, 2), FEException);
-    EXPECT_THROW(SerendipityBasis(ElementType::Quad8, 3), FEException);
-    EXPECT_THROW(SerendipityBasis(ElementType::Quad8, 1), FEException);
+    EXPECT_THROW(SerendipityBasis(ElementType::Quad9, 2), BasisElementCompatibilityException);
+    EXPECT_THROW(SerendipityBasis(ElementType::Pyramid13, 2), BasisElementCompatibilityException);
+    EXPECT_THROW(SerendipityBasis(ElementType::Pyramid14, 2), BasisElementCompatibilityException);
+    EXPECT_THROW(SerendipityBasis(ElementType::Quad8, 3), BasisConfigurationException);
+    EXPECT_THROW(SerendipityBasis(ElementType::Quad8, 1), BasisConfigurationException);
     // Quad4 is the linear Lagrange quad, not a named serendipity layout; arbitrary
     // quadrilateral serendipity is requested through BasisTopology::Quadrilateral.
-    EXPECT_THROW(SerendipityBasis(ElementType::Quad4, 2), FEException);
+    EXPECT_THROW(SerendipityBasis(ElementType::Quad4, 2), BasisElementCompatibilityException);
 }
 
 // Topology construction is the arbitrary-order entry point and exists only for

From c114538156e062c769274473593cbbe8141b28e7 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Thu, 25 Jun 2026 11:54:29 -0700
Subject: [PATCH 75/91] improving doxygen documentation with internal callouts

---
 Code/Source/solver/FE/Basis/BasisExceptions.h | 14 +++++
 Code/Source/solver/FE/Basis/BasisFactory.h    | 45 ++++++++------
 Code/Source/solver/FE/Basis/BasisTraits.h     | 59 ++++++++++++++-----
 .../FE/Basis/NodeOrderingConventions.cpp      |  6 ++
 .../solver/FE/Basis/NodeOrderingConventions.h |  8 +++
 5 files changed, 101 insertions(+), 31 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisExceptions.h b/Code/Source/solver/FE/Basis/BasisExceptions.h
index 3dd9cf756..dd642e16b 100644
--- a/Code/Source/solver/FE/Basis/BasisExceptions.h
+++ b/Code/Source/solver/FE/Basis/BasisExceptions.h
@@ -10,6 +10,18 @@ namespace svmp {
 namespace FE {
 namespace basis {
 
+/**
+ * @defgroup FE_BasisExceptions Exceptions
+ * @ingroup FE_Basis
+ * @brief Basis-module exception hierarchy.
+ *
+ * @details Every Basis exception derives from BasisException (and thus FEException),
+ * so a caller can catch a specific basis failure or the FE base type. See
+ * BasisException for why the module raises these basis-specific types rather than
+ * the generic FE exceptions.
+ * @{
+ */
+
 /**
  * @brief Base exception type for errors originating in the Basis module
  *
@@ -90,6 +102,8 @@ class BasisConstructionException : public BasisException {
         : BasisException(message, file, line, function, StatusCode::InternalError) {}
 };
 
+/** @} */
+
 } // namespace basis
 } // namespace FE
 } // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/BasisFactory.h b/Code/Source/solver/FE/Basis/BasisFactory.h
index 0bc9c5c9e..5ce99682e 100644
--- a/Code/Source/solver/FE/Basis/BasisFactory.h
+++ b/Code/Source/solver/FE/Basis/BasisFactory.h
@@ -19,24 +19,32 @@ namespace svmp {
 namespace FE {
 namespace basis {
 
+/**
+ * @brief Runtime description of a basis to construct.
+ * @ingroup FE_Basis
+ *
+ * @details A request identifies exactly one construction target -- a named
+ * ElementType layout, or a reference BasisTopology with an explicit order -- plus
+ * the family and field policy; basis_factory::create() validates and builds from
+ * it. The spline/NURBS fields are reserved for future families and are unused by
+ * the scalar Lagrange/Serendipity factory.
+ */
 struct BasisRequest {
-    // Named mesh element layout for default/mesh-compatible bases. Leave Unknown
-    // when requesting an arbitrary-order basis by reference topology.
-    ElementType element_type{ElementType::Unknown};
-    BasisType basis_type{BasisType::Lagrange};
-    std::optional<int> order{};
-    Continuity continuity{Continuity::C0};
-    FieldType field_type{FieldType::Scalar};
-    std::vector<double> knot_vector{};
-    std::vector<double> weights{};
-    std::vector<int> axis_orders{};
-    std::vector<std::vector<double>> axis_knot_vectors{};
-    std::vector<std::vector<double>> axis_weights{};
-    std::vector<int> tensor_extents{};
-    std::string custom_id{};
-    // Reference topology for arbitrary-order bases. This field is intentionally
-    // last so existing aggregate initializers for named elements keep their
-    // positional meaning.
+    ElementType element_type{ElementType::Unknown};  ///< Named element layout, or Unknown to request by topology.
+    BasisType basis_type{BasisType::Lagrange};       ///< Basis family to construct.
+    std::optional<int> order{};                      ///< Polynomial order; required by the factory.
+    Continuity continuity{Continuity::C0};           ///< Inter-element continuity (Lagrange/Serendipity are C0).
+    FieldType field_type{FieldType::Scalar};         ///< Field type (Lagrange/Serendipity support Scalar).
+    std::vector<double> knot_vector{};               ///< Reserved for spline/NURBS families; unused here.
+    std::vector<double> weights{};                   ///< Reserved for rational (NURBS) families; unused here.
+    std::vector<int> axis_orders{};                  ///< Reserved for per-axis tensor spline orders; unused here.
+    std::vector<std::vector<double>> axis_knot_vectors{};  ///< Reserved for per-axis spline knots; unused here.
+    std::vector<std::vector<double>> axis_weights{};       ///< Reserved for per-axis rational weights; unused here.
+    std::vector<int> tensor_extents{};               ///< Reserved for tensor-product extents; unused here.
+    std::string custom_id{};                         ///< Optional identifier for Custom families.
+    /// Reference topology for arbitrary-order requests, or Unknown to request by
+    /// element_type. Kept last so existing aggregate initializers for named
+    /// elements keep their positional meaning.
     BasisTopology topology{BasisTopology::Unknown};
 };
 
@@ -44,6 +52,7 @@ namespace basis_factory {
 
 /**
  * @brief Create a basis from a runtime request.
+ * @ingroup FE_Basis
  *
  * @details A request must identify exactly one construction target: set
  * BasisRequest::element_type for a named mesh-node layout, or set
@@ -59,6 +68,7 @@ namespace basis_factory {
 
 /**
  * @brief Return the default basis request (family and order) for an element type.
+ * @ingroup FE_Basis
  *
  * @details This is the single source of truth for which basis family and
  * polynomial order a given element type uses by default: serendipity node
@@ -77,6 +87,7 @@ namespace basis_factory {
 
 /**
  * @brief Create the default basis for an element type.
+ * @ingroup FE_Basis
  *
  * @details Equivalent to create(default_basis_request(element_type)).
  *
diff --git a/Code/Source/solver/FE/Basis/BasisTraits.h b/Code/Source/solver/FE/Basis/BasisTraits.h
index a9235d32c..79d4653f7 100644
--- a/Code/Source/solver/FE/Basis/BasisTraits.h
+++ b/Code/Source/solver/FE/Basis/BasisTraits.h
@@ -4,6 +4,12 @@
 #ifndef SVMP_FE_BASIS_BASISTRAITS_H
 #define SVMP_FE_BASIS_BASISTRAITS_H
 
+/**
+ * @file BasisTraits.h
+ * @brief Reference-topology vocabulary (BasisTopology) and the internal
+ *        ElementType/topology/order maps.
+ */
+
 #include "Types.h"
 
 #include <cstddef>
@@ -12,17 +18,30 @@ namespace svmp {
 namespace FE {
 namespace basis {
 
+/**
+ * @brief Reference-cell topology of a basis (the shape, independent of order).
+ * @ingroup FE_Basis
+ *
+ * @details Together with a polynomial order this is the order-agnostic identity a
+ * basis is built from: the arbitrary-order constructors take a BasisTopology and an
+ * order, and BasisRequest::topology selects that path. A named ElementType maps to
+ * one of these through topology().
+ */
 enum class BasisTopology {
-    Unknown,
-    Point,
-    Line,
-    Triangle,
-    Quadrilateral,
-    Tetrahedron,
-    Hexahedron,
-    Wedge,
+    Unknown,        ///< Unrecognized or uninitialized topology.
+    Point,          ///< 0D point.
+    Line,           ///< 1D line segment.
+    Triangle,       ///< 2D triangle (simplex).
+    Quadrilateral,  ///< 2D quadrilateral (tensor product).
+    Tetrahedron,    ///< 3D tetrahedron (simplex).
+    Hexahedron,     ///< 3D hexahedron (tensor product).
+    Wedge,          ///< 3D triangular prism.
 };
 
+// The maps below are internal to the Basis module (used to build the basis classes
+// and the factory); they are excluded from the public Doxygen output.
+/** @cond INTERNAL */
+
 // ---------------------------------------------------------------------------
 // ElementType / BasisTopology / order mapping helpers.
 //
@@ -158,12 +177,24 @@ enum class BasisTopology {
     return complete_lagrange_alias_order(type);
 }
 
-// Inverse of (topology(), order()) for the named layouts: the ElementType that a
-// (topology, order, family) triple denotes, or Unknown when no named layout
-// exists (order 0 on a non-point topology, any order >= 3, or a reduced family
-// at an unsupported order). topology() + order() remain the authoritative
-// identity; callers that want a named ElementType for a basis pass its
-// topology(), order(), and basis_type() to this free helper directly.
+/** @endcond */
+
+/**
+ * @brief Named ElementType denoted by a (topology, order, family) triple.
+ * @ingroup FE_Basis
+ *
+ * @details Inverse of topology() + order() for the named layouts: returns the
+ * ElementType a basis identity denotes, or ElementType::Unknown when no named
+ * layout exists (order 0 on a non-point topology, any order >= 3, or a reduced
+ * family at an unsupported order). topology() + order() remain the authoritative
+ * identity; callers that want a named ElementType for a basis pass its topology(),
+ * order(), and basis_type() here.
+ *
+ * @param top Reference topology.
+ * @param order Polynomial order.
+ * @param family Basis family; only Serendipity is distinguished from nodal/Lagrange naming.
+ * @return Named ElementType, or ElementType::Unknown when none applies.
+ */
 [[nodiscard]] constexpr ElementType named_element_for(BasisTopology top, int order,
                                                       BasisType family) noexcept {
     if (family == BasisType::Serendipity) {
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
index 52837c311..0b4796db6 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
@@ -15,6 +15,10 @@ namespace svmp {
 namespace FE {
 namespace basis {
 
+// Internal to the Basis module; excluded from the public Doxygen output (the
+// matching conditional region is in NodeOrderingConventions.h).
+/** @cond INTERNAL */
+
 namespace {
 
 using Point = math::Vector<double, 3>;
@@ -898,6 +902,8 @@ ReferenceNodeLayout::serendipity_node_coords(BasisTopology topology, int order)
     }
 }
 
+/** @endcond */
+
 } // namespace basis
 } // namespace FE
 } // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
index cf692a8f5..fc0b9a5dd 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
@@ -17,6 +17,12 @@ namespace svmp {
 namespace FE {
 namespace basis {
 
+// NodeOrderingConventions is internal to the Basis module: it is the reference-node
+// generator the basis families build on, not a consumer entry point (callers obtain
+// a basis through basis_factory and read BasisFunction::nodes()), so it is excluded
+// from the public Doxygen output below.
+/** @cond INTERNAL */
+
 /**
  * @brief The i-th 1D tensor-axis reference node on [-1, 1] at the given order.
  *
@@ -127,6 +133,8 @@ class ReferenceNodeLayout {
     serendipity_node_coords(BasisTopology topology, int order);
 };
 
+/** @endcond */
+
 } // namespace basis
 } // namespace FE
 } // namespace svmp

From 319d1164ab466491ded1862f0331920b4ce85145 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Thu, 25 Jun 2026 14:58:14 -0700
Subject: [PATCH 76/91] improving comments and inlining `eval_modal_basis` for
 clarity in serendipity

---
 Code/Source/solver/FE/Basis/BasisFunction.h   |  13 --
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp |  10 +-
 .../solver/FE/Basis/SerendipityBasis.cpp      | 153 ++++++++----------
 3 files changed, 71 insertions(+), 105 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index 16b60f4e4..c7caf07bf 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -148,19 +148,6 @@ using Hessian  = math::Matrix<double, 3, 3>;
  */
 void require_span_size(std::size_t actual, std::size_t expected, const char* label);
 
-/**
- * @brief Check a requested output span unless it is empty, following the
- * "skip this output" convention used by the combined evaluators.
- */
-template <typename T>
-void require_requested_span_size(std::span<T> output,
-                                 std::size_t expected,
-                                 const char* label) {
-    if (!output.empty()) {
-        require_span_size(output.size(), expected, label);
-    }
-}
-
 /**
  * @brief Abstract interface for finite-element basis-function families.
  * @ingroup FE_Basis
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index a2188d4e9..dbb29bff3 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -71,8 +71,8 @@ BasisTopology validated_lagrange_topology(ElementType element_type, int order) {
 
 // Convert an integer lattice index (i, j[, k]) into the barycentric exponent
 // tuple (order - i - j - k, i, j, k). The lattice already carries the exact
-// coordinate indices, so no floating-point round-trip is needed; the accessor's
-// structural invariants guarantee i + j + k <= order, hence e[0] >= 0.
+// coordinate indices, the accessor's structural invariants guarantee
+//  i + j + k <= order, hence e[0] >= 0.
 LagrangeBasis::SimplexExponent simplex_exponent_from_lattice(const std::array<int, 3>& idx,
                                                             BasisTopology top,
                                                             int order) {
@@ -557,9 +557,9 @@ void LagrangeBasis::evaluate_all_to(const Vec3& xi,
                                     std::span<double> values_out,
                                     std::span<Gradient> gradients_out,
                                     std::span<Hessian> hessians_out) const {
-    require_requested_span_size(values_out, size(), "LagrangeBasis::evaluate_all_to values");
-    require_requested_span_size(gradients_out, size(), "LagrangeBasis::evaluate_all_to gradients");
-    require_requested_span_size(hessians_out, size(), "LagrangeBasis::evaluate_all_to hessians");
+    // Private sink: callers guarantee valid output spans -- the public *_to methods
+    // validate their one output with require_span_size, and the vector evaluators
+    // resize to size(). An empty span here means "skip that quantity".
 
     if (values_out.empty() && gradients_out.empty() && hessians_out.empty()) {
         return;
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index fc57de402..e47718320 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -253,83 +253,6 @@ constexpr std::array<std::array<int, 3>, 15> kWedge15MonomialExponents = {{
     {{2, 0, 1}}
 }};
 
-// Evaluate a nodal basis defined by a modal coefficient table at one reference
-// point. The three axis tables (tx, ty, tz) already hold phi and its derivatives
-// for every per-axis degree at this point. For each mode j the routine forms
-// phi_a(r) phi_b(s) phi_c(t) and the requested derivatives, then accumulates the
-// coefficient-weighted sum into the output slots. `count` is both the number of
-// modes and the number of basis functions (the coefficient table is square). The
-// table is in public basis order, so output slot i reads coefficient column i
-// directly. Outputs are assumed pre-zeroed by the caller; an empty span skips that
-// quantity.
-template <typename ExponentFn, typename CoeffFn>
-void eval_modal_basis(const AxisTable& tx, const AxisTable& ty, const AxisTable& tz,
-                      std::size_t count,
-                      ExponentFn&& exponent,
-                      CoeffFn&& coeff,
-                      std::span<double> values,
-                      std::span<Gradient> gradients,
-                      std::span<Hessian> hessians) {
-    const bool want_values = !values.empty();
-    const bool want_gradients = !gradients.empty();
-    const bool want_hessians = !hessians.empty();
-
-    for (std::size_t j = 0; j < count; ++j) {
-        const std::array<int, 3> e = exponent(j);
-        const std::size_t ex = static_cast<std::size_t>(e[0]);
-        const std::size_t ey = static_cast<std::size_t>(e[1]);
-        const std::size_t ez = static_cast<std::size_t>(e[2]);
-
-        const double vx = tx.value[ex];
-        const double vy = ty.value[ey];
-        const double vz = tz.value[ez];
-        const double phi = vx * vy * vz;
-
-        double d_dr = double(0), d_ds = double(0), d_dt = double(0);
-        if (want_gradients || want_hessians) {
-            d_dr = tx.first[ex] * vy * vz;
-            d_ds = vx * ty.first[ey] * vz;
-            d_dt = vx * vy * tz.first[ez];
-        }
-
-        double d_drr = double(0), d_dss = double(0), d_dtt = double(0);
-        double d_drs = double(0), d_drt = double(0), d_dst = double(0);
-        if (want_hessians) {
-            d_drr = tx.second[ex] * vy * vz;
-            d_dss = vx * ty.second[ey] * vz;
-            d_dtt = vx * vy * tz.second[ez];
-            d_drs = tx.first[ex] * ty.first[ey] * vz;
-            d_drt = tx.first[ex] * vy * tz.first[ez];
-            d_dst = vx * ty.first[ey] * tz.first[ez];
-        }
-
-        for (std::size_t slot = 0; slot < count; ++slot) {
-            const double c = coeff(j, slot);
-            if (want_values) {
-                values[slot] += c * phi;
-            }
-            if (want_gradients) {
-                Gradient& g = gradients[slot];
-                g[0] += c * d_dr;
-                g[1] += c * d_ds;
-                g[2] += c * d_dt;
-            }
-            if (want_hessians) {
-                Hessian& h = hessians[slot];
-                h(0, 0) += c * d_drr;
-                h(1, 1) += c * d_dss;
-                h(2, 2) += c * d_dtt;
-                h(0, 1) += c * d_drs;
-                h(1, 0) += c * d_drs;
-                h(0, 2) += c * d_drt;
-                h(2, 0) += c * d_drt;
-                h(1, 2) += c * d_dst;
-                h(2, 1) += c * d_dst;
-            }
-        }
-    }
-}
-
 struct NormalizedSerendipityRequest {
     BasisTopology topology;
     int dimension;
@@ -508,9 +431,9 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<double, 3>& xi,
                                        std::span<double> values_out,
                                        std::span<Gradient> gradients_out,
                                        std::span<Hessian> hessians_out) const {
-    require_requested_span_size(values_out, size_, "SerendipityBasis::evaluate_all_to values");
-    require_requested_span_size(gradients_out, size_, "SerendipityBasis::evaluate_all_to gradients");
-    require_requested_span_size(hessians_out, size_, "SerendipityBasis::evaluate_all_to hessians");
+    // Private sink: callers guarantee valid output spans -- the public *_to methods
+    // validate their one output with require_span_size, and the vector evaluators
+    // resize to size_. An empty span here means "skip that quantity".
 
     if (values_out.empty() && gradients_out.empty() && hessians_out.empty()) {
         return;
@@ -549,13 +472,69 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<double, 3>& xi,
     fill_axis_table(kind, y, order_, ty);
     fill_axis_table(kind, z, order_, tz);
 
-    eval_modal_basis(
-        tx, ty, tz, size_,
-        [this](std::size_t j) { return mode_exponents_[j]; },
-        [this](std::size_t j, std::size_t i) {
-            return inv_vandermonde_[j * size_ + i];
-        },
-        values_out, gradients_out, hessians_out);
+    // Accumulate the nodal shape functions from the modal tables. For each mode j,
+    // phi = phi_a(r) phi_b(s) phi_c(t) (and its derivatives) is weighted by the
+    // inverse-Vandermonde coefficient for each basis slot; the table is already in
+    // public basis order, so slot i reads column i directly. The spans were zeroed
+    // above and an empty span is skipped.
+    const bool want_values = !values_out.empty();
+    const bool want_gradients = !gradients_out.empty();
+    const bool want_hessians = !hessians_out.empty();
+
+    for (std::size_t j = 0; j < size_; ++j) {
+        const std::array<int, 3>& e = mode_exponents_[j];
+        const std::size_t ex = static_cast<std::size_t>(e[0]);
+        const std::size_t ey = static_cast<std::size_t>(e[1]);
+        const std::size_t ez = static_cast<std::size_t>(e[2]);
+
+        const double vx = tx.value[ex];
+        const double vy = ty.value[ey];
+        const double vz = tz.value[ez];
+        const double phi = vx * vy * vz;
+
+        double d_dr = double(0), d_ds = double(0), d_dt = double(0);
+        if (want_gradients || want_hessians) {
+            d_dr = tx.first[ex] * vy * vz;
+            d_ds = vx * ty.first[ey] * vz;
+            d_dt = vx * vy * tz.first[ez];
+        }
+
+        double d_drr = double(0), d_dss = double(0), d_dtt = double(0);
+        double d_drs = double(0), d_drt = double(0), d_dst = double(0);
+        if (want_hessians) {
+            d_drr = tx.second[ex] * vy * vz;
+            d_dss = vx * ty.second[ey] * vz;
+            d_dtt = vx * vy * tz.second[ez];
+            d_drs = tx.first[ex] * ty.first[ey] * vz;
+            d_drt = tx.first[ex] * vy * tz.first[ez];
+            d_dst = vx * ty.first[ey] * tz.first[ez];
+        }
+
+        for (std::size_t slot = 0; slot < size_; ++slot) {
+            const double c = inv_vandermonde_[j * size_ + slot];
+            if (want_values) {
+                values_out[slot] += c * phi;
+            }
+            if (want_gradients) {
+                Gradient& g = gradients_out[slot];
+                g[0] += c * d_dr;
+                g[1] += c * d_ds;
+                g[2] += c * d_dt;
+            }
+            if (want_hessians) {
+                Hessian& h = hessians_out[slot];
+                h(0, 0) += c * d_drr;
+                h(1, 1) += c * d_dss;
+                h(2, 2) += c * d_dtt;
+                h(0, 1) += c * d_drs;
+                h(1, 0) += c * d_drs;
+                h(0, 2) += c * d_drt;
+                h(2, 0) += c * d_drt;
+                h(1, 2) += c * d_dst;
+                h(2, 1) += c * d_dst;
+            }
+        }
+    }
 }
 
 void SerendipityBasis::evaluate_values(const math::Vector<double, 3>& xi,

From e97f49d89ee57ae5ae959d308e756d5b8df94c3e Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Thu, 25 Jun 2026 15:50:42 -0700
Subject: [PATCH 77/91] FE/Basis: clarify BasisFunction documentation

- include the basis family in the documented basis identity (topology, order, family)
- replace the order() normalization wording with a plain description and example
- add cross-reference links to the module entities in the group documentation
- add topology and family examples to the basis object contract
- refer to shape function values in the BasisFunction class description
---
 Code/Source/solver/FE/Basis/BasisFunction.h | 38 +++++++++++++--------
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index c7caf07bf..9668a8354 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -32,24 +32,26 @@
  * material model, and equation context.
  *
  * The main pieces are:
- * - BasisFunction (BasisFunction.h): the abstract query and evaluation
+ * - @ref BasisFunction (BasisFunction.h): the abstract query and evaluation
  *   contract for code that does not need to know the concrete family.
  * - @ref FE_LagrangeBasis "LagrangeBasis" and
  *   @ref FE_SerendipityBasis "SerendipityBasis": the implemented nodal
  *   families, including analytical first and second derivatives in reference
  *   coordinates.
- * - basis_factory (BasisFactory.h): runtime construction from a BasisRequest.
- *   basis_factory::default_basis_request() centralizes the family/order that
- *   matches each supported element's public node layout.
- * - ReferenceNodeLayout (NodeOrderingConventions.h): canonical reference-node
+ * - basis_factory (BasisFactory.h): runtime construction from a
+ *   @ref BasisRequest. basis_factory::default_basis_request() centralizes the
+ *   family/order that matches each supported element's public node layout.
+ * - @ref ReferenceNodeLayout (NodeOrderingConventions.h): canonical reference-node
  *   coordinates and the output ordering used by every basis evaluator.
- * - BasisTraits.h and BasisExceptions.h: topology classification,
+ * - @ref BasisTopology (BasisTraits.h) and the @ref FE_BasisExceptions
+ *   "basis exceptions" (BasisExceptions.h): topology classification,
  *   compile-time helpers, and module-specific exception types.
  *
  * ## Object and evaluation contract
  *
  * A basis object is immutable after construction. It represents one reference
- * topology, basis family, and effective polynomial order, and can be shared
+ * topology (e.g. tetrahedron, hexahedron), basis family (Lagrange or
+ * serendipity), and effective polynomial order, and can be shared
  * safely across evaluations. Construction may build node lattices or invert
  * interpolation matrices, so callers should construct through basis_factory
  * and cache one instance for each distinct basis request instead of rebuilding
@@ -154,8 +156,8 @@ void require_span_size(std::size_t actual, std::size_t expected, const char* lab
  *
  * BasisFunction defines the common query and evaluation API used by solver
  * code that does not need to know the concrete basis implementation. Derived
- * classes provide values at minimum and can override analytical gradients,
- * Hessians, combined evaluation, and span output paths. The interface
+ * classes provide shape function values at minimum and can override analytical
+ * gradients, Hessians, combined evaluation, and span output paths. The interface
  * is deliberately limited to reference-space quantities; callers own node
  * ordering translation, physical mapping, and any field-level discretization
  * policy.
@@ -174,11 +176,15 @@ class BasisFunction {
     /**
      * @brief Return the reference topology of this basis.
      *
-     * @details Together with order(), this is the authoritative identity of a
-     * basis: a topology plus a polynomial order, with no node-count assumption.
-     * Arbitrary-order bases are constructed from a BasisTopology and an order;
-     * named ElementType layouts (Hex8, Hex27, ...) are a fixed-order shorthand
-     * that maps to the same (topology, order) pair.
+     * @details Together with order() and basis_type(), this is the authoritative
+     * identity of a basis: a topology, a polynomial order, and a basis family,
+     * with no node-count assumption. The family is part of the identity because
+     * the same topology and order can denote different bases -- a hexahedron at
+     * order 2 is the Hex20 serendipity space or the Hex27 Lagrange space
+     * depending on basis_type(). Arbitrary-order bases are constructed from a
+     * BasisTopology and an order; named ElementType layouts (Hex8, Hex27, ...)
+     * are a fixed-order shorthand that maps to the same (topology, order, family)
+     * triple.
      *
      * @return Reference topology.
      */
@@ -192,7 +198,9 @@ class BasisFunction {
 
     /**
      * @brief Return the polynomial order represented by this basis.
-     * @return Effective polynomial order after any element-family normalization.
+     * @return Polynomial order of the basis. A named element layout reports the
+     *         order implied by that layout (Quad8 and Hex20 report 2, Hex8
+     *         reports 1), not its node count.
      */
     virtual int order() const noexcept = 0;
 

From 0e34aa294b3ba80a82603b5a2ce396fddd9b4c6e Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Thu, 25 Jun 2026 15:52:22 -0700
Subject: [PATCH 78/91] FE/Basis: document when each basis exception type is
 raised

Add an explanatory sentence and a concrete example to each derived basis
exception, drawn from its real throw sites, so the specific types guide
developers on when to raise or expect them.
---
 Code/Source/solver/FE/Basis/BasisExceptions.h | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/Code/Source/solver/FE/Basis/BasisExceptions.h b/Code/Source/solver/FE/Basis/BasisExceptions.h
index dd642e16b..e32a60c66 100644
--- a/Code/Source/solver/FE/Basis/BasisExceptions.h
+++ b/Code/Source/solver/FE/Basis/BasisExceptions.h
@@ -44,6 +44,13 @@ class BasisException : public FEException {
 
 /**
  * @brief Invalid Basis request or configuration
+ *
+ * @details Raised when a request is malformed before any geometry is built: a
+ * missing or negative polynomial order, a named element layout paired with an
+ * explicit order that does not match its fixed order, or a field type or
+ * continuity the scalar Lagrange/Serendipity factory does not support. Example:
+ * constructing a LagrangeBasis for Tetra10 at order 1, when that layout is fixed
+ * at order 2.
  */
 class BasisConfigurationException : public BasisException {
 public:
@@ -56,6 +63,11 @@ class BasisConfigurationException : public BasisException {
 
 /**
  * @brief Requested element topology is incompatible with the basis family
+ *
+ * @details Raised when the family cannot represent the requested topology or
+ * named layout. Example: requesting wedge serendipity through the arbitrary-order
+ * topology path (only the named Wedge15 layout is supported), or requesting a
+ * basis on ElementType::Unknown.
  */
 class BasisElementCompatibilityException : public BasisException {
 public:
@@ -68,6 +80,10 @@ class BasisElementCompatibilityException : public BasisException {
 
 /**
  * @brief Basis evaluation request cannot be satisfied
+ *
+ * @details Raised at evaluation time rather than construction time. Example: an
+ * output span smaller than size(), or requesting analytical gradients or Hessians
+ * from a basis that does not provide them.
  */
 class BasisEvaluationException : public BasisException {
 public:
@@ -80,6 +96,10 @@ class BasisEvaluationException : public BasisException {
 
 /**
  * @brief Public-to-canonical node ordering or coordinate lookup failure
+ *
+ * @details Raised when a node index or coordinate lookup falls outside the
+ * reference layout. Example: requesting a tensor-axis node index outside
+ * [0, order] from line_coord_pm_one.
  */
 class BasisNodeOrderingException : public BasisException {
 public:
@@ -92,6 +112,10 @@ class BasisNodeOrderingException : public BasisException {
 
 /**
  * @brief Internal basis construction or transform setup failure
+ *
+ * @details Signals a violated internal invariant during setup (StatusCode::
+ * InternalError) rather than bad user input. Example: a generated Lagrange node
+ * lattice whose index components fall outside [0, order] in get_lagrange_lattice.
  */
 class BasisConstructionException : public BasisException {
 public:

From 512043a20df7c193e61b4be7e6e7e29cd9f72e4c Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Thu, 25 Jun 2026 15:52:59 -0700
Subject: [PATCH 79/91] FE/Basis: state the GLL/equispaced node distinction
 without implementation history

Name the orders and elements that use equispaced nodes directly, instead of
referring to what previous element layouts kept, so the note stays accurate as
the implementation evolves.
---
 Code/Source/solver/FE/Basis/LagrangeBasis.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.h b/Code/Source/solver/FE/Basis/LagrangeBasis.h
index 487af5044..8334613e7 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.h
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.h
@@ -55,8 +55,9 @@ namespace basis {
  *   GLL through-axis.
  *
  * Because GLL coincides with the equispaced layout at orders 1 and 2
- * (line_coord_pm_one), every named production element keeps its standard
- * coordinates and the GLL/equispaced distinction appears only for order >= 3.
+ * (line_coord_pm_one), the linear and quadratic tensor elements -- Line2/Line3,
+ * Quad4/Quad9, Hex8/Hex27, and the wedge through-axis -- are built on equispaced
+ * nodes, and the GLL/equispaced distinction appears only for order >= 3.
  *
  * ## Evaluation
  *

From ecc15fac24346b066dcb9f44ca55298d87352a78 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Thu, 25 Jun 2026 16:02:59 -0700
Subject: [PATCH 80/91] FE/Basis: return unique_ptr from basis_factory create
 functions

create() and create_default_for() now return std::unique_ptr<BasisFunction>,
which has simpler ownership semantics and is cheaper than shared_ptr. The single
caching consumer keeps a shared_ptr cache; the returned unique_ptr converts to it
on insertion. All FE/Basis unit tests pass.
---
 Code/Source/solver/FE/Basis/BasisFactory.cpp | 16 ++++++++--------
 Code/Source/solver/FE/Basis/BasisFactory.h   | 10 ++++++----
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisFactory.cpp b/Code/Source/solver/FE/Basis/BasisFactory.cpp
index dddf549d9..79841ae13 100644
--- a/Code/Source/solver/FE/Basis/BasisFactory.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFactory.cpp
@@ -49,35 +49,35 @@ void require_scalar_c0_request(const BasisRequest& req) {
         "BasisFactory: Lagrange/Serendipity bases support C0 continuity only");
 }
 
-std::shared_ptr<BasisFunction> create_lagrange(const BasisRequest& req) {
+std::unique_ptr<BasisFunction> create_lagrange(const BasisRequest& req) {
     require_scalar_c0_request(req);
     const int order = require_basis_order(
         req,
         "BasisFactory: Lagrange creation requires an explicit order",
         "BasisFactory: Lagrange requires non-negative order");
     if (require_single_request_target(req) == RequestTarget::Topology) {
-        return std::make_shared<LagrangeBasis>(req.topology, order);
+        return std::make_unique<LagrangeBasis>(req.topology, order);
     }
-    return std::make_shared<LagrangeBasis>(req.element_type, order);
+    return std::make_unique<LagrangeBasis>(req.element_type, order);
 }
 
-std::shared_ptr<BasisFunction> create_serendipity(const BasisRequest& req) {
+std::unique_ptr<BasisFunction> create_serendipity(const BasisRequest& req) {
     require_scalar_c0_request(req);
     const int order = require_basis_order(
         req,
         "BasisFactory: Serendipity creation requires an explicit order",
         "BasisFactory: Serendipity requires non-negative order");
     if (require_single_request_target(req) == RequestTarget::Topology) {
-        return std::make_shared<SerendipityBasis>(req.topology, order);
+        return std::make_unique<SerendipityBasis>(req.topology, order);
     }
-    return std::make_shared<SerendipityBasis>(req.element_type, order);
+    return std::make_unique<SerendipityBasis>(req.element_type, order);
 }
 
 } // namespace
 
 namespace basis_factory {
 
-std::shared_ptr<BasisFunction> create(const BasisRequest& req) {
+std::unique_ptr<BasisFunction> create(const BasisRequest& req) {
     switch (req.basis_type) {
         case BasisType::Lagrange:
             return create_lagrange(req);
@@ -110,7 +110,7 @@ BasisRequest default_basis_request(ElementType element_type) {
     }
 }
 
-std::shared_ptr<BasisFunction> create_default_for(ElementType element_type) {
+std::unique_ptr<BasisFunction> create_default_for(ElementType element_type) {
     return create(default_basis_request(element_type));
 }
 
diff --git a/Code/Source/solver/FE/Basis/BasisFactory.h b/Code/Source/solver/FE/Basis/BasisFactory.h
index 5ce99682e..a49365249 100644
--- a/Code/Source/solver/FE/Basis/BasisFactory.h
+++ b/Code/Source/solver/FE/Basis/BasisFactory.h
@@ -62,9 +62,10 @@ namespace basis_factory {
  * arbitrary-order path.
  *
  * @param req Basis family, target, and order request.
- * @return Shared basis instance.
+ * @return Unique basis instance. Move it into a std::shared_ptr at the call site
+ *         if shared ownership is needed.
  */
-[[nodiscard]] std::shared_ptr<BasisFunction> create(const BasisRequest& req);
+[[nodiscard]] std::unique_ptr<BasisFunction> create(const BasisRequest& req);
 
 /**
  * @brief Return the default basis request (family and order) for an element type.
@@ -92,9 +93,10 @@ namespace basis_factory {
  * @details Equivalent to create(default_basis_request(element_type)).
  *
  * @param element_type Element type to create a default basis for.
- * @return Shared basis instance.
+ * @return Unique basis instance. Move it into a std::shared_ptr at the call site
+ *         if shared ownership is needed.
  */
-[[nodiscard]] std::shared_ptr<BasisFunction> create_default_for(ElementType element_type);
+[[nodiscard]] std::unique_ptr<BasisFunction> create_default_for(ElementType element_type);
 
 } // namespace basis_factory
 

From 6cb363fcfb6fe0b0148a5ea4f5c770b1493016e2 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Thu, 25 Jun 2026 16:18:58 -0700
Subject: [PATCH 81/91] FE/Basis: make span evaluators the override point and
 share vector overloads in the base

The span *_to evaluators are now the primitives a concrete basis implements:
evaluate_values_to is pure virtual, evaluate_gradients_to/evaluate_hessians_to
default to reporting not-implemented, and a protected evaluate_all_to provides the
single-pass combined evaluation. The std::vector overloads (evaluate_values,
evaluate_gradients, evaluate_hessians, evaluate_all) are implemented once on the
base class, so LagrangeBasis and SerendipityBasis no longer duplicate those
wrappers.

This inverts the previous arrangement where the vector form was the primitive and
the base supplied an allocate-and-copy span fallback. A minimal basis now
overrides the span primitive instead of the vector form. The BasisFunction test
helpers are updated accordingly, and the former fallback test is rewritten to
verify the base vector overloads forward to the span primitives. All FE/Basis
unit tests pass.
---
 Code/Source/solver/FE/Basis/BasisFunction.cpp |  80 ++++++++-----
 Code/Source/solver/FE/Basis/BasisFunction.h   | 108 ++++++++++++-----
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp |  31 -----
 Code/Source/solver/FE/Basis/LagrangeBasis.h   |  69 +----------
 .../solver/FE/Basis/SerendipityBasis.cpp      |  31 -----
 .../Source/solver/FE/Basis/SerendipityBasis.h |  61 +---------
 .../FE/Basis/test_BasisErrorPaths.cpp         | 113 ++++++++++--------
 7 files changed, 194 insertions(+), 299 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisFunction.cpp b/Code/Source/solver/FE/Basis/BasisFunction.cpp
index 4849fabda..9939fbfa6 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFunction.cpp
@@ -24,57 +24,75 @@ const std::vector<math::Vector<double, 3>>& BasisFunction::nodes() const noexcep
     return kNoNodes;
 }
 
+// Vector-output overloads: size the container and forward to the matching span
+// primitive. Defined once here so concrete families implement only the span
+// primitives below.
+void BasisFunction::evaluate_values(const math::Vector<double, 3>& xi,
+                                    std::vector<double>& values) const {
+    values.resize(size());
+    evaluate_values_to(xi, std::span<double>(values.data(), values.size()));
+}
+
 void BasisFunction::evaluate_gradients(const math::Vector<double, 3>& xi,
                                        std::vector<Gradient>& gradients) const {
-    (void)xi;
-    (void)gradients;
-    svmp::raise<BasisEvaluationException>(SVMP_HERE,
-        "Analytic gradient evaluation is not implemented for this basis");
+    gradients.resize(size());
+    evaluate_gradients_to(xi, std::span<Gradient>(gradients.data(), gradients.size()));
 }
 
 void BasisFunction::evaluate_hessians(const math::Vector<double, 3>& xi,
                                       std::vector<Hessian>& hessians) const {
-    (void)xi;
-    (void)hessians;
-    svmp::raise<BasisEvaluationException>(SVMP_HERE,
-        "Analytic Hessian evaluation is not implemented for this basis");
+    hessians.resize(size());
+    evaluate_hessians_to(xi, std::span<Hessian>(hessians.data(), hessians.size()));
 }
 
 void BasisFunction::evaluate_all(const math::Vector<double, 3>& xi,
                                  std::vector<double>& values,
                                  std::vector<Gradient>& gradients,
                                  std::vector<Hessian>& hessians) const {
-    evaluate_values(xi, values);
-    evaluate_gradients(xi, gradients);
-    evaluate_hessians(xi, hessians);
-}
-
-// The base-class *_to overloads are a correct fallback for bases that implement
-// only the vector evaluators: they evaluate into a temporary and copy into the
-// caller's span. The concrete nodal families (LagrangeBasis, SerendipityBasis)
-// override these to compute directly into the span without the temporary.
-void BasisFunction::evaluate_values_to(const math::Vector<double, 3>& xi,
-                                       std::span<double> values_out) const {
-    require_span_size(values_out.size(), size(), "BasisFunction::evaluate_values_to");
-    std::vector<double> tmp(size());
-    evaluate_values(xi, tmp);
-    std::copy_n(tmp.begin(), tmp.size(), values_out.begin());
+    values.resize(size());
+    gradients.resize(size());
+    hessians.resize(size());
+    evaluate_all_to(xi,
+                    std::span<double>(values.data(), values.size()),
+                    std::span<Gradient>(gradients.data(), gradients.size()),
+                    std::span<Hessian>(hessians.data(), hessians.size()));
 }
 
+// The gradient/Hessian span primitives default to reporting "not implemented"; a
+// family supplies analytical derivatives by overriding them. evaluate_values_to
+// has no base definition: every basis must provide values.
 void BasisFunction::evaluate_gradients_to(const math::Vector<double, 3>& xi,
                                           std::span<Gradient> gradients_out) const {
-    require_span_size(gradients_out.size(), size(), "BasisFunction::evaluate_gradients_to");
-    std::vector<Gradient> tmp(size());
-    evaluate_gradients(xi, tmp);
-    std::copy_n(tmp.begin(), tmp.size(), gradients_out.begin());
+    (void)xi;
+    (void)gradients_out;
+    svmp::raise<BasisEvaluationException>(SVMP_HERE,
+        "Analytic gradient evaluation is not implemented for this basis");
 }
 
 void BasisFunction::evaluate_hessians_to(const math::Vector<double, 3>& xi,
                                          std::span<Hessian> hessians_out) const {
-    require_span_size(hessians_out.size(), size(), "BasisFunction::evaluate_hessians_to");
-    std::vector<Hessian> tmp(size());
-    evaluate_hessians(xi, tmp);
-    std::copy_n(tmp.begin(), tmp.size(), hessians_out.begin());
+    (void)xi;
+    (void)hessians_out;
+    svmp::raise<BasisEvaluationException>(SVMP_HERE,
+        "Analytic Hessian evaluation is not implemented for this basis");
+}
+
+// Combined evaluator default: forward each requested (non-empty) quantity to its
+// single-quantity span primitive. Families override this to share per-point setup
+// across the requested quantities.
+void BasisFunction::evaluate_all_to(const math::Vector<double, 3>& xi,
+                                    std::span<double> values_out,
+                                    std::span<Gradient> gradients_out,
+                                    std::span<Hessian> hessians_out) const {
+    if (!values_out.empty()) {
+        evaluate_values_to(xi, values_out);
+    }
+    if (!gradients_out.empty()) {
+        evaluate_gradients_to(xi, gradients_out);
+    }
+    if (!hessians_out.empty()) {
+        evaluate_hessians_to(xi, hessians_out);
+    }
 }
 
 void BasisFunction::numerical_gradient(const math::Vector<double, 3>& xi,
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index 9668a8354..c673cb2e0 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -61,14 +61,13 @@
  * lower-dimensional elements, only the first dimension() components are
  * active. Returned gradients always have three components and Hessians are
  * always 3-by-3 matrices; inactive reference directions are expected to be
- * zero for conforming lower-dimensional bases. The std::vector overloads are
- * convenient for setup, tests, and adapter code. The *_to overloads write to
- * caller-owned spans; the concrete nodal families (LagrangeBasis,
- * SerendipityBasis) compute directly into the span and so provide the
- * allocation-free path for assembly. The base-class defaults instead evaluate
- * into a temporary and copy into the span, so a basis that implements only the
- * vector form still works through the span API, just without the allocation
- * savings.
+ * zero for conforming lower-dimensional bases. The *_to overloads write to
+ * caller-owned spans and are the override points a concrete family implements:
+ * the nodal families (LagrangeBasis, SerendipityBasis) compute directly into the
+ * span, so this is the allocation-free path for assembly. The std::vector
+ * overloads are convenient for setup, tests, and adapter code; they are defined
+ * once on the base class, which sizes the output and forwards to the matching
+ * span overload.
  *
  * Outputs are in ReferenceNodeLayout basis order, not necessarily the mesh or
  * solver's native node order. A caller that stores elements in another local
@@ -155,11 +154,12 @@ void require_span_size(std::size_t actual, std::size_t expected, const char* lab
  * @ingroup FE_Basis
  *
  * BasisFunction defines the common query and evaluation API used by solver
- * code that does not need to know the concrete basis implementation. Derived
- * classes provide shape function values at minimum and can override analytical
- * gradients, Hessians, combined evaluation, and span output paths. The interface
- * is deliberately limited to reference-space quantities; callers own node
- * ordering translation, physical mapping, and any field-level discretization
+ * code that does not need to know the concrete basis implementation. Concrete
+ * families implement the span output primitives -- shape function values at
+ * minimum, and optionally analytical gradients and Hessians; the vector
+ * overloads and the combined evaluator are provided once by the base class. The
+ * interface is deliberately limited to reference-space quantities; callers own
+ * node ordering translation, physical mapping, and any field-level discretization
  * policy.
  */
 class BasisFunction {
@@ -226,73 +226,121 @@ class BasisFunction {
 
     /**
      * @brief Evaluate basis function values at a reference coordinate.
+     *
+     * @details Convenience overload: it sizes \p values to size() and forwards to
+     * evaluate_values_to(). It is implemented once on the base class, so concrete
+     * families override the span primitive rather than this overload.
+     *
      * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
      * @param values Receives one value per basis function.
      */
-    virtual void evaluate_values(const math::Vector<double, 3>& xi,
-                                 std::vector<double>& values) const = 0;
+    void evaluate_values(const math::Vector<double, 3>& xi,
+                         std::vector<double>& values) const;
 
     /**
      * @brief Evaluate basis gradients at a reference coordinate.
+     *
+     * @details Convenience overload over evaluate_gradients_to(); see
+     * evaluate_values() for the sizing and forwarding contract.
+     *
      * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
      * @param gradients Receives one three-component gradient per basis function.
      * @throws BasisEvaluationException If gradients are not available for the basis.
      */
-    virtual void evaluate_gradients(const math::Vector<double, 3>& xi,
-                                    std::vector<Gradient>& gradients) const;
+    void evaluate_gradients(const math::Vector<double, 3>& xi,
+                            std::vector<Gradient>& gradients) const;
 
     /**
      * @brief Evaluate basis Hessians at a reference coordinate.
+     *
+     * @details Convenience overload over evaluate_hessians_to(); see
+     * evaluate_values() for the sizing and forwarding contract.
+     *
      * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
      * @param hessians Receives one 3-by-3 Hessian per basis function.
      * @throws BasisEvaluationException If Hessians are not available for the basis.
      */
-    virtual void evaluate_hessians(const math::Vector<double, 3>& xi,
-                                   std::vector<Hessian>& hessians) const;
+    void evaluate_hessians(const math::Vector<double, 3>& xi,
+                           std::vector<Hessian>& hessians) const;
 
     /**
      * @brief Evaluate values, gradients, and Hessians together.
+     *
+     * @details Convenience overload over evaluate_all_to(): it sizes all three
+     * containers to size() and forwards them in a single pass.
+     *
      * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
      * @param values Receives one value per basis function.
      * @param gradients Receives one three-component gradient per basis function.
      * @param hessians Receives one 3-by-3 Hessian per basis function.
      */
-    virtual void evaluate_all(const math::Vector<double, 3>& xi,
-                              std::vector<double>& values,
-                              std::vector<Gradient>& gradients,
-                              std::vector<Hessian>& hessians) const;
+    void evaluate_all(const math::Vector<double, 3>& xi,
+                      std::vector<double>& values,
+                      std::vector<Gradient>& gradients,
+                      std::vector<Hessian>& hessians) const;
 
     /**
      * @brief Evaluate basis values into caller-provided storage.
+     *
+     * @details This span primitive is the single required override for a concrete
+     * basis: the vector overloads above and the combined evaluate_all_to() are all
+     * defined in terms of it, so a minimal basis implements only this method.
+     *
      * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
      * @param values_out Output span with at least size() entries.
-     * @note The base-class default evaluates into a temporary and copies; nodal
-     *       families override this to write directly into the span.
      */
     virtual void evaluate_values_to(const math::Vector<double, 3>& xi,
-                                    std::span<double> values_out) const;
+                                    std::span<double> values_out) const = 0;
 
     /**
      * @brief Evaluate basis gradients into caller-provided storage.
+     *
+     * @details Override to supply analytical gradients. The base implementation
+     * throws, so a family that provides no gradients reports it uniformly through
+     * every gradient entry point.
+     *
      * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
      * @param gradients_out Output span with at least size() entries.
-     * @note The base-class default evaluates into a temporary and copies; nodal
-     *       families override this to write directly into the span.
+     * @throws BasisEvaluationException If gradients are not available for the basis.
      */
     virtual void evaluate_gradients_to(const math::Vector<double, 3>& xi,
                                        std::span<Gradient> gradients_out) const;
 
     /**
      * @brief Evaluate basis Hessians into caller-provided storage.
+     *
+     * @details Override to supply analytical Hessians. The base implementation
+     * throws, so a family that provides no Hessians reports it uniformly through
+     * every Hessian entry point.
+     *
      * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
      * @param hessians_out Output span with at least size() entries.
-     * @note The base-class default evaluates into a temporary and copies; nodal
-     *       families override this to write directly into the span.
+     * @throws BasisEvaluationException If Hessians are not available for the basis.
      */
     virtual void evaluate_hessians_to(const math::Vector<double, 3>& xi,
                                       std::span<Hessian> hessians_out) const;
 
 protected:
+    /**
+     * @brief Evaluate any non-empty subset of values, gradients, and Hessians
+     * into caller-provided storage in a single pass.
+     *
+     * @details An empty span selects "skip that quantity". The base
+     * implementation forwards each requested quantity to its single-quantity span
+     * primitive; families that can share per-point setup override this to compute
+     * the requested quantities together. It backs the public evaluate_all()
+     * overload.
+     *
+     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+     * @param values_out Values output span, or empty to skip.
+     * @param gradients_out Gradients output span, or empty to skip.
+     * @param hessians_out Hessians output span, or empty to skip.
+     */
+    virtual void evaluate_all_to(const math::Vector<double, 3>& xi,
+                                 std::span<double> values_out,
+                                 std::span<Gradient> gradients_out,
+                                 std::span<Hessian> hessians_out) const;
+
     /**
      * @brief Approximate gradients by centered finite differences of values.
      *
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index dbb29bff3..83c27eb90 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -589,37 +589,6 @@ void LagrangeBasis::evaluate_all_to(const Vec3& xi,
         "Unsupported element in LagrangeBasis evaluation");
 }
 
-void LagrangeBasis::evaluate_values(const Vec3& xi,
-                                    std::vector<double>& values) const {
-    values.resize(size());
-    evaluate_values_to(xi, std::span<double>(values.data(), values.size()));
-}
-
-void LagrangeBasis::evaluate_gradients(const Vec3& xi,
-                                       std::vector<Gradient>& gradients) const {
-    gradients.resize(size());
-    evaluate_gradients_to(xi, std::span<Gradient>(gradients.data(), gradients.size()));
-}
-
-void LagrangeBasis::evaluate_hessians(const Vec3& xi,
-                                      std::vector<Hessian>& hessians) const {
-    hessians.resize(size());
-    evaluate_hessians_to(xi, std::span<Hessian>(hessians.data(), hessians.size()));
-}
-
-void LagrangeBasis::evaluate_all(const Vec3& xi,
-                                 std::vector<double>& values,
-                                 std::vector<Gradient>& gradients,
-                                 std::vector<Hessian>& hessians) const {
-    values.resize(size());
-    gradients.resize(size());
-    hessians.resize(size());
-    evaluate_all_to(xi,
-                    std::span<double>(values.data(), values.size()),
-                    std::span<Gradient>(gradients.data(), gradients.size()),
-                    std::span<Hessian>(hessians.data(), hessians.size()));
-}
-
 void LagrangeBasis::evaluate_values_to(const Vec3& xi,
                                        std::span<double> values_out) const {
     require_span_size(values_out.size(), size(), "LagrangeBasis::evaluate_values_to");
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.h b/Code/Source/solver/FE/Basis/LagrangeBasis.h
index 8334613e7..001960675 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.h
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.h
@@ -197,73 +197,6 @@ class LagrangeBasis final : public BasisFunction {
      */
     const std::vector<math::Vector<double, 3>>& nodes() const noexcept final { return nodes_; }
 
-    /**
-     * @brief Evaluate Lagrange basis function values at a reference coordinate.
-     *
-     * @details Values satisfy the nodal interpolation property
-     * @f$N_i(x_j)=\delta_{ij}@f$ at the basis nodes. Tensor-product values are
-     * products of one-dimensional Lagrange polynomials. Simplex values are
-     * products of barycentric falling-factorial factors. Wedge values are
-     * products of triangle simplex values and through-axis Lagrange values.
-     *
-     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-     * @param values Receives one value per basis function.
-     */
-    void evaluate_values(const math::Vector<double, 3>& xi,
-                         std::vector<double>& values) const final;
-
-    /**
-     * @brief Evaluate analytical Lagrange basis gradients at a reference coordinate.
-     *
-     * @details Gradients are derivatives with respect to reference
-     * coordinates, not physical coordinates. Tensor-product gradients apply
-     * the product rule to the active axis polynomials. Simplex gradients
-     * differentiate the barycentric factors and multiply by the constant
-     * gradients of the barycentric coordinates. Wedge gradients combine the
-     * triangle gradient in the first two components with the through-axis
-     * derivative in the third component.
-     *
-     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-     * @param gradients Receives one three-component gradient per basis function.
-     */
-    void evaluate_gradients(const math::Vector<double, 3>& xi,
-                            std::vector<Gradient>& gradients) const final;
-
-    /**
-     * @brief Evaluate analytical Lagrange basis Hessians at a reference coordinate.
-     *
-     * @details Hessians are second derivatives in reference coordinates and
-     * are stored as 3-by-3 matrices. Tensor-product Hessians contain pure
-     * second axis derivatives on the diagonal and mixed product-rule terms
-     * off diagonal. Simplex Hessians are assembled from first and second
-     * derivatives of the barycentric factors. Wedge Hessians contain triangle
-     * Hessian terms, through-axis second derivatives, and mixed
-     * triangle/through-axis derivative products.
-     *
-     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-     * @param hessians Receives one 3-by-3 Hessian per basis function.
-     */
-    void evaluate_hessians(const math::Vector<double, 3>& xi,
-                           std::vector<Hessian>& hessians) const final;
-
-    /**
-     * @brief Evaluate Lagrange values, gradients, and Hessians together.
-     *
-     * @details This is the allocation-friendly vector API for callers that
-     * need all basis quantities at the same quadrature point. The underlying
-     * evaluator computes only topology-local polynomial data once and then
-     * fills all requested outputs.
-     *
-     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-     * @param values Receives one value per basis function.
-     * @param gradients Receives one three-component gradient per basis function.
-     * @param hessians Receives one 3-by-3 Hessian per basis function.
-     */
-    void evaluate_all(const math::Vector<double, 3>& xi,
-                      std::vector<double>& values,
-                      std::vector<Gradient>& gradients,
-                      std::vector<Hessian>& hessians) const final;
-
     /**
      * @brief Evaluate Lagrange basis values into caller-provided storage.
      *
@@ -332,7 +265,7 @@ class LagrangeBasis final : public BasisFunction {
     void evaluate_all_to(const math::Vector<double, 3>& xi,
                          std::span<double> values_out,
                          std::span<Gradient> gradients_out,
-                         std::span<Hessian> hessians_out) const;
+                         std::span<Hessian> hessians_out) const override;
     void evaluate_point_to(std::span<double> values_out,
                            std::span<Gradient> gradients_out,
                            std::span<Hessian> hessians_out) const;
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index e47718320..0feac25d3 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -537,37 +537,6 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<double, 3>& xi,
     }
 }
 
-void SerendipityBasis::evaluate_values(const math::Vector<double, 3>& xi,
-                                       std::vector<double>& values) const {
-    values.resize(size_);
-    evaluate_values_to(xi, std::span<double>(values.data(), values.size()));
-}
-
-void SerendipityBasis::evaluate_gradients(const math::Vector<double, 3>& xi,
-                                          std::vector<Gradient>& gradients) const {
-    gradients.resize(size_);
-    evaluate_gradients_to(xi, std::span<Gradient>(gradients.data(), gradients.size()));
-}
-
-void SerendipityBasis::evaluate_hessians(const math::Vector<double, 3>& xi,
-                                         std::vector<Hessian>& hessians) const {
-    hessians.resize(size_);
-    evaluate_hessians_to(xi, std::span<Hessian>(hessians.data(), hessians.size()));
-}
-
-void SerendipityBasis::evaluate_all(const math::Vector<double, 3>& xi,
-                                    std::vector<double>& values,
-                                    std::vector<Gradient>& gradients,
-                                    std::vector<Hessian>& hessians) const {
-    values.resize(size_);
-    gradients.resize(size_);
-    hessians.resize(size_);
-    evaluate_all_to(xi,
-                    std::span<double>(values.data(), values.size()),
-                    std::span<Gradient>(gradients.data(), gradients.size()),
-                    std::span<Hessian>(hessians.data(), hessians.size()));
-}
-
 void SerendipityBasis::evaluate_values_to(const math::Vector<double, 3>& xi,
                                           std::span<double> values_out) const {
     require_span_size(values_out.size(), size_, "SerendipityBasis::evaluate_values_to");
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.h b/Code/Source/solver/FE/Basis/SerendipityBasis.h
index 2efb4e788..edef4cf6c 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.h
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.h
@@ -230,65 +230,6 @@ class SerendipityBasis final : public BasisFunction {
      */
     const std::vector<math::Vector<double, 3>>& nodes() const noexcept final { return nodes_; }
 
-    /**
-     * @brief Evaluate serendipity basis function values at a reference coordinate.
-     *
-     * @details Every family evaluates the serendipity modal vector and multiplies
-     * by the generated inverse Vandermonde matrix to obtain nodal shape-function
-     * values. Quadrilateral and hexahedral bases use tensor Legendre modes; the
-     * fixed Wedge15 layout uses monomial modes. The coefficient table is already
-     * in public basis order, so no output reordering is needed.
-     *
-     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-     * @param values Receives one value per basis function.
-     */
-    void evaluate_values(const math::Vector<double, 3>& xi,
-                         std::vector<double>& values) const final;
-
-    /**
-     * @brief Evaluate analytical serendipity basis gradients at a reference coordinate.
-     *
-     * @details Gradients are derivatives with respect to reference coordinates.
-     * Every family differentiates the same modal vector used for values and
-     * applies the generated inverse Vandermonde coefficients.
-     *
-     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-     * @param gradients Receives one three-component gradient per basis function.
-     */
-    void evaluate_gradients(const math::Vector<double, 3>& xi,
-                            std::vector<Gradient>& gradients) const final;
-
-    /**
-     * @brief Evaluate analytical serendipity basis Hessians at a reference coordinate.
-     *
-     * @details Hessians are second derivatives in reference coordinates and are
-     * stored as 3-by-3 matrices. Every family uses the second derivatives of the
-     * same modal vector used for values together with the generated inverse
-     * Vandermonde coefficients.
-     *
-     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-     * @param hessians Receives one 3-by-3 Hessian per basis function.
-     */
-    void evaluate_hessians(const math::Vector<double, 3>& xi,
-                           std::vector<Hessian>& hessians) const final;
-
-    /**
-     * @brief Evaluate serendipity values, gradients, and Hessians together.
-     *
-     * @details This vector API is backed by the same span-based evaluator as
-     * the assembly-oriented `*_to` methods, so topology-specific polynomial
-     * setup can be shared for a quadrature point.
-     *
-     * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-     * @param values Receives one value per basis function.
-     * @param gradients Receives one three-component gradient per basis function.
-     * @param hessians Receives one 3-by-3 Hessian per basis function.
-     */
-    void evaluate_all(const math::Vector<double, 3>& xi,
-                      std::vector<double>& values,
-                      std::vector<Gradient>& gradients,
-                      std::vector<Hessian>& hessians) const final;
-
     /**
      * @brief Evaluate serendipity basis values into caller-provided storage.
      * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
@@ -343,7 +284,7 @@ class SerendipityBasis final : public BasisFunction {
     void evaluate_all_to(const math::Vector<double, 3>& xi,
                          std::span<double> values_out,
                          std::span<Gradient> gradients_out,
-                         std::span<Hessian> hessians_out) const;
+                         std::span<Hessian> hessians_out) const override;
 };
 
 /** @} */
diff --git a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
index 0c1f8d6a0..5df40ddcc 100644
--- a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
@@ -12,6 +12,8 @@
 #include "FE/Basis/NodeOrderingConventions.h"
 #include "FE/Basis/SerendipityBasis.h"
 
+#include <algorithm>
+#include <span>
 #include <string>
 #include <vector>
 
@@ -46,10 +48,10 @@ class MinimalScalarBasis : public BasisFunction {
     int order() const noexcept override { return 1; }
     std::size_t size() const noexcept override { return 2u; }
 
-    void evaluate_values(const math::Vector<double, 3>&,
-                         std::vector<double>& values) const override
+    void evaluate_values_to(const math::Vector<double, 3>&,
+                            std::span<double> values_out) const override
     {
-        values.assign(size(), double(0));
+        std::fill(values_out.begin(), values_out.end(), double(0));
     }
 };
 
@@ -68,34 +70,34 @@ class ExactQuadraticBasis : public BasisFunction {
     int order() const noexcept override { return 2; }
     std::size_t size() const noexcept override { return 2u; }
 
-    void evaluate_values(const math::Vector<double, 3>& xi,
-                         std::vector<double>& values) const override
+    void evaluate_values_to(const math::Vector<double, 3>& xi,
+                            std::span<double> values_out) const override
     {
         const double x = xi[0];
         const double y = xi[1];
         const double z = xi[2];
-        values.resize(size());
-        values[0] = double(1) + double(2) * x - y + double(0.5) * z +
-                    x * x + double(0.75) * y * y - double(0.25) * z * z +
-                    double(0.2) * x * y - double(0.3) * x * z + double(0.4) * y * z;
-        values[1] = double(3) - x + double(2) * y + z +
-                    double(0.5) * x * x - y * y + z * z +
-                    x * y + x * z - y * z;
+        values_out[0] = double(1) + double(2) * x - y + double(0.5) * z +
+                        x * x + double(0.75) * y * y - double(0.25) * z * z +
+                        double(0.2) * x * y - double(0.3) * x * z + double(0.4) * y * z;
+        values_out[1] = double(3) - x + double(2) * y + z +
+                        double(0.5) * x * x - y * y + z * z +
+                        x * y + x * z - y * z;
     }
 
-    void evaluate_gradients(const math::Vector<double, 3>& xi,
-                            std::vector<Gradient>& gradients) const override
+    void evaluate_gradients_to(const math::Vector<double, 3>& xi,
+                               std::span<Gradient> gradients_out) const override
     {
         const double x = xi[0];
         const double y = xi[1];
         const double z = xi[2];
-        gradients.assign(size(), Gradient::Zero());
-        gradients[0][0] = double(2) + double(2) * x + double(0.2) * y - double(0.3) * z;
-        gradients[0][1] = double(-1) + double(1.5) * y + double(0.2) * x + double(0.4) * z;
-        gradients[0][2] = double(0.5) - double(0.5) * z - double(0.3) * x + double(0.4) * y;
-        gradients[1][0] = double(-1) + x + y + z;
-        gradients[1][1] = double(2) - double(2) * y + x - z;
-        gradients[1][2] = double(1) + double(2) * z + x - y;
+        gradients_out[0] = Gradient::Zero();
+        gradients_out[1] = Gradient::Zero();
+        gradients_out[0][0] = double(2) + double(2) * x + double(0.2) * y - double(0.3) * z;
+        gradients_out[0][1] = double(-1) + double(1.5) * y + double(0.2) * x + double(0.4) * z;
+        gradients_out[0][2] = double(0.5) - double(0.5) * z - double(0.3) * x + double(0.4) * y;
+        gradients_out[1][0] = double(-1) + x + y + z;
+        gradients_out[1][1] = double(2) - double(2) * y + x - z;
+        gradients_out[1][2] = double(1) + double(2) * z + x - y;
     }
 
     void exact_hessians(std::vector<Hessian>& hessians) const
@@ -108,7 +110,11 @@ class ExactQuadraticBasis : public BasisFunction {
     }
 };
 
-class CompleteFallbackBasis : public BasisFunction {
+// Basis that implements only the span primitives and deliberately does not
+// override the combined evaluate_all_to. It therefore exercises the base class's
+// vector overloads and the default combined evaluator, both of which must forward
+// to these primitives.
+class SpanPrimitiveBasis : public BasisFunction {
 public:
     BasisType basis_type() const noexcept override { return BasisType::Lagrange; }
     BasisTopology topology() const noexcept override { return BasisTopology::Triangle; }
@@ -116,32 +122,32 @@ class CompleteFallbackBasis : public BasisFunction {
     int order() const noexcept override { return 1; }
     std::size_t size() const noexcept override { return 2u; }
 
-    void evaluate_values(const math::Vector<double, 3>& xi,
-                         std::vector<double>& values) const override
+    void evaluate_values_to(const math::Vector<double, 3>& xi,
+                            std::span<double> values_out) const override
     {
-        values.resize(size());
-        values[0] = double(1) + xi[0];
-        values[1] = double(2) + xi[1];
+        values_out[0] = double(1) + xi[0];
+        values_out[1] = double(2) + xi[1];
     }
 
-    void evaluate_gradients(const math::Vector<double, 3>&,
-                            std::vector<Gradient>& gradients) const override
+    void evaluate_gradients_to(const math::Vector<double, 3>&,
+                               std::span<Gradient> gradients_out) const override
     {
-        gradients.assign(size(), Gradient::Zero());
-        gradients[0][0] = double(1);
-        gradients[1][1] = double(1);
+        gradients_out[0] = Gradient::Zero();
+        gradients_out[1] = Gradient::Zero();
+        gradients_out[0][0] = double(1);
+        gradients_out[1][1] = double(1);
     }
 
-    void evaluate_hessians(const math::Vector<double, 3>& xi,
-                           std::vector<Hessian>& hessians) const override
+    void evaluate_hessians_to(const math::Vector<double, 3>& xi,
+                              std::span<Hessian> hessians_out) const override
     {
-        hessians.assign(size(), Hessian::Zero());
-        for (std::size_t d = 0; d < hessians.size(); ++d) {
+        for (std::size_t d = 0; d < size(); ++d) {
+            hessians_out[d] = Hessian::Zero();
             for (std::size_t r = 0; r < 3u; ++r) {
                 for (std::size_t c = 0; c < 3u; ++c) {
-                    hessians[d](r, c) = double(100) * static_cast<double>(d + 1u) +
-                                        double(10) * static_cast<double>(r) +
-                                        static_cast<double>(c) + xi[2];
+                    hessians_out[d](r, c) = double(100) * static_cast<double>(d + 1u) +
+                                            double(10) * static_cast<double>(r) +
+                                            static_cast<double>(c) + xi[2];
                 }
             }
         }
@@ -390,10 +396,11 @@ TEST(BasisErrorPaths, NumericalDerivativeHelpersMatchAnalyticDerivatives) {
     }
 }
 
-TEST(BasisErrorPaths, BasisFunctionFallbackWritesSpanOutputs) {
-    CompleteFallbackBasis basis;
+TEST(BasisErrorPaths, BasisFunctionVectorOverloadsForwardToSpanPrimitives) {
+    SpanPrimitiveBasis basis;
     const math::Vector<double, 3> point{double(0.25), double(0.5), double(-0.25)};
 
+    // Reference results taken directly from the span primitives the basis defines.
     std::vector<double> span_values(basis.size());
     std::vector<Gradient> span_gradients(basis.size());
     std::vector<Hessian> span_hessians(basis.size());
@@ -401,18 +408,28 @@ TEST(BasisErrorPaths, BasisFunctionFallbackWritesSpanOutputs) {
     basis.evaluate_gradients_to(point, span_gradients);
     basis.evaluate_hessians_to(point, span_hessians);
 
-    std::vector<double> expected_values;
-    std::vector<Gradient> expected_gradients;
-    std::vector<Hessian> expected_hessians;
-    basis.evaluate_all(point, expected_values, expected_gradients, expected_hessians);
+    // The base-class vector overloads must size their outputs and forward to the
+    // span primitives; evaluate_all() goes through the default combined evaluator.
+    std::vector<double> values;
+    basis.evaluate_values(point, values);
+    std::vector<double> all_values;
+    std::vector<Gradient> all_gradients;
+    std::vector<Hessian> all_hessians;
+    basis.evaluate_all(point, all_values, all_gradients, all_hessians);
+
+    ASSERT_EQ(values.size(), basis.size());
+    ASSERT_EQ(all_values.size(), basis.size());
+    ASSERT_EQ(all_gradients.size(), basis.size());
+    ASSERT_EQ(all_hessians.size(), basis.size());
     for (std::size_t d = 0; d < basis.size(); ++d) {
-        EXPECT_EQ(span_values[d], expected_values[d]);
+        EXPECT_EQ(values[d], span_values[d]);
+        EXPECT_EQ(all_values[d], span_values[d]);
         for (std::size_t c = 0; c < 3u; ++c) {
-            EXPECT_EQ(span_gradients[d][c], expected_gradients[d][c]);
+            EXPECT_EQ(all_gradients[d][c], span_gradients[d][c]);
         }
         for (std::size_t r = 0; r < 3u; ++r) {
             for (std::size_t c = 0; c < 3u; ++c) {
-                EXPECT_EQ(span_hessians[d](r, c), expected_hessians[d](r, c));
+                EXPECT_EQ(all_hessians[d](r, c), span_hessians[d](r, c));
             }
         }
     }

From caa4c91bbce794d43268e1614ad488bc1d305841 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Thu, 25 Jun 2026 17:20:16 -0700
Subject: [PATCH 82/91] FE/Basis: clarify out-parameter, sentinel, and
 field-ordering documentation

- note that the vector evaluators use an output argument so callers can reuse a
  buffer across evaluations rather than allocating per call
- document that the BasisTraits classifiers return -1 / Unknown as sentinels that
  callers validate into exceptions (the constexpr noexcept helpers cannot throw)
- keep the BasisRequest::topology field-ordering note out of the rendered docs
---
 Code/Source/solver/FE/Basis/BasisFactory.h  | 6 ++++--
 Code/Source/solver/FE/Basis/BasisFunction.h | 5 ++++-
 Code/Source/solver/FE/Basis/BasisTraits.h   | 9 +++++++++
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisFactory.h b/Code/Source/solver/FE/Basis/BasisFactory.h
index a49365249..33c26be67 100644
--- a/Code/Source/solver/FE/Basis/BasisFactory.h
+++ b/Code/Source/solver/FE/Basis/BasisFactory.h
@@ -43,9 +43,11 @@ struct BasisRequest {
     std::vector<int> tensor_extents{};               ///< Reserved for tensor-product extents; unused here.
     std::string custom_id{};                         ///< Optional identifier for Custom families.
     /// Reference topology for arbitrary-order requests, or Unknown to request by
-    /// element_type. Kept last so existing aggregate initializers for named
-    /// elements keep their positional meaning.
+    /// element_type.
     BasisTopology topology{BasisTopology::Unknown};
+    // Implementation note (kept out of the rendered docs): topology is declared
+    // last so existing aggregate initializers for named elements keep their
+    // positional meaning.
 };
 
 namespace basis_factory {
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index c673cb2e0..770a0c440 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -229,7 +229,10 @@ class BasisFunction {
      *
      * @details Convenience overload: it sizes \p values to size() and forwards to
      * evaluate_values_to(). It is implemented once on the base class, so concrete
-     * families override the span primitive rather than this overload.
+     * families override the span primitive rather than this overload. The result
+     * is delivered through the output argument rather than by return value so a
+     * caller can reuse one container across repeated evaluations (for example,
+     * across quadrature points) instead of allocating on every call.
      *
      * @param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
      * @param values Receives one value per basis function.
diff --git a/Code/Source/solver/FE/Basis/BasisTraits.h b/Code/Source/solver/FE/Basis/BasisTraits.h
index 79d4653f7..c0ab1e5c5 100644
--- a/Code/Source/solver/FE/Basis/BasisTraits.h
+++ b/Code/Source/solver/FE/Basis/BasisTraits.h
@@ -82,6 +82,9 @@ enum class BasisTopology {
         case CellFamily::Hex:      return BasisTopology::Hexahedron;
         case CellFamily::Wedge:    return BasisTopology::Wedge;
         // Pyramid/Polygon/Polyhedron are outside the current basis scope.
+        // BasisTopology::Unknown is a sentinel the basis constructors validate
+        // and convert into a BasisElementCompatibilityException at the call site,
+        // not an error raised from this constexpr noexcept classifier.
         default:                   return BasisTopology::Unknown;
     }
 }
@@ -128,6 +131,12 @@ enum class BasisTopology {
         case ElementType::Wedge18:
             return 2;
         default:
+            // -1 is a sentinel for "not a complete-Lagrange alias" (serendipity
+            // layouts, pyramids, Unknown), not an error: the LagrangeBasis
+            // (ElementType, order) constructor compares the requested order
+            // against named_lagrange_order() and raises BasisConfigurationException
+            // on mismatch. These classifiers are constexpr noexcept and so cannot
+            // throw themselves.
             return -1;
     }
 }

From c733aa15dc96b617a3dd2e5c638bbee30afd6225 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Thu, 25 Jun 2026 17:22:53 -0700
Subject: [PATCH 83/91] FE/Basis: render NodeOrderingConventions docs under an
 internal-API group

Replace the @cond INTERNAL exclusion with a documented internal group whose
@warning states the declarations are internal (use basis_factory and
BasisFunction::nodes() instead) and may change. Core developers now get the
rendered documentation while model-level callers are clearly steered away.
---
 .../solver/FE/Basis/NodeOrderingConventions.h | 22 ++++++++++++++-----
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
index fc0b9a5dd..31a7a2d64 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
@@ -17,11 +17,21 @@ namespace svmp {
 namespace FE {
 namespace basis {
 
-// NodeOrderingConventions is internal to the Basis module: it is the reference-node
-// generator the basis families build on, not a consumer entry point (callers obtain
-// a basis through basis_factory and read BasisFunction::nodes()), so it is excluded
-// from the public Doxygen output below.
-/** @cond INTERNAL */
+/**
+ * @defgroup FE_BasisNodeOrdering Reference-node generation (internal)
+ * @ingroup FE_Basis
+ * @brief Reference-node generators that the basis families build on.
+ *
+ * @warning Internal implementation detail. Do not use these directly: obtain a
+ * basis through @ref basis_factory and read its nodes via BasisFunction::nodes().
+ * These declarations are part of the internal node-ordering machinery and their
+ * interface may change without notice.
+ *
+ * @details This is the reference-node generator the basis families build on, not
+ * a consumer entry point. It is documented for FE core developers; model-level
+ * code never calls it directly.
+ * @{
+ */
 
 /**
  * @brief The i-th 1D tensor-axis reference node on [-1, 1] at the given order.
@@ -133,7 +143,7 @@ class ReferenceNodeLayout {
     serendipity_node_coords(BasisTopology topology, int order);
 };
 
-/** @endcond */
+/** @} */
 
 } // namespace basis
 } // namespace FE

From 5b497a482390091cf828f5f9c57ce4b26aa101d3 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Thu, 25 Jun 2026 17:27:51 -0700
Subject: [PATCH 84/91] FE/Math: clarify that math::Vector is distinct from the
 legacy solver Vector

Document that FE/Math/Vector.h is a fixed-size, compile-time-length element-level
vector in svmp::FE::math, distinct from and not a replacement for the legacy
dynamic global ::Vector in solver/Vector.h.
---
 Code/Source/solver/FE/Math/Vector.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/Code/Source/solver/FE/Math/Vector.h b/Code/Source/solver/FE/Math/Vector.h
index efa573dc8..8bd5b3fa9 100644
--- a/Code/Source/solver/FE/Math/Vector.h
+++ b/Code/Source/solver/FE/Math/Vector.h
@@ -13,6 +13,12 @@
  * Eigen. Note that, unlike the previous in-house implementation, Eigen types
  * are NOT zero-initialized by default construction; use Vector::Zero() where a
  * zeroed value is required.
+ *
+ * This is a small, fixed-size (compile-time length) vector for element-level FE
+ * kernels in namespace svmp::FE::math. It is distinct from, and not a replacement
+ * for, the legacy dynamically sized global ::Vector container in solver/Vector.h:
+ * the two differ in namespace, size model (compile-time vs runtime), and memory
+ * management, and coexist deliberately.
  */
 
 #include <Eigen/Core>

From 7aa2d16b0b28c773dc56b35e7550ab6e460a4e73 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Thu, 25 Jun 2026 17:27:51 -0700
Subject: [PATCH 85/91] FE/Common: document Mesh-module optionality,
 GlobalIndex rationale, and ElementType bands

- explain that the Mesh library is an optional external module and FE builds
  standalone with fallback types when it is absent
- record why GlobalIndex stays a plain alias (raw PETSc/Trilinos interop) and that
  DofIndex is the strong wrapper
- document that the explicit ElementType values are intentional bands with a
  uint8_t Unknown sentinel
---
 Code/Source/solver/FE/Common/Types.h | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/Code/Source/solver/FE/Common/Types.h b/Code/Source/solver/FE/Common/Types.h
index 5beefcce3..458fb5649 100644
--- a/Code/Source/solver/FE/Common/Types.h
+++ b/Code/Source/solver/FE/Common/Types.h
@@ -14,6 +14,11 @@
  * independence from backend-specific types.
  */
 
+// The Mesh library is an optional, external module. When the build enables it
+// (SVMP_FE_WITH_MESH), FE imports the Mesh scalar/index types so the two libraries
+// share a vocabulary; otherwise FE compiles standalone using the fallback
+// definitions below (e.g. svmp::CellFamily and the Mesh* aliases). The Mesh
+// headers are not part of this repository.
 #if defined(SVMP_FE_WITH_MESH) && SVMP_FE_WITH_MESH
 #  include "Mesh/Core/MeshTypes.h"
 /** Nonzero when FE shares scalar/index types with the Mesh library. */
@@ -103,6 +108,11 @@ using LocalIndex = std::uint32_t;
  *
  * Signed 64-bit for compatibility with PETSc and Trilinos.
  * Negative values can indicate special conditions or invalid indices.
+ *
+ * @note Kept as a plain integer alias rather than a StrongType wrapper: this is
+ * the raw interop type handed directly to PETSc/Trilinos, where a wrapper would
+ * force an unwrap at every call. Type safety for DOF indices is provided by
+ * DofIndex (below), the strong wrapper around a GlobalIndex.
  */
 using GlobalIndex = std::int64_t;
 
@@ -110,7 +120,9 @@ using GlobalIndex = std::int64_t;
  * @brief DOF-specific index type
  *
  * Strong type alias to prevent mixing DOF indices with other indices.
- * Provides type safety at compile time.
+ * Provides type safety at compile time. It is hand-rolled to carry an invalid
+ * sentinel and is_valid(); QuadraturePointIndex uses the general StrongType
+ * template for the same strong-typing purpose.
  */
 struct DofIndex {
     GlobalIndex value;  ///< Underlying global DOF index; negative values are invalid.
@@ -224,6 +236,12 @@ struct FieldValueEntry {
  *
  * Maps to svmp::CellFamily from the Mesh library but provides
  * FE-specific categorization including higher-order variants.
+ *
+ * @note The explicit enumerator values are intentional and grouped into bands:
+ * linear (0-6), quadratic (10-20), and special (Point1 = 30; Unknown = 255, the
+ * uint8_t sentinel). The enum is consumed via its names, not the numeric values,
+ * but the banding keeps related types together and leaves room to extend each
+ * group; keep new entries within their band.
  */
 enum class ElementType : std::uint8_t {
     // Linear elements

From 3932e77c5dc9ad6095ab1a8eae69b88cbff35f46 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Thu, 25 Jun 2026 17:34:24 -0700
Subject: [PATCH 86/91] FE/Math: document DenseLinearAlgebra and rename label
 to error_message_label

Add Doxygen to the dense linear-algebra functions and the DenseLUSolver type, and
rename the label argument (and the DenseLUSolver member) to error_message_label so
its role as an error-message prefix is self-documenting. All FE math unit tests pass.
---
 .../solver/FE/Math/DenseLinearAlgebra.cpp     |  46 ++---
 .../solver/FE/Math/DenseLinearAlgebra.h       | 184 +++++++++++++++---
 .../FE/Math/test_DenseLinearAlgebra.cpp       |   2 +-
 3 files changed, 181 insertions(+), 51 deletions(-)

diff --git a/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp b/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
index df06700f9..86c3045c3 100644
--- a/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
+++ b/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
@@ -94,13 +94,13 @@ void DenseLUSolver::solve_in_place(std::span<double> rhs,
                                    std::size_t rhs_count) const {
     ::svmp::check_arg<FEException>(
         rhs_count > 0, SVMP_HERE,
-        label + ": dense solve requires at least one right-hand side");
+        error_message_label + ": dense solve requires at least one right-hand side");
     ::svmp::check_arg<FEException>(
         rhs.size() == n * rhs_count, SVMP_HERE,
-        label + ": dense multi-RHS solve size mismatch");
+        error_message_label + ": dense multi-RHS solve size mismatch");
     ::svmp::check_arg<FEException>(
         impl && impl->lu.rows() == static_cast<Eigen::Index>(n), SVMP_HERE,
-        label + ": dense solver is not factorized");
+        error_message_label + ": dense solver is not factorized");
     if (n == 0) {
         return;
     }
@@ -123,13 +123,13 @@ DenseMatrixDiagnostics dense_matrix_diagnostics(
     std::span<const double> matrix,
     std::size_t rows,
     std::size_t cols,
-    std::string_view label) {
+    std::string_view error_message_label) {
     ::svmp::check_arg<FEException>(
         matrix.size() == rows * cols, SVMP_HERE,
-        std::string(label) + ": diagnostic size mismatch");
+        std::string(error_message_label) + ": diagnostic size mismatch");
     ::svmp::check_arg<FEException>(
         rows > 0 && cols > 0, SVMP_HERE,
-        std::string(label) + ": diagnostics require a nonempty matrix");
+        std::string(error_message_label) + ": diagnostics require a nonempty matrix");
 
     const DenseMatrix dense = map_row_major(matrix, rows, cols);
     Eigen::JacobiSVD<DenseMatrix> svd(dense);
@@ -163,14 +163,14 @@ DenseMatrixDiagnostics dense_matrix_diagnostics(
 
 DenseLUSolver factor_dense_matrix(std::vector<double> matrix,
                                   std::size_t n,
-                                  std::string_view label) {
+                                  std::string_view error_message_label) {
     ::svmp::check_arg<FEException>(
         matrix.size() == n * n, SVMP_HERE,
-        std::string(label) + ": dense factorization size mismatch");
+        std::string(error_message_label) + ": dense factorization size mismatch");
 
     DenseLUSolver solver;
     solver.n = n;
-    solver.label = std::string(label);
+    solver.error_message_label = std::string(error_message_label);
     const double max_abs =
         dense_matrix_max_abs(std::span<const double>(matrix.data(), matrix.size()));
     solver.pivot_tolerance = dense_matrix_pivot_tolerance(n, n, max_abs);
@@ -187,7 +187,7 @@ DenseLUSolver factor_dense_matrix(std::vector<double> matrix,
         const double pivot_magnitude = std::abs(diagonal[col]);
         ::svmp::check_arg<FEException>(
             pivot_magnitude > solver.pivot_tolerance, SVMP_HERE,
-            solver.label + ": rank-deficient matrix (rank " +
+            solver.error_message_label + ": rank-deficient matrix (rank " +
                 std::to_string(col) + " of " + std::to_string(n) +
                 ", pivot below scale-aware tolerance " +
                 std::to_string(solver.pivot_tolerance) + ")");
@@ -208,18 +208,18 @@ DenseLUSolver factor_dense_matrix(std::vector<double> matrix,
 DenseInverseResult invert_dense_matrix_with_diagnostics(
     std::vector<double> matrix,
     std::size_t n,
-    std::string_view label) {
+    std::string_view error_message_label) {
     ::svmp::check_arg<FEException>(
         matrix.size() == n * n, SVMP_HERE,
-        std::string(label) + ": dense inverse size mismatch");
+        std::string(error_message_label) + ": dense inverse size mismatch");
     std::vector<double> matrix_for_lu = matrix;
     const DenseLUSolver solver =
-        factor_dense_matrix(std::move(matrix_for_lu), n, label);
+        factor_dense_matrix(std::move(matrix_for_lu), n, error_message_label);
 
     DenseInverseResult result;
     result.diagnostics =
         dense_matrix_diagnostics(std::span<const double>(matrix.data(), matrix.size()),
-                                 n, n, label);
+                                 n, n, error_message_label);
 
     if (std::isfinite(result.diagnostics.condition_estimate) &&
         result.diagnostics.condition_estimate > dense_matrix_condition_fallback_threshold()) {
@@ -237,7 +237,7 @@ DenseInverseResult invert_dense_matrix_with_diagnostics(
             // against future refactors that derive the fallback condition differently.
             ::svmp::check_arg<FEException>(
                 singular_values[i] > result.diagnostics.tolerance, SVMP_HERE,
-                std::string(label) + ": high-condition SVD fallback encountered a dropped singular value");
+                std::string(error_message_label) + ": high-condition SVD fallback encountered a dropped singular value");
             sigma_inverse(i, i) = double(1) / singular_values[i];
         }
         const DenseMatrix inverse = svd.matrixV() * sigma_inverse * svd.matrixU().transpose();
@@ -254,11 +254,11 @@ DenseInverseResult invert_dense_matrix_with_diagnostics(
 void validate_dense_inverse_diagnostics(
     const DenseInverseResult& result,
     std::size_t expected_rank,
-    std::string_view label,
+    std::string_view error_message_label,
     double max_condition) {
     ::svmp::check_arg<FEException>(
         result.diagnostics.rank == expected_rank, SVMP_HERE,
-        std::string(label) + ": rank-deficient matrix (rank " +
+        std::string(error_message_label) + ": rank-deficient matrix (rank " +
             std::to_string(result.diagnostics.rank) + " of " +
             std::to_string(expected_rank) + ")");
 
@@ -268,15 +268,15 @@ void validate_dense_inverse_diagnostics(
 
     ::svmp::check_arg<FEException>(
         result.diagnostics.condition_estimate <= max_condition, SVMP_HERE,
-        std::string(label) + ": condition estimate " +
+        std::string(error_message_label) + ": condition estimate " +
             std::to_string(result.diagnostics.condition_estimate) +
             " exceeds supported threshold " + std::to_string(max_condition));
 }
 
 std::vector<double> invert_dense_matrix(std::vector<double> matrix,
                                       std::size_t n,
-                                      std::string_view label) {
-    const DenseLUSolver solver = factor_dense_matrix(std::move(matrix), n, label);
+                                      std::string_view error_message_label) {
+    const DenseLUSolver solver = factor_dense_matrix(std::move(matrix), n, error_message_label);
     const DenseMatrix inverse = solver.impl->lu.inverse();
     std::vector<double> result;
     copy_to_row_major(inverse, result);
@@ -313,13 +313,13 @@ DensePseudoInverseResult rank_revealing_pseudo_inverse(
     std::span<const double> matrix,
     std::size_t rows,
     std::size_t cols,
-    std::string_view label) {
+    std::string_view error_message_label) {
     ::svmp::check_arg<FEException>(
         matrix.size() == rows * cols, SVMP_HERE,
-        std::string(label) + ": pseudo-inverse size mismatch");
+        std::string(error_message_label) + ": pseudo-inverse size mismatch");
     ::svmp::check_arg<FEException>(
         rows > 0 && cols > 0, SVMP_HERE,
-        std::string(label) + ": pseudo-inverse requires a nonempty matrix");
+        std::string(error_message_label) + ": pseudo-inverse requires a nonempty matrix");
 
     const DenseMatrix dense = map_row_major(matrix, rows, cols);
     Eigen::JacobiSVD<DenseMatrix> svd(dense, Eigen::ComputeFullU | Eigen::ComputeFullV);
diff --git a/Code/Source/solver/FE/Math/DenseLinearAlgebra.h b/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
index 440df817c..8784f8ff6 100644
--- a/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
+++ b/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
@@ -19,44 +19,92 @@ namespace FE {
 namespace math {
 
 // Dense solve, inverse, rank, and pseudo-inverse support for FE construction
-// utilities. Matrices are row-major: matrix[row * cols + col].
+// utilities. Matrices are row-major: matrix[row * cols + col]. The
+// error_message_label argument on the routines below is used only to prefix the
+// diagnostic message of any exception they throw.
+
+/**
+ * @brief Largest absolute entry of a dense matrix.
+ * @ingroup FE_Math
+ * @param matrix Row-major matrix entries.
+ * @return Maximum of |entry| over all entries, or 0 for an empty matrix.
+ */
 [[nodiscard]] double dense_matrix_max_abs(std::span<const double> matrix) noexcept;
 
+/**
+ * @brief Scale-aware pivot tolerance for dense factorization.
+ * @ingroup FE_Math
+ *
+ * @details Proportional to machine epsilon scaled by the matrix size and
+ * magnitude; pivots below it are treated as rank-deficient.
+ *
+ * @param rows Row count.
+ * @param cols Column count.
+ * @param max_abs Largest absolute matrix entry (see dense_matrix_max_abs()).
+ * @param multiplier Safety factor applied to the epsilon-scaled tolerance.
+ * @return Pivot magnitude threshold.
+ */
 [[nodiscard]] double dense_matrix_pivot_tolerance(std::size_t rows,
                                                 std::size_t cols,
                                                 double max_abs,
                                                 double multiplier = double(64)) noexcept;
 
+/**
+ * @brief Scale-aware singular-value tolerance for rank decisions.
+ * @ingroup FE_Math
+ *
+ * @details Singular values at or below the returned tolerance are treated as
+ * zero when computing rank or a pseudo-inverse.
+ *
+ * @param rows Row count.
+ * @param cols Column count.
+ * @param largest_singular_value Largest singular value of the matrix.
+ * @param multiplier Safety factor applied to the epsilon-scaled tolerance.
+ * @return Singular-value threshold.
+ */
 [[nodiscard]] double dense_matrix_singular_value_tolerance(std::size_t rows,
                                                          std::size_t cols,
                                                          double largest_singular_value,
                                                          double multiplier = double(64)) noexcept;
 
+/** @brief Result of a rank-revealing pseudo-inverse. @ingroup FE_Math */
 struct DensePseudoInverseResult {
-    std::vector<double> inverse;
-    std::size_t rank{0};
-    double tolerance{0};
-    double largest_singular_value{0};
-    double smallest_retained_singular_value{0};
+    std::vector<double> inverse;                 ///< Row-major pseudo-inverse.
+    std::size_t rank{0};                         ///< Numerical rank at the chosen tolerance.
+    double tolerance{0};                         ///< Singular-value tolerance used.
+    double largest_singular_value{0};            ///< Largest singular value.
+    double smallest_retained_singular_value{0};  ///< Smallest singular value kept.
 };
 
+/** @brief SVD-based conditioning and rank diagnostics for a dense matrix. @ingroup FE_Math */
 struct DenseMatrixDiagnostics {
-    std::size_t rank{0};
-    double tolerance{0};
-    double largest_singular_value{0};
-    double smallest_retained_singular_value{0};
-    double condition_estimate{std::numeric_limits<double>::infinity()};
+    std::size_t rank{0};                         ///< Numerical rank at @ref tolerance.
+    double tolerance{0};                         ///< Singular-value tolerance used.
+    double largest_singular_value{0};            ///< Largest singular value.
+    double smallest_retained_singular_value{0};  ///< Smallest singular value kept.
+    double condition_estimate{std::numeric_limits<double>::infinity()};  ///< Condition estimate; infinite when rank-deficient.
 };
 
+/** @brief A dense inverse together with its diagnostics. @ingroup FE_Math */
 struct DenseInverseResult {
-    std::vector<double> inverse;
-    DenseMatrixDiagnostics diagnostics;
-    bool used_svd_fallback{false};
+    std::vector<double> inverse;        ///< Row-major inverse.
+    DenseMatrixDiagnostics diagnostics; ///< Conditioning/rank diagnostics of the input.
+    bool used_svd_fallback{false};      ///< True when an SVD fallback was used for a high-condition matrix.
 };
 
+/** @brief Condition estimate above which the inverse switches to an SVD fallback. @ingroup FE_Math */
 [[nodiscard]] double dense_matrix_condition_fallback_threshold() noexcept;
+/** @brief Condition estimate above which validation rejects a dense inverse. @ingroup FE_Math */
 [[nodiscard]] double dense_matrix_condition_error_threshold() noexcept;
 
+/**
+ * @brief LU factorization of a dense square matrix with a cached pivot summary.
+ * @ingroup FE_Math
+ *
+ * @details Produced by factor_dense_matrix(); move-only because it owns the Eigen
+ * factorization. @ref error_message_label prefixes the messages of exceptions
+ * thrown by the solve methods.
+ */
 struct DenseLUSolver {
     struct Impl;
 
@@ -67,57 +115,139 @@ struct DenseLUSolver {
     DenseLUSolver(const DenseLUSolver&) = delete;
     DenseLUSolver& operator=(const DenseLUSolver&) = delete;
 
-    std::size_t n{0};
-    DenseMatrixDiagnostics diagnostics;
-    double pivot_tolerance{0};
-    double min_pivot{0};
-    double max_pivot{0};
-    std::string label;
-    std::unique_ptr<Impl> impl;
+    std::size_t n{0};                    ///< Matrix dimension.
+    DenseMatrixDiagnostics diagnostics;  ///< Pivot-derived diagnostics (rank, tolerance).
+    double pivot_tolerance{0};           ///< Scale-aware pivot tolerance used.
+    double min_pivot{0};                 ///< Smallest pivot magnitude.
+    double max_pivot{0};                 ///< Largest pivot magnitude.
+    std::string error_message_label;     ///< Prefix for solve-time exception messages.
+    std::unique_ptr<Impl> impl;          ///< Eigen factorization (pimpl).
 
+    /** @brief Whether the factorization is empty (n == 0). */
     [[nodiscard]] bool empty() const noexcept { return n == 0; }
 
+    /**
+     * @brief Solve A x = rhs in place for a single right-hand side.
+     * @param rhs On entry the right-hand side; on return the solution (size n).
+     */
     void solve_in_place(std::span<double> rhs) const;
+    /**
+     * @brief Solve A X = RHS in place for several right-hand sides.
+     * @param rhs Row-major block of size n * rhs_count (solutions on return).
+     * @param rhs_count Number of right-hand sides.
+     */
     void solve_in_place(std::span<double> rhs, std::size_t rhs_count) const;
+    /**
+     * @brief Solve A x = rhs and return the solution.
+     * @param rhs Right-hand side of size n.
+     * @return Solution vector of size n.
+     */
     [[nodiscard]] std::vector<double> solve(std::span<const double> rhs) const;
 };
 
 // Inverses and pseudo-inverses keep the same row-major convention for their
 // returned dimensions.
+
+/**
+ * @brief SVD-based rank and conditioning diagnostics for a dense matrix.
+ * @ingroup FE_Math
+ * @param matrix Row-major matrix of size rows * cols.
+ * @param rows Row count.
+ * @param cols Column count.
+ * @param error_message_label Prefix for the message of any exception thrown.
+ * @return Rank, tolerance, singular-value, and condition diagnostics.
+ * @throws FEException If the matrix size is inconsistent or the matrix is empty.
+ */
 [[nodiscard]] DenseMatrixDiagnostics dense_matrix_diagnostics(
     std::span<const double> matrix,
     std::size_t rows,
     std::size_t cols,
-    std::string_view label = "dense matrix");
+    std::string_view error_message_label = "dense matrix");
 
+/**
+ * @brief LU-factor a dense square matrix.
+ * @ingroup FE_Math
+ * @param matrix Row-major n * n matrix (consumed).
+ * @param n Matrix dimension.
+ * @param error_message_label Prefix for the message of any exception thrown.
+ * @return The factorization.
+ * @throws FEException If the size is inconsistent or the matrix is rank-deficient.
+ */
 [[nodiscard]] DenseLUSolver factor_dense_matrix(std::vector<double> matrix,
                                                 std::size_t n,
-                                                std::string_view label = "dense matrix");
+                                                std::string_view error_message_label = "dense matrix");
 
+/**
+ * @brief Invert a dense square matrix.
+ * @ingroup FE_Math
+ * @param matrix Row-major n * n matrix (consumed).
+ * @param n Matrix dimension.
+ * @param error_message_label Prefix for the message of any exception thrown.
+ * @return Row-major inverse of size n * n.
+ * @throws FEException If the size is inconsistent or the matrix is singular.
+ */
 [[nodiscard]] std::vector<double> invert_dense_matrix(std::vector<double> matrix,
                                                     std::size_t n,
-                                                    std::string_view label = "dense matrix");
+                                                    std::string_view error_message_label = "dense matrix");
 
+/**
+ * @brief Invert a dense square matrix with diagnostics, using an SVD fallback for
+ * high-condition matrices.
+ * @ingroup FE_Math
+ * @param matrix Row-major n * n matrix (consumed).
+ * @param n Matrix dimension.
+ * @param error_message_label Prefix for the message of any exception thrown.
+ * @return Inverse plus diagnostics and whether the SVD fallback was used.
+ * @throws FEException If the size is inconsistent or the matrix is rank-deficient.
+ */
 [[nodiscard]] DenseInverseResult invert_dense_matrix_with_diagnostics(
     std::vector<double> matrix,
     std::size_t n,
-    std::string_view label = "dense matrix");
+    std::string_view error_message_label = "dense matrix");
 
+/**
+ * @brief Validate that a dense inverse has full rank and acceptable conditioning.
+ * @ingroup FE_Math
+ * @param result Result from invert_dense_matrix_with_diagnostics().
+ * @param expected_rank Required (full) rank.
+ * @param error_message_label Prefix for the message of any exception thrown.
+ * @param max_condition Largest acceptable condition estimate.
+ * @throws FEException If the rank is below expected_rank or the condition exceeds max_condition.
+ */
 void validate_dense_inverse_diagnostics(
     const DenseInverseResult& result,
     std::size_t expected_rank,
-    std::string_view label = "dense matrix",
+    std::string_view error_message_label = "dense matrix",
     double max_condition = dense_matrix_condition_error_threshold());
 
+/**
+ * @brief Numerical rank of a dense matrix from its singular values.
+ * @ingroup FE_Math
+ * @param matrix Row-major matrix of size rows * cols (consumed).
+ * @param rows Row count.
+ * @param cols Column count.
+ * @return Number of singular values above the scale-aware tolerance.
+ * @throws FEException If the matrix size is inconsistent.
+ */
 [[nodiscard]] std::size_t dense_matrix_rank(std::vector<double> matrix,
                                             std::size_t rows,
                                             std::size_t cols);
 
+/**
+ * @brief Moore-Penrose pseudo-inverse via a rank-revealing SVD.
+ * @ingroup FE_Math
+ * @param matrix Row-major matrix of size rows * cols.
+ * @param rows Row count.
+ * @param cols Column count.
+ * @param error_message_label Prefix for the message of any exception thrown.
+ * @return Row-major pseudo-inverse (cols * rows) plus rank/tolerance diagnostics.
+ * @throws FEException If the matrix size is inconsistent or the matrix is empty.
+ */
 [[nodiscard]] DensePseudoInverseResult rank_revealing_pseudo_inverse(
     std::span<const double> matrix,
     std::size_t rows,
     std::size_t cols,
-    std::string_view label = "dense matrix");
+    std::string_view error_message_label = "dense matrix");
 
 } // namespace math
 } // namespace FE
diff --git a/tests/unitTests/FE/Math/test_DenseLinearAlgebra.cpp b/tests/unitTests/FE/Math/test_DenseLinearAlgebra.cpp
index 96890f241..b029a55b4 100644
--- a/tests/unitTests/FE/Math/test_DenseLinearAlgebra.cpp
+++ b/tests/unitTests/FE/Math/test_DenseLinearAlgebra.cpp
@@ -204,7 +204,7 @@ TEST(DenseLinearAlgebra, SolveInPlaceValidatesInputs) {
 
     DenseLUSolver unfactored;
     unfactored.n = 2u;
-    unfactored.label = "unfactored";
+    unfactored.error_message_label = "unfactored";
     EXPECT_FALSE(unfactored.empty());
     EXPECT_THROW(unfactored.solve_in_place(std::span<double>(rhs.data(), rhs.size()), 1u),
                  FEException);

From 5b072c11790af19ba83669396d37c2ffbef073ae Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Thu, 25 Jun 2026 17:50:11 -0700
Subject: [PATCH 87/91] solver: throw on unmapped element type in nn.cpp
 to_fe_element_type

Add a default case that raises BasisElementCompatibilityException for a solver
element type with no FE mapping, so a missing mapping fails loudly instead of
relying on the unhandled-enum compiler warning. The deliberate NA/PNT/NRB cases
still return std::nullopt.
---
 Code/Source/solver/nn.cpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/Code/Source/solver/nn.cpp b/Code/Source/solver/nn.cpp
index 9a145964e..7da0deea6 100644
--- a/Code/Source/solver/nn.cpp
+++ b/Code/Source/solver/nn.cpp
@@ -97,8 +97,15 @@ std::optional<fe::ElementType> to_fe_element_type(consts::ElementType eType)
     case consts::ElementType::PNT:
     case consts::ElementType::NRB:
       return std::nullopt;
+
+    // A solver element type with no case above is a missing mapping, not a
+    // deliberately unsupported type; fail loudly instead of relying on the
+    // unhandled-enum compiler warning being enabled.
+    default:
+      svmp::raise<febasis::BasisElementCompatibilityException>(SVMP_HERE,
+          "to_fe_element_type: unhandled solver element type " +
+              std::to_string(static_cast<int>(eType)));
   }
-  return std::nullopt;
 }
 
 /// Whether the FE Basis face adapter can evaluate face shape functions for

From a0722c361497d34152f65439317a050a2c373718 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Thu, 25 Jun 2026 17:50:11 -0700
Subject: [PATCH 88/91] docs: upgrade Doxyfile MathJax to version 3

MathJax 2 was pinned as a default when the FE documentation was added and is now
end-of-life. Move to the maintained MathJax 3 (chtml output, mathjax@3 CDN, ams
extension); doxygen emits the v3 bootstrap and the AMS math used in the FE docs is
supported.
---
 Documentation/Doxyfile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Documentation/Doxyfile b/Documentation/Doxyfile
index 3c29a08f1..acd5ba21c 100644
--- a/Documentation/Doxyfile
+++ b/Documentation/Doxyfile
@@ -191,10 +191,10 @@ TREEVIEW_WIDTH         = 250
 EXT_LINKS_IN_WINDOW    = NO
 FORMULA_FONTSIZE       = 10
 USE_MATHJAX            = YES
-MATHJAX_VERSION        = MathJax_2
-MATHJAX_FORMAT         = HTML-CSS
-MATHJAX_RELPATH        = https://cdn.jsdelivr.net/npm/mathjax@2
-MATHJAX_EXTENSIONS     = TeX/AMSmath TeX/AMSsymbols
+MATHJAX_VERSION        = MathJax_3
+MATHJAX_FORMAT         = chtml
+MATHJAX_RELPATH        = https://cdn.jsdelivr.net/npm/mathjax@3
+MATHJAX_EXTENSIONS     = ams
 MATHJAX_CODEFILE       =
 SEARCHENGINE           = YES
 SERVER_BASED_SEARCH    = NO

From 9c3906a4da8a65635df8f76d04c6bacc383a307c Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Thu, 25 Jun 2026 17:50:12 -0700
Subject: [PATCH 89/91] FE: resolve Doxygen cross-references and document
 exposed internal members

Generating HTML (not just parsing) surfaced reference/doc issues the no-output
checks missed:
- qualify the @ref class/struct/enum links in the Basis group doc so they resolve
- use plain text for the basis_factory namespace mention (no resolvable @ref)
- avoid the ::Vector explicit-link request in the Math vector doc
- add @return tags and document the NodeOrderingConventions entities that became
  visible when the @cond INTERNAL exclusion was removed
Net FE Doxygen warnings drop from 47 to 19 (the rest pre-existing).
---
 Code/Source/solver/FE/Basis/BasisFunction.h   | 21 +++++++++-------
 .../solver/FE/Basis/NodeOrderingConventions.h | 25 ++++++++++++++++---
 .../solver/FE/Math/DenseLinearAlgebra.h       | 17 ++++++++++---
 Code/Source/solver/FE/Math/Vector.h           |  6 ++---
 4 files changed, 51 insertions(+), 18 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index 770a0c440..ba9e256cb 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -32,20 +32,23 @@
  * material model, and equation context.
  *
  * The main pieces are:
- * - @ref BasisFunction (BasisFunction.h): the abstract query and evaluation
- *   contract for code that does not need to know the concrete family.
+ * - @ref svmp::FE::basis::BasisFunction "BasisFunction" (BasisFunction.h): the
+ *   abstract query and evaluation contract for code that does not need to know
+ *   the concrete family.
  * - @ref FE_LagrangeBasis "LagrangeBasis" and
  *   @ref FE_SerendipityBasis "SerendipityBasis": the implemented nodal
  *   families, including analytical first and second derivatives in reference
  *   coordinates.
  * - basis_factory (BasisFactory.h): runtime construction from a
- *   @ref BasisRequest. basis_factory::default_basis_request() centralizes the
- *   family/order that matches each supported element's public node layout.
- * - @ref ReferenceNodeLayout (NodeOrderingConventions.h): canonical reference-node
- *   coordinates and the output ordering used by every basis evaluator.
- * - @ref BasisTopology (BasisTraits.h) and the @ref FE_BasisExceptions
- *   "basis exceptions" (BasisExceptions.h): topology classification,
- *   compile-time helpers, and module-specific exception types.
+ *   @ref svmp::FE::basis::BasisRequest "BasisRequest".
+ *   basis_factory::default_basis_request() centralizes the family/order that
+ *   matches each supported element's public node layout.
+ * - @ref svmp::FE::basis::ReferenceNodeLayout "ReferenceNodeLayout"
+ *   (NodeOrderingConventions.h): canonical reference-node coordinates and the
+ *   output ordering used by every basis evaluator.
+ * - @ref svmp::FE::basis::BasisTopology "BasisTopology" (BasisTraits.h) and the
+ *   @ref FE_BasisExceptions "basis exceptions" (BasisExceptions.h): topology
+ *   classification, compile-time helpers, and module-specific exception types.
  *
  * ## Object and evaluation contract
  *
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
index 31a7a2d64..1c01fab22 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
@@ -23,7 +23,7 @@ namespace basis {
  * @brief Reference-node generators that the basis families build on.
  *
  * @warning Internal implementation detail. Do not use these directly: obtain a
- * basis through @ref basis_factory and read its nodes via BasisFunction::nodes().
+ * basis through basis_factory and read its nodes via BasisFunction::nodes().
  * These declarations are part of the internal node-ordering machinery and their
  * interface may change without notice.
  *
@@ -74,18 +74,31 @@ namespace basis {
  * index directly instead of reconstructing it from the floating-point coordinate.
  */
 struct LagrangeNodeLayout {
-    std::vector<math::Vector<double, 3>> coords;
-    std::vector<std::array<int, 3>>      lattice;
+    std::vector<math::Vector<double, 3>> coords;   ///< Reference node coordinates, one per node.
+    std::vector<std::array<int, 3>>      lattice;  ///< Integer lattice index of each node (see details above).
 };
 
+/**
+ * @brief Reference-node coordinate and count lookups for an element type.
+ */
 class ReferenceNodeLayout {
 public:
     /**
      * @brief One reference node coordinate by local index. Regenerates the full
      * layout per call; prefer node_coords() when more than one node is needed.
+     *
+     * @param elem_type Element type to look up.
+     * @param local_node Local node index in [0, num_nodes(elem_type)).
+     * @return Reference coordinate of the requested node.
      */
     static math::Vector<double, 3> node_coord_at(ElementType elem_type,
                                                  std::size_t local_node);
+
+    /**
+     * @brief Number of reference nodes in an element type's public layout.
+     * @param elem_type Element type to look up.
+     * @return Node count.
+     */
     static std::size_t num_nodes(ElementType elem_type);
 
     /**
@@ -101,6 +114,12 @@ class ReferenceNodeLayout {
      */
     static std::vector<math::Vector<double, 3>> node_coords(ElementType elem_type);
 
+    /**
+     * @brief Reference Lagrange node coordinates for a canonical type and order.
+     * @param canonical_type Canonical Lagrange element type (or Point1).
+     * @param order Polynomial order.
+     * @return Reference node coordinates, one per node, in basis order.
+     */
     static std::vector<math::Vector<double, 3>>
     get_lagrange_node_coords(ElementType canonical_type, int order);
 
diff --git a/Code/Source/solver/FE/Math/DenseLinearAlgebra.h b/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
index 8784f8ff6..df45224a1 100644
--- a/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
+++ b/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
@@ -92,9 +92,17 @@ struct DenseInverseResult {
     bool used_svd_fallback{false};      ///< True when an SVD fallback was used for a high-condition matrix.
 };
 
-/** @brief Condition estimate above which the inverse switches to an SVD fallback. @ingroup FE_Math */
+/**
+ * @brief Condition estimate above which the inverse switches to an SVD fallback.
+ * @ingroup FE_Math
+ * @return The fallback condition-number threshold.
+ */
 [[nodiscard]] double dense_matrix_condition_fallback_threshold() noexcept;
-/** @brief Condition estimate above which validation rejects a dense inverse. @ingroup FE_Math */
+/**
+ * @brief Condition estimate above which validation rejects a dense inverse.
+ * @ingroup FE_Math
+ * @return The error condition-number threshold.
+ */
 [[nodiscard]] double dense_matrix_condition_error_threshold() noexcept;
 
 /**
@@ -123,7 +131,10 @@ struct DenseLUSolver {
     std::string error_message_label;     ///< Prefix for solve-time exception messages.
     std::unique_ptr<Impl> impl;          ///< Eigen factorization (pimpl).
 
-    /** @brief Whether the factorization is empty (n == 0). */
+    /**
+     * @brief Whether the factorization is empty (n == 0).
+     * @return True when no matrix has been factored.
+     */
     [[nodiscard]] bool empty() const noexcept { return n == 0; }
 
     /**
diff --git a/Code/Source/solver/FE/Math/Vector.h b/Code/Source/solver/FE/Math/Vector.h
index 8bd5b3fa9..8c4e30acd 100644
--- a/Code/Source/solver/FE/Math/Vector.h
+++ b/Code/Source/solver/FE/Math/Vector.h
@@ -16,9 +16,9 @@
  *
  * This is a small, fixed-size (compile-time length) vector for element-level FE
  * kernels in namespace svmp::FE::math. It is distinct from, and not a replacement
- * for, the legacy dynamically sized global ::Vector container in solver/Vector.h:
- * the two differ in namespace, size model (compile-time vs runtime), and memory
- * management, and coexist deliberately.
+ * for, the legacy dynamically sized container in solver/Vector.h: the two differ
+ * in namespace, size model (compile-time vs runtime), and memory management, and
+ * coexist deliberately.
  */
 
 #include <Eigen/Core>

From 6c7c0d7d41707a83f2d9e15e049ea4fe9606dc6c Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Thu, 25 Jun 2026 18:12:35 -0700
Subject: [PATCH 90/91] Core: add NotImplemented/IndexOutOfRange exceptions and
 document the assertion helpers

- add svmp::NotImplementedException and svmp::IndexOutOfRangeException (CoreException-derived)
- default the ExceptionT template parameter of not_implemented() and check_index()
  to those types, so they can be used without naming an exception while existing
  explicit callers are unaffected
- document the helpers, including that check vs check_arg are the same check named
  for intent and that throw_if is the logical inverse of check, and when to use
  each not_implemented overload
Note: svmp::FE::NotImplementedException stays for FE code needing FEException
ancestry. All FE/exception-helper unit tests pass.
---
 Code/Source/solver/Core/Exception.h | 88 ++++++++++++++++++++++++++++-
 1 file changed, 85 insertions(+), 3 deletions(-)

diff --git a/Code/Source/solver/Core/Exception.h b/Code/Source/solver/Core/Exception.h
index 2e7e48d6f..7cde41b97 100644
--- a/Code/Source/solver/Core/Exception.h
+++ b/Code/Source/solver/Core/Exception.h
@@ -346,6 +346,38 @@ class DependencyException : public CoreException {
     }
 };
 
+/// @brief A requested operation or feature is not implemented.
+///
+/// This is the default exception raised by not_implemented(). Subsystems that
+/// need their not-implemented errors to be catchable as their own base type may
+/// define a more specific exception (for example, svmp::FE::NotImplementedException
+/// derives from FEException); pass it explicitly to not_implemented<...>().
+class NotImplementedException : public CoreException {
+public:
+    NotImplementedException(const std::string& message,
+                            const char* file = "",
+                            int line = 0,
+                            const char* function = "")
+        : CoreException(message, StatusCode::NotImplemented, file, line, function)
+    {
+    }
+};
+
+/// @brief An index is outside its valid range.
+///
+/// This is the default exception raised by check_index(). The status code is
+/// InvalidArgument because an out-of-range index is a caller error.
+class IndexOutOfRangeException : public CoreException {
+public:
+    IndexOutOfRangeException(const std::string& message,
+                             const char* file = "",
+                             int line = 0,
+                             const char* function = "")
+        : CoreException(message, StatusCode::InvalidArgument, file, line, function)
+    {
+    }
+};
+
 inline void ExceptionRuntime::install_terminate_handler()
 {
     std::set_terminate([]() {
@@ -366,6 +398,12 @@ inline void ExceptionRuntime::install_terminate_handler()
     });
 }
 
+/**
+ * @brief Construct and throw @p ExceptionT, appending the source location.
+ *
+ * @details The exception type is given explicitly; @p args are forwarded to the
+ * exception constructor ahead of the file/line/function from @p location.
+ */
 template <class ExceptionT, class... Args>
 [[noreturn]] void raise(SourceLocation location, Args&&... args)
 {
@@ -373,6 +411,13 @@ template <class ExceptionT, class... Args>
                      location.function);
 }
 
+/**
+ * @brief Raise @p ExceptionT when @p condition is false (a required condition).
+ *
+ * @details Use for general invariants and postconditions. check_arg() is the same
+ * check with a name that documents argument/precondition validation at the call
+ * site; throw_if() is the logical inverse (it raises when its condition is true).
+ */
 template <class ExceptionT, class... Args>
 void check(bool condition, SourceLocation location, Args&&... args)
 {
@@ -381,6 +426,12 @@ void check(bool condition, SourceLocation location, Args&&... args)
     }
 }
 
+/**
+ * @brief Raise @p ExceptionT when an argument @p condition is false.
+ *
+ * @details Behaves exactly like check(); the distinct name documents at the call
+ * site that the condition validates a function argument/precondition.
+ */
 template <class ExceptionT, class... Args>
 void check_arg(bool condition, SourceLocation location, Args&&... args)
 {
@@ -389,6 +440,9 @@ void check_arg(bool condition, SourceLocation location, Args&&... args)
     }
 }
 
+/**
+ * @brief Raise @p ExceptionT when @p ptr is null.
+ */
 template <class ExceptionT, class PointerT, class... Args>
 void check_not_null(PointerT ptr, SourceLocation location, Args&&... args)
 {
@@ -397,6 +451,13 @@ void check_not_null(PointerT ptr, SourceLocation location, Args&&... args)
     }
 }
 
+/**
+ * @brief Raise @p ExceptionT when @p condition is true.
+ *
+ * @details The logical inverse of check(): check() raises when its condition is
+ * false (a required condition); throw_if() raises when its condition is true (a
+ * failure condition). The two are not interchangeable.
+ */
 template <class ExceptionT, class... Args>
 void throw_if(bool condition, SourceLocation location, Args&&... args)
 {
@@ -405,7 +466,14 @@ void throw_if(bool condition, SourceLocation location, Args&&... args)
     }
 }
 
-template <class ExceptionT, class IndexT, class SizeT>
+/**
+ * @brief Raise an exception when @p index is outside [0, @p size).
+ *
+ * @details @p ExceptionT defaults to IndexOutOfRangeException; supply a different
+ * type only when a subsystem needs its own exception. The bounds message is
+ * generated automatically.
+ */
+template <class ExceptionT = IndexOutOfRangeException, class IndexT, class SizeT>
 void check_index(IndexT index, SizeT size, SourceLocation location)
 {
     const long long index_value = static_cast<long long>(index);
@@ -417,13 +485,27 @@ void check_index(IndexT index, SizeT size, SourceLocation location)
             std::to_string(size_value) + ")");
 }
 
-template <class ExceptionT, class... Args>
+/**
+ * @brief Raise an exception reporting an unimplemented feature.
+ *
+ * @details @p ExceptionT defaults to NotImplementedException. This overload
+ * forwards @p args to the exception constructor; prefer the (feature, location)
+ * overload below for the common case of a single feature-name string.
+ */
+template <class ExceptionT = NotImplementedException, class... Args>
 [[noreturn]] void not_implemented(SourceLocation location, Args&&... args)
 {
     raise<ExceptionT>(location, std::forward<Args>(args)...);
 }
 
-template <class ExceptionT, class FeatureT>
+/**
+ * @brief Raise an exception reporting an unimplemented @p feature.
+ *
+ * @details Convenience for the common case where the only argument is a
+ * feature-name string (note the feature-first parameter order). @p ExceptionT
+ * defaults to NotImplementedException.
+ */
+template <class ExceptionT = NotImplementedException, class FeatureT>
 [[noreturn]] void not_implemented(FeatureT&& feature, SourceLocation location)
 {
     raise<ExceptionT>(location, std::forward<FeatureT>(feature));

From 7003bf3d66207135348731f4c3d932a5d4558aa4 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Fri, 26 Jun 2026 10:24:42 -0700
Subject: [PATCH 91/91] simplifying exception parameters and eliminating
 redundancies in exception infrastructure

---
 Code/Source/solver/Core/Exception.h           | 254 +++++++++++-------
 Code/Source/solver/FE/Basis/BasisExceptions.h |  59 +---
 Code/Source/solver/FE/Basis/BasisFactory.cpp  |  24 +-
 Code/Source/solver/FE/Basis/BasisFunction.cpp |   9 +-
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp |  36 +--
 .../FE/Basis/NodeOrderingConventions.cpp      |  70 ++---
 .../solver/FE/Basis/SerendipityBasis.cpp      |  49 ++--
 Code/Source/solver/FE/Common/FEException.h    | 200 +++-----------
 .../solver/FE/Math/DenseLinearAlgebra.cpp     |  70 ++---
 Code/Source/solver/Parameters.cpp             | 227 ++++++++--------
 Code/Source/solver/Parameters.h               |  12 +-
 Code/Source/solver/cep_ion.cpp                |   2 +-
 Code/Source/solver/fs.cpp                     |   2 +-
 Code/Source/solver/ionic_model.cpp            |   6 +-
 Code/Source/solver/ionic_model.h              |   3 +-
 Code/Source/solver/mat_fun.h                  |   2 +-
 Code/Source/solver/nn.cpp                     |  87 ++----
 Code/Source/solver/post.cpp                   |   2 -
 Code/Source/solver/read_files.cpp             |   4 +-
 Documentation/Doxyfile                        |   6 +-
 .../FE/Basis/test_BasisErrorPaths.cpp         |  27 +-
 21 files changed, 464 insertions(+), 687 deletions(-)

diff --git a/Code/Source/solver/Core/Exception.h b/Code/Source/solver/Core/Exception.h
index 7cde41b97..0c197c8bc 100644
--- a/Code/Source/solver/Core/Exception.h
+++ b/Code/Source/solver/Core/Exception.h
@@ -11,6 +11,7 @@
 #include <sstream>
 #include <string>
 #include <string_view>
+#include <type_traits>
 #include <utility>
 #include <vector>
 
@@ -281,12 +282,23 @@ class ExceptionBase : public std::exception {
         rebuild_what();
     }
 
+    /// @brief Record the originating source location and refresh what().
+    ///
+    /// @details Called by raise() after construction, so exception constructors
+    /// do not need to accept (and forward) the file/line/function themselves.
+    void set_source_location(const SourceLocation& location)
+    {
+        context_.set_source_location(location.file, location.line,
+                                     location.function);
+        rebuild_what();
+    }
+
     virtual ~ExceptionBase() noexcept = default;
 
 protected:
     ExceptionBase(std::string message, StatusCode status,
-                  std::string_view subsystem_label, const char* file,
-                  int line, const char* function)
+                  std::string_view subsystem_label, const char* file = "",
+                  int line = 0, const char* function = "")
         : message_(std::move(message)),
           subsystem_label_(subsystem_label.empty() ? std::string_view("Exception")
                                                    : subsystem_label)
@@ -323,60 +335,51 @@ class CoreException : public ExceptionBase {
     }
 };
 
-class ParseException : public CoreException {
-public:
-    ParseException(const std::string& message,
-                   const char* file = "",
-                   int line = 0,
-                   const char* function = "")
-        : CoreException(message, StatusCode::ParseError, file, line, function)
-    {
+/**
+ * @brief Define a simple, message-only exception type in one line.
+ *
+ * @details Expands to a class @p Name deriving from @p Base with a single
+ * `explicit Name(const std::string& message)` constructor that records @p Status.
+ * Use it for exceptions that carry only a message; write the class by hand when it
+ * needs extra structured context (members and accessors). The source location is
+ * stamped by raise(), so no file/line/function constructor is needed.
+ *
+ * @p Base must be an ExceptionBase-derived type whose constructor accepts
+ * `(const std::string&, StatusCode)` (CoreException, FEException, and the
+ * subsystem bases do).
+ *
+ * @code
+ * SVMP_DEFINE_EXCEPTION(ParseException, CoreException, StatusCode::ParseError);
+ * @endcode
+ */
+#define SVMP_DEFINE_EXCEPTION(Name, Base, Status)                              \
+    class Name : public Base {                                                 \
+    public:                                                                    \
+        explicit Name(const std::string& message)                             \
+            : Base(message, (Status))                                          \
+        {                                                                      \
+        }                                                                      \
     }
-};
 
-class DependencyException : public CoreException {
-public:
-    DependencyException(const std::string& message,
-                        const char* file = "",
-                        int line = 0,
-                        const char* function = "")
-        : CoreException(message, StatusCode::DependencyError, file, line,
-                        function)
-    {
-    }
-};
+/// @brief A parsing or input-format error.
+SVMP_DEFINE_EXCEPTION(ParseException, CoreException, StatusCode::ParseError);
+
+/// @brief A required dependency is missing or failed to load.
+SVMP_DEFINE_EXCEPTION(DependencyException, CoreException,
+                      StatusCode::DependencyError);
 
 /// @brief A requested operation or feature is not implemented.
 ///
-/// This is the default exception raised by not_implemented(). Subsystems that
-/// need their not-implemented errors to be catchable as their own base type may
-/// define a more specific exception (for example, svmp::FE::NotImplementedException
-/// derives from FEException); pass it explicitly to not_implemented<...>().
-class NotImplementedException : public CoreException {
-public:
-    NotImplementedException(const std::string& message,
-                            const char* file = "",
-                            int line = 0,
-                            const char* function = "")
-        : CoreException(message, StatusCode::NotImplemented, file, line, function)
-    {
-    }
-};
+/// @details The default exception raised by not_implemented().
+SVMP_DEFINE_EXCEPTION(NotImplementedException, CoreException,
+                      StatusCode::NotImplemented);
 
 /// @brief An index is outside its valid range.
 ///
-/// This is the default exception raised by check_index(). The status code is
+/// @details The default exception raised by check_index(); the status code is
 /// InvalidArgument because an out-of-range index is a caller error.
-class IndexOutOfRangeException : public CoreException {
-public:
-    IndexOutOfRangeException(const std::string& message,
-                             const char* file = "",
-                             int line = 0,
-                             const char* function = "")
-        : CoreException(message, StatusCode::InvalidArgument, file, line, function)
-    {
-    }
-};
+SVMP_DEFINE_EXCEPTION(IndexOutOfRangeException, CoreException,
+                      StatusCode::InvalidArgument);
 
 inline void ExceptionRuntime::install_terminate_handler()
 {
@@ -399,44 +402,97 @@ inline void ExceptionRuntime::install_terminate_handler()
 }
 
 /**
- * @brief Construct and throw @p ExceptionT, appending the source location.
+ * @brief A diagnostic message bundled with the source location where it was written.
  *
- * @details The exception type is given explicitly; @p args are forwarded to the
- * exception constructor ahead of the file/line/function from @p location.
+ * @details The core helpers (raise(), check(), throw_if(), check_not_null()) take
+ * a Diagnostic in place of an explicit source location. Its file/line/function
+ * arguments default to the compiler builtins __builtin_FILE()/__builtin_LINE()/
+ * __builtin_FUNCTION(), which capture the caller's location, so a string literal
+ * or std::string passed at the call site is implicitly wrapped into a Diagnostic
+ * that records exactly where the call appears -- callers do not pass SVMP_HERE:
+ * @code
+ * svmp::check<MyException>(ptr != nullptr, "pointer must not be null");
+ * @endcode
  */
-template <class ExceptionT, class... Args>
-[[noreturn]] void raise(SourceLocation location, Args&&... args)
-{
-    throw ExceptionT(std::forward<Args>(args)..., location.file, location.line,
-                     location.function);
-}
+class Diagnostic {
+public:
+    /**
+     * @brief Wrap a message, capturing the caller's source location by default.
+     * @param message The diagnostic message.
+     * @param file Source file; defaults to the caller's via __builtin_FILE().
+     * @param line Source line; defaults to the caller's via __builtin_LINE().
+     * @param function Function; defaults to the caller's via __builtin_FUNCTION().
+     */
+    Diagnostic(const char* message,
+               const char* file = __builtin_FILE(),
+               int line = __builtin_LINE(),
+               const char* function = __builtin_FUNCTION())
+        : message_(message), location_{file, line, function}
+    {
+    }
+    /**
+     * @brief Wrap a message, capturing the caller's source location by default.
+     * @param message The diagnostic message.
+     * @param file Source file; defaults to the caller's via __builtin_FILE().
+     * @param line Source line; defaults to the caller's via __builtin_LINE().
+     * @param function Function; defaults to the caller's via __builtin_FUNCTION().
+     */
+    Diagnostic(std::string message,
+               const char* file = __builtin_FILE(),
+               int line = __builtin_LINE(),
+               const char* function = __builtin_FUNCTION())
+        : message_(std::move(message)), location_{file, line, function}
+    {
+    }
+
+    /**
+     * @brief The diagnostic message.
+     * @return The stored message.
+     */
+    const std::string& message() const noexcept { return message_; }
+    /**
+     * @brief The source location captured when the Diagnostic was constructed.
+     * @return The stored source location.
+     */
+    const SourceLocation& location() const noexcept { return location_; }
+
+private:
+    std::string message_;
+    SourceLocation location_;
+};
 
 /**
- * @brief Raise @p ExceptionT when @p condition is false (a required condition).
+ * @brief Construct @p ExceptionT from the diagnostic message and @p args, stamp
+ * the source location, and throw it.
  *
- * @details Use for general invariants and postconditions. check_arg() is the same
- * check with a name that documents argument/precondition validation at the call
- * site; throw_if() is the logical inverse (it raises when its condition is true).
+ * @details @p diagnostic carries the message and the source location captured at
+ * the call site; @p args are forwarded to the exception constructor after the
+ * message. The location is recorded via ExceptionBase::set_source_location(), so
+ * exception types never need a file/line/function constructor -- a `(message)`
+ * (plus any structured-context) constructor is enough.
  */
 template <class ExceptionT, class... Args>
-void check(bool condition, SourceLocation location, Args&&... args)
+[[noreturn]] void raise(Diagnostic diagnostic, Args&&... args)
 {
-    if (!condition) {
-        raise<ExceptionT>(location, std::forward<Args>(args)...);
-    }
+    static_assert(std::is_base_of_v<ExceptionBase, ExceptionT>,
+                  "raise<>() requires an svmp::ExceptionBase-derived exception type");
+    ExceptionT exception(diagnostic.message(), std::forward<Args>(args)...);
+    exception.set_source_location(diagnostic.location());
+    throw exception;
 }
 
 /**
- * @brief Raise @p ExceptionT when an argument @p condition is false.
+ * @brief Raise @p ExceptionT when @p condition is false (a required condition).
  *
- * @details Behaves exactly like check(); the distinct name documents at the call
- * site that the condition validates a function argument/precondition.
+ * @details The general success-condition check, used for argument, state, and
+ * invariant validation: `check<E>(ptr != nullptr, "...")`. throw_if() is the
+ * logical inverse -- it raises when its condition is true.
  */
 template <class ExceptionT, class... Args>
-void check_arg(bool condition, SourceLocation location, Args&&... args)
+void check(bool condition, Diagnostic diagnostic, Args&&... args)
 {
     if (!condition) {
-        raise<ExceptionT>(location, std::forward<Args>(args)...);
+        raise<ExceptionT>(std::move(diagnostic), std::forward<Args>(args)...);
     }
 }
 
@@ -444,10 +500,10 @@ void check_arg(bool condition, SourceLocation location, Args&&... args)
  * @brief Raise @p ExceptionT when @p ptr is null.
  */
 template <class ExceptionT, class PointerT, class... Args>
-void check_not_null(PointerT ptr, SourceLocation location, Args&&... args)
+void check_not_null(PointerT ptr, Diagnostic diagnostic, Args&&... args)
 {
     if (ptr == nullptr) {
-        raise<ExceptionT>(location, std::forward<Args>(args)...);
+        raise<ExceptionT>(std::move(diagnostic), std::forward<Args>(args)...);
     }
 }
 
@@ -459,10 +515,10 @@ void check_not_null(PointerT ptr, SourceLocation location, Args&&... args)
  * failure condition). The two are not interchangeable.
  */
 template <class ExceptionT, class... Args>
-void throw_if(bool condition, SourceLocation location, Args&&... args)
+void throw_if(bool condition, Diagnostic diagnostic, Args&&... args)
 {
     if (condition) {
-        raise<ExceptionT>(location, std::forward<Args>(args)...);
+        raise<ExceptionT>(std::move(diagnostic), std::forward<Args>(args)...);
     }
 }
 
@@ -470,45 +526,41 @@ void throw_if(bool condition, SourceLocation location, Args&&... args)
  * @brief Raise an exception when @p index is outside [0, @p size).
  *
  * @details @p ExceptionT defaults to IndexOutOfRangeException; supply a different
- * type only when a subsystem needs its own exception. The bounds message is
- * generated automatically.
+ * type only when a subsystem needs its own exception. The bounds message and the
+ * source location are generated automatically.
  */
 template <class ExceptionT = IndexOutOfRangeException, class IndexT, class SizeT>
-void check_index(IndexT index, SizeT size, SourceLocation location)
+void check_index(IndexT index, SizeT size,
+                 const char* file = __builtin_FILE(),
+                 int line = __builtin_LINE(),
+                 const char* function = __builtin_FUNCTION())
 {
     const long long index_value = static_cast<long long>(index);
     const long long size_value = static_cast<long long>(size);
-    check_arg<ExceptionT>(
+    check<ExceptionT>(
         index_value >= 0 && index_value < size_value,
-        location,
-        "Index " + std::to_string(index_value) + " out of bounds [0, " +
-            std::to_string(size_value) + ")");
-}
-
-/**
- * @brief Raise an exception reporting an unimplemented feature.
- *
- * @details @p ExceptionT defaults to NotImplementedException. This overload
- * forwards @p args to the exception constructor; prefer the (feature, location)
- * overload below for the common case of a single feature-name string.
- */
-template <class ExceptionT = NotImplementedException, class... Args>
-[[noreturn]] void not_implemented(SourceLocation location, Args&&... args)
-{
-    raise<ExceptionT>(location, std::forward<Args>(args)...);
+        Diagnostic("Index " + std::to_string(index_value) + " out of bounds [0, " +
+                       std::to_string(size_value) + ")",
+                   file, line, function));
 }
 
 /**
- * @brief Raise an exception reporting an unimplemented @p feature.
+ * @brief Raise an exception reporting an unimplemented feature, with a message.
  *
- * @details Convenience for the common case where the only argument is a
- * feature-name string (note the feature-first parameter order). @p ExceptionT
- * defaults to NotImplementedException.
+ * @details @p ExceptionT defaults to NotImplementedException, so most call sites
+ * pass only a message:
+ * @code
+ * svmp::not_implemented("GPU assembly is not supported");
+ * @endcode
+ * Pass a different exception type explicitly only when a subsystem needs one.
  */
-template <class ExceptionT = NotImplementedException, class FeatureT>
-[[noreturn]] void not_implemented(FeatureT&& feature, SourceLocation location)
+template <class ExceptionT = NotImplementedException>
+[[noreturn]] void not_implemented(std::string message,
+                                  const char* file = __builtin_FILE(),
+                                  int line = __builtin_LINE(),
+                                  const char* function = __builtin_FUNCTION())
 {
-    raise<ExceptionT>(location, std::forward<FeatureT>(feature));
+    raise<ExceptionT>(Diagnostic(std::move(message), file, line, function));
 }
 
 } // namespace svmp
@@ -519,7 +571,7 @@ template <class ExceptionT = NotImplementedException, class FeatureT>
 #define SVMP_DEBUG_CHECK(ExceptionT, condition, ...)                         \
     do {                                                                     \
         if (!(condition)) {                                                  \
-            ::svmp::raise<ExceptionT>(SVMP_HERE, __VA_ARGS__);               \
+            ::svmp::raise<ExceptionT>(__VA_ARGS__);                          \
         }                                                                    \
     } while (false)
 #else
diff --git a/Code/Source/solver/FE/Basis/BasisExceptions.h b/Code/Source/solver/FE/Basis/BasisExceptions.h
index e32a60c66..cc8680c7a 100644
--- a/Code/Source/solver/FE/Basis/BasisExceptions.h
+++ b/Code/Source/solver/FE/Basis/BasisExceptions.h
@@ -34,12 +34,9 @@ namespace basis {
  */
 class BasisException : public FEException {
 public:
-    BasisException(const std::string& message,
-                   const char* file,
-                   int line,
-                   const char* function,
-                   StatusCode status = StatusCode::Unknown)
-        : FEException(message, status, file, line, function) {}
+    explicit BasisException(const std::string& message,
+                            StatusCode status = StatusCode::Unknown)
+        : FEException(message, status) {}
 };
 
 /**
@@ -52,14 +49,8 @@ class BasisException : public FEException {
  * constructing a LagrangeBasis for Tetra10 at order 1, when that layout is fixed
  * at order 2.
  */
-class BasisConfigurationException : public BasisException {
-public:
-    BasisConfigurationException(const std::string& message,
-                                const char* file,
-                                int line,
-                                const char* function)
-        : BasisException(message, file, line, function, StatusCode::InvalidArgument) {}
-};
+SVMP_DEFINE_EXCEPTION(BasisConfigurationException, BasisException,
+                      StatusCode::InvalidArgument);
 
 /**
  * @brief Requested element topology is incompatible with the basis family
@@ -69,14 +60,8 @@ class BasisConfigurationException : public BasisException {
  * topology path (only the named Wedge15 layout is supported), or requesting a
  * basis on ElementType::Unknown.
  */
-class BasisElementCompatibilityException : public BasisException {
-public:
-    BasisElementCompatibilityException(const std::string& message,
-                                       const char* file,
-                                       int line,
-                                       const char* function)
-        : BasisException(message, file, line, function, StatusCode::InvalidArgument) {}
-};
+SVMP_DEFINE_EXCEPTION(BasisElementCompatibilityException, BasisException,
+                      StatusCode::InvalidArgument);
 
 /**
  * @brief Basis evaluation request cannot be satisfied
@@ -85,14 +70,8 @@ class BasisElementCompatibilityException : public BasisException {
  * output span smaller than size(), or requesting analytical gradients or Hessians
  * from a basis that does not provide them.
  */
-class BasisEvaluationException : public BasisException {
-public:
-    BasisEvaluationException(const std::string& message,
-                             const char* file,
-                             int line,
-                             const char* function)
-        : BasisException(message, file, line, function, StatusCode::InvalidArgument) {}
-};
+SVMP_DEFINE_EXCEPTION(BasisEvaluationException, BasisException,
+                      StatusCode::InvalidArgument);
 
 /**
  * @brief Public-to-canonical node ordering or coordinate lookup failure
@@ -101,14 +80,8 @@ class BasisEvaluationException : public BasisException {
  * reference layout. Example: requesting a tensor-axis node index outside
  * [0, order] from line_coord_pm_one.
  */
-class BasisNodeOrderingException : public BasisException {
-public:
-    BasisNodeOrderingException(const std::string& message,
-                               const char* file,
-                               int line,
-                               const char* function)
-        : BasisException(message, file, line, function, StatusCode::InvalidArgument) {}
-};
+SVMP_DEFINE_EXCEPTION(BasisNodeOrderingException, BasisException,
+                      StatusCode::InvalidArgument);
 
 /**
  * @brief Internal basis construction or transform setup failure
@@ -117,14 +90,8 @@ class BasisNodeOrderingException : public BasisException {
  * InternalError) rather than bad user input. Example: a generated Lagrange node
  * lattice whose index components fall outside [0, order] in get_lagrange_lattice.
  */
-class BasisConstructionException : public BasisException {
-public:
-    BasisConstructionException(const std::string& message,
-                               const char* file,
-                               int line,
-                               const char* function)
-        : BasisException(message, file, line, function, StatusCode::InternalError) {}
-};
+SVMP_DEFINE_EXCEPTION(BasisConstructionException, BasisException,
+                      StatusCode::InternalError);
 
 /** @} */
 
diff --git a/Code/Source/solver/FE/Basis/BasisFactory.cpp b/Code/Source/solver/FE/Basis/BasisFactory.cpp
index 79841ae13..11581b454 100644
--- a/Code/Source/solver/FE/Basis/BasisFactory.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFactory.cpp
@@ -21,10 +21,8 @@ enum class RequestTarget {
 int require_basis_order(const BasisRequest& req,
                         const char* missing_message,
                         const char* negative_message) {
-    svmp::throw_if<BasisConfigurationException>(!req.order.has_value(), SVMP_HERE,
-                                              missing_message);
-    svmp::throw_if<BasisConfigurationException>(*req.order < 0, SVMP_HERE,
-                                              negative_message);
+    svmp::throw_if<BasisConfigurationException>(!req.order.has_value(), missing_message);
+    svmp::throw_if<BasisConfigurationException>(*req.order < 0, negative_message);
     return *req.order;
 }
 
@@ -32,21 +30,17 @@ RequestTarget require_single_request_target(const BasisRequest& req) {
     const bool has_named_element = req.element_type != ElementType::Unknown;
     const bool has_topology = req.topology != BasisTopology::Unknown;
     svmp::throw_if<BasisConfigurationException>(
-        !has_named_element && !has_topology, SVMP_HERE,
-        "BasisFactory: request must specify either a named element_type or a reference topology");
+        !has_named_element && !has_topology, "BasisFactory: request must specify either a named element_type or a reference topology");
     svmp::throw_if<BasisConfigurationException>(
-        has_named_element && has_topology, SVMP_HERE,
-        "BasisFactory: request must specify element_type or topology, not both");
+        has_named_element && has_topology, "BasisFactory: request must specify element_type or topology, not both");
     return has_topology ? RequestTarget::Topology : RequestTarget::NamedElement;
 }
 
 void require_scalar_c0_request(const BasisRequest& req) {
     svmp::throw_if<BasisConfigurationException>(
-        req.field_type != FieldType::Scalar, SVMP_HERE,
-        "BasisFactory: Lagrange/Serendipity bases support scalar fields only");
+        req.field_type != FieldType::Scalar, "BasisFactory: Lagrange/Serendipity bases support scalar fields only");
     svmp::throw_if<BasisConfigurationException>(
-        req.continuity != Continuity::C0, SVMP_HERE,
-        "BasisFactory: Lagrange/Serendipity bases support C0 continuity only");
+        req.continuity != Continuity::C0, "BasisFactory: Lagrange/Serendipity bases support C0 continuity only");
 }
 
 std::unique_ptr<BasisFunction> create_lagrange(const BasisRequest& req) {
@@ -84,8 +78,7 @@ std::unique_ptr<BasisFunction> create(const BasisRequest& req) {
         case BasisType::Serendipity:
             return create_serendipity(req);
         default:
-            svmp::raise<BasisConfigurationException>(SVMP_HERE,
-                "BasisFactory: requested basis family is outside the scalar Lagrange/Serendipity scope");
+            svmp::raise<BasisConfigurationException>("BasisFactory: requested basis family is outside the scalar Lagrange/Serendipity scope");
     }
 }
 
@@ -104,8 +97,7 @@ BasisRequest default_basis_request(ElementType element_type) {
             if (order >= 0) {
                 return BasisRequest{element_type, BasisType::Lagrange, order};
             }
-            svmp::raise<BasisElementCompatibilityException>(SVMP_HERE,
-                "BasisFactory: no default basis is defined for the requested element type");
+            svmp::raise<BasisElementCompatibilityException>("BasisFactory: no default basis is defined for the requested element type");
         }
     }
 }
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.cpp b/Code/Source/solver/FE/Basis/BasisFunction.cpp
index 9939fbfa6..982b4c4c1 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFunction.cpp
@@ -13,8 +13,7 @@ namespace basis {
 void require_span_size(std::size_t actual,
                        std::size_t expected,
                        const char* label) {
-    svmp::throw_if<BasisEvaluationException>(actual < expected, SVMP_HERE,
-        std::string(label) + ": output span is smaller than basis size");
+    svmp::throw_if<BasisEvaluationException>(actual < expected, std::string(label) + ": output span is smaller than basis size");
 }
 
 const std::vector<math::Vector<double, 3>>& BasisFunction::nodes() const noexcept {
@@ -65,16 +64,14 @@ void BasisFunction::evaluate_gradients_to(const math::Vector<double, 3>& xi,
                                           std::span<Gradient> gradients_out) const {
     (void)xi;
     (void)gradients_out;
-    svmp::raise<BasisEvaluationException>(SVMP_HERE,
-        "Analytic gradient evaluation is not implemented for this basis");
+    svmp::raise<BasisEvaluationException>("Analytic gradient evaluation is not implemented for this basis");
 }
 
 void BasisFunction::evaluate_hessians_to(const math::Vector<double, 3>& xi,
                                          std::span<Hessian> hessians_out) const {
     (void)xi;
     (void)hessians_out;
-    svmp::raise<BasisEvaluationException>(SVMP_HERE,
-        "Analytic Hessian evaluation is not implemented for this basis");
+    svmp::raise<BasisEvaluationException>("Analytic Hessian evaluation is not implemented for this basis");
 }
 
 // Combined evaluator default: forward each requested (non-empty) quantity to its
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index 83c27eb90..58925171a 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -41,30 +41,24 @@ struct SimplexEval {
 BasisTopology validated_lagrange_topology(ElementType element_type, int order) {
     switch (element_type) {
         case ElementType::Quad8:
-            svmp::raise<BasisElementCompatibilityException>(SVMP_HERE,
-                "LagrangeBasis: Quad8 is serendipity; use SerendipityBasis");
+            svmp::raise<BasisElementCompatibilityException>("LagrangeBasis: Quad8 is serendipity; use SerendipityBasis");
         case ElementType::Hex20:
-            svmp::raise<BasisElementCompatibilityException>(SVMP_HERE,
-                "LagrangeBasis: Hex20 is serendipity; use SerendipityBasis");
+            svmp::raise<BasisElementCompatibilityException>("LagrangeBasis: Hex20 is serendipity; use SerendipityBasis");
         case ElementType::Wedge15:
-            svmp::raise<BasisElementCompatibilityException>(SVMP_HERE,
-                "LagrangeBasis: Wedge15 is serendipity; use SerendipityBasis");
+            svmp::raise<BasisElementCompatibilityException>("LagrangeBasis: Wedge15 is serendipity; use SerendipityBasis");
         case ElementType::Pyramid5:
         case ElementType::Pyramid13:
         case ElementType::Pyramid14:
-            svmp::raise<BasisElementCompatibilityException>(SVMP_HERE,
-                "LagrangeBasis: pyramid support is not within the current solver basis scope");
+            svmp::raise<BasisElementCompatibilityException>("LagrangeBasis: pyramid support is not within the current solver basis scope");
         default:
             break;
     }
 
     const BasisTopology top = topology(element_type);
-    svmp::throw_if<BasisElementCompatibilityException>(top == BasisTopology::Unknown, SVMP_HERE,
-                                                     "LagrangeBasis: unsupported element type");
+    svmp::throw_if<BasisElementCompatibilityException>(top == BasisTopology::Unknown, "LagrangeBasis: unsupported element type");
 
     const int baked_order = named_lagrange_order(element_type);
-    svmp::throw_if<BasisConfigurationException>(order != baked_order, SVMP_HERE,
-        "LagrangeBasis: a named element layout has a fixed polynomial order; request the matching "
+    svmp::throw_if<BasisConfigurationException>(order != baked_order, "LagrangeBasis: a named element layout has a fixed polynomial order; request the matching "
         "BasisTopology with an explicit order to choose a different order");
     return top;
 }
@@ -257,13 +251,10 @@ void evaluate_simplex(const Vec3& xi,
 
 LagrangeBasis::LagrangeBasis(BasisTopology topology, int order)
     : topology_(topology), order_(order) {
-    svmp::throw_if<BasisElementCompatibilityException>(topology_ == BasisTopology::Unknown, SVMP_HERE,
-                                                     "LagrangeBasis: unknown reference topology");
-    svmp::throw_if<BasisConfigurationException>(order_ < 0, SVMP_HERE,
-                                              "LagrangeBasis requires non-negative polynomial order");
+    svmp::throw_if<BasisElementCompatibilityException>(topology_ == BasisTopology::Unknown, "LagrangeBasis: unknown reference topology");
+    svmp::throw_if<BasisConfigurationException>(order_ < 0, "LagrangeBasis requires non-negative polynomial order");
     svmp::throw_if<BasisConfigurationException>(
-        topology_ == BasisTopology::Point && order_ != 0, SVMP_HERE,
-        "LagrangeBasis: Point topology supports order 0 only");
+        topology_ == BasisTopology::Point && order_ != 0, "LagrangeBasis: Point topology supports order 0 only");
     dimension_ = topology_dimension(topology_);
     init_nodes();
 }
@@ -327,8 +318,7 @@ void LagrangeBasis::init_nodes() {
             break;
     }
 
-    svmp::raise<BasisElementCompatibilityException>(SVMP_HERE,
-        "Unsupported element type in LagrangeBasis::init_nodes");
+    svmp::raise<BasisElementCompatibilityException>("Unsupported element type in LagrangeBasis::init_nodes");
 }
 
 // Build the single reference node for a point basis.
@@ -395,8 +385,7 @@ void LagrangeBasis::build_wedge_nodes() {
     for (const auto& idx : layout.lattice) {
         const int tri_ordinal =
             tri_ordinal_for_key[static_cast<std::size_t>(idx[0] * stride + idx[1])];
-        svmp::throw_if<BasisConstructionException>(tri_ordinal < 0, SVMP_HERE,
-                                                 "LagrangeBasis: wedge node triangle index lookup failed");
+        svmp::throw_if<BasisConstructionException>(tri_ordinal < 0, "LagrangeBasis: wedge node triangle index lookup failed");
         wedge_indices_.push_back({static_cast<std::size_t>(tri_ordinal),
                                   static_cast<std::size_t>(idx[2])});
     }
@@ -585,8 +574,7 @@ void LagrangeBasis::evaluate_all_to(const Vec3& xi,
             break;
     }
 
-    svmp::raise<BasisEvaluationException>(SVMP_HERE,
-        "Unsupported element in LagrangeBasis evaluation");
+    svmp::raise<BasisEvaluationException>("Unsupported element in LagrangeBasis evaluation");
 }
 
 void LagrangeBasis::evaluate_values_to(const Vec3& xi,
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
index 0b4796db6..856406de3 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
@@ -86,8 +86,7 @@ const std::vector<double>& gll_points(int order) {
             }
         }
         svmp::throw_if<BasisConstructionException>(
-            !converged, SVMP_HERE,
-            "ReferenceNodeLayout: Gauss-Lobatto-Legendre Newton iteration did not converge "
+            !converged, "ReferenceNodeLayout: Gauss-Lobatto-Legendre Newton iteration did not converge "
             "(order outside the trustworthy range)");
         pts[static_cast<std::size_t>(j)] = x;
     }
@@ -115,8 +114,7 @@ Lattice lerp_lattice(const Lattice& a, const Lattice& b, int m, int order) {
     for (std::size_t d = 0; d < 3u; ++d) {
         const int numerator = a[d] * (order - m) + b[d] * m;
         svmp::throw_if<BasisConstructionException>(
-            numerator % order != 0, SVMP_HERE,
-            "ReferenceNodeLayout: non-integral edge lattice index");
+            numerator % order != 0, "ReferenceNodeLayout: non-integral edge lattice index");
         result[d] = numerator / order;
     }
     return result;
@@ -131,8 +129,7 @@ Lattice combine_lattice(const Lattice& l0, const Lattice& l1, const Lattice& l2,
     for (std::size_t d = 0; d < 3u; ++d) {
         const int numerator = a * l0[d] + b * l1[d] + c * l2[d];
         svmp::throw_if<BasisConstructionException>(
-            numerator % order != 0, SVMP_HERE,
-            "ReferenceNodeLayout: non-integral face-interior lattice index");
+            numerator % order != 0, "ReferenceNodeLayout: non-integral face-interior lattice index");
         result[d] = numerator / order;
     }
     return result;
@@ -545,8 +542,7 @@ LagrangeNodeLayout generate_wedge_nodes(int order) {
 }
 
 LagrangeNodeLayout complete_lagrange_nodes(ElementType canonical_type, int order) {
-    svmp::throw_if<BasisConfigurationException>(order < 0, SVMP_HERE,
-                                              "ReferenceNodeLayout requires non-negative Lagrange order");
+    svmp::throw_if<BasisConfigurationException>(order < 0, "ReferenceNodeLayout requires non-negative Lagrange order");
     const ElementType type = canonical_lagrange_type(canonical_type);
     switch (type) {
         case ElementType::Point1: {
@@ -568,11 +564,9 @@ LagrangeNodeLayout complete_lagrange_nodes(ElementType canonical_type, int order
         case ElementType::Wedge6:
             return generate_wedge_nodes(order);
         case ElementType::Pyramid5:
-            svmp::raise<BasisNodeOrderingException>(SVMP_HERE,
-                "ReferenceNodeLayout: pyramid node ordering is disabled");
+            svmp::raise<BasisNodeOrderingException>("ReferenceNodeLayout: pyramid node ordering is disabled");
         default:
-            svmp::raise<BasisNodeOrderingException>(SVMP_HERE,
-                "ReferenceNodeLayout: unsupported Lagrange topology");
+            svmp::raise<BasisNodeOrderingException>("ReferenceNodeLayout: unsupported Lagrange topology");
     }
 }
 
@@ -610,19 +604,16 @@ std::vector<Point> serendipity_subset_nodes(LagrangeNodeLayout complete,
     svmp::throw_if<BasisConstructionException>(
         complete.coords.size() != complete_count ||
             complete.lattice.size() != complete_count,
-        SVMP_HERE,
         "ReferenceNodeLayout: unexpected complete-quadratic node count for serendipity layout");
     svmp::throw_if<BasisConstructionException>(
-        keep_count >= complete_count, SVMP_HERE,
-        "ReferenceNodeLayout: serendipity node count must be smaller than the complete layout");
+        keep_count >= complete_count, "ReferenceNodeLayout: serendipity node count must be smaller than the complete layout");
 
     for (std::size_t n = 0; n < complete.lattice.size(); ++n) {
         const bool on_skeleton =
             wedge_interior_dim(complete.lattice[n], kQuadraticOrder) <= 1;
         const bool kept = n < keep_count;
         svmp::throw_if<BasisConstructionException>(
-            kept != on_skeleton, SVMP_HERE,
-            "ReferenceNodeLayout: serendipity truncation does not separate skeleton nodes from interior nodes");
+            kept != on_skeleton, "ReferenceNodeLayout: serendipity truncation does not separate skeleton nodes from interior nodes");
     }
 
     std::vector<Point> nodes = std::move(complete.coords);
@@ -678,8 +669,7 @@ std::vector<Point> quad_serendipity_nodes(int order) {
     std::vector<Point> nodes = generate_quad_nodes(order).coords;
     const std::size_t boundary_count = static_cast<std::size_t>(4 * order);
     svmp::throw_if<BasisConstructionException>(
-        boundary_count > nodes.size(), SVMP_HERE,
-        "ReferenceNodeLayout: quadrilateral serendipity skeleton exceeds the complete Lagrange layout");
+        boundary_count > nodes.size(), "ReferenceNodeLayout: quadrilateral serendipity skeleton exceeds the complete Lagrange layout");
     nodes.resize(boundary_count);
     append_quad_serendipity_interior_nodes(nodes, order);
     return nodes;
@@ -774,21 +764,18 @@ std::vector<Point> hex_serendipity_nodes(int order) {
     const std::size_t skeleton_count =
         8u + 12u * static_cast<std::size_t>(order - 1);
     svmp::throw_if<BasisConstructionException>(
-        skeleton_count > nodes.size(), SVMP_HERE,
-        "ReferenceNodeLayout: hexahedral serendipity skeleton exceeds the complete Lagrange layout");
+        skeleton_count > nodes.size(), "ReferenceNodeLayout: hexahedral serendipity skeleton exceeds the complete Lagrange layout");
     nodes.resize(skeleton_count);
 
     const std::size_t skeleton = nodes.size();
     append_hex_serendipity_face_interior_nodes(nodes, order);
     svmp::throw_if<BasisConstructionException>(
-        nodes.size() - skeleton != 6u * quad_serendipity_interior_count(order), SVMP_HERE,
-        "ReferenceNodeLayout: hexahedral serendipity face-interior node count mismatch");
+        nodes.size() - skeleton != 6u * quad_serendipity_interior_count(order), "ReferenceNodeLayout: hexahedral serendipity face-interior node count mismatch");
 
     const std::size_t before_volume = nodes.size();
     append_hex_serendipity_volume_interior_nodes(nodes, order);
     svmp::throw_if<BasisConstructionException>(
-        nodes.size() - before_volume != hex_serendipity_volume_interior_count(order), SVMP_HERE,
-        "ReferenceNodeLayout: hexahedral serendipity volume-interior node count mismatch");
+        nodes.size() - before_volume != hex_serendipity_volume_interior_count(order), "ReferenceNodeLayout: hexahedral serendipity volume-interior node count mismatch");
     return nodes;
 }
 
@@ -806,11 +793,9 @@ std::vector<Point> element_nodes(ElementType elem_type) {
         case ElementType::Wedge15:
             return serendipity_subset_nodes(generate_wedge_nodes(2), 15u, 18u);
         case ElementType::Pyramid13:
-            svmp::raise<BasisNodeOrderingException>(SVMP_HERE,
-                "ReferenceNodeLayout: pyramid node ordering is disabled");
+            svmp::raise<BasisNodeOrderingException>("ReferenceNodeLayout: pyramid node ordering is disabled");
         default:
-            svmp::raise<BasisNodeOrderingException>(SVMP_HERE,
-                "ReferenceNodeLayout: unknown element type");
+            svmp::raise<BasisNodeOrderingException>("ReferenceNodeLayout: unknown element type");
     }
 }
 
@@ -819,24 +804,20 @@ std::vector<Point> element_nodes(ElementType elem_type) {
 // guards with exact integer checks.
 void validate_lattice(const LagrangeNodeLayout& layout, ElementType type, int order) {
     svmp::throw_if<BasisConstructionException>(
-        layout.coords.size() != layout.lattice.size(), SVMP_HERE,
-        "ReferenceNodeLayout: lattice/coordinate count mismatch");
+        layout.coords.size() != layout.lattice.size(), "ReferenceNodeLayout: lattice/coordinate count mismatch");
 
     const BasisTopology top = topology(type);
     for (const auto& idx : layout.lattice) {
         for (std::size_t d = 0; d < 3u; ++d) {
             svmp::throw_if<BasisConstructionException>(
-                idx[d] < 0 || idx[d] > order, SVMP_HERE,
-                "ReferenceNodeLayout: lattice index outside [0, order]");
+                idx[d] < 0 || idx[d] > order, "ReferenceNodeLayout: lattice index outside [0, order]");
         }
         if (top == BasisTopology::Triangle || top == BasisTopology::Tetrahedron) {
             svmp::throw_if<BasisConstructionException>(
-                idx[0] + idx[1] + idx[2] > order, SVMP_HERE,
-                "ReferenceNodeLayout: simplex lattice index sum exceeds order");
+                idx[0] + idx[1] + idx[2] > order, "ReferenceNodeLayout: simplex lattice index sum exceeds order");
         } else if (top == BasisTopology::Wedge) {
             svmp::throw_if<BasisConstructionException>(
-                idx[0] + idx[1] > order, SVMP_HERE,
-                "ReferenceNodeLayout: wedge triangle lattice index sum exceeds order");
+                idx[0] + idx[1] > order, "ReferenceNodeLayout: wedge triangle lattice index sum exceeds order");
         }
     }
 }
@@ -846,21 +827,18 @@ void validate_lattice(const LagrangeNodeLayout& layout, ElementType type, int or
 double line_coord_pm_one(int i, int order) {
     if (order <= 0) {
         svmp::throw_if<BasisNodeOrderingException>(
-            i != 0, SVMP_HERE,
-            "ReferenceNodeLayout::line_coord_pm_one: node index out of range");
+            i != 0, "ReferenceNodeLayout::line_coord_pm_one: node index out of range");
         return double(0);
     }
     svmp::throw_if<BasisNodeOrderingException>(
-        i < 0 || i > order, SVMP_HERE,
-        "ReferenceNodeLayout::line_coord_pm_one: node index out of range");
+        i < 0 || i > order, "ReferenceNodeLayout::line_coord_pm_one: node index out of range");
     return gll_points(order)[static_cast<std::size_t>(i)];
 }
 
 math::Vector<double, 3> ReferenceNodeLayout::node_coord_at(ElementType elem_type,
                                                            std::size_t local_node) {
     const auto nodes = element_nodes(elem_type);
-    svmp::throw_if<BasisNodeOrderingException>(local_node >= nodes.size(), SVMP_HERE,
-                                             "ReferenceNodeLayout::node_coord_at: node index out of range");
+    svmp::throw_if<BasisNodeOrderingException>(local_node >= nodes.size(), "ReferenceNodeLayout::node_coord_at: node index out of range");
     return nodes[local_node];
 }
 
@@ -888,16 +866,14 @@ ReferenceNodeLayout::get_lagrange_lattice(ElementType canonical_type, int order)
 std::vector<math::Vector<double, 3>>
 ReferenceNodeLayout::serendipity_node_coords(BasisTopology topology, int order) {
     svmp::throw_if<BasisConstructionException>(
-        order < 1, SVMP_HERE,
-        "ReferenceNodeLayout::serendipity_node_coords requires a polynomial order >= 1");
+        order < 1, "ReferenceNodeLayout::serendipity_node_coords requires a polynomial order >= 1");
     switch (topology) {
         case BasisTopology::Quadrilateral:
             return quad_serendipity_nodes(order);
         case BasisTopology::Hexahedron:
             return hex_serendipity_nodes(order);
         default:
-            svmp::raise<BasisElementCompatibilityException>(SVMP_HERE,
-                "ReferenceNodeLayout::serendipity_node_coords: generated serendipity layouts "
+            svmp::raise<BasisElementCompatibilityException>("ReferenceNodeLayout::serendipity_node_coords: generated serendipity layouts "
                 "exist only for Quadrilateral and Hexahedron (Wedge15 is the fixed named layout)");
     }
 }
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index 0feac25d3..c0a638d6f 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -169,8 +169,7 @@ std::vector<double> build_inverse_vandermonde(
     int max_degree) {
     const std::size_t n = nodes.size();
     svmp::throw_if<BasisConstructionException>(
-        n == 0 || exponents.size() != n, SVMP_HERE,
-        "SerendipityBasis: invalid serendipity interpolation setup");
+        n == 0 || exponents.size() != n, "SerendipityBasis: invalid serendipity interpolation setup");
 
     std::vector<double> vandermonde(n * n, double(0));
     AxisTable tx;
@@ -213,15 +212,13 @@ std::vector<double> build_inverse_vandermonde(
             std::move(vandermonde), n,
             "SerendipityBasis interpolation matrix for " + label);
     } catch (const FEException&) {
-        svmp::raise<BasisConstructionException>(SVMP_HERE,
-            "SerendipityBasis: " + label +
+        svmp::raise<BasisConstructionException>("SerendipityBasis: " + label +
                 " interpolation matrix is singular; the serendipity node set is not "
                 "unisolvent at the requested order");
     }
     const double condition_number = norm_v * matrix_norm_inf(inverse, n);
     svmp::throw_if<BasisConstructionException>(
-        !(condition_number <= kSerendipityVandermondeMaxCond), SVMP_HERE,
-        "SerendipityBasis: " + label +
+        !(condition_number <= kSerendipityVandermondeMaxCond), "SerendipityBasis: " + label +
             " interpolation matrix is too ill-conditioned (condition number ~ " +
             std::to_string(condition_number) +
             "); the requested order exceeds the well-conditioned range");
@@ -285,25 +282,20 @@ NormalizedSerendipityRequest normalize_serendipity_request(ElementType type, int
     const int expected_order = serendipity_named_order(type);
     switch (type) {
         case ElementType::Quad8:
-            svmp::throw_if<BasisConfigurationException>(order != expected_order, SVMP_HERE,
-                "SerendipityBasis: Quad8 is the quadratic 8-node serendipity layout (order 2 only); "
+            svmp::throw_if<BasisConfigurationException>(order != expected_order, "SerendipityBasis: Quad8 is the quadratic 8-node serendipity layout (order 2 only); "
                 "use BasisTopology::Quadrilateral for higher-order quadrilateral serendipity");
             return {BasisTopology::Quadrilateral, 2, expected_order};
         case ElementType::Hex8:
-            svmp::throw_if<BasisConfigurationException>(order != expected_order, SVMP_HERE,
-                "SerendipityBasis: Hex8 is the trilinear 8-node basis (order 1 only); use Hex20 for quadratic serendipity");
+            svmp::throw_if<BasisConfigurationException>(order != expected_order, "SerendipityBasis: Hex8 is the trilinear 8-node basis (order 1 only); use Hex20 for quadratic serendipity");
             return {BasisTopology::Hexahedron, 3, expected_order};
         case ElementType::Hex20:
-            svmp::throw_if<BasisConfigurationException>(order != expected_order, SVMP_HERE,
-                "SerendipityBasis: Hex20 is the 20-node quadratic serendipity layout (order 2 only)");
+            svmp::throw_if<BasisConfigurationException>(order != expected_order, "SerendipityBasis: Hex20 is the 20-node quadratic serendipity layout (order 2 only)");
             return {BasisTopology::Hexahedron, 3, expected_order};
         case ElementType::Wedge15:
-            svmp::throw_if<BasisConfigurationException>(order != expected_order, SVMP_HERE,
-                "SerendipityBasis: Wedge15 is the 15-node quadratic serendipity layout (order 2 only)");
+            svmp::throw_if<BasisConfigurationException>(order != expected_order, "SerendipityBasis: Wedge15 is the 15-node quadratic serendipity layout (order 2 only)");
             return {BasisTopology::Wedge, 3, expected_order};
         default:
-            svmp::raise<BasisElementCompatibilityException>(SVMP_HERE,
-                "SerendipityBasis named elements are Quad8, Hex8, Hex20, and Wedge15; "
+            svmp::raise<BasisElementCompatibilityException>("SerendipityBasis named elements are Quad8, Hex8, Hex20, and Wedge15; "
                 "use BasisTopology::Quadrilateral for arbitrary-order quadrilateral serendipity");
     }
 }
@@ -315,12 +307,10 @@ SerendipityBasis::SerendipityBasis(BasisTopology topology, int order)
     const bool supported_topology = topology_ == BasisTopology::Quadrilateral ||
                                     topology_ == BasisTopology::Hexahedron;
     svmp::throw_if<BasisElementCompatibilityException>(
-        !supported_topology, SVMP_HERE,
-        "SerendipityBasis: arbitrary-order topology construction is supported for "
+        !supported_topology, "SerendipityBasis: arbitrary-order topology construction is supported for "
         "Quadrilateral and Hexahedron; use the named ElementType (Wedge15) for wedge serendipity");
     svmp::throw_if<BasisConfigurationException>(
-        order < 1, SVMP_HERE,
-        "SerendipityBasis: serendipity requires a polynomial order >= 1");
+        order < 1, "SerendipityBasis: serendipity requires a polynomial order >= 1");
     dimension_ = topology_ == BasisTopology::Hexahedron ? 3 : 2;
     order_ = order;
     if (topology_ == BasisTopology::Hexahedron) {
@@ -355,8 +345,7 @@ SerendipityBasis::SerendipityBasis(ElementType type, int order) {
             return;
         default:
             // normalize_serendipity_request already rejected anything else.
-            svmp::raise<BasisElementCompatibilityException>(SVMP_HERE,
-                "SerendipityBasis: unsupported named serendipity element");
+            svmp::raise<BasisElementCompatibilityException>("SerendipityBasis: unsupported named serendipity element");
     }
 }
 
@@ -375,8 +364,7 @@ void SerendipityBasis::init_quadrilateral(int order) {
     size_ = mode_exponents_.size();
     nodes_ = ReferenceNodeLayout::serendipity_node_coords(BasisTopology::Quadrilateral, order);
     svmp::throw_if<BasisConstructionException>(
-        nodes_.size() != size_, SVMP_HERE,
-        "SerendipityBasis: quadrilateral serendipity setup produced inconsistent sizes");
+        nodes_.size() != size_, "SerendipityBasis: quadrilateral serendipity setup produced inconsistent sizes");
     uses_legendre_modes_ = true;
     inv_vandermonde_ = build_inverse_vandermonde(
         nodes_, mode_exponents_, "Quad order " + std::to_string(order),
@@ -393,8 +381,7 @@ void SerendipityBasis::init_hexahedron(int order) {
     size_ = mode_exponents_.size();
     nodes_ = ReferenceNodeLayout::serendipity_node_coords(BasisTopology::Hexahedron, order);
     svmp::throw_if<BasisConstructionException>(
-        nodes_.size() != size_, SVMP_HERE,
-        "SerendipityBasis: hexahedral serendipity setup produced inconsistent sizes");
+        nodes_.size() != size_, "SerendipityBasis: hexahedral serendipity setup produced inconsistent sizes");
     uses_legendre_modes_ = true;
     inv_vandermonde_ = build_inverse_vandermonde(
         nodes_, mode_exponents_, "Hex order " + std::to_string(order),
@@ -407,18 +394,15 @@ void SerendipityBasis::init_hexahedron(int order) {
 // layout that still carries a fixed monomial table.
 void SerendipityBasis::init_fixed_named(ElementType type) {
     svmp::throw_if<BasisConstructionException>(
-        type != ElementType::Wedge15, SVMP_HERE,
-        "SerendipityBasis: init_fixed_named builds only the Wedge15 layout");
+        type != ElementType::Wedge15, "SerendipityBasis: init_fixed_named builds only the Wedge15 layout");
     size_ = 15u;
     const std::span<const std::array<int, 3>> family_exponents(
         kWedge15MonomialExponents.data(), kWedge15MonomialExponents.size());
     nodes_ = ReferenceNodeLayout::node_coords(type);
     svmp::throw_if<BasisConstructionException>(
-        nodes_.size() != size_, SVMP_HERE,
-        "SerendipityBasis: Wedge15 layout node count does not match basis size");
+        nodes_.size() != size_, "SerendipityBasis: Wedge15 layout node count does not match basis size");
     svmp::throw_if<BasisConstructionException>(
-        family_exponents.size() != size_, SVMP_HERE,
-        "SerendipityBasis: Wedge15 monomial count does not match basis size");
+        family_exponents.size() != size_, "SerendipityBasis: Wedge15 monomial count does not match basis size");
     mode_exponents_.assign(family_exponents.begin(), family_exponents.end());
     // Wedge15 is the fixed order-2 layout; its 15x15 system is trivially
     // well-conditioned, so it keeps the monomial modal basis.
@@ -458,7 +442,6 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<double, 3>& xi,
     svmp::throw_if<BasisEvaluationException>(
         mode_exponents_.size() != size_ ||
             inv_vandermonde_.size() != size_ * size_,
-        SVMP_HERE,
         "SerendipityBasis: interpolation tables are not initialized for evaluation");
 
     // Build the per-axis modal tables once, then accumulate over the modes. The
diff --git a/Code/Source/solver/FE/Common/FEException.h b/Code/Source/solver/FE/Common/FEException.h
index d8779aa22..cf5a153cd 100644
--- a/Code/Source/solver/FE/Common/FEException.h
+++ b/Code/Source/solver/FE/Common/FEException.h
@@ -36,17 +36,17 @@ namespace FE {
  * Throw FE exceptions through the canonical core helpers in Core/Exception.h:
  *
  * @code
- * svmp::raise<ExceptionT>(SVMP_HERE, message);
- * svmp::throw_if<ExceptionT>(failure_condition, SVMP_HERE, message);
- * svmp::check_arg<ExceptionT>(valid_condition, SVMP_HERE, message);
- * svmp::check_not_null<ExceptionT>(ptr, SVMP_HERE, message);
- * svmp::check_index<ExceptionT>(index, size, SVMP_HERE);
- * svmp::not_implemented<ExceptionT>(feature, SVMP_HERE);
+ * svmp::raise<ExceptionT>(message);
+ * svmp::throw_if<ExceptionT>(failure_condition, message);
+ * svmp::check<ExceptionT>(valid_condition, message);
+ * svmp::check_not_null<ExceptionT>(ptr, message);
+ * svmp::check_index<ExceptionT>(index, size);
+ * svmp::not_implemented(message);
  * @endcode
  *
- * throw_if() is failure-condition based. check_arg() is
- * success-condition based. FE owns exception types; helper spelling is owned
- * by the core layer.
+ * check() raises when its (success) condition is false; throw_if() raises when
+ * its (failure) condition is true. FE owns exception types; helper spelling is
+ * owned by the core layer.
  * @{
  */
 
@@ -60,39 +60,16 @@ namespace FE {
 class FEException : public ExceptionBase {
 public:
     /**
-     * @brief Construct with a message and optional status code and source location.
+     * @brief Construct with a message and optional status code.
      * @param message Human-readable error description.
      * @param status Status code classifying the failure.
-     * @param file Source file where the error was raised.
-     * @param line Source line where the error was raised.
-     * @param function Function where the error was raised.
+     *
+     * @details The source location is stamped by svmp::raise(); construct FE
+     * exceptions through the core helpers rather than passing file/line/function.
      */
-    FEException(const std::string& message,
-                StatusCode status = StatusCode::Unknown,
-                const char* file = "",
-                int line = 0,
-                const char* function = "")
-        : ExceptionBase(message,
-                        status,
-                        "FE Exception",
-                        file,
-                        line,
-                        function)
-    {
-    }
-
-    /**
-     * @brief Construct with a message and source location, using an Unknown status.
-     * @param message Human-readable error description.
-     * @param file Source file where the error was raised.
-     * @param line Source line where the error was raised.
-     * @param function Function where the error was raised.
-     */
-    FEException(const std::string& message,
-                const char* file,
-                int line,
-                const char* function = "")
-        : FEException(message, StatusCode::Unknown, file, line, function)
+    explicit FEException(const std::string& message,
+                         StatusCode status = StatusCode::Unknown)
+        : ExceptionBase(message, status, "FE Exception")
     {
     }
 
@@ -106,24 +83,8 @@ class FEException : public ExceptionBase {
 /**
  * @brief An argument failed validation
  */
-class InvalidArgumentException : public FEException {
-public:
-    /**
-     * @brief Construct with a message and optional source location.
-     * @param message Human-readable error description.
-     * @param file Source file where the error was raised.
-     * @param line Source line where the error was raised.
-     * @param function Function where the error was raised.
-     */
-    InvalidArgumentException(const std::string& message,
-                             const char* file = "",
-                             int line = 0,
-                             const char* function = "")
-        : FEException(message, StatusCode::InvalidArgument, file, line,
-                      function)
-    {
-    }
-};
+SVMP_DEFINE_EXCEPTION(InvalidArgumentException, FEException,
+                      StatusCode::InvalidArgument);
 
 /**
  * @brief Unsupported or malformed element request
@@ -136,20 +97,11 @@ class InvalidElementException : public FEException {
      * @brief Construct with a message and optional element-type context.
      * @param message Human-readable error description.
      * @param element_type Name of the offending element type; appended to the message when non-empty.
-     * @param file Source file where the error was raised.
-     * @param line Source line where the error was raised.
-     * @param function Function where the error was raised.
      */
     InvalidElementException(const std::string& message,
-                            std::string element_type = "",
-                            const char* file = "",
-                            int line = 0,
-                            const char* function = "")
+                            std::string element_type = "")
         : FEException(build_message(message, element_type),
-                      StatusCode::InvalidArgument,
-                      file,
-                      line,
-                      function),
+                      StatusCode::InvalidArgument),
           element_type_(std::move(element_type))
     {
     }
@@ -185,20 +137,11 @@ class DofException : public FEException {
      * @brief Construct with a message and optional DOF-index context.
      * @param message Human-readable error description.
      * @param dof_index Offending DOF index; appended to the message unless it equals invalid_dof_index().
-     * @param file Source file where the error was raised.
-     * @param line Source line where the error was raised.
-     * @param function Function where the error was raised.
      */
     DofException(const std::string& message,
-                 long long dof_index = invalid_dof_index(),
-                 const char* file = "",
-                 int line = 0,
-                 const char* function = "")
+                 long long dof_index = invalid_dof_index())
         : FEException(build_message(message, dof_index),
-                      StatusCode::InvalidArgument,
-                      file,
-                      line,
-                      function),
+                      StatusCode::InvalidArgument),
           dof_index_(dof_index)
     {
     }
@@ -231,23 +174,7 @@ class DofException : public FEException {
 /**
  * @brief Global assembly failure
  */
-class AssemblyException : public FEException {
-public:
-    /**
-     * @brief Construct with a message and optional source location.
-     * @param message Human-readable error description.
-     * @param file Source file where the error was raised.
-     * @param line Source line where the error was raised.
-     * @param function Function where the error was raised.
-     */
-    AssemblyException(const std::string& message,
-                      const char* file = "",
-                      int line = 0,
-                      const char* function = "")
-        : FEException(message, StatusCode::InvalidState, file, line, function)
-    {
-    }
-};
+SVMP_DEFINE_EXCEPTION(AssemblyException, FEException, StatusCode::InvalidState);
 
 /**
  * @brief Failure reported by a linear-algebra or solver backend
@@ -262,21 +189,12 @@ class BackendException : public FEException {
      * @param message Human-readable error description.
      * @param backend_name Name of the failing backend; appended to the message when non-empty.
      * @param error_code Backend-native error code; appended to the message when nonzero.
-     * @param file Source file where the error was raised.
-     * @param line Source line where the error was raised.
-     * @param function Function where the error was raised.
      */
     BackendException(const std::string& message,
                      std::string backend_name = "",
-                     int error_code = 0,
-                     const char* file = "",
-                     int line = 0,
-                     const char* function = "")
+                     int error_code = 0)
         : FEException(build_message(message, backend_name, error_code),
-                      StatusCode::DependencyError,
-                      file,
-                      line,
-                      function),
+                      StatusCode::DependencyError),
           backend_name_(std::move(backend_name)),
           error_code_(error_code)
     {
@@ -321,29 +239,14 @@ class BackendException : public FEException {
 };
 
 /**
- * @brief Requested feature is not implemented
+ * @brief Requested feature is not implemented.
+ *
+ * @details Alias for svmp::NotImplementedException (Core/Exception.h), the single
+ * not-implemented type used across the solver and the default raised by
+ * svmp::not_implemented(). Kept in the FE namespace for source compatibility; it
+ * derives from CoreException, not FEException.
  */
-class NotImplementedException : public FEException {
-public:
-    /**
-     * @brief Construct from the name of the missing feature.
-     * @param feature Description of the unimplemented feature.
-     * @param file Source file where the error was raised.
-     * @param line Source line where the error was raised.
-     * @param function Function where the error was raised.
-     */
-    NotImplementedException(const std::string& feature,
-                            const char* file = "",
-                            int line = 0,
-                            const char* function = "")
-        : FEException("Feature not implemented: " + feature,
-                      StatusCode::NotImplemented,
-                      file,
-                      line,
-                      function)
-    {
-    }
-};
+using NotImplementedException = svmp::NotImplementedException;
 
 /**
  * @brief Required initialization step has not been performed
@@ -353,19 +256,10 @@ class NotInitializedException : public FEException {
   /**
    * @brief Construct from the name of the uninitialized feature.
    * @param feature Description of the missing initialization.
-   * @param file Source file where the error was raised.
-   * @param line Source line where the error was raised.
-   * @param function Function where the error was raised.
    */
-  NotInitializedException(const std::string &feature,
-                          const char *file,
-                          int line = 0,
-                          const char *function = "")
+  explicit NotInitializedException(const std::string& feature)
       : FEException("Missing initialization: " + feature,
-                    StatusCode::InvalidState,
-                    file,
-                    line,
-                    function)
+                    StatusCode::InvalidState)
   {
   }
 };
@@ -383,21 +277,12 @@ class ConvergenceException : public FEException {
      * @param message Human-readable error description.
      * @param iteration Iteration at which the failure was detected; appended to the message when non-negative.
      * @param residual Final residual; appended to the message when positive.
-     * @param file Source file where the error was raised.
-     * @param line Source line where the error was raised.
-     * @param function Function where the error was raised.
      */
     ConvergenceException(const std::string& message,
                          int iteration = -1,
-                         double residual = 0.0,
-                         const char* file = "",
-                         int line = 0,
-                         const char* function = "")
+                         double residual = 0.0)
         : FEException(build_message(message, iteration, residual),
-                      StatusCode::InvalidState,
-                      file,
-                      line,
-                      function),
+                      StatusCode::InvalidState),
           iteration_(iteration),
           residual_(residual)
     {
@@ -447,20 +332,11 @@ class SingularMappingException : public FEException {
      * @brief Construct with a message and the offending Jacobian determinant.
      * @param message Human-readable error description.
      * @param jacobian_det Jacobian determinant at the failure point; appended to the message.
-     * @param file Source file where the error was raised.
-     * @param line Source line where the error was raised.
-     * @param function Function where the error was raised.
      */
     SingularMappingException(const std::string& message,
-                             double jacobian_det = 0.0,
-                             const char* file = "",
-                             int line = 0,
-                             const char* function = "")
+                             double jacobian_det = 0.0)
         : FEException(build_message(message, jacobian_det),
-                      StatusCode::InvalidState,
-                      file,
-                      line,
-                      function),
+                      StatusCode::InvalidState),
           jacobian_det_(jacobian_det)
     {
     }
diff --git a/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp b/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
index 86c3045c3..86884740c 100644
--- a/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
+++ b/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
@@ -92,15 +92,12 @@ void DenseLUSolver::solve_in_place(std::span<double> rhs) const {
 
 void DenseLUSolver::solve_in_place(std::span<double> rhs,
                                    std::size_t rhs_count) const {
-    ::svmp::check_arg<FEException>(
-        rhs_count > 0, SVMP_HERE,
-        error_message_label + ": dense solve requires at least one right-hand side");
-    ::svmp::check_arg<FEException>(
-        rhs.size() == n * rhs_count, SVMP_HERE,
-        error_message_label + ": dense multi-RHS solve size mismatch");
-    ::svmp::check_arg<FEException>(
-        impl && impl->lu.rows() == static_cast<Eigen::Index>(n), SVMP_HERE,
-        error_message_label + ": dense solver is not factorized");
+    ::svmp::check<FEException>(
+        rhs_count > 0, error_message_label + ": dense solve requires at least one right-hand side");
+    ::svmp::check<FEException>(
+        rhs.size() == n * rhs_count, error_message_label + ": dense multi-RHS solve size mismatch");
+    ::svmp::check<FEException>(
+        impl && impl->lu.rows() == static_cast<Eigen::Index>(n), error_message_label + ": dense solver is not factorized");
     if (n == 0) {
         return;
     }
@@ -124,12 +121,10 @@ DenseMatrixDiagnostics dense_matrix_diagnostics(
     std::size_t rows,
     std::size_t cols,
     std::string_view error_message_label) {
-    ::svmp::check_arg<FEException>(
-        matrix.size() == rows * cols, SVMP_HERE,
-        std::string(error_message_label) + ": diagnostic size mismatch");
-    ::svmp::check_arg<FEException>(
-        rows > 0 && cols > 0, SVMP_HERE,
-        std::string(error_message_label) + ": diagnostics require a nonempty matrix");
+    ::svmp::check<FEException>(
+        matrix.size() == rows * cols, std::string(error_message_label) + ": diagnostic size mismatch");
+    ::svmp::check<FEException>(
+        rows > 0 && cols > 0, std::string(error_message_label) + ": diagnostics require a nonempty matrix");
 
     const DenseMatrix dense = map_row_major(matrix, rows, cols);
     Eigen::JacobiSVD<DenseMatrix> svd(dense);
@@ -164,9 +159,8 @@ DenseMatrixDiagnostics dense_matrix_diagnostics(
 DenseLUSolver factor_dense_matrix(std::vector<double> matrix,
                                   std::size_t n,
                                   std::string_view error_message_label) {
-    ::svmp::check_arg<FEException>(
-        matrix.size() == n * n, SVMP_HERE,
-        std::string(error_message_label) + ": dense factorization size mismatch");
+    ::svmp::check<FEException>(
+        matrix.size() == n * n, std::string(error_message_label) + ": dense factorization size mismatch");
 
     DenseLUSolver solver;
     solver.n = n;
@@ -185,9 +179,8 @@ DenseLUSolver factor_dense_matrix(std::vector<double> matrix,
     const auto diagonal = solver.impl->lu.matrixLU().diagonal();
     for (Eigen::Index col = 0; col < diagonal.size(); ++col) {
         const double pivot_magnitude = std::abs(diagonal[col]);
-        ::svmp::check_arg<FEException>(
-            pivot_magnitude > solver.pivot_tolerance, SVMP_HERE,
-            solver.error_message_label + ": rank-deficient matrix (rank " +
+        ::svmp::check<FEException>(
+            pivot_magnitude > solver.pivot_tolerance, solver.error_message_label + ": rank-deficient matrix (rank " +
                 std::to_string(col) + " of " + std::to_string(n) +
                 ", pivot below scale-aware tolerance " +
                 std::to_string(solver.pivot_tolerance) + ")");
@@ -209,9 +202,8 @@ DenseInverseResult invert_dense_matrix_with_diagnostics(
     std::vector<double> matrix,
     std::size_t n,
     std::string_view error_message_label) {
-    ::svmp::check_arg<FEException>(
-        matrix.size() == n * n, SVMP_HERE,
-        std::string(error_message_label) + ": dense inverse size mismatch");
+    ::svmp::check<FEException>(
+        matrix.size() == n * n, std::string(error_message_label) + ": dense inverse size mismatch");
     std::vector<double> matrix_for_lu = matrix;
     const DenseLUSolver solver =
         factor_dense_matrix(std::move(matrix_for_lu), n, error_message_label);
@@ -235,9 +227,8 @@ DenseInverseResult invert_dense_matrix_with_diagnostics(
             // singular value (rank < full_rank). A sub-tolerance singular value
             // therefore cannot reach here in current code; the guard protects
             // against future refactors that derive the fallback condition differently.
-            ::svmp::check_arg<FEException>(
-                singular_values[i] > result.diagnostics.tolerance, SVMP_HERE,
-                std::string(error_message_label) + ": high-condition SVD fallback encountered a dropped singular value");
+            ::svmp::check<FEException>(
+                singular_values[i] > result.diagnostics.tolerance, std::string(error_message_label) + ": high-condition SVD fallback encountered a dropped singular value");
             sigma_inverse(i, i) = double(1) / singular_values[i];
         }
         const DenseMatrix inverse = svd.matrixV() * sigma_inverse * svd.matrixU().transpose();
@@ -256,9 +247,8 @@ void validate_dense_inverse_diagnostics(
     std::size_t expected_rank,
     std::string_view error_message_label,
     double max_condition) {
-    ::svmp::check_arg<FEException>(
-        result.diagnostics.rank == expected_rank, SVMP_HERE,
-        std::string(error_message_label) + ": rank-deficient matrix (rank " +
+    ::svmp::check<FEException>(
+        result.diagnostics.rank == expected_rank, std::string(error_message_label) + ": rank-deficient matrix (rank " +
             std::to_string(result.diagnostics.rank) + " of " +
             std::to_string(expected_rank) + ")");
 
@@ -266,9 +256,8 @@ void validate_dense_inverse_diagnostics(
         return;
     }
 
-    ::svmp::check_arg<FEException>(
-        result.diagnostics.condition_estimate <= max_condition, SVMP_HERE,
-        std::string(error_message_label) + ": condition estimate " +
+    ::svmp::check<FEException>(
+        result.diagnostics.condition_estimate <= max_condition, std::string(error_message_label) + ": condition estimate " +
             std::to_string(result.diagnostics.condition_estimate) +
             " exceeds supported threshold " + std::to_string(max_condition));
 }
@@ -286,9 +275,8 @@ std::vector<double> invert_dense_matrix(std::vector<double> matrix,
 std::size_t dense_matrix_rank(std::vector<double> matrix,
                               std::size_t rows,
                               std::size_t cols) {
-    ::svmp::check_arg<FEException>(
-        matrix.size() == rows * cols, SVMP_HERE,
-        "dense_matrix_rank: size mismatch");
+    ::svmp::check<FEException>(
+        matrix.size() == rows * cols, "dense_matrix_rank: size mismatch");
 
     const DenseMatrix dense =
         map_row_major(std::span<const double>(matrix.data(), matrix.size()), rows, cols);
@@ -314,12 +302,10 @@ DensePseudoInverseResult rank_revealing_pseudo_inverse(
     std::size_t rows,
     std::size_t cols,
     std::string_view error_message_label) {
-    ::svmp::check_arg<FEException>(
-        matrix.size() == rows * cols, SVMP_HERE,
-        std::string(error_message_label) + ": pseudo-inverse size mismatch");
-    ::svmp::check_arg<FEException>(
-        rows > 0 && cols > 0, SVMP_HERE,
-        std::string(error_message_label) + ": pseudo-inverse requires a nonempty matrix");
+    ::svmp::check<FEException>(
+        matrix.size() == rows * cols, std::string(error_message_label) + ": pseudo-inverse size mismatch");
+    ::svmp::check<FEException>(
+        rows > 0 && cols > 0, std::string(error_message_label) + ": pseudo-inverse requires a nonempty matrix");
 
     const DenseMatrix dense = map_row_major(matrix, rows, cols);
     Eigen::JacobiSVD<DenseMatrix> svd(dense, Eigen::ComputeFullU | Eigen::ComputeFullV);
diff --git a/Code/Source/solver/Parameters.cpp b/Code/Source/solver/Parameters.cpp
index e20dcf2f7..8b9191e49 100644
--- a/Code/Source/solver/Parameters.cpp
+++ b/Code/Source/solver/Parameters.cpp
@@ -84,38 +84,43 @@ std::string missing_xml_attribute_message(tinyxml2::XMLElement* element,
 }
 
 const char* require_xml_attribute(tinyxml2::XMLElement* element,
-    const char* attribute_name, svmp::SourceLocation location,
-    const std::string& message = std::string())
+    const char* attribute_name, const std::string& message = std::string(),
+    const char* file = __builtin_FILE(), int line = __builtin_LINE(),
+    const char* function = __builtin_FUNCTION())
 {
   const char* value = nullptr;
   if (element == nullptr ||
       element->QueryStringAttribute(attribute_name, &value) != tinyxml2::XML_SUCCESS ||
       value == nullptr) {
-    svmp::raise<svmp::ParseException>(
-        location,
+    svmp::raise<svmp::ParseException>(svmp::Diagnostic(
         message.empty() ? missing_xml_attribute_message(element, attribute_name)
-                        : message);
+                        : message,
+        file, line, function));
   }
   return value;
 }
 
 const char* require_xml_text(tinyxml2::XMLElement* element,
-    svmp::SourceLocation location, const std::string& message)
+    const std::string& message, const char* file = __builtin_FILE(),
+    int line = __builtin_LINE(), const char* function = __builtin_FUNCTION())
 {
   if (element == nullptr || element->GetText() == nullptr) {
-    svmp::raise<svmp::ParseException>(location, message);
+    svmp::raise<svmp::ParseException>(
+        svmp::Diagnostic(message, file, line, function));
   }
   return element->GetText();
 }
 
 template <typename MapT>
 typename MapT::mapped_type require_map_value(const MapT& map,
-    const typename MapT::key_type& key, svmp::SourceLocation location,
-    const std::string& message)
+    const typename MapT::key_type& key, const std::string& message,
+    const char* file = __builtin_FILE(), int line = __builtin_LINE(),
+    const char* function = __builtin_FUNCTION())
 {
   auto iter = map.find(key);
   if (iter == map.end()) {
-    svmp::raise<svmp::ParseException>(location, message);
+    svmp::raise<svmp::ParseException>(
+        svmp::Diagnostic(message, file, line, function));
   }
   return iter->second;
 }
@@ -140,10 +145,10 @@ void xml_util_set_parameters( std::function<void(const std::string&, const std::
         try {
           fn(name, value);
         } catch (const std::bad_function_call& exception) {
-          svmp::raise<svmp::ParseException>(SVMP_HERE, error_msg + name + "'.");
+          svmp::raise<svmp::ParseException>(error_msg + name + "'.");
         }
       } else {
-        svmp::raise<svmp::ParseException>(SVMP_HERE, error_msg + name + "'.");
+        svmp::raise<svmp::ParseException>(error_msg + name + "'.");
       }
     }
 
@@ -161,18 +166,18 @@ std::string IncludeParametersFile::NAME = "Include_xml";
 IncludeParametersFile::IncludeParametersFile(const char* cfile_name)
 {
   svmp::check<svmp::ParseException>(
-      cfile_name != nullptr, SVMP_HERE, "Include_xml requires a file name.");
+      cfile_name != nullptr, "Include_xml requires a file name.");
 
   std::string file_name(cfile_name);
   file_name.erase(std::remove_if(file_name.begin(), file_name.end(), ::isspace), file_name.end());
   svmp::check<svmp::ParseException>(
-      !file_name.empty(), SVMP_HERE, "Include_xml requires a non-empty file name.");
+      !file_name.empty(), "Include_xml requires a non-empty file name.");
 
   auto error = document.LoadFile(file_name.c_str());
   root_element = document.FirstChildElement(Parameters::FSI_FILE.c_str());
 
   if (error != tinyxml2::XML_SUCCESS || root_element == nullptr) {
-    svmp::raise<svmp::ParseException>(SVMP_HERE, "The following error occurred while reading the XML file '" +
+    svmp::raise<svmp::ParseException>("The following error occurred while reading the XML file '" +
         file_name + "'.\n" + "[svMultiPhysics] ERROR " + std::string(document.ErrorStr())); 
   }
 }
@@ -229,7 +234,7 @@ void Parameters::read_xml(std::string file_name)
   
   auto root_element = doc.FirstChildElement(FSI_FILE.c_str());
   if (error != tinyxml2::XML_SUCCESS || root_element == nullptr) {
-    svmp::raise<svmp::ParseException>(SVMP_HERE, "The following error occurred while reading the XML file '" + file_name + "'.\n" +
+    svmp::raise<svmp::ParseException>("The following error occurred while reading the XML file '" + file_name + "'.\n" +
         "[svMultiPhysics] ERROR " + std::string(doc.ErrorStr()));
   }
 
@@ -274,7 +279,7 @@ void Parameters::set_equation_values(tinyxml2::XMLElement* root_element)
   auto add_eq_item = root_element->FirstChildElement(EquationParameters::xml_element_name_.c_str());
 
   while (add_eq_item) {
-    const char* eq_type = require_xml_attribute(add_eq_item, "type", SVMP_HERE);
+    const char* eq_type = require_xml_attribute(add_eq_item, "type");
 
     auto eq_params = new EquationParameters();
     eq_params->type.set(std::string(eq_type));
@@ -290,7 +295,7 @@ void Parameters::set_mesh_values(tinyxml2::XMLElement* root_element)
   auto add_mesh_item = root_element->FirstChildElement(MeshParameters::xml_element_name_.c_str());
 
   while (add_mesh_item) {
-    const char* mesh_name = require_xml_attribute(add_mesh_item, "name", SVMP_HERE);
+    const char* mesh_name = require_xml_attribute(add_mesh_item, "name");
 
     MeshParameters* mesh_params = new MeshParameters();
     mesh_params->name.set(std::string(mesh_name));
@@ -316,7 +321,7 @@ void Parameters::set_projection_values(tinyxml2::XMLElement* root_element)
   auto add_proj_item = root_element->FirstChildElement(ProjectionParameters::xml_element_name_.c_str());
 
   while (add_proj_item) {
-    const char* proj_name = require_xml_attribute(add_proj_item, "name", SVMP_HERE);
+    const char* proj_name = require_xml_attribute(add_proj_item, "name");
 
     ProjectionParameters* proj_params = new ProjectionParameters();
     proj_params->name.set(std::string(proj_name));
@@ -333,7 +338,7 @@ void Parameters::set_RIS_projection_values(tinyxml2::XMLElement* root_element)
 
   while (add_RIS_proj_item) {
     const char* RIS_proj_name =
-        require_xml_attribute(add_RIS_proj_item, "name", SVMP_HERE);
+        require_xml_attribute(add_RIS_proj_item, "name");
 
     RISProjectionParameters* RIS_proj_params = new RISProjectionParameters();
     RIS_proj_params->name.set(std::string(RIS_proj_name));
@@ -350,7 +355,7 @@ void Parameters::set_URIS_mesh_values(tinyxml2::XMLElement* root_element)
 
   while (add_URIS_mesh_item) {
     const char* URIS_mesh_name =
-        require_xml_attribute(add_URIS_mesh_item, "name", SVMP_HERE);
+        require_xml_attribute(add_URIS_mesh_item, "name");
 
     URISMeshParameters* URIS_mesh_params = new URISMeshParameters();
     URIS_mesh_params->name.set(std::string(URIS_mesh_name));
@@ -410,7 +415,7 @@ void BodyForceParameters::set_values(tinyxml2::XMLElement* xml_elem)
   std::string error_msg = "Unknown " + xml_element_name_ + " XML element '";
 
   // Get the 'type' from the <LS type=TYPE> element.
-  const char* smesh = require_xml_attribute(xml_elem, "mesh", SVMP_HERE);
+  const char* smesh = require_xml_attribute(xml_elem, "mesh");
   mesh_name.set(std::string(smesh));
   //auto item = xml_elem->FirstChildElement();
 
@@ -584,7 +589,7 @@ void BoundaryConditionParameters::set_values(tinyxml2::XMLElement* xml_elem)
   std::string error_msg = "Unknown " + xml_element_name_ + " XML element '"; 
 
   // Get the 'name' from the <Add_BC name=NAME> element.
-  const char* sname = require_xml_attribute(xml_elem, "name", SVMP_HERE);
+  const char* sname = require_xml_attribute(xml_elem, "name");
   name.set(std::string(sname));
 
   auto item = xml_elem->FirstChildElement();
@@ -601,10 +606,10 @@ void BoundaryConditionParameters::set_values(tinyxml2::XMLElement* xml_elem)
       try {
         set_parameter_value(name, value);
       } catch (const std::bad_function_call& exception) {
-        svmp::raise<svmp::ParseException>(SVMP_HERE, error_msg + name + "'.");
+        svmp::raise<svmp::ParseException>(error_msg + name + "'.");
       }
     } else {
-      svmp::raise<svmp::ParseException>(SVMP_HERE, error_msg + name + "'.");
+      svmp::raise<svmp::ParseException>(error_msg + name + "'.");
     }
 
     item = item->NextSiblingElement();
@@ -938,8 +943,7 @@ void CANNRowParameters::print_parameters()
 void CANNRowParameters::set_values(tinyxml2::XMLElement* row_elem)
 {
   svmp::check_not_null<svmp::ParseException>(
-      row_elem, SVMP_HERE,
-      "CANNRowParameters::set_values: Received null XML element.");
+      row_elem, "CANNRowParameters::set_values: Received null XML element.");
 
   using namespace tinyxml2;
 
@@ -947,7 +951,7 @@ void CANNRowParameters::set_values(tinyxml2::XMLElement* row_elem)
 
   // Set row_name for current row element
   const char* row_name_input =
-      require_xml_attribute(row_elem, "row_name", SVMP_HERE);
+      require_xml_attribute(row_elem, "row_name");
   row_name.set(std::string(row_name_input));
 
   auto item = row_elem->FirstChildElement();
@@ -958,13 +962,13 @@ void CANNRowParameters::set_values(tinyxml2::XMLElement* row_elem)
     auto value = item->GetText();
 
     if (value == nullptr) { 
-      svmp::raise<svmp::ParseException>(SVMP_HERE, error_msg + name + "'.");
+      svmp::raise<svmp::ParseException>(error_msg + name + "'.");
     }
 
     try {
       set_parameter_value_CANN(name, value);
     } catch (const std::bad_function_call& exception) {
-      svmp::raise<svmp::ParseException>(SVMP_HERE, error_msg + name + "'.");
+      svmp::raise<svmp::ParseException>(error_msg + name + "'.");
     }
 
     item = item->NextSiblingElement();
@@ -1008,7 +1012,7 @@ void CANNParameters::set_values(tinyxml2::XMLElement* xml_elem)
   }
 
   if (rows.empty()) {
-    svmp::raise<svmp::ParseException>(SVMP_HERE, error_msg + "Add_row'. No rows found.");
+    svmp::raise<svmp::ParseException>(error_msg + "Add_row'. No rows found.");
   }
 
   value_set = true;
@@ -1055,7 +1059,7 @@ void ConstitutiveModelParameters::set_values(tinyxml2::XMLElement* xml_elem)
   std::string error_msg = "Unknown " + xml_element_name_ + " XML element '"; 
 
   // Get the 'type' from the <Constitutive_model type= > element.
-  const char* stype = require_xml_attribute(xml_elem, "type", SVMP_HERE);
+  const char* stype = require_xml_attribute(xml_elem, "type");
   type.set(std::string(stype));
 
   // Check constitutive model type.
@@ -1067,7 +1071,7 @@ void ConstitutiveModelParameters::set_values(tinyxml2::XMLElement* xml_elem)
     }
     msg_2 += "\n";
     auto msg = msg_1 + msg_2;
-    svmp::raise<svmp::ParseException>(SVMP_HERE, msg);
+    svmp::raise<svmp::ParseException>(msg);
   }
   auto model_type = constitutive_model_types.at(type.value());
   type.set(model_type);
@@ -1083,13 +1087,13 @@ void ConstitutiveModelParameters::set_values(tinyxml2::XMLElement* xml_elem)
 void ConstitutiveModelParameters::check_constitutive_model(const Parameter<std::string>& eq_type_str)
 {
   auto eq_type = require_map_value(consts::equation_name_to_type, eq_type_str.value(),
-      SVMP_HERE, "Unknown equation type '" + eq_type_str.value() + "'.");
+      "Unknown equation type '" + eq_type_str.value() + "'.");
   auto model = require_map_value(consts::constitutive_model_name_to_type, type.value(),
-      SVMP_HERE, "Unknown constitutive model '" + type.value() + "'.");
+      "Unknown constitutive model '" + type.value() + "'.");
 
   if (eq_type == consts::EquationType::phys_ustruct) {
     if (! ustruct::constitutive_model_is_valid(model)) {
-      svmp::raise<svmp::ParseException>(SVMP_HERE, "The " + type.value() + " constitutive model is not valid for ustruct equations.");
+      svmp::raise<svmp::ParseException>("The " + type.value() + " constitutive model is not valid for ustruct equations.");
     }
   }
 }
@@ -1118,7 +1122,7 @@ void CoupleGenBCParameters::set_values(tinyxml2::XMLElement* xml_elem)
   std::string error_msg = "Unknown Couple_to_genBC type=TYPE XML element '";
   
   // Get the 'type' from the <Couple_to_genBC type=TYPE> element.
-  const char* stype = require_xml_attribute(xml_elem, "type", SVMP_HERE);
+  const char* stype = require_xml_attribute(xml_elem, "type");
   type.set(std::string(stype));
   auto item = xml_elem->FirstChildElement();
   
@@ -1208,7 +1212,7 @@ void OutputParameters::set_values(tinyxml2::XMLElement* xml_elem)
   std::string msg("[OutputParameters::set_values] ");
   std::string error_msg = "Unknown " + xml_element_name_ + " XML element '"; 
 
-  const char* stype = require_xml_attribute(xml_elem, "type", SVMP_HERE);
+  const char* stype = require_xml_attribute(xml_elem, "type");
   type.set(std::string(stype));
 
   // Get values from XML file.
@@ -1220,8 +1224,7 @@ void OutputParameters::set_values(tinyxml2::XMLElement* xml_elem)
     auto item = xml_elem->FirstChildElement();
     while (item != nullptr) {
       auto name = std::string(item->Name());
-      auto value = std::string(require_xml_text(item, SVMP_HERE,
-          "Output XML element '" + name + "' requires a value."));
+      auto value = std::string(require_xml_text(item, "Output XML element '" + name + "' requires a value."));
       Parameter<std::string> param(name, "", false);
       param.set(value);
       alias_list.emplace_back(param);
@@ -1231,8 +1234,7 @@ void OutputParameters::set_values(tinyxml2::XMLElement* xml_elem)
     auto item = xml_elem->FirstChildElement();
     while (item != nullptr) {
       auto name = std::string(item->Name());
-      auto value = std::string(require_xml_text(item, SVMP_HERE,
-          "Output XML element '" + name + "' requires a value."));
+      auto value = std::string(require_xml_text(item, "Output XML element '" + name + "' requires a value."));
       Parameter<bool> param(name, false, false);
       param.set(value);
       output_list.emplace_back(param);
@@ -1291,7 +1293,7 @@ void VariableWallPropsParameters::set_values(tinyxml2::XMLElement* xml_elem)
   std::string error_msg = "Unknown " + xml_element_name_ + " XML element '";
 
   // Get the 'type' from the <Variable_wall_properties mesh_name=NAME> element.
-  const char* sname = require_xml_attribute(xml_elem, "mesh_name", SVMP_HERE);
+  const char* sname = require_xml_attribute(xml_elem, "mesh_name");
   mesh_name.set(std::string(sname));
   auto item = xml_elem->FirstChildElement();
 
@@ -1456,12 +1458,12 @@ void FluidViscosityParameters::set_values(tinyxml2::XMLElement* xml_elem)
 {
   using namespace tinyxml2;
 
-  const char* smodel = require_xml_attribute(xml_elem, "model", SVMP_HERE);
+  const char* smodel = require_xml_attribute(xml_elem, "model");
   model.set(std::string(smodel));
 
   // Check fluid_viscosity model name.
   if (model_names.count(model.value()) == 0) { 
-      svmp::raise<svmp::ParseException>(SVMP_HERE, "Unknown fluid viscosity model '" + model.value() +
+      svmp::raise<svmp::ParseException>("Unknown fluid viscosity model '" + model.value() +
         "' in '" + xml_elem->Name() + "'.");
   }
 
@@ -1576,12 +1578,12 @@ void SolidViscosityParameters::set_values(tinyxml2::XMLElement* xml_elem)
 {
   using namespace tinyxml2;
 
-  const char* smodel = require_xml_attribute(xml_elem, "model", SVMP_HERE);
+  const char* smodel = require_xml_attribute(xml_elem, "model");
   model.set(std::string(smodel));
 
   // Check solid viscosity model name.
   if (model_names.count(model.value()) == 0) { 
-      svmp::raise<svmp::ParseException>(SVMP_HERE, "Unknown solid viscosity model '" + model.value() +
+      svmp::raise<svmp::ParseException>("Unknown solid viscosity model '" + model.value() +
         "' in '" + xml_elem->Name() + "'.");
   }
 
@@ -1616,7 +1618,7 @@ void IonicInitialStateParameters::print_parameters() const {
 void IonicInitialStateParameters::set_values(
     const tinyxml2::XMLElement *xml_elem) {
   if (xml_elem->Name() != xml_element_name) {
-    svmp::raise<svmp::ParseException>(SVMP_HERE, "Unknown " + xml_element_name +
+    svmp::raise<svmp::ParseException>("Unknown " + xml_element_name +
                                                      " XML element '");
   }
 
@@ -1634,12 +1636,10 @@ void IonicInitialStateParameters::set_values(
         set_parameter_value(name, value);
         value_set = true;
       } catch (const std::bad_function_call &exception) {
-        svmp::raise<svmp::ParseException>(SVMP_HERE,
-                                          error_msg_prefix + name + "'.");
+        svmp::raise<svmp::ParseException>(error_msg_prefix + name + "'.");
       }
     } else {
-      svmp::raise<svmp::ParseException>(SVMP_HERE,
-                                        error_msg_prefix + name + "'.");
+      svmp::raise<svmp::ParseException>(error_msg_prefix + name + "'.");
     }
   }
 
@@ -1715,14 +1715,14 @@ void IonicModelParameters::set_values(const tinyxml2::XMLElement *xml_elem) {
   // The initial values of both state and gating variables must be set.
   if (initial_X_parameters.required && !initial_X_parameters.defined()) {
     svmp::raise<svmp::ParseException>(
-        SVMP_HERE, xml_element_name + " requires an '" +
+        xml_element_name + " requires an '" +
                        initial_X_parameters.xml_element_name +
                        "' XML section.");
   }
 
   if (initial_Xg_parameters.required && !initial_Xg_parameters.defined()) {
     svmp::raise<svmp::ParseException>(
-        SVMP_HERE, xml_element_name + " requires an '" +
+        xml_element_name + " requires an '" +
                        initial_Xg_parameters.xml_element_name +
                        "' XML section.");
   }
@@ -1841,7 +1841,7 @@ void DomainParameters::set_values(tinyxml2::XMLElement* domain_elem, bool from_e
   // If not reading from an external xml file then get the <Domain id=ID> 'id' attrribute.
   //
   if (!from_external_xml) {
-    const char* sid = require_xml_attribute(domain_elem, "id", SVMP_HERE);
+    const char* sid = require_xml_attribute(domain_elem, "id");
     id.set(std::string(sid));
   }
 
@@ -1877,8 +1877,7 @@ void DomainParameters::set_values(tinyxml2::XMLElement* domain_elem, bool from_e
     if (name == FluidViscosityParameters::xml_element_name_ ||
         name == SolidViscosityParameters::xml_element_name_) {
       auto eq_type = require_map_value(
-          consts::equation_name_to_type, equation.value(), SVMP_HERE,
-          "Unknown equation type '" + equation.value() +
+          consts::equation_name_to_type, equation.value(), "Unknown equation type '" + equation.value() +
               "' while parsing viscosity model.");
       if (eq_type == consts::EquationType::phys_fluid ||
           eq_type == consts::EquationType::phys_CMM ||
@@ -1891,14 +1890,13 @@ void DomainParameters::set_values(tinyxml2::XMLElement* domain_elem, bool from_e
         item_found = true;
       } else {
         svmp::raise<svmp::ParseException>(
-            SVMP_HERE, "Viscosity model not supported for equation '" +
+            "Viscosity model not supported for equation '" +
                            equation.value() + "'.");
       }
     }
 
     if (name == include_xml.name()) {
-      auto value = require_xml_text(item, SVMP_HERE,
-          "Domain Include_xml requires a file name.");
+      auto value = require_xml_text(item, "Domain Include_xml requires a file name.");
       IncludeParametersFile include_parameters(value);
       set_values(include_parameters.root_element, true);
 
@@ -1911,12 +1909,12 @@ void DomainParameters::set_values(tinyxml2::XMLElement* domain_elem, bool from_e
         set_parameter_value(name, value);
         item_found = true;
       } catch (const std::bad_function_call &exception) {
-        svmp::raise<svmp::ParseException>(SVMP_HERE, error_msg + name + "'.");
+        svmp::raise<svmp::ParseException>(error_msg + name + "'.");
       }
     }
 
     if (!item_found)
-      svmp::raise<svmp::ParseException>(SVMP_HERE, error_msg + name + "'.");
+      svmp::raise<svmp::ParseException>(error_msg + name + "'.");
 
     item = item->NextSiblingElement();
   }
@@ -1926,14 +1924,14 @@ void DomainParameters::set_values(tinyxml2::XMLElement* domain_elem, bool from_e
     // Check values for some parameters..
     //
     if (Parameters::constitutive_model_names.count(constitutive_model.value())
-    == 0) { svmp::raise<svmp::ParseException>(SVMP_HERE, "Unknown constitutive
+    == 0) { svmp::raise<svmp::ParseException>("Unknown constitutive
     model '" + constitutive_model.value_ + "' for '" + constitutive_model.name_
     +
         "' in '" + domain_params->Name() + "'.");
     }
 
     if (Parameters::equation_names.count(equation.value()) == 0) {
-      svmp::raise<svmp::ParseException>(SVMP_HERE, "Unknown equation name '" +
+      svmp::raise<svmp::ParseException>("Unknown equation name '" +
     equation.value() + "' for '" + equation.name_ +
         "' in '" + domain_params->Name() + "'.");
     }
@@ -1988,7 +1986,7 @@ void DirectionalDistributionParameters::validate() const
   
   // Empty block is invalid - if block exists, must specify all three
   if (num_defined == 0) {
-    svmp::raise<svmp::ParseException>(SVMP_HERE, "Directional_distribution block is empty. "
+    svmp::raise<svmp::ParseException>("Directional_distribution block is empty. "
       "Either remove the block entirely (to use defaults: fiber=1.0, sheet=0.0, normal=0.0) "
       "or specify all three directions: Fiber_direction, Sheet_direction, Sheet_normal_direction.");
   }
@@ -2003,7 +2001,7 @@ void DirectionalDistributionParameters::validate() const
     if (!fiber_defined) msg += "Fiber_direction ";
     if (!sheet_defined) msg += "Sheet_direction ";
     if (!normal_defined) msg += "Sheet_normal_direction ";
-    svmp::raise<svmp::ParseException>(SVMP_HERE, msg);
+    svmp::raise<svmp::ParseException>(msg);
   }
   
   // All three are specified, validate their values
@@ -2015,7 +2013,7 @@ void DirectionalDistributionParameters::validate() const
   double eta_sum = eta_f + eta_s + eta_n;
   const double tol = 1.0e-10;
   if (std::abs(eta_sum - 1.0) > tol) {
-    svmp::raise<svmp::ParseException>(SVMP_HERE, "Directional distribution fractions must sum to 1.0. "
+    svmp::raise<svmp::ParseException>("Directional distribution fractions must sum to 1.0. "
       "Got: Fiber_direction=" + std::to_string(eta_f) + 
       ", Sheet_direction=" + std::to_string(eta_s) + 
       ", Sheet_normal_direction=" + std::to_string(eta_n) + 
@@ -2024,7 +2022,7 @@ void DirectionalDistributionParameters::validate() const
   
   // Validate that each eta is non-negative
   if (eta_f < 0.0 || eta_s < 0.0 || eta_n < 0.0) {
-    svmp::raise<svmp::ParseException>(SVMP_HERE, "Directional distribution fractions must be non-negative. "
+    svmp::raise<svmp::ParseException>("Directional distribution fractions must be non-negative. "
       "Got: Fiber_direction=" + std::to_string(eta_f) + 
       ", Sheet_direction=" + std::to_string(eta_s) + 
       ", Sheet_normal_direction=" + std::to_string(eta_n));
@@ -2071,7 +2069,7 @@ void FiberReinforcementStressParameters::set_values(tinyxml2::XMLElement* xml_el
   std::string error_msg = "Unknown " + xml_element_name_ + " XML element '";
 
   // Get the 'type' from the element attribute.
-  const char* stype = require_xml_attribute(xml_elem, "type", SVMP_HERE);
+  const char* stype = require_xml_attribute(xml_elem, "type");
   type.set(std::string(stype));
   auto item = xml_elem->FirstChildElement();
   
@@ -2086,10 +2084,10 @@ void FiberReinforcementStressParameters::set_values(tinyxml2::XMLElement* xml_el
       try {
         set_parameter_value(name, value);
       } catch (const std::bad_function_call& exception) {
-        svmp::raise<svmp::ParseException>(SVMP_HERE, error_msg + name + "'.");
+        svmp::raise<svmp::ParseException>(error_msg + name + "'.");
       }
     } else {
-      svmp::raise<svmp::ParseException>(SVMP_HERE, error_msg + name + "'.");
+      svmp::raise<svmp::ParseException>(error_msg + name + "'.");
     }
     
     item = item->NextSiblingElement();
@@ -2148,7 +2146,7 @@ void StimulusParameters::set_values(tinyxml2::XMLElement* xml_elem)
   std::string error_msg = "Unknown " + xml_element_name_ + " XML element '";
 
   // Get the 'type' from the <LS type=TYPE> element.
-  const char* stype = require_xml_attribute(xml_elem, "type", SVMP_HERE);
+  const char* stype = require_xml_attribute(xml_elem, "type");
   type.set(std::string(stype));
   auto item = xml_elem->FirstChildElement();
   
@@ -2277,7 +2275,7 @@ void ContactParameters::set_values(tinyxml2::XMLElement* xml_elem)
   std::string error_msg = "Unknown " + xml_element_name_ + " XML element '";
 
   // Get the 'type' from the <Add_projection name=NAME> element.
-  const char* mname = require_xml_attribute(xml_elem, "model", SVMP_HERE);
+  const char* mname = require_xml_attribute(xml_elem, "model");
   model.set(std::string(mname));
 
   using std::placeholders::_1;
@@ -2439,14 +2437,14 @@ void EquationParameters::set_values(tinyxml2::XMLElement* eq_elem, DomainParamet
 
     } else if (viscosity_names.count(name)) { 
       auto eq_type = require_map_value(consts::equation_name_to_type, type.value(),
-          SVMP_HERE, "Unknown equation type '" + type.value() + "' while parsing viscosity model.");
+          "Unknown equation type '" + type.value() + "' while parsing viscosity model.");
 
       if (fluid_eqs.count(eq_type)) {
         domain->fluid_viscosity.set_values(item);
       } else if (eq_type == consts::EquationType::phys_struct || eq_type == consts::EquationType::phys_ustruct) {
         domain->solid_viscosity.set_values(item);
       } else {
-        svmp::raise<svmp::ParseException>(SVMP_HERE, "Viscosity model not supported for equation '" + type.value() + "'.");
+        svmp::raise<svmp::ParseException>("Viscosity model not supported for equation '" + type.value() + "'.");
       }
 
     } else if (name == ECGLeadsParameters::xml_element_name_) {
@@ -2456,8 +2454,7 @@ void EquationParameters::set_values(tinyxml2::XMLElement* eq_elem, DomainParamet
       variable_wall_properties.set_values(item);
 
     } else if (name == include_xml.name()) { 
-      auto value = require_xml_text(item, SVMP_HERE,
-          "Equation Include_xml requires a file name.");
+      auto value = require_xml_text(item, "Equation Include_xml requires a file name.");
       IncludeParametersFile include_parameters(value);
       set_values(include_parameters.root_element, default_domain);
 
@@ -2473,13 +2470,13 @@ void EquationParameters::set_values(tinyxml2::XMLElement* eq_elem, DomainParamet
         try {
           default_domain->set_parameter_value(name, value);
         } catch (const std::bad_function_call& exception) {
-          svmp::raise<svmp::ParseException>(SVMP_HERE, "Unknown " + xml_element_name_ + " XML element '" + name + "'.");
+          svmp::raise<svmp::ParseException>("Unknown " + xml_element_name_ + " XML element '" + name + "'.");
         }
       }
 
 
     } else {
-      svmp::raise<svmp::ParseException>(SVMP_HERE, "[Equation] Unknown " + xml_element_name_ + " XML element '" + name + "'.");
+      svmp::raise<svmp::ParseException>("[Equation] Unknown " + xml_element_name_ + " XML element '" + name + "'.");
     }
 
     item = item->NextSiblingElement();
@@ -2571,8 +2568,7 @@ void GeneralSimulationParameters::set_values(tinyxml2::XMLElement* xml_element,
   } else {
     auto general_params = xml_element->FirstChildElement(xml_element_name.c_str());
     if (general_params == nullptr) {
-      svmp::raise<svmp::ParseException>(SVMP_HERE,
-          "No <" + xml_element_name + "> section found in the solver XML file.");
+      svmp::raise<svmp::ParseException>("No <" + xml_element_name + "> section found in the solver XML file.");
     }
     item = general_params->FirstChildElement();
   }
@@ -2581,19 +2577,17 @@ void GeneralSimulationParameters::set_values(tinyxml2::XMLElement* xml_element,
     std::string name = std::string(item->Value());
 
     if (name == include_xml.name()) {
-      auto value = require_xml_text(item, SVMP_HERE,
-          "GeneralSimulationParameters Include_xml requires a file name.");
+      auto value = require_xml_text(item, "GeneralSimulationParameters Include_xml requires a file name.");
       IncludeParametersFile include_parameters(value);
       set_values(include_parameters.root_element, true);
 
     } else {
-      auto value = require_xml_text(item, SVMP_HERE,
-          "GeneralSimulationParameters XML element '" + name + "' requires a value.");
+      auto value = require_xml_text(item, "GeneralSimulationParameters XML element '" + name + "' requires a value.");
 
       try {
         set_parameter_value(name, value);
       } catch (const std::bad_function_call& exception) {
-        svmp::raise<svmp::ParseException>(SVMP_HERE, "Unknown XML GeneralSimulationParameters element '" + name + ".");
+        svmp::raise<svmp::ParseException>("Unknown XML GeneralSimulationParameters element '" + name + ".");
       }
     }
 
@@ -2645,7 +2639,7 @@ void FaceParameters::set_values(tinyxml2::XMLElement* face_elem)
   using namespace tinyxml2;
 
   std::string error_msg = "Unknown " + xml_element_name_ + " XML element '"; 
-  const char* face_name = require_xml_attribute(face_elem, "name", SVMP_HERE);
+  const char* face_name = require_xml_attribute(face_elem, "name");
   name.set(std::string(face_name));
   auto item = face_elem->FirstChildElement();
 
@@ -2654,13 +2648,13 @@ void FaceParameters::set_values(tinyxml2::XMLElement* face_elem)
     auto value = item->GetText();
 
     if (value == nullptr) { 
-      svmp::raise<svmp::ParseException>(SVMP_HERE, error_msg + name + "'.");
+      svmp::raise<svmp::ParseException>(error_msg + name + "'.");
     }
 
     try {
       set_parameter_value(name, value);
     } catch (const std::bad_function_call& exception) {
-      svmp::raise<svmp::ParseException>(SVMP_HERE, error_msg + name + "'.");
+      svmp::raise<svmp::ParseException>(error_msg + name + "'.");
     }
 
     item = item->NextSiblingElement();
@@ -2703,7 +2697,7 @@ void RemesherParameters::set_values(tinyxml2::XMLElement* xml_elem)
   std::string error_msg = "Unknown " + xml_element_name + " XML element '";
 
   // Get the 'type' from the <Remesher type=TYPE> element.
-  const char* stype = require_xml_attribute(xml_elem, "type", SVMP_HERE);
+  const char* stype = require_xml_attribute(xml_elem, "type");
   type.set(std::string(stype));
   values_set_ = true;
 
@@ -2717,15 +2711,15 @@ void RemesherParameters::set_values(tinyxml2::XMLElement* xml_elem)
     if (name == "Max_edge_size") {
       const char* name;
       const char* value;
-      name = require_xml_attribute(item, "name", SVMP_HERE);
-      value = require_xml_attribute(item, "value", SVMP_HERE);
+      name = require_xml_attribute(item, "name");
+      value = require_xml_attribute(item, "value");
       auto svalue = std::string(value);
 
       try {
         double dvalue = std::stod(svalue);
         max_edge_sizes_[std::string(name)] = dvalue;
       } catch (...) {
-        svmp::raise<svmp::ParseException>(SVMP_HERE, "VALUE=" + svalue +
+        svmp::raise<svmp::ParseException>("VALUE=" + svalue +
             " is not a valid float in the XML Remesher <Max_edge_size name=NAME  value=VALUE> element.");
       }
 
@@ -2734,11 +2728,11 @@ void RemesherParameters::set_values(tinyxml2::XMLElement* xml_elem)
       try {
         set_parameter_value(name, value);
       } catch (const std::bad_function_call& exception) {
-        svmp::raise<svmp::ParseException>(SVMP_HERE, error_msg + name + "'.");
+        svmp::raise<svmp::ParseException>(error_msg + name + "'.");
       }
 
     } else {
-      svmp::raise<svmp::ParseException>(SVMP_HERE, error_msg + name + "'.");
+      svmp::raise<svmp::ParseException>(error_msg + name + "'.");
     }
 
     item = item->NextSiblingElement();
@@ -2825,15 +2819,13 @@ void MeshParameters::set_values(tinyxml2::XMLElement* mesh_elem, bool from_exter
     // them as a list of VectorParameter<double>. 
     //
     } else if (name == "Fiber_direction") {
-      auto value = require_xml_text(item, SVMP_HERE,
-          "Mesh Fiber_direction XML element requires a value.");
+      auto value = require_xml_text(item, "Mesh Fiber_direction XML element requires a value.");
       VectorParameter<double> dir("Fiber_direction", {}, false, {});
       dir.set(value);
       fiber_directions.push_back(dir);
 
     } else if (name == include_xml.name()) {
-      auto value = require_xml_text(item, SVMP_HERE,
-          "Mesh Include_xml requires a file name.");
+      auto value = require_xml_text(item, "Mesh Include_xml requires a file name.");
       IncludeParametersFile include_parameters(value);
       set_values(include_parameters.root_element, true);
 
@@ -2843,10 +2835,10 @@ void MeshParameters::set_values(tinyxml2::XMLElement* mesh_elem, bool from_exter
       try {
         set_parameter_value(name, value);
       } catch (const std::bad_function_call& exception) {
-        svmp::raise<svmp::ParseException>(SVMP_HERE, error_msg + name + "'.");
+        svmp::raise<svmp::ParseException>(error_msg + name + "'.");
       }
     } else {
-      svmp::raise<svmp::ParseException>(SVMP_HERE, error_msg + name + "'.");
+      svmp::raise<svmp::ParseException>(error_msg + name + "'.");
     }
 
     item = item->NextSiblingElement();
@@ -2914,7 +2906,7 @@ void ProjectionParameters::set_values(tinyxml2::XMLElement* xml_elem)
   std::string error_msg = "Unknown " + xml_element_name_ + " XML element '";
 
   // Get the 'type' from the <Add_projection name=NAME> element.
-  const char* sname = require_xml_attribute(xml_elem, "name", SVMP_HERE);
+  const char* sname = require_xml_attribute(xml_elem, "name");
   name.set(std::string(sname));
 
   using std::placeholders::_1;
@@ -2951,7 +2943,7 @@ void RISProjectionParameters::set_values(tinyxml2::XMLElement* xml_elem)
   std::string error_msg = "Unknown " + xml_element_name_ + " XML element '";
 
   // Get the 'type' from the <Add_RIS_projection name=NAME> element.
-  const char* sname = require_xml_attribute(xml_elem, "name", SVMP_HERE);
+  const char* sname = require_xml_attribute(xml_elem, "name");
   name.set(std::string(sname));
 
   using std::placeholders::_1;
@@ -3030,10 +3022,10 @@ void URISMeshParameters::set_values(tinyxml2::XMLElement* mesh_elem)
       try {
         set_parameter_value(name, value);
       } catch (const std::bad_function_call& exception) {
-        svmp::raise<svmp::ParseException>(SVMP_HERE, error_msg + name + "'.");
+        svmp::raise<svmp::ParseException>(error_msg + name + "'.");
       }
     } else {
-      svmp::raise<svmp::ParseException>(SVMP_HERE, error_msg + name + "'.");
+      svmp::raise<svmp::ParseException>(error_msg + name + "'.");
     }
 
     item = item->NextSiblingElement();
@@ -3082,7 +3074,7 @@ void URISFaceParameters::set_values(tinyxml2::XMLElement* face_elem)
   using namespace tinyxml2;
 
   std::string error_msg = "Unknown " + xml_element_name_ + " XML element '"; 
-  const char* face_name = require_xml_attribute(face_elem, "name", SVMP_HERE);
+  const char* face_name = require_xml_attribute(face_elem, "name");
   name.set(std::string(face_name));
   auto item = face_elem->FirstChildElement();
 
@@ -3091,13 +3083,13 @@ void URISFaceParameters::set_values(tinyxml2::XMLElement* face_elem)
     auto value = item->GetText();
 
     if (value == nullptr) { 
-      svmp::raise<svmp::ParseException>(SVMP_HERE, error_msg + name + "'.");
+      svmp::raise<svmp::ParseException>(error_msg + name + "'.");
     }
 
     try {
       set_parameter_value(name, value);
     } catch (const std::bad_function_call& exception) {
-      svmp::raise<svmp::ParseException>(SVMP_HERE, error_msg + name + "'.");
+      svmp::raise<svmp::ParseException>(error_msg + name + "'.");
     }
 
     item = item->NextSiblingElement();
@@ -3152,7 +3144,7 @@ void LinearAlgebraParameters::set_values(tinyxml2::XMLElement* xml_elem)
   std::string error_msg = "Unknown " + xml_element_name + " XML element '";
 
   // Get the 'type' from the <Linear_algebra type=TYPE> element.
-  const char* stype = require_xml_attribute(xml_elem, "type", SVMP_HERE);
+  const char* stype = require_xml_attribute(xml_elem, "type");
   type.set(std::string(stype));
 
   // Check Linear_algebra type=TYPE> element.
@@ -3163,7 +3155,7 @@ void LinearAlgebraParameters::set_values(tinyxml2::XMLElement* xml_elem)
     std::string valid_types = "";
     std::for_each(LinearAlgebra::name_to_type.begin(), LinearAlgebra::name_to_type.end(), 
         [&valid_types](std::pair<const std::string, const consts::LinearAlgebraType> p) {valid_types += p.first+" ";}); 
-    svmp::raise<svmp::ParseException>(SVMP_HERE, "Unknown TYPE '" + type.value() +
+    svmp::raise<svmp::ParseException>("Unknown TYPE '" + type.value() +
         "' given in the XML <Linear_algebra type=TYPE> element.\nValid types are: " + valid_types);
   }
 
@@ -3182,7 +3174,7 @@ void LinearAlgebraParameters::set_values(tinyxml2::XMLElement* xml_elem)
     std::string valid_types = "";
     std::for_each(consts::preconditioner_name_to_type.begin(), consts::preconditioner_name_to_type.end(),
         [&valid_types](std::pair<const std::string, const consts::PreconditionerType> p) {valid_types += p.first+" ";});
-    svmp::raise<svmp::ParseException>(SVMP_HERE, "Unknown TYPE '" + preconditioner() +
+    svmp::raise<svmp::ParseException>("Unknown TYPE '" + preconditioner() +
         "' given in the XML <Linear_algebra> <Preconditioner> element.\nValid types are: " + valid_types);
   }     
 
@@ -3195,21 +3187,20 @@ void LinearAlgebraParameters::set_values(tinyxml2::XMLElement* xml_elem)
 void LinearAlgebraParameters::check_input_parameters()
 {
   auto linear_algebra_type = require_map_value(LinearAlgebra::name_to_type, type(),
-      SVMP_HERE, "Unknown TYPE '" + type() +
+      "Unknown TYPE '" + type() +
       "' given in the XML <Linear_algebra type=TYPE> element.");
   auto prec_cond_type = require_map_value(consts::preconditioner_name_to_type,
-      preconditioner.value(), SVMP_HERE, "Unknown TYPE '" + preconditioner() +
+      preconditioner.value(), "Unknown TYPE '" + preconditioner() +
       "' given in the XML <Linear_algebra> <Preconditioner> element.");
   auto assembly_type = require_map_value(LinearAlgebra::name_to_type, assembly.value(),
-      SVMP_HERE, "Unknown TYPE '" + assembly() +
+      "Unknown TYPE '" + assembly() +
       "' given in the XML <Linear_algebra> <Assembly> element.");
 
   LinearAlgebra* linear_algebra = nullptr;
   try {
     linear_algebra = LinearAlgebraFactory::create_interface(linear_algebra_type);
     if (linear_algebra == nullptr) {
-      svmp::raise<svmp::ParseException>(SVMP_HERE,
-          "Linear_algebra type '" + type() + "' cannot be used as a solver backend.");
+      svmp::raise<svmp::ParseException>("Linear_algebra type '" + type() + "' cannot be used as a solver backend.");
     }
     linear_algebra->check_options(prec_cond_type, assembly_type);
     delete linear_algebra;
@@ -3218,7 +3209,7 @@ void LinearAlgebraParameters::check_input_parameters()
     throw;
   } catch (const std::exception& exception) {
     delete linear_algebra;
-    svmp::raise<svmp::ParseException>(SVMP_HERE, exception.what());
+    svmp::raise<svmp::ParseException>(exception.what());
   }
 }
 
@@ -3276,7 +3267,7 @@ void LinearSolverParameters::set_values(tinyxml2::XMLElement* xml_elem)
   std::string error_msg = "Unknown " + xml_element_name + " XML element '";
 
   // Get the 'type' from the <LS type=TYPE> element.
-  const char* stype = require_xml_attribute(xml_elem, "type", SVMP_HERE);
+  const char* stype = require_xml_attribute(xml_elem, "type");
 
   type.set(std::string(stype));
 
diff --git a/Code/Source/solver/Parameters.h b/Code/Source/solver/Parameters.h
index de383e80f..9f8ab4749 100644
--- a/Code/Source/solver/Parameters.h
+++ b/Code/Source/solver/Parameters.h
@@ -140,7 +140,7 @@ class Parameter
       if (!(str_stream >> value_)) {
         std::istringstream str_stream(str);
         if (!(str_stream >> std::boolalpha >> value_)) {
-          svmp::raise<svmp::ParseException>(SVMP_HERE, "Incorrect value '" + str + "' for '" + name_ + "'.");
+          svmp::raise<svmp::ParseException>("Incorrect value '" + str + "' for '" + name_ + "'.");
         }
       }
 
@@ -342,7 +342,7 @@ class ParameterLists
     void set_parameter_value_CANN(const std::string& name, const std::string& value) 
     {
       if (params_map.count(name) == 0) {
-        svmp::raise<svmp::ParseException>(SVMP_HERE, "Unknown " + xml_element_name + " XML element '" + name + "'.");
+        svmp::raise<svmp::ParseException>("Unknown " + xml_element_name + " XML element '" + name + "'.");
       }
 
       auto& param_variant = params_map[name];
@@ -353,7 +353,7 @@ class ParameterLists
           (*vec_param)->value_.clear();  // Clear the vector before setting
           (*vec_param)->set(value);  // Set the new value
         } else {
-          svmp::raise<svmp::ParseException>(SVMP_HERE, "Activation_functions is not a VectorParameter<int>.");
+          svmp::raise<svmp::ParseException>("Activation_functions is not a VectorParameter<int>.");
         }
       }
       // Check for Weights
@@ -362,7 +362,7 @@ class ParameterLists
           (*vec_param)->value_.clear();  // Clear the vector before setting
           (*vec_param)->set(value);  // Set the new value
         } else {
-          svmp::raise<svmp::ParseException>(SVMP_HERE, "Weights is not a VectorParameter<double>.");
+          svmp::raise<svmp::ParseException>("Weights is not a VectorParameter<double>.");
         }
       }
       // Default: everything else
@@ -379,7 +379,7 @@ class ParameterLists
     void set_parameter_value(const std::string& name, const std::string& value) 
     {
       if (params_map.count(name) == 0) {
-        svmp::raise<svmp::ParseException>(SVMP_HERE, "Unknown " + xml_element_name + " XML element '" + name + "'.");
+        svmp::raise<svmp::ParseException>("Unknown " + xml_element_name + " XML element '" + name + "'.");
       }
 
       std::visit([value](auto&& p) { p->set(value); }, params_map[name]);
@@ -394,7 +394,7 @@ class ParameterLists
         if (std::visit([](auto&& p) {
           return !p->check_required_set();
         }, param)) { 
-          svmp::raise<svmp::ParseException>(SVMP_HERE, xml_element_name + " XML element '" + key + "' has not been set.");
+          svmp::raise<svmp::ParseException>(xml_element_name + " XML element '" + key + "' has not been set.");
         }
       }
     }
diff --git a/Code/Source/solver/cep_ion.cpp b/Code/Source/solver/cep_ion.cpp
index 8c91a54fd..82ad5303e 100644
--- a/Code/Source/solver/cep_ion.cpp
+++ b/Code/Source/solver/cep_ion.cpp
@@ -331,7 +331,7 @@ void cep_integ_l(CepMod &cep_mod, cepModelType &cep, Vector<double> &X,
   #endif
 
   svmp::check_not_null<svmp::FE::NotInitializedException>(
-      cep.ionic_model, SVMP_HERE, "ionic model was not constructed.");
+      cep.ionic_model, "ionic model was not constructed.");
 
   const double eps = std::numeric_limits<double>::epsilon();
 
diff --git a/Code/Source/solver/fs.cpp b/Code/Source/solver/fs.cpp
index 81cf91466..dce27174d 100644
--- a/Code/Source/solver/fs.cpp
+++ b/Code/Source/solver/fs.cpp
@@ -378,7 +378,7 @@ void set_thood_fs(fsType& fs, consts::ElementType eType)
 
     default:
       svmp::raise<fe::InvalidElementException>(
-          SVMP_HERE, "Cannot choose Taylor-Hood basis", element_name(eType));
+          "Cannot choose Taylor-Hood basis", element_name(eType));
   }
 }
 
diff --git a/Code/Source/solver/ionic_model.cpp b/Code/Source/solver/ionic_model.cpp
index df7d9c25e..bc33d0dd6 100644
--- a/Code/Source/solver/ionic_model.cpp
+++ b/Code/Source/solver/ionic_model.cpp
@@ -41,7 +41,7 @@ void IonicModel::distribute_parameters(const CmMod &cm_mod, const cmType &cm) {
 void IonicModel::init(Vector<double> &X, Vector<double> &Xg) const {
   if (initial_X.size() != X.size()) {
     svmp::raise<svmp::FE::InvalidArgumentException>(
-        SVMP_HERE, "Initial conditions size for X does not match vector size.");
+        "Initial conditions size for X does not match vector size.");
   }
 
   for (size_t i = 0; i < initial_X.size(); ++i)
@@ -49,7 +49,6 @@ void IonicModel::init(Vector<double> &X, Vector<double> &Xg) const {
 
   if (initial_Xg.size() != Xg.size()) {
     svmp::raise<svmp::FE::InvalidArgumentException>(
-        SVMP_HERE,
         "Initial conditions size for Xg does not match vector size.");
   }
 
@@ -77,7 +76,6 @@ void IonicModel::integ(const odeType &ode_solver_params, const int zone_id,
 
   default:
     svmp::raise<svmp::FE::InvalidArgumentException>(
-        SVMP_HERE,
         "Unknown time integration type: " +
             std::to_string(static_cast<int>(ode_solver_params.tIntType)));
   }
@@ -264,7 +262,7 @@ IonicModelFactory::create_model(const std::string &name) {
   auto iter = factory_instance.children.find(name);
   if (iter == factory_instance.children.end()) {
     svmp::raise<svmp::FE::InvalidArgumentException>(
-        SVMP_HERE, "No model with name '" + name +
+        "No model with name '" + name +
                        "' was registered in the ionic model factory.");
   }
 
diff --git a/Code/Source/solver/ionic_model.h b/Code/Source/solver/ionic_model.h
index 52526a24b..22c1332fb 100644
--- a/Code/Source/solver/ionic_model.h
+++ b/Code/Source/solver/ionic_model.h
@@ -405,7 +405,7 @@ class IonicModel {
                              const Vector<double> &X, const Vector<double> &Xg,
                              const double Ksac) const {
     svmp::raise<svmp::FE::NotImplementedException>(
-        SVMP_HERE, "getj method not implemented for this ionic model.");
+        "getj method not implemented for this ionic model.");
 
     // Dummy return statement to avoid compiler warnings.
     Array<double> dummy(X.size(), X.size());
@@ -476,7 +476,6 @@ class IonicModelFactory {
     if (factory_instance.children.find(name) !=
         factory_instance.children.end()) {
       svmp::raise<svmp::FE::InvalidArgumentException>(
-          SVMP_HERE,
           "A model with name '" + name +
               "' was already registered in the ionic model factory.");
     }
diff --git a/Code/Source/solver/mat_fun.h b/Code/Source/solver/mat_fun.h
index 598aa1827..22b03896e 100644
--- a/Code/Source/solver/mat_fun.h
+++ b/Code/Source/solver/mat_fun.h
@@ -53,7 +53,7 @@ namespace mat_fun {
           auto mat_dims = (std::stringstream() << "(" << mat.rows()  << "x" << mat.cols() << ")").str();
           auto dest_dims = (std::stringstream() << "(" << dest.nrows()  << "x" << dest.ncols() << ")").str();
           svmp::raise<svmp::FE::InvalidArgumentException>(
-              SVMP_HERE, "The 'mat" + mat_dims + "' and 'dest" + dest_dims +
+              "The 'mat" + mat_dims + "' and 'dest" + dest_dims +
               "' arrays have incompatible sizes.");
         }
 
diff --git a/Code/Source/solver/nn.cpp b/Code/Source/solver/nn.cpp
index 7da0deea6..54c2d973c 100644
--- a/Code/Source/solver/nn.cpp
+++ b/Code/Source/solver/nn.cpp
@@ -102,8 +102,7 @@ std::optional<fe::ElementType> to_fe_element_type(consts::ElementType eType)
     // deliberately unsupported type; fail loudly instead of relying on the
     // unhandled-enum compiler warning being enabled.
     default:
-      svmp::raise<febasis::BasisElementCompatibilityException>(SVMP_HERE,
-          "to_fe_element_type: unhandled solver element type " +
+      svmp::raise<febasis::BasisElementCompatibilityException>("to_fe_element_type: unhandled solver element type " +
               std::to_string(static_cast<int>(eType)));
   }
 }
@@ -143,8 +142,7 @@ const febasis::BasisFunction& basis_for_solver_element(consts::ElementType eType
 
   const auto fe_type = to_fe_element_type(eType);
   if (!fe_type) {
-    svmp::raise<febasis::BasisElementCompatibilityException>(SVMP_HERE,
-        "No FE Basis selection for solver element " + solver_element_name(eType));
+    svmp::raise<febasis::BasisElementCompatibilityException>("No FE Basis selection for solver element " + solver_element_name(eType));
   }
 
   const std::lock_guard<std::mutex> lock(cache_mutex);
@@ -198,8 +196,7 @@ std::span<const std::size_t> solver_to_basis_node_map(consts::ElementType eType)
 std::size_t basis_index_for_solver_node(consts::ElementType eType, const int solver_node)
 {
   if (solver_node < 0) {
-    svmp::raise<febasis::BasisNodeOrderingException>(SVMP_HERE,
-        "Solver node " + std::to_string(solver_node) +
+    svmp::raise<febasis::BasisNodeOrderingException>("Solver node " + std::to_string(solver_node) +
             " is outside node map for " + solver_element_name(eType));
   }
 
@@ -211,8 +208,7 @@ std::size_t basis_index_for_solver_node(consts::ElementType eType, const int sol
   if (node < map.size()) {
     return map[node];
   }
-  svmp::raise<febasis::BasisNodeOrderingException>(SVMP_HERE,
-      "Solver node " + std::to_string(solver_node) +
+  svmp::raise<febasis::BasisNodeOrderingException>("Solver node " + std::to_string(solver_node) +
           " is outside node map for " + solver_element_name(eType));
 }
 
@@ -225,8 +221,7 @@ fe::math::Vector<double, 3> make_basis_point(const febasis::BasisFunction& basis
                                                const Array<double>& xi)
 {
   if (xi.nrows() < basis.dimension()) {
-    svmp::raise<febasis::BasisConfigurationException>(SVMP_HERE,
-        "xi has " + std::to_string(xi.nrows()) +
+    svmp::raise<febasis::BasisConfigurationException>("xi has " + std::to_string(xi.nrows()) +
             " rows but FE Basis element requires " + std::to_string(basis.dimension()) +
             " reference coordinates");
   }
@@ -253,21 +248,18 @@ void copy_basis_values_to_solver_arrays(consts::ElementType eType,
                                         Array3<double>& Nx)
 {
   if (values.size() != static_cast<std::size_t>(eNoN)) {
-    svmp::raise<febasis::BasisEvaluationException>(SVMP_HERE,
-        "FE Basis value count " + std::to_string(values.size()) +
+    svmp::raise<febasis::BasisEvaluationException>("FE Basis value count " + std::to_string(values.size()) +
             " does not match solver eNoN " + std::to_string(eNoN));
   }
   if (gradients.size() != static_cast<std::size_t>(eNoN)) {
-    svmp::raise<febasis::BasisEvaluationException>(SVMP_HERE,
-        "FE Basis gradient count " + std::to_string(gradients.size()) +
+    svmp::raise<febasis::BasisEvaluationException>("FE Basis gradient count " + std::to_string(gradients.size()) +
             " does not match solver eNoN " + std::to_string(eNoN));
   }
 
   for (int a = 0; a < eNoN; ++a) {
     const auto basis_index = basis_index_for_solver_node(eType, a);
     if (basis_index >= values.size() || basis_index >= gradients.size()) {
-      svmp::raise<febasis::BasisNodeOrderingException>(SVMP_HERE,
-          "Solver node " + std::to_string(a) + " maps to FE Basis node " +
+      svmp::raise<febasis::BasisNodeOrderingException>("Solver node " + std::to_string(a) + " maps to FE Basis node " +
               std::to_string(basis_index) + " outside basis output for " +
               solver_element_name(eType));
     }
@@ -297,8 +289,7 @@ void evaluate_basis_values_and_gradients(const int insd,
 {
   const auto& basis = basis_for_solver_element(eType);
   if (insd < basis.dimension()) {
-    svmp::raise<febasis::BasisConfigurationException>(SVMP_HERE,
-        "solver insd " + std::to_string(insd) +
+    svmp::raise<febasis::BasisConfigurationException>("solver insd " + std::to_string(insd) +
             " is smaller than FE Basis reference dimension " + std::to_string(basis.dimension()));
   }
 
@@ -339,8 +330,7 @@ int required_nxx_components_for_dimension(const int dimension)
     case 3:
       return 6;
     default:
-      svmp::raise<febasis::BasisConfigurationException>(SVMP_HERE,
-          "Unsupported FE Basis reference dimension " + std::to_string(dimension));
+      svmp::raise<febasis::BasisConfigurationException>("Unsupported FE Basis reference dimension " + std::to_string(dimension));
   }
 }
 
@@ -356,15 +346,13 @@ void copy_basis_hessians_to_solver_nxx(consts::ElementType eType,
                                        Array3<double>& Nxx)
 {
   if (hessians.size() != static_cast<std::size_t>(eNoN)) {
-    svmp::raise<febasis::BasisEvaluationException>(SVMP_HERE,
-        "FE Basis Hessian count " + std::to_string(hessians.size()) +
+    svmp::raise<febasis::BasisEvaluationException>("FE Basis Hessian count " + std::to_string(hessians.size()) +
             " does not match solver eNoN " + std::to_string(eNoN));
   }
 
   const int required_components = required_nxx_components_for_dimension(dimension);
   if (Nxx.nrows() < required_components) {
-    svmp::raise<febasis::BasisConfigurationException>(SVMP_HERE,
-        "solver Nxx has " + std::to_string(Nxx.nrows()) +
+    svmp::raise<febasis::BasisConfigurationException>("solver Nxx has " + std::to_string(Nxx.nrows()) +
             " rows but FE Basis Hessian packing requires " + std::to_string(required_components));
   }
 
@@ -375,8 +363,7 @@ void copy_basis_hessians_to_solver_nxx(consts::ElementType eType,
 
     const auto basis_index = basis_index_for_solver_node(eType, a);
     if (basis_index >= hessians.size()) {
-      svmp::raise<febasis::BasisNodeOrderingException>(SVMP_HERE,
-          "Solver node " + std::to_string(a) + " maps to FE Basis Hessian node " +
+      svmp::raise<febasis::BasisNodeOrderingException>("Solver node " + std::to_string(a) + " maps to FE Basis Hessian node " +
               std::to_string(basis_index) + " outside basis output for " +
               solver_element_name(eType));
     }
@@ -409,15 +396,13 @@ void evaluate_basis_hessians(const int insd,
 {
   const auto& basis = basis_for_solver_element(eType);
   if (insd < basis.dimension()) {
-    svmp::raise<febasis::BasisConfigurationException>(SVMP_HERE,
-        "solver insd " + std::to_string(insd) +
+    svmp::raise<febasis::BasisConfigurationException>("solver insd " + std::to_string(insd) +
             " is smaller than FE Basis reference dimension " + std::to_string(basis.dimension()));
   }
 
   const int required_components = required_nxx_components_for_dimension(basis.dimension());
   if (ind2 < required_components) {
-    svmp::raise<febasis::BasisConfigurationException>(SVMP_HERE,
-        "solver ind2 " + std::to_string(ind2) +
+    svmp::raise<febasis::BasisConfigurationException>("solver ind2 " + std::to_string(ind2) +
             " is smaller than packed Hessian component count " + std::to_string(required_components));
   }
 
@@ -448,8 +433,7 @@ void get_gip(const int insd, consts::ElementType eType, const int nG, Vector<dou
   try {
     get_element_gauss_int_data[eType](insd, nG, w, xi);
   } catch (const std::bad_function_call& exception) {
-    svmp::raise<fe::InvalidElementException>(SVMP_HERE,
-        "No support in 'get_element_gauss_int_data'",
+    svmp::raise<fe::InvalidElementException>("No support in 'get_element_gauss_int_data'",
         solver_element_name(eType));
   }
 }
@@ -463,8 +447,7 @@ void get_gip(mshType& mesh)
   try {
     set_element_gauss_int_data[mesh.eType](mesh);
   } catch (const std::bad_function_call& exception) {
-    svmp::raise<fe::InvalidElementException>(SVMP_HERE,
-        "No support in 'set_element_gauss_int_data'",
+    svmp::raise<fe::InvalidElementException>("No support in 'set_element_gauss_int_data'",
         solver_element_name(mesh.eType));
   }
 }
@@ -474,8 +457,7 @@ void get_gip(Simulation* simulation, faceType& face)
   try {
     set_face_gauss_int_data[face.eType](face);
   } catch (const std::bad_function_call& exception) {
-    svmp::raise<fe::InvalidElementException>(SVMP_HERE,
-        "No support in 'set_face_gauss_int_data'",
+    svmp::raise<fe::InvalidElementException>("No support in 'set_face_gauss_int_data'",
         solver_element_name(face.eType));
   }
 }
@@ -486,8 +468,7 @@ void get_gnn(const int insd, consts::ElementType eType, const int eNoN, const in
     Array<double>& N, Array3<double>& Nx)
 {
   if (!to_fe_element_type(eType).has_value()) {
-    svmp::raise<febasis::BasisElementCompatibilityException>(SVMP_HERE,
-        "[get_gnn] FE Basis does not support solver element " + solver_element_name(eType));
+    svmp::raise<febasis::BasisElementCompatibilityException>("[get_gnn] FE Basis does not support solver element " + solver_element_name(eType));
   }
 
   evaluate_basis_values_and_gradients(insd, eType, eNoN, g, xi, N, Nx);
@@ -520,8 +501,7 @@ void get_gnn(Simulation* simulation, int gaus_pt, faceType& face)
 {
   using consts::ElementType;
 
-  svmp::throw_if<fe::NotImplementedException>(face.eType == ElementType::NRB, SVMP_HERE,
-      "[get_gnn(face)] NRB face shape functions are unsupported by FE Basis");
+  svmp::throw_if<fe::NotImplementedException>(face.eType == ElementType::NRB, "[get_gnn(face)] NRB face shape functions are unsupported by FE Basis");
 
   if (face.eType == ElementType::PNT) {
     set_point_face_shape_data(gaus_pt, face);
@@ -534,8 +514,7 @@ void get_gnn(Simulation* simulation, int gaus_pt, faceType& face)
     return;
   }
 
-  svmp::raise<febasis::BasisElementCompatibilityException>(SVMP_HERE,
-      "[get_gnn(face)] FE Basis does not support face element " + solver_element_name(face.eType));
+  svmp::raise<febasis::BasisElementCompatibilityException>("[get_gnn(face)] FE Basis does not support face element " + solver_element_name(face.eType));
 }
 
 /// @brief Returns second order derivatives at given natural coords.
@@ -553,8 +532,7 @@ void get_gn_nxx(const int insd, const int ind2, consts::ElementType eType, const
   }
 
   if (!to_fe_element_type(eType).has_value()) {
-    svmp::raise<febasis::BasisElementCompatibilityException>(SVMP_HERE,
-        "[get_gn_nxx] FE Basis Hessian evaluation does not support solver element " +
+    svmp::raise<febasis::BasisElementCompatibilityException>("[get_gn_nxx] FE Basis Hessian evaluation does not support solver element " +
             solver_element_name(eType));
   }
 
@@ -742,8 +720,7 @@ void get_nnx(const int nsd, const consts::ElementType eType, const int eNoN, con
 
   l1 = (l1 && l2 && l3 && l4);
 
-  svmp::throw_if<fe::InvalidArgumentException>(!l1, SVMP_HERE,
-      "Error in computing shape functions");
+  svmp::throw_if<fe::InvalidArgumentException>(!l1, "Error in computing shape functions");
 }
 
 /// @brief Inverse maps {xp} to {$\xi$} in an element with coordinates {xl} using Newton's method
@@ -991,8 +968,7 @@ void gnnb(const ComMod& com_mod, const faceType& lFa, const int e, const int g,
     }
 
     if (!found_node) {
-      svmp::raise<fe::InvalidArgumentException>(SVMP_HERE,
-          "[svMultiPhysics::gnnb] ERROR: The '" + lFa.name + "' face node " +
+      svmp::raise<fe::InvalidArgumentException>("[svMultiPhysics::gnnb] ERROR: The '" + lFa.name + "' face node " +
               std::to_string(Ac) + " could not be matched to a node in the '" +
               msh.name + "' volume mesh.");
     }
@@ -1043,8 +1019,7 @@ void gnnb(const ComMod& com_mod, const faceType& lFa, const int e, const int g,
           }
           break;
         default:
-          svmp::raise<fe::InvalidArgumentException>(SVMP_HERE,
-              "gnnb: invalid MechanicalConfigurationType provided");
+          svmp::raise<fe::InvalidArgumentException>("gnnb: invalid MechanicalConfigurationType provided");
       }
     }
   }
@@ -1232,8 +1207,7 @@ void gn_nxx(const int l, const int eNoN, const int nsd, const int insd, Array<do
 
     dgesv_(&l, &eNoN, K.data(), &l, IPIV.data(), B.data(), &l, &INFO);
 
-    svmp::throw_if<fe::BackendException>(INFO != 0, SVMP_HERE,
-        "[gn_nxx] Error in Lapack", "LAPACK dgesv", INFO);
+    svmp::throw_if<fe::BackendException>(INFO != 0, "[gn_nxx] Error in Lapack", "LAPACK dgesv", INFO);
 
     Nxx = B;
 
@@ -1302,8 +1276,7 @@ void gn_nxx(const int l, const int eNoN, const int nsd, const int insd, Array<do
 
     dgesv_(&l, &eNoN, K.data(), &l, IPIV.data(), B.data(), &l, &INFO);
 
-    svmp::throw_if<fe::BackendException>(INFO != 0, SVMP_HERE,
-        "[gn_nxx] Error in Lapack", "LAPACK dgesv", INFO);
+    svmp::throw_if<fe::BackendException>(INFO != 0, "[gn_nxx] Error in Lapack", "LAPACK dgesv", INFO);
 
     Nxx = B;
   }
@@ -1350,8 +1323,7 @@ void select_ele(const ComMod& com_mod, mshType& mesh)
       set_1d_element_props[mesh.eNoN](insd, mesh);
     }
   } catch (const std::bad_function_call& exception) {
-      svmp::raise<fe::InvalidElementException>(SVMP_HERE,
-          "[select_ele] No support for " + std::to_string(mesh.eNoN) +
+      svmp::raise<fe::InvalidElementException>("[select_ele] No support for " + std::to_string(mesh.eNoN) +
               " noded " + std::to_string(insd) + "D elements.",
           solver_element_name(mesh.eType));
   }
@@ -1409,8 +1381,7 @@ void select_eleb(Simulation* simulation, mshType& mesh, faceType& face)
   try {
     set_face_element_props[face.eNoN](insd, face);
   } catch (const std::bad_function_call& exception) {
-    svmp::raise<fe::InvalidElementException>(SVMP_HERE,
-        "No support for " + std::to_string(face.eNoN) + " noded " +
+    svmp::raise<fe::InvalidElementException>("No support for " + std::to_string(face.eNoN) + " noded " +
             std::to_string(insd) + "D elements in 'set_face_element_props'.",
         solver_element_name(face.eType));
   }
diff --git a/Code/Source/solver/post.cpp b/Code/Source/solver/post.cpp
index 84b2c23c8..b53e85745 100644
--- a/Code/Source/solver/post.cpp
+++ b/Code/Source/solver/post.cpp
@@ -805,13 +805,11 @@ void fib_stretch_rate(const ComMod& com_mod, const int iEq, const mshType& lM, c
 
   if (dt <= 0.0) {
     svmp::raise<svmp::FE::InvalidArgumentException>(
-        SVMP_HERE,
         "[fib_stretch_rate] Expected com_mod.dt > 0, but got " + std::to_string(dt) + ".");
   }
 
   if (res.size() != nNo) {
     svmp::raise<svmp::FE::InvalidArgumentException>(
-        SVMP_HERE,
         "[fib_stretch_rate] Expected res size " + std::to_string(nNo) + ", but got " + std::to_string(res.size()) + ".");
   }
 
diff --git a/Code/Source/solver/read_files.cpp b/Code/Source/solver/read_files.cpp
index 25f88cdd4..e29b0a649 100644
--- a/Code/Source/solver/read_files.cpp
+++ b/Code/Source/solver/read_files.cpp
@@ -203,7 +203,7 @@ void read_bc(Simulation* simulation, EquationParameters* eq_params, eqType& lEq,
     if (effective_direction.size() != com_mod.nsd) {
       auto effective_size = (std::stringstream() << "(" << effective_direction.size() << ")").str();
       auto space_dim = (std::stringstream() << "(" << com_mod.nsd << ")").str();
-      svmp::raise<svmp::ParseException>(SVMP_HERE, "The size of the effective direction " + effective_size + 
+      svmp::raise<svmp::ParseException>("The size of the effective direction " + effective_size + 
           " does not equal the number of space dimensions " + space_dim); 
     }
 
@@ -2380,7 +2380,7 @@ void read_outputs(Simulation* simulation, EquationParameters* eq_params, eqType&
           continue;
 
         svmp::check_not_null<svmp::FE::NotInitializedException>(
-            dmn.cep.ionic_model, SVMP_HERE, "ionic model was not constructed.");
+            dmn.cep.ionic_model, "ionic model was not constructed.");
 
         const auto registered_outputs =
             dmn.cep.ionic_model->get_registered_outputs();
diff --git a/Documentation/Doxyfile b/Documentation/Doxyfile
index acd5ba21c..fba8c016a 100644
--- a/Documentation/Doxyfile
+++ b/Documentation/Doxyfile
@@ -267,12 +267,12 @@ PERLMOD_MAKEVAR_PREFIX =
 # Configuration options related to the preprocessor
 #---------------------------------------------------------------------------
 ENABLE_PREPROCESSING   = YES
-MACRO_EXPANSION        = NO
-EXPAND_ONLY_PREDEF     = NO
+MACRO_EXPANSION        = YES
+EXPAND_ONLY_PREDEF     = YES
 SEARCH_INCLUDES        = YES
 INCLUDE_PATH           =
 INCLUDE_FILE_PATTERNS  =
-PREDEFINED             =
+PREDEFINED             = "SVMP_DEFINE_EXCEPTION(Name,Base,Status)=class Name : public Base { }"
 EXPAND_AS_DEFINED      =
 SKIP_FUNCTION_MACROS   = YES
 #---------------------------------------------------------------------------
diff --git a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
index 5df40ddcc..e81a9c377 100644
--- a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
@@ -154,20 +154,23 @@ class SpanPrimitiveBasis : public BasisFunction {
     }
 };
 
-void expect_source_location(const FEException& e)
+void expect_source_location(const svmp::ExceptionBase& e)
 {
     EXPECT_NE(e.context().file().find("test_BasisErrorPaths.cpp"), std::string::npos);
     EXPECT_GT(e.context().line(), 0);
     EXPECT_FALSE(e.context().function().empty());
 }
 
+// The core helpers raise both FE-subsystem exceptions and Core exceptions (for
+// example not_implemented() defaults to svmp::NotImplementedException), so this
+// catches their common base, svmp::ExceptionBase.
 template <class Thrower>
 void expect_core_helper_preserves_source_location(Thrower&& thrower)
 {
     try {
         thrower();
-        FAIL() << "Expected an FEException";
-    } catch (const FEException& e) {
+        FAIL() << "Expected an svmp::ExceptionBase";
+    } catch (const svmp::ExceptionBase& e) {
         expect_source_location(e);
     }
 }
@@ -285,13 +288,13 @@ TEST(BasisErrorPaths, BasisFactoryInvalidRequestsThrowBasisExceptions) {
 
 TEST(BasisErrorPaths, BasisExceptionsUseCommonStatusCodes) {
     try {
-        svmp::raise<BasisConfigurationException>(SVMP_HERE, "invalid config");
+        svmp::raise<BasisConfigurationException>("invalid config");
     } catch (const FEException& e) {
         EXPECT_EQ(e.status(), svmp::StatusCode::InvalidArgument);
     }
 
     try {
-        svmp::raise<BasisConstructionException>(SVMP_HERE, "construction failure");
+        svmp::raise<BasisConstructionException>("construction failure");
     } catch (const FEException& e) {
         EXPECT_EQ(e.status(), svmp::StatusCode::InternalError);
     }
@@ -299,32 +302,32 @@ TEST(BasisErrorPaths, BasisExceptionsUseCommonStatusCodes) {
 
 TEST(BasisErrorPaths, CoreHelpersPreserveSourceLocation) {
     expect_core_helper_preserves_source_location([] {
-        svmp::raise<BasisEvaluationException>(SVMP_HERE, "raise location");
+        svmp::raise<BasisEvaluationException>("raise location");
     });
 
     expect_core_helper_preserves_source_location([] {
         svmp::throw_if<BasisEvaluationException>(
-            true, SVMP_HERE, "throw_if location");
+            true, "throw_if location");
     });
 
     expect_core_helper_preserves_source_location([] {
-        svmp::check_arg<BasisEvaluationException>(
-            false, SVMP_HERE, "check_arg location");
+        svmp::check<BasisEvaluationException>(
+            false, "check location");
     });
 
     expect_core_helper_preserves_source_location([] {
         const int* ptr = nullptr;
         svmp::check_not_null<BasisEvaluationException>(
-            ptr, SVMP_HERE, "check_not_null location");
+            ptr, "check_not_null location");
     });
 
     expect_core_helper_preserves_source_location([] {
-        svmp::check_index<BasisEvaluationException>(1, 1, SVMP_HERE);
+        svmp::check_index<BasisEvaluationException>(1, 1);
     });
 
     expect_core_helper_preserves_source_location([] {
         svmp::not_implemented<NotImplementedException>(
-            "test feature", SVMP_HERE);
+            "test feature");
     });
 }