diff --git a/Code/Source/solver/CMakeLists.txt b/Code/Source/solver/CMakeLists.txt
index c546c2822..64608b6b1 100644
--- a/Code/Source/solver/CMakeLists.txt
+++ b/Code/Source/solver/CMakeLists.txt
@@ -178,6 +178,8 @@ set(CSRCS
   eq_assem.h eq_assem.cpp
   fluid.h fluid.cpp
   fsi.h fsi.cpp
+  fsi_coupling.h fsi_coupling.cpp
+  PartitionedFSI.h PartitionedFSI.cpp
   fs.h fs.cpp
   fft.h fft.cpp
   heatf.h heatf.cpp
@@ -357,7 +359,13 @@ if(ENABLE_UNIT_TEST)
 
   # include source files (same as what svMultiPhysics does except for main.cpp)
   add_executable(run_all_unit_tests ${CSRCS})
-  
+
+  # Define test data directory for integration tests
+  # CMAKE_SOURCE_DIR points to Code/, so go up one level to reach the repo root
+  target_compile_definitions(run_all_unit_tests PRIVATE
+    TEST_DATA_DIR="${CMAKE_SOURCE_DIR}/../tests/cases"
+  )
+
   if(USE_TRILINOS)
     target_link_libraries(run_all_unit_tests ${Trilinos_LIBRARIES} ${Trilinos_TPL_LIBRARIES})
   endif()
diff --git a/Code/Source/solver/ComMod.h b/Code/Source/solver/ComMod.h
index 86db290f4..dd57f5e39 100644
--- a/Code/Source/solver/ComMod.h
+++ b/Code/Source/solver/ComMod.h
@@ -1571,9 +1571,14 @@ class ComMod {
     /// @brief Whether there is a requirement to update mesh and Dn-Do variables
     bool dFlag = false;
 
-    /// @brief Whether mesh is moving
+    /// @brief Whether mesh is moving (monolithic FSI: mesh velocity in DOFs nsd+1..2*nsd)
     bool mvMsh = false;
 
+    /// @brief ALE mesh velocity for partitioned FSI (nsd, tnNo).
+    /// When non-empty, the fluid assembly subtracts this from the convective
+    /// velocity, providing the ALE correction without expanding tDof.
+    Array<double> ale_mesh_velocity;
+
     /// @brief Whether to averaged results
     bool saveAve = false;
 
diff --git a/Code/Source/solver/Integrator.cpp b/Code/Source/solver/Integrator.cpp
index 90dddb1d1..3d5b94d0a 100644
--- a/Code/Source/solver/Integrator.cpp
+++ b/Code/Source/solver/Integrator.cpp
@@ -17,6 +17,7 @@
 #include "utils.h"
 
 #include <algorithm>
+#include <cmath>
 #include <iostream>
 #include <set>
 
@@ -171,6 +172,94 @@ bool Integrator::step() {
   } // End of Newton iteration loop
 }
 
+//------------------------
+// step_equation
+//------------------------
+/// @brief Solve a single equation to convergence without cycling to other equations
+bool Integrator::step_equation(int iEq, std::function<void()> post_assembly) {
+  using namespace consts;
+
+  auto& com_mod = simulation_->com_mod;
+  auto& cm_mod = simulation_->cm_mod;
+
+  int& cTS = com_mod.cTS;
+
+  // Set up for this equation
+  com_mod.cEq = iEq;
+  auto& eq = com_mod.eq[iEq];
+  eq.itr = 0;
+  eq.ok = false;
+  eq.iNorm = 0.0;
+
+  newton_count_ = 1;
+
+  while (true) {
+    istr_ = "_" + std::to_string(cTS) + "_" + std::to_string(newton_count_);
+    auto& eq = com_mod.eq[iEq];
+
+    // Coupled BC handling (same as step())
+    if (com_mod.cplBC.coupled && iEq == 0) {
+      set_bc::set_bc_cpl(com_mod, cm_mod, solutions_);
+      set_bc::set_bc_dir(com_mod, solutions_);
+    }
+
+    // Initiator step for Generalized α-Method
+    initiator_step();
+
+    if (com_mod.Rd.size() != 0) {
+      com_mod.Rd = 0.0;
+      com_mod.Kd = 0.0;
+    }
+
+    // Allocate, assemble, apply BCs
+    allocate_linear_system(eq);
+    set_body_forces();
+    assemble_equations();
+    apply_boundary_conditions();
+
+    // Optional post-assembly callback (e.g., for injecting interface traction)
+    if (post_assembly) {
+      post_assembly();
+    }
+
+    // Synchronize R across processes
+    if (!eq.assmTLS) {
+      all_fun::commu(com_mod, com_mod.R);
+    }
+
+    // Update residual in displacement equation for USTRUCT phys
+    if (com_mod.sstEq) {
+      ustruct::ustruct_r(com_mod, solutions_);
+    }
+
+    // Set the residual of the continuity equation to 0 on edge nodes
+    if (std::set<EquationType>{Equation_stokes, Equation_fluid, Equation_ustruct, Equation_FSI}.count(eq.phys) != 0) {
+      fs::thood_val_rc(com_mod);
+    }
+
+    set_bc::set_bc_undef_neu(com_mod);
+    update_residual_arrays(eq);
+
+    // Solve linear system
+    solve_linear_system();
+
+    // Update solution and check convergence (no equation cycling)
+    update_solution();
+
+    if (eq.ok) {
+      return true;
+    }
+
+    // Abort on NaN in residual norm (indicates divergence).
+    if (newton_count_ > 1 && std::isnan(eq.FSILS.RI.iNorm)) {
+      return false;
+    }
+
+    output::output_result(simulation_, com_mod.timeP, 2, iEq);
+    newton_count_ += 1;
+  }
+}
+
 //------------------------
 // initiator_step
 //------------------------
@@ -374,8 +463,8 @@ void Integrator::update_residual_arrays(eqType& eq) {
 //  1.  Bazilevs, et al. "Isogeometric fluid-structure interaction:
 //      theory, algorithms, and computations.", Computational Mechanics,
 //      43 (2008): 3-37. doi: 10.1007/s00466-008-0315-x
-//  2. Bazilevs, et al. "Variational multiscale residual-based 
-//      turbulence modeling for large eddy simulation of incompressible 
+//  2. Bazilevs, et al. "Variational multiscale residual-based
+//      turbulence modeling for large eddy simulation of incompressible
 //      flows.", CMAME (2007)
 //------------------------
 // predictor (picp)
@@ -621,11 +710,13 @@ void Integrator::initiator(SolutionStates& solutions)
   }
 }
 //------------------------
-// corrector
+// update_solution
 //------------------------
-/// @brief Corrector with convergence check
+/// @brief Update solution from linear solve result and check convergence
 ///
-/// Decision for next eqn is also made here (modifies cEq global).
+/// Performs the corrector update of An, Yn, Dn from the Newton solve result
+/// and checks convergence norms. Sets eq.ok if converged.
+/// Does NOT handle equation cycling (that is done by corrector()).
 ///
 /// Modifies:
 /// \code {.cpp}
@@ -637,13 +728,12 @@ void Integrator::initiator(SolutionStates& solutions)
 ///   com_mod.pS0
 ///   com_mod.pSa
 ///   com_mod.pSn
-///
-///   com_mod.cEq
 ///   eq.FSILS.RI.iNorm
 ///   eq.pNorm
+///   eq.ok
 /// \endcode
 //
-void Integrator::corrector()
+void Integrator::update_solution()
 {
   using namespace consts;
 
@@ -853,10 +943,42 @@ void Integrator::corrector()
     dmsg << "com_mod.eq[1].ok: " << com_mod.eq[1].ok;
     #endif
   }
+}
+
+//------------------------
+// corrector
+//------------------------
+/// @brief Corrector with convergence check and equation cycling
+///
+/// Calls update_solution() to update the solution and check convergence,
+/// then handles equation switching for coupled problems (modifies cEq global).
+//
+void Integrator::corrector()
+{
+  using namespace consts;
 
+  auto& com_mod = simulation_->com_mod;
+  const int tnNo = com_mod.tnNo;
+  auto& cEq = com_mod.cEq;
+  auto& eq = com_mod.eq[cEq];
+
+  auto& An = solutions_.current.get_acceleration();
+  auto& Dn = solutions_.current.get_displacement();
+  auto& Yn = solutions_.current.get_velocity();
+
+  #define n_debug_corrector_cycling
+  #ifdef debug_corrector_cycling
+  DebugMsg dmsg(__func__, com_mod.cm.idcm());
+  dmsg.banner();
+  #endif
+
+  // Update solution and check convergence
+  update_solution();
+
+  // Check if all equations converged - if so, skip equation cycling
   auto& eqs = com_mod.eq;
   if (std::count_if(eqs.begin(),eqs.end(),[](eqType& eq){return eq.ok;}) == eqs.size()) {
-    #ifdef debug_corrector
+    #ifdef debug_corrector_cycling
     dmsg << "all ok";
     #endif
     return;
@@ -868,7 +990,7 @@ void Integrator::corrector()
       // For coupled equations, if explicit geometric coupling is not used,
       // increment the equation counter after each linear solve
       cEq = cEq + 1;
-      #ifdef debug_corrector
+      #ifdef debug_corrector_cycling
       dmsg << "eq " << " coupled ";
       dmsg << "1st update cEq: " << cEq;
       #endif
@@ -929,7 +1051,7 @@ void Integrator::corrector()
       cEq = cEq + 1;
     }
   }
- #ifdef debug_corrector
+ #ifdef debug_corrector_cycling
  dmsg << "eq " << " coupled ";
  dmsg << "2nd update cEq: " << cEq;
  #endif
diff --git a/Code/Source/solver/Integrator.h b/Code/Source/solver/Integrator.h
index 790e17eff..23570889b 100644
--- a/Code/Source/solver/Integrator.h
+++ b/Code/Source/solver/Integrator.h
@@ -9,6 +9,38 @@
 #include "Vector.h"
 #include "Simulation.h"
 
+#include <functional>
+
+/// @brief Newmark time integration utilities.
+///
+/// Compute consistent state variables from a prescribed displacement or
+/// velocity using the Newmark-beta / generalized-alpha relationships:
+///   Dn = Do + dt*Yo + dt^2*((0.5-beta)*Ao + beta*An)
+///   Yn = Yo + dt*((1-gamma)*Ao + gamma*An)
+namespace newmark {
+
+/// Compute acceleration and velocity from prescribed displacement.
+inline void state_from_displacement(
+    double d_new, double d_old, double v_old, double a_old,
+    double dt, double beta, double gam,
+    double& a_new, double& v_new)
+{
+  a_new = (d_new - d_old - dt * v_old) / (beta * dt * dt)
+        - (0.5 - beta) / beta * a_old;
+  v_new = v_old + dt * ((1.0 - gam) * a_old + gam * a_new);
+}
+
+/// Compute acceleration from prescribed velocity.
+inline void state_from_velocity(
+    double v_new, double v_old, double a_old,
+    double dt, double gam,
+    double& a_new)
+{
+  a_new = (v_new - v_old) / (gam * dt) - (1.0 - gam) / gam * a_old;
+}
+
+} // namespace newmark
+
 /**
  * @brief Integrator class encapsulates the Newton iteration loop for time integration
  *
@@ -42,6 +74,21 @@ class Integrator {
    */
   bool step();
 
+  /**
+   * @brief Solve a single equation to convergence (for partitioned coupling)
+   *
+   * Runs Newton iterations for only the specified equation, without cycling
+   * to other equations. Used by partitioned FSI to solve fluid, solid, and
+   * mesh equations independently.
+   *
+   * @param iEq Index of the equation to solve
+   * @param post_assembly Optional callback invoked after boundary condition
+   *        application but before the linear solve. Used by partitioned FSI
+   *        to inject interface traction into the residual.
+   * @return True if the equation converged, false if max iterations reached
+   */
+  bool step_equation(int iEq, std::function<void()> post_assembly = nullptr);
+
   /**
    * @brief Perform predictor step for next time step
    *
@@ -173,11 +220,20 @@ class Integrator {
    */
   void initiator(SolutionStates& solutions);
 
+  /**
+   * @brief Update solution and check convergence of current equation
+   *
+   * Performs the corrector step: updates An, Yn, Dn from the linear solve
+   * result and checks convergence norms. Sets eq.ok if converged.
+   * Does NOT handle equation cycling -- that is done separately in corrector().
+   */
+  void update_solution();
+
   /**
    * @brief Corrector function with convergence check (corrector)
    *
-   * Updates solution at n+1 time level and checks convergence of Newton
-   * iterations. Also handles equation switching for coupled problems.
+   * Calls update_solution(), then handles equation switching for coupled
+   * problems (modifies cEq global).
    */
   void corrector();
 
diff --git a/Code/Source/solver/Parameters.cpp b/Code/Source/solver/Parameters.cpp
index e20dcf2f7..273ceee52 100644
--- a/Code/Source/solver/Parameters.cpp
+++ b/Code/Source/solver/Parameters.cpp
@@ -222,14 +222,29 @@ void Parameters::print_parameters()
 
 /// @brief Set the simulation parameter values given in an XML format file.
 void Parameters::read_xml(std::string file_name)
-{ 
+{
   tinyxml2::XMLDocument doc;
-  
   auto error = doc.LoadFile(file_name.c_str());
-  
+  set_values_from_doc(doc, error, "XML file '" + file_name + "'");
+}
+
+// Parse parameters from an in-memory XML string (no file on disk). Used by the
+// partitioned FSI driver to initialize its sub-simulations directly from the
+// single solver.xml without writing temporary per-role XML files.
+void Parameters::read_xml_from_string(const std::string& xml_content)
+{
+  tinyxml2::XMLDocument doc;
+  auto error = doc.Parse(xml_content.c_str());
+  set_values_from_doc(doc, error, "in-memory sub-simulation XML");
+}
+
+void Parameters::set_values_from_doc(tinyxml2::XMLDocument& doc,
+                                     tinyxml2::XMLError error,
+                                     const std::string& source_desc)
+{
   auto root_element = doc.FirstChildElement(FSI_FILE.c_str());
   if (error != tinyxml2::XML_SUCCESS || root_element == nullptr) {
-    svmp::raise<svmp::ParseException>(SVMP_HERE, "The following error occurred while reading the XML file '" + file_name + "'.\n" +
+    svmp::raise<svmp::ParseException>(SVMP_HERE, "The following error occurred while reading the " + source_desc + ".\n" +
         "[svMultiPhysics] ERROR " + std::string(doc.ErrorStr()));
   }
 
@@ -248,6 +263,9 @@ void Parameters::read_xml(std::string file_name)
   // Set mesh projection parameters.
   set_projection_values(root_element);
 
+  // Set partitioned coupling parameters.
+  set_partitioned_coupling_values(root_element);
+
   // Set Add_equation values.
   set_equation_values(root_element);
 
@@ -278,6 +296,8 @@ void Parameters::set_equation_values(tinyxml2::XMLElement* root_element)
 
     auto eq_params = new EquationParameters();
     eq_params->type.set(std::string(eq_type));
+    const char* eq_role = add_eq_item->Attribute("role");
+    if (eq_role) eq_params->role.set(std::string(eq_role));
     eq_params->set_values(add_eq_item);
     equation_parameters.push_back(eq_params);
 
@@ -2926,6 +2946,50 @@ void ProjectionParameters::set_values(tinyxml2::XMLElement* xml_elem)
   xml_util_set_parameters(ftpr, xml_elem, error_msg);
 }
 
+//////////////////////////////////////////////////////////
+//    PartitionedCouplingParameters                      //
+//////////////////////////////////////////////////////////
+
+const std::string PartitionedCouplingParameters::xml_element_name_ = "Partitioned_coupling";
+
+PartitionedCouplingParameters::PartitionedCouplingParameters()
+{
+  bool required = true;
+
+  set_parameter("Max_coupling_iterations", 50, !required, max_coupling_iterations);
+  set_parameter("Coupling_tolerance", 1e-6, !required, coupling_tolerance);
+  set_parameter("Initial_relaxation", 1.0, !required, initial_relaxation);
+  set_parameter("Omega_max", 1.0, !required, omega_max);
+  set_parameter("Coupling_method", "aitken", !required, coupling_method);
+  set_parameter("Fluid_interface_face", "", required, fluid_interface_face);
+  set_parameter("Solid_interface_face", "", required, solid_interface_face);
+
+}
+
+void PartitionedCouplingParameters::set_values(tinyxml2::XMLElement* xml_elem)
+{
+  using namespace tinyxml2;
+  std::string error_msg = "Unknown " + xml_element_name_ + " XML element '";
+
+  using std::placeholders::_1;
+  using std::placeholders::_2;
+
+  std::function<void(const std::string&, const std::string&)> ftpr =
+      std::bind(&PartitionedCouplingParameters::set_parameter_value, *this, _1, _2);
+
+  xml_util_set_parameters(ftpr, xml_elem, error_msg);
+  value_set = true;
+}
+
+void Parameters::set_partitioned_coupling_values(tinyxml2::XMLElement* root_element)
+{
+  auto item = root_element->FirstChildElement(PartitionedCouplingParameters::xml_element_name_.c_str());
+  if (item == nullptr) {
+    return;
+  }
+  partitioned_coupling_parameters.set_values(item);
+}
+
 //////////////////////////////////////////////////////////
 //        RISProjectionParameters       //
 //////////////////////////////////////////////////////////
diff --git a/Code/Source/solver/Parameters.h b/Code/Source/solver/Parameters.h
index de383e80f..cd14f16a5 100644
--- a/Code/Source/solver/Parameters.h
+++ b/Code/Source/solver/Parameters.h
@@ -1508,6 +1508,7 @@ class EquationParameters : public ParameterLists
     Parameter<double> tolerance;
 
     Parameter<std::string> type;
+    Parameter<std::string> role;  // "partitioned_fluid", "partitioned_solid", or "partitioned_mesh"
     Parameter<bool> use_taylor_hood_type_basis;
 
     // Explicit geometric coupling for FSI simulations: the fluid-structure equations
@@ -1791,6 +1792,34 @@ class URISMeshParameters : public ParameterLists
 
 
 
+//////////////////////////////////////////////////////////
+//          PartitionedCouplingParameters               //
+//////////////////////////////////////////////////////////
+
+/// @brief Parameters for the 'Partitioned_coupling' XML element.
+///
+/// Configures partitioned FSI coupling between separately solved
+/// fluid and solid equations with Aitken relaxation.
+class PartitionedCouplingParameters : public ParameterLists
+{
+  public:
+    PartitionedCouplingParameters();
+    static const std::string xml_element_name_;
+    void set_values(tinyxml2::XMLElement* xml_elem);
+    bool defined() const { return value_set; }
+
+    Parameter<int> max_coupling_iterations;
+    Parameter<double> coupling_tolerance;
+    Parameter<double> initial_relaxation;
+    Parameter<double> omega_max;
+    Parameter<std::string> coupling_method;  // "constant" or "aitken"
+    Parameter<std::string> fluid_interface_face;
+    Parameter<std::string> solid_interface_face;
+
+    bool value_set = false;
+};
+
+
 /// @brief The Parameters class stores parameter values read in from a solver input file.
 class Parameters {
 
@@ -1804,6 +1833,9 @@ class Parameters {
     void get_logging_levels(int& verbose, int& warning, int& debug);
     void print_parameters();
     void read_xml(std::string file_name);
+    void read_xml_from_string(const std::string& xml_content);
+    void set_values_from_doc(tinyxml2::XMLDocument& doc, tinyxml2::XMLError error,
+                             const std::string& source_desc);
 
     void set_contact_values(tinyxml2::XMLElement* root_element);
     void set_equation_values(tinyxml2::XMLElement* root_element);
@@ -1814,6 +1846,7 @@ class Parameters {
 
     void set_RIS_projection_values(tinyxml2::XMLElement* root_element);
     void set_URIS_mesh_values(tinyxml2::XMLElement* root_element);
+    void set_partitioned_coupling_values(tinyxml2::XMLElement* root_element);
 
     // Objects representing each parameter section of XML file.
     ContactParameters contact_parameters;
@@ -1826,6 +1859,8 @@ class Parameters {
     std::vector<RISProjectionParameters*> RIS_projection_parameters;
     std::vector<URISMeshParameters*> URIS_mesh_parameters;
 
+    PartitionedCouplingParameters partitioned_coupling_parameters;
+
 };
 
 #endif
diff --git a/Code/Source/solver/PartitionedFSI.cpp b/Code/Source/solver/PartitionedFSI.cpp
new file mode 100644
index 000000000..f3425356d
--- /dev/null
+++ b/Code/Source/solver/PartitionedFSI.cpp
@@ -0,0 +1,1082 @@
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "PartitionedFSI.h"
+#include "Integrator.h"
+#include "fsi_coupling.h"
+#include "post.h"
+#include "set_bc.h"
+#include "distribute.h"
+#include "initialize.h"
+#include "output.h"
+#include "vtk_xml.h"
+#include "read_files.h"
+
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <set>
+#include <stdexcept>
+#include <unordered_map>
+
+/// Check if any value in the solution arrays is NaN
+static bool has_nan(const SolutionStates& sol) {
+  const Array<double>* arrays[] = {
+    &sol.current.get_velocity(),
+    &sol.current.get_acceleration(),
+    &sol.current.get_displacement()
+  };
+  for (auto* arr : arrays)
+    for (int a = 0; a < arr->ncols(); a++)
+      for (int i = 0; i < arr->nrows(); i++)
+        if (std::isnan((*arr)(i, a))) return true;
+  return false;
+}
+
+
+//----------------------------------------------------------------------
+// Helper: initialize one sub-simulation through the standard pipeline.
+// xml_content is the in-memory sub-XML produced by build_sub_xml (no file
+// is written to disk).
+//----------------------------------------------------------------------
+static void init_sub_sim(Simulation* sim, const std::string& xml_content)
+{
+  read_files_ns::read_files(sim, xml_content, /*from_string=*/true);
+  sim->logger.set_cout_write(false);
+
+  // The mesh sub-sim includes a <Partitioned_coupling> stub so that read_files
+  // accepts the mesh equation, but this sim has only the mesh equation (tDof=3).
+  // baf_ini assumes FSI DOF layout (Do(i+nsd+1,Ac) reaches row 4) when mvMsh=true,
+  // which would go out-of-bounds for the standalone mesh sub-sim.
+  if (sim->com_mod.nEq == 1 &&
+      sim->com_mod.eq[0].phys == consts::EquationType::phys_mesh) {
+    sim->com_mod.mvMsh = false;
+  }
+
+  distribute(sim);
+  Vector<double> init_time(3);
+  initialize(sim, init_time);
+  for (int iEq = 0; iEq < sim->com_mod.nEq; iEq++) {
+    add_eq_linear_algebra(sim->com_mod, sim->com_mod.eq[iEq]);
+  }
+}
+
+//----------------------------------------------------------------------
+// build_sub_xml — extract one role's equation + its meshes from the
+// main XML and return a minimal standalone sub-simulation XML as an
+// in-memory string (consumed by read_files with from_string=true).
+//
+// Mesh association: the meshes included in the sub-XML are those whose
+// <Domain> value matches any <Domain id="..."> child of the target
+// equation.  The partitioned_mesh role falls back to the fluid meshes
+// when the mesh equation carries no explicit domain block.
+//----------------------------------------------------------------------
+std::string PartitionedFSI::build_sub_xml(const std::string& main_xml_path,
+                                          const std::string& role)
+{
+  using namespace tinyxml2;
+
+  XMLDocument doc;
+  if (doc.LoadFile(main_xml_path.c_str()) != XML_SUCCESS)
+    throw std::runtime_error("[PartitionedFSI] Cannot parse main XML: " + main_xml_path);
+
+  XMLElement* root = doc.FirstChildElement("svMultiPhysicsFile");
+  if (!root)
+    throw std::runtime_error("[PartitionedFSI] Missing <svMultiPhysicsFile> root in " + main_xml_path);
+
+  // Find the equation element with the requested role attribute.
+  XMLElement* target_eq = nullptr;
+  for (XMLElement* eq = root->FirstChildElement("Add_equation"); eq;
+       eq = eq->NextSiblingElement("Add_equation")) {
+    const char* r = eq->Attribute("role");
+    if (r && std::string(r) == role) { target_eq = eq; break; }
+  }
+  if (!target_eq)
+    throw std::runtime_error("[PartitionedFSI] No <Add_equation role=\"" + role + "\"> found in " + main_xml_path);
+
+  // Collect domain IDs used by the target equation.
+  std::set<int> domain_ids;
+  for (XMLElement* d = target_eq->FirstChildElement("Domain"); d;
+       d = d->NextSiblingElement("Domain"))
+    domain_ids.insert(d->IntAttribute("id", -1));
+
+  // Mesh role with no domain block: use the same domains as the fluid equation.
+  if (domain_ids.empty() && role == "partitioned_mesh") {
+    for (XMLElement* eq = root->FirstChildElement("Add_equation"); eq;
+         eq = eq->NextSiblingElement("Add_equation")) {
+      const char* r = eq->Attribute("role");
+      if (r && std::string(r) == "partitioned_fluid") {
+        for (XMLElement* d = eq->FirstChildElement("Domain"); d;
+             d = d->NextSiblingElement("Domain"))
+          domain_ids.insert(d->IntAttribute("id", -1));
+        break;
+      }
+    }
+  }
+
+  // Build sub-document.
+  XMLDocument sub;
+  XMLElement* sub_root = sub.NewElement("svMultiPhysicsFile");
+  sub_root->SetAttribute("version", "0.1");
+  sub.InsertFirstChild(sub_root);
+
+  // Copy GeneralSimulationParameters, overriding the VTK save prefix
+  // so each sub-sim writes to distinct files (result_fluid_*, result_solid_*, result_mesh_*).
+  XMLElement* gen = root->FirstChildElement("GeneralSimulationParameters");
+  if (gen) {
+    XMLElement* gen_clone = gen->DeepClone(&sub)->ToElement();
+    // Map role → short suffix used in the prefix name
+    std::string suffix;
+    if      (role == "partitioned_fluid") suffix = "fluid";
+    else if (role == "partitioned_solid") suffix = "solid";
+    else if (role == "partitioned_mesh")  suffix = "mesh";
+    if (!suffix.empty()) {
+      XMLElement* name_elem = gen_clone->FirstChildElement("Name_prefix_of_saved_VTK_files");
+      if (name_elem) {
+        std::string base_prefix = name_elem->GetText() ? name_elem->GetText() : "result";
+        // trim whitespace
+        base_prefix.erase(0, base_prefix.find_first_not_of(" \t\n\r"));
+        base_prefix.erase(base_prefix.find_last_not_of(" \t\n\r") + 1);
+        name_elem->SetText((" " + base_prefix + "_" + suffix + " ").c_str());
+      }
+    }
+    sub_root->InsertEndChild(gen_clone);
+  }
+
+  // Copy matching Add_mesh elements.
+  for (XMLElement* mesh = root->FirstChildElement("Add_mesh"); mesh;
+       mesh = mesh->NextSiblingElement("Add_mesh")) {
+    XMLElement* dom_elem = mesh->FirstChildElement("Domain");
+    if (!dom_elem) continue;
+    int mesh_dom = dom_elem->IntText(-1);
+    if (domain_ids.empty() || domain_ids.count(mesh_dom))
+      sub_root->InsertEndChild(mesh->DeepClone(&sub));
+  }
+
+  // Clone the equation, stripping the role attribute.
+  XMLElement* eq_clone = target_eq->DeepClone(&sub)->ToElement();
+  eq_clone->DeleteAttribute("role");
+  sub_root->InsertEndChild(eq_clone);
+
+  // The mesh sub-sim needs a minimal <Partitioned_coupling> block so that
+  // read_files sets mvMsh=true (required for the mesh equation to be valid).
+  if (role == "partitioned_mesh") {
+    XMLElement* pcp = sub.NewElement("Partitioned_coupling");
+    XMLElement* fface = sub.NewElement("Fluid_interface_face");
+    fface->SetText("dummy");
+    XMLElement* sface = sub.NewElement("Solid_interface_face");
+    sface->SetText("dummy");
+    pcp->InsertEndChild(fface);
+    pcp->InsertEndChild(sface);
+    sub_root->InsertEndChild(pcp);
+  }
+
+  // Serialize the sub-document to an in-memory string instead of writing a
+  // temp file. The string is handed directly to read_files(..., from_string).
+  tinyxml2::XMLPrinter printer;
+  sub.Print(&printer);
+  return std::string(printer.CStr());
+}
+
+//----------------------------------------------------------------------
+// Constructor
+//----------------------------------------------------------------------
+PartitionedFSI::PartitionedFSI(Simulation* main_simulation,
+                               const PartitionedFSIConfig& config,
+                               const std::string& xml_file_path)
+  : main_sim_(main_simulation), config_(config),
+    xml_file_path_(xml_file_path), omega_(config.initial_relaxation)
+{
+  auto& cm = main_sim_->com_mod.cm;
+  auto& cm_mod = main_sim_->cm_mod;
+
+  // Build each role's sub-XML in memory from the single main solver.xml and
+  // initialize a standalone sub-simulation from it. No temp files are written:
+  // every rank parses the same main XML (read-only) and passes the resulting
+  // sub-XML string straight to read_files, so no rank-0 write / barrier is needed.
+  fluid_sim_ = std::make_unique<Simulation>();
+  init_sub_sim(fluid_sim_.get(), build_sub_xml(xml_file_path_, "partitioned_fluid"));
+
+  solid_sim_ = std::make_unique<Simulation>();
+  init_sub_sim(solid_sim_.get(), build_sub_xml(xml_file_path_, "partitioned_solid"));
+
+  mesh_sim_ = std::make_unique<Simulation>();
+  init_sub_sim(mesh_sim_.get(), build_sub_xml(xml_file_path_, "partitioned_mesh"));
+
+  if (cm.mas(cm_mod)) {
+    // Open log files
+    std::string log_dir = fluid_sim_->get_chnl_mod().appPath;
+    coupling_log_.open(log_dir + "coupling.dat");
+    char hdr[256];
+    snprintf(hdr, sizeof(hdr), "# %4s %3s %10s %5s %10s %10s %10s %10s",
+             "cTS", "cp", "time", "dB", "Ri/R1", "Ri/R0", "omega", "|disp|");
+    coupling_log_ << hdr << std::endl;
+
+    histor_log_.open(log_dir + "histor.dat");
+  }
+
+  resolve_faces();
+  build_node_maps();
+}
+
+PartitionedFSI::~PartitionedFSI() = default;
+
+//----------------------------------------------------------------------
+// resolve_faces
+//----------------------------------------------------------------------
+void PartitionedFSI::resolve_faces()
+{
+  auto find_face = [](Simulation* sim, const std::string& face_name,
+                      const faceType*& face_out, const mshType*& mesh_out) {
+    for (int iM = 0; iM < sim->com_mod.nMsh; iM++) {
+      auto& msh = sim->com_mod.msh[iM];
+      for (int iFa = 0; iFa < msh.nFa; iFa++) {
+        if (msh.fa[iFa].name == face_name) {
+          face_out = &msh.fa[iFa];
+          mesh_out = &msh;
+          return;
+        }
+      }
+    }
+    throw std::runtime_error("[PartitionedFSI] Face '" + face_name + "' not found.");
+  };
+
+  find_face(fluid_sim_.get(), config_.fluid_interface_face, fluid_face_, fluid_mesh_);
+  find_face(solid_sim_.get(), config_.solid_interface_face, solid_face_, solid_mesh_);
+  find_face(mesh_sim_.get(),  config_.fluid_interface_face, mesh_face_,  mesh_mesh_);
+}
+
+//----------------------------------------------------------------------
+// compute_face_global_info — gather per-rank nNo to compute global nNo
+// and this rank's offset within the global face node ordering.
+//----------------------------------------------------------------------
+void PartitionedFSI::compute_face_global_info(
+    const faceType& face, cmType& cm, const CmMod& cm_mod,
+    int& global_nNo, int& local_offset)
+{
+  int np = cm.np();
+  int my_rank = cm.id();
+  int local_nNo = face.nNo;
+
+  std::vector<int> all_nNo(np);
+  MPI_Allgather(&local_nNo, 1, MPI_INT,
+                all_nNo.data(), 1, MPI_INT, cm.com());
+
+  global_nNo = 0;
+  local_offset = 0;
+  for (int p = 0; p < np; p++) {
+    if (p < my_rank) local_offset += all_nNo[p];
+    global_nNo += all_nNo[p];
+  }
+}
+
+//----------------------------------------------------------------------
+// gather_face_data — MPI_Allgatherv local face data to all ranks.
+// local_data is (nrows, local_nNo); returns (nrows, global_nNo).
+// Rank ordering in global array matches the local_offset convention.
+//----------------------------------------------------------------------
+Array<double> PartitionedFSI::gather_face_data(
+    const Array<double>& local_data,
+    int global_nNo, int /*local_offset*/,
+    cmType& cm, const CmMod& cm_mod)
+{
+  const int nrows = local_data.nrows();
+  const int local_nNo = local_data.ncols();
+  const int np = cm.np();
+
+  // Pack as flat row-major: [node0_row0, node0_row1, ..., node1_row0, ...]
+  std::vector<double> local_flat(nrows * local_nNo);
+  for (int a = 0; a < local_nNo; a++)
+    for (int i = 0; i < nrows; i++)
+      local_flat[a * nrows + i] = local_data(i, a);
+
+  int send_count = nrows * local_nNo;
+  std::vector<int> recv_counts(np), displs(np);
+  MPI_Allgather(&send_count, 1, MPI_INT,
+                recv_counts.data(), 1, MPI_INT, cm.com());
+  int total = 0;
+  for (int p = 0; p < np; p++) { displs[p] = total; total += recv_counts[p]; }
+
+  std::vector<double> global_flat(total);
+  MPI_Allgatherv(local_flat.data(), send_count, MPI_DOUBLE,
+                 global_flat.data(), recv_counts.data(), displs.data(),
+                 MPI_DOUBLE, cm.com());
+
+  Array<double> result(nrows, global_nNo);
+  int node_offset = 0;
+  for (int p = 0; p < np; p++) {
+    int p_nNo = recv_counts[p] / nrows;
+    for (int la = 0; la < p_nNo; la++)
+      for (int i = 0; i < nrows; i++)
+        result(i, node_offset + la) = global_flat[displs[p] + la * nrows + i];
+    node_offset += p_nNo;
+  }
+  return result;
+}
+
+//----------------------------------------------------------------------
+// gather_global_map — all-gather a local (local_src → global_tgt) map
+// into a global (global_src → global_tgt) map.
+//----------------------------------------------------------------------
+void PartitionedFSI::gather_global_map(
+    const std::vector<int>& local_map,
+    int global_src_nNo,
+    cmType& cm, const CmMod& cm_mod,
+    std::vector<int>& global_map)
+{
+  int np = cm.np();
+  int send_count = static_cast<int>(local_map.size());
+
+  std::vector<int> recv_counts(np), displs(np);
+  MPI_Allgather(&send_count, 1, MPI_INT,
+                recv_counts.data(), 1, MPI_INT, cm.com());
+  int total = 0;
+  for (int p = 0; p < np; p++) { displs[p] = total; total += recv_counts[p]; }
+
+  std::vector<int> global_flat(total);
+  MPI_Allgatherv(local_map.data(), send_count, MPI_INT,
+                 global_flat.data(), recv_counts.data(), displs.data(),
+                 MPI_INT, cm.com());
+
+  global_map.assign(global_src_nNo, -1);
+  int offset = 0;
+  for (int p = 0; p < np; p++) {
+    for (int la = 0; la < recv_counts[p]; la++)
+      global_map[offset + la] = global_flat[displs[p] + la];
+    offset += recv_counts[p];
+  }
+}
+
+//----------------------------------------------------------------------
+// build_face_node_map — match each LOCAL face_a node to its nearest
+// node in the PRE-GATHERED global face_b coordinates.
+// Returns local_src_idx → global_tgt_idx map.
+//----------------------------------------------------------------------
+void PartitionedFSI::build_face_node_map(
+    const faceType& face_a, const ComMod& com_a,
+    int global_b_nNo, const Array<double>& global_b_coords,
+    std::vector<int>& a_to_global_b)
+{
+  const int nsd = com_a.nsd;
+  const double tol = 1e-8;
+  a_to_global_b.assign(face_a.nNo, -1);
+
+  for (int a = 0; a < face_a.nNo; a++) {
+    int Ac = face_a.gN(a);
+    double best = 1e30;
+    int best_b = -1;
+    for (int bg = 0; bg < global_b_nNo; bg++) {
+      double d2 = 0.0;
+      for (int i = 0; i < nsd; i++) {
+        double d = com_a.x(i, Ac) - global_b_coords(i, bg);
+        d2 += d * d;
+      }
+      if (d2 < best) { best = d2; best_b = bg; }
+    }
+    if (best < tol * tol) a_to_global_b[a] = best_b;
+  }
+}
+
+//----------------------------------------------------------------------
+// build_node_maps — build global→global interface node maps.
+// Each sub-mesh is independently distributed, so we gather all face
+// coordinates from all ranks before performing the nearest-neighbor
+// search; then gather the local partial maps into global maps.
+//----------------------------------------------------------------------
+void PartitionedFSI::build_node_maps()
+{
+  auto& cm = main_sim_->com_mod.cm;
+  auto& cm_mod = main_sim_->cm_mod;
+  const int nsd = main_sim_->com_mod.nsd;
+
+  // Compute global nNo and per-rank offsets for each interface face
+  compute_face_global_info(*solid_face_, cm, cm_mod,
+                           solid_face_global_nNo_, solid_face_local_offset_);
+  compute_face_global_info(*fluid_face_, cm, cm_mod,
+                           fluid_face_global_nNo_, fluid_face_local_offset_);
+  compute_face_global_info(*mesh_face_,  cm, cm_mod,
+                           mesh_face_global_nNo_,  mesh_face_local_offset_);
+
+  // Gather face coordinates globally for each sub-mesh face
+  auto pack_coords = [&](const faceType& face, const ComMod& com) {
+    Array<double> local(nsd, face.nNo);
+    for (int a = 0; a < face.nNo; a++) {
+      int Ac = face.gN(a);
+      for (int i = 0; i < nsd; i++)
+        local(i, a) = com.x(i, Ac);
+    }
+    return local;
+  };
+
+  auto global_solid_coords = gather_face_data(
+      pack_coords(*solid_face_, solid_sim_->com_mod),
+      solid_face_global_nNo_, solid_face_local_offset_, cm, cm_mod);
+  auto global_fluid_coords = gather_face_data(
+      pack_coords(*fluid_face_, fluid_sim_->com_mod),
+      fluid_face_global_nNo_, fluid_face_local_offset_, cm, cm_mod);
+  auto global_mesh_coords  = gather_face_data(
+      pack_coords(*mesh_face_,  mesh_sim_->com_mod),
+      mesh_face_global_nNo_,  mesh_face_local_offset_,  cm, cm_mod);
+
+  // Build local (local_src → global_tgt) maps, then gather to global maps
+  std::vector<int> local_s2f, local_f2s, local_s2m;
+  build_face_node_map(*solid_face_, solid_sim_->com_mod,
+                      fluid_face_global_nNo_, global_fluid_coords, local_s2f);
+  build_face_node_map(*fluid_face_, fluid_sim_->com_mod,
+                      solid_face_global_nNo_, global_solid_coords, local_f2s);
+  build_face_node_map(*solid_face_, solid_sim_->com_mod,
+                      mesh_face_global_nNo_,  global_mesh_coords,  local_s2m);
+
+  gather_global_map(local_s2f, solid_face_global_nNo_, cm, cm_mod, solid_to_fluid_map_);
+  gather_global_map(local_f2s, fluid_face_global_nNo_, cm, cm_mod, fluid_to_solid_map_);
+  gather_global_map(local_s2m, solid_face_global_nNo_, cm, cm_mod, solid_to_mesh_map_);
+
+  solid_face_canonical_ = build_face_dedup_map(*solid_face_, solid_sim_->com_mod,
+                                               solid_face_global_nNo_, cm, cm_mod);
+  fluid_face_canonical_ = build_face_dedup_map(*fluid_face_, fluid_sim_->com_mod,
+                                               fluid_face_global_nNo_, cm, cm_mod);
+  mesh_face_canonical_  = build_face_dedup_map(*mesh_face_,  mesh_sim_->com_mod,
+                                               mesh_face_global_nNo_,  cm, cm_mod);
+
+  // Owner flags for the solid interface face: the interface traction is a
+  // precomputed nodal force that must be injected exactly once per physical
+  // node, since all_fun::commu sums it across ranks after the callback.
+  solid_face_owner_ = build_face_owner_flags(*solid_face_, solid_sim_->com_mod, cm);
+
+  if (cm.mas(cm_mod)) {
+    auto count_dups = [](const std::vector<int>& c) {
+      int n = 0; for (int i = 0; i < (int)c.size(); i++) if (c[i] != i) n++; return n;
+    };
+    std::cout << "[PartitionedFSI] face dedup: solid "
+              << count_dups(solid_face_canonical_) << "/" << solid_face_global_nNo_
+              << " dups, fluid " << count_dups(fluid_face_canonical_) << "/" << fluid_face_global_nNo_
+              << " dups, mesh " << count_dups(mesh_face_canonical_) << "/" << mesh_face_global_nNo_
+              << " dups" << std::endl;
+  }
+}
+
+//----------------------------------------------------------------------
+// build_face_dedup_map — canonical[i] = first global slot index that
+// shares the same physical node (global node ID) as slot i.
+// Slots with no duplicate have canonical[i] == i.
+//----------------------------------------------------------------------
+std::vector<int> PartitionedFSI::build_face_dedup_map(
+    const faceType& face, const ComMod& com,
+    int global_nNo, cmType& cm, const CmMod& cm_mod)
+{
+  std::vector<int> local_ids(face.nNo);
+  for (int a = 0; a < face.nNo; a++)
+    local_ids[a] = com.ltg[face.gN(a)];
+
+  int np = cm.np();
+  std::vector<int> counts(np), displs(np);
+  MPI_Allgather(&face.nNo, 1, MPI_INT, counts.data(), 1, MPI_INT, cm.com());
+  int total = 0;
+  for (int p = 0; p < np; p++) { displs[p] = total; total += counts[p]; }
+
+  std::vector<int> all_ids(total);
+  MPI_Allgatherv(local_ids.data(), face.nNo, MPI_INT,
+                 all_ids.data(), counts.data(), displs.data(), MPI_INT, cm.com());
+
+  std::unordered_map<int,int> first;
+  std::vector<int> canonical(total);
+  for (int i = 0; i < total; i++) {
+    auto [it, inserted] = first.emplace(all_ids[i], i);
+    canonical[i] = it->second;
+  }
+  return canonical;
+}
+
+//----------------------------------------------------------------------
+// apply_dedup — copy canonical slot value to every duplicate slot.
+//----------------------------------------------------------------------
+void PartitionedFSI::apply_dedup(Array<double>& arr, const std::vector<int>& canonical)
+{
+  for (int i = 0; i < (int)canonical.size(); i++)
+    if (canonical[i] != i)
+      for (int row = 0; row < arr.nrows(); row++)
+        arr(row, i) = arr(row, canonical[i]);
+}
+
+//----------------------------------------------------------------------
+// build_face_owner_flags — flags[a] = 1 if THIS rank is the lowest-ID rank
+// that contains local face node a, else 0.  A node shared across ranks
+// (a partition-ring node) is owned by exactly one rank under this rule.
+//----------------------------------------------------------------------
+std::vector<int> PartitionedFSI::build_face_owner_flags(
+    const faceType& face, const ComMod& com, cmType& cm)
+{
+  const int np = cm.np();
+  const int myrank = cm.id();
+
+  std::vector<int> local_ids(face.nNo);
+  for (int a = 0; a < face.nNo; a++)
+    local_ids[a] = com.ltg[face.gN(a)];
+
+  std::vector<int> counts(np), displs(np);
+  MPI_Allgather(&face.nNo, 1, MPI_INT, counts.data(), 1, MPI_INT, cm.com());
+  int total = 0;
+  for (int p = 0; p < np; p++) { displs[p] = total; total += counts[p]; }
+
+  std::vector<int> all_ids(total);
+  MPI_Allgatherv(local_ids.data(), face.nNo, MPI_INT,
+                 all_ids.data(), counts.data(), displs.data(), MPI_INT, cm.com());
+
+  // owner[globalID] = lowest rank containing it (ranks scanned in ascending order)
+  std::unordered_map<int,int> owner;
+  for (int p = 0; p < np; p++)
+    for (int k = 0; k < counts[p]; k++)
+      owner.emplace(all_ids[displs[p] + k], p);
+
+  std::vector<int> flags(face.nNo);
+  for (int a = 0; a < face.nNo; a++)
+    flags[a] = (owner[local_ids[a]] == myrank) ? 1 : 0;
+  return flags;
+}
+
+//----------------------------------------------------------------------
+// transfer_data
+//----------------------------------------------------------------------
+Array<double> PartitionedFSI::transfer_data(
+    const std::vector<int>& src_to_tgt_map,
+    const Array<double>& src_data, int tgt_nNo)
+{
+  int nrows = src_data.nrows();
+  Array<double> result(nrows, tgt_nNo);
+  for (int a = 0; a < static_cast<int>(src_to_tgt_map.size()); a++) {
+    int b = src_to_tgt_map[a];
+    if (b >= 0) {
+      for (int i = 0; i < nrows; i++) result(i, b) = src_data(i, a);
+    }
+  }
+  return result;
+}
+
+//----------------------------------------------------------------------
+// relax_interface — updates disp_prev_ and vel_prev_
+//----------------------------------------------------------------------
+void PartitionedFSI::relax_interface(int cp, int nsd,
+                                     const Array<double>& disp_current)
+{
+  switch (config_.coupling_method) {
+    case CouplingMethod::constant:
+      relax_constant(cp, nsd, disp_current);
+      break;
+    case CouplingMethod::aitken:
+      relax_aitken(cp, nsd, disp_current);
+      break;
+  }
+}
+
+//----------------------------------------------------------------------
+// relax_constant — fixed relaxation (operates on global face arrays)
+//----------------------------------------------------------------------
+void PartitionedFSI::relax_constant(int cp, int nsd,
+                                     const Array<double>& disp_current)
+{
+  omega_ = config_.initial_relaxation;
+  for (int a = 0; a < solid_face_global_nNo_; a++)
+    for (int i = 0; i < nsd; i++)
+      disp_prev_(i, a) += omega_ * (disp_current(i, a) - disp_prev_(i, a));
+}
+
+//----------------------------------------------------------------------
+// relax_aitken — Aitken Delta^2 (Küttler & Wall 2008, Eq. 44)
+// Operates on global face arrays; all ranks have identical data so no
+// MPI reduction is needed here.
+//----------------------------------------------------------------------
+void PartitionedFSI::relax_aitken(int cp, int nsd,
+                                   const Array<double>& disp_current)
+{
+  const int u = nsd * solid_face_global_nNo_;
+
+  // Build residual r = x_tilde - x
+  std::vector<double> r(u);
+  for (int a = 0; a < solid_face_global_nNo_; a++)
+    for (int i = 0; i < nsd; i++)
+      r[a * nsd + i] = disp_current(i, a) - disp_prev_(i, a);
+
+  // Aitken update: omega = -omega * r^T (r_new - r_old) / |r_new - r_old|^2
+  // Negative omega allowed (corrects overshoot)
+  if (cp > 0 && !r_prev_.empty()) {
+    double num = 0, den = 0;
+    for (int j = 0; j < u; j++) {
+      // Count each physical node once so omega is independent of partition
+      // count (duplicate ring slots hold identical residual entries).
+      if (solid_face_canonical_[j / nsd] != j / nsd) continue;
+      double dr = r[j] - r_prev_[j];
+      num += r_prev_[j] * dr;
+      den += dr * dr;
+    }
+    if (den > 1e-30) {
+      omega_ = -omega_ * num / den;
+      if (std::abs(omega_) > config_.omega_max)
+        omega_ = (omega_ > 0) ? config_.omega_max : -config_.omega_max;
+    }
+  }
+  r_prev_ = r;
+
+  // Apply: x_{k+1} = x_k + omega * r
+  for (int a = 0; a < solid_face_global_nNo_; a++)
+    for (int i = 0; i < nsd; i++)
+      disp_prev_(i, a) += omega_ * (disp_current(i, a) - disp_prev_(i, a));
+}
+
+//======================================================================
+// run — full time-stepping loop with Dirichlet-Neumann coupling
+//======================================================================
+void PartitionedFSI::run()
+{
+  auto& main_com = main_sim_->com_mod;
+  auto& cm_mod = main_sim_->cm_mod;
+  auto& cm = main_com.cm;
+
+  int nTS = main_com.nTS;
+  int& cTS = main_com.cTS;
+  double& dt = main_com.dt;
+  double& time = main_com.time;
+  int nITs = main_com.nITs;
+
+  if (cTS <= nITs) dt = dt / 10.0;
+
+  Simulation* sims[3] = {fluid_sim_.get(), solid_sim_.get(), mesh_sim_.get()};
+
+  while (true) {
+    if (cTS == nITs) dt = 10.0 * dt;
+    cTS = cTS + 1;
+    time = time + dt;
+
+    // Sync time to sub-sims
+    for (auto* sim : sims) {
+      sim->com_mod.cTS = cTS;
+      sim->com_mod.time = time;
+      sim->com_mod.dt = dt;
+      for (auto& eq : sim->com_mod.eq) { eq.itr = 0; eq.ok = false; }
+    }
+
+    if (cm.mas(cm_mod)) {
+      if (histor_log_.is_open()) {
+        histor_log_ << std::string(70, '=') << std::endl;
+        histor_log_ << "  TIME STEP " << cTS << "  t=" << time << "  dt=" << dt << std::endl;
+        histor_log_ << std::string(70, '=') << std::endl;
+      }
+    }
+
+    // Predictor + Dirichlet BCs for each sub-sim
+    for (auto* sim : sims) {
+      sim->get_integrator().predictor();
+      set_bc::set_bc_dir(sim->com_mod, sim->get_integrator().get_solutions());
+    }
+
+    // Coupling loop
+    bool converged = step();
+
+    if (!converged && cm.mas(cm_mod)) {
+      std::cout << "  TIME STEP " << cTS << " FAILED (NaN or no convergence)" << std::endl;
+      if (histor_log_.is_open())
+        histor_log_ << "  TIME STEP " << cTS << " FAILED (NaN or no convergence)" << std::endl;
+    }
+
+    // Stop on failure
+    if (!converged) break;
+
+    // Save results
+    save_results();
+
+    // Copy current -> old
+    for (auto* sim : sims) {
+      auto& sol = sim->get_integrator().get_solutions();
+      sol.old.get_acceleration() = sol.current.get_acceleration();
+      sol.old.get_velocity()     = sol.current.get_velocity();
+      if (sim->com_mod.dFlag)
+        sol.old.get_displacement() = sol.current.get_displacement();
+    }
+
+    // Stop condition
+    int stopTS = nTS;
+    if (cm.mas(cm_mod)) {
+      if (FILE* fp = fopen(main_com.stopTrigName.c_str(), "r")) {
+        int count = fscanf(fp, "%d", &stopTS);
+        if (count == 0) stopTS = cTS;
+        fclose(fp);
+      }
+    }
+    cm.bcast(cm_mod, &stopTS);
+    if (cTS >= stopTS) break;
+  }
+}
+
+//----------------------------------------------------------------------
+// compute_interface_velocity — Newmark-consistent velocity from disp_prev_.
+// Each rank computes its local solid face nodes, then all-gather to
+// produce the global vel_prev_ replicated on all ranks.
+//----------------------------------------------------------------------
+void PartitionedFSI::compute_interface_velocity()
+{
+  auto& solid_com = solid_sim_->com_mod;
+  auto& solid_sol = solid_sim_->get_integrator().get_solutions();
+  const auto& eq = solid_com.eq[0];
+  const int s = eq.s;
+  const int nsd = main_sim_->com_mod.nsd;
+  const double dt = solid_com.dt;
+  const auto& Do = solid_sol.old.get_displacement();
+  const auto& Yo = solid_sol.old.get_velocity();
+  const auto& Ao = solid_sol.old.get_acceleration();
+  auto& cm = main_sim_->com_mod.cm;
+  auto& cm_mod = main_sim_->cm_mod;
+
+  // Compute velocity for this rank's local solid face nodes
+  Array<double> local_vel(nsd, solid_face_->nNo);
+  for (int a = 0; a < solid_face_->nNo; a++) {
+    int Ac = solid_face_->gN(a);
+    for (int i = 0; i < nsd; i++) {
+      double disp_a = disp_prev_(i, solid_face_local_offset_ + a);
+      double a_new, v_new;
+      newmark::state_from_displacement(
+          disp_a, Do(i + s, Ac), Yo(i + s, Ac), Ao(i + s, Ac),
+          dt, eq.beta, eq.gam, a_new, v_new);
+      local_vel(i, a) = v_new;
+    }
+  }
+
+  // All-gather to global.  Duplicate ring slots are filled identically by
+  // their owning ranks (velocity is computed from the replicated disp_prev_),
+  // so no dedup is needed.
+  vel_prev_ = gather_face_data(local_vel, solid_face_global_nNo_,
+                               solid_face_local_offset_, cm, cm_mod);
+}
+
+//----------------------------------------------------------------------
+// solve_fluid — fluid equation with interface velocity and ALE.
+// vel_prev_ is global; extract this rank's local fluid face portion.
+//----------------------------------------------------------------------
+bool PartitionedFSI::solve_fluid(
+    const Array<double>& mesh_vel_Yo, const Array<double>& mesh_vel_Yn)
+{
+  auto& fluid_com = fluid_sim_->com_mod;
+  auto& fluid_int = fluid_sim_->get_integrator();
+  auto& fluid_sol = fluid_int.get_solutions();
+  const int nsd = main_sim_->com_mod.nsd;
+
+  // Transfer global solid velocity → global fluid velocity, then extract local.
+  // transfer_data only writes the canonical target slot of each node; dedup
+  // fills the duplicate ring slots so every rank reads a correct Dirichlet
+  // value below (the velocity is applied on all face nodes, not just owners).
+  auto global_fluid_vel = transfer_data(solid_to_fluid_map_, vel_prev_,
+                                        fluid_face_global_nNo_);
+  apply_dedup(global_fluid_vel, fluid_face_canonical_);
+  Array<double> local_fluid_vel(nsd, fluid_face_->nNo);
+  for (int a = 0; a < fluid_face_->nNo; a++)
+    for (int i = 0; i < nsd; i++)
+      local_fluid_vel(i, a) = global_fluid_vel(i, fluid_face_local_offset_ + a);
+
+  set_bc::set_bc_dir(fluid_com, fluid_sol);
+  fsi_coupling::apply_velocity_on_fluid(
+      fluid_com, fluid_com.eq[0], *fluid_face_, local_fluid_vel, fluid_sol);
+
+  // ALE mesh velocity at generalized-alpha intermediate time
+  double af = fluid_com.eq[0].af;
+  fluid_com.ale_mesh_velocity.resize(nsd, fluid_com.tnNo);
+  for (int a = 0; a < fluid_com.tnNo; a++)
+    for (int i = 0; i < nsd; i++)
+      fluid_com.ale_mesh_velocity(i, a) = (1.0 - af) * mesh_vel_Yo(i, a)
+                                         + af * mesh_vel_Yn(i, a);
+
+  fluid_int.step_equation(0, [&]() {
+    set_bc::enforce_dirichlet_dofs_on_face(fluid_com, *fluid_face_, 0, nsd);
+  });
+  return !has_nan(fluid_sol);
+}
+
+//----------------------------------------------------------------------
+// solve_solid — extract traction from fluid, solve solid.
+// All-gathers local fluid traction to global, transfers to global solid,
+// then extracts this rank's local solid portion.
+//----------------------------------------------------------------------
+bool PartitionedFSI::solve_solid()
+{
+  auto& fluid_com = fluid_sim_->com_mod;
+  auto& solid_com = solid_sim_->com_mod;
+  auto& fluid_int = fluid_sim_->get_integrator();
+  auto& solid_int = solid_sim_->get_integrator();
+  auto& solid_sol = solid_int.get_solutions();
+  auto& cm = main_sim_->com_mod.cm;
+  auto& cm_mod = main_sim_->cm_mod;
+
+  // Compute local fluid traction, all-gather to global fluid face
+  auto local_fluid_traction = post::compute_face_traction(
+      fluid_com, fluid_sim_->cm_mod,
+      *fluid_mesh_, *fluid_face_, fluid_com.eq[0],
+      fluid_int.get_solutions());
+  // gather_face_data fills duplicate ring slots identically (compute_face_traction
+  // already summed across ranks via commu), so no dedup is needed here.
+  auto global_fluid_traction = gather_face_data(local_fluid_traction,
+                                                fluid_face_global_nNo_,
+                                                fluid_face_local_offset_,
+                                                cm, cm_mod);
+
+  // Transfer global fluid → global solid, then extract local solid portion.
+  // Only the owning (min-rank) slot of each node is read below, and that slot
+  // is the canonical one that transfer_data writes, so no dedup is needed.
+  auto global_solid_traction = transfer_data(fluid_to_solid_map_,
+                                             global_fluid_traction,
+                                             solid_face_global_nNo_);
+  const int nrows = global_solid_traction.nrows();
+  Array<double> local_solid_traction(nrows, solid_face_->nNo);
+  for (int a = 0; a < solid_face_->nNo; a++) {
+    // Apply the nodal force on the owning rank only; partition-ring nodes are
+    // shared by several ranks and all_fun::commu (called after the traction
+    // callback in step_equation) SUMS R across them.  Subtracting the full
+    // force on every sharing rank would multiply it by the node's multiplicity.
+    double w = solid_face_owner_[a] ? 1.0 : 0.0;
+    for (int i = 0; i < nrows; i++)
+      local_solid_traction(i, a) =
+          w * global_solid_traction(i, solid_face_local_offset_ + a);
+  }
+
+  set_bc::set_bc_dir(solid_com, solid_sol);
+  solid_int.step_equation(0, [&]() {
+    fsi_coupling::apply_traction_on_solid(
+        solid_com, solid_com.eq[0], *solid_face_, local_solid_traction);
+  });
+  return !has_nan(solid_sol);
+}
+
+//----------------------------------------------------------------------
+// solve_mesh — mesh equation with relaxed displacement, deform fluid mesh.
+// disp_prev_ is global; extract this rank's local mesh face portion.
+//----------------------------------------------------------------------
+bool PartitionedFSI::solve_mesh(const Array<double>& x_ref, int mesh_s)
+{
+  auto& fluid_com = fluid_sim_->com_mod;
+  auto& mesh_com  = mesh_sim_->com_mod;
+  auto& mesh_int  = mesh_sim_->get_integrator();
+  auto& mesh_sol  = mesh_int.get_solutions();
+  const int nsd = main_sim_->com_mod.nsd;
+
+  // Transfer global solid displacement → global mesh, extract local portion.
+  // transfer_data only writes the canonical target slot of each node; dedup
+  // fills the duplicate ring slots so every rank reads a correct Dirichlet
+  // value below (the displacement is applied on all face nodes, not just owners).
+  auto global_mesh_disp = transfer_data(solid_to_mesh_map_, disp_prev_,
+                                        mesh_face_global_nNo_);
+  apply_dedup(global_mesh_disp, mesh_face_canonical_);
+  Array<double> local_mesh_disp(nsd, mesh_face_->nNo);
+  for (int a = 0; a < mesh_face_->nNo; a++)
+    for (int i = 0; i < nsd; i++)
+      local_mesh_disp(i, a) = global_mesh_disp(i, mesh_face_local_offset_ + a);
+
+  set_bc::set_bc_dir(mesh_com, mesh_sol);
+  fsi_coupling::apply_displacement_on_mesh(
+      mesh_com, mesh_com.eq[0], *mesh_face_, local_mesh_disp, mesh_sol);
+  mesh_int.step_equation(0, [&]() {
+    set_bc::enforce_dirichlet_on_face(mesh_com, *mesh_face_, nsd);
+  });
+  if (has_nan(mesh_sol)) return false;
+
+  // Deform fluid mesh: apply only the INCREMENT (Dn - Do) to x_ref
+  // so that fluid_com.x = x_original + Dn
+  auto& mesh_Dn = mesh_sol.current.get_displacement();
+  auto& mesh_Do = mesh_sol.old.get_displacement();
+  for (int a = 0; a < fluid_com.tnNo; a++)
+    for (int i = 0; i < nsd; i++)
+      fluid_com.x(i, a) = x_ref(i, a)
+                         + mesh_Dn(i + mesh_s, a) - mesh_Do(i + mesh_s, a);
+  return true;
+}
+
+//======================================================================
+// step — one coupling iteration loop for one time step
+//======================================================================
+bool PartitionedFSI::step()
+{
+  auto& fluid_com = fluid_sim_->com_mod;
+  auto& solid_com = solid_sim_->com_mod;
+  auto& mesh_com  = mesh_sim_->com_mod;
+  auto& cm_mod = main_sim_->cm_mod;
+  auto& cm = main_sim_->com_mod.cm;
+  const int nsd = main_sim_->com_mod.nsd;
+  const int cTS = main_sim_->com_mod.cTS;
+
+  auto& fluid_sol = fluid_sim_->get_integrator().get_solutions();
+  auto& solid_sol = solid_sim_->get_integrator().get_solutions();
+  auto& mesh_sol  = mesh_sim_->get_integrator().get_solutions();
+
+  omega_ = config_.initial_relaxation;
+  r_prev_.clear();
+
+  // Save predictor state
+  struct SavedState { Array<double> An, Yn, Dn; };
+  auto save_state = [](SolutionStates& s) -> SavedState {
+    return {s.current.get_acceleration(), s.current.get_velocity(), s.current.get_displacement()};
+  };
+  auto restore_state = [](SolutionStates& s, const SavedState& st) {
+    s.current.get_acceleration() = st.An;
+    s.current.get_velocity() = st.Yn;
+    s.current.get_displacement() = st.Dn;
+  };
+  SavedState fluid_pred = save_state(fluid_sol);
+  SavedState solid_pred = save_state(solid_sol);
+  SavedState mesh_pred  = save_state(mesh_sol);
+
+  // Save mesh coordinates at start of time step = x_original + Do
+  Array<double> x_ref(fluid_com.x);
+
+  // ALE mesh velocity from predictor (updated after each mesh solve)
+  const int mesh_s = mesh_com.eq[0].s;
+  Array<double> mesh_vel_Yn(nsd, mesh_com.tnNo);
+  Array<double> mesh_vel_Yo(nsd, mesh_com.tnNo);
+  {
+    auto& mYn = mesh_sol.current.get_velocity();
+    auto& mYo = mesh_sol.old.get_velocity();
+    for (int a = 0; a < mesh_com.tnNo; a++)
+      for (int i = 0; i < nsd; i++) {
+        mesh_vel_Yn(i, a) = mYn(mesh_s + i, a);
+        mesh_vel_Yo(i, a) = mYo(mesh_s + i, a);
+      }
+  }
+
+  // Initial interface state from predictor — extract local, all-gather to global
+  Array<double> disp_current;
+  {
+    auto local_disp = fsi_coupling::extract_solid_displacement(
+        solid_com, solid_com.eq[0], *solid_face_, solid_sol);
+    disp_prev_ = gather_face_data(local_disp, solid_face_global_nNo_,
+                                  solid_face_local_offset_, cm, cm_mod);
+  }
+  compute_interface_velocity();
+
+  bool converged = false;
+
+  for (int cp = 0; cp < config_.max_coupling_iterations; cp++) {
+
+    // Restore all sub-sims to predictor state
+    restore_state(fluid_sol, fluid_pred);
+    restore_state(solid_sol, solid_pred);
+    restore_state(mesh_sol, mesh_pred);
+
+    // ---- 1. Mesh solve + deform fluid mesh ----
+    // Use latest disp_prev_ (relaxed from previous iter, or predictor on iter 0).
+    // Writes fluid_com.x = x_ref + (Dn - Do) so the fluid solves on the deformed mesh.
+    if (!solve_mesh(x_ref, mesh_s)) {
+      if (cm.mas(cm_mod)) std::cout << "  ABORT: NaN in mesh solve" << std::endl;
+      return false;
+    }
+
+    // Update ALE mesh velocity from this iteration's mesh solve
+    {
+      auto& mYn = mesh_sol.current.get_velocity();
+      for (int a = 0; a < mesh_com.tnNo; a++)
+        for (int i = 0; i < nsd; i++)
+          mesh_vel_Yn(i, a) = mYn(mesh_s + i, a);
+    }
+
+    // ---- 2. Fluid solve ----
+    if (!solve_fluid(mesh_vel_Yo, mesh_vel_Yn)) {
+      if (cm.mas(cm_mod)) std::cout << "  ABORT: NaN in fluid solve" << std::endl;
+      return false;
+    }
+
+    // ---- 3. Solid solve ----
+    if (!solve_solid()) {
+      if (cm.mas(cm_mod)) std::cout << "  ABORT: NaN in solid solve" << std::endl;
+      return false;
+    }
+
+    // ---- 4. Extract displacement (global), check convergence ----
+    // Extract local solid displacement and all-gather to global so all ranks
+    // have identical arrays — no MPI reduction needed for norms.
+    {
+      auto local_disp = fsi_coupling::extract_solid_displacement(
+          solid_com, solid_com.eq[0], *solid_face_, solid_sol);
+      disp_current = gather_face_data(local_disp, solid_face_global_nNo_,
+                                      solid_face_local_offset_, cm, cm_mod);
+    }
+
+    double res_norm = 0.0, disp_norm = 0.0;
+    for (int a = 0; a < solid_face_global_nNo_; a++) {
+      // Count each physical node once: duplicate ring slots (canonical != a)
+      // hold identical values and would otherwise be weighted by multiplicity,
+      // making the convergence metric depend on the partition count.
+      if (solid_face_canonical_[a] != a) continue;
+      for (int i = 0; i < nsd; i++) {
+        double res = disp_current(i, a) - disp_prev_(i, a);
+        res_norm  += res * res;
+        disp_norm += disp_current(i, a) * disp_current(i, a);
+      }
+    }
+    res_norm  = sqrt(res_norm);
+    disp_norm = sqrt(disp_norm);
+    double rel = (disp_norm > 1e-30) ? res_norm / disp_norm : res_norm;
+
+    // ---- 5. Relaxation ----
+    relax_interface(cp, nsd, disp_current);
+    compute_interface_velocity();
+
+    // Check for NaN/divergence (global arrays — consistent on all ranks)
+    {
+      bool bad = false;
+      double max_disp = 0;
+      for (int a = 0; a < solid_face_global_nNo_ && !bad; a++)
+        for (int i = 0; i < nsd; i++) {
+          if (std::isnan(disp_prev_(i, a)) || std::isinf(disp_prev_(i, a)))
+            { bad = true; break; }
+          max_disp = std::max(max_disp, std::abs(disp_prev_(i, a)));
+        }
+      if (bad || max_disp > 1e10) {
+        if (cm.mas(cm_mod)) std::cout << "  ABORT: NaN/divergence after relaxation" << std::endl;
+        return false;
+      }
+    }
+
+    // ---- 6. Output ----
+    if (cp == 0) first_res_norm_ = res_norm;
+    int dB_val = 0;
+    double ri_r1 = 1.0;
+    if (first_res_norm_ > 1e-30 && res_norm > 0) {
+      ri_r1 = res_norm / first_res_norm_;
+      dB_val = static_cast<int>(20.0 * log10(ri_r1));
+    }
+
+    if (cm.mas(cm_mod)) {
+      bool conv = rel < config_.coupling_tolerance;
+      bool saved = conv
+                && (cTS % fluid_sim_->com_mod.saveIncr == 0)
+                && (cTS >= fluid_sim_->com_mod.saveATS);
+      char buf[256];
+      snprintf(buf, sizeof(buf), " CP %d-%d%s %10.3e %5d %10.3e %10.3e %10.3e %10.3e",
+               cTS, cp + 1, saved ? "s" : " ",
+               main_sim_->com_mod.timer.get_elapsed_time(),
+               dB_val, ri_r1, rel, omega_, disp_norm);
+      std::cout << buf << std::endl;
+      if (coupling_log_.is_open()) coupling_log_ << buf << std::endl;
+      if (histor_log_.is_open()) histor_log_ << buf << std::endl;
+    }
+
+    if (rel < config_.coupling_tolerance) { converged = true; break; }
+  }
+  return converged;
+}
+
+//----------------------------------------------------------------------
+// save_results
+//----------------------------------------------------------------------
+void PartitionedFSI::save_results()
+{
+  int cTS = main_sim_->com_mod.cTS;
+  Simulation* sims[3] = {fluid_sim_.get(), solid_sim_.get(), mesh_sim_.get()};
+
+  for (auto* sim : sims) {
+    auto& com = sim->com_mod;
+    auto& sol = sim->get_integrator().get_solutions();
+    if (com.saveVTK) {
+      bool l2 = ((cTS % com.saveIncr) == 0);
+      bool l3 = (cTS >= com.saveATS);
+      if (l2 && l3) {
+        output::output_result(sim, com.timeP, 3, 0);
+        vtk_xml::write_vtus(sim, sol, false);
+      }
+    }
+  }
+}
+
diff --git a/Code/Source/solver/PartitionedFSI.h b/Code/Source/solver/PartitionedFSI.h
new file mode 100644
index 000000000..24cdff83e
--- /dev/null
+++ b/Code/Source/solver/PartitionedFSI.h
@@ -0,0 +1,177 @@
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef PARTITIONED_FSI_H
+#define PARTITIONED_FSI_H
+
+#include "Simulation.h"
+#include "Integrator.h"
+#include "Array.h"
+#include "CmMod.h"
+
+#include <fstream>
+#include <memory>
+#include <vector>
+#include <string>
+
+/// @brief Coupling method for interface relaxation
+enum class CouplingMethod { constant, aitken };
+
+/// @brief Configuration for partitioned FSI coupling, read from XML input.
+struct PartitionedFSIConfig {
+  int max_coupling_iterations = 50;
+  double coupling_tolerance = 1e-6;
+  double initial_relaxation = 1.0;
+  double omega_max = 1.0;
+  CouplingMethod coupling_method = CouplingMethod::aitken;
+
+  // Face names for the FSI interface
+  std::string fluid_interface_face;
+  std::string solid_interface_face;
+};
+
+/// @brief Partitioned FSI coupling with 3 independent sub-Simulations.
+///
+/// Each sub-field (fluid, struct, mesh) has its own Simulation object with
+/// independent mesh, solution arrays, and linear system. No shared global
+/// arrays, no DOF offsets (each eq.s=0), no regularization of inactive nodes.
+///
+/// Implements Dirichlet-Neumann coupling with Aitken relaxation:
+///   1. Transfer solid displacement to mesh interface, solve mesh equation
+///   2. Deform fluid mesh using mesh displacement, solve fluid equation
+///   3. Extract fluid traction, apply to solid, solve solid equation
+///   4. Extract solid displacement, apply Aitken relaxation
+///   5. Check coupling convergence
+///
+/// Related to GitHub issue #431: Implement partitioned FSI in svMultiPhysics
+class PartitionedFSI {
+public:
+  PartitionedFSI(Simulation* main_simulation, const PartitionedFSIConfig& config,
+                 const std::string& xml_file_path);
+
+  ~PartitionedFSI();
+
+  void run();
+  bool step();
+
+private:
+  Simulation* main_sim_;
+  PartitionedFSIConfig config_;
+  std::string xml_file_path_;
+
+  // Sub-simulations (owned)
+  std::unique_ptr<Simulation> fluid_sim_;
+  std::unique_ptr<Simulation> solid_sim_;
+  std::unique_ptr<Simulation> mesh_sim_;
+
+  // Interface face pointers within each sub-sim
+  const faceType* fluid_face_ = nullptr;
+  const faceType* solid_face_ = nullptr;
+  const faceType* mesh_face_ = nullptr;
+
+  // Mesh pointers within each sub-sim
+  const mshType* fluid_mesh_ = nullptr;
+  const mshType* solid_mesh_ = nullptr;
+  const mshType* mesh_mesh_ = nullptr;
+
+  // Global face node counts and per-rank offsets (for MPI distribution)
+  int solid_face_global_nNo_ = 0;
+  int solid_face_local_offset_ = 0;
+  int fluid_face_global_nNo_ = 0;
+  int fluid_face_local_offset_ = 0;
+  int mesh_face_global_nNo_ = 0;
+  int mesh_face_local_offset_ = 0;
+
+  // Node maps between interface faces: global_src_idx → global_tgt_idx
+  std::vector<int> solid_to_fluid_map_;
+  std::vector<int> fluid_to_solid_map_;
+  std::vector<int> solid_to_mesh_map_;
+
+  // Canonical slot maps: canonical_[i] = first global slot with same physical node as i
+  std::vector<int> solid_face_canonical_;
+  std::vector<int> fluid_face_canonical_;
+  std::vector<int> mesh_face_canonical_;
+
+  // Per-local-node owner flag for the solid interface face: 1 if THIS rank is
+  // the min-rank owner of the node, else 0.  Used to apply the interface
+  // traction exactly once per physical node (it is summed by all_fun::commu).
+  std::vector<int> solid_face_owner_;
+
+  // Coupling state — indexed by GLOBAL solid face nodes (replicated on all ranks)
+  Array<double> disp_prev_;
+  Array<double> vel_prev_;
+  double omega_;
+  double first_res_norm_ = 0.0;
+
+  // Aitken state — sized for global solid face
+  std::vector<double> r_prev_;
+
+  // Output files for coupling convergence history
+  std::ofstream coupling_log_;
+  std::ofstream histor_log_;
+
+  void resolve_faces();
+  void build_node_maps();
+
+  /// Solve fluid equation with current interface velocity and ALE mesh velocity
+  bool solve_fluid(const Array<double>& mesh_vel_Yo, const Array<double>& mesh_vel_Yn);
+
+  /// Extract fluid traction, transfer to solid, solve solid equation
+  bool solve_solid();
+
+  /// Solve mesh equation with relaxed displacement, deform fluid mesh
+  bool solve_mesh(const Array<double>& x_ref, int mesh_s);
+
+  /// Compute vel_prev_ (global) from disp_prev_ (global) using Newmark relationship
+  void compute_interface_velocity();
+
+  void relax_interface(int cp, int nsd, const Array<double>& disp_current);
+  void relax_constant(int cp, int nsd, const Array<double>& disp_current);
+  void relax_aitken(int cp, int nsd, const Array<double>& disp_current);
+
+  /// Compute global_nNo and local_offset for a distributed face
+  static void compute_face_global_info(const faceType& face, cmType& cm,
+                                       const CmMod& cm_mod,
+                                       int& global_nNo, int& local_offset);
+
+  /// All-gather local face data (nrows, local_nNo) to global (nrows, global_nNo)
+  static Array<double> gather_face_data(const Array<double>& local_data,
+                                        int global_nNo, int local_offset,
+                                        cmType& cm, const CmMod& cm_mod);
+
+  /// All-gather local (local_src → global_tgt) map to global (global_src → global_tgt) map
+  static void gather_global_map(const std::vector<int>& local_map,
+                                int global_src_nNo,
+                                cmType& cm, const CmMod& cm_mod,
+                                std::vector<int>& global_map);
+
+  /// Build local face_a → global face_b node map using pre-gathered global face_b coords
+  static void build_face_node_map(const faceType& face_a, const ComMod& com_a,
+                                  int global_b_nNo, const Array<double>& global_b_coords,
+                                  std::vector<int>& a_to_global_b);
+
+  /// Build a minimal sub-simulation XML for the given role by extracting
+  /// the tagged equation and its meshes from the main XML. Returns temp file path.
+  static std::string build_sub_xml(const std::string& main_xml_path,
+                                   const std::string& role);
+
+  /// Transfer data from global src face to global tgt face using global map
+  static Array<double> transfer_data(const std::vector<int>& src_to_tgt_map,
+                                     const Array<double>& src_data, int tgt_nNo);
+
+  /// Build canonical[i] = index of first occurrence of global node ID i across ranks
+  std::vector<int> build_face_dedup_map(const faceType& face, const ComMod& com,
+                                        int global_nNo, cmType& cm, const CmMod& cm_mod);
+
+  /// Propagate canonical slot values to duplicate slots (canonical[i] != i)
+  static void apply_dedup(Array<double>& arr, const std::vector<int>& canonical);
+
+  /// Build per-local-node flags marking whether THIS rank is the min-rank
+  /// owner of each face node (1 = owner, 0 = shared but owned by a lower rank).
+  static std::vector<int> build_face_owner_flags(const faceType& face,
+                                                 const ComMod& com, cmType& cm);
+
+  void save_results();
+};
+
+#endif // PARTITIONED_FSI_H
diff --git a/Code/Source/solver/Simulation.cpp b/Code/Source/solver/Simulation.cpp
index 7e6d56523..c1ce70250 100644
--- a/Code/Source/solver/Simulation.cpp
+++ b/Code/Source/solver/Simulation.cpp
@@ -3,6 +3,7 @@
 
 #include "Simulation.h"
 #include "Integrator.h"
+#include "PartitionedFSI.h"
 
 #include "all_fun.h"
 #include "load_msh.h"
@@ -12,6 +13,17 @@
 #include <iostream>
 #include <stdexcept>
 
+void add_eq_linear_algebra(ComMod& com_mod, eqType& lEq)
+{
+  lEq.linear_algebra = LinearAlgebraFactory::create_interface(lEq.linear_algebra_type);
+  lEq.linear_algebra->set_preconditioner(lEq.linear_algebra_preconditioner);
+  lEq.linear_algebra->initialize(com_mod, lEq);
+
+  if (lEq.linear_algebra_assembly_type != consts::LinearAlgebraType::none) {
+    lEq.linear_algebra->set_assembly(lEq.linear_algebra_assembly_type);
+  }
+}
+
 Simulation::Simulation() 
 {
   roInf = 0.2;
@@ -40,6 +52,13 @@ void Simulation::read_parameters(const std::string& file_name)
   parameters.read_xml(file_name);
 }
 
+/// @brief Read solver parameters from an in-memory XML string (no file).
+//
+void Simulation::read_parameters_from_string(const std::string& xml_content)
+{
+  parameters.read_xml_from_string(xml_content);
+}
+
 /// @brief Set the simulation and module member data.
 ///
 /// Replicates the README subroutine lines to set COMMOD module varliables
@@ -112,3 +131,81 @@ Integrator& Simulation::get_integrator()
   }
   return *integrator_;
 }
+
+/// @brief Get pointer to PartitionedFSI object (null if not configured)
+PartitionedFSI* Simulation::get_partitioned_fsi()
+{
+  return partitioned_fsi_.get();
+}
+
+/// @brief Initialize partitioned FSI if configured in parameters.
+///
+/// Parameters are only parsed on rank 0 (slaves skip read_files), so we
+/// broadcast the active flag and config to all ranks before branching.
+void Simulation::initialize_partitioned_fsi(const std::string& xml_file_path)
+{
+  auto& cm = com_mod.cm;
+  auto& cm_mod_ref = cm_mod;
+
+  // Rank 0 determines whether partitioned FSI is active and builds the config.
+  // Broadcast the decision so all ranks take the same path.
+  int active = 0;
+  PartitionedFSIConfig config;
+
+  if (cm.mas(cm_mod_ref)) {
+    auto& pcp = parameters.partitioned_coupling_parameters;
+    // Active when <Partitioned_coupling> is present and at least one equation
+    // carries a partitioned role attribute.
+    if (pcp.defined()) {
+      for (auto* ep : parameters.equation_parameters) {
+        if (ep->role.defined() && !ep->role.value().empty()) { active = 1; break; }
+      }
+    }
+    if (active) {
+      config.max_coupling_iterations = pcp.max_coupling_iterations.value();
+      config.coupling_tolerance       = pcp.coupling_tolerance.value();
+      config.initial_relaxation       = pcp.initial_relaxation.value();
+      config.omega_max                = pcp.omega_max.value();
+
+      std::string method = pcp.coupling_method.value();
+      if (method == "constant")    config.coupling_method = CouplingMethod::constant;
+      else if (method == "aitken") config.coupling_method = CouplingMethod::aitken;
+      else throw std::runtime_error("[PartitionedFSI] Unknown Coupling_method: " + method);
+
+      config.fluid_interface_face = pcp.fluid_interface_face.value();
+      config.solid_interface_face = pcp.solid_interface_face.value();
+    }
+  }
+
+  // Broadcast the active flag and config fields to all ranks.
+  MPI_Bcast(&active, 1, MPI_INT, 0, cm.com());
+  if (!active) return;
+
+  int max_iter = config.max_coupling_iterations;
+  double tol   = config.coupling_tolerance;
+  double relax = config.initial_relaxation;
+  double omax  = config.omega_max;
+  int method_i = static_cast<int>(config.coupling_method);
+  MPI_Bcast(&max_iter, 1, MPI_INT,    0, cm.com());
+  MPI_Bcast(&tol,      1, MPI_DOUBLE, 0, cm.com());
+  MPI_Bcast(&relax,    1, MPI_DOUBLE, 0, cm.com());
+  MPI_Bcast(&omax,     1, MPI_DOUBLE, 0, cm.com());
+  MPI_Bcast(&method_i, 1, MPI_INT,    0, cm.com());
+
+  auto bcast_str = [&](std::string& s) {
+    int len = static_cast<int>(s.size());
+    MPI_Bcast(&len, 1, MPI_INT, 0, cm.com());
+    s.resize(len);
+    MPI_Bcast(s.data(), len, MPI_CHAR, 0, cm.com());
+  };
+  bcast_str(config.fluid_interface_face);
+  bcast_str(config.solid_interface_face);
+
+  config.max_coupling_iterations = max_iter;
+  config.coupling_tolerance       = tol;
+  config.initial_relaxation       = relax;
+  config.omega_max                = omax;
+  config.coupling_method          = static_cast<CouplingMethod>(method_i);
+
+  partitioned_fsi_ = std::make_unique<PartitionedFSI>(this, config, xml_file_path);
+}
diff --git a/Code/Source/solver/Simulation.h b/Code/Source/solver/Simulation.h
index a8e092158..f38d7751f 100644
--- a/Code/Source/solver/Simulation.h
+++ b/Code/Source/solver/Simulation.h
@@ -13,8 +13,11 @@
 #include <string>
 #include <memory>
 
-// Forward declaration
+// Forward declarations
 class Integrator;
+class PartitionedFSI;
+
+void add_eq_linear_algebra(ComMod& com_mod, eqType& lEq);
 
 class Simulation {
 
@@ -28,6 +31,8 @@ class Simulation {
     ChnlMod& get_chnl_mod() { return chnl_mod; };
     ComMod& get_com_mod() { return com_mod; };
     Integrator& get_integrator();
+    PartitionedFSI* get_partitioned_fsi();
+    void initialize_partitioned_fsi(const std::string& xml_file_path);
 
     // Initialize the Integrator object after simulation setup is complete
     // Takes ownership of solution states via move semantics
@@ -36,6 +41,9 @@ class Simulation {
     // Read a solver paramerer input XML file.
     void read_parameters(const std::string& fileName);
 
+    // Read solver parameters from an in-memory XML string (no file on disk).
+    void read_parameters_from_string(const std::string& xml_content);
+
     // Set simulation and module member data from Parameters.
     void set_module_parameters();
 
@@ -75,6 +83,9 @@ class Simulation {
   private:
     // Time integrator for Newton iteration loop
     std::unique_ptr<Integrator> integrator_;
+
+    // Partitioned FSI coupling (null if not configured)
+    std::unique_ptr<PartitionedFSI> partitioned_fsi_;
 };
 
 #endif
diff --git a/Code/Source/solver/SimulationLogger.h b/Code/Source/solver/SimulationLogger.h
index 4023079bb..03dd96b44 100755
--- a/Code/Source/solver/SimulationLogger.h
+++ b/Code/Source/solver/SimulationLogger.h
@@ -33,6 +33,8 @@ class SimulationLogger {
 
     bool is_initialized() const { return log_file_.is_open(); }
 
+    void set_cout_write(bool v) const { cout_write_ = v; }
+
     ~SimulationLogger() 
     {
       log_file_.close();
diff --git a/Code/Source/solver/eq_assem.cpp b/Code/Source/solver/eq_assem.cpp
index 9cea03b30..39c69719f 100644
--- a/Code/Source/solver/eq_assem.cpp
+++ b/Code/Source/solver/eq_assem.cpp
@@ -58,9 +58,13 @@ void b_assem_neu_bc(ComMod& com_mod, const faceType& lFa, const Vector<double>&
     cDmn = all_fun::domain(com_mod, msh, cEq, Ec);
     auto cPhys = eq.dmn[cDmn].phys;
 
-    Vector<int> ptr(eNoN); 
-    Vector<double> N(eNoN), hl(eNoN); 
-    Array<double> yl(tDof,eNoN), lR(dof,eNoN); 
+    // For ALE partitioned FSI, extend yl to hold mesh velocity
+    const bool has_ale = (com_mod.ale_mesh_velocity.size() > 0);
+    const int yl_nrows = has_ale ? tDof + nsd : tDof;
+
+    Vector<int> ptr(eNoN);
+    Vector<double> N(eNoN), hl(eNoN);
+    Array<double> yl(yl_nrows,eNoN), lR(dof,eNoN);
     Array3<double> lK(dof*dof,eNoN,eNoN);
 
     for (int a = 0; a < eNoN; a++) {
@@ -70,6 +74,11 @@ void b_assem_neu_bc(ComMod& com_mod, const faceType& lFa, const Vector<double>&
       for (int i = 0; i < tDof; i++) {
         yl(i,a) = Yg(i,Ac);
       }
+      if (has_ale) {
+        for (int i = 0; i < nsd; i++) {
+          yl(nsd+1+i, a) = com_mod.ale_mesh_velocity(i, Ac);
+        }
+      }
     }
 
     // Updating the shape functions, if neccessary
@@ -87,7 +96,7 @@ void b_assem_neu_bc(ComMod& com_mod, const faceType& lFa, const Vector<double>&
       N  = lFa.N.col(g);
 
       double h = 0.0;
-      Vector<double> y(tDof);
+      Vector<double> y(yl_nrows);
 
       for (int a = 0; a < eNoN; a++) {
         h = h + N(a)*hl(a);
@@ -162,6 +171,9 @@ void b_assem_neu_bc(ComMod& com_mod, const faceType& lFa, const Vector<double>&
 /// @param Dg 
 void b_neu_folw_p(ComMod& com_mod, const bcType& lBc, const faceType& lFa, const Vector<double>& hg, const SolutionStates& solutions)
 {
+  // Local alias for old displacement
+  const auto& Do = solutions.old.get_displacement();
+
   using namespace consts;
   using namespace utils;
   const auto& Dg = solutions.intermediate.get_displacement();
@@ -296,6 +308,10 @@ void b_neu_folw_p(ComMod& com_mod, const bcType& lBc, const faceType& lFa, const
 /// an arbitrary vector.
 void fsi_ls_upd(ComMod& com_mod, const bcType& lBc, const faceType& lFa, const SolutionStates& solutions)
 {
+  // Local aliases for displacement arrays
+  const auto& Dn = solutions.current.get_displacement();
+  const auto& Do = solutions.old.get_displacement();
+
   using namespace consts;
   using namespace utils;
   using namespace fsi_linear_solver;
diff --git a/Code/Source/solver/fluid.cpp b/Code/Source/solver/fluid.cpp
index 71235b169..5d6821c46 100644
--- a/Code/Source/solver/fluid.cpp
+++ b/Code/Source/solver/fluid.cpp
@@ -51,7 +51,7 @@ void b_fluid(ComMod& com_mod, const int eNoN, const double w, const Vector<doubl
 
   Vector<double> u(nsd);
 
-  if (com_mod.mvMsh) {
+  if (com_mod.mvMsh || com_mod.ale_mesh_velocity.size() > 0) {
     for (int i = 0; i < nsd; i++) {
       int j = i + nsd + 1;
       u(i) = y(i) - y(j);
@@ -163,7 +163,7 @@ void bw_fluid_2d(ComMod& com_mod, const int eNoNw, const int eNoNq, const double
 
   Vector<double> uh(2); 
 
-  if (com_mod.mvMsh) {
+  if (com_mod.mvMsh || com_mod.ale_mesh_velocity.size() > 0) {
     for (int a = 0; a < eNoNw; a++) {
       uh(0) = uh(0) + Nw(a)*yl(3,a);
       uh(1) = uh(1) + Nw(a)*yl(4,a);
@@ -319,7 +319,7 @@ void bw_fluid_3d(ComMod& com_mod, const int eNoNw, const int eNoNq, const double
 
   Vector<double> uh(3); 
 
-  if (com_mod.mvMsh) {
+  if (com_mod.mvMsh || com_mod.ale_mesh_velocity.size() > 0) {
     for (int a = 0; a < eNoNw; a++) {
       uh(0) = uh(0) + Nw(a)*yl(4,a);
       uh(1) = uh(1) + Nw(a)*yl(5,a);
@@ -522,14 +522,18 @@ void construct_fluid(ComMod& com_mod, const mshType& lM, const SolutionStates& s
   #endif
 
   // FLUID: dof = nsd+1
-  Vector<int> ptr(eNoN); 
-  Array<double> xl(nsd,eNoN); 
-  
+  // For ALE partitioned FSI, extend yl to hold mesh velocity at DOFs nsd+1..2*nsd
+  const bool has_ale = (com_mod.ale_mesh_velocity.size() > 0);
+  const int yl_nrows = has_ale ? tDof + nsd : tDof;
+
+  Vector<int> ptr(eNoN);
+  Array<double> xl(nsd,eNoN);
+
   // local acceleration vector (for a single element)
   Array<double> al(tDof,eNoN);
-  
+
   // local velocity vector (for a single element)
-  Array<double> yl(tDof,eNoN);
+  Array<double> yl(yl_nrows,eNoN);
   Array<double> bfl(nsd,eNoN);
   
   // local (weak form) residual vector (for a single element) 
@@ -569,10 +573,16 @@ void construct_fluid(ComMod& com_mod, const mshType& lM, const SolutionStates& s
         xl(i,a) = com_mod.x(i,Ac);
         bfl(i,a) = com_mod.Bf(i,Ac);
      }
-      for (int i = 0; i < al.nrows(); i++) {
+      for (int i = 0; i < tDof; i++) {
         al(i,a) = Ag(i,Ac);
         yl(i,a) = Yg(i,Ac);
       }
+      // Append ALE mesh velocity at DOFs nsd+1..2*nsd (same layout as mvMsh)
+      if (has_ale) {
+        for (int i = 0; i < nsd; i++) {
+          yl(nsd+1+i, a) = com_mod.ale_mesh_velocity(i, Ac);
+        }
+      }
     }
 
     // Initialize residual and tangents
@@ -895,7 +905,7 @@ void fluid_2d_c(ComMod& com_mod, const int vmsFlag, const int eNoNw, const int e
   }
 
   //  Update convection velocity relative to mesh velocity
-  if (com_mod.mvMsh) {
+  if (com_mod.mvMsh || com_mod.ale_mesh_velocity.size() > 0) {
     for (int a = 0; a < eNoNw; a++) {
       u(0) = u(0) - Nw(a)*yl(3,a);
       u(1) = u(1) - Nw(a)*yl(4,a);
@@ -1212,7 +1222,7 @@ void fluid_2d_m(ComMod& com_mod, const int vmsFlag, const int eNoNw, const int e
   }
 
   //  Update convection velocity relative to mesh velocity
-  if (com_mod.mvMsh) {
+  if (com_mod.mvMsh || com_mod.ale_mesh_velocity.size() > 0) {
     for (int a = 0; a < eNoNw; a++) {
       u(0) = u(0) - Nw(a)*yl(3,a);
       u(1) = u(1) - Nw(a)*yl(4,a);
@@ -1562,7 +1572,7 @@ void fluid_3d_c(ComMod& com_mod, const int vmsFlag, const int eNoNw, const int e
 
   // Update convection velocity relative to mesh velocity
   //
-  if (com_mod.mvMsh) {
+  if (com_mod.mvMsh || com_mod.ale_mesh_velocity.size() > 0) {
     for (int a = 0; a < eNoNw; a++) {
       u[0] = u[0] - Nw(a)*yl(4,a);
       u[1] = u[1] - Nw(a)*yl(5,a);
@@ -1906,7 +1916,7 @@ void fluid_3d_m(ComMod& com_mod, const int vmsFlag, const int eNoNw, const int e
 
   // Update convection velocity relative to mesh velocity
   //
-  if (com_mod.mvMsh) {
+  if (com_mod.mvMsh || com_mod.ale_mesh_velocity.size() > 0) {
     for (int a = 0; a < eNoNw; a++) {
       u[0] = u[0] - Nw(a)*yl(4,a);
       u[1] = u[1] - Nw(a)*yl(5,a);
diff --git a/Code/Source/solver/fsi_coupling.cpp b/Code/Source/solver/fsi_coupling.cpp
new file mode 100644
index 000000000..540669e48
--- /dev/null
+++ b/Code/Source/solver/fsi_coupling.cpp
@@ -0,0 +1,116 @@
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "fsi_coupling.h"
+#include "Integrator.h"
+
+namespace fsi_coupling {
+
+//----------------------------------------------------------------------
+// extract_solid_displacement
+//----------------------------------------------------------------------
+Array<double> extract_solid_displacement(
+    const ComMod& com_mod, const eqType& solid_eq,
+    const faceType& lFa, const SolutionStates& solutions)
+{
+  const int nsd = com_mod.nsd;
+  const int s = solid_eq.s;  // DOF offset for the solid equation
+  const auto& Dn = solutions.current.get_displacement();
+
+  Array<double> result(nsd, lFa.nNo);
+  for (int a = 0; a < lFa.nNo; a++) {
+    int Ac = lFa.gN(a);
+    for (int i = 0; i < nsd; i++) {
+      result(i, a) = Dn(i + s, Ac);
+    }
+  }
+  return result;
+}
+
+//----------------------------------------------------------------------
+// apply_velocity_on_fluid
+//----------------------------------------------------------------------
+void apply_velocity_on_fluid(
+    ComMod& com_mod, const eqType& fluid_eq,
+    const faceType& lFa,
+    const Array<double>& velocity,
+    SolutionStates& solutions)
+{
+  const int nsd = com_mod.nsd;
+  const int s = fluid_eq.s;
+  const double dt = com_mod.dt;
+  const double gam = fluid_eq.gam;
+
+  auto& An = solutions.current.get_acceleration();
+  auto& Yn = solutions.current.get_velocity();
+  const auto& Yo = solutions.old.get_velocity();
+  const auto& Ao = solutions.old.get_acceleration();
+
+  for (int a = 0; a < lFa.nNo; a++) {
+    int Ac = lFa.gN(a);
+    for (int i = 0; i < nsd; i++) {
+      Yn(i + s, Ac) = velocity(i, a);
+      double a_new;
+      newmark::state_from_velocity(
+          velocity(i, a), Yo(i + s, Ac), Ao(i + s, Ac), dt, gam, a_new);
+      An(i + s, Ac) = a_new;
+    }
+  }
+}
+
+//----------------------------------------------------------------------
+// apply_traction_on_solid
+//----------------------------------------------------------------------
+void apply_traction_on_solid(
+    ComMod& com_mod, const eqType& solid_eq,
+    const faceType& lFa,
+    const Array<double>& traction)
+{
+  // The traction array contains consistent nodal forces (external force on solid).
+  // In svMultiPhysics, external forces are SUBTRACTED from R (see b_l_elas:
+  // lR -= w*N*h). So R -= traction.
+  for (int a = 0; a < lFa.nNo; a++) {
+    int Ac = lFa.gN(a);
+    for (int i = 0; i < traction.nrows(); i++) {
+      com_mod.R(i, Ac) -= traction(i, a);
+    }
+  }
+}
+
+//----------------------------------------------------------------------
+// apply_displacement_on_mesh
+//----------------------------------------------------------------------
+void apply_displacement_on_mesh(
+    ComMod& com_mod, const eqType& mesh_eq,
+    const faceType& lFa,
+    const Array<double>& displacement,
+    SolutionStates& solutions)
+{
+  const int nsd = com_mod.nsd;
+  const int s = mesh_eq.s;
+  const double dt = com_mod.dt;
+  const double gam = mesh_eq.gam;
+  const double beta = mesh_eq.beta;
+
+  auto& An = solutions.current.get_acceleration();
+  auto& Yn = solutions.current.get_velocity();
+  auto& Dn = solutions.current.get_displacement();
+  const auto& Do = solutions.old.get_displacement();
+  const auto& Yo = solutions.old.get_velocity();
+  const auto& Ao = solutions.old.get_acceleration();
+
+  for (int a = 0; a < lFa.nNo; a++) {
+    int Ac = lFa.gN(a);
+    for (int i = 0; i < nsd; i++) {
+      Dn(i + s, Ac) = displacement(i, a);
+      double a_new, v_new;
+      newmark::state_from_displacement(
+          displacement(i, a), Do(i + s, Ac), Yo(i + s, Ac), Ao(i + s, Ac),
+          dt, beta, gam, a_new, v_new);
+      An(i + s, Ac) = a_new;
+      Yn(i + s, Ac) = v_new;
+    }
+  }
+}
+
+} // namespace fsi_coupling
diff --git a/Code/Source/solver/fsi_coupling.h b/Code/Source/solver/fsi_coupling.h
new file mode 100644
index 000000000..c32c5e569
--- /dev/null
+++ b/Code/Source/solver/fsi_coupling.h
@@ -0,0 +1,72 @@
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef FSI_COUPLING_H
+#define FSI_COUPLING_H
+
+#include "ComMod.h"
+#include "SolutionStates.h"
+
+/// @brief FSI interface data exchange functions for partitioned coupling.
+///
+/// These functions extract and apply fluid traction and solid displacement
+/// at the FSI interface, enabling partitioned (Dirichlet-Neumann) coupling
+/// between separately solved fluid and solid equations.
+///
+/// Related to GitHub issue #431: Implement partitioned FSI in svMultiPhysics
+
+namespace fsi_coupling {
+
+/// @brief Extract solid displacement at interface face nodes.
+///
+/// @param com_mod Common module
+/// @param solid_eq The solid equation (for DOF offset)
+/// @param solid_face The solid-side FSI interface face
+/// @param solutions Solution states
+/// @return Array(nsd, solid_face.nNo) of displacement values
+Array<double> extract_solid_displacement(
+    const ComMod& com_mod, const eqType& solid_eq,
+    const faceType& solid_face, const SolutionStates& solutions);
+
+/// @brief Apply velocity as strong Dirichlet BC on fluid interface nodes.
+/// Directly sets Yn at the fluid equation DOF range for the face nodes.
+void apply_velocity_on_fluid(
+    ComMod& com_mod, const eqType& fluid_eq,
+    const faceType& fluid_face,
+    const Array<double>& velocity,
+    SolutionStates& solutions);
+
+/// @brief Apply pre-computed consistent nodal forces to the solid residual.
+///
+/// Adds the traction forces directly to com_mod.R at the global node locations
+/// corresponding to the solid face. This should be called during the
+/// post-assembly callback of step_equation() for the solid equation.
+///
+/// @param com_mod Common module (R is modified)
+/// @param solid_eq The solid equation (for DOF offset)
+/// @param solid_face The solid-side FSI interface face
+/// @param traction Array(nsd, solid_face.nNo) of consistent nodal forces
+void apply_traction_on_solid(
+    ComMod& com_mod, const eqType& solid_eq,
+    const faceType& solid_face,
+    const Array<double>& traction);
+
+/// @brief Apply displacement as strong Dirichlet BC on mesh interface nodes.
+///
+/// Directly sets the displacement in the solution arrays (An, Yn, Dn) for
+/// the mesh equation DOF range at the interface face nodes.
+///
+/// @param com_mod Common module
+/// @param mesh_eq The mesh equation (for DOF offset)
+/// @param mesh_face The mesh-side FSI interface face
+/// @param displacement Array(nsd, mesh_face.nNo) of displacement values
+/// @param solutions Solution states (modified)
+void apply_displacement_on_mesh(
+    ComMod& com_mod, const eqType& mesh_eq,
+    const faceType& mesh_face,
+    const Array<double>& displacement,
+    SolutionStates& solutions);
+
+} // namespace fsi_coupling
+
+#endif // FSI_COUPLING_H
diff --git a/Code/Source/solver/main.cpp b/Code/Source/solver/main.cpp
index 4e778f03e..377adec6a 100644
--- a/Code/Source/solver/main.cpp
+++ b/Code/Source/solver/main.cpp
@@ -9,6 +9,7 @@
 //
 #include "Simulation.h"
 #include "Integrator.h"
+#include "PartitionedFSI.h"
 
 #include "all_fun.h"
 #include "bf.h"
@@ -36,21 +37,6 @@
 #include <cmath>
 #include <fstream>
 
-//------------------------
-// add_eq_linear_algebra
-//------------------------
-// Create a LinearAlgebra object for an equation.
-//
-void add_eq_linear_algebra(ComMod& com_mod, eqType& lEq)
-{
-  lEq.linear_algebra = LinearAlgebraFactory::create_interface(lEq.linear_algebra_type);
-  lEq.linear_algebra->set_preconditioner(lEq.linear_algebra_preconditioner);
-  lEq.linear_algebra->initialize(com_mod, lEq);
-
-  if (lEq.linear_algebra_assembly_type != consts::LinearAlgebraType::none) {
-    lEq.linear_algebra->set_assembly(lEq.linear_algebra_assembly_type);
-  }
-}
 
 void finalize_linear_algebra(eqType& lEq)
 {
@@ -350,6 +336,8 @@ void iterate_solution(Simulation* simulation)
     #endif
 
     int iEqOld = cEq;
+
+    // Monolithic Newton iteration loop
     integrator.step();
 
     #ifdef debug_iterate_solution
@@ -569,7 +557,12 @@ void iterate_solution(Simulation* simulation)
 
 void run_simulation(Simulation* simulation)
 {
-  iterate_solution(simulation);
+  auto* partitioned_fsi = simulation->get_partitioned_fsi();
+  if (partitioned_fsi) {
+    partitioned_fsi->run();
+  } else {
+    iterate_solution(simulation);
+  }
 }
 
 
@@ -655,6 +648,9 @@ int main(int argc, char *argv[])
       add_eq_linear_algebra(simulation->com_mod, eq);
     }
 
+    // Initialize partitioned FSI coupling if configured
+    simulation->initialize_partitioned_fsi(file_name);
+
     #ifdef debug_main
     for (int iM = 0; iM < simulation->com_mod.nMsh; iM++) {
       dmsg << "---------- iM " << iM;
diff --git a/Code/Source/solver/mesh.cpp b/Code/Source/solver/mesh.cpp
index b7b8cdfb9..c3d6c38cf 100644
--- a/Code/Source/solver/mesh.cpp
+++ b/Code/Source/solver/mesh.cpp
@@ -45,9 +45,12 @@ void construct_mesh(ComMod& com_mod, CepMod& cep_mod, const mshType& lM, const S
   auto& pSa = com_mod.pSa;
   bool pstEq = com_mod.pstEq;
 
-  // Start and end DOF
-  int is = nsd + 1;
-  int ie = 2*nsd;
+  // Start and end DOF for mesh equation in the global solution arrays.
+  // Use eq.s (DOF offset) instead of hardcoded nsd+1, so this works both
+  // in monolithic FSI (where mesh is the 3rd equation) and in partitioned
+  // FSI (where mesh is the only equation with eq.s=0).
+  int is = eq.s;
+  int ie = eq.s + nsd - 1;
   int eNoN = lM.eNoN;
   #ifdef debug_construct_mesh
   dmsg << "cEq: " << cEq;
diff --git a/Code/Source/solver/post.cpp b/Code/Source/solver/post.cpp
index 84b2c23c8..a4a2a2984 100644
--- a/Code/Source/solver/post.cpp
+++ b/Code/Source/solver/post.cpp
@@ -5,6 +5,7 @@
 
 #include "FE/Common/FEException.h"
 #include "all_fun.h"
+#include <unordered_map>
 #include "fluid.h"
 #include "fs.h"
 #include "initialize.h"
@@ -2088,4 +2089,148 @@ void tpost(Simulation* simulation, const mshType& lM, const int m, Array<double>
   }
 }
 
+//----------------------------------------------------------------------
+// compute_face_traction — consistent nodal forces at a fluid face
+//----------------------------------------------------------------------
+// Computes f(i,a) = integral(sigma_ij * n_j * N_a dGamma) at each face
+// node.  Returns the force that the fluid exerts ON the solid.
+// Adapts the stress computation from bpost() but integrates over the
+// face using face Gauss quadrature.
+//
+Array<double> compute_face_traction(
+    ComMod& com_mod, const CmMod& cm_mod,
+    const mshType& lM, const faceType& lFa,
+    const eqType& eq,
+    const SolutionStates& solutions)
+{
+  const auto& Yg = solutions.intermediate.get_velocity();
+  const auto& Dg = solutions.intermediate.get_displacement();
+
+  const int nsd = com_mod.nsd;
+  const int eNoN = lM.eNoN;
+
+  // Pressure function space (P1-P1 or Taylor-Hood P2-P1)
+  fsType fsP;
+  if (lM.nFs == 1) {
+    fsP.eNoN = lM.fs[0].eNoN;
+    fsP.N = lM.fs[0].N;
+  } else {
+    fsP.eNoN = lM.fs[1].eNoN;
+    fsP.nG = lM.fs[0].nG;
+    fsP.eType = lM.fs[1].eType;
+    fs::alloc_fs(fsP, nsd, nsd);
+    fsP.xi = lM.fs[0].xi;
+    for (int g = 0; g < fsP.nG; g++) {
+      nn::get_gnn(nsd, fsP.eType, fsP.eNoN, g, fsP.xi, fsP.N, fsP.Nx);
+    }
+  }
+
+  Array<double> result(nsd, lFa.nNo);
+
+  std::unordered_map<int, int> global_to_face;
+  global_to_face.reserve(lFa.nNo);
+  for (int a = 0; a < lFa.nNo; a++) {
+    global_to_face[lFa.gN(a)] = a;
+  }
+
+  Array<double> xl(nsd, eNoN);
+  Array<double> ul(nsd, eNoN);
+  Vector<double> pl(fsP.eNoN);
+  Array<double> Nx(nsd, eNoN);
+  Array<double> ks(nsd, nsd);
+
+  for (int e = 0; e < lFa.nEl; e++) {
+    int Ec = lFa.gE(e);
+    int cEq = com_mod.cEq;
+    int cDmn = all_fun::domain(com_mod, lM, cEq, Ec);
+    if (cDmn == -1) continue;
+
+    for (int a = 0; a < eNoN; a++) {
+      int Ac = lM.IEN(a, Ec);
+      for (int i = 0; i < nsd; i++) {
+        xl(i, a) = com_mod.x(i, Ac);
+        ul(i, a) = Yg(i, Ac);
+      }
+    }
+    for (int a = 0; a < fsP.eNoN; a++) {
+      int Ac = lM.IEN(a, Ec);
+      pl(a) = Yg(nsd, Ac);
+    }
+
+    Array<double> sigma_avg(nsd, nsd);
+    double Jac_vol;
+
+    for (int g = 0; g < lM.nG; g++) {
+      if (g == 0 || !lM.lShpF) {
+        auto lM_Nx = lM.Nx.slice(g);
+        nn::gnn(eNoN, nsd, nsd, lM_Nx, xl, Nx, Jac_vol, ks);
+      }
+
+      Array<double> ux(nsd, nsd);
+      for (int a = 0; a < eNoN; a++)
+        for (int i = 0; i < nsd; i++)
+          for (int j = 0; j < nsd; j++)
+            ux(i, j) += Nx(i, a) * ul(j, a);
+
+      double p = 0.0;
+      for (int a = 0; a < fsP.eNoN; a++)
+        p += fsP.N(a, g) * pl(a);
+
+      double gam = 0.0;
+      for (int i = 0; i < nsd; i++)
+        for (int j = 0; j < nsd; j++)
+          gam += (ux(i, j) + ux(j, i)) * (ux(i, j) + ux(j, i));
+      gam = sqrt(0.5 * gam);
+
+      double mu, mu_s;
+      fluid::get_viscosity(com_mod, eq.dmn[cDmn], gam, mu, mu_s, mu_s);
+
+      for (int i = 0; i < nsd; i++)
+        for (int j = 0; j < nsd; j++)
+          sigma_avg(i, j) += (-p * (i == j ? 1.0 : 0.0)
+                             + mu * (ux(i, j) + ux(j, i)))
+                             / static_cast<double>(lM.nG);
+    }
+
+    for (int gf = 0; gf < lFa.nG; gf++) {
+      Vector<double> nV(nsd);
+      auto face_Nx = lFa.Nx.slice(gf);
+      nn::gnnb(com_mod, lFa, e, gf, nsd, nsd - 1, lFa.eNoN, face_Nx, nV,
+               solutions, consts::MechanicalConfigurationType::reference);
+      double Jac_face = sqrt(utils::norm(nV));
+      double w = lFa.w(gf) * Jac_face;
+      for (int i = 0; i < nsd; i++) nV(i) /= Jac_face;
+
+      Vector<double> trac(nsd);
+      for (int i = 0; i < nsd; i++)
+        for (int j = 0; j < nsd; j++)
+          trac(i) += sigma_avg(i, j) * nV(j);
+
+      auto N = lFa.N.col(gf);
+      for (int a = 0; a < lFa.eNoN; a++) {
+        int Ac = lFa.IEN(a, e);
+        int a_local = global_to_face[Ac];
+        for (int i = 0; i < nsd; i++)
+          result(i, a_local) -= w * N(a) * trac(i);
+      }
+    }
+  }
+
+  // MPI communication
+  Array<double> gResult(nsd, com_mod.tnNo);
+  for (int a = 0; a < lFa.nNo; a++) {
+    int Ac = lFa.gN(a);
+    for (int i = 0; i < nsd; i++)
+      gResult(i, Ac) = result(i, a);
+  }
+  all_fun::commu(com_mod, gResult);
+  for (int a = 0; a < lFa.nNo; a++) {
+    int Ac = lFa.gN(a);
+    for (int i = 0; i < nsd; i++)
+      result(i, a) = gResult(i, Ac);
+  }
+
+  return result;
+}
+
 };
diff --git a/Code/Source/solver/post.h b/Code/Source/solver/post.h
index 88982d867..5fdcc60c3 100644
--- a/Code/Source/solver/post.h
+++ b/Code/Source/solver/post.h
@@ -37,6 +37,16 @@ void shl_post(Simulation* simulation, const mshType& lM, const int m, Array<doub
 void tpost(Simulation* simulation, const mshType& lM, const int m, Array<double>& res, Vector<double>& resE,
     const SolutionStates& solutions, const int iEq, consts::OutputNameType outGrp);
 
+/// @brief Compute consistent nodal traction forces at a fluid face.
+///
+/// Used by partitioned FSI to extract the fluid traction at the FSI
+/// interface.  Returns force ON the solid (sign: -(sigma . n_fluid)).
+Array<double> compute_face_traction(
+    ComMod& com_mod, const CmMod& cm_mod,
+    const mshType& fluid_mesh, const faceType& fluid_face,
+    const eqType& fluid_eq,
+    const SolutionStates& solutions);
+
 };
 
 #endif
diff --git a/Code/Source/solver/read_files.cpp b/Code/Source/solver/read_files.cpp
index fdfbe36ec..a2b155f87 100644
--- a/Code/Source/solver/read_files.cpp
+++ b/Code/Source/solver/read_files.cpp
@@ -1651,7 +1651,7 @@ void read_fiber_temporal_values_file(FiberReinforcementStressParameters& fiber_p
 // The many global COMMOD values set in the Fortan subroutine are set in the
 // Simulation::set_module_parameters() method.
 //
-void read_files(Simulation* simulation, const std::string& file_name)
+void read_files(Simulation* simulation, const std::string& file_name, bool from_string)
 {
   using namespace consts;
 
@@ -1663,12 +1663,18 @@ void read_files(Simulation* simulation, const std::string& file_name)
   dmsg.banner();
   #endif
 
-  // Read the solver XML file.
+  // Read the solver parameters. 'file_name' is either a path to an XML file or,
+  // when from_string is true, the XML content itself (used by the partitioned
+  // FSI driver to build sub-simulations without temp files).
   #ifdef debug_read_files
-  dmsg << "Read the solver XML file " << " ... ";
+  dmsg << "Read the solver XML " << " ... ";
   #endif
   if (!com_mod.resetSim) {
-    simulation->read_parameters(std::string(file_name));
+    if (from_string) {
+      simulation->read_parameters_from_string(file_name);
+    } else {
+      simulation->read_parameters(std::string(file_name));
+    }
   }
 
   auto& chnl_mod = simulation->get_chnl_mod();
@@ -1816,12 +1822,18 @@ void read_files(Simulation* simulation, const std::string& file_name)
       }
     }     
 
-    if (eq.phys == EquationType::phys_mesh) {   
+    if (eq.phys == EquationType::phys_mesh) {
+      // For partitioned FSI, mvMsh is set when Partitioned_coupling is configured
+      if (!com_mod.mvMsh && simulation->parameters.partitioned_coupling_parameters.defined()) {
+        com_mod.mvMsh = true;
+      }
       if (!com_mod.mvMsh) {
-        throw std::runtime_error("mesh equation can only be specified after FSI equation");
+        throw std::runtime_error("mesh equation can only be specified after FSI or with Partitioned_coupling");
+      }
+      if (com_mod.nEq > 0 && com_mod.eq[0].phys == EquationType::phys_FSI) {
+        // Use the explicit geometry coupling flag of the FSI equation.
+        eq.expl_geom_cpl = com_mod.eq[0].expl_geom_cpl;
       }
-      // Use the explicit geometry coupling flag of the FSI equation.
-      eq.expl_geom_cpl = com_mod.eq[0].expl_geom_cpl; 
     }     
   }
   #ifdef debug_read_files
diff --git a/Code/Source/solver/read_files.h b/Code/Source/solver/read_files.h
index 3cccfc469..3574e9c53 100644
--- a/Code/Source/solver/read_files.h
+++ b/Code/Source/solver/read_files.h
@@ -33,7 +33,7 @@ namespace read_files_ns {
 
   void read_eq(Simulation* simulation, EquationParameters* params, eqType& eq);
 
-  void read_files(Simulation* simulation, const std::string& file_name);
+  void read_files(Simulation* simulation, const std::string& file_name, bool from_string = false);
 
   void read_fourier_coeff_values_file(const std::string& file_name, bcType& lBc);
   void read_fourier_coeff_values_file(const std::string& file_name, bfType& lBf);
diff --git a/Code/Source/solver/set_bc.cpp b/Code/Source/solver/set_bc.cpp
index 01d361ceb..1288db0a9 100644
--- a/Code/Source/solver/set_bc.cpp
+++ b/Code/Source/solver/set_bc.cpp
@@ -1109,6 +1109,9 @@ void set_bc_dir_l(ComMod& com_mod, const bcType& lBc, const faceType& lFa, Array
 //
 void set_bc_dir_w(ComMod& com_mod, const SolutionStates& solutions)
 {
+  // Local alias for old displacement
+  const auto& Do = solutions.old.get_displacement();
+
   using namespace consts;
 
   const int cEq = com_mod.cEq;
@@ -1231,8 +1234,12 @@ void set_bc_dir_wl(ComMod& com_mod, const bcType& lBc, const mshType& lM, const
     }
   }
 
-  Vector<int> ptr(eNoN); 
-  Array<double> xl(nsd,eNoN), yl(tDof,eNoN), lR(dof,eNoN);
+  // For ALE partitioned FSI, extend yl to hold mesh velocity
+  const bool has_ale = (com_mod.ale_mesh_velocity.size() > 0);
+  const int yl_nrows = has_ale ? tDof + nsd : tDof;
+
+  Vector<int> ptr(eNoN);
+  Array<double> xl(nsd,eNoN), yl(yl_nrows,eNoN), lR(dof,eNoN);
   Array3<double> lK(dof*dof,eNoN,eNoN);
   Array<double> xbl(nsd,eNoNb), ubl(nsd,eNoNb);
 
@@ -1261,6 +1268,11 @@ void set_bc_dir_wl(ComMod& com_mod, const bcType& lBc, const mshType& lM, const
       for (int i = 0; i < tDof; i++) {
         yl(i,a) = Yg(i,Ac);
       }
+      if (has_ale) {
+        for (int i = 0; i < nsd; i++) {
+          yl(nsd+1+i, a) = com_mod.ale_mesh_velocity(i, Ac);
+        }
+      }
 
       for (int i = 0; i < nsd; i++) {
         xl(i,a) = com_mod.x(i,Ac);
@@ -1991,6 +2003,71 @@ void set_bc_undef_neu_l(ComMod& com_mod, const bcType& lBc, const faceType& lFa)
   }
 }
 
+//----------------------------------------------------------------------
+// enforce_dirichlet_on_face
+//----------------------------------------------------------------------
+void enforce_dirichlet_on_face(ComMod& com_mod, const faceType& lFa, int nsd)
+{
+  const auto& eq = com_mod.eq[com_mod.cEq];
+  const int dof = eq.dof;
+  const auto& rowPtr = com_mod.rowPtr;
+  const auto& colPtr = com_mod.colPtr;
+  auto& R = com_mod.R;
+  auto& Val = com_mod.Val;
+
+  for (int a = 0; a < lFa.nNo; a++) {
+    int rowN = lFa.gN(a);
+    for (int i = 0; i < dof; i++) {
+      R(i, rowN) = 0.0;
+    }
+    for (int j = rowPtr(rowN); j <= rowPtr(rowN + 1) - 1; j++) {
+      int colN = colPtr(j);
+      for (int iDof = 0; iDof < dof * dof; iDof++) {
+        Val(iDof, j) = 0.0;
+      }
+      if (colN == rowN) {
+        for (int i = 0; i < dof; i++) {
+          Val(i * dof + i, j) = 1.0;
+        }
+      }
+    }
+  }
+}
+
+//----------------------------------------------------------------------
+// enforce_dirichlet_dofs_on_face
+//----------------------------------------------------------------------
+void enforce_dirichlet_dofs_on_face(ComMod& com_mod, const faceType& lFa,
+                                    int dof_start, int num_dofs)
+{
+  const auto& eq = com_mod.eq[com_mod.cEq];
+  const int dof = eq.dof;
+  const auto& rowPtr = com_mod.rowPtr;
+  const auto& colPtr = com_mod.colPtr;
+  auto& R = com_mod.R;
+  auto& Val = com_mod.Val;
+
+  for (int a = 0; a < lFa.nNo; a++) {
+    int rowN = lFa.gN(a);
+    for (int i = dof_start; i < dof_start + num_dofs; i++) {
+      R(i, rowN) = 0.0;
+    }
+    for (int j = rowPtr(rowN); j <= rowPtr(rowN + 1) - 1; j++) {
+      int colN = colPtr(j);
+      for (int i = dof_start; i < dof_start + num_dofs; i++) {
+        for (int k = 0; k < dof; k++) {
+          Val(i * dof + k, j) = 0.0;
+        }
+      }
+      if (colN == rowN) {
+        for (int i = dof_start; i < dof_start + num_dofs; i++) {
+          Val(i * dof + i, j) = 1.0;
+        }
+      }
+    }
+  }
+}
+
 };
 
 
diff --git a/Code/Source/solver/set_bc.h b/Code/Source/solver/set_bc.h
index b84eadba7..229788d0a 100644
--- a/Code/Source/solver/set_bc.h
+++ b/Code/Source/solver/set_bc.h
@@ -45,6 +45,13 @@ void set_bc_undef_neu(ComMod& com_mod);
 
 void set_bc_undef_neu_l(ComMod& com_mod, const bcType& lBc, const faceType& lFa);
 
+/// @brief Enforce Dirichlet BC at all DOFs of face nodes in assembled system.
+void enforce_dirichlet_on_face(ComMod& com_mod, const faceType& lFa, int nsd);
+
+/// @brief Enforce Dirichlet BC for DOFs [dof_start, dof_start+num_dofs) at face nodes.
+void enforce_dirichlet_dofs_on_face(ComMod& com_mod, const faceType& lFa,
+                                    int dof_start, int num_dofs);
+
 };
 
 #endif
diff --git a/tests/cases/fsi/compare_fsi.py b/tests/cases/fsi/compare_fsi.py
new file mode 100644
index 000000000..2a03dafb4
--- /dev/null
+++ b/tests/cases/fsi/compare_fsi.py
@@ -0,0 +1,215 @@
+"""
+Compare monolithic vs partitioned FSI results and generate:
+1. Field comparison at final time step
+2. Videos of both simulations
+3. Coupling performance plot for partitioned FSI
+"""
+
+import numpy as np
+import meshio
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+from matplotlib.gridspec import GridSpec
+from matplotlib.colors import Normalize
+import matplotlib.animation as animation
+import os, re
+
+mono_dir = "pipe_3d/1-procs"
+part_dir = "pipe_3d_partitioned/1-procs"
+coupling_log = "pipe_3d_partitioned/coupling_log.txt"
+out_dir = "pipe_3d_partitioned"
+
+n_steps = 50
+
+# ============================================================
+# 1. Parse coupling log
+# ============================================================
+print("Parsing coupling log...")
+coupling_data = {}  # ts -> [(outer, omega, rel_disp), ...]
+
+with open(coupling_log) as f:
+    for line in f:
+        # Format: CP TS-ITER  TIME  [dB REL_DISP OMEGA]
+        m = re.match(r'\s*CP\s+(\d+)-(\d+)\s+([\d.e+-]+)\s+\[(-?\d+)\s+([\d.e+-]+)\s+([\d.e+-]+)\]', line)
+        if m:
+            ts = int(m.group(1))
+            outer = int(m.group(2))
+            rel_disp = float(m.group(5))
+            omega = float(m.group(6))
+            if ts not in coupling_data:
+                coupling_data[ts] = []
+            coupling_data[ts].append((outer, omega, rel_disp))
+
+# ============================================================
+# 2. Coupling performance plot
+# ============================================================
+print("Creating coupling performance plot...")
+
+fig, axes = plt.subplots(2, 1, figsize=(10, 7), sharex=True)
+
+# Top: number of coupling iterations per time step
+ts_list = sorted(coupling_data.keys())
+n_iters = [len(coupling_data[ts]) for ts in ts_list]
+axes[0].bar(ts_list, n_iters, color='steelblue', width=0.8)
+axes[0].set_ylabel('Coupling iterations')
+axes[0].set_title('Partitioned FSI Coupling Performance (Aitken relaxation)')
+axes[0].set_ylim(0, max(n_iters) + 1)
+
+# Bottom: convergence history (all time steps overlaid)
+cmap = plt.cm.viridis
+for i, ts in enumerate(ts_list):
+    iters = coupling_data[ts]
+    x = [it[0] for it in iters]
+    y = [it[2] for it in iters]
+    color = cmap(i / max(len(ts_list) - 1, 1))
+    axes[1].semilogy(x, y, 'o-', color=color, alpha=0.6, lw=1, markersize=4)
+
+# Highlight first and last
+for ts, color, label in [(ts_list[0], 'blue', f'TS {ts_list[0]}'),
+                          (ts_list[-1], 'red', f'TS {ts_list[-1]}')]:
+    iters = coupling_data[ts]
+    x = [it[0] for it in iters]
+    y = [it[2] for it in iters]
+    axes[1].semilogy(x, y, 'o-', color=color, lw=2, markersize=6, label=label)
+
+axes[1].axhline(1e-8, color='green', linestyle='--', linewidth=1.5, label='Tolerance (1e-8)')
+axes[1].set_xlabel('Coupling iteration within time step')
+axes[1].set_ylabel('Relative displacement change')
+axes[1].legend(loc='upper right')
+axes[1].set_ylim(1e-12, 10)
+axes[1].set_xlim(0.5, max(n_iters) + 0.5)
+
+plt.tight_layout()
+plt.savefig(os.path.join(out_dir, 'coupling_performance.png'), dpi=150)
+plt.close()
+print(f"  Saved {out_dir}/coupling_performance.png")
+
+# ============================================================
+# 3. Final time step comparison
+# ============================================================
+print("Comparing final time step fields...")
+
+mono = meshio.read(os.path.join(mono_dir, f"result_{n_steps:03d}.vtu"))
+part = meshio.read(os.path.join(part_dir, f"result_{n_steps:03d}.vtu"))
+
+print(f"\n{'Field':<20} {'Mono max':>12} {'Part max':>12} {'Rel diff':>12}")
+print("-" * 60)
+for f in mono.point_data:
+    m = np.max(np.abs(mono.point_data[f]))
+    if f in part.point_data:
+        p = np.max(np.abs(part.point_data[f]))
+    else:
+        p = 0.0
+    rd = abs(m - p) / (m + 1e-30)
+    print(f"{f:<20} {m:12.4e} {p:12.4e} {rd:12.4e}")
+
+# ============================================================
+# 4. Videos (side-by-side velocity magnitude on z=0 slice)
+# ============================================================
+print("\nCreating videos...")
+
+def get_velocity_mag(fname):
+    """Read VTU and return (points, velocity_magnitude)"""
+    m = meshio.read(fname)
+    pts = m.points
+    vel = m.point_data.get('Velocity', np.zeros((len(pts), 3)))
+    vmag = np.sqrt(vel[:, 0]**2 + vel[:, 1]**2 + vel[:, 2]**2)
+    return pts, vmag
+
+def get_displacement_mag(fname):
+    """Read VTU and return (points, displacement_magnitude)"""
+    m = meshio.read(fname)
+    pts = m.points
+    # Try FS_Displacement first (monolithic), then Displacement
+    for key in ['FS_Displacement', 'Displacement']:
+        if key in m.point_data:
+            d = m.point_data[key]
+            if np.max(np.abs(d)) > 1e-15:
+                return pts, np.sqrt(d[:, 0]**2 + d[:, 1]**2 + d[:, 2]**2)
+    return pts, np.zeros(len(pts))
+
+# Get all data for velocity video
+print("  Reading velocity data...")
+mono_vel = []
+part_vel = []
+for ts in range(1, n_steps + 1):
+    mono_pts, mono_vmag = get_velocity_mag(os.path.join(mono_dir, f"result_{ts:03d}.vtu"))
+    part_pts, part_vmag = get_velocity_mag(os.path.join(part_dir, f"result_{ts:03d}.vtu"))
+    mono_vel.append((mono_pts, mono_vmag))
+    part_vel.append((part_pts, part_vmag))
+
+# Use monolithic velocity range for color scaling (partitioned may differ)
+vmax_mono = max(np.max(v[1]) for v in mono_vel)
+
+# Create velocity animation
+print("  Rendering velocity video...")
+fig, axes = plt.subplots(1, 2, figsize=(14, 5))
+
+def plot_frame(frame_idx):
+    for ax in axes:
+        ax.clear()
+
+    pts_m, vmag_m = mono_vel[frame_idx]
+    pts_p, vmag_p = part_vel[frame_idx]
+
+    # Each subplot uses its own normalization for best visibility
+    sc1 = axes[0].scatter(pts_m[:, 2], pts_m[:, 0], c=vmag_m, s=2,
+                          vmin=0, vmax=vmax_mono, cmap='coolwarm')
+    axes[0].set_title(f'Monolithic FSI')
+    axes[0].set_xlabel('z')
+    axes[0].set_ylabel('x')
+    axes[0].set_aspect('equal')
+
+    sc2 = axes[1].scatter(pts_p[:, 2], pts_p[:, 0], c=vmag_p, s=2,
+                          vmin=0, vmax=vmax_mono, cmap='coolwarm')
+    axes[1].set_title(f'Partitioned FSI')
+    axes[1].set_xlabel('z')
+    axes[1].set_ylabel('x')
+    axes[1].set_aspect('equal')
+
+    fig.suptitle(f'Velocity Magnitude (step {frame_idx+1}/{n_steps}, dt=1e-4)', fontsize=14)
+    return sc1, sc2
+
+plot_frame(0)
+plt.tight_layout()
+
+ani = animation.FuncAnimation(fig, plot_frame, frames=n_steps, interval=200, blit=False)
+ani.save(os.path.join(out_dir, 'velocity_comparison.gif'), writer='pillow', fps=5)
+plt.close()
+print(f"  Saved {out_dir}/velocity_comparison.gif")
+
+# Create pressure comparison at final step
+print("  Creating final pressure comparison...")
+fig, axes = plt.subplots(1, 2, figsize=(14, 5))
+
+mono_final = meshio.read(os.path.join(mono_dir, f"result_{n_steps:03d}.vtu"))
+part_final = meshio.read(os.path.join(part_dir, f"result_{n_steps:03d}.vtu"))
+
+p_mono = mono_final.point_data.get('Pressure', np.zeros(len(mono_final.points)))
+p_part = part_final.point_data.get('Pressure', np.zeros(len(part_final.points)))
+pmax = max(np.max(np.abs(p_mono)), np.max(np.abs(p_part)))
+
+sc1 = axes[0].scatter(mono_final.points[:, 2], mono_final.points[:, 0],
+                       c=p_mono, s=2, vmin=-pmax, vmax=pmax, cmap='RdBu_r')
+axes[0].set_title('Monolithic FSI')
+axes[0].set_xlabel('z')
+axes[0].set_ylabel('x')
+axes[0].set_aspect('equal')
+plt.colorbar(sc1, ax=axes[0], label='Pressure')
+
+sc2 = axes[1].scatter(part_final.points[:, 2], part_final.points[:, 0],
+                       c=p_part, s=2, vmin=-pmax, vmax=pmax, cmap='RdBu_r')
+axes[1].set_title('Partitioned FSI')
+axes[1].set_xlabel('z')
+axes[1].set_ylabel('x')
+axes[1].set_aspect('equal')
+plt.colorbar(sc2, ax=axes[1], label='Pressure')
+
+fig.suptitle(f'Pressure at step {n_steps}', fontsize=14)
+plt.tight_layout()
+plt.savefig(os.path.join(out_dir, 'pressure_comparison.png'), dpi=150)
+plt.close()
+print(f"  Saved {out_dir}/pressure_comparison.png")
+
+print("\nDone!")
diff --git a/tests/cases/fsi/pipe_3d_partitioned/mesh/fluid/mesh-complete.mesh.vtu b/tests/cases/fsi/pipe_3d_partitioned/mesh/fluid/mesh-complete.mesh.vtu
new file mode 100644
index 000000000..8ddb1119b
--- /dev/null
+++ b/tests/cases/fsi/pipe_3d_partitioned/mesh/fluid/mesh-complete.mesh.vtu
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bff21b54094fe528e6f7cd9b57dafc5863b530e112e08f8a62a04431ec14a7a3
+size 67969
diff --git a/tests/cases/fsi/pipe_3d_partitioned/mesh/fluid/mesh-surfaces/end.vtp b/tests/cases/fsi/pipe_3d_partitioned/mesh/fluid/mesh-surfaces/end.vtp
new file mode 100644
index 000000000..e7c6c313c
--- /dev/null
+++ b/tests/cases/fsi/pipe_3d_partitioned/mesh/fluid/mesh-surfaces/end.vtp
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:57511367796bab30c07b3b5dbd3b6ec596e433cb1aae2afe763f05ab357655bf
+size 13661
diff --git a/tests/cases/fsi/pipe_3d_partitioned/mesh/fluid/mesh-surfaces/interface.vtp b/tests/cases/fsi/pipe_3d_partitioned/mesh/fluid/mesh-surfaces/interface.vtp
new file mode 100644
index 000000000..c52d70d2a
--- /dev/null
+++ b/tests/cases/fsi/pipe_3d_partitioned/mesh/fluid/mesh-surfaces/interface.vtp
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4c4a25e331d2c3ccbfa2ffcdd52e0ebbd37a8045989bec9abd7b2c0ed775e2b8
+size 28724
diff --git a/tests/cases/fsi/pipe_3d_partitioned/mesh/fluid/mesh-surfaces/start.vtp b/tests/cases/fsi/pipe_3d_partitioned/mesh/fluid/mesh-surfaces/start.vtp
new file mode 100644
index 000000000..55f41e4f7
--- /dev/null
+++ b/tests/cases/fsi/pipe_3d_partitioned/mesh/fluid/mesh-surfaces/start.vtp
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d90d5fb5ce21d0889b27fd0a592aafa6121d19047bebd801710caf5cba6fb48a
+size 13685
diff --git a/tests/cases/fsi/pipe_3d_partitioned/mesh/solid/mesh-complete.mesh.vtu b/tests/cases/fsi/pipe_3d_partitioned/mesh/solid/mesh-complete.mesh.vtu
new file mode 100644
index 000000000..08402c8eb
--- /dev/null
+++ b/tests/cases/fsi/pipe_3d_partitioned/mesh/solid/mesh-complete.mesh.vtu
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2527472b08ece18c95175c0661ca838f90ac8b97406d0c1c77459e94df620fe7
+size 36980
diff --git a/tests/cases/fsi/pipe_3d_partitioned/mesh/solid/mesh-surfaces/end.vtp b/tests/cases/fsi/pipe_3d_partitioned/mesh/solid/mesh-surfaces/end.vtp
new file mode 100644
index 000000000..4f2a81290
--- /dev/null
+++ b/tests/cases/fsi/pipe_3d_partitioned/mesh/solid/mesh-surfaces/end.vtp
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:abb10cc3c374742b481031050fd6ceb2af30b8a14bd5627d89f930a4df01708a
+size 12351
diff --git a/tests/cases/fsi/pipe_3d_partitioned/mesh/solid/mesh-surfaces/interface.vtp b/tests/cases/fsi/pipe_3d_partitioned/mesh/solid/mesh-surfaces/interface.vtp
new file mode 100644
index 000000000..9a8bafd82
--- /dev/null
+++ b/tests/cases/fsi/pipe_3d_partitioned/mesh/solid/mesh-surfaces/interface.vtp
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9302a80f392f30ae07b98d0048cb7ad85f2fd3fcb0a86dbe50def69ba5a6cafc
+size 28900
diff --git a/tests/cases/fsi/pipe_3d_partitioned/mesh/solid/mesh-surfaces/outside.vtp b/tests/cases/fsi/pipe_3d_partitioned/mesh/solid/mesh-surfaces/outside.vtp
new file mode 100644
index 000000000..1c5fbfe2e
--- /dev/null
+++ b/tests/cases/fsi/pipe_3d_partitioned/mesh/solid/mesh-surfaces/outside.vtp
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:29c18dc6c30b33a98016c3df4038433430307f05bc5fd3d2d39469b4a3d4bb63
+size 28813
diff --git a/tests/cases/fsi/pipe_3d_partitioned/mesh/solid/mesh-surfaces/start.vtp b/tests/cases/fsi/pipe_3d_partitioned/mesh/solid/mesh-surfaces/start.vtp
new file mode 100644
index 000000000..37fd38c98
--- /dev/null
+++ b/tests/cases/fsi/pipe_3d_partitioned/mesh/solid/mesh-surfaces/start.vtp
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c679e42882a1a2db6382133d896879e84468058870863e670fc31212d4e29c2
+size 12335
diff --git a/tests/cases/fsi/pipe_3d_partitioned/result_fluid_001.vtu b/tests/cases/fsi/pipe_3d_partitioned/result_fluid_001.vtu
new file mode 100644
index 000000000..af7b9a28d
--- /dev/null
+++ b/tests/cases/fsi/pipe_3d_partitioned/result_fluid_001.vtu
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ee484711a6cedf8f12e844e763bd35ac9d27f8667283c9181c2e702a5afdcf4c
+size 79329
diff --git a/tests/cases/fsi/pipe_3d_partitioned/result_solid_001.vtu b/tests/cases/fsi/pipe_3d_partitioned/result_solid_001.vtu
new file mode 100644
index 000000000..62d85fb3a
--- /dev/null
+++ b/tests/cases/fsi/pipe_3d_partitioned/result_solid_001.vtu
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1586977756f9936e95237ccfcdecb9d6b0857c3076ed938efdee5e7d69b814b5
+size 23114
diff --git a/tests/cases/fsi/pipe_3d_partitioned/solver.xml b/tests/cases/fsi/pipe_3d_partitioned/solver.xml
new file mode 100644
index 000000000..094518d2d
--- /dev/null
+++ b/tests/cases/fsi/pipe_3d_partitioned/solver.xml
@@ -0,0 +1,199 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<svMultiPhysicsFile version="0.1">
+
+<!-- Partitioned FSI version of pipe_3d: separate fluid, struct, and mesh
+     equations coupled via Dirichlet-Neumann iteration with Aitken relaxation.
+     Results should match the monolithic FSI pipe_3d test case. -->
+
+<GeneralSimulationParameters>
+  <Continue_previous_simulation> 0 </Continue_previous_simulation>
+  <Number_of_spatial_dimensions> 3 </Number_of_spatial_dimensions>
+  <Number_of_time_steps> 1 </Number_of_time_steps>
+  <Time_step_size> 1e-2 </Time_step_size>
+  <Spectral_radius_of_infinite_time_step> 0.50 </Spectral_radius_of_infinite_time_step>
+  <Searched_file_name_to_trigger_stop> STOP_SIM </Searched_file_name_to_trigger_stop>
+  <Save_results_to_VTK_format> true </Save_results_to_VTK_format>
+  <Name_prefix_of_saved_VTK_files> result </Name_prefix_of_saved_VTK_files>
+  <Increment_in_saving_VTK_files> 1 </Increment_in_saving_VTK_files>
+  <Start_saving_after_time_step> 1 </Start_saving_after_time_step>
+  <Increment_in_saving_restart_files> 1 </Increment_in_saving_restart_files>
+  <Convert_BIN_to_VTK_format> 0 </Convert_BIN_to_VTK_format>
+  <Verbose> 1 </Verbose>
+  <Warning> 0 </Warning>
+  <Debug> 0 </Debug>
+</GeneralSimulationParameters>
+
+<Add_mesh name="lumen" >
+  <Mesh_file_path> mesh/fluid/mesh-complete.mesh.vtu  </Mesh_file_path>
+  <Add_face name="lumen_inlet">
+      <Face_file_path> mesh/fluid/mesh-surfaces/start.vtp </Face_file_path>
+  </Add_face>
+  <Add_face name="lumen_outlet">
+      <Face_file_path> mesh/fluid/mesh-surfaces/end.vtp </Face_file_path>
+  </Add_face>
+  <Add_face name="lumen_wall">
+      <Face_file_path> mesh/fluid/mesh-surfaces/interface.vtp </Face_file_path>
+  </Add_face>
+  <Domain> 0 </Domain>
+</Add_mesh>
+
+<Add_mesh name="wall" >
+  <Mesh_file_path> mesh/solid/mesh-complete.mesh.vtu  </Mesh_file_path>
+  <Add_face name="wall_inlet">
+      <Face_file_path> mesh/solid/mesh-surfaces/start.vtp </Face_file_path>
+  </Add_face>
+  <Add_face name="wall_outlet">
+      <Face_file_path> mesh/solid/mesh-surfaces/end.vtp </Face_file_path>
+  </Add_face>
+  <Add_face name="wall_inner">
+      <Face_file_path> mesh/solid/mesh-surfaces/interface.vtp </Face_file_path>
+  </Add_face>
+  <Add_face name="wall_outer">
+      <Face_file_path> mesh/solid/mesh-surfaces/outside.vtp </Face_file_path>
+  </Add_face>
+  <Domain> 1 </Domain>
+</Add_mesh>
+
+<Add_projection name="wall_inner" >
+   <Project_from_face> lumen_wall </Project_from_face>
+</Add_projection>
+
+<!-- Fluid equation (linear solver from fluid/newtonian test) -->
+<Add_equation type="fluid" role="partitioned_fluid" >
+   <Coupled> false </Coupled>
+   <Min_iterations> 1 </Min_iterations>
+   <Max_iterations> 100 </Max_iterations>
+   <Tolerance> 1e-12 </Tolerance>
+
+   <Domain id="0" >
+      <Equation> fluid </Equation>
+      <Density> 1.0 </Density>
+      <Viscosity model="Constant" >
+         <Value> 0.04 </Value>
+      </Viscosity>
+      <Backflow_stabilization_coefficient> 0.2 </Backflow_stabilization_coefficient>
+   </Domain>
+
+   <LS type="GMRES" >
+      <Linear_algebra type="fsils" >
+         <Preconditioner> fsils </Preconditioner>
+      </Linear_algebra>
+      <Tolerance> 1e-14 </Tolerance>
+      <Max_iterations> 200 </Max_iterations>
+      <Krylov_space_dimension> 100 </Krylov_space_dimension>
+   </LS>
+
+   <Output type="Spatial" >
+     <Velocity> true </Velocity>
+     <Pressure> true </Pressure>
+   </Output>
+
+   <Add_BC name="lumen_inlet" >
+      <Type> Neu </Type>
+      <Value> 5.0e4 </Value>
+   </Add_BC>
+
+   <!-- No-slip BC at FSI interface is handled by the partitioned coupling:
+        solid velocity is transferred as Dirichlet BC on lumen_wall. -->
+
+</Add_equation>
+
+<!-- Structure equation -->
+<Add_equation type="struct" role="partitioned_solid" >
+   <Coupled> false </Coupled>
+   <Min_iterations> 1 </Min_iterations>
+   <Max_iterations> 100 </Max_iterations>
+   <Tolerance> 1e-12 </Tolerance>
+
+   <Domain id="1" >
+      <Equation> struct </Equation>
+      <Constitutive_model type="neoHookean"> </Constitutive_model>
+      <Dilational_penalty_model> M94 </Dilational_penalty_model>
+      <Density> 1.0 </Density>
+      <Elasticity_modulus> 1.0e7 </Elasticity_modulus>
+      <Poisson_ratio> 0.3 </Poisson_ratio>
+   </Domain>
+
+   <LS type="GMRES" >
+      <Linear_algebra type="fsils" >
+         <Preconditioner> fsils </Preconditioner>
+      </Linear_algebra>
+      <Tolerance> 1e-14 </Tolerance>
+      <Max_iterations> 500 </Max_iterations>
+      <Krylov_space_dimension> 100 </Krylov_space_dimension>
+   </LS>
+
+   <Output type="Spatial" >
+     <Displacement> true </Displacement>
+     <VonMises_stress> true </VonMises_stress>
+   </Output>
+
+   <Add_BC name="wall_inlet" >
+      <Type> Dir </Type>
+      <Value> 0.0 </Value>
+      <Impose_on_state_variable_integral> true </Impose_on_state_variable_integral>
+      <Zero_out_perimeter> false </Zero_out_perimeter>
+      <Effective_direction> (0, 0, 1) </Effective_direction>
+   </Add_BC>
+
+   <Add_BC name="wall_outlet" >
+      <Type> Dir </Type>
+      <Value> 0.0 </Value>
+      <Impose_on_state_variable_integral> true </Impose_on_state_variable_integral>
+      <Zero_out_perimeter> false </Zero_out_perimeter>
+      <Effective_direction> (0, 0, 1) </Effective_direction>
+   </Add_BC>
+
+</Add_equation>
+
+<!-- Mesh motion equation (ALE, same as monolithic mesh equation) -->
+<Add_equation type="mesh" role="partitioned_mesh" >
+   <Coupled> false </Coupled>
+   <Min_iterations> 1 </Min_iterations>
+   <Max_iterations> 100 </Max_iterations>
+   <Tolerance> 1e-12 </Tolerance>
+   <Poisson_ratio> 0.3 </Poisson_ratio>
+
+   <Domain id="0" >
+      <Equation> mesh </Equation>
+      <Density> 0.0 </Density>
+      <Elasticity_modulus> 1.0 </Elasticity_modulus>
+      <Poisson_ratio> 0.3 </Poisson_ratio>
+   </Domain>
+
+   <LS type="CG" >
+      <Linear_algebra type="fsils" >
+         <Preconditioner> fsils </Preconditioner>
+      </Linear_algebra>
+      <Tolerance> 1e-12 </Tolerance>
+      <Max_iterations> 400 </Max_iterations>
+   </LS>
+
+   <Output type="Spatial" >
+     <Displacement> true </Displacement>
+   </Output>
+
+   <Add_BC name="lumen_inlet" >
+      <Type> Dir </Type>
+      <Value> 0.0 </Value>
+   </Add_BC>
+
+   <Add_BC name="lumen_outlet" >
+      <Type> Dir </Type>
+      <Value> 0.0 </Value>
+   </Add_BC>
+
+</Add_equation>
+
+<!-- Partitioned coupling configuration -->
+<Partitioned_coupling>
+  <Max_coupling_iterations> 400 </Max_coupling_iterations>
+  <Coupling_tolerance> 1e-6 </Coupling_tolerance>
+  <Initial_relaxation> 0.01 </Initial_relaxation>
+  <Omega_max> 1.0 </Omega_max>
+  <Coupling_method> aitken </Coupling_method>
+  <Fluid_interface_face> lumen_wall </Fluid_interface_face>
+  <Solid_interface_face> wall_inner </Solid_interface_face>
+</Partitioned_coupling>
+
+</svMultiPhysicsFile>
diff --git a/tests/conftest.py b/tests/conftest.py
index d2f884c96..53919f524 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -47,157 +47,64 @@ def n_proc(request):
     return request.param
 
 
-def run_by_name(folder, name, t_max, n_proc=1):
-    """
-    Run a test case and return results
-    Args:
-        folder: location from which test will be executed
-        name: name of svMultiPhysics input file (.xml)
-        t_max: time step to compare
-        n_proc: number of processors
-
-    Returns:
-    Simulation results
-    """
-
-    # remove old results folders if they exist
+def _run_simulation(folder, name_inp, n_proc):
+    """Run the solver once, removing any previous output directory first."""
     dir_path = os.path.join(folder, str(n_proc) + "-procs")
     if os.path.exists(dir_path):
         shutil.rmtree(dir_path)
 
-    # run simulation
-    if is_not_Darwin:
-        if "petsc" in folder:
-            cmd = " ".join(
-            [
-                "mpirun",
-                "--oversubscribe" if n_proc > 1 else "",
-                "-np",
-                str(n_proc),
-                cpp_exec_p,
-                name,
-            ]
-            )
-        else:
-            cmd = " ".join(
-            [
-                "mpirun",
-                "--oversubscribe" if n_proc > 1 else "",
-                "-np",
-                str(n_proc),
-                cpp_exec,
-                name,
-            ]
-            )
-    else:
-        if "petsc" in folder or "trilinos" in folder: 
-            return
-        else:
-            cmd = " ".join(
-                [
-                    "mpirun",
-                    "--oversubscribe" if n_proc > 1 else "",
-                    "-np",
-                    str(n_proc),
-                    cpp_exec,
-                    name,
-                ]
-                )
-
+    exec_path = cpp_exec_p if "petsc" in folder else cpp_exec
+    cmd = " ".join([
+        "mpirun",
+        "--oversubscribe" if n_proc > 1 else "",
+        "-np", str(n_proc),
+        exec_path,
+        name_inp,
+    ])
     subprocess.call(cmd, cwd=folder, shell=True)
 
-    # read results
-    fname = os.path.join(
-        folder, str(n_proc) + "-procs", "result_" + str(t_max).zfill(3) + ".vtu"
-    )
+
+def _read_result(folder, n_proc, name_result, t_max):
+    """Read a single VTU result file from the n_proc output directory."""
+    result_file = name_result if name_result else "result_" + str(t_max).zfill(3) + ".vtu"
+    fname = os.path.join(folder, str(n_proc) + "-procs", result_file)
     if not os.path.exists(fname):
         raise RuntimeError("No svMultiPhysics output: " + fname)
     return meshio.read(fname)
 
 
-def run_with_reference(
-    base_folder,
-    test_folder,
-    fields,
-    n_proc=1,
-    t_max=1,
-    name_ref=None,
-    name_inp="solver.xml",
-):
-    """
-    Run a test case and compare it to a stored reference solution
-    Args:
-        folder: location from which test will be executed
-        fields: array fields to compare (e.g. ["Pressure", "Velocity"])
-        n_proc: number of processors
-        t_max: time step to compare
-        name_inp: name of svMultiPhysics input file (.xml)
-        name_ref: name of refence file (.vtu)
-    """
-    # default reference name
-    if not name_ref:
-        name_ref = "result_" + str(t_max).zfill(3) + ".vtu"
-
-    # run simulation
-    folder = os.path.join("cases", base_folder, test_folder)
-    
-    if is_not_Darwin:
-        res = run_by_name(folder, name_inp, t_max, n_proc)
-    else:
-        if "petsc" in folder or "trilinos" in folder: 
-            return
-        else:
-            res = run_by_name(folder, name_inp, t_max, n_proc)
-
-    # read reference
-    fname = os.path.join(folder, name_ref)
-    ref = meshio.read(fname)
-
-    # check results
+def _compare_fields(res, ref, fields):
+    """Compare fields between a result and reference mesh. Returns an error string."""
     msg = ""
     for f in fields:
-        # extract field
-        if f not in res.point_data.keys():
+        if f not in res.point_data:
             raise ValueError("Field " + f + " not in simulation result")
         a = res.point_data[f]
 
-        if f not in ref.point_data.keys():
+        if f not in ref.point_data:
             raise ValueError("Field " + f + " not in reference result")
         b = ref.point_data[f]
 
-        # truncate last dimension if solution is 2D but reference is 3D
         if len(a.shape) == 2:
             if a.shape[1] == 2 and b.shape[1] == 3:
                 assert not np.any(b[:, 2])
                 b = b[:, :2]
 
-        # pick tolerance for current field
         if f not in RTOL:
             raise ValueError("No tolerance defined for field " + f)
         rtol = RTOL[f]
 
-        # relative difference (as computed in np.isclose)
-        # note that we consider rtol as absolute zero (and as relative tolerance)
         a_fl = a.flatten()
         b_fl = b.flatten()
         rel_diff = np.abs(a_fl - b_fl) - rtol - rtol * np.abs(b_fl)
 
-        # throw error if not all results are within relative tolerance
         close = rel_diff <= 0.0
         if not np.all(close):
-            # portion of individual results that are above the tolerance
             wrong = 1 - np.sum(close) / close.size
-
-            # location of maximum relative difference
             i_max = rel_diff.argmax()
-
-            # maximum relative difference
             max_rel = rel_diff[i_max]
-
-            # maximum absolute difference at same location
             max_abs = np.abs(a_fl[i_max] - b_fl[i_max])
 
-            # throw error message for pytest
             msg += "Test failed in field " + f + "."
             msg += " Results differ by more than rtol=" + str(rtol)
             msg += " in {:.1%}".format(wrong)
@@ -205,6 +112,62 @@ def run_with_reference(
             msg += " Max. rel. difference is"
             msg += " {:.1e}".format(max_rel)
             msg += " (abs. {:.1e}".format(max_abs) + ")\n"
-    # check all fields first and then throw error if any failed
+    return msg
+
+
+def run_by_name(folder, name, t_max, n_proc=1, name_result=None):
+    """Run a test case and return results (legacy single-file interface)."""
+    _run_simulation(folder, name, n_proc)
+    return _read_result(folder, n_proc, name_result, t_max)
+
+
+def run_with_reference(
+    base_folder,
+    test_folder,
+    fields,
+    n_proc=1,
+    t_max=1,
+    name_ref=None,
+    name_inp="solver.xml",
+    name_result=None,
+    comparisons=None,
+):
+    """
+    Run a test case once and compare one or more output files to stored references.
+
+    Args:
+        fields:      fields to compare for the primary output file
+        n_proc:      number of processors
+        t_max:       timestep index used for default file naming
+        name_inp:    solver input XML filename
+        name_ref:    reference VTU filename (default: result_{t_max:03d}.vtu)
+        name_result: result VTU filename inside {n_proc}-procs/ (default: same as name_ref)
+        comparisons: list of dicts for multi-file comparison, each with keys:
+                       "fields"      — list of field names to compare
+                       "name_ref"    — reference VTU filename
+                       "name_result" — result VTU filename (optional, defaults to name_ref)
+                     When provided, fields/name_ref/name_result are ignored.
+    """
+    folder = os.path.join("cases", base_folder, test_folder)
+
+    if not is_not_Darwin and ("petsc" in folder or "trilinos" in folder):
+        return
+
+    # Build the comparison list from either the new or legacy parameters
+    if comparisons is None:
+        if not name_ref:
+            name_ref = "result_" + str(t_max).zfill(3) + ".vtu"
+        comparisons = [{"fields": fields, "name_ref": name_ref, "name_result": name_result}]
+
+    # Run the simulation once
+    _run_simulation(folder, name_inp, n_proc)
+
+    # Compare each requested output file against its reference
+    msg = ""
+    for comp in comparisons:
+        res = _read_result(folder, n_proc, comp.get("name_result"), t_max)
+        ref = meshio.read(os.path.join(folder, comp["name_ref"]))
+        msg += _compare_fields(res, ref, comp["fields"])
+
     if msg:
         raise AssertionError(msg)
diff --git a/tests/test_fsi.py b/tests/test_fsi.py
index dd383c9fa..330307afe 100644
--- a/tests/test_fsi.py
+++ b/tests/test_fsi.py
@@ -1,3 +1,5 @@
+import pytest
+
 from .conftest import run_with_reference
 
 # Common folder for all tests in this file
@@ -30,4 +32,29 @@ def test_pipe_3d_trilinos_ml(n_proc):
 def test_pipe_RCR_3d(n_proc):
     test_folder = "pipe_RCR_3d"
     t_max = 5
-    run_with_reference(base_folder, test_folder, fields, n_proc, t_max)
\ No newline at end of file
+    run_with_reference(base_folder, test_folder, fields, n_proc, t_max)
+
+def test_pipe_3d_partitioned(n_proc):
+    test_folder = "pipe_3d_partitioned"
+    t_max = 1
+    # A single 1-proc result is the reference for all processor counts, compared
+    # at the global RTOL. Each sub-mesh (fluid/solid/mesh) is partitioned
+    # independently, so multi-proc runs only match the 1-proc run once the
+    # Dirichlet-Neumann coupling has converged tightly. Two settings in
+    # solver.xml make that possible at the original tolerances:
+    #   - Time_step_size = 1e-2: a larger step weakens the added-mass coupling
+    #     stiffness (which grows ~1/dt for partitioned FSI), so the Aitken
+    #     iteration converges robustly and identically at 1, 3 and 4 procs
+    #     instead of stagnating at 4 procs.
+    #   - Coupling_tolerance = 1e-6: at this dt all proc counts reach the same
+    #     coupling fixed point in ~10 iterations, leaving the partition-count
+    #     round-off 5-10 orders below the global RTOL for every field.
+    run_with_reference(base_folder, test_folder, fields=[], n_proc=n_proc, t_max=t_max,
+                       comparisons=[
+                           {"fields": ["Velocity", "Pressure"],
+                            "name_ref": "result_fluid_001.vtu",
+                            "name_result": "result_fluid_001.vtu"},
+                           {"fields": ["Displacement", "VonMises_stress"],
+                            "name_ref": "result_solid_001.vtu",
+                            "name_result": "result_solid_001.vtu"},
+                       ])
\ No newline at end of file
diff --git a/tests/unitTests/integrator_tests/test_fsi_coupling.cpp b/tests/unitTests/integrator_tests/test_fsi_coupling.cpp
new file mode 100644
index 000000000..982499212
--- /dev/null
+++ b/tests/unitTests/integrator_tests/test_fsi_coupling.cpp
@@ -0,0 +1,262 @@
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
+
+/**
+ * @brief Integration tests for fsi_coupling namespace functions.
+ *
+ * Tests the FSI interface data exchange functions: extract_fluid_traction,
+ * extract_solid_displacement, apply_traction_on_solid, apply_displacement_on_mesh.
+ *
+ * Requires MPI and access to the FSI pipe_3d test case data files.
+ */
+
+#include "gtest/gtest.h"
+
+#include "fsi_coupling.h"
+#include "post.h"
+#include "Integrator.h"
+#include "Simulation.h"
+#include "distribute.h"
+#include "initialize.h"
+#include "read_files.h"
+#include "LinearAlgebra.h"
+#include "set_bc.h"
+#include "post.h"
+
+#include <cmath>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <mpi.h>
+
+#ifndef TEST_DATA_DIR
+#define TEST_DATA_DIR ""
+#endif
+
+// ---------------------------------------------------------------------------
+// MPI environment (same as in test_step_equation.cpp -- only one takes effect)
+// ---------------------------------------------------------------------------
+class MPIEnvironment_FSICoupling : public ::testing::Environment {
+public:
+  void SetUp() override {
+    int initialized = 0;
+    MPI_Initialized(&initialized);
+    if (!initialized) {
+      int argc = 0;
+      char** argv = nullptr;
+      MPI_Init(&argc, &argv);
+    }
+  }
+  void TearDown() override {
+    int finalized = 0;
+    MPI_Finalized(&finalized);
+    if (!finalized) {
+      MPI_Finalize();
+    }
+  }
+};
+
+static testing::Environment* const mpi_env_fsi =
+    testing::AddGlobalTestEnvironment(new MPIEnvironment_FSICoupling);
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+static void add_eq_la(ComMod& com_mod, eqType& lEq)
+{
+  lEq.linear_algebra = LinearAlgebraFactory::create_interface(lEq.linear_algebra_type);
+  lEq.linear_algebra->set_preconditioner(lEq.linear_algebra_preconditioner);
+  lEq.linear_algebra->initialize(com_mod, lEq);
+  if (lEq.linear_algebra_assembly_type != consts::LinearAlgebraType::none) {
+    lEq.linear_algebra->set_assembly(lEq.linear_algebra_assembly_type);
+  }
+}
+
+static Simulation* setup_fsi_simulation()
+{
+  std::string fsi_dir = std::string(TEST_DATA_DIR) + "/fsi/pipe_3d";
+  char orig_dir[4096];
+  getcwd(orig_dir, sizeof(orig_dir));
+  chdir(fsi_dir.c_str());
+
+  auto sim = new Simulation();
+  read_files_ns::read_files(sim, "solver.xml");
+  distribute(sim);
+  Vector<double> init_time(3);
+  initialize(sim, init_time);
+  for (int iEq = 0; iEq < sim->com_mod.nEq; iEq++) {
+    add_eq_la(sim->com_mod, sim->com_mod.eq[iEq]);
+  }
+
+  chdir(orig_dir);
+  return sim;
+}
+
+static void run_one_fsi_timestep(Simulation* sim)
+{
+  auto& com_mod = sim->com_mod;
+  auto& integrator = sim->get_integrator();
+  auto& solutions = integrator.get_solutions();
+
+  com_mod.cTS += 1;
+  com_mod.time += com_mod.dt;
+  com_mod.cEq = 0;
+  for (auto& eq : com_mod.eq) {
+    eq.itr = 0;
+    eq.ok = false;
+  }
+
+  integrator.predictor();
+  set_bc::set_bc_dir(com_mod, solutions);
+  integrator.step();
+
+  solutions.old.get_acceleration() = solutions.current.get_acceleration();
+  solutions.old.get_velocity() = solutions.current.get_velocity();
+  if (com_mod.dFlag) {
+    solutions.old.get_displacement() = solutions.current.get_displacement();
+  }
+}
+
+static void teardown_sim(Simulation* sim)
+{
+  for (int iEq = 0; iEq < sim->com_mod.nEq; iEq++) {
+    sim->com_mod.eq[iEq].linear_algebra->finalize();
+  }
+  delete sim;
+}
+
+static bool test_data_available()
+{
+  std::string path = std::string(TEST_DATA_DIR);
+  if (path.empty()) return false;
+  struct stat st;
+  return (stat(path.c_str(), &st) == 0 && S_ISDIR(st.st_mode));
+}
+
+// Find a face by name in the simulation meshes
+static const faceType* find_face(const ComMod& com_mod, const std::string& name)
+{
+  for (int iM = 0; iM < com_mod.nMsh; iM++) {
+    for (int iFa = 0; iFa < com_mod.msh[iM].nFa; iFa++) {
+      if (com_mod.msh[iM].fa[iFa].name == name) {
+        return &com_mod.msh[iM].fa[iFa];
+      }
+    }
+  }
+  return nullptr;
+}
+
+static const mshType* find_mesh_for_face(const ComMod& com_mod, const std::string& face_name)
+{
+  for (int iM = 0; iM < com_mod.nMsh; iM++) {
+    for (int iFa = 0; iFa < com_mod.msh[iM].nFa; iFa++) {
+      if (com_mod.msh[iM].fa[iFa].name == face_name) {
+        return &com_mod.msh[iM];
+      }
+    }
+  }
+  return nullptr;
+}
+
+// ===========================================================================
+// Tests
+// ===========================================================================
+
+
+/// @brief Extract solid displacement from a converged FSI solution.
+TEST(FSICoupling, ExtractSolidDisplacement)
+{
+  if (!test_data_available()) GTEST_SKIP() << "Test data not available";
+
+  auto sim = setup_fsi_simulation();
+  auto& com_mod = sim->com_mod;
+  const int nsd = com_mod.nsd;
+
+  // Run one time step to get a non-trivial solution
+  run_one_fsi_timestep(sim);
+
+  auto& solutions = sim->get_integrator().get_solutions();
+  auto& eq = com_mod.eq[0];  // FSI equation
+
+  auto* solid_face = find_face(com_mod, "wall_inner");
+  ASSERT_NE(solid_face, nullptr);
+
+  // Extract displacement
+  auto disp = fsi_coupling::extract_solid_displacement(com_mod, eq, *solid_face, solutions);
+
+  // Verify against direct solution array access
+  const auto& Dn = solutions.current.get_displacement();
+  int s = eq.s;
+  double max_diff = 0.0;
+  for (int a = 0; a < solid_face->nNo; a++) {
+    int Ac = solid_face->gN(a);
+    for (int i = 0; i < nsd; i++) {
+      double diff = std::abs(disp(i, a) - Dn(i + s, Ac));
+      if (diff > max_diff) max_diff = diff;
+    }
+  }
+  EXPECT_LT(max_diff, 1e-14)
+      << "Extracted displacement should match solution array";
+
+  // Verify displacement is non-zero (problem has deformation)
+  double max_val = 0.0;
+  for (int a = 0; a < solid_face->nNo; a++) {
+    for (int i = 0; i < nsd; i++) {
+      double v = std::abs(disp(i, a));
+      if (v > max_val) max_val = v;
+    }
+  }
+  EXPECT_GT(max_val, 1e-10) << "Displacement should be non-zero after FSI solve";
+
+  teardown_sim(sim);
+}
+
+/// @brief Extract fluid traction and verify total force is reasonable.
+TEST(FSICoupling, ExtractFluidTraction)
+{
+  if (!test_data_available()) GTEST_SKIP() << "Test data not available";
+
+  auto sim = setup_fsi_simulation();
+  auto& com_mod = sim->com_mod;
+  const int nsd = com_mod.nsd;
+
+  // Run one time step
+  run_one_fsi_timestep(sim);
+
+  auto& integrator = sim->get_integrator();
+  auto& solutions = integrator.get_solutions();
+  auto& eq = com_mod.eq[0];  // FSI equation
+
+  auto* fluid_face = find_face(com_mod, "lumen_wall");
+  auto* fluid_mesh = find_mesh_for_face(com_mod, "lumen_wall");
+  ASSERT_NE(fluid_face, nullptr);
+  ASSERT_NE(fluid_mesh, nullptr);
+
+  // Extract consistent nodal traction forces
+  com_mod.cEq = 0;  // ensure correct equation is active
+  auto traction = post::compute_face_traction(
+      com_mod, sim->cm_mod, *fluid_mesh, *fluid_face, eq, solutions);
+
+  // Check dimensions
+  EXPECT_EQ(traction.nrows(), nsd);
+  EXPECT_EQ(traction.ncols(), fluid_face->nNo);
+
+  // Compute total force (sum of consistent nodal forces)
+  Vector<double> total_force(nsd);
+  for (int a = 0; a < fluid_face->nNo; a++) {
+    for (int i = 0; i < nsd; i++) {
+      total_force(i) += traction(i, a);
+    }
+  }
+
+  // The total force should be non-zero (pressure-driven flow in a pipe)
+  double force_mag = 0.0;
+  for (int i = 0; i < nsd; i++) {
+    force_mag += total_force(i) * total_force(i);
+  }
+  force_mag = sqrt(force_mag);
+  EXPECT_GT(force_mag, 1e-10)
+      << "Total traction force should be non-zero for pressure-driven FSI flow";
+
+  teardown_sim(sim);
+}
+
diff --git a/tests/unitTests/integrator_tests/test_step_equation.cpp b/tests/unitTests/integrator_tests/test_step_equation.cpp
new file mode 100644
index 000000000..07472b79e
--- /dev/null
+++ b/tests/unitTests/integrator_tests/test_step_equation.cpp
@@ -0,0 +1,304 @@
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
+
+/**
+ * @brief Integration tests for Integrator::step_equation()
+ *
+ * Tests that step_equation() produces correct results by comparing against
+ * the monolithic step() method on real solver problems.
+ *
+ * These tests require MPI and access to test case data files.
+ * They are skipped if the test data directory is not available.
+ */
+
+#include "gtest/gtest.h"
+
+#include "Integrator.h"
+#include "Simulation.h"
+#include "distribute.h"
+#include "initialize.h"
+#include "read_files.h"
+#include "LinearAlgebra.h"
+#include "set_bc.h"
+
+#include <cmath>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <mpi.h>
+
+// Path to test data, defined via CMake
+#ifndef TEST_DATA_DIR
+#define TEST_DATA_DIR ""
+#endif
+
+// ---------------------------------------------------------------------------
+// MPI environment: initializes MPI once before all tests, finalizes after
+// ---------------------------------------------------------------------------
+class MPIEnvironment : public ::testing::Environment {
+public:
+  void SetUp() override {
+    int initialized = 0;
+    MPI_Initialized(&initialized);
+    if (!initialized) {
+      int argc = 0;
+      char** argv = nullptr;
+      MPI_Init(&argc, &argv);
+    }
+  }
+  void TearDown() override {
+    int finalized = 0;
+    MPI_Finalized(&finalized);
+    if (!finalized) {
+      MPI_Finalize();
+    }
+  }
+};
+
+// Register the MPI environment (runs before any test)
+static testing::Environment* const mpi_env =
+    testing::AddGlobalTestEnvironment(new MPIEnvironment);
+
+// ---------------------------------------------------------------------------
+// Helper: set up a full simulation from an XML file
+// ---------------------------------------------------------------------------
+static Simulation* setup_simulation(const std::string& xml_path)
+{
+  // The solver reads mesh files relative to the XML directory,
+  // so we must chdir there before calling read_files.
+  std::string dir = xml_path.substr(0, xml_path.find_last_of('/'));
+  std::string file = xml_path.substr(xml_path.find_last_of('/') + 1);
+  char orig_dir[4096];
+  getcwd(orig_dir, sizeof(orig_dir));
+  chdir(dir.c_str());
+
+  auto simulation = new Simulation();
+  read_files_ns::read_files(simulation, file);
+  distribute(simulation);
+  Vector<double> init_time(3);
+  initialize(simulation, init_time);
+  for (int iEq = 0; iEq < simulation->com_mod.nEq; iEq++) {
+    add_eq_linear_algebra(simulation->com_mod, simulation->com_mod.eq[iEq]);
+  }
+
+  chdir(orig_dir);
+  return simulation;
+}
+
+static void teardown_simulation(Simulation* simulation)
+{
+  for (int iEq = 0; iEq < simulation->com_mod.nEq; iEq++) {
+    simulation->com_mod.eq[iEq].linear_algebra->finalize();
+  }
+  delete simulation;
+}
+
+// ---------------------------------------------------------------------------
+// Helper: run one time step using step()
+// ---------------------------------------------------------------------------
+static void run_one_timestep_step(Simulation* simulation)
+{
+  auto& com_mod = simulation->com_mod;
+  auto& integrator = simulation->get_integrator();
+  auto& solutions = integrator.get_solutions();
+
+  com_mod.cTS += 1;
+  com_mod.time += com_mod.dt;
+  com_mod.cEq = 0;
+  for (auto& eq : com_mod.eq) {
+    eq.itr = 0;
+    eq.ok = false;
+  }
+
+  integrator.predictor();
+  set_bc::set_bc_dir(com_mod, solutions);
+  integrator.step();
+
+  // Copy current -> old for next step
+  solutions.old.get_acceleration() = solutions.current.get_acceleration();
+  solutions.old.get_velocity() = solutions.current.get_velocity();
+  if (com_mod.dFlag) {
+    solutions.old.get_displacement() = solutions.current.get_displacement();
+  }
+  com_mod.cplBC.xo = com_mod.cplBC.xn;
+}
+
+// ---------------------------------------------------------------------------
+// Helper: run one time step using step_equation() per equation
+// ---------------------------------------------------------------------------
+static void run_one_timestep_step_equation(Simulation* simulation,
+                                           int outer_iters = 1)
+{
+  auto& com_mod = simulation->com_mod;
+  auto& integrator = simulation->get_integrator();
+  auto& solutions = integrator.get_solutions();
+
+  com_mod.cTS += 1;
+  com_mod.time += com_mod.dt;
+  com_mod.cEq = 0;
+  for (auto& eq : com_mod.eq) {
+    eq.itr = 0;
+    eq.ok = false;
+  }
+
+  integrator.predictor();
+  set_bc::set_bc_dir(com_mod, solutions);
+
+  for (int outer = 0; outer < outer_iters; outer++) {
+    for (int iEq = 0; iEq < com_mod.nEq; iEq++) {
+      integrator.step_equation(iEq);
+    }
+  }
+
+  // Copy current -> old for next step
+  solutions.old.get_acceleration() = solutions.current.get_acceleration();
+  solutions.old.get_velocity() = solutions.current.get_velocity();
+  if (com_mod.dFlag) {
+    solutions.old.get_displacement() = solutions.current.get_displacement();
+  }
+  com_mod.cplBC.xo = com_mod.cplBC.xn;
+}
+
+// ---------------------------------------------------------------------------
+// Helper: compute relative difference between two solution arrays
+// ---------------------------------------------------------------------------
+static double rel_diff(const Array<double>& a, const Array<double>& b)
+{
+  double max_diff = 0.0;
+  double max_val = 0.0;
+  for (int j = 0; j < a.ncols(); j++) {
+    for (int i = 0; i < a.nrows(); i++) {
+      double d = std::abs(a(i,j) - b(i,j));
+      if (d > max_diff) max_diff = d;
+      double v = std::abs(a(i,j));
+      if (v > max_val) max_val = v;
+    }
+  }
+  return (max_val > 0) ? max_diff / max_val : max_diff;
+}
+
+// ---------------------------------------------------------------------------
+// Helper: check if test data directory exists
+// ---------------------------------------------------------------------------
+static bool test_data_available()
+{
+  std::string path = std::string(TEST_DATA_DIR);
+  if (path.empty()) return false;
+  struct stat st;
+  return (stat(path.c_str(), &st) == 0 && S_ISDIR(st.st_mode));
+}
+
+// ===========================================================================
+// Tests
+// ===========================================================================
+
+/// @brief For a single-equation problem, step_equation(0) must produce
+/// bit-identical results to step().
+TEST(StepEquation, SingleEquationMatchesStep)
+{
+  if (!test_data_available()) GTEST_SKIP() << "Test data not available";
+
+  std::string xml = std::string(TEST_DATA_DIR) + "/fluid/newtonian/solver.xml";
+
+  // Run with step()
+  auto sim_a = setup_simulation(xml);
+  run_one_timestep_step(sim_a);
+  Array<double> Yn_step = sim_a->get_integrator().get_solutions().current.get_velocity();
+  teardown_simulation(sim_a);
+
+  // Run with step_equation(0)
+  auto sim_b = setup_simulation(xml);
+  run_one_timestep_step_equation(sim_b, 1);
+  Array<double> Yn_step_eq = sim_b->get_integrator().get_solutions().current.get_velocity();
+  teardown_simulation(sim_b);
+
+  double diff = rel_diff(Yn_step, Yn_step_eq);
+  EXPECT_LT(diff, 1e-12) << "step_equation(0) should match step() for single-equation problems";
+}
+
+/// @brief For coupled FSI, sequential step_equation converges to step()
+/// result when outer coupling iterations are added.
+TEST(StepEquation, CoupledFSIConvergesWithOuterIterations)
+{
+  if (!test_data_available()) GTEST_SKIP() << "Test data not available";
+
+  std::string xml = std::string(TEST_DATA_DIR) + "/fsi/pipe_3d/solver.xml";
+
+  // Run with step() as reference
+  auto sim_ref = setup_simulation(xml);
+  run_one_timestep_step(sim_ref);
+  Array<double> Yn_ref = sim_ref->get_integrator().get_solutions().current.get_velocity();
+  Array<double> Dn_ref = sim_ref->get_integrator().get_solutions().current.get_displacement();
+  teardown_simulation(sim_ref);
+
+  // Run with 1 outer iteration (single pass) - should have coupling error
+  auto sim_1 = setup_simulation(xml);
+  run_one_timestep_step_equation(sim_1, 1);
+  Array<double> Yn_1 = sim_1->get_integrator().get_solutions().current.get_velocity();
+  Array<double> Dn_1 = sim_1->get_integrator().get_solutions().current.get_displacement();
+  teardown_simulation(sim_1);
+
+  double diff_vel_1 = rel_diff(Yn_ref, Yn_1);
+  double diff_disp_1 = rel_diff(Dn_ref, Dn_1);
+
+  // Run with 4 outer iterations - should converge to step() result
+  auto sim_4 = setup_simulation(xml);
+  run_one_timestep_step_equation(sim_4, 4);
+  Array<double> Yn_4 = sim_4->get_integrator().get_solutions().current.get_velocity();
+  Array<double> Dn_4 = sim_4->get_integrator().get_solutions().current.get_displacement();
+  teardown_simulation(sim_4);
+
+  double diff_vel_4 = rel_diff(Yn_ref, Yn_4);
+  double diff_disp_4 = rel_diff(Dn_ref, Dn_4);
+
+  // With 4 outer iterations, the coupling error should be orders of magnitude
+  // smaller than with 1 iteration
+  EXPECT_LT(diff_vel_4, diff_vel_1 * 1e-4)
+      << "4 outer iterations should reduce velocity coupling error by >4 orders";
+  EXPECT_LT(diff_disp_4, diff_disp_1 * 1e-4)
+      << "4 outer iterations should reduce displacement coupling error by >4 orders";
+
+  // With 4 outer iterations, the result should match step() to near machine precision
+  EXPECT_LT(diff_vel_4, 1e-10)
+      << "Velocity should match step() after 4 outer iterations";
+  EXPECT_LT(diff_disp_4, 1e-10)
+      << "Displacement should match step() after 4 outer iterations";
+}
+
+/// @brief The post_assembly callback fires on every Newton iteration.
+TEST(StepEquation, PostAssemblyCallbackFires)
+{
+  if (!test_data_available()) GTEST_SKIP() << "Test data not available";
+
+  std::string xml = std::string(TEST_DATA_DIR) + "/fluid/newtonian/solver.xml";
+
+  auto sim = setup_simulation(xml);
+  auto& com_mod = sim->com_mod;
+  auto& integrator = sim->get_integrator();
+  auto& solutions = integrator.get_solutions();
+
+  // Set up time step
+  com_mod.cTS += 1;
+  com_mod.time += com_mod.dt;
+  com_mod.cEq = 0;
+  for (auto& eq : com_mod.eq) {
+    eq.itr = 0;
+    eq.ok = false;
+  }
+
+  integrator.predictor();
+  set_bc::set_bc_dir(com_mod, solutions);
+
+  // Count callback invocations
+  int callback_count = 0;
+  integrator.step_equation(0, [&callback_count]() {
+    callback_count++;
+  });
+
+  // Callback should fire once per Newton iteration (at least minItr times)
+  EXPECT_GE(callback_count, com_mod.eq[0].minItr)
+      << "Callback should fire at least minItr times";
+  EXPECT_LE(callback_count, com_mod.eq[0].maxItr)
+      << "Callback should fire at most maxItr times";
+
+  teardown_simulation(sim);
+}