diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 0425bb2211..38434af5ee 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -282,6 +282,8 @@ if(RAFT_COMPILE_LIBRARY)
     src/raft_runtime/solver/lanczos_solver_int64_float.cu
     src/raft_runtime/solver/lanczos_solver_int_double.cu
     src/raft_runtime/solver/lanczos_solver_int_float.cu
+    src/raft_runtime/solver/lanczos_svds_float.cu
+    src/raft_runtime/solver/lanczos_svds_double.cu
     src/raft_runtime/solver/randomized_svds_float.cu
     src/raft_runtime/solver/randomized_svds_double.cu
   )
diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt
index 7dc510639e..7e8ed4a691 100644
--- a/cpp/bench/prims/CMakeLists.txt
+++ b/cpp/bench/prims/CMakeLists.txt
@@ -101,7 +101,7 @@ if(BUILD_PRIMS_BENCH)
 
   ConfigureBench(
     NAME SPARSE_BENCH PATH sparse/bitmap_to_csr.cu sparse/convert_csr.cu sparse/select_k_csr.cu
-    main.cpp
+    sparse/svds.cu main.cpp
   )
 
 endif()
diff --git a/cpp/bench/prims/sparse/svds.cu b/cpp/bench/prims/sparse/svds.cu
new file mode 100644
index 0000000000..65b4b3aa1f
--- /dev/null
+++ b/cpp/bench/prims/sparse/svds.cu
@@ -0,0 +1,432 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <common/benchmark.hpp>
+
+#include <raft/core/device_csr_matrix.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/linalg/gemm.cuh>
+#include <raft/sparse/solver/lanczos_svds.cuh>
+#include <raft/sparse/solver/randomized_svds.cuh>
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+#include <fstream>
+#include <limits>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace raft::bench::sparse {
+
+constexpr std::uint64_t kCsrFileMagic = 0x3152534354464152ULL;
+
+struct csr_file_header {
+  std::uint64_t magic;
+  std::uint32_t version;
+  std::uint32_t dtype;
+  std::int64_t rows;
+  std::int64_t cols;
+  std::int64_t nnz;
+};
+
+struct svds_input {
+  int rows;
+  int cols;
+  int nnz;
+  int nnz_per_row;
+  int k;
+  int ncv;
+  int n_oversamples;
+  int n_power_iters;
+  int max_iterations;
+  bool file_backed;
+};
+
+csr_file_header read_csr_file_header(std::string const& path)
+{
+  std::ifstream file(path, std::ios::binary);
+  if (!file) { throw std::runtime_error("Could not open CSR benchmark file: " + path); }
+
+  csr_file_header header{};
+  file.read(reinterpret_cast<char*>(&header), sizeof(header));
+  if (!file) { throw std::runtime_error("Could not read CSR benchmark file header: " + path); }
+  if (header.magic != kCsrFileMagic || header.version != 1) {
+    throw std::runtime_error("Invalid CSR benchmark file header: " + path);
+  }
+  if (header.dtype != 1) {
+    throw std::runtime_error("CSR benchmark file must contain float32 values: " + path);
+  }
+  if (header.rows <= 0 || header.cols <= 0 || header.nnz <= 0) {
+    throw std::runtime_error("CSR benchmark file has invalid dimensions: " + path);
+  }
+  if (header.rows > std::numeric_limits<int>::max() ||
+      header.cols > std::numeric_limits<int>::max() ||
+      header.nnz > std::numeric_limits<int>::max()) {
+    throw std::runtime_error("CSR benchmark file dimensions exceed int32 limits: " + path);
+  }
+  return header;
+}
+
+int env_int(char const* name, int default_value)
+{
+  auto const* value = std::getenv(name);
+  return value == nullptr ? default_value : std::atoi(value);
+}
+
+std::vector<double> read_reference_singular_values(std::string const& path, int k)
+{
+  std::ifstream file(path);
+  if (!file) { throw std::runtime_error("Could not open reference singular values file: " + path); }
+
+  std::vector<double> values;
+  values.reserve(k);
+  double value = 0.0;
+  while (file >> value) {
+    values.push_back(value);
+    if (static_cast<int>(values.size()) == k) { break; }
+  }
+  if (static_cast<int>(values.size()) != k) {
+    throw std::runtime_error("Reference singular values file has fewer than k values: " + path);
+  }
+  return values;
+}
+
+template <typename value_t>
+class svds_bench_base : public fixture {
+ public:
+  explicit svds_bench_base(svds_input const& p)
+    : fixture(true),
+      params(p),
+      nnz(p.nnz),
+      indptr(raft::make_device_vector<int, uint32_t>(handle, p.rows + 1)),
+      indices(raft::make_device_vector<int, uint32_t>(handle, nnz)),
+      values(raft::make_device_vector<value_t, uint32_t>(handle, nnz)),
+      singular_values(raft::make_device_vector<value_t, uint32_t>(handle, p.k)),
+      U(raft::make_device_matrix<value_t, uint32_t, raft::col_major>(handle, p.rows, p.k)),
+      Vt(raft::make_device_matrix<value_t, uint32_t, raft::col_major>(handle, p.k, p.cols))
+  {
+    if (params.file_backed) {
+      load_csr_from_file();
+    } else {
+      initialize_synthetic_csr();
+    }
+  }
+
+  void generate_metrics(::benchmark::State& state) override
+  {
+    state.counters["rows"]        = benchmark::Counter(params.rows);
+    state.counters["cols"]        = benchmark::Counter(params.cols);
+    state.counters["nnz"]         = benchmark::Counter(nnz);
+    state.counters["nnz/row"]     = benchmark::Counter(static_cast<double>(nnz) / params.rows);
+    state.counters["components"]  = benchmark::Counter(params.k);
+    state.counters["subspace"]    = benchmark::Counter(params.ncv);
+    state.counters["oversamples"] = benchmark::Counter(params.n_oversamples);
+    add_reference_metrics(state);
+    add_orthogonality_metrics(state);
+  }
+
+ protected:
+  auto csr_view()
+  {
+    auto csr_structure = raft::make_device_compressed_structure_view<int, int, int>(
+      indptr.data_handle(), indices.data_handle(), params.rows, params.cols, nnz);
+    return raft::make_device_csr_matrix_view<value_t const, int, int, int>(values.data_handle(),
+                                                                           csr_structure);
+  }
+
+  void initialize_synthetic_csr()
+  {
+    std::vector<int> h_indptr(params.rows + 1);
+    std::vector<int> h_indices(nnz);
+    std::vector<value_t> h_values(nnz);
+
+    for (int row = 0; row <= params.rows; ++row) {
+      h_indptr[row] = row * params.nnz_per_row;
+    }
+
+    std::vector<int> row_indices(params.nnz_per_row);
+    int const step = std::max(1, params.cols / std::max(1, params.nnz_per_row));
+    for (int row = 0; row < params.rows; ++row) {
+      for (int j = 0; j < params.nnz_per_row; ++j) {
+        row_indices[j] = (row * 131 + j * step + j * 17) % params.cols;
+      }
+      std::sort(row_indices.begin(), row_indices.end());
+      for (int j = 0; j < params.nnz_per_row; ++j) {
+        auto offset       = static_cast<std::size_t>(row) * params.nnz_per_row + j;
+        h_indices[offset] = row_indices[j];
+        h_values[offset] =
+          value_t(0.1) + value_t((row * 17 + j * 29 + row_indices[j] * 7) % 1000) / value_t(1000);
+      }
+    }
+
+    raft::update_device(indptr.data_handle(), h_indptr.data(), h_indptr.size(), stream);
+    raft::update_device(indices.data_handle(), h_indices.data(), h_indices.size(), stream);
+    raft::update_device(values.data_handle(), h_values.data(), h_values.size(), stream);
+    resource::sync_stream(handle, stream);
+  }
+
+  void load_csr_from_file()
+  {
+    auto const* path = std::getenv("RAFT_SVDS_CSR_FILE");
+    if (path == nullptr) { throw std::runtime_error("RAFT_SVDS_CSR_FILE must be set"); }
+
+    auto header = read_csr_file_header(path);
+    if (header.rows != params.rows || header.cols != params.cols || header.nnz != params.nnz) {
+      throw std::runtime_error(
+        "CSR benchmark file dimensions changed after benchmark registration");
+    }
+
+    std::ifstream file(path, std::ios::binary);
+    if (!file) {
+      throw std::runtime_error(std::string("Could not open CSR benchmark file: ") + path);
+    }
+    file.seekg(sizeof(csr_file_header), std::ios::beg);
+
+    std::vector<int> h_indptr(static_cast<std::size_t>(params.rows) + 1);
+    std::vector<int> h_indices(static_cast<std::size_t>(nnz));
+    std::vector<value_t> h_values(static_cast<std::size_t>(nnz));
+
+    file.read(reinterpret_cast<char*>(h_indptr.data()), h_indptr.size() * sizeof(int));
+    file.read(reinterpret_cast<char*>(h_indices.data()), h_indices.size() * sizeof(int));
+    file.read(reinterpret_cast<char*>(h_values.data()), h_values.size() * sizeof(value_t));
+    if (!file) {
+      throw std::runtime_error(std::string("Could not read CSR benchmark file: ") + path);
+    }
+
+    raft::update_device(indptr.data_handle(), h_indptr.data(), h_indptr.size(), stream);
+    raft::update_device(indices.data_handle(), h_indices.data(), h_indices.size(), stream);
+    raft::update_device(values.data_handle(), h_values.data(), h_values.size(), stream);
+    resource::sync_stream(handle, stream);
+  }
+
+  void add_reference_metrics(::benchmark::State& state)
+  {
+    auto const* ref_path = std::getenv("RAFT_SVDS_REF_S");
+    if (ref_path == nullptr) { return; }
+
+    auto ref = read_reference_singular_values(ref_path, params.k);
+    std::vector<value_t> actual(params.k);
+    raft::update_host(actual.data(), singular_values.data_handle(), params.k, stream);
+    resource::sync_stream(handle, stream);
+
+    double max_rel  = 0.0;
+    double mean_rel = 0.0;
+    for (int i = 0; i < params.k; ++i) {
+      auto denom = std::max(std::abs(ref[i]), 1e-30);
+      auto rel   = std::abs(static_cast<double>(actual[i]) - ref[i]) / denom;
+      max_rel    = std::max(max_rel, rel);
+      mean_rel += rel;
+    }
+    mean_rel /= params.k;
+
+    state.counters["s_rel_max"]  = benchmark::Counter(max_rel);
+    state.counters["s_rel_mean"] = benchmark::Counter(mean_rel);
+    state.counters["s0_rel"]     = benchmark::Counter(
+      std::abs(static_cast<double>(actual[0]) - ref[0]) / std::max(std::abs(ref[0]), 1e-30));
+  }
+
+  void add_orthogonality_metrics(::benchmark::State& state)
+  {
+    auto UtU =
+      raft::make_device_matrix<value_t, uint32_t, raft::col_major>(handle, params.k, params.k);
+    auto VVt =
+      raft::make_device_matrix<value_t, uint32_t, raft::col_major>(handle, params.k, params.k);
+
+    value_t one  = 1;
+    value_t zero = 0;
+
+    raft::linalg::gemm(handle,
+                       U.data_handle(),
+                       params.rows,
+                       params.k,
+                       U.data_handle(),
+                       UtU.data_handle(),
+                       params.k,
+                       params.k,
+                       CUBLAS_OP_T,
+                       CUBLAS_OP_N,
+                       one,
+                       zero,
+                       stream);
+
+    raft::linalg::gemm(handle,
+                       Vt.data_handle(),
+                       params.k,
+                       params.cols,
+                       Vt.data_handle(),
+                       VVt.data_handle(),
+                       params.k,
+                       params.k,
+                       CUBLAS_OP_N,
+                       CUBLAS_OP_T,
+                       one,
+                       zero,
+                       stream);
+
+    std::vector<value_t> h_utu(static_cast<std::size_t>(params.k) * params.k);
+    std::vector<value_t> h_vvt(static_cast<std::size_t>(params.k) * params.k);
+    raft::update_host(h_utu.data(), UtU.data_handle(), h_utu.size(), stream);
+    raft::update_host(h_vvt.data(), VVt.data_handle(), h_vvt.size(), stream);
+    resource::sync_stream(handle, stream);
+
+    auto compute_stats = [this](std::vector<value_t> const& gram) {
+      double fro_sq  = 0.0;
+      double max_abs = 0.0;
+      for (int col = 0; col < params.k; ++col) {
+        for (int row = 0; row < params.k; ++row) {
+          auto const expected = row == col ? 1.0 : 0.0;
+          auto const diff     = std::abs(
+            static_cast<double>(gram[static_cast<std::size_t>(col) * params.k + row]) - expected);
+          fro_sq += diff * diff;
+          max_abs = std::max(max_abs, diff);
+        }
+      }
+      return std::array<double, 2>{std::sqrt(fro_sq), max_abs};
+    };
+
+    auto const u_stats           = compute_stats(h_utu);
+    auto const v_stats           = compute_stats(h_vvt);
+    state.counters["u_orth_fro"] = benchmark::Counter(u_stats[0]);
+    state.counters["u_orth_max"] = benchmark::Counter(u_stats[1]);
+    state.counters["v_orth_fro"] = benchmark::Counter(v_stats[0]);
+    state.counters["v_orth_max"] = benchmark::Counter(v_stats[1]);
+  }
+
+  svds_input params;
+  int nnz;
+  raft::device_vector<int, uint32_t> indptr;
+  raft::device_vector<int, uint32_t> indices;
+  raft::device_vector<value_t, uint32_t> values;
+  raft::device_vector<value_t, uint32_t> singular_values;
+  raft::device_matrix<value_t, uint32_t, raft::col_major> U;
+  raft::device_matrix<value_t, uint32_t, raft::col_major> Vt;
+};
+
+template <typename value_t>
+class lanczos_svds_bench : public svds_bench_base<value_t> {
+ public:
+  using svds_bench_base<value_t>::svds_bench_base;
+
+  void run_benchmark(::benchmark::State& state) override
+  {
+    auto csr_matrix = this->csr_view();
+
+    raft::sparse::solver::sparse_lanczos_svd_config<value_t> config;
+    config.n_components   = this->params.k;
+    config.ncv            = this->params.ncv;
+    config.tolerance      = value_t(1e-4);
+    config.max_iterations = this->params.max_iterations;
+    config.seed           = 1234;
+
+    this->loop_on_state(
+      state,
+      [&]() {
+        raft::sparse::solver::sparse_lanczos_svd(this->handle,
+                                                 config,
+                                                 csr_matrix,
+                                                 this->singular_values.view(),
+                                                 this->U.view(),
+                                                 this->Vt.view());
+      },
+      false);
+  }
+};
+
+template <typename value_t>
+class lanczos_mgs2_svds_bench : public svds_bench_base<value_t> {
+ public:
+  using svds_bench_base<value_t>::svds_bench_base;
+
+  void run_benchmark(::benchmark::State& state) override
+  {
+    auto csr_matrix = this->csr_view();
+
+    raft::sparse::solver::sparse_lanczos_svd_config<value_t> config;
+    config.n_components               = this->params.k;
+    config.ncv                        = this->params.ncv;
+    config.tolerance                  = value_t(1e-4);
+    config.max_iterations             = this->params.max_iterations;
+    config.seed                       = 1234;
+    config.use_mgs2_orthogonalization = true;
+
+    this->loop_on_state(
+      state,
+      [&]() {
+        raft::sparse::solver::sparse_lanczos_svd(this->handle,
+                                                 config,
+                                                 csr_matrix,
+                                                 this->singular_values.view(),
+                                                 this->U.view(),
+                                                 this->Vt.view());
+      },
+      false);
+  }
+};
+
+template <typename value_t>
+class randomized_svds_bench : public svds_bench_base<value_t> {
+ public:
+  using svds_bench_base<value_t>::svds_bench_base;
+
+  void run_benchmark(::benchmark::State& state) override
+  {
+    auto csr_matrix = this->csr_view();
+
+    raft::sparse::solver::sparse_svd_config<value_t> config;
+    config.n_components  = this->params.k;
+    config.n_oversamples = this->params.n_oversamples;
+    config.n_power_iters = this->params.n_power_iters;
+    config.seed          = 1234;
+
+    this->loop_on_state(
+      state,
+      [&]() {
+        raft::sparse::solver::sparse_randomized_svd(this->handle,
+                                                    config,
+                                                    csr_matrix,
+                                                    this->singular_values.view(),
+                                                    this->U.view(),
+                                                    this->Vt.view());
+      },
+      false);
+  }
+};
+
+std::vector<svds_input> get_svds_inputs()
+{
+  std::vector<svds_input> inputs = {
+    {10000, 2000, 10000 * 16, 16, 16, 48, 32, 2, 20, false},
+    {50000, 5000, 50000 * 16, 16, 32, 80, 48, 2, 20, false},
+  };
+
+  auto const* path = std::getenv("RAFT_SVDS_CSR_FILE");
+  if (path != nullptr) {
+    auto header = read_csr_file_header(path);
+    inputs.push_back({static_cast<int>(header.rows),
+                      static_cast<int>(header.cols),
+                      static_cast<int>(header.nnz),
+                      0,
+                      env_int("RAFT_SVDS_K", 50),
+                      env_int("RAFT_SVDS_NCV", 0),
+                      env_int("RAFT_SVDS_RANDOMIZED_OVERSAMPLES", 10),
+                      env_int("RAFT_SVDS_RANDOMIZED_POWER_ITERS", 2),
+                      env_int("RAFT_SVDS_MAXITER", 100),
+                      true});
+  }
+  return inputs;
+}
+
+std::vector<svds_input> const svds_inputs = get_svds_inputs();
+
+RAFT_BENCH_REGISTER((lanczos_svds_bench<float>), "", svds_inputs);
+RAFT_BENCH_REGISTER((lanczos_mgs2_svds_bench<float>), "", svds_inputs);
+RAFT_BENCH_REGISTER((randomized_svds_bench<float>), "", svds_inputs);
+
+}  // namespace raft::bench::sparse
diff --git a/cpp/include/raft/sparse/solver/detail/lanczos_svds.cuh b/cpp/include/raft/sparse/solver/detail/lanczos_svds.cuh
new file mode 100644
index 0000000000..37b5a237bd
--- /dev/null
+++ b/cpp/include/raft/sparse/solver/detail/lanczos_svds.cuh
@@ -0,0 +1,767 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <raft/core/detail/macros.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/nvtx.hpp>
+#include <raft/core/resource/cublas_handle.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/linalg/detail/cublas_wrappers.hpp>
+#include <raft/linalg/gemm.cuh>
+#include <raft/linalg/svd.cuh>
+#include <raft/linalg/transpose.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/random/rng_state.hpp>
+#include <raft/sparse/solver/detail/svds_sign_correction.cuh>
+#include <raft/sparse/solver/solver_types.hpp>
+#include <raft/util/cudart_utils.hpp>
+
+#include <algorithm>
+#include <cmath>
+#include <numeric>
+#include <optional>
+#include <random>
+#include <vector>
+
+namespace raft::sparse::solver::detail {
+
+template <typename ValueTypeT>
+RAFT_KERNEL negate_scalar_kernel(ValueTypeT* x)
+{
+  *x = -*x;
+}
+
+template <typename ValueTypeT>
+ValueTypeT vector_norm(raft::resources const& handle, ValueTypeT const* x, int n)
+{
+  common::nvtx::range<common::nvtx::domain::raft> scope("lanczos_svds::vector_norm");
+  auto cublas_handle = resource::get_cublas_handle(handle);
+  auto stream        = resource::get_cuda_stream(handle);
+  ValueTypeT result{};
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasnrm2(cublas_handle, n, x, 1, &result, stream));
+  resource::sync_stream(handle, stream);
+  return result;
+}
+
+template <typename ValueTypeT>
+void scale_vector(raft::resources const& handle, ValueTypeT* x, int n, ValueTypeT alpha)
+{
+  common::nvtx::range<common::nvtx::domain::raft> scope("lanczos_svds::scale_vector");
+  auto cublas_handle = resource::get_cublas_handle(handle);
+  auto stream        = resource::get_cuda_stream(handle);
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasscal(cublas_handle, n, &alpha, x, 1, stream));
+}
+
+template <typename ValueTypeT>
+void axpy(
+  raft::resources const& handle, int n, ValueTypeT alpha, ValueTypeT const* x, ValueTypeT* y)
+{
+  common::nvtx::range<common::nvtx::domain::raft> scope("lanczos_svds::axpy");
+  auto cublas_handle = resource::get_cublas_handle(handle);
+  auto stream        = resource::get_cuda_stream(handle);
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasaxpy(cublas_handle, n, &alpha, x, 1, y, 1, stream));
+}
+
+template <typename ValueTypeT>
+void cgs2_orthogonalize(raft::resources const& handle,
+                        ValueTypeT* target,
+                        ValueTypeT const* basis,
+                        int n_rows,
+                        int n_valid,
+                        ValueTypeT* coeffs)
+{
+  common::nvtx::range<common::nvtx::domain::raft> scope("lanczos_svds::cgs2_orthogonalize");
+  if (n_valid <= 0) { return; }
+
+  auto cublas_handle = resource::get_cublas_handle(handle);
+  auto stream        = resource::get_cuda_stream(handle);
+
+  ValueTypeT one     = ValueTypeT(1);
+  ValueTypeT zero    = ValueTypeT(0);
+  ValueTypeT neg_one = ValueTypeT(-1);
+  for (int pass = 0; pass < 2; ++pass) {
+    {
+      common::nvtx::range<common::nvtx::domain::raft> project_scope("lanczos_svds::cgs2_project");
+      RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(cublas_handle,
+                                                       CUBLAS_OP_T,
+                                                       n_rows,
+                                                       n_valid,
+                                                       &one,
+                                                       basis,
+                                                       n_rows,
+                                                       target,
+                                                       1,
+                                                       &zero,
+                                                       coeffs,
+                                                       1,
+                                                       stream));
+    }
+    {
+      common::nvtx::range<common::nvtx::domain::raft> subtract_scope("lanczos_svds::cgs2_subtract");
+      RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(cublas_handle,
+                                                       CUBLAS_OP_N,
+                                                       n_rows,
+                                                       n_valid,
+                                                       &neg_one,
+                                                       basis,
+                                                       n_rows,
+                                                       coeffs,
+                                                       1,
+                                                       &one,
+                                                       target,
+                                                       1,
+                                                       stream));
+    }
+  }
+}
+
+template <typename ValueTypeT>
+void mgs2_orthogonalize(raft::resources const& handle,
+                        ValueTypeT* target,
+                        ValueTypeT const* basis,
+                        int n_rows,
+                        int n_valid,
+                        ValueTypeT* coeffs)
+{
+  common::nvtx::range<common::nvtx::domain::raft> scope("lanczos_svds::mgs2_orthogonalize");
+  if (n_valid <= 0) { return; }
+
+  auto cublas_handle = resource::get_cublas_handle(handle);
+  auto stream        = resource::get_cuda_stream(handle);
+
+  RAFT_CUBLAS_TRY(cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE));
+  for (int pass = 0; pass < 2; ++pass) {
+    for (int j = 0; j < n_valid; ++j) {
+      auto const* q_j = basis + static_cast<std::size_t>(j) * n_rows;
+      auto* coeff     = coeffs + j;
+      {
+        common::nvtx::range<common::nvtx::domain::raft> dot_scope("lanczos_svds::mgs2_dot");
+        RAFT_CUBLAS_TRY(
+          raft::linalg::detail::cublasdot(cublas_handle, n_rows, q_j, 1, target, 1, coeff, stream));
+      }
+      {
+        common::nvtx::range<common::nvtx::domain::raft> negate_scope("lanczos_svds::mgs2_negate");
+        negate_scalar_kernel<<<1, 1, 0, stream>>>(coeff);
+        RAFT_CUDA_TRY(cudaPeekAtLastError());
+      }
+      {
+        common::nvtx::range<common::nvtx::domain::raft> subtract_scope(
+          "lanczos_svds::mgs2_subtract");
+        RAFT_CUBLAS_TRY(raft::linalg::detail::cublasaxpy(
+          cublas_handle, n_rows, coeff, q_j, 1, target, 1, stream));
+      }
+    }
+  }
+  RAFT_CUBLAS_TRY(cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST));
+}
+
+template <typename ValueTypeT>
+void orthogonalize(raft::resources const& handle,
+                   ValueTypeT* target,
+                   ValueTypeT const* basis,
+                   int n_rows,
+                   int n_valid,
+                   ValueTypeT* coeffs,
+                   bool use_mgs2)
+{
+  if (use_mgs2) {
+    mgs2_orthogonalize(handle, target, basis, n_rows, n_valid, coeffs);
+  } else {
+    cgs2_orthogonalize(handle, target, basis, n_rows, n_valid, coeffs);
+  }
+}
+
+template <typename ValueTypeT>
+ValueTypeT normalize_or_randomize(raft::resources const& handle,
+                                  raft::random::RngState& rng_state,
+                                  ValueTypeT* target,
+                                  int n_rows,
+                                  ValueTypeT const* basis,
+                                  int n_valid,
+                                  ValueTypeT* coeffs,
+                                  ValueTypeT eps,
+                                  bool use_mgs2,
+                                  bool* used_random_vector)
+{
+  common::nvtx::range<common::nvtx::domain::raft> scope("lanczos_svds::normalize_or_randomize");
+  auto stream         = resource::get_cuda_stream(handle);
+  auto nrm            = vector_norm(handle, target, n_rows);
+  *used_random_vector = false;
+
+  if (nrm < eps) {
+    {
+      common::nvtx::range<common::nvtx::domain::raft> random_scope(
+        "lanczos_svds::random_restart_vector");
+      raft::random::normal(
+        handle, rng_state, target, static_cast<std::size_t>(n_rows), ValueTypeT(0), ValueTypeT(1));
+    }
+    orthogonalize(handle, target, basis, n_rows, n_valid, coeffs, use_mgs2);
+    nrm                 = vector_norm(handle, target, n_rows);
+    *used_random_vector = true;
+  }
+
+  RAFT_EXPECTS(nrm >= eps, "Unable to generate a non-zero Lanczos vector");
+  auto inv_nrm = ValueTypeT(1) / nrm;
+  scale_vector(handle, target, n_rows, inv_nrm);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+  resource::sync_stream(handle, stream);
+  return nrm;
+}
+
+template <typename ValueTypeT>
+void build_bidiagonal_matrix(raft::resources const& handle,
+                             std::vector<ValueTypeT> const& alphas,
+                             std::vector<ValueTypeT> const& betas,
+                             raft::device_matrix_view<ValueTypeT, uint32_t, raft::col_major> B)
+{
+  common::nvtx::range<common::nvtx::domain::raft> scope("lanczos_svds::build_bidiagonal_matrix");
+  auto stream = resource::get_cuda_stream(handle);
+  int ncv     = static_cast<int>(alphas.size());
+  std::vector<ValueTypeT> h_B(static_cast<std::size_t>(ncv) * ncv, ValueTypeT(0));
+
+  for (int i = 0; i < ncv; ++i) {
+    h_B[static_cast<std::size_t>(i) * ncv + i] = alphas[i];
+    if (i + 1 < ncv) { h_B[static_cast<std::size_t>(i + 1) * ncv + i] = betas[i + 1]; }
+  }
+
+  raft::update_device(B.data_handle(), h_B.data(), static_cast<std::size_t>(ncv) * ncv, stream);
+}
+
+template <typename ValueTypeT, typename OperatorT>
+void lanczos_bidiagonalize(raft::resources const& handle,
+                           OperatorT const& op,
+                           int ncv,
+                           ValueTypeT* v_start,
+                           raft::device_matrix_view<ValueTypeT, uint32_t, raft::col_major> U_full,
+                           raft::device_matrix_view<ValueTypeT, uint32_t, raft::col_major> V_full,
+                           int n_locked,
+                           raft::random::RngState& rng_state,
+                           ValueTypeT* ortho_coeffs,
+                           bool use_mgs2,
+                           std::vector<ValueTypeT>& alphas,
+                           std::vector<ValueTypeT>& betas)
+{
+  common::nvtx::range<common::nvtx::domain::raft> scope("lanczos_svds::bidiagonalize");
+  int m          = op.rows();
+  int n          = op.cols();
+  ValueTypeT eps = ValueTypeT(1e-9);
+  auto stream    = resource::get_cuda_stream(handle);
+
+  alphas.assign(ncv, ValueTypeT(0));
+  betas.assign(ncv + 1, ValueTypeT(0));
+
+  auto* v0 = V_full.data_handle() + static_cast<std::size_t>(n_locked) * n;
+  raft::copy(v0, v_start, n, stream);
+  orthogonalize(handle, v0, V_full.data_handle(), n, n_locked, ortho_coeffs, use_mgs2);
+
+  bool used_random = false;
+  normalize_or_randomize(handle,
+                         rng_state,
+                         v0,
+                         n,
+                         V_full.data_handle(),
+                         n_locked,
+                         ortho_coeffs,
+                         eps,
+                         use_mgs2,
+                         &used_random);
+
+  for (int i = 0; i < ncv; ++i) {
+    int idx_u = n_locked + i;
+    int idx_v = n_locked + i;
+
+    auto* u = U_full.data_handle() + static_cast<std::size_t>(idx_u) * m;
+    auto* v = V_full.data_handle() + static_cast<std::size_t>(idx_v) * n;
+
+    {
+      common::nvtx::range<common::nvtx::domain::raft> apply_scope("lanczos_svds::apply_A");
+      op.apply(handle,
+               raft::make_device_matrix_view<ValueTypeT const, uint32_t, raft::col_major>(v, n, 1),
+               raft::make_device_matrix_view<ValueTypeT, uint32_t, raft::col_major>(u, m, 1));
+    }
+
+    if (i > 0) {
+      auto* u_prev = U_full.data_handle() + static_cast<std::size_t>(idx_u - 1) * m;
+      axpy(handle, m, -betas[i], u_prev, u);
+    }
+
+    orthogonalize(handle, u, U_full.data_handle(), m, idx_u, ortho_coeffs, use_mgs2);
+    used_random     = false;
+    auto alpha_norm = normalize_or_randomize(handle,
+                                             rng_state,
+                                             u,
+                                             m,
+                                             U_full.data_handle(),
+                                             idx_u,
+                                             ortho_coeffs,
+                                             eps,
+                                             use_mgs2,
+                                             &used_random);
+    alphas[i]       = used_random ? ValueTypeT(0) : alpha_norm;
+
+    auto* v_next = V_full.data_handle() + static_cast<std::size_t>(idx_v + 1) * n;
+    {
+      common::nvtx::range<common::nvtx::domain::raft> apply_t_scope("lanczos_svds::apply_AT");
+      op.apply_transpose(
+        handle,
+        raft::make_device_matrix_view<ValueTypeT const, uint32_t, raft::col_major>(u, m, 1),
+        raft::make_device_matrix_view<ValueTypeT, uint32_t, raft::col_major>(v_next, n, 1));
+    }
+
+    axpy(handle, n, -alphas[i], v, v_next);
+    orthogonalize(handle, v_next, V_full.data_handle(), n, idx_v + 1, ortho_coeffs, use_mgs2);
+
+    used_random   = false;
+    auto beta_nrm = normalize_or_randomize(handle,
+                                           rng_state,
+                                           v_next,
+                                           n,
+                                           V_full.data_handle(),
+                                           idx_v + 1,
+                                           ortho_coeffs,
+                                           eps,
+                                           use_mgs2,
+                                           &used_random);
+    betas[i + 1]  = used_random ? ValueTypeT(0) : beta_nrm;
+  }
+}
+
+template <typename ValueTypeT>
+void compute_ritz_vectors(raft::resources const& handle,
+                          int m,
+                          int n,
+                          int ncv,
+                          int n_locked,
+                          std::vector<int> const& indices,
+                          std::vector<ValueTypeT> const& singular_values,
+                          std::vector<ValueTypeT> const& P,
+                          std::vector<ValueTypeT> const& Qt,
+                          raft::device_matrix_view<ValueTypeT, uint32_t, raft::col_major> U_full,
+                          raft::device_matrix_view<ValueTypeT, uint32_t, raft::col_major> V_full,
+                          std::vector<ValueTypeT>& locked_singular_values)
+{
+  common::nvtx::range<common::nvtx::domain::raft> scope("lanczos_svds::compute_ritz_vectors");
+  auto stream   = resource::get_cuda_stream(handle);
+  int num_found = static_cast<int>(indices.size());
+  if (num_found == 0) { return; }
+
+  std::vector<ValueTypeT> h_P_good(static_cast<std::size_t>(ncv) * num_found);
+  std::vector<ValueTypeT> h_Qt_good_t(static_cast<std::size_t>(ncv) * num_found);
+  for (int j = 0; j < num_found; ++j) {
+    int src_col = indices[j];
+    for (int row = 0; row < ncv; ++row) {
+      h_P_good[static_cast<std::size_t>(j) * ncv + row] =
+        P[static_cast<std::size_t>(src_col) * ncv + row];
+      h_Qt_good_t[static_cast<std::size_t>(j) * ncv + row] =
+        Qt[static_cast<std::size_t>(row) * ncv + src_col];
+    }
+  }
+
+  auto P_good =
+    raft::make_device_matrix<ValueTypeT, uint32_t, raft::col_major>(handle, ncv, num_found);
+  auto Qt_good_t =
+    raft::make_device_matrix<ValueTypeT, uint32_t, raft::col_major>(handle, ncv, num_found);
+  {
+    common::nvtx::range<common::nvtx::domain::raft> copy_scope("lanczos_svds::ritz_copy_basis");
+    raft::update_device(P_good.data_handle(), h_P_good.data(), h_P_good.size(), stream);
+    raft::update_device(Qt_good_t.data_handle(), h_Qt_good_t.data(), h_Qt_good_t.size(), stream);
+  }
+
+  auto U_ritz =
+    raft::make_device_matrix<ValueTypeT, uint32_t, raft::col_major>(handle, m, num_found);
+  auto V_ritz =
+    raft::make_device_matrix<ValueTypeT, uint32_t, raft::col_major>(handle, n, num_found);
+
+  ValueTypeT one  = 1;
+  ValueTypeT zero = 0;
+  auto* U_segment = U_full.data_handle() + static_cast<std::size_t>(n_locked) * m;
+  auto* V_segment = V_full.data_handle() + static_cast<std::size_t>(n_locked) * n;
+
+  {
+    common::nvtx::range<common::nvtx::domain::raft> gemm_scope("lanczos_svds::ritz_gemm_U");
+    raft::linalg::gemm(handle,
+                       U_segment,
+                       m,
+                       ncv,
+                       P_good.data_handle(),
+                       U_ritz.data_handle(),
+                       m,
+                       num_found,
+                       CUBLAS_OP_N,
+                       CUBLAS_OP_N,
+                       one,
+                       zero,
+                       stream);
+  }
+
+  {
+    common::nvtx::range<common::nvtx::domain::raft> gemm_scope("lanczos_svds::ritz_gemm_V");
+    raft::linalg::gemm(handle,
+                       V_segment,
+                       n,
+                       ncv,
+                       Qt_good_t.data_handle(),
+                       V_ritz.data_handle(),
+                       n,
+                       num_found,
+                       CUBLAS_OP_N,
+                       CUBLAS_OP_N,
+                       one,
+                       zero,
+                       stream);
+  }
+
+  {
+    common::nvtx::range<common::nvtx::domain::raft> copy_scope("lanczos_svds::ritz_copy_output");
+    raft::copy(U_full.data_handle() + static_cast<std::size_t>(n_locked) * m,
+               U_ritz.data_handle(),
+               static_cast<std::size_t>(m) * num_found,
+               stream);
+    raft::copy(V_full.data_handle() + static_cast<std::size_t>(n_locked) * n,
+               V_ritz.data_handle(),
+               static_cast<std::size_t>(n) * num_found,
+               stream);
+  }
+
+  for (int j = 0; j < num_found; ++j) {
+    locked_singular_values[n_locked + j] = singular_values[indices[j]];
+  }
+}
+
+template <typename ValueTypeT>
+void compute_restart_vector(raft::resources const& handle,
+                            int n,
+                            int ncv,
+                            int n_locked,
+                            std::vector<ValueTypeT> const& coeffs,
+                            raft::device_matrix_view<ValueTypeT, uint32_t, raft::col_major> V_full,
+                            ValueTypeT* v_start)
+{
+  common::nvtx::range<common::nvtx::domain::raft> scope("lanczos_svds::compute_restart_vector");
+  auto stream = resource::get_cuda_stream(handle);
+  auto d_coeffs =
+    raft::make_device_vector<ValueTypeT, uint32_t>(handle, static_cast<uint32_t>(ncv));
+  raft::update_device(d_coeffs.data_handle(), coeffs.data(), ncv, stream);
+
+  auto cublas_handle   = resource::get_cublas_handle(handle);
+  ValueTypeT const one = ValueTypeT(1);
+  ValueTypeT zero      = ValueTypeT(0);
+  auto* V_segment      = V_full.data_handle() + static_cast<std::size_t>(n_locked) * n;
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(cublas_handle,
+                                                   CUBLAS_OP_N,
+                                                   n,
+                                                   ncv,
+                                                   &one,
+                                                   V_segment,
+                                                   n,
+                                                   d_coeffs.data_handle(),
+                                                   1,
+                                                   &zero,
+                                                   v_start,
+                                                   1,
+                                                   stream));
+
+  auto nrm = vector_norm(handle, v_start, n);
+  if (nrm > ValueTypeT(0)) { scale_vector(handle, v_start, n, ValueTypeT(1) / nrm); }
+}
+
+template <typename ValueTypeT>
+void sort_singular_triplets(raft::resources const& handle,
+                            ValueTypeT* U_cols,
+                            int m,
+                            ValueTypeT* V_cols,
+                            int n,
+                            std::vector<ValueTypeT>& singular_values)
+{
+  common::nvtx::range<common::nvtx::domain::raft> scope("lanczos_svds::sort_singular_triplets");
+  int k = static_cast<int>(singular_values.size());
+  std::vector<int> order(k);
+  std::iota(order.begin(), order.end(), 0);
+  std::stable_sort(order.begin(), order.end(), [&](int a, int b) {
+    return singular_values[a] > singular_values[b];
+  });
+
+  bool already_sorted = true;
+  for (int i = 0; i < k; ++i) {
+    already_sorted = already_sorted && order[i] == i;
+  }
+  if (already_sorted) { return; }
+
+  auto stream   = resource::get_cuda_stream(handle);
+  auto U_sorted = raft::make_device_matrix<ValueTypeT, uint32_t, raft::col_major>(handle, m, k);
+  auto V_sorted = raft::make_device_matrix<ValueTypeT, uint32_t, raft::col_major>(handle, n, k);
+  std::vector<ValueTypeT> sorted_s(k);
+
+  for (int dst = 0; dst < k; ++dst) {
+    int src = order[dst];
+    raft::copy(U_sorted.data_handle() + static_cast<std::size_t>(dst) * m,
+               U_cols + static_cast<std::size_t>(src) * m,
+               m,
+               stream);
+    raft::copy(V_sorted.data_handle() + static_cast<std::size_t>(dst) * n,
+               V_cols + static_cast<std::size_t>(src) * n,
+               n,
+               stream);
+    sorted_s[dst] = singular_values[src];
+  }
+
+  raft::copy(U_cols, U_sorted.data_handle(), static_cast<std::size_t>(m) * k, stream);
+  raft::copy(V_cols, V_sorted.data_handle(), static_cast<std::size_t>(n) * k, stream);
+  singular_values.swap(sorted_s);
+}
+
+/**
+ * @brief Lanczos bidiagonalization SVD for sparse matrices and linear operators.
+ */
+template <typename ValueTypeT, typename OperatorT>
+void sparse_lanczos_svd(
+  raft::resources const& handle,
+  sparse_lanczos_svd_config<ValueTypeT> const& config,
+  OperatorT const& op,
+  raft::device_vector_view<ValueTypeT, uint32_t> singular_values,
+  std::optional<raft::device_matrix_view<ValueTypeT, uint32_t, raft::col_major>> U,
+  std::optional<raft::device_matrix_view<ValueTypeT, uint32_t, raft::col_major>> Vt)
+{
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+    "raft::sparse::solver::sparse_lanczos_svd(%d, %d, %d)",
+    op.rows(),
+    op.cols(),
+    config.n_components);
+
+  int m       = op.rows();
+  int n       = op.cols();
+  int k       = config.n_components;
+  int min_dim = std::min(m, n);
+
+  RAFT_EXPECTS(k > 0, "n_components must be positive");
+  RAFT_EXPECTS(k < min_dim, "n_components must be less than min(m, n)");
+  RAFT_EXPECTS(config.max_iterations > 0, "max_iterations must be positive");
+  RAFT_EXPECTS(config.tolerance >= ValueTypeT(0), "tolerance must be non-negative");
+  RAFT_EXPECTS(singular_values.extent(0) == static_cast<uint32_t>(k),
+               "singular_values must have size n_components");
+  if (U) {
+    RAFT_EXPECTS(
+      U->extent(0) == static_cast<uint32_t>(m) && U->extent(1) == static_cast<uint32_t>(k),
+      "U must have shape (m, n_components)");
+  }
+  if (Vt) {
+    RAFT_EXPECTS(
+      Vt->extent(0) == static_cast<uint32_t>(k) && Vt->extent(1) == static_cast<uint32_t>(n),
+      "Vt must have shape (n_components, n)");
+  }
+
+  int ncv = config.ncv;
+  if (ncv <= 0) {
+    if (m > 100000) {
+      ncv = (k < 75) ? (22 * k + 9) / 10 : static_cast<int>(3.9 * k);
+    } else {
+      ncv = std::max(3 * k, 50);
+    }
+  }
+  ncv = std::max(ncv, k);
+  ncv = std::min(ncv, min_dim - 1);
+  RAFT_EXPECTS(ncv >= k, "ncv must be at least n_components after clamping");
+
+  auto stream   = resource::get_cuda_stream(handle);
+  uint64_t seed = config.seed.value_or(std::random_device{}());
+  raft::random::RngState rng_state(seed);
+
+  int total_capacity = k + ncv + 2;
+  auto U_full        = raft::make_device_matrix<ValueTypeT, uint32_t, raft::col_major>(
+    handle, static_cast<uint32_t>(m), static_cast<uint32_t>(total_capacity));
+  auto V_full = raft::make_device_matrix<ValueTypeT, uint32_t, raft::col_major>(
+    handle, static_cast<uint32_t>(n), static_cast<uint32_t>(total_capacity));
+  auto ortho_coeffs =
+    raft::make_device_vector<ValueTypeT, uint32_t>(handle, static_cast<uint32_t>(total_capacity));
+  auto v_start = raft::make_device_vector<ValueTypeT, uint32_t>(handle, static_cast<uint32_t>(n));
+
+  {
+    common::nvtx::range<common::nvtx::domain::raft> scope("lanczos_svds::initial_random_vector");
+    raft::random::normal(handle,
+                         rng_state,
+                         v_start.data_handle(),
+                         static_cast<std::size_t>(n),
+                         ValueTypeT(0),
+                         ValueTypeT(1));
+  }
+
+  std::vector<ValueTypeT> locked_singular_values(k, ValueTypeT(0));
+  int n_locked   = 0;
+  int total_iter = 0;
+
+  while (n_locked < k && total_iter < config.max_iterations) {
+    common::nvtx::range<common::nvtx::domain::raft> restart_scope(
+      "lanczos_svds::restart_iteration");
+    int active_ncv = std::min(ncv, min_dim - n_locked - 1);
+    RAFT_EXPECTS(active_ncv > 0, "No remaining subspace available for Lanczos restart");
+
+    std::vector<ValueTypeT> alphas;
+    std::vector<ValueTypeT> betas;
+    lanczos_bidiagonalize(handle,
+                          op,
+                          active_ncv,
+                          v_start.data_handle(),
+                          U_full.view(),
+                          V_full.view(),
+                          n_locked,
+                          rng_state,
+                          ortho_coeffs.data_handle(),
+                          config.use_mgs2_orthogonalization,
+                          alphas,
+                          betas);
+
+    auto B = raft::make_device_matrix<ValueTypeT, uint32_t, raft::col_major>(
+      handle, static_cast<uint32_t>(active_ncv), static_cast<uint32_t>(active_ncv));
+    build_bidiagonal_matrix(handle, alphas, betas, B.view());
+
+    auto P = raft::make_device_matrix<ValueTypeT, uint32_t, raft::col_major>(
+      handle, static_cast<uint32_t>(active_ncv), static_cast<uint32_t>(active_ncv));
+    auto Qt = raft::make_device_matrix<ValueTypeT, uint32_t, raft::col_major>(
+      handle, static_cast<uint32_t>(active_ncv), static_cast<uint32_t>(active_ncv));
+    auto S_full =
+      raft::make_device_vector<ValueTypeT, uint32_t>(handle, static_cast<uint32_t>(active_ncv));
+
+    {
+      common::nvtx::range<common::nvtx::domain::raft> scope("lanczos_svds::bidiagonal_svd");
+      raft::linalg::svdQR(handle,
+                          B.data_handle(),
+                          active_ncv,
+                          active_ncv,
+                          S_full.data_handle(),
+                          P.data_handle(),
+                          Qt.data_handle(),
+                          false,
+                          true,
+                          true,
+                          stream);
+    }
+
+    std::vector<ValueTypeT> h_s(active_ncv);
+    std::vector<ValueTypeT> h_P(static_cast<std::size_t>(active_ncv) * active_ncv);
+    std::vector<ValueTypeT> h_Qt(static_cast<std::size_t>(active_ncv) * active_ncv);
+    {
+      common::nvtx::range<common::nvtx::domain::raft> scope("lanczos_svds::copy_small_svd_host");
+      raft::update_host(h_s.data(), S_full.data_handle(), active_ncv, stream);
+      raft::update_host(h_P.data(), P.data_handle(), h_P.size(), stream);
+      raft::update_host(h_Qt.data(), Qt.data_handle(), h_Qt.size(), stream);
+      resource::sync_stream(handle, stream);
+    }
+
+    int remaining        = k - n_locked;
+    ValueTypeT max_s     = h_s.empty() || h_s[0] <= ValueTypeT(0) ? ValueTypeT(1) : h_s[0];
+    int check_components = std::min(remaining + 2, active_ncv);
+
+    std::vector<int> selected;
+    {
+      common::nvtx::range<common::nvtx::domain::raft> scope("lanczos_svds::select_converged");
+      for (int i = 0; i < check_components; ++i) {
+        auto residual = betas[active_ncv] *
+                        std::abs(h_P[static_cast<std::size_t>(i) * active_ncv + active_ncv - 1]);
+        if (residual < config.tolerance * max_s) { selected.push_back(i); }
+      }
+    }
+    if (static_cast<int>(selected.size()) > remaining) { selected.resize(remaining); }
+
+    if (!selected.empty()) {
+      compute_ritz_vectors(handle,
+                           m,
+                           n,
+                           active_ncv,
+                           n_locked,
+                           selected,
+                           h_s,
+                           h_P,
+                           h_Qt,
+                           U_full.view(),
+                           V_full.view(),
+                           locked_singular_values);
+      n_locked += static_cast<int>(selected.size());
+      if (n_locked >= k) { break; }
+    }
+
+    std::vector<ValueTypeT> restart_coeffs(active_ncv, ValueTypeT(0));
+    {
+      common::nvtx::range<common::nvtx::domain::raft> scope("lanczos_svds::build_restart_coeffs");
+      if (!selected.empty()) {
+        int best_unconverged = 0;
+        for (; best_unconverged < active_ncv; ++best_unconverged) {
+          if (std::find(selected.begin(), selected.end(), best_unconverged) == selected.end()) {
+            break;
+          }
+        }
+        if (best_unconverged == active_ncv) { best_unconverged = 0; }
+        for (int col = 0; col < active_ncv; ++col) {
+          restart_coeffs[col] = h_Qt[static_cast<std::size_t>(col) * active_ncv + best_unconverged];
+        }
+      } else {
+        int k_mix = std::min(k - n_locked, active_ncv);
+        for (int col = 0; col < active_ncv; ++col) {
+          ValueTypeT sum = 0;
+          for (int row = 0; row < k_mix; ++row) {
+            sum += h_Qt[static_cast<std::size_t>(col) * active_ncv + row];
+          }
+          restart_coeffs[col] = sum;
+        }
+      }
+    }
+
+    compute_restart_vector(
+      handle, n, active_ncv, n_locked, restart_coeffs, V_full.view(), v_start.data_handle());
+    ++total_iter;
+  }
+
+  RAFT_EXPECTS(n_locked >= k,
+               "sparse_lanczos_svd failed to converge all requested components within "
+               "max_iterations");
+
+  auto U_refined = raft::make_device_matrix<ValueTypeT, uint32_t, raft::col_major>(
+    handle, static_cast<uint32_t>(m), static_cast<uint32_t>(k));
+  {
+    common::nvtx::range<common::nvtx::domain::raft> scope("lanczos_svds::post_refine_A_V");
+    op.apply(handle,
+             raft::make_device_matrix_view<ValueTypeT const, uint32_t, raft::col_major>(
+               V_full.data_handle(), n, k),
+             U_refined.view());
+  }
+
+  {
+    common::nvtx::range<common::nvtx::domain::raft> scope("lanczos_svds::post_refine_normalize");
+    for (int j = 0; j < k; ++j) {
+      auto* u_col               = U_refined.data_handle() + static_cast<std::size_t>(j) * m;
+      locked_singular_values[j] = vector_norm(handle, u_col, m);
+      if (locked_singular_values[j] > ValueTypeT(0)) {
+        scale_vector(handle, u_col, m, ValueTypeT(1) / locked_singular_values[j]);
+      }
+    }
+  }
+
+  sort_singular_triplets(
+    handle, U_refined.data_handle(), m, V_full.data_handle(), n, locked_singular_values);
+
+  {
+    common::nvtx::range<common::nvtx::domain::raft> scope("lanczos_svds::copy_outputs");
+    raft::update_device(singular_values.data_handle(), locked_singular_values.data(), k, stream);
+    if (U) {
+      raft::copy(
+        U->data_handle(), U_refined.data_handle(), static_cast<std::size_t>(m) * k, stream);
+    }
+  }
+
+  if (Vt) {
+    common::nvtx::range<common::nvtx::domain::raft> scope("lanczos_svds::transpose_vt");
+    raft::linalg::transpose(handle, V_full.data_handle(), Vt->data_handle(), n, k, stream);
+  }
+
+  {
+    common::nvtx::range<common::nvtx::domain::raft> scope("lanczos_svds::sign_correction");
+    svd_sign_correction<ValueTypeT>(handle, U, Vt);
+  }
+}
+
+}  // namespace raft::sparse::solver::detail
diff --git a/cpp/include/raft/sparse/solver/detail/randomized_svds.cuh b/cpp/include/raft/sparse/solver/detail/randomized_svds.cuh
index 85268141ce..72e2bb525a 100644
--- a/cpp/include/raft/sparse/solver/detail/randomized_svds.cuh
+++ b/cpp/include/raft/sparse/solver/detail/randomized_svds.cuh
@@ -18,7 +18,7 @@
 #include <raft/random/rng_state.hpp>
 #include <raft/sparse/solver/detail/cholesky_qr.cuh>
 #include <raft/sparse/solver/detail/svds_sign_correction.cuh>
-#include <raft/sparse/solver/svds_config.hpp>
+#include <raft/sparse/solver/solver_types.hpp>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
 
diff --git a/cpp/include/raft/sparse/solver/detail/svds_optional.hpp b/cpp/include/raft/sparse/solver/detail/svds_optional.hpp
new file mode 100644
index 0000000000..daedfe193f
--- /dev/null
+++ b/cpp/include/raft/sparse/solver/detail/svds_optional.hpp
@@ -0,0 +1,23 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <optional>
+
+namespace raft::sparse::solver::detail {
+
+// Helper alias used to put optional U/Vt parameters in a non-deduced context
+// so template argument deduction picks up the value type from earlier parameters and
+// implicit conversion from `device_matrix_view` to `std::optional<...>` works.
+template <typename T>
+struct nondeduced {
+  using type = T;
+};
+
+template <typename T>
+using nondeduced_optional_matrix_view_t = std::optional<typename nondeduced<T>::type>;
+
+}  // namespace raft::sparse::solver::detail
diff --git a/cpp/include/raft/sparse/solver/lanczos_svds.cuh b/cpp/include/raft/sparse/solver/lanczos_svds.cuh
new file mode 100644
index 0000000000..be70f6aeb6
--- /dev/null
+++ b/cpp/include/raft/sparse/solver/lanczos_svds.cuh
@@ -0,0 +1,104 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <raft/core/device_csr_matrix.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/sparse/solver/detail/csr_linear_operator.cuh>
+#include <raft/sparse/solver/detail/lanczos_svds.cuh>
+#include <raft/sparse/solver/detail/svds_optional.hpp>
+
+#include <optional>
+
+namespace raft::sparse::solver {
+
+/**
+ * @defgroup sparse_lanczos_svd Sparse Lanczos SVD
+ * @{
+ */
+
+/**
+ * @brief Compute truncated SVD using Lanczos bidiagonalization with a generic linear operator.
+ *
+ * This solver computes the largest singular triplets using implicitly restarted Lanczos
+ * bidiagonalization, full reorthogonalization, and final `A @ V` post-refinement of
+ * returned left singular vectors. It is intended as the higher-accuracy sparse SVD path.
+ * The operator interface matches sparse_randomized_svd and supports implicit operators
+ * such as centered sparse matrices.
+ *
+ * By default, the solver uses two-pass classical Gram-Schmidt (CGS2), which is efficient
+ * on GPUs. The configuration can request two-pass modified Gram-Schmidt (MGS2) as an
+ * alternate path for difficult spectra. If the requested components do not converge
+ * within `config.max_iterations`, this function raises an exception instead of returning
+ * partially converged vectors.
+ *
+ * `OperatorT` must expose:
+ *   - `int rows() const` / `int cols() const`
+ *   - `void apply(handle, X, Y) const` computes `Y = A @ X`
+ *   - `void apply_transpose(handle, X, Z) const` computes `Z = A^T @ X`
+ *
+ * @tparam ValueTypeT Data type (float or double)
+ * @tparam OperatorT Linear operator type satisfying the interface above
+ *
+ * @param[in] handle raft resources handle
+ * @param[in] config SVD configuration parameters
+ * @param[in] op linear operator representing the matrix to decompose
+ * @param[out] singular_values output singular values of shape (n_components,) in descending order
+ * @param[out] U optional output left singular vectors of shape (m, n_components), col-major.
+ *             Pass `std::nullopt` to skip storing U.
+ * @param[out] Vt optional output right singular vectors of shape (n_components, n), col-major.
+ *             Pass `std::nullopt` to skip storing Vt.
+ */
+template <typename ValueTypeT, typename OperatorT>
+void sparse_lanczos_svd(
+  raft::resources const& handle,
+  sparse_lanczos_svd_config<ValueTypeT> const& config,
+  OperatorT const& op,
+  raft::device_vector_view<ValueTypeT, uint32_t> singular_values,
+  detail::nondeduced_optional_matrix_view_t<
+    raft::device_matrix_view<ValueTypeT, uint32_t, raft::col_major>> U = std::nullopt,
+  detail::nondeduced_optional_matrix_view_t<
+    raft::device_matrix_view<ValueTypeT, uint32_t, raft::col_major>> Vt = std::nullopt)
+{
+  detail::sparse_lanczos_svd(handle, config, op, singular_values, U, Vt);
+}
+
+/**
+ * @brief Compute truncated SVD of a sparse CSR matrix using Lanczos bidiagonalization.
+ *
+ * Convenience overload that accepts a CSR matrix view directly.
+ *
+ * @tparam ValueTypeT Data type (float or double)
+ * @tparam NNZTypeT Type for number of non-zeros
+ *
+ * @param[in] handle raft resources handle
+ * @param[in] config SVD configuration parameters
+ * @param[in] A input sparse CSR matrix of shape (m, n)
+ * @param[out] singular_values output singular values of shape (n_components,) in descending order
+ * @param[out] U optional output left singular vectors of shape (m, n_components), col-major.
+ *             Pass `std::nullopt` to skip storing U.
+ * @param[out] Vt optional output right singular vectors of shape (n_components, n), col-major.
+ *             Pass `std::nullopt` to skip storing Vt.
+ */
+template <typename ValueTypeT, typename NNZTypeT>
+void sparse_lanczos_svd(
+  raft::resources const& handle,
+  sparse_lanczos_svd_config<ValueTypeT> const& config,
+  raft::device_csr_matrix_view<const ValueTypeT, int, int, NNZTypeT> A,
+  raft::device_vector_view<ValueTypeT, uint32_t> singular_values,
+  detail::nondeduced_optional_matrix_view_t<
+    raft::device_matrix_view<ValueTypeT, uint32_t, raft::col_major>> U = std::nullopt,
+  detail::nondeduced_optional_matrix_view_t<
+    raft::device_matrix_view<ValueTypeT, uint32_t, raft::col_major>> Vt = std::nullopt)
+{
+  detail::csr_linear_operator<ValueTypeT, NNZTypeT> op(A);
+  detail::sparse_lanczos_svd(handle, config, op, singular_values, U, Vt);
+}
+
+/** @} */
+
+}  // namespace raft::sparse::solver
diff --git a/cpp/include/raft/sparse/solver/randomized_svds.cuh b/cpp/include/raft/sparse/solver/randomized_svds.cuh
index 7ee1afe1c1..29c86ebaa5 100644
--- a/cpp/include/raft/sparse/solver/randomized_svds.cuh
+++ b/cpp/include/raft/sparse/solver/randomized_svds.cuh
@@ -10,23 +10,12 @@
 #include <raft/core/resources.hpp>
 #include <raft/sparse/solver/detail/csr_linear_operator.cuh>
 #include <raft/sparse/solver/detail/randomized_svds.cuh>
+#include <raft/sparse/solver/detail/svds_optional.hpp>
 
 #include <optional>
 
 namespace raft::sparse::solver {
 
-namespace detail {
-// Helper alias used to put the optional U/Vt parameters in a non-deduced context
-// so template argument deduction picks up ValueTypeT from earlier parameters and
-// implicit conversion from `device_matrix_view` to `std::optional<...>` works.
-template <typename T>
-struct nondeduced {
-  using type = T;
-};
-template <typename T>
-using nondeduced_optional_matrix_view_t = std::optional<typename nondeduced<T>::type>;
-}  // namespace detail
-
 /**
  * @defgroup sparse_randomized_svd Sparse Randomized SVD
  * @{
@@ -48,7 +37,7 @@ using nondeduced_optional_matrix_view_t = std::optional<typename nondeduced<T>::
  * where `X`, `Y`, `Z` are `raft::device_matrix_view<..., uint32_t, raft::col_major>`.
  * This is intentionally a narrow, SVD-specific operator: only the two matrix products
  * above are required, and no other operations (addition, scaling, inverse, eigensolves,
- * etc.) are assumed or supported — it is not a general-purpose linear operator like
+ * etc.) are assumed or supported; it is not a general-purpose linear operator like
  * `scipy.sparse.linalg.LinearOperator`.
  *
  * @tparam ValueTypeT Data type (float or double)
diff --git a/cpp/include/raft/sparse/solver/solver_types.hpp b/cpp/include/raft/sparse/solver/solver_types.hpp
new file mode 100644
index 0000000000..6f5f04f567
--- /dev/null
+++ b/cpp/include/raft/sparse/solver/solver_types.hpp
@@ -0,0 +1,89 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <optional>
+
+namespace raft::sparse::solver {
+
+/**
+ * @addtogroup sparse_randomized_svd
+ * @{
+ */
+
+/**
+ * @brief Configuration parameters for the sparse randomized SVD solver
+ *
+ * @tparam ValueTypeT Data type for values (float or double)
+ */
+template <typename ValueTypeT>
+struct sparse_svd_config {
+  /** @brief Number of singular values/vectors to compute. Must be set by the user. */
+  int n_components = 0;
+
+  /** @brief Number of extra random vectors for better approximation.
+   *  Total subspace dimension is n_components + n_oversamples. */
+  int n_oversamples = 10;
+
+  /** @brief Number of power iteration passes. More iterations improve accuracy
+   *  for matrices with slowly decaying singular values. */
+  int n_power_iters = 2;
+
+  /** @brief Random seed for reproducibility */
+  std::optional<uint64_t> seed = std::nullopt;
+};
+
+/** @} */
+
+/**
+ * @addtogroup sparse_lanczos_svd
+ * @{
+ */
+
+/**
+ * @brief Configuration parameters for the sparse Lanczos SVD solver
+ *
+ * @tparam ValueTypeT Data type for values (float or double)
+ */
+template <typename ValueTypeT>
+struct sparse_lanczos_svd_config {
+  /** @brief Number of singular values/vectors to compute. Must be set by the user.
+   *  @note Must satisfy 0 < n_components < min(m, n), where (m, n) is the matrix shape. */
+  int n_components = 0;
+
+  /**
+   * @brief Number of Lanczos vectors per restart.
+   *
+   * If zero, a matrix-shape dependent default is selected. Larger values can improve
+   * convergence margin and orthogonality for clustered spectra, but increase sparse
+   * matrix-vector work and memory use.
+   *
+   * @note When nonzero, the value is clamped to [n_components, min(m, n) - 1].
+   */
+  int ncv = 0;
+
+  /** @brief Convergence tolerance for Lanczos Ritz residual estimates. */
+  ValueTypeT tolerance = ValueTypeT(1e-4);
+
+  /** @brief Maximum number of restart iterations before reporting non-convergence. */
+  int max_iterations = 100;
+
+  /** @brief Random seed for reproducibility. */
+  std::optional<uint64_t> seed = std::nullopt;
+
+  /**
+   * @brief Use launch-heavy MGS2 instead of the default GPU-efficient CGS2 reorthogonalization.
+   *
+   * MGS2 is kept as an alternate path for difficult spectra; CGS2 is the default used in
+   * normal GPU workloads.
+   */
+  bool use_mgs2_orthogonalization = false;
+};
+
+/** @} */
+
+}  // namespace raft::sparse::solver
diff --git a/cpp/include/raft/sparse/solver/svds_config.hpp b/cpp/include/raft/sparse/solver/svds_config.hpp
deleted file mode 100644
index bbb56a7db6..0000000000
--- a/cpp/include/raft/sparse/solver/svds_config.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#pragma once
-
-#include <cstdint>
-#include <optional>
-
-namespace raft::sparse::solver {
-
-/**
- * @addtogroup sparse_randomized_svd
- * @{
- */
-
-/**
- * @brief Configuration parameters for the sparse randomized SVD solver
- *
- * @tparam ValueTypeT Data type for values (float or double)
- */
-template <typename ValueTypeT>
-struct sparse_svd_config {
-  /** @brief Number of singular values/vectors to compute. Must be set by the user. */
-  int n_components = 0;
-
-  /** @brief Number of extra random vectors for better approximation.
-   *  Total subspace dimension is n_components + n_oversamples. */
-  int n_oversamples = 10;
-
-  /** @brief Number of power iteration passes. More iterations improve accuracy
-   *  for matrices with slowly decaying singular values. */
-  int n_power_iters = 2;
-
-  /** @brief Random seed for reproducibility */
-  std::optional<uint64_t> seed = std::nullopt;
-};
-
-/** @} */
-
-}  // namespace raft::sparse::solver
diff --git a/cpp/include/raft_runtime/solver/lanczos_svds.hpp b/cpp/include/raft_runtime/solver/lanczos_svds.hpp
new file mode 100644
index 0000000000..73300dfe78
--- /dev/null
+++ b/cpp/include/raft_runtime/solver/lanczos_svds.hpp
@@ -0,0 +1,44 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <raft/core/detail/macros.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/sparse/solver/solver_types.hpp>
+
+#include <cstdint>
+#include <optional>
+
+namespace raft::runtime::solver {
+
+/**
+ * @defgroup sparse_lanczos_svd_runtime Sparse Lanczos SVD Runtime API
+ * @{
+ */
+
+#define LANCZOS_SVD_FUNC_DECL(ValueType)                                             \
+  RAFT_EXPORT void sparse_lanczos_svd(                                               \
+    const raft::resources& handle,                                                   \
+    const raft::sparse::solver::sparse_lanczos_svd_config<ValueType>& config,        \
+    raft::device_vector_view<int, uint32_t, raft::row_major> indptr,                 \
+    raft::device_vector_view<int, uint32_t, raft::row_major> indices,                \
+    raft::device_vector_view<ValueType, uint32_t, raft::row_major> data,             \
+    int n_rows,                                                                      \
+    int n_cols,                                                                      \
+    int nnz,                                                                         \
+    raft::device_vector_view<ValueType, uint32_t> singular_values,                   \
+    std::optional<raft::device_matrix_view<ValueType, uint32_t, raft::col_major>> U, \
+    std::optional<raft::device_matrix_view<ValueType, uint32_t, raft::col_major>> Vt)
+
+LANCZOS_SVD_FUNC_DECL(float);
+LANCZOS_SVD_FUNC_DECL(double);
+
+#undef LANCZOS_SVD_FUNC_DECL
+
+/** @} */
+
+}  // namespace raft::runtime::solver
diff --git a/cpp/include/raft_runtime/solver/randomized_svds.hpp b/cpp/include/raft_runtime/solver/randomized_svds.hpp
index 07bc3ec33a..c6f6df066a 100644
--- a/cpp/include/raft_runtime/solver/randomized_svds.hpp
+++ b/cpp/include/raft_runtime/solver/randomized_svds.hpp
@@ -8,7 +8,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resources.hpp>
-#include <raft/sparse/solver/svds_config.hpp>
+#include <raft/sparse/solver/solver_types.hpp>
 
 #include <cstdint>
 #include <optional>
diff --git a/cpp/src/raft_runtime/solver/lanczos_svds.cuh b/cpp/src/raft_runtime/solver/lanczos_svds.cuh
new file mode 100644
index 0000000000..d948dd1c22
--- /dev/null
+++ b/cpp/src/raft_runtime/solver/lanczos_svds.cuh
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <raft/sparse/solver/lanczos_svds.cuh>
+
+#include <raft_runtime/solver/lanczos_svds.hpp>
+
+#include <optional>
+
+#define FUNC_DEF(ValueType)                                                                       \
+  void sparse_lanczos_svd(                                                                        \
+    const raft::resources& handle,                                                                \
+    const raft::sparse::solver::sparse_lanczos_svd_config<ValueType>& config,                     \
+    raft::device_vector_view<int, uint32_t, raft::row_major> indptr,                              \
+    raft::device_vector_view<int, uint32_t, raft::row_major> indices,                             \
+    raft::device_vector_view<ValueType, uint32_t, raft::row_major> data,                          \
+    int n_rows,                                                                                   \
+    int n_cols,                                                                                   \
+    int nnz,                                                                                      \
+    raft::device_vector_view<ValueType, uint32_t> singular_values,                                \
+    std::optional<raft::device_matrix_view<ValueType, uint32_t, raft::col_major>> U,              \
+    std::optional<raft::device_matrix_view<ValueType, uint32_t, raft::col_major>> Vt)             \
+  {                                                                                               \
+    auto csr_structure = raft::make_device_compressed_structure_view<int, int, int>(              \
+      indptr.data_handle(), indices.data_handle(), n_rows, n_cols, nnz);                          \
+    auto csr_matrix = raft::make_device_csr_matrix_view<const ValueType, int, int, int>(          \
+      data.data_handle(), csr_structure);                                                         \
+    raft::sparse::solver::sparse_lanczos_svd(handle, config, csr_matrix, singular_values, U, Vt); \
+  }
diff --git a/cpp/src/raft_runtime/solver/lanczos_svds_double.cu b/cpp/src/raft_runtime/solver/lanczos_svds_double.cu
new file mode 100644
index 0000000000..10344b473c
--- /dev/null
+++ b/cpp/src/raft_runtime/solver/lanczos_svds_double.cu
@@ -0,0 +1,14 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "lanczos_svds.cuh"
+
+namespace raft::runtime::solver {
+
+FUNC_DEF(double);
+
+}  // namespace raft::runtime::solver
+
+#undef FUNC_DEF
diff --git a/cpp/src/raft_runtime/solver/lanczos_svds_float.cu b/cpp/src/raft_runtime/solver/lanczos_svds_float.cu
new file mode 100644
index 0000000000..850da77d9f
--- /dev/null
+++ b/cpp/src/raft_runtime/solver/lanczos_svds_float.cu
@@ -0,0 +1,14 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "lanczos_svds.cuh"
+
+namespace raft::runtime::solver {
+
+FUNC_DEF(float);
+
+}  // namespace raft::runtime::solver
+
+#undef FUNC_DEF
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 41b6e7ee3c..ab80f3e415 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -241,8 +241,17 @@ if(BUILD_TESTS)
   )
 
   ConfigureTest(
-    NAME SOLVERS_TEST PATH linalg/eigen_solvers.cu lap/lap.cu sparse/mst.cu
-    sparse/solver/lanczos.cu sparse/solver/randomized_svds.cu LIB EXPLICIT_INSTANTIATE_ONLY
+    NAME
+    SOLVERS_TEST
+    PATH
+    linalg/eigen_solvers.cu
+    lap/lap.cu
+    sparse/mst.cu
+    sparse/solver/lanczos.cu
+    sparse/solver/lanczos_svds.cu
+    sparse/solver/randomized_svds.cu
+    LIB
+    EXPLICIT_INSTANTIATE_ONLY
   )
 
   ConfigureTest(
diff --git a/cpp/tests/sparse/solver/lanczos_svds.cu b/cpp/tests/sparse/solver/lanczos_svds.cu
new file mode 100644
index 0000000000..052de68424
--- /dev/null
+++ b/cpp/tests/sparse/solver/lanczos_svds.cu
@@ -0,0 +1,544 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "../../test_utils.cuh"
+
+#include <raft/core/device_csr_matrix.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/error.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/sparse/solver/lanczos_svds.cuh>
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+namespace raft::sparse::solver {
+
+template <typename ValueType>
+void check_orthonormal_columns(std::vector<ValueType> const& Q, int n_rows, int n_cols, double tol)
+{
+  for (int i = 0; i < n_cols; ++i) {
+    for (int j = 0; j < n_cols; ++j) {
+      double dot = 0;
+      for (int row = 0; row < n_rows; ++row) {
+        dot += static_cast<double>(Q[static_cast<std::size_t>(i) * n_rows + row]) *
+               static_cast<double>(Q[static_cast<std::size_t>(j) * n_rows + row]);
+      }
+      double expected = (i == j) ? 1.0 : 0.0;
+      ASSERT_NEAR(dot, expected, tol);
+    }
+  }
+}
+
+template <typename ValueType>
+void check_orthonormal_vt_rows(std::vector<ValueType> const& Vt, int k, int n, double tol)
+{
+  for (int i = 0; i < k; ++i) {
+    for (int j = 0; j < k; ++j) {
+      double dot = 0;
+      for (int col = 0; col < n; ++col) {
+        dot += static_cast<double>(Vt[static_cast<std::size_t>(col) * k + i]) *
+               static_cast<double>(Vt[static_cast<std::size_t>(col) * k + j]);
+      }
+      double expected = (i == j) ? 1.0 : 0.0;
+      ASSERT_NEAR(dot, expected, tol);
+    }
+  }
+}
+
+template <typename ValueType>
+class LanczosSvdsTest : public ::testing::Test {
+ public:
+  LanczosSvdsTest()
+    : stream(resource::get_cuda_stream(handle)),
+      m(12),
+      n(8),
+      k(3),
+      nnz(8),
+      d_indptr(raft::make_device_vector<int, uint32_t>(handle, m + 1)),
+      d_indices(raft::make_device_vector<int, uint32_t>(handle, nnz)),
+      d_values(raft::make_device_vector<ValueType, uint32_t>(handle, nnz))
+  {
+  }
+
+ protected:
+  void SetUp() override
+  {
+    h_indptr = {0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8};
+    h_indices.resize(nnz);
+    h_values.resize(nnz);
+    for (int i = 0; i < nnz; ++i) {
+      h_indices[i] = i;
+      h_values[i]  = ValueType(8 - i);
+    }
+
+    raft::update_device(d_indptr.data_handle(), h_indptr.data(), m + 1, stream);
+    raft::update_device(d_indices.data_handle(), h_indices.data(), nnz, stream);
+    raft::update_device(d_values.data_handle(), h_values.data(), nnz, stream);
+  }
+
+  void Run(bool use_mgs2 = false)
+  {
+    auto csr_structure = raft::make_device_compressed_structure_view<int, int, int>(
+      d_indptr.data_handle(), d_indices.data_handle(), m, n, nnz);
+    auto csr_matrix = raft::make_device_csr_matrix_view<const ValueType, int, int, int>(
+      d_values.data_handle(), csr_structure);
+
+    sparse_lanczos_svd_config<ValueType> config;
+    config.n_components               = k;
+    config.ncv                        = 6;
+    config.tolerance                  = ValueType(1e-6);
+    config.max_iterations             = 20;
+    config.seed                       = 42;
+    config.use_mgs2_orthogonalization = use_mgs2;
+
+    auto S  = raft::make_device_vector<ValueType, uint32_t>(handle, k);
+    auto U  = raft::make_device_matrix<ValueType, uint32_t, raft::col_major>(handle, m, k);
+    auto Vt = raft::make_device_matrix<ValueType, uint32_t, raft::col_major>(handle, k, n);
+
+    sparse_lanczos_svd(handle, config, csr_matrix, S.view(), U.view(), Vt.view());
+
+    std::vector<ValueType> h_S(k);
+    std::vector<ValueType> h_U(static_cast<std::size_t>(m) * k);
+    std::vector<ValueType> h_Vt(static_cast<std::size_t>(k) * n);
+    raft::update_host(h_S.data(), S.data_handle(), k, stream);
+    raft::update_host(h_U.data(), U.data_handle(), h_U.size(), stream);
+    raft::update_host(h_Vt.data(), Vt.data_handle(), h_Vt.size(), stream);
+    resource::sync_stream(handle, stream);
+
+    ValueType tol       = std::is_same_v<ValueType, float> ? ValueType(2e-3) : ValueType(1e-8);
+    double residual_tol = std::is_same_v<ValueType, float> ? 2e-2 : 1e-5;
+    for (int i = 0; i < k; ++i) {
+      ASSERT_NEAR(h_S[i], ValueType(8 - i), tol);
+    }
+
+    check_orthonormal_columns(h_U, m, k, tol);
+    check_orthonormal_vt_rows(h_Vt, k, n, tol);
+
+    for (int comp = 0; comp < k; ++comp) {
+      double left_residual_sq  = 0;
+      double right_residual_sq = 0;
+      for (int row = 0; row < m; ++row) {
+        ValueType av =
+          row < n ? h_values[row] * h_Vt[static_cast<std::size_t>(row) * k + comp] : ValueType(0);
+        ValueType su = h_S[comp] * h_U[static_cast<std::size_t>(comp) * m + row];
+        double diff  = static_cast<double>(av - su);
+        left_residual_sq += diff * diff;
+      }
+      for (int col = 0; col < n; ++col) {
+        ValueType atu = h_values[col] * h_U[static_cast<std::size_t>(comp) * m + col];
+        ValueType sv  = h_S[comp] * h_Vt[static_cast<std::size_t>(col) * k + comp];
+        double diff   = static_cast<double>(atu - sv);
+        right_residual_sq += diff * diff;
+      }
+      ASSERT_LT(std::sqrt(left_residual_sq), residual_tol);
+      ASSERT_LT(std::sqrt(right_residual_sq), residual_tol);
+    }
+  }
+
+  raft::resources handle;
+  cudaStream_t stream;
+  int m, n, k, nnz;
+  std::vector<int> h_indptr;
+  std::vector<int> h_indices;
+  std::vector<ValueType> h_values;
+  raft::device_vector<int, uint32_t> d_indptr;
+  raft::device_vector<int, uint32_t> d_indices;
+  raft::device_vector<ValueType, uint32_t> d_values;
+};
+
+using LanczosSvdsTestF = LanczosSvdsTest<float>;
+TEST_F(LanczosSvdsTestF, DiagonalSpectrum) { Run(); }
+TEST_F(LanczosSvdsTestF, DiagonalSpectrumMgs2) { Run(true); }
+
+using LanczosSvdsTestD = LanczosSvdsTest<double>;
+TEST_F(LanczosSvdsTestD, DiagonalSpectrum) { Run(); }
+
+template <typename ValueType>
+class LanczosClusteredSpectrumTest : public ::testing::Test {
+ public:
+  LanczosClusteredSpectrumTest()
+    : stream(resource::get_cuda_stream(handle)),
+      m(48),
+      n(32),
+      k(8),
+      nnz(32),
+      d_indptr(raft::make_device_vector<int, uint32_t>(handle, m + 1)),
+      d_indices(raft::make_device_vector<int, uint32_t>(handle, nnz)),
+      d_values(raft::make_device_vector<ValueType, uint32_t>(handle, nnz))
+  {
+  }
+
+ protected:
+  void SetUp() override
+  {
+    h_indptr.resize(m + 1);
+    h_indices.resize(nnz);
+    h_values.resize(nnz);
+    for (int i = 0; i <= m; ++i) {
+      h_indptr[i] = std::min(i, nnz);
+    }
+    for (int i = 0; i < nnz; ++i) {
+      h_indices[i] = i;
+    }
+
+    std::vector<double> singular_values = {10.0, 9.999, 9.998, 9.5, 9.499, 9.0, 8.999, 8.5};
+    for (int i = 0; i < nnz; ++i) {
+      h_values[i] = i < static_cast<int>(singular_values.size())
+                      ? static_cast<ValueType>(singular_values[i])
+                      : static_cast<ValueType>(1.0 / (i + 1));
+    }
+
+    raft::update_device(d_indptr.data_handle(), h_indptr.data(), m + 1, stream);
+    raft::update_device(d_indices.data_handle(), h_indices.data(), nnz, stream);
+    raft::update_device(d_values.data_handle(), h_values.data(), nnz, stream);
+  }
+
+  void Run(bool use_mgs2 = false)
+  {
+    auto csr_structure = raft::make_device_compressed_structure_view<int, int, int>(
+      d_indptr.data_handle(), d_indices.data_handle(), m, n, nnz);
+    auto csr_matrix = raft::make_device_csr_matrix_view<const ValueType, int, int, int>(
+      d_values.data_handle(), csr_structure);
+
+    sparse_lanczos_svd_config<ValueType> config;
+    config.n_components   = k;
+    config.ncv            = 18;
+    config.tolerance      = std::is_same_v<ValueType, float> ? ValueType(1e-5) : ValueType(1e-10);
+    config.max_iterations = 80;
+    config.seed           = 123;
+    config.use_mgs2_orthogonalization = use_mgs2;
+
+    auto S  = raft::make_device_vector<ValueType, uint32_t>(handle, k);
+    auto U  = raft::make_device_matrix<ValueType, uint32_t, raft::col_major>(handle, m, k);
+    auto Vt = raft::make_device_matrix<ValueType, uint32_t, raft::col_major>(handle, k, n);
+
+    sparse_lanczos_svd(handle, config, csr_matrix, S.view(), U.view(), Vt.view());
+
+    std::vector<ValueType> h_S(k);
+    std::vector<ValueType> h_U(static_cast<std::size_t>(m) * k);
+    std::vector<ValueType> h_Vt(static_cast<std::size_t>(k) * n);
+    raft::update_host(h_S.data(), S.data_handle(), k, stream);
+    raft::update_host(h_U.data(), U.data_handle(), h_U.size(), stream);
+    raft::update_host(h_Vt.data(), Vt.data_handle(), h_Vt.size(), stream);
+    resource::sync_stream(handle, stream);
+
+    double sv_tol     = std::is_same_v<ValueType, float> ? 1e-3 : 1e-8;
+    double orthog_tol = std::is_same_v<ValueType, float> ? 2e-4 : 1e-10;
+    for (int i = 0; i < k; ++i) {
+      ASSERT_NEAR(static_cast<double>(h_S[i]), static_cast<double>(h_values[i]), sv_tol);
+    }
+    check_orthonormal_columns(h_U, m, k, orthog_tol);
+    check_orthonormal_vt_rows(h_Vt, k, n, orthog_tol);
+  }
+
+  raft::resources handle;
+  cudaStream_t stream;
+  int m, n, k, nnz;
+  std::vector<int> h_indptr;
+  std::vector<int> h_indices;
+  std::vector<ValueType> h_values;
+  raft::device_vector<int, uint32_t> d_indptr;
+  raft::device_vector<int, uint32_t> d_indices;
+  raft::device_vector<ValueType, uint32_t> d_values;
+};
+
+using LanczosClusteredSpectrumTestF = LanczosClusteredSpectrumTest<float>;
+TEST_F(LanczosClusteredSpectrumTestF, Orthonormality) { Run(); }
+TEST_F(LanczosClusteredSpectrumTestF, OrthonormalityMgs2) { Run(true); }
+
+using LanczosClusteredSpectrumTestD = LanczosClusteredSpectrumTest<double>;
+TEST_F(LanczosClusteredSpectrumTestD, Orthonormality) { Run(); }
+
+template <typename ValueType>
+void run_diagonal_hardening_case(int m,
+                                 int n,
+                                 std::vector<double> const& diagonal,
+                                 int k,
+                                 int ncv,
+                                 int max_iterations,
+                                 ValueType tolerance,
+                                 double singular_value_tol,
+                                 double orthog_tol,
+                                 double residual_tol,
+                                 bool use_mgs2 = false)
+{
+  raft::resources handle;
+  auto stream = resource::get_cuda_stream(handle);
+
+  std::vector<int> h_indptr(m + 1);
+  std::vector<int> h_indices;
+  std::vector<ValueType> h_values;
+  int min_dim = std::min(m, n);
+  for (int row = 0; row < m; ++row) {
+    h_indptr[row] = static_cast<int>(h_values.size());
+    if (row < min_dim && row < static_cast<int>(diagonal.size()) && diagonal[row] != 0.0) {
+      h_indices.push_back(row);
+      h_values.push_back(static_cast<ValueType>(diagonal[row]));
+    }
+  }
+  h_indptr[m] = static_cast<int>(h_values.size());
+
+  auto nnz       = static_cast<int>(h_values.size());
+  auto d_indptr  = raft::make_device_vector<int, uint32_t>(handle, m + 1);
+  auto d_indices = raft::make_device_vector<int, uint32_t>(handle, nnz);
+  auto d_values  = raft::make_device_vector<ValueType, uint32_t>(handle, nnz);
+  raft::update_device(d_indptr.data_handle(), h_indptr.data(), m + 1, stream);
+  raft::update_device(d_indices.data_handle(), h_indices.data(), nnz, stream);
+  raft::update_device(d_values.data_handle(), h_values.data(), nnz, stream);
+
+  auto csr_structure = raft::make_device_compressed_structure_view<int, int, int>(
+    d_indptr.data_handle(), d_indices.data_handle(), m, n, nnz);
+  auto csr_matrix = raft::make_device_csr_matrix_view<const ValueType, int, int, int>(
+    d_values.data_handle(), csr_structure);
+
+  sparse_lanczos_svd_config<ValueType> config;
+  config.n_components               = k;
+  config.ncv                        = ncv;
+  config.tolerance                  = tolerance;
+  config.max_iterations             = max_iterations;
+  config.seed                       = 2026;
+  config.use_mgs2_orthogonalization = use_mgs2;
+
+  auto S  = raft::make_device_vector<ValueType, uint32_t>(handle, k);
+  auto U  = raft::make_device_matrix<ValueType, uint32_t, raft::col_major>(handle, m, k);
+  auto Vt = raft::make_device_matrix<ValueType, uint32_t, raft::col_major>(handle, k, n);
+
+  sparse_lanczos_svd(handle, config, csr_matrix, S.view(), U.view(), Vt.view());
+
+  std::vector<ValueType> h_S(k);
+  std::vector<ValueType> h_U(static_cast<std::size_t>(m) * k);
+  std::vector<ValueType> h_Vt(static_cast<std::size_t>(k) * n);
+  raft::update_host(h_S.data(), S.data_handle(), k, stream);
+  raft::update_host(h_U.data(), U.data_handle(), h_U.size(), stream);
+  raft::update_host(h_Vt.data(), Vt.data_handle(), h_Vt.size(), stream);
+  resource::sync_stream(handle, stream);
+
+  for (int i = 0; i < k; ++i) {
+    ASSERT_NEAR(static_cast<double>(h_S[i]), diagonal[i], singular_value_tol);
+  }
+  check_orthonormal_columns(h_U, m, k, orthog_tol);
+  check_orthonormal_vt_rows(h_Vt, k, n, orthog_tol);
+
+  auto diag_at = [&](int i) {
+    return (i < min_dim && i < static_cast<int>(diagonal.size()))
+             ? static_cast<ValueType>(diagonal[i])
+             : ValueType(0);
+  };
+
+  for (int comp = 0; comp < k; ++comp) {
+    double left_residual_sq  = 0;
+    double right_residual_sq = 0;
+    for (int row = 0; row < m; ++row) {
+      ValueType av =
+        row < n ? diag_at(row) * h_Vt[static_cast<std::size_t>(row) * k + comp] : ValueType(0);
+      ValueType su = h_S[comp] * h_U[static_cast<std::size_t>(comp) * m + row];
+      double diff  = static_cast<double>(av - su);
+      left_residual_sq += diff * diff;
+    }
+    for (int col = 0; col < n; ++col) {
+      ValueType atu =
+        col < m ? diag_at(col) * h_U[static_cast<std::size_t>(comp) * m + col] : ValueType(0);
+      ValueType sv = h_S[comp] * h_Vt[static_cast<std::size_t>(col) * k + comp];
+      double diff  = static_cast<double>(atu - sv);
+      right_residual_sq += diff * diff;
+    }
+    ASSERT_LT(std::sqrt(left_residual_sq), residual_tol);
+    ASSERT_LT(std::sqrt(right_residual_sq), residual_tol);
+  }
+}
+
+TEST(LanczosHardeningF, RepeatedSpectrum)
+{
+  run_diagonal_hardening_case<float>(64,
+                                     40,
+                                     {10.0, 10.0, 10.0, 9.0, 9.0, 8.0, 7.0, 6.0, 1.0, 0.5},
+                                     6,
+                                     18,
+                                     120,
+                                     1e-5f,
+                                     2e-3,
+                                     2e-3,
+                                     2e-2);
+}
+
+TEST(LanczosHardeningF, RankDeficientTall)
+{
+  run_diagonal_hardening_case<float>(
+    96, 28, {12.0, 8.0, 6.0, 3.0, 1.0, 0.0, 0.0, 0.0}, 4, 12, 80, 1e-5f, 2e-3, 2e-3, 2e-2);
+}
+
+TEST(LanczosHardeningF, WideMatrix)
+{
+  run_diagonal_hardening_case<float>(
+    24, 96, {12.0, 11.5, 8.0, 4.0, 2.0, 1.0, 0.5}, 4, 12, 80, 1e-5f, 2e-3, 2e-3, 2e-2);
+}
+
+TEST(LanczosHardeningD, RankDeficientTall)
+{
+  run_diagonal_hardening_case<double>(
+    96, 28, {12.0, 8.0, 6.0, 3.0, 1.0, 0.0, 0.0, 0.0}, 4, 12, 80, 1e-10, 1e-8, 1e-10, 1e-5);
+}
+
+TEST(LanczosHardeningF, ReportsNonConvergence)
+{
+  EXPECT_THROW(run_diagonal_hardening_case<float>(
+                 64, 40, {10.0, 9.0, 8.0, 7.0, 6.0, 5.0}, 4, 6, 1, 0.0f, 2e-3, 2e-3, 2e-2),
+               raft::logic_error);
+}
+
+template <typename ValueType>
+void check_sparse_residuals(std::vector<int> const& indptr,
+                            std::vector<int> const& indices,
+                            std::vector<ValueType> const& values,
+                            std::vector<ValueType> const& S,
+                            std::vector<ValueType> const& U,
+                            std::vector<ValueType> const& Vt,
+                            int m,
+                            int n,
+                            int k,
+                            double residual_tol)
+{
+  for (int comp = 0; comp < k; ++comp) {
+    std::vector<double> av(m, 0.0);
+    std::vector<double> atu(n, 0.0);
+    for (int row = 0; row < m; ++row) {
+      for (int nz = indptr[row]; nz < indptr[row + 1]; ++nz) {
+        int col      = indices[nz];
+        double value = static_cast<double>(values[nz]);
+        av[row] += value * static_cast<double>(Vt[static_cast<std::size_t>(col) * k + comp]);
+        atu[col] += value * static_cast<double>(U[static_cast<std::size_t>(comp) * m + row]);
+      }
+    }
+
+    double left_residual_sq  = 0.0;
+    double right_residual_sq = 0.0;
+    for (int row = 0; row < m; ++row) {
+      double su = static_cast<double>(S[comp]) *
+                  static_cast<double>(U[static_cast<std::size_t>(comp) * m + row]);
+      double diff = av[row] - su;
+      left_residual_sq += diff * diff;
+    }
+    for (int col = 0; col < n; ++col) {
+      double sv = static_cast<double>(S[comp]) *
+                  static_cast<double>(Vt[static_cast<std::size_t>(col) * k + comp]);
+      double diff = atu[col] - sv;
+      right_residual_sq += diff * diff;
+    }
+    ASSERT_LT(std::sqrt(left_residual_sq), residual_tol);
+    ASSERT_LT(std::sqrt(right_residual_sq), residual_tol);
+  }
+}
+
+template <typename ValueType>
+void run_rotated_block_hardening_case(bool use_mgs2 = false)
+{
+  constexpr int m                = 64;
+  constexpr int n                = 48;
+  constexpr int k                = 8;
+  std::vector<double> expected_s = {10.0, 9.9995, 9.999, 9.5, 9.4995, 9.0, 8.5, 8.0};
+  std::vector<double> all_s      = expected_s;
+  all_s.push_back(0.75);
+  all_s.push_back(0.25);
+
+  std::vector<int> h_indptr(m + 1);
+  std::vector<int> h_indices;
+  std::vector<ValueType> h_values;
+
+  auto add_block = [&](int row0, int col0, double s0, double s1, double theta, double phi) {
+    double cu  = std::cos(theta);
+    double su  = std::sin(theta);
+    double cv  = std::cos(phi);
+    double sv  = std::sin(phi);
+    double a00 = cu * s0 * cv + su * s1 * sv;
+    double a01 = cu * s0 * sv - su * s1 * cv;
+    double a10 = su * s0 * cv - cu * s1 * sv;
+    double a11 = su * s0 * sv + cu * s1 * cv;
+    h_indices.push_back(col0);
+    h_values.push_back(static_cast<ValueType>(a00));
+    h_indices.push_back(col0 + 1);
+    h_values.push_back(static_cast<ValueType>(a01));
+    h_indptr[row0 + 1] = static_cast<int>(h_values.size());
+    h_indices.push_back(col0);
+    h_values.push_back(static_cast<ValueType>(a10));
+    h_indices.push_back(col0 + 1);
+    h_values.push_back(static_cast<ValueType>(a11));
+    h_indptr[row0 + 2] = static_cast<int>(h_values.size());
+  };
+
+  for (int block = 0; block < static_cast<int>(all_s.size() / 2); ++block) {
+    int row0       = 2 * block;
+    int col0       = 2 * block;
+    h_indptr[row0] = static_cast<int>(h_values.size());
+    add_block(
+      row0, col0, all_s[2 * block], all_s[2 * block + 1], 0.17 + 0.11 * block, 0.31 + 0.07 * block);
+  }
+  for (int row = static_cast<int>(all_s.size()); row <= m; ++row) {
+    h_indptr[row] = static_cast<int>(h_values.size());
+  }
+
+  raft::resources handle;
+  auto stream    = resource::get_cuda_stream(handle);
+  auto nnz       = static_cast<int>(h_values.size());
+  auto d_indptr  = raft::make_device_vector<int, uint32_t>(handle, m + 1);
+  auto d_indices = raft::make_device_vector<int, uint32_t>(handle, nnz);
+  auto d_values  = raft::make_device_vector<ValueType, uint32_t>(handle, nnz);
+  raft::update_device(d_indptr.data_handle(), h_indptr.data(), m + 1, stream);
+  raft::update_device(d_indices.data_handle(), h_indices.data(), nnz, stream);
+  raft::update_device(d_values.data_handle(), h_values.data(), nnz, stream);
+
+  auto csr_structure = raft::make_device_compressed_structure_view<int, int, int>(
+    d_indptr.data_handle(), d_indices.data_handle(), m, n, nnz);
+  auto csr_matrix = raft::make_device_csr_matrix_view<const ValueType, int, int, int>(
+    d_values.data_handle(), csr_structure);
+
+  sparse_lanczos_svd_config<ValueType> config;
+  config.n_components   = k;
+  config.ncv            = 20;
+  config.tolerance      = std::is_same_v<ValueType, float> ? ValueType(1e-5) : ValueType(1e-10);
+  config.max_iterations = 120;
+  config.seed           = 2027;
+  config.use_mgs2_orthogonalization = use_mgs2;
+
+  auto S  = raft::make_device_vector<ValueType, uint32_t>(handle, k);
+  auto U  = raft::make_device_matrix<ValueType, uint32_t, raft::col_major>(handle, m, k);
+  auto Vt = raft::make_device_matrix<ValueType, uint32_t, raft::col_major>(handle, k, n);
+  sparse_lanczos_svd(handle, config, csr_matrix, S.view(), U.view(), Vt.view());
+
+  std::vector<ValueType> h_S(k);
+  std::vector<ValueType> h_U(static_cast<std::size_t>(m) * k);
+  std::vector<ValueType> h_Vt(static_cast<std::size_t>(k) * n);
+  raft::update_host(h_S.data(), S.data_handle(), k, stream);
+  raft::update_host(h_U.data(), U.data_handle(), h_U.size(), stream);
+  raft::update_host(h_Vt.data(), Vt.data_handle(), h_Vt.size(), stream);
+  resource::sync_stream(handle, stream);
+
+  double sv_tol       = std::is_same_v<ValueType, float> ? 2e-3 : 1e-8;
+  double orthog_tol   = std::is_same_v<ValueType, float> ? 2e-3 : 1e-10;
+  double residual_tol = std::is_same_v<ValueType, float> ? 2e-2 : 1e-5;
+  for (int i = 0; i < k; ++i) {
+    ASSERT_NEAR(static_cast<double>(h_S[i]), expected_s[i], sv_tol);
+  }
+  check_orthonormal_columns(h_U, m, k, orthog_tol);
+  check_orthonormal_vt_rows(h_Vt, k, n, orthog_tol);
+  check_sparse_residuals(h_indptr, h_indices, h_values, h_S, h_U, h_Vt, m, n, k, residual_tol);
+}
+
+TEST(LanczosHardeningF, RotatedClusteredSpectrum) { run_rotated_block_hardening_case<float>(); }
+
+TEST(LanczosHardeningF, RotatedClusteredSpectrumMgs2)
+{
+  run_rotated_block_hardening_case<float>(true);
+}
+
+TEST(LanczosHardeningD, RotatedClusteredSpectrum) { run_rotated_block_hardening_case<double>(); }
+
+}  // namespace raft::sparse::solver
diff --git a/docs/source/cpp_api/sparse_solver.rst b/docs/source/cpp_api/sparse_solver.rst
index e8abc37719..1285d06795 100644
--- a/docs/source/cpp_api/sparse_solver.rst
+++ b/docs/source/cpp_api/sparse_solver.rst
@@ -13,3 +13,11 @@ Sparse Randomized SVD
     :project: RAFT
     :members:
     :content-only:
+
+Sparse Lanczos SVD
+------------------
+
+.. doxygengroup:: sparse_lanczos_svd
+    :project: RAFT
+    :members:
+    :content-only:
diff --git a/python/pylibraft/benchmarks/sparse_svds.py b/python/pylibraft/benchmarks/sparse_svds.py
new file mode 100644
index 0000000000..f065df2990
--- /dev/null
+++ b/python/pylibraft/benchmarks/sparse_svds.py
@@ -0,0 +1,477 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import argparse
+import importlib.util
+import math
+import time
+from pathlib import Path
+
+import cupy as cp
+import cupyx.scipy.sparse as cupy_sparse
+import cupyx.scipy.sparse.linalg as cupy_sparse_linalg
+import numpy as np
+import scipy.sparse
+import scipy.sparse.linalg
+from anndata import read_h5ad
+
+CSR_FILE_MAGIC = 0x3152534354464152
+
+
+def _repo_root() -> Path:
+    return Path(__file__).resolve().parents[3]
+
+
+def _default_data_path() -> Path:
+    root = _repo_root()
+    candidates = [
+        root.parent / "rapids-singlecell-notebooks/h5/pca.h5ad",
+        root.parent / "rapids_singlecell/data/pbmc3k_raw.h5ad",
+        root.parent / "scanpy/src/scanpy/datasets/10x_pbmc68k_reduced.h5ad",
+    ]
+    for path in candidates:
+        if path.exists():
+            return path
+    return candidates[0]
+
+
+def _import_from_path(module_name: str, path: Path):
+    spec = importlib.util.spec_from_file_location(module_name, path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Could not load {module_name} from {path}")
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+def _load_rsc_modules(rsc_source: Path):
+    sparse_pca = rsc_source / "src/rapids_singlecell/preprocessing/_sparse_pca"
+    lanczos = _import_from_path(
+        "rsc_svd_lanczos", sparse_pca / "_svd_lanczos.py"
+    )
+    randomized = _import_from_path(
+        "rsc_block_lanczos", sparse_pca / "_block_lanczos.py"
+    )
+    return lanczos.lanczos_svd, randomized.randomized_svd
+
+
+def _load_raft_svds():
+    try:
+        from pylibraft.sparse.linalg import svds
+    except Exception as exc:  # noqa: BLE001
+        raise RuntimeError(
+            f"Could not import pylibraft.sparse.linalg.svds: {exc}"
+        ) from exc
+    return svds
+
+
+def _read_h5ad_csr(path: Path, *, layer: str | None, dtype: np.dtype):
+    adata = read_h5ad(path)
+    X = adata.layers[layer] if layer is not None else adata.X
+    if scipy.sparse.issparse(X):
+        X = X.tocsr()
+    else:
+        X = scipy.sparse.csr_matrix(X)
+    X.sort_indices()
+    if X.dtype != dtype:
+        X = X.astype(dtype)
+    return X
+
+
+def _slice_matrix(X, *, max_rows: int | None, max_cols: int | None):
+    if max_rows is not None:
+        X = X[:max_rows, :]
+    if max_cols is not None:
+        X = X[:, :max_cols]
+    X = X.tocsr()
+    X.sort_indices()
+    return X
+
+
+def _to_cupy_csr(X):
+    return cupy_sparse.csr_matrix(
+        (cp.asarray(X.data), cp.asarray(X.indices), cp.asarray(X.indptr)),
+        shape=X.shape,
+    )
+
+
+def _export_csr_bin(X, path: Path):
+    if X.dtype != np.float32:
+        raise TypeError(
+            f"CSR binary export currently supports float32 values, got {X.dtype}"
+        )
+    dtype_code = 1
+
+    int32_max = np.iinfo(np.int32).max
+    if X.nnz > int32_max:
+        raise OverflowError(f"nnz={X.nnz} exceeds int32 max")
+    if X.shape[1] > int32_max:
+        raise OverflowError(f"n_cols={X.shape[1]} exceeds int32 max")
+
+    path.parent.mkdir(parents=True, exist_ok=True)
+    header_dtype = np.dtype(
+        [
+            ("magic", "<u8"),
+            ("version", "<u4"),
+            ("dtype", "<u4"),
+            ("rows", "<i8"),
+            ("cols", "<i8"),
+            ("nnz", "<i8"),
+        ]
+    )
+    header = np.array(
+        [(CSR_FILE_MAGIC, 1, dtype_code, X.shape[0], X.shape[1], X.nnz)],
+        dtype=header_dtype,
+    )
+    with path.open("wb") as f:
+        header.tofile(f)
+        np.asarray(X.indptr, dtype=np.int32).tofile(f)
+        np.asarray(X.indices, dtype=np.int32).tofile(f)
+        np.asarray(X.data, dtype=X.dtype).tofile(f)
+
+
+def _time_cuda(fn, *, warmups: int, repeats: int):
+    for _ in range(warmups):
+        result = fn()
+        cp.cuda.Stream.null.synchronize()
+
+    times_ms = []
+    result = None
+    for _ in range(repeats):
+        start = cp.cuda.Event()
+        stop = cp.cuda.Event()
+        start.record()
+        result = fn()
+        stop.record()
+        stop.synchronize()
+        times_ms.append(cp.cuda.get_elapsed_time(start, stop))
+    return result, times_ms
+
+
+def _time_cpu(fn, *, warmups: int, repeats: int):
+    for _ in range(warmups):
+        fn()
+
+    times_ms = []
+    result = None
+    for _ in range(repeats):
+        start = time.perf_counter()
+        result = fn()
+        times_ms.append((time.perf_counter() - start) * 1000)
+    return result, times_ms
+
+
+def _orthogonality_error(Q, *, rows_are_vectors: bool):
+    if Q is None:
+        return math.nan
+    Q_metric = Q.astype(cp.float64, copy=False)
+    k = Q.shape[0] if rows_are_vectors else Q.shape[1]
+    gram = Q_metric @ Q_metric.T if rows_are_vectors else Q_metric.T @ Q_metric
+    eye = cp.eye(k, dtype=Q_metric.dtype)
+    return float(cp.linalg.norm(gram - eye).get())
+
+
+def _residual_error(A, U, S, Vt):
+    if U is None or Vt is None:
+        return math.nan
+    AV = A @ Vt.T
+    US = U * S[cp.newaxis, :]
+    return float((cp.linalg.norm(AV - US) / cp.maximum(S[0], 1e-30)).get())
+
+
+def _orthogonality_error_cpu(Q, *, rows_are_vectors: bool):
+    if Q is None:
+        return math.nan
+    Q_metric = Q.astype(np.float64, copy=False)
+    k = Q.shape[0] if rows_are_vectors else Q.shape[1]
+    gram = Q_metric @ Q_metric.T if rows_are_vectors else Q_metric.T @ Q_metric
+    eye = np.eye(k, dtype=Q_metric.dtype)
+    return float(np.linalg.norm(gram - eye))
+
+
+def _residual_error_cpu(A, U, S, Vt):
+    if U is None or Vt is None:
+        return math.nan
+    AV = A @ Vt.T
+    US = U * S[np.newaxis, :]
+    return float(np.linalg.norm(AV - US) / max(float(S[0]), 1e-30))
+
+
+def _sort_svd_desc(U, S, Vt):
+    order = cp.argsort(S)[::-1]
+    U_sorted = U[:, order] if U is not None else None
+    Vt_sorted = Vt[order, :] if Vt is not None else None
+    return U_sorted, S[order], Vt_sorted
+
+
+def _sort_svd_desc_cpu(U, S, Vt):
+    order = np.argsort(S)[::-1]
+    U_sorted = U[:, order] if U is not None else None
+    Vt_sorted = Vt[order, :] if Vt is not None else None
+    return U_sorted, S[order], Vt_sorted
+
+
+def _cupy_svds(A, args, ncv):
+    cp.random.seed(args.seed)
+    U, S, Vt = cupy_sparse_linalg.svds(
+        A,
+        k=args.k,
+        ncv=ncv,
+        tol=args.tol,
+        maxiter=args.maxiter,
+        return_singular_vectors=True,
+    )
+    return _sort_svd_desc(U, S, Vt)
+
+
+def _scipy_svds(A, args, *, solver: str):
+    U, S, Vt = scipy.sparse.linalg.svds(
+        A,
+        k=args.k,
+        tol=args.tol,
+        maxiter=args.maxiter,
+        return_singular_vectors=True,
+        solver=solver,
+        random_state=args.seed,
+    )
+    return _sort_svd_desc_cpu(U, S, Vt)
+
+
+def _run_solver(name, fn, A, args, *, compute_metrics: bool):
+    def call():
+        return fn()
+
+    (U, S, Vt), times_ms = _time_cuda(
+        call, warmups=args.warmups, repeats=args.repeats
+    )
+    cp.cuda.Stream.null.synchronize()
+    s0 = float(S[0].get())
+    s_head = cp.asnumpy(S[: min(5, S.shape[0])])
+    row = {
+        "solver": name,
+        "median_ms": float(np.median(times_ms)),
+        "mean_ms": float(np.mean(times_ms)),
+        "min_ms": float(np.min(times_ms)),
+        "s0": s0,
+        "s_head": s_head,
+        "u_orth": math.nan,
+        "vt_orth": math.nan,
+        "residual": math.nan,
+    }
+    if compute_metrics:
+        row["u_orth"] = _orthogonality_error(U, rows_are_vectors=False)
+        row["vt_orth"] = _orthogonality_error(Vt, rows_are_vectors=True)
+        row["residual"] = _residual_error(A, U, S, Vt)
+    return row
+
+
+def _run_solver_cpu(name, fn, A, args, *, compute_metrics: bool):
+    (U, S, Vt), times_ms = _time_cpu(
+        fn, warmups=args.warmups, repeats=args.repeats
+    )
+    s0 = float(S[0])
+    s_head = np.asarray(S[: min(5, S.shape[0])])
+    row = {
+        "solver": name,
+        "median_ms": float(np.median(times_ms)),
+        "mean_ms": float(np.mean(times_ms)),
+        "min_ms": float(np.min(times_ms)),
+        "s0": s0,
+        "s_head": s_head,
+        "u_orth": math.nan,
+        "vt_orth": math.nan,
+        "residual": math.nan,
+    }
+    if compute_metrics:
+        row["u_orth"] = _orthogonality_error_cpu(U, rows_are_vectors=False)
+        row["vt_orth"] = _orthogonality_error_cpu(Vt, rows_are_vectors=True)
+        row["residual"] = _residual_error_cpu(A, U, S, Vt)
+    return row
+
+
+def _format_row(row):
+    s_head = np.array2string(row["s_head"], precision=4, separator=", ")
+    return (
+        f"{row['solver']:<22} median={row['median_ms']:>9.3f} ms "
+        f"mean={row['mean_ms']:>9.3f} ms min={row['min_ms']:>9.3f} ms "
+        f"s0={row['s0']:.6g} "
+        f"u_orth={row['u_orth']:.3e} vt_orth={row['vt_orth']:.3e} "
+        f"residual={row['residual']:.3e} s[:5]={s_head}"
+    )
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Benchmark sparse SVD solvers on CSR/H5AD inputs."
+    )
+    parser.add_argument("--data", type=Path, default=_default_data_path())
+    parser.add_argument("--layer", default=None)
+    parser.add_argument(
+        "--rsc-source",
+        type=Path,
+        default=_repo_root().parent / "rapids_singlecell",
+    )
+    parser.add_argument("--k", type=int, default=50)
+    parser.add_argument("--ncv", type=int, default=None)
+    parser.add_argument("--n-oversamples", type=int, default=10)
+    parser.add_argument("--n-power-iters", type=int, default=2)
+    parser.add_argument("--maxiter", type=int, default=100)
+    parser.add_argument("--tol", type=float, default=1e-4)
+    parser.add_argument("--seed", type=int, default=1234)
+    parser.add_argument(
+        "--dtype", choices=("float32", "float64"), default="float32"
+    )
+    parser.add_argument("--max-rows", type=int, default=None)
+    parser.add_argument("--max-cols", type=int, default=None)
+    parser.add_argument("--export-csr-bin", type=Path, default=None)
+    parser.add_argument("--export-only", action="store_true")
+    parser.add_argument("--warmups", type=int, default=1)
+    parser.add_argument("--repeats", type=int, default=3)
+    parser.add_argument("--metrics", action="store_true")
+    parser.add_argument(
+        "--solvers",
+        default="raft_lanczos,raft_randomized",
+        help="Comma-separated subset of raft_lanczos, raft_lanczos_mgs2, "
+        "raft_randomized, rsc_lanczos, rsc_randomized, cupy_svds, "
+        "scipy_propack, scipy_lobpcg.",
+    )
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+    dtype = np.dtype(args.dtype)
+    ncv = args.ncv
+    selected = {
+        name.strip() for name in args.solvers.split(",") if name.strip()
+    }
+
+    X = _read_h5ad_csr(args.data, layer=args.layer, dtype=dtype)
+    X = _slice_matrix(X, max_rows=args.max_rows, max_cols=args.max_cols)
+    if args.export_csr_bin is not None:
+        _export_csr_bin(X, args.export_csr_bin)
+        print(
+            f"exported CSR binary to {args.export_csr_bin} "
+            f"shape={X.shape} nnz={X.nnz} dtype={X.dtype}"
+        )
+        if args.export_only:
+            return
+
+    needs_gpu = any(
+        name == "cupy_svds"
+        or name.startswith("raft_")
+        or name.startswith("rsc_")
+        for name in selected
+    )
+    A = _to_cupy_csr(X) if needs_gpu else None
+
+    print(
+        f"data={args.data} shape={X.shape} nnz={X.nnz} density={X.nnz / (X.shape[0] * X.shape[1]):.6g} "
+        f"dtype={X.dtype} k={args.k} ncv={ncv if ncv is not None else 'default'} repeats={args.repeats}"
+    )
+
+    needs_raft = any(name.startswith("raft_") for name in selected)
+    needs_rsc = any(name.startswith("rsc_") for name in selected)
+
+    raft_svds = _load_raft_svds() if needs_raft else None
+    rsc_lanczos, rsc_randomized = (
+        _load_rsc_modules(args.rsc_source) if needs_rsc else (None, None)
+    )
+
+    gpu_solvers = {}
+    cpu_solvers = {
+        "scipy_propack": lambda: _scipy_svds(X, args, solver="propack"),
+        "scipy_lobpcg": lambda: _scipy_svds(X, args, solver="lobpcg"),
+    }
+    if A is not None:
+        gpu_solvers["cupy_svds"] = lambda: _cupy_svds(A, args, ncv)
+    if raft_svds is not None:
+        gpu_solvers["raft_lanczos"] = lambda: raft_svds(
+            A,
+            k=args.k,
+            solver="lanczos",
+            ncv=ncv,
+            tol=args.tol,
+            maxiter=args.maxiter,
+            seed=args.seed,
+            orthogonalization="cgs2",
+        )
+        gpu_solvers["raft_lanczos_mgs2"] = lambda: raft_svds(
+            A,
+            k=args.k,
+            solver="lanczos",
+            ncv=ncv,
+            tol=args.tol,
+            maxiter=args.maxiter,
+            seed=args.seed,
+            orthogonalization="mgs2",
+        )
+        gpu_solvers["raft_randomized"] = lambda: raft_svds(
+            A,
+            k=args.k,
+            solver="randomized",
+            n_oversamples=args.n_oversamples,
+            n_power_iters=args.n_power_iters,
+            seed=args.seed,
+        )
+
+    if rsc_lanczos is not None:
+        gpu_solvers["rsc_lanczos"] = lambda: rsc_lanczos(
+            A,
+            k=args.k,
+            ncv=ncv,
+            tol=args.tol,
+            max_iter=args.maxiter,
+            random_state=args.seed,
+            refine_results=True,
+        )
+        gpu_solvers["rsc_randomized"] = lambda: rsc_randomized(
+            A,
+            k=args.k,
+            n_oversamples=args.n_oversamples,
+            n_iter=args.n_power_iters,
+            random_state=args.seed,
+        )
+
+    solvers = {**gpu_solvers, **cpu_solvers}
+    unknown = selected.difference(solvers)
+    if unknown:
+        raise ValueError(f"Unknown or unavailable solvers: {sorted(unknown)}")
+
+    solver_order = [
+        "cupy_svds",
+        "raft_lanczos",
+        "raft_lanczos_mgs2",
+        "raft_randomized",
+        "rsc_lanczos",
+        "rsc_randomized",
+        "scipy_propack",
+        "scipy_lobpcg",
+    ]
+    for name in [name for name in solver_order if name in selected]:
+        try:
+            if name in gpu_solvers:
+                row = _run_solver(
+                    name,
+                    gpu_solvers[name],
+                    A,
+                    args,
+                    compute_metrics=args.metrics,
+                )
+            else:
+                row = _run_solver_cpu(
+                    name,
+                    cpu_solvers[name],
+                    X,
+                    args,
+                    compute_metrics=args.metrics,
+                )
+            print(_format_row(row))
+        except Exception as exc:  # noqa: BLE001
+            print(f"{name:<22} failed: {type(exc).__name__}: {exc}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/pylibraft/pylibraft/sparse/linalg/svds.pyx b/python/pylibraft/pylibraft/sparse/linalg/svds.pyx
index b2bbca41e2..6463da9b17 100644
--- a/python/pylibraft/pylibraft/sparse/linalg/svds.pyx
+++ b/python/pylibraft/pylibraft/sparse/linalg/svds.pyx
@@ -15,6 +15,7 @@ import numpy as np
 
 from cython.operator cimport dereference as deref
 from libc.stdint cimport int64_t, uint32_t, uint64_t, uintptr_t
+from libcpp cimport bool
 
 from pylibraft.common import Handle, cai_wrapper, device_ndarray
 from pylibraft.common.handle import auto_sync_handle
@@ -31,7 +32,7 @@ from pylibraft.common.cpp.optional cimport optional
 from pylibraft.common.handle cimport device_resources
 
 
-cdef extern from "raft/sparse/solver/svds_config.hpp" \
+cdef extern from "raft/sparse/solver/solver_types.hpp" \
         namespace "raft::sparse::solver" nogil:
 
     cdef cppclass sparse_svd_config[ValueTypeT]:
@@ -40,6 +41,14 @@ cdef extern from "raft/sparse/solver/svds_config.hpp" \
         int n_power_iters
         optional[uint64_t] seed
 
+    cdef cppclass sparse_lanczos_svd_config[ValueTypeT]:
+        int n_components
+        int ncv
+        ValueTypeT tolerance
+        int max_iterations
+        optional[uint64_t] seed
+        bool use_mgs2_orthogonalization
+
 
 cdef extern from "raft_runtime/solver/randomized_svds.hpp" \
         namespace "raft::runtime::solver" nogil:
@@ -69,12 +78,42 @@ cdef extern from "raft_runtime/solver/randomized_svds.hpp" \
         optional[device_matrix_view[double, uint32_t, col_major]] Vt) except +
 
 
+cdef extern from "raft_runtime/solver/lanczos_svds.hpp" \
+        namespace "raft::runtime::solver" nogil:
+
+    cdef void sparse_lanczos_svd_f \
+            "raft::runtime::solver::sparse_lanczos_svd"(
+        const device_resources &handle,
+        const sparse_lanczos_svd_config[float] &config,
+        device_vector_view[int, uint32_t] indptr,
+        device_vector_view[int, uint32_t] indices,
+        device_vector_view[float, uint32_t] data,
+        int n_rows, int n_cols, int nnz,
+        device_vector_view[float, uint32_t] singular_values,
+        optional[device_matrix_view[float, uint32_t, col_major]] U,
+        optional[device_matrix_view[float, uint32_t, col_major]] Vt) except +
+
+    cdef void sparse_lanczos_svd_d \
+            "raft::runtime::solver::sparse_lanczos_svd"(
+        const device_resources &handle,
+        const sparse_lanczos_svd_config[double] &config,
+        device_vector_view[int, uint32_t] indptr,
+        device_vector_view[int, uint32_t] indices,
+        device_vector_view[double, uint32_t] data,
+        int n_rows, int n_cols, int nnz,
+        device_vector_view[double, uint32_t] singular_values,
+        optional[device_matrix_view[double, uint32_t, col_major]] U,
+        optional[device_matrix_view[double, uint32_t, col_major]] Vt) except +
+
+
 @auto_sync_handle
 def svds(A, k=6, n_oversamples=10, n_power_iters=2,
-         seed=None, return_singular_vectors=True, handle=None):
+         seed=None, return_singular_vectors=True, handle=None,
+         solver="randomized", ncv=None, tol=1e-4, maxiter=None,
+         orthogonalization="cgs2"):
     """
     Compute the largest ``k`` singular values and corresponding singular
-    vectors of a sparse matrix using randomized SVD.
+    vectors of a sparse matrix.
 
     Computes the truncated SVD: ``A ~ U @ diag(S) @ Vt``.
 
@@ -86,10 +125,11 @@ def svds(A, k=6, n_oversamples=10, n_power_iters=2,
             ``1 <= k < min(m, n)``. Default 6.
         n_oversamples (int): Number of extra random vectors for better
             approximation. Total subspace dimension is ``k + n_oversamples``.
-            Default 10.
+            Used only when ``solver="randomized"``. Default 10.
         n_power_iters (int): Number of power iteration passes. More
             iterations improve accuracy for matrices with slowly decaying
-            singular values. Default 2.
+            singular values. Used only when ``solver="randomized"``.
+            Default 2.
         seed (int or None): Random seed for reproducibility. If ``None``,
             a non-deterministic seed is used.
         return_singular_vectors (bool or {{"u", "vh"}}): Controls which
@@ -101,9 +141,28 @@ def svds(A, k=6, n_oversamples=10, n_power_iters=2,
             - ``"u"``: skip ``Vt``, return ``(U, S, None)``.
             - ``"vh"``: skip ``U``, return ``(None, S, Vt)``.
 
-            Skipping a side avoids the corresponding output buffer and
-            final matrix multiplication.
+            Skipping a side avoids the corresponding output buffer. Depending
+            on the selected solver, some intermediate vector products may
+            still be required for numerical refinement.
         handle: RAFT resource handle. If ``None``, a default is created.
+        solver ({{ "randomized", "lanczos" }}): Solver to use. The default
+            ``"randomized"`` preserves the existing approximate randomized SVD
+            behavior. ``"lanczos"`` uses Lanczos bidiagonalization and is the
+            higher-accuracy, ARPACK-like sparse SVD path.
+        ncv (int or None): Number of Lanczos vectors. Used only when
+            ``solver="lanczos"``. If ``None``, a default is selected. Larger
+            values can improve convergence margin and orthogonality for
+            difficult spectra, but increase sparse matrix-vector work.
+        tol (float): Lanczos residual tolerance. Used only when
+            ``solver="lanczos"``.
+        maxiter (int or None): Maximum Lanczos restart iterations. Used only
+            when ``solver="lanczos"``. If ``None``, defaults to 100. The
+            Lanczos solver raises an error if all requested singular triplets
+            do not converge within this limit.
+        orthogonalization ({{ "cgs2", "mgs2" }}): Lanczos reorthogonalization
+            method. ``"cgs2"`` is the GPU-efficient default. ``"mgs2"`` is
+            more launch-heavy and intended as an alternate path for difficult
+            float32 spectra.
 
     Returns:
         tuple:
@@ -116,9 +175,11 @@ def svds(A, k=6, n_oversamples=10, n_power_iters=2,
         :func:`scipy.sparse.linalg.svds`
 
     .. note::
-        This function uses randomized SVD (Halko et al. 2009) with
-        CholeskyQR2 orthogonalization (Tomas et al. 2024) for efficient
-        GPU execution.
+        With ``solver="randomized"``, this function uses randomized SVD
+        (Halko et al. 2009) with CholeskyQR2 orthogonalization (Tomas et al.
+        2024). With ``solver="lanczos"``, it uses implicitly restarted
+        Lanczos bidiagonalization with full reorthogonalization and final
+        ``A @ V`` post-refinement of the returned left singular vectors.
 
     """
 
@@ -136,6 +197,17 @@ def svds(A, k=6, n_oversamples=10, n_power_iters=2,
             "Use A.tocsr() to convert." % type(A).__name__
         )
 
+    if solver not in ("randomized", "lanczos"):
+        raise ValueError(
+            "solver must be 'randomized' or 'lanczos', got %r" % (solver,)
+        )
+
+    if orthogonalization not in ("cgs2", "mgs2"):
+        raise ValueError(
+            "orthogonalization must be 'cgs2' or 'mgs2', got %r"
+            % (orthogonalization,)
+        )
+
     if not (
         return_singular_vectors is True
         or return_singular_vectors is False
@@ -157,19 +229,17 @@ def svds(A, k=6, n_oversamples=10, n_power_iters=2,
         )
 
     # Extract CSR arrays and ensure int32 indices. raft's runtime layer takes
-    # `int` (int32) for nnz / indptr / indices, so any overflow on conversion
-    # cannot be recovered downstream — error out before the cast.
-    # TODO: parameterize the runtime FUNC_DECL on IndexType (mirror lanczos's
-    # int / int64_t overloads) so int64 indptr/indices are accepted natively
-    # — including mixed int64-indptr / int32-indices for matrices where
-    # nnz > INT32_MAX but column count fits in int32.
+    # `int` (int32) for nnz / indptr / indices, so overflow on conversion
+    # cannot be recovered downstream; error out before the cast.
+    # The runtime API is currently int32-only. Native int64 support should
+    # mirror the sparse Lanczos eigensolver overloads when it is added.
     indptr = A.indptr
     indices = A.indices
     data = A.data
 
     INT32_MAX = (1 << 31) - 1
 
-    # Validate indptr and indices dtypes independently — CuPy CSR matrices
+    # Validate indptr and indices dtypes independently. CuPy CSR matrices
     # normally have matching dtypes, but don't assume it.
     for name, arr in (("indptr", indptr), ("indices", indices)):
         if arr.dtype != np.int32 and arr.dtype != np.int64:
@@ -237,55 +307,98 @@ def svds(A, k=6, n_oversamples=10, n_power_iters=2,
 
     cdef sparse_svd_config[float] cfg_float
     cdef sparse_svd_config[double] cfg_double
+    cdef sparse_lanczos_svd_config[float] lanczos_cfg_float
+    cdef sparse_lanczos_svd_config[double] lanczos_cfg_double
     cdef optional[device_matrix_view[float, uint32_t, col_major]] U_opt_f
     cdef optional[device_matrix_view[float, uint32_t, col_major]] Vt_opt_f
     cdef optional[device_matrix_view[double, uint32_t, col_major]] U_opt_d
     cdef optional[device_matrix_view[double, uint32_t, col_major]] Vt_opt_d
+    cdef int ncv_value = 0 if ncv is None else <int>ncv
+    cdef int maxiter_value = 100 if maxiter is None else <int>maxiter
+    cdef bool use_mgs2_value = orthogonalization == "mgs2"
 
     if ValueType == np.float32:
-        cfg_float.n_components = k
-        cfg_float.n_oversamples = n_oversamples
-        cfg_float.n_power_iters = n_power_iters
-        cfg_float.seed = seed_opt
         if want_U:
             U_opt_f = make_device_matrix_view[float, uint32_t, col_major](
                 <float *>U_ptr, <uint32_t>m, <uint32_t>k)
         if want_Vt:
             Vt_opt_f = make_device_matrix_view[float, uint32_t, col_major](
                 <float *>Vt_ptr, <uint32_t>k, <uint32_t>n)
-        sparse_randomized_svd_f(
-            deref(h),
-            cfg_float,
-            make_device_vector_view(<int *>indptr_ptr, <uint32_t>(m + 1)),
-            make_device_vector_view(<int *>indices_ptr, <uint32_t>nnz),
-            make_device_vector_view(<float *>data_ptr, <uint32_t>nnz),
-            m, n, nnz,
-            make_device_vector_view(<float *>S_ptr, <uint32_t>k),
-            U_opt_f,
-            Vt_opt_f,
-        )
+        if solver == "randomized":
+            cfg_float.n_components = k
+            cfg_float.n_oversamples = n_oversamples
+            cfg_float.n_power_iters = n_power_iters
+            cfg_float.seed = seed_opt
+            sparse_randomized_svd_f(
+                deref(h),
+                cfg_float,
+                make_device_vector_view(<int *>indptr_ptr, <uint32_t>(m + 1)),
+                make_device_vector_view(<int *>indices_ptr, <uint32_t>nnz),
+                make_device_vector_view(<float *>data_ptr, <uint32_t>nnz),
+                m, n, nnz,
+                make_device_vector_view(<float *>S_ptr, <uint32_t>k),
+                U_opt_f,
+                Vt_opt_f,
+            )
+        else:
+            lanczos_cfg_float.n_components = k
+            lanczos_cfg_float.ncv = ncv_value
+            lanczos_cfg_float.tolerance = <float>tol
+            lanczos_cfg_float.max_iterations = maxiter_value
+            lanczos_cfg_float.seed = seed_opt
+            lanczos_cfg_float.use_mgs2_orthogonalization = use_mgs2_value
+            sparse_lanczos_svd_f(
+                deref(h),
+                lanczos_cfg_float,
+                make_device_vector_view(<int *>indptr_ptr, <uint32_t>(m + 1)),
+                make_device_vector_view(<int *>indices_ptr, <uint32_t>nnz),
+                make_device_vector_view(<float *>data_ptr, <uint32_t>nnz),
+                m, n, nnz,
+                make_device_vector_view(<float *>S_ptr, <uint32_t>k),
+                U_opt_f,
+                Vt_opt_f,
+            )
     elif ValueType == np.float64:
-        cfg_double.n_components = k
-        cfg_double.n_oversamples = n_oversamples
-        cfg_double.n_power_iters = n_power_iters
-        cfg_double.seed = seed_opt
         if want_U:
             U_opt_d = make_device_matrix_view[double, uint32_t, col_major](
                 <double *>U_ptr, <uint32_t>m, <uint32_t>k)
         if want_Vt:
             Vt_opt_d = make_device_matrix_view[double, uint32_t, col_major](
                 <double *>Vt_ptr, <uint32_t>k, <uint32_t>n)
-        sparse_randomized_svd_d(
-            deref(h),
-            cfg_double,
-            make_device_vector_view(<int *>indptr_ptr, <uint32_t>(m + 1)),
-            make_device_vector_view(<int *>indices_ptr, <uint32_t>nnz),
-            make_device_vector_view(<double *>data_ptr, <uint32_t>nnz),
-            m, n, nnz,
-            make_device_vector_view(<double *>S_ptr, <uint32_t>k),
-            U_opt_d,
-            Vt_opt_d,
-        )
+        if solver == "randomized":
+            cfg_double.n_components = k
+            cfg_double.n_oversamples = n_oversamples
+            cfg_double.n_power_iters = n_power_iters
+            cfg_double.seed = seed_opt
+            sparse_randomized_svd_d(
+                deref(h),
+                cfg_double,
+                make_device_vector_view(<int *>indptr_ptr, <uint32_t>(m + 1)),
+                make_device_vector_view(<int *>indices_ptr, <uint32_t>nnz),
+                make_device_vector_view(<double *>data_ptr, <uint32_t>nnz),
+                m, n, nnz,
+                make_device_vector_view(<double *>S_ptr, <uint32_t>k),
+                U_opt_d,
+                Vt_opt_d,
+            )
+        else:
+            lanczos_cfg_double.n_components = k
+            lanczos_cfg_double.ncv = ncv_value
+            lanczos_cfg_double.tolerance = <double>tol
+            lanczos_cfg_double.max_iterations = maxiter_value
+            lanczos_cfg_double.seed = seed_opt
+            lanczos_cfg_double.use_mgs2_orthogonalization = use_mgs2_value
+            sparse_lanczos_svd_d(
+                deref(h),
+                lanczos_cfg_double,
+                make_device_vector_view(<int *>indptr_ptr, <uint32_t>(m + 1)),
+                make_device_vector_view(<int *>indices_ptr, <uint32_t>nnz),
+                make_device_vector_view(<double *>data_ptr, <uint32_t>nnz),
+                m, n, nnz,
+                make_device_vector_view(<double *>S_ptr, <uint32_t>k),
+                U_opt_d,
+                Vt_opt_d,
+            )
     else:
         raise TypeError("dtype ValueType=%s not supported, "
                         "expected float32 or float64" % ValueType)
diff --git a/python/pylibraft/pylibraft/tests/test_sparse.py b/python/pylibraft/pylibraft/tests/test_sparse.py
index b6b66721a4..c460f0f92a 100644
--- a/python/pylibraft/pylibraft/tests/test_sparse.py
+++ b/python/pylibraft/pylibraft/tests/test_sparse.py
@@ -225,6 +225,184 @@ def test_shapes(self):
         assert S.shape == (k,)
         assert Vt.shape == (k, n)
 
+    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
+    @pytest.mark.parametrize("orthogonalization", ["cgs2", "mgs2"])
+    def test_lanczos_diagonal_spectrum(self, dtype, orthogonalization):
+        """Lanczos SVD must recover the leading singular triplets."""
+        diag = cupy.asarray([8, 7, 6, 5, 4, 3, 2, 1], dtype=dtype)
+        A = sparse.diags(diag, offsets=0, shape=(12, 8), format="csr")
+        U, S, Vt = svds(
+            A,
+            k=3,
+            solver="lanczos",
+            ncv=6,
+            tol=1e-6,
+            maxiter=20,
+            seed=42,
+            orthogonalization=orthogonalization,
+        )
+
+        tol = 2e-3 if dtype == numpy.float32 else 1e-8
+        assert cupy.allclose(S, diag[:3], atol=tol)
+        assert cupy.allclose(U.T @ U, cupy.eye(3, dtype=dtype), atol=tol)
+        assert cupy.allclose(Vt @ Vt.T, cupy.eye(3, dtype=dtype), atol=tol)
+
+    def test_lanczos_return_singular_vectors_modes(self):
+        """Lanczos supports scipy-style vector return modes."""
+        diag = cupy.asarray([8, 7, 6, 5, 4, 3, 2, 1], dtype=numpy.float32)
+        A = sparse.diags(diag, offsets=0, shape=(12, 8), format="csr")
+
+        U_ref, S_ref, Vt_ref = svds(A, k=3, solver="lanczos", ncv=6, seed=42)
+
+        U_f, S_f, Vt_f = svds(
+            A,
+            k=3,
+            solver="lanczos",
+            ncv=6,
+            seed=42,
+            return_singular_vectors=False,
+        )
+        assert U_f is None
+        assert Vt_f is None
+        assert cupy.allclose(S_f, S_ref)
+
+        U_u, S_u, Vt_u = svds(
+            A,
+            k=3,
+            solver="lanczos",
+            ncv=6,
+            seed=42,
+            return_singular_vectors="u",
+        )
+        assert U_u.shape == U_ref.shape
+        assert Vt_u is None
+        assert cupy.allclose(S_u, S_ref)
+
+        U_v, S_v, Vt_v = svds(
+            A,
+            k=3,
+            solver="lanczos",
+            ncv=6,
+            seed=42,
+            return_singular_vectors="vh",
+        )
+        assert U_v is None
+        assert Vt_v.shape == Vt_ref.shape
+        assert cupy.allclose(S_v, S_ref)
+
+    def test_lanczos_rotated_clustered_spectrum(self):
+        """Lanczos recovers clustered singular values for a non-diagonal sparse matrix."""
+        m, n, k = 64, 48, 8
+        expected = numpy.asarray(
+            [10.0, 9.9995, 9.999, 9.5, 9.4995, 9.0, 8.5, 8.0],
+            dtype=numpy.float32,
+        )
+        all_s = numpy.concatenate(
+            [expected, numpy.asarray([0.75, 0.25], dtype=numpy.float32)]
+        )
+        indptr = [0]
+        indices = []
+        values = []
+        for block in range(len(all_s) // 2):
+            col0 = 2 * block
+            s0, s1 = all_s[2 * block], all_s[2 * block + 1]
+            theta = 0.17 + 0.11 * block
+            phi = 0.31 + 0.07 * block
+            cu, su = numpy.cos(theta), numpy.sin(theta)
+            cv, sv = numpy.cos(phi), numpy.sin(phi)
+            block_values = [
+                cu * s0 * cv + su * s1 * sv,
+                cu * s0 * sv - su * s1 * cv,
+                su * s0 * cv - cu * s1 * sv,
+                su * s0 * sv + cu * s1 * cv,
+            ]
+            indices.extend([col0, col0 + 1])
+            values.extend(block_values[:2])
+            indptr.append(len(values))
+            indices.extend([col0, col0 + 1])
+            values.extend(block_values[2:])
+            indptr.append(len(values))
+        indptr.extend([len(values)] * (m + 1 - len(indptr)))
+
+        A = sparse.csr_matrix(
+            (
+                cupy.asarray(values, dtype=numpy.float32),
+                cupy.asarray(indices, dtype=numpy.int32),
+                cupy.asarray(indptr, dtype=numpy.int32),
+            ),
+            shape=(m, n),
+        )
+
+        U, S, Vt = svds(
+            A,
+            k=k,
+            solver="lanczos",
+            ncv=20,
+            tol=1e-5,
+            maxiter=120,
+            seed=2027,
+        )
+
+        assert cupy.allclose(S, cupy.asarray(expected), atol=2e-3)
+        eye = cupy.eye(k, dtype=numpy.float64)
+        U64 = U.astype(cupy.float64, copy=False)
+        Vt64 = Vt.astype(cupy.float64, copy=False)
+        assert cupy.linalg.norm(U64.T @ U64 - eye) < 2e-3
+        assert cupy.linalg.norm(Vt64 @ Vt64.T - eye) < 2e-3
+        residual = cupy.linalg.norm(A @ Vt.T - U * S[None, :]) / S[0]
+        assert residual < 1e-4
+
+    @pytest.mark.parametrize("shape", [(96, 28), (24, 96)])
+    def test_lanczos_rank_deficient_and_edge_shapes(self, shape):
+        """Lanczos handles rank deficiency and tall/wide sparse inputs."""
+        diag = cupy.zeros(min(shape), dtype=numpy.float32)
+        diag[:8] = cupy.asarray([12, 8, 6, 3, 1, 0, 0, 0], dtype=diag.dtype)
+        A = sparse.diags(diag, offsets=0, shape=shape, format="csr")
+
+        U, S, Vt = svds(
+            A,
+            k=4,
+            solver="lanczos",
+            ncv=12,
+            tol=1e-5,
+            maxiter=80,
+            seed=2026,
+        )
+
+        assert cupy.allclose(S, diag[:4], atol=2e-3)
+        eye = cupy.eye(4, dtype=numpy.float64)
+        U64 = U.astype(cupy.float64, copy=False)
+        Vt64 = Vt.astype(cupy.float64, copy=False)
+        assert cupy.linalg.norm(U64.T @ U64 - eye) < 2e-3
+        assert cupy.linalg.norm(Vt64 @ Vt64.T - eye) < 2e-3
+        residual = cupy.linalg.norm(A @ Vt.T - U * S[None, :]) / S[0]
+        assert residual < 1e-4
+
+    def test_lanczos_non_convergence_raises(self):
+        """Lanczos reports non-convergence instead of returning forced Ritz vectors."""
+        diag = cupy.zeros(40, dtype=numpy.float32)
+        diag[:6] = cupy.asarray([10, 9, 8, 7, 6, 5], dtype=diag.dtype)
+        A = sparse.diags(diag, offsets=0, shape=(64, 40), format="csr")
+
+        with pytest.raises(RuntimeError, match="failed to converge"):
+            svds(
+                A,
+                k=4,
+                solver="lanczos",
+                ncv=6,
+                tol=0,
+                maxiter=1,
+                seed=2026,
+            )
+
+    def test_invalid_solver(self):
+        """Unknown sparse SVD solvers must be rejected."""
+        A = sparse.random(20, 15, density=0.3, format="csr")
+        with pytest.raises(ValueError, match="solver"):
+            svds(A, k=3, solver="bogus")
+        with pytest.raises(ValueError, match="orthogonalization"):
+            svds(A, k=3, solver="lanczos", orthogonalization="bogus")
+
     def test_int64_indices(self):
         """int64 indices should be auto-converted and work, with a warning."""
         m, n, k = 50, 30, 3