diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0425bb2211..38434af5ee 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -282,6 +282,8 @@ if(RAFT_COMPILE_LIBRARY) src/raft_runtime/solver/lanczos_solver_int64_float.cu src/raft_runtime/solver/lanczos_solver_int_double.cu src/raft_runtime/solver/lanczos_solver_int_float.cu + src/raft_runtime/solver/lanczos_svds_float.cu + src/raft_runtime/solver/lanczos_svds_double.cu src/raft_runtime/solver/randomized_svds_float.cu src/raft_runtime/solver/randomized_svds_double.cu ) diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt index 7dc510639e..7e8ed4a691 100644 --- a/cpp/bench/prims/CMakeLists.txt +++ b/cpp/bench/prims/CMakeLists.txt @@ -101,7 +101,7 @@ if(BUILD_PRIMS_BENCH) ConfigureBench( NAME SPARSE_BENCH PATH sparse/bitmap_to_csr.cu sparse/convert_csr.cu sparse/select_k_csr.cu - main.cpp + sparse/svds.cu main.cpp ) endif() diff --git a/cpp/bench/prims/sparse/svds.cu b/cpp/bench/prims/sparse/svds.cu new file mode 100644 index 0000000000..65b4b3aa1f --- /dev/null +++ b/cpp/bench/prims/sparse/svds.cu @@ -0,0 +1,432 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft::bench::sparse { + +constexpr std::uint64_t kCsrFileMagic = 0x3152534354464152ULL; + +struct csr_file_header { + std::uint64_t magic; + std::uint32_t version; + std::uint32_t dtype; + std::int64_t rows; + std::int64_t cols; + std::int64_t nnz; +}; + +struct svds_input { + int rows; + int cols; + int nnz; + int nnz_per_row; + int k; + int ncv; + int n_oversamples; + int n_power_iters; + int max_iterations; + bool file_backed; +}; + +csr_file_header read_csr_file_header(std::string const& path) +{ + std::ifstream file(path, std::ios::binary); + if (!file) { throw std::runtime_error("Could not open CSR benchmark file: " + path); } + + csr_file_header header{}; + file.read(reinterpret_cast(&header), sizeof(header)); + if (!file) { throw std::runtime_error("Could not read CSR benchmark file header: " + path); } + if (header.magic != kCsrFileMagic || header.version != 1) { + throw std::runtime_error("Invalid CSR benchmark file header: " + path); + } + if (header.dtype != 1) { + throw std::runtime_error("CSR benchmark file must contain float32 values: " + path); + } + if (header.rows <= 0 || header.cols <= 0 || header.nnz <= 0) { + throw std::runtime_error("CSR benchmark file has invalid dimensions: " + path); + } + if (header.rows > std::numeric_limits::max() || + header.cols > std::numeric_limits::max() || + header.nnz > std::numeric_limits::max()) { + throw std::runtime_error("CSR benchmark file dimensions exceed int32 limits: " + path); + } + return header; +} + +int env_int(char const* name, int default_value) +{ + auto const* value = std::getenv(name); + return value == nullptr ? default_value : std::atoi(value); +} + +std::vector read_reference_singular_values(std::string const& path, int k) +{ + std::ifstream file(path); + if (!file) { throw std::runtime_error("Could not open reference singular values file: " + path); } + + std::vector values; + values.reserve(k); + double value = 0.0; + while (file >> value) { + values.push_back(value); + if (static_cast(values.size()) == k) { break; } + } + if (static_cast(values.size()) != k) { + throw std::runtime_error("Reference singular values file has fewer than k values: " + path); + } + return values; +} + +template +class svds_bench_base : public fixture { + public: + explicit svds_bench_base(svds_input const& p) + : fixture(true), + params(p), + nnz(p.nnz), + indptr(raft::make_device_vector(handle, p.rows + 1)), + indices(raft::make_device_vector(handle, nnz)), + values(raft::make_device_vector(handle, nnz)), + singular_values(raft::make_device_vector(handle, p.k)), + U(raft::make_device_matrix(handle, p.rows, p.k)), + Vt(raft::make_device_matrix(handle, p.k, p.cols)) + { + if (params.file_backed) { + load_csr_from_file(); + } else { + initialize_synthetic_csr(); + } + } + + void generate_metrics(::benchmark::State& state) override + { + state.counters["rows"] = benchmark::Counter(params.rows); + state.counters["cols"] = benchmark::Counter(params.cols); + state.counters["nnz"] = benchmark::Counter(nnz); + state.counters["nnz/row"] = benchmark::Counter(static_cast(nnz) / params.rows); + state.counters["components"] = benchmark::Counter(params.k); + state.counters["subspace"] = benchmark::Counter(params.ncv); + state.counters["oversamples"] = benchmark::Counter(params.n_oversamples); + add_reference_metrics(state); + add_orthogonality_metrics(state); + } + + protected: + auto csr_view() + { + auto csr_structure = raft::make_device_compressed_structure_view( + indptr.data_handle(), indices.data_handle(), params.rows, params.cols, nnz); + return raft::make_device_csr_matrix_view(values.data_handle(), + csr_structure); + } + + void initialize_synthetic_csr() + { + std::vector h_indptr(params.rows + 1); + std::vector h_indices(nnz); + std::vector h_values(nnz); + + for (int row = 0; row <= params.rows; ++row) { + h_indptr[row] = row * params.nnz_per_row; + } + + std::vector row_indices(params.nnz_per_row); + int const step = std::max(1, params.cols / std::max(1, params.nnz_per_row)); + for (int row = 0; row < params.rows; ++row) { + for (int j = 0; j < params.nnz_per_row; ++j) { + row_indices[j] = (row * 131 + j * step + j * 17) % params.cols; + } + std::sort(row_indices.begin(), row_indices.end()); + for (int j = 0; j < params.nnz_per_row; ++j) { + auto offset = static_cast(row) * params.nnz_per_row + j; + h_indices[offset] = row_indices[j]; + h_values[offset] = + value_t(0.1) + value_t((row * 17 + j * 29 + row_indices[j] * 7) % 1000) / value_t(1000); + } + } + + raft::update_device(indptr.data_handle(), h_indptr.data(), h_indptr.size(), stream); + raft::update_device(indices.data_handle(), h_indices.data(), h_indices.size(), stream); + raft::update_device(values.data_handle(), h_values.data(), h_values.size(), stream); + resource::sync_stream(handle, stream); + } + + void load_csr_from_file() + { + auto const* path = std::getenv("RAFT_SVDS_CSR_FILE"); + if (path == nullptr) { throw std::runtime_error("RAFT_SVDS_CSR_FILE must be set"); } + + auto header = read_csr_file_header(path); + if (header.rows != params.rows || header.cols != params.cols || header.nnz != params.nnz) { + throw std::runtime_error( + "CSR benchmark file dimensions changed after benchmark registration"); + } + + std::ifstream file(path, std::ios::binary); + if (!file) { + throw std::runtime_error(std::string("Could not open CSR benchmark file: ") + path); + } + file.seekg(sizeof(csr_file_header), std::ios::beg); + + std::vector h_indptr(static_cast(params.rows) + 1); + std::vector h_indices(static_cast(nnz)); + std::vector h_values(static_cast(nnz)); + + file.read(reinterpret_cast(h_indptr.data()), h_indptr.size() * sizeof(int)); + file.read(reinterpret_cast(h_indices.data()), h_indices.size() * sizeof(int)); + file.read(reinterpret_cast(h_values.data()), h_values.size() * sizeof(value_t)); + if (!file) { + throw std::runtime_error(std::string("Could not read CSR benchmark file: ") + path); + } + + raft::update_device(indptr.data_handle(), h_indptr.data(), h_indptr.size(), stream); + raft::update_device(indices.data_handle(), h_indices.data(), h_indices.size(), stream); + raft::update_device(values.data_handle(), h_values.data(), h_values.size(), stream); + resource::sync_stream(handle, stream); + } + + void add_reference_metrics(::benchmark::State& state) + { + auto const* ref_path = std::getenv("RAFT_SVDS_REF_S"); + if (ref_path == nullptr) { return; } + + auto ref = read_reference_singular_values(ref_path, params.k); + std::vector actual(params.k); + raft::update_host(actual.data(), singular_values.data_handle(), params.k, stream); + resource::sync_stream(handle, stream); + + double max_rel = 0.0; + double mean_rel = 0.0; + for (int i = 0; i < params.k; ++i) { + auto denom = std::max(std::abs(ref[i]), 1e-30); + auto rel = std::abs(static_cast(actual[i]) - ref[i]) / denom; + max_rel = std::max(max_rel, rel); + mean_rel += rel; + } + mean_rel /= params.k; + + state.counters["s_rel_max"] = benchmark::Counter(max_rel); + state.counters["s_rel_mean"] = benchmark::Counter(mean_rel); + state.counters["s0_rel"] = benchmark::Counter( + std::abs(static_cast(actual[0]) - ref[0]) / std::max(std::abs(ref[0]), 1e-30)); + } + + void add_orthogonality_metrics(::benchmark::State& state) + { + auto UtU = + raft::make_device_matrix(handle, params.k, params.k); + auto VVt = + raft::make_device_matrix(handle, params.k, params.k); + + value_t one = 1; + value_t zero = 0; + + raft::linalg::gemm(handle, + U.data_handle(), + params.rows, + params.k, + U.data_handle(), + UtU.data_handle(), + params.k, + params.k, + CUBLAS_OP_T, + CUBLAS_OP_N, + one, + zero, + stream); + + raft::linalg::gemm(handle, + Vt.data_handle(), + params.k, + params.cols, + Vt.data_handle(), + VVt.data_handle(), + params.k, + params.k, + CUBLAS_OP_N, + CUBLAS_OP_T, + one, + zero, + stream); + + std::vector h_utu(static_cast(params.k) * params.k); + std::vector h_vvt(static_cast(params.k) * params.k); + raft::update_host(h_utu.data(), UtU.data_handle(), h_utu.size(), stream); + raft::update_host(h_vvt.data(), VVt.data_handle(), h_vvt.size(), stream); + resource::sync_stream(handle, stream); + + auto compute_stats = [this](std::vector const& gram) { + double fro_sq = 0.0; + double max_abs = 0.0; + for (int col = 0; col < params.k; ++col) { + for (int row = 0; row < params.k; ++row) { + auto const expected = row == col ? 1.0 : 0.0; + auto const diff = std::abs( + static_cast(gram[static_cast(col) * params.k + row]) - expected); + fro_sq += diff * diff; + max_abs = std::max(max_abs, diff); + } + } + return std::array{std::sqrt(fro_sq), max_abs}; + }; + + auto const u_stats = compute_stats(h_utu); + auto const v_stats = compute_stats(h_vvt); + state.counters["u_orth_fro"] = benchmark::Counter(u_stats[0]); + state.counters["u_orth_max"] = benchmark::Counter(u_stats[1]); + state.counters["v_orth_fro"] = benchmark::Counter(v_stats[0]); + state.counters["v_orth_max"] = benchmark::Counter(v_stats[1]); + } + + svds_input params; + int nnz; + raft::device_vector indptr; + raft::device_vector indices; + raft::device_vector values; + raft::device_vector singular_values; + raft::device_matrix U; + raft::device_matrix Vt; +}; + +template +class lanczos_svds_bench : public svds_bench_base { + public: + using svds_bench_base::svds_bench_base; + + void run_benchmark(::benchmark::State& state) override + { + auto csr_matrix = this->csr_view(); + + raft::sparse::solver::sparse_lanczos_svd_config config; + config.n_components = this->params.k; + config.ncv = this->params.ncv; + config.tolerance = value_t(1e-4); + config.max_iterations = this->params.max_iterations; + config.seed = 1234; + + this->loop_on_state( + state, + [&]() { + raft::sparse::solver::sparse_lanczos_svd(this->handle, + config, + csr_matrix, + this->singular_values.view(), + this->U.view(), + this->Vt.view()); + }, + false); + } +}; + +template +class lanczos_mgs2_svds_bench : public svds_bench_base { + public: + using svds_bench_base::svds_bench_base; + + void run_benchmark(::benchmark::State& state) override + { + auto csr_matrix = this->csr_view(); + + raft::sparse::solver::sparse_lanczos_svd_config config; + config.n_components = this->params.k; + config.ncv = this->params.ncv; + config.tolerance = value_t(1e-4); + config.max_iterations = this->params.max_iterations; + config.seed = 1234; + config.use_mgs2_orthogonalization = true; + + this->loop_on_state( + state, + [&]() { + raft::sparse::solver::sparse_lanczos_svd(this->handle, + config, + csr_matrix, + this->singular_values.view(), + this->U.view(), + this->Vt.view()); + }, + false); + } +}; + +template +class randomized_svds_bench : public svds_bench_base { + public: + using svds_bench_base::svds_bench_base; + + void run_benchmark(::benchmark::State& state) override + { + auto csr_matrix = this->csr_view(); + + raft::sparse::solver::sparse_svd_config config; + config.n_components = this->params.k; + config.n_oversamples = this->params.n_oversamples; + config.n_power_iters = this->params.n_power_iters; + config.seed = 1234; + + this->loop_on_state( + state, + [&]() { + raft::sparse::solver::sparse_randomized_svd(this->handle, + config, + csr_matrix, + this->singular_values.view(), + this->U.view(), + this->Vt.view()); + }, + false); + } +}; + +std::vector get_svds_inputs() +{ + std::vector inputs = { + {10000, 2000, 10000 * 16, 16, 16, 48, 32, 2, 20, false}, + {50000, 5000, 50000 * 16, 16, 32, 80, 48, 2, 20, false}, + }; + + auto const* path = std::getenv("RAFT_SVDS_CSR_FILE"); + if (path != nullptr) { + auto header = read_csr_file_header(path); + inputs.push_back({static_cast(header.rows), + static_cast(header.cols), + static_cast(header.nnz), + 0, + env_int("RAFT_SVDS_K", 50), + env_int("RAFT_SVDS_NCV", 0), + env_int("RAFT_SVDS_RANDOMIZED_OVERSAMPLES", 10), + env_int("RAFT_SVDS_RANDOMIZED_POWER_ITERS", 2), + env_int("RAFT_SVDS_MAXITER", 100), + true}); + } + return inputs; +} + +std::vector const svds_inputs = get_svds_inputs(); + +RAFT_BENCH_REGISTER((lanczos_svds_bench), "", svds_inputs); +RAFT_BENCH_REGISTER((lanczos_mgs2_svds_bench), "", svds_inputs); +RAFT_BENCH_REGISTER((randomized_svds_bench), "", svds_inputs); + +} // namespace raft::bench::sparse diff --git a/cpp/include/raft/sparse/solver/detail/lanczos_svds.cuh b/cpp/include/raft/sparse/solver/detail/lanczos_svds.cuh new file mode 100644 index 0000000000..37b5a237bd --- /dev/null +++ b/cpp/include/raft/sparse/solver/detail/lanczos_svds.cuh @@ -0,0 +1,767 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace raft::sparse::solver::detail { + +template +RAFT_KERNEL negate_scalar_kernel(ValueTypeT* x) +{ + *x = -*x; +} + +template +ValueTypeT vector_norm(raft::resources const& handle, ValueTypeT const* x, int n) +{ + common::nvtx::range scope("lanczos_svds::vector_norm"); + auto cublas_handle = resource::get_cublas_handle(handle); + auto stream = resource::get_cuda_stream(handle); + ValueTypeT result{}; + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasnrm2(cublas_handle, n, x, 1, &result, stream)); + resource::sync_stream(handle, stream); + return result; +} + +template +void scale_vector(raft::resources const& handle, ValueTypeT* x, int n, ValueTypeT alpha) +{ + common::nvtx::range scope("lanczos_svds::scale_vector"); + auto cublas_handle = resource::get_cublas_handle(handle); + auto stream = resource::get_cuda_stream(handle); + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasscal(cublas_handle, n, &alpha, x, 1, stream)); +} + +template +void axpy( + raft::resources const& handle, int n, ValueTypeT alpha, ValueTypeT const* x, ValueTypeT* y) +{ + common::nvtx::range scope("lanczos_svds::axpy"); + auto cublas_handle = resource::get_cublas_handle(handle); + auto stream = resource::get_cuda_stream(handle); + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasaxpy(cublas_handle, n, &alpha, x, 1, y, 1, stream)); +} + +template +void cgs2_orthogonalize(raft::resources const& handle, + ValueTypeT* target, + ValueTypeT const* basis, + int n_rows, + int n_valid, + ValueTypeT* coeffs) +{ + common::nvtx::range scope("lanczos_svds::cgs2_orthogonalize"); + if (n_valid <= 0) { return; } + + auto cublas_handle = resource::get_cublas_handle(handle); + auto stream = resource::get_cuda_stream(handle); + + ValueTypeT one = ValueTypeT(1); + ValueTypeT zero = ValueTypeT(0); + ValueTypeT neg_one = ValueTypeT(-1); + for (int pass = 0; pass < 2; ++pass) { + { + common::nvtx::range project_scope("lanczos_svds::cgs2_project"); + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(cublas_handle, + CUBLAS_OP_T, + n_rows, + n_valid, + &one, + basis, + n_rows, + target, + 1, + &zero, + coeffs, + 1, + stream)); + } + { + common::nvtx::range subtract_scope("lanczos_svds::cgs2_subtract"); + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(cublas_handle, + CUBLAS_OP_N, + n_rows, + n_valid, + &neg_one, + basis, + n_rows, + coeffs, + 1, + &one, + target, + 1, + stream)); + } + } +} + +template +void mgs2_orthogonalize(raft::resources const& handle, + ValueTypeT* target, + ValueTypeT const* basis, + int n_rows, + int n_valid, + ValueTypeT* coeffs) +{ + common::nvtx::range scope("lanczos_svds::mgs2_orthogonalize"); + if (n_valid <= 0) { return; } + + auto cublas_handle = resource::get_cublas_handle(handle); + auto stream = resource::get_cuda_stream(handle); + + RAFT_CUBLAS_TRY(cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE)); + for (int pass = 0; pass < 2; ++pass) { + for (int j = 0; j < n_valid; ++j) { + auto const* q_j = basis + static_cast(j) * n_rows; + auto* coeff = coeffs + j; + { + common::nvtx::range dot_scope("lanczos_svds::mgs2_dot"); + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasdot(cublas_handle, n_rows, q_j, 1, target, 1, coeff, stream)); + } + { + common::nvtx::range negate_scope("lanczos_svds::mgs2_negate"); + negate_scalar_kernel<<<1, 1, 0, stream>>>(coeff); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + { + common::nvtx::range subtract_scope( + "lanczos_svds::mgs2_subtract"); + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasaxpy( + cublas_handle, n_rows, coeff, q_j, 1, target, 1, stream)); + } + } + } + RAFT_CUBLAS_TRY(cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST)); +} + +template +void orthogonalize(raft::resources const& handle, + ValueTypeT* target, + ValueTypeT const* basis, + int n_rows, + int n_valid, + ValueTypeT* coeffs, + bool use_mgs2) +{ + if (use_mgs2) { + mgs2_orthogonalize(handle, target, basis, n_rows, n_valid, coeffs); + } else { + cgs2_orthogonalize(handle, target, basis, n_rows, n_valid, coeffs); + } +} + +template +ValueTypeT normalize_or_randomize(raft::resources const& handle, + raft::random::RngState& rng_state, + ValueTypeT* target, + int n_rows, + ValueTypeT const* basis, + int n_valid, + ValueTypeT* coeffs, + ValueTypeT eps, + bool use_mgs2, + bool* used_random_vector) +{ + common::nvtx::range scope("lanczos_svds::normalize_or_randomize"); + auto stream = resource::get_cuda_stream(handle); + auto nrm = vector_norm(handle, target, n_rows); + *used_random_vector = false; + + if (nrm < eps) { + { + common::nvtx::range random_scope( + "lanczos_svds::random_restart_vector"); + raft::random::normal( + handle, rng_state, target, static_cast(n_rows), ValueTypeT(0), ValueTypeT(1)); + } + orthogonalize(handle, target, basis, n_rows, n_valid, coeffs, use_mgs2); + nrm = vector_norm(handle, target, n_rows); + *used_random_vector = true; + } + + RAFT_EXPECTS(nrm >= eps, "Unable to generate a non-zero Lanczos vector"); + auto inv_nrm = ValueTypeT(1) / nrm; + scale_vector(handle, target, n_rows, inv_nrm); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + resource::sync_stream(handle, stream); + return nrm; +} + +template +void build_bidiagonal_matrix(raft::resources const& handle, + std::vector const& alphas, + std::vector const& betas, + raft::device_matrix_view B) +{ + common::nvtx::range scope("lanczos_svds::build_bidiagonal_matrix"); + auto stream = resource::get_cuda_stream(handle); + int ncv = static_cast(alphas.size()); + std::vector h_B(static_cast(ncv) * ncv, ValueTypeT(0)); + + for (int i = 0; i < ncv; ++i) { + h_B[static_cast(i) * ncv + i] = alphas[i]; + if (i + 1 < ncv) { h_B[static_cast(i + 1) * ncv + i] = betas[i + 1]; } + } + + raft::update_device(B.data_handle(), h_B.data(), static_cast(ncv) * ncv, stream); +} + +template +void lanczos_bidiagonalize(raft::resources const& handle, + OperatorT const& op, + int ncv, + ValueTypeT* v_start, + raft::device_matrix_view U_full, + raft::device_matrix_view V_full, + int n_locked, + raft::random::RngState& rng_state, + ValueTypeT* ortho_coeffs, + bool use_mgs2, + std::vector& alphas, + std::vector& betas) +{ + common::nvtx::range scope("lanczos_svds::bidiagonalize"); + int m = op.rows(); + int n = op.cols(); + ValueTypeT eps = ValueTypeT(1e-9); + auto stream = resource::get_cuda_stream(handle); + + alphas.assign(ncv, ValueTypeT(0)); + betas.assign(ncv + 1, ValueTypeT(0)); + + auto* v0 = V_full.data_handle() + static_cast(n_locked) * n; + raft::copy(v0, v_start, n, stream); + orthogonalize(handle, v0, V_full.data_handle(), n, n_locked, ortho_coeffs, use_mgs2); + + bool used_random = false; + normalize_or_randomize(handle, + rng_state, + v0, + n, + V_full.data_handle(), + n_locked, + ortho_coeffs, + eps, + use_mgs2, + &used_random); + + for (int i = 0; i < ncv; ++i) { + int idx_u = n_locked + i; + int idx_v = n_locked + i; + + auto* u = U_full.data_handle() + static_cast(idx_u) * m; + auto* v = V_full.data_handle() + static_cast(idx_v) * n; + + { + common::nvtx::range apply_scope("lanczos_svds::apply_A"); + op.apply(handle, + raft::make_device_matrix_view(v, n, 1), + raft::make_device_matrix_view(u, m, 1)); + } + + if (i > 0) { + auto* u_prev = U_full.data_handle() + static_cast(idx_u - 1) * m; + axpy(handle, m, -betas[i], u_prev, u); + } + + orthogonalize(handle, u, U_full.data_handle(), m, idx_u, ortho_coeffs, use_mgs2); + used_random = false; + auto alpha_norm = normalize_or_randomize(handle, + rng_state, + u, + m, + U_full.data_handle(), + idx_u, + ortho_coeffs, + eps, + use_mgs2, + &used_random); + alphas[i] = used_random ? ValueTypeT(0) : alpha_norm; + + auto* v_next = V_full.data_handle() + static_cast(idx_v + 1) * n; + { + common::nvtx::range apply_t_scope("lanczos_svds::apply_AT"); + op.apply_transpose( + handle, + raft::make_device_matrix_view(u, m, 1), + raft::make_device_matrix_view(v_next, n, 1)); + } + + axpy(handle, n, -alphas[i], v, v_next); + orthogonalize(handle, v_next, V_full.data_handle(), n, idx_v + 1, ortho_coeffs, use_mgs2); + + used_random = false; + auto beta_nrm = normalize_or_randomize(handle, + rng_state, + v_next, + n, + V_full.data_handle(), + idx_v + 1, + ortho_coeffs, + eps, + use_mgs2, + &used_random); + betas[i + 1] = used_random ? ValueTypeT(0) : beta_nrm; + } +} + +template +void compute_ritz_vectors(raft::resources const& handle, + int m, + int n, + int ncv, + int n_locked, + std::vector const& indices, + std::vector const& singular_values, + std::vector const& P, + std::vector const& Qt, + raft::device_matrix_view U_full, + raft::device_matrix_view V_full, + std::vector& locked_singular_values) +{ + common::nvtx::range scope("lanczos_svds::compute_ritz_vectors"); + auto stream = resource::get_cuda_stream(handle); + int num_found = static_cast(indices.size()); + if (num_found == 0) { return; } + + std::vector h_P_good(static_cast(ncv) * num_found); + std::vector h_Qt_good_t(static_cast(ncv) * num_found); + for (int j = 0; j < num_found; ++j) { + int src_col = indices[j]; + for (int row = 0; row < ncv; ++row) { + h_P_good[static_cast(j) * ncv + row] = + P[static_cast(src_col) * ncv + row]; + h_Qt_good_t[static_cast(j) * ncv + row] = + Qt[static_cast(row) * ncv + src_col]; + } + } + + auto P_good = + raft::make_device_matrix(handle, ncv, num_found); + auto Qt_good_t = + raft::make_device_matrix(handle, ncv, num_found); + { + common::nvtx::range copy_scope("lanczos_svds::ritz_copy_basis"); + raft::update_device(P_good.data_handle(), h_P_good.data(), h_P_good.size(), stream); + raft::update_device(Qt_good_t.data_handle(), h_Qt_good_t.data(), h_Qt_good_t.size(), stream); + } + + auto U_ritz = + raft::make_device_matrix(handle, m, num_found); + auto V_ritz = + raft::make_device_matrix(handle, n, num_found); + + ValueTypeT one = 1; + ValueTypeT zero = 0; + auto* U_segment = U_full.data_handle() + static_cast(n_locked) * m; + auto* V_segment = V_full.data_handle() + static_cast(n_locked) * n; + + { + common::nvtx::range gemm_scope("lanczos_svds::ritz_gemm_U"); + raft::linalg::gemm(handle, + U_segment, + m, + ncv, + P_good.data_handle(), + U_ritz.data_handle(), + m, + num_found, + CUBLAS_OP_N, + CUBLAS_OP_N, + one, + zero, + stream); + } + + { + common::nvtx::range gemm_scope("lanczos_svds::ritz_gemm_V"); + raft::linalg::gemm(handle, + V_segment, + n, + ncv, + Qt_good_t.data_handle(), + V_ritz.data_handle(), + n, + num_found, + CUBLAS_OP_N, + CUBLAS_OP_N, + one, + zero, + stream); + } + + { + common::nvtx::range copy_scope("lanczos_svds::ritz_copy_output"); + raft::copy(U_full.data_handle() + static_cast(n_locked) * m, + U_ritz.data_handle(), + static_cast(m) * num_found, + stream); + raft::copy(V_full.data_handle() + static_cast(n_locked) * n, + V_ritz.data_handle(), + static_cast(n) * num_found, + stream); + } + + for (int j = 0; j < num_found; ++j) { + locked_singular_values[n_locked + j] = singular_values[indices[j]]; + } +} + +template +void compute_restart_vector(raft::resources const& handle, + int n, + int ncv, + int n_locked, + std::vector const& coeffs, + raft::device_matrix_view V_full, + ValueTypeT* v_start) +{ + common::nvtx::range scope("lanczos_svds::compute_restart_vector"); + auto stream = resource::get_cuda_stream(handle); + auto d_coeffs = + raft::make_device_vector(handle, static_cast(ncv)); + raft::update_device(d_coeffs.data_handle(), coeffs.data(), ncv, stream); + + auto cublas_handle = resource::get_cublas_handle(handle); + ValueTypeT const one = ValueTypeT(1); + ValueTypeT zero = ValueTypeT(0); + auto* V_segment = V_full.data_handle() + static_cast(n_locked) * n; + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(cublas_handle, + CUBLAS_OP_N, + n, + ncv, + &one, + V_segment, + n, + d_coeffs.data_handle(), + 1, + &zero, + v_start, + 1, + stream)); + + auto nrm = vector_norm(handle, v_start, n); + if (nrm > ValueTypeT(0)) { scale_vector(handle, v_start, n, ValueTypeT(1) / nrm); } +} + +template +void sort_singular_triplets(raft::resources const& handle, + ValueTypeT* U_cols, + int m, + ValueTypeT* V_cols, + int n, + std::vector& singular_values) +{ + common::nvtx::range scope("lanczos_svds::sort_singular_triplets"); + int k = static_cast(singular_values.size()); + std::vector order(k); + std::iota(order.begin(), order.end(), 0); + std::stable_sort(order.begin(), order.end(), [&](int a, int b) { + return singular_values[a] > singular_values[b]; + }); + + bool already_sorted = true; + for (int i = 0; i < k; ++i) { + already_sorted = already_sorted && order[i] == i; + } + if (already_sorted) { return; } + + auto stream = resource::get_cuda_stream(handle); + auto U_sorted = raft::make_device_matrix(handle, m, k); + auto V_sorted = raft::make_device_matrix(handle, n, k); + std::vector sorted_s(k); + + for (int dst = 0; dst < k; ++dst) { + int src = order[dst]; + raft::copy(U_sorted.data_handle() + static_cast(dst) * m, + U_cols + static_cast(src) * m, + m, + stream); + raft::copy(V_sorted.data_handle() + static_cast(dst) * n, + V_cols + static_cast(src) * n, + n, + stream); + sorted_s[dst] = singular_values[src]; + } + + raft::copy(U_cols, U_sorted.data_handle(), static_cast(m) * k, stream); + raft::copy(V_cols, V_sorted.data_handle(), static_cast(n) * k, stream); + singular_values.swap(sorted_s); +} + +/** + * @brief Lanczos bidiagonalization SVD for sparse matrices and linear operators. + */ +template +void sparse_lanczos_svd( + raft::resources const& handle, + sparse_lanczos_svd_config const& config, + OperatorT const& op, + raft::device_vector_view singular_values, + std::optional> U, + std::optional> Vt) +{ + common::nvtx::range fun_scope( + "raft::sparse::solver::sparse_lanczos_svd(%d, %d, %d)", + op.rows(), + op.cols(), + config.n_components); + + int m = op.rows(); + int n = op.cols(); + int k = config.n_components; + int min_dim = std::min(m, n); + + RAFT_EXPECTS(k > 0, "n_components must be positive"); + RAFT_EXPECTS(k < min_dim, "n_components must be less than min(m, n)"); + RAFT_EXPECTS(config.max_iterations > 0, "max_iterations must be positive"); + RAFT_EXPECTS(config.tolerance >= ValueTypeT(0), "tolerance must be non-negative"); + RAFT_EXPECTS(singular_values.extent(0) == static_cast(k), + "singular_values must have size n_components"); + if (U) { + RAFT_EXPECTS( + U->extent(0) == static_cast(m) && U->extent(1) == static_cast(k), + "U must have shape (m, n_components)"); + } + if (Vt) { + RAFT_EXPECTS( + Vt->extent(0) == static_cast(k) && Vt->extent(1) == static_cast(n), + "Vt must have shape (n_components, n)"); + } + + int ncv = config.ncv; + if (ncv <= 0) { + if (m > 100000) { + ncv = (k < 75) ? (22 * k + 9) / 10 : static_cast(3.9 * k); + } else { + ncv = std::max(3 * k, 50); + } + } + ncv = std::max(ncv, k); + ncv = std::min(ncv, min_dim - 1); + RAFT_EXPECTS(ncv >= k, "ncv must be at least n_components after clamping"); + + auto stream = resource::get_cuda_stream(handle); + uint64_t seed = config.seed.value_or(std::random_device{}()); + raft::random::RngState rng_state(seed); + + int total_capacity = k + ncv + 2; + auto U_full = raft::make_device_matrix( + handle, static_cast(m), static_cast(total_capacity)); + auto V_full = raft::make_device_matrix( + handle, static_cast(n), static_cast(total_capacity)); + auto ortho_coeffs = + raft::make_device_vector(handle, static_cast(total_capacity)); + auto v_start = raft::make_device_vector(handle, static_cast(n)); + + { + common::nvtx::range scope("lanczos_svds::initial_random_vector"); + raft::random::normal(handle, + rng_state, + v_start.data_handle(), + static_cast(n), + ValueTypeT(0), + ValueTypeT(1)); + } + + std::vector locked_singular_values(k, ValueTypeT(0)); + int n_locked = 0; + int total_iter = 0; + + while (n_locked < k && total_iter < config.max_iterations) { + common::nvtx::range restart_scope( + "lanczos_svds::restart_iteration"); + int active_ncv = std::min(ncv, min_dim - n_locked - 1); + RAFT_EXPECTS(active_ncv > 0, "No remaining subspace available for Lanczos restart"); + + std::vector alphas; + std::vector betas; + lanczos_bidiagonalize(handle, + op, + active_ncv, + v_start.data_handle(), + U_full.view(), + V_full.view(), + n_locked, + rng_state, + ortho_coeffs.data_handle(), + config.use_mgs2_orthogonalization, + alphas, + betas); + + auto B = raft::make_device_matrix( + handle, static_cast(active_ncv), static_cast(active_ncv)); + build_bidiagonal_matrix(handle, alphas, betas, B.view()); + + auto P = raft::make_device_matrix( + handle, static_cast(active_ncv), static_cast(active_ncv)); + auto Qt = raft::make_device_matrix( + handle, static_cast(active_ncv), static_cast(active_ncv)); + auto S_full = + raft::make_device_vector(handle, static_cast(active_ncv)); + + { + common::nvtx::range scope("lanczos_svds::bidiagonal_svd"); + raft::linalg::svdQR(handle, + B.data_handle(), + active_ncv, + active_ncv, + S_full.data_handle(), + P.data_handle(), + Qt.data_handle(), + false, + true, + true, + stream); + } + + std::vector h_s(active_ncv); + std::vector h_P(static_cast(active_ncv) * active_ncv); + std::vector h_Qt(static_cast(active_ncv) * active_ncv); + { + common::nvtx::range scope("lanczos_svds::copy_small_svd_host"); + raft::update_host(h_s.data(), S_full.data_handle(), active_ncv, stream); + raft::update_host(h_P.data(), P.data_handle(), h_P.size(), stream); + raft::update_host(h_Qt.data(), Qt.data_handle(), h_Qt.size(), stream); + resource::sync_stream(handle, stream); + } + + int remaining = k - n_locked; + ValueTypeT max_s = h_s.empty() || h_s[0] <= ValueTypeT(0) ? ValueTypeT(1) : h_s[0]; + int check_components = std::min(remaining + 2, active_ncv); + + std::vector selected; + { + common::nvtx::range scope("lanczos_svds::select_converged"); + for (int i = 0; i < check_components; ++i) { + auto residual = betas[active_ncv] * + std::abs(h_P[static_cast(i) * active_ncv + active_ncv - 1]); + if (residual < config.tolerance * max_s) { selected.push_back(i); } + } + } + if (static_cast(selected.size()) > remaining) { selected.resize(remaining); } + + if (!selected.empty()) { + compute_ritz_vectors(handle, + m, + n, + active_ncv, + n_locked, + selected, + h_s, + h_P, + h_Qt, + U_full.view(), + V_full.view(), + locked_singular_values); + n_locked += static_cast(selected.size()); + if (n_locked >= k) { break; } + } + + std::vector restart_coeffs(active_ncv, ValueTypeT(0)); + { + common::nvtx::range scope("lanczos_svds::build_restart_coeffs"); + if (!selected.empty()) { + int best_unconverged = 0; + for (; best_unconverged < active_ncv; ++best_unconverged) { + if (std::find(selected.begin(), selected.end(), best_unconverged) == selected.end()) { + break; + } + } + if (best_unconverged == active_ncv) { best_unconverged = 0; } + for (int col = 0; col < active_ncv; ++col) { + restart_coeffs[col] = h_Qt[static_cast(col) * active_ncv + best_unconverged]; + } + } else { + int k_mix = std::min(k - n_locked, active_ncv); + for (int col = 0; col < active_ncv; ++col) { + ValueTypeT sum = 0; + for (int row = 0; row < k_mix; ++row) { + sum += h_Qt[static_cast(col) * active_ncv + row]; + } + restart_coeffs[col] = sum; + } + } + } + + compute_restart_vector( + handle, n, active_ncv, n_locked, restart_coeffs, V_full.view(), v_start.data_handle()); + ++total_iter; + } + + RAFT_EXPECTS(n_locked >= k, + "sparse_lanczos_svd failed to converge all requested components within " + "max_iterations"); + + auto U_refined = raft::make_device_matrix( + handle, static_cast(m), static_cast(k)); + { + common::nvtx::range scope("lanczos_svds::post_refine_A_V"); + op.apply(handle, + raft::make_device_matrix_view( + V_full.data_handle(), n, k), + U_refined.view()); + } + + { + common::nvtx::range scope("lanczos_svds::post_refine_normalize"); + for (int j = 0; j < k; ++j) { + auto* u_col = U_refined.data_handle() + static_cast(j) * m; + locked_singular_values[j] = vector_norm(handle, u_col, m); + if (locked_singular_values[j] > ValueTypeT(0)) { + scale_vector(handle, u_col, m, ValueTypeT(1) / locked_singular_values[j]); + } + } + } + + sort_singular_triplets( + handle, U_refined.data_handle(), m, V_full.data_handle(), n, locked_singular_values); + + { + common::nvtx::range scope("lanczos_svds::copy_outputs"); + raft::update_device(singular_values.data_handle(), locked_singular_values.data(), k, stream); + if (U) { + raft::copy( + U->data_handle(), U_refined.data_handle(), static_cast(m) * k, stream); + } + } + + if (Vt) { + common::nvtx::range scope("lanczos_svds::transpose_vt"); + raft::linalg::transpose(handle, V_full.data_handle(), Vt->data_handle(), n, k, stream); + } + + { + common::nvtx::range scope("lanczos_svds::sign_correction"); + svd_sign_correction(handle, U, Vt); + } +} + +} // namespace raft::sparse::solver::detail diff --git a/cpp/include/raft/sparse/solver/detail/randomized_svds.cuh b/cpp/include/raft/sparse/solver/detail/randomized_svds.cuh index 85268141ce..72e2bb525a 100644 --- a/cpp/include/raft/sparse/solver/detail/randomized_svds.cuh +++ b/cpp/include/raft/sparse/solver/detail/randomized_svds.cuh @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/include/raft/sparse/solver/detail/svds_optional.hpp b/cpp/include/raft/sparse/solver/detail/svds_optional.hpp new file mode 100644 index 0000000000..daedfe193f --- /dev/null +++ b/cpp/include/raft/sparse/solver/detail/svds_optional.hpp @@ -0,0 +1,23 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include + +namespace raft::sparse::solver::detail { + +// Helper alias used to put optional U/Vt parameters in a non-deduced context +// so template argument deduction picks up the value type from earlier parameters and +// implicit conversion from `device_matrix_view` to `std::optional<...>` works. +template +struct nondeduced { + using type = T; +}; + +template +using nondeduced_optional_matrix_view_t = std::optional::type>; + +} // namespace raft::sparse::solver::detail diff --git a/cpp/include/raft/sparse/solver/lanczos_svds.cuh b/cpp/include/raft/sparse/solver/lanczos_svds.cuh new file mode 100644 index 0000000000..be70f6aeb6 --- /dev/null +++ b/cpp/include/raft/sparse/solver/lanczos_svds.cuh @@ -0,0 +1,104 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +namespace raft::sparse::solver { + +/** + * @defgroup sparse_lanczos_svd Sparse Lanczos SVD + * @{ + */ + +/** + * @brief Compute truncated SVD using Lanczos bidiagonalization with a generic linear operator. + * + * This solver computes the largest singular triplets using implicitly restarted Lanczos + * bidiagonalization, full reorthogonalization, and final `A @ V` post-refinement of + * returned left singular vectors. It is intended as the higher-accuracy sparse SVD path. + * The operator interface matches sparse_randomized_svd and supports implicit operators + * such as centered sparse matrices. + * + * By default, the solver uses two-pass classical Gram-Schmidt (CGS2), which is efficient + * on GPUs. The configuration can request two-pass modified Gram-Schmidt (MGS2) as an + * alternate path for difficult spectra. If the requested components do not converge + * within `config.max_iterations`, this function raises an exception instead of returning + * partially converged vectors. + * + * `OperatorT` must expose: + * - `int rows() const` / `int cols() const` + * - `void apply(handle, X, Y) const` computes `Y = A @ X` + * - `void apply_transpose(handle, X, Z) const` computes `Z = A^T @ X` + * + * @tparam ValueTypeT Data type (float or double) + * @tparam OperatorT Linear operator type satisfying the interface above + * + * @param[in] handle raft resources handle + * @param[in] config SVD configuration parameters + * @param[in] op linear operator representing the matrix to decompose + * @param[out] singular_values output singular values of shape (n_components,) in descending order + * @param[out] U optional output left singular vectors of shape (m, n_components), col-major. + * Pass `std::nullopt` to skip storing U. + * @param[out] Vt optional output right singular vectors of shape (n_components, n), col-major. + * Pass `std::nullopt` to skip storing Vt. + */ +template +void sparse_lanczos_svd( + raft::resources const& handle, + sparse_lanczos_svd_config const& config, + OperatorT const& op, + raft::device_vector_view singular_values, + detail::nondeduced_optional_matrix_view_t< + raft::device_matrix_view> U = std::nullopt, + detail::nondeduced_optional_matrix_view_t< + raft::device_matrix_view> Vt = std::nullopt) +{ + detail::sparse_lanczos_svd(handle, config, op, singular_values, U, Vt); +} + +/** + * @brief Compute truncated SVD of a sparse CSR matrix using Lanczos bidiagonalization. + * + * Convenience overload that accepts a CSR matrix view directly. + * + * @tparam ValueTypeT Data type (float or double) + * @tparam NNZTypeT Type for number of non-zeros + * + * @param[in] handle raft resources handle + * @param[in] config SVD configuration parameters + * @param[in] A input sparse CSR matrix of shape (m, n) + * @param[out] singular_values output singular values of shape (n_components,) in descending order + * @param[out] U optional output left singular vectors of shape (m, n_components), col-major. + * Pass `std::nullopt` to skip storing U. + * @param[out] Vt optional output right singular vectors of shape (n_components, n), col-major. + * Pass `std::nullopt` to skip storing Vt. + */ +template +void sparse_lanczos_svd( + raft::resources const& handle, + sparse_lanczos_svd_config const& config, + raft::device_csr_matrix_view A, + raft::device_vector_view singular_values, + detail::nondeduced_optional_matrix_view_t< + raft::device_matrix_view> U = std::nullopt, + detail::nondeduced_optional_matrix_view_t< + raft::device_matrix_view> Vt = std::nullopt) +{ + detail::csr_linear_operator op(A); + detail::sparse_lanczos_svd(handle, config, op, singular_values, U, Vt); +} + +/** @} */ + +} // namespace raft::sparse::solver diff --git a/cpp/include/raft/sparse/solver/randomized_svds.cuh b/cpp/include/raft/sparse/solver/randomized_svds.cuh index 7ee1afe1c1..29c86ebaa5 100644 --- a/cpp/include/raft/sparse/solver/randomized_svds.cuh +++ b/cpp/include/raft/sparse/solver/randomized_svds.cuh @@ -10,23 +10,12 @@ #include #include #include +#include #include namespace raft::sparse::solver { -namespace detail { -// Helper alias used to put the optional U/Vt parameters in a non-deduced context -// so template argument deduction picks up ValueTypeT from earlier parameters and -// implicit conversion from `device_matrix_view` to `std::optional<...>` works. -template -struct nondeduced { - using type = T; -}; -template -using nondeduced_optional_matrix_view_t = std::optional::type>; -} // namespace detail - /** * @defgroup sparse_randomized_svd Sparse Randomized SVD * @{ @@ -48,7 +37,7 @@ using nondeduced_optional_matrix_view_t = std::optional:: * where `X`, `Y`, `Z` are `raft::device_matrix_view<..., uint32_t, raft::col_major>`. * This is intentionally a narrow, SVD-specific operator: only the two matrix products * above are required, and no other operations (addition, scaling, inverse, eigensolves, - * etc.) are assumed or supported — it is not a general-purpose linear operator like + * etc.) are assumed or supported; it is not a general-purpose linear operator like * `scipy.sparse.linalg.LinearOperator`. * * @tparam ValueTypeT Data type (float or double) diff --git a/cpp/include/raft/sparse/solver/solver_types.hpp b/cpp/include/raft/sparse/solver/solver_types.hpp new file mode 100644 index 0000000000..6f5f04f567 --- /dev/null +++ b/cpp/include/raft/sparse/solver/solver_types.hpp @@ -0,0 +1,89 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include + +namespace raft::sparse::solver { + +/** + * @addtogroup sparse_randomized_svd + * @{ + */ + +/** + * @brief Configuration parameters for the sparse randomized SVD solver + * + * @tparam ValueTypeT Data type for values (float or double) + */ +template +struct sparse_svd_config { + /** @brief Number of singular values/vectors to compute. Must be set by the user. */ + int n_components = 0; + + /** @brief Number of extra random vectors for better approximation. + * Total subspace dimension is n_components + n_oversamples. */ + int n_oversamples = 10; + + /** @brief Number of power iteration passes. More iterations improve accuracy + * for matrices with slowly decaying singular values. */ + int n_power_iters = 2; + + /** @brief Random seed for reproducibility */ + std::optional seed = std::nullopt; +}; + +/** @} */ + +/** + * @addtogroup sparse_lanczos_svd + * @{ + */ + +/** + * @brief Configuration parameters for the sparse Lanczos SVD solver + * + * @tparam ValueTypeT Data type for values (float or double) + */ +template +struct sparse_lanczos_svd_config { + /** @brief Number of singular values/vectors to compute. Must be set by the user. + * @note Must satisfy 0 < n_components < min(m, n), where (m, n) is the matrix shape. */ + int n_components = 0; + + /** + * @brief Number of Lanczos vectors per restart. + * + * If zero, a matrix-shape dependent default is selected. Larger values can improve + * convergence margin and orthogonality for clustered spectra, but increase sparse + * matrix-vector work and memory use. + * + * @note When nonzero, the value is clamped to [n_components, min(m, n) - 1]. + */ + int ncv = 0; + + /** @brief Convergence tolerance for Lanczos Ritz residual estimates. */ + ValueTypeT tolerance = ValueTypeT(1e-4); + + /** @brief Maximum number of restart iterations before reporting non-convergence. */ + int max_iterations = 100; + + /** @brief Random seed for reproducibility. */ + std::optional seed = std::nullopt; + + /** + * @brief Use launch-heavy MGS2 instead of the default GPU-efficient CGS2 reorthogonalization. + * + * MGS2 is kept as an alternate path for difficult spectra; CGS2 is the default used in + * normal GPU workloads. + */ + bool use_mgs2_orthogonalization = false; +}; + +/** @} */ + +} // namespace raft::sparse::solver diff --git a/cpp/include/raft/sparse/solver/svds_config.hpp b/cpp/include/raft/sparse/solver/svds_config.hpp deleted file mode 100644 index bbb56a7db6..0000000000 --- a/cpp/include/raft/sparse/solver/svds_config.hpp +++ /dev/null @@ -1,42 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -#pragma once - -#include -#include - -namespace raft::sparse::solver { - -/** - * @addtogroup sparse_randomized_svd - * @{ - */ - -/** - * @brief Configuration parameters for the sparse randomized SVD solver - * - * @tparam ValueTypeT Data type for values (float or double) - */ -template -struct sparse_svd_config { - /** @brief Number of singular values/vectors to compute. Must be set by the user. */ - int n_components = 0; - - /** @brief Number of extra random vectors for better approximation. - * Total subspace dimension is n_components + n_oversamples. */ - int n_oversamples = 10; - - /** @brief Number of power iteration passes. More iterations improve accuracy - * for matrices with slowly decaying singular values. */ - int n_power_iters = 2; - - /** @brief Random seed for reproducibility */ - std::optional seed = std::nullopt; -}; - -/** @} */ - -} // namespace raft::sparse::solver diff --git a/cpp/include/raft_runtime/solver/lanczos_svds.hpp b/cpp/include/raft_runtime/solver/lanczos_svds.hpp new file mode 100644 index 0000000000..73300dfe78 --- /dev/null +++ b/cpp/include/raft_runtime/solver/lanczos_svds.hpp @@ -0,0 +1,44 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace raft::runtime::solver { + +/** + * @defgroup sparse_lanczos_svd_runtime Sparse Lanczos SVD Runtime API + * @{ + */ + +#define LANCZOS_SVD_FUNC_DECL(ValueType) \ + RAFT_EXPORT void sparse_lanczos_svd( \ + const raft::resources& handle, \ + const raft::sparse::solver::sparse_lanczos_svd_config& config, \ + raft::device_vector_view indptr, \ + raft::device_vector_view indices, \ + raft::device_vector_view data, \ + int n_rows, \ + int n_cols, \ + int nnz, \ + raft::device_vector_view singular_values, \ + std::optional> U, \ + std::optional> Vt) + +LANCZOS_SVD_FUNC_DECL(float); +LANCZOS_SVD_FUNC_DECL(double); + +#undef LANCZOS_SVD_FUNC_DECL + +/** @} */ + +} // namespace raft::runtime::solver diff --git a/cpp/include/raft_runtime/solver/randomized_svds.hpp b/cpp/include/raft_runtime/solver/randomized_svds.hpp index 07bc3ec33a..c6f6df066a 100644 --- a/cpp/include/raft_runtime/solver/randomized_svds.hpp +++ b/cpp/include/raft_runtime/solver/randomized_svds.hpp @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/src/raft_runtime/solver/lanczos_svds.cuh b/cpp/src/raft_runtime/solver/lanczos_svds.cuh new file mode 100644 index 0000000000..d948dd1c22 --- /dev/null +++ b/cpp/src/raft_runtime/solver/lanczos_svds.cuh @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include + +#include + +#define FUNC_DEF(ValueType) \ + void sparse_lanczos_svd( \ + const raft::resources& handle, \ + const raft::sparse::solver::sparse_lanczos_svd_config& config, \ + raft::device_vector_view indptr, \ + raft::device_vector_view indices, \ + raft::device_vector_view data, \ + int n_rows, \ + int n_cols, \ + int nnz, \ + raft::device_vector_view singular_values, \ + std::optional> U, \ + std::optional> Vt) \ + { \ + auto csr_structure = raft::make_device_compressed_structure_view( \ + indptr.data_handle(), indices.data_handle(), n_rows, n_cols, nnz); \ + auto csr_matrix = raft::make_device_csr_matrix_view( \ + data.data_handle(), csr_structure); \ + raft::sparse::solver::sparse_lanczos_svd(handle, config, csr_matrix, singular_values, U, Vt); \ + } diff --git a/cpp/src/raft_runtime/solver/lanczos_svds_double.cu b/cpp/src/raft_runtime/solver/lanczos_svds_double.cu new file mode 100644 index 0000000000..10344b473c --- /dev/null +++ b/cpp/src/raft_runtime/solver/lanczos_svds_double.cu @@ -0,0 +1,14 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "lanczos_svds.cuh" + +namespace raft::runtime::solver { + +FUNC_DEF(double); + +} // namespace raft::runtime::solver + +#undef FUNC_DEF diff --git a/cpp/src/raft_runtime/solver/lanczos_svds_float.cu b/cpp/src/raft_runtime/solver/lanczos_svds_float.cu new file mode 100644 index 0000000000..850da77d9f --- /dev/null +++ b/cpp/src/raft_runtime/solver/lanczos_svds_float.cu @@ -0,0 +1,14 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "lanczos_svds.cuh" + +namespace raft::runtime::solver { + +FUNC_DEF(float); + +} // namespace raft::runtime::solver + +#undef FUNC_DEF diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 41b6e7ee3c..ab80f3e415 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -241,8 +241,17 @@ if(BUILD_TESTS) ) ConfigureTest( - NAME SOLVERS_TEST PATH linalg/eigen_solvers.cu lap/lap.cu sparse/mst.cu - sparse/solver/lanczos.cu sparse/solver/randomized_svds.cu LIB EXPLICIT_INSTANTIATE_ONLY + NAME + SOLVERS_TEST + PATH + linalg/eigen_solvers.cu + lap/lap.cu + sparse/mst.cu + sparse/solver/lanczos.cu + sparse/solver/lanczos_svds.cu + sparse/solver/randomized_svds.cu + LIB + EXPLICIT_INSTANTIATE_ONLY ) ConfigureTest( diff --git a/cpp/tests/sparse/solver/lanczos_svds.cu b/cpp/tests/sparse/solver/lanczos_svds.cu new file mode 100644 index 0000000000..052de68424 --- /dev/null +++ b/cpp/tests/sparse/solver/lanczos_svds.cu @@ -0,0 +1,544 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "../../test_utils.cuh" + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +namespace raft::sparse::solver { + +template +void check_orthonormal_columns(std::vector const& Q, int n_rows, int n_cols, double tol) +{ + for (int i = 0; i < n_cols; ++i) { + for (int j = 0; j < n_cols; ++j) { + double dot = 0; + for (int row = 0; row < n_rows; ++row) { + dot += static_cast(Q[static_cast(i) * n_rows + row]) * + static_cast(Q[static_cast(j) * n_rows + row]); + } + double expected = (i == j) ? 1.0 : 0.0; + ASSERT_NEAR(dot, expected, tol); + } + } +} + +template +void check_orthonormal_vt_rows(std::vector const& Vt, int k, int n, double tol) +{ + for (int i = 0; i < k; ++i) { + for (int j = 0; j < k; ++j) { + double dot = 0; + for (int col = 0; col < n; ++col) { + dot += static_cast(Vt[static_cast(col) * k + i]) * + static_cast(Vt[static_cast(col) * k + j]); + } + double expected = (i == j) ? 1.0 : 0.0; + ASSERT_NEAR(dot, expected, tol); + } + } +} + +template +class LanczosSvdsTest : public ::testing::Test { + public: + LanczosSvdsTest() + : stream(resource::get_cuda_stream(handle)), + m(12), + n(8), + k(3), + nnz(8), + d_indptr(raft::make_device_vector(handle, m + 1)), + d_indices(raft::make_device_vector(handle, nnz)), + d_values(raft::make_device_vector(handle, nnz)) + { + } + + protected: + void SetUp() override + { + h_indptr = {0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8}; + h_indices.resize(nnz); + h_values.resize(nnz); + for (int i = 0; i < nnz; ++i) { + h_indices[i] = i; + h_values[i] = ValueType(8 - i); + } + + raft::update_device(d_indptr.data_handle(), h_indptr.data(), m + 1, stream); + raft::update_device(d_indices.data_handle(), h_indices.data(), nnz, stream); + raft::update_device(d_values.data_handle(), h_values.data(), nnz, stream); + } + + void Run(bool use_mgs2 = false) + { + auto csr_structure = raft::make_device_compressed_structure_view( + d_indptr.data_handle(), d_indices.data_handle(), m, n, nnz); + auto csr_matrix = raft::make_device_csr_matrix_view( + d_values.data_handle(), csr_structure); + + sparse_lanczos_svd_config config; + config.n_components = k; + config.ncv = 6; + config.tolerance = ValueType(1e-6); + config.max_iterations = 20; + config.seed = 42; + config.use_mgs2_orthogonalization = use_mgs2; + + auto S = raft::make_device_vector(handle, k); + auto U = raft::make_device_matrix(handle, m, k); + auto Vt = raft::make_device_matrix(handle, k, n); + + sparse_lanczos_svd(handle, config, csr_matrix, S.view(), U.view(), Vt.view()); + + std::vector h_S(k); + std::vector h_U(static_cast(m) * k); + std::vector h_Vt(static_cast(k) * n); + raft::update_host(h_S.data(), S.data_handle(), k, stream); + raft::update_host(h_U.data(), U.data_handle(), h_U.size(), stream); + raft::update_host(h_Vt.data(), Vt.data_handle(), h_Vt.size(), stream); + resource::sync_stream(handle, stream); + + ValueType tol = std::is_same_v ? ValueType(2e-3) : ValueType(1e-8); + double residual_tol = std::is_same_v ? 2e-2 : 1e-5; + for (int i = 0; i < k; ++i) { + ASSERT_NEAR(h_S[i], ValueType(8 - i), tol); + } + + check_orthonormal_columns(h_U, m, k, tol); + check_orthonormal_vt_rows(h_Vt, k, n, tol); + + for (int comp = 0; comp < k; ++comp) { + double left_residual_sq = 0; + double right_residual_sq = 0; + for (int row = 0; row < m; ++row) { + ValueType av = + row < n ? h_values[row] * h_Vt[static_cast(row) * k + comp] : ValueType(0); + ValueType su = h_S[comp] * h_U[static_cast(comp) * m + row]; + double diff = static_cast(av - su); + left_residual_sq += diff * diff; + } + for (int col = 0; col < n; ++col) { + ValueType atu = h_values[col] * h_U[static_cast(comp) * m + col]; + ValueType sv = h_S[comp] * h_Vt[static_cast(col) * k + comp]; + double diff = static_cast(atu - sv); + right_residual_sq += diff * diff; + } + ASSERT_LT(std::sqrt(left_residual_sq), residual_tol); + ASSERT_LT(std::sqrt(right_residual_sq), residual_tol); + } + } + + raft::resources handle; + cudaStream_t stream; + int m, n, k, nnz; + std::vector h_indptr; + std::vector h_indices; + std::vector h_values; + raft::device_vector d_indptr; + raft::device_vector d_indices; + raft::device_vector d_values; +}; + +using LanczosSvdsTestF = LanczosSvdsTest; +TEST_F(LanczosSvdsTestF, DiagonalSpectrum) { Run(); } +TEST_F(LanczosSvdsTestF, DiagonalSpectrumMgs2) { Run(true); } + +using LanczosSvdsTestD = LanczosSvdsTest; +TEST_F(LanczosSvdsTestD, DiagonalSpectrum) { Run(); } + +template +class LanczosClusteredSpectrumTest : public ::testing::Test { + public: + LanczosClusteredSpectrumTest() + : stream(resource::get_cuda_stream(handle)), + m(48), + n(32), + k(8), + nnz(32), + d_indptr(raft::make_device_vector(handle, m + 1)), + d_indices(raft::make_device_vector(handle, nnz)), + d_values(raft::make_device_vector(handle, nnz)) + { + } + + protected: + void SetUp() override + { + h_indptr.resize(m + 1); + h_indices.resize(nnz); + h_values.resize(nnz); + for (int i = 0; i <= m; ++i) { + h_indptr[i] = std::min(i, nnz); + } + for (int i = 0; i < nnz; ++i) { + h_indices[i] = i; + } + + std::vector singular_values = {10.0, 9.999, 9.998, 9.5, 9.499, 9.0, 8.999, 8.5}; + for (int i = 0; i < nnz; ++i) { + h_values[i] = i < static_cast(singular_values.size()) + ? static_cast(singular_values[i]) + : static_cast(1.0 / (i + 1)); + } + + raft::update_device(d_indptr.data_handle(), h_indptr.data(), m + 1, stream); + raft::update_device(d_indices.data_handle(), h_indices.data(), nnz, stream); + raft::update_device(d_values.data_handle(), h_values.data(), nnz, stream); + } + + void Run(bool use_mgs2 = false) + { + auto csr_structure = raft::make_device_compressed_structure_view( + d_indptr.data_handle(), d_indices.data_handle(), m, n, nnz); + auto csr_matrix = raft::make_device_csr_matrix_view( + d_values.data_handle(), csr_structure); + + sparse_lanczos_svd_config config; + config.n_components = k; + config.ncv = 18; + config.tolerance = std::is_same_v ? ValueType(1e-5) : ValueType(1e-10); + config.max_iterations = 80; + config.seed = 123; + config.use_mgs2_orthogonalization = use_mgs2; + + auto S = raft::make_device_vector(handle, k); + auto U = raft::make_device_matrix(handle, m, k); + auto Vt = raft::make_device_matrix(handle, k, n); + + sparse_lanczos_svd(handle, config, csr_matrix, S.view(), U.view(), Vt.view()); + + std::vector h_S(k); + std::vector h_U(static_cast(m) * k); + std::vector h_Vt(static_cast(k) * n); + raft::update_host(h_S.data(), S.data_handle(), k, stream); + raft::update_host(h_U.data(), U.data_handle(), h_U.size(), stream); + raft::update_host(h_Vt.data(), Vt.data_handle(), h_Vt.size(), stream); + resource::sync_stream(handle, stream); + + double sv_tol = std::is_same_v ? 1e-3 : 1e-8; + double orthog_tol = std::is_same_v ? 2e-4 : 1e-10; + for (int i = 0; i < k; ++i) { + ASSERT_NEAR(static_cast(h_S[i]), static_cast(h_values[i]), sv_tol); + } + check_orthonormal_columns(h_U, m, k, orthog_tol); + check_orthonormal_vt_rows(h_Vt, k, n, orthog_tol); + } + + raft::resources handle; + cudaStream_t stream; + int m, n, k, nnz; + std::vector h_indptr; + std::vector h_indices; + std::vector h_values; + raft::device_vector d_indptr; + raft::device_vector d_indices; + raft::device_vector d_values; +}; + +using LanczosClusteredSpectrumTestF = LanczosClusteredSpectrumTest; +TEST_F(LanczosClusteredSpectrumTestF, Orthonormality) { Run(); } +TEST_F(LanczosClusteredSpectrumTestF, OrthonormalityMgs2) { Run(true); } + +using LanczosClusteredSpectrumTestD = LanczosClusteredSpectrumTest; +TEST_F(LanczosClusteredSpectrumTestD, Orthonormality) { Run(); } + +template +void run_diagonal_hardening_case(int m, + int n, + std::vector const& diagonal, + int k, + int ncv, + int max_iterations, + ValueType tolerance, + double singular_value_tol, + double orthog_tol, + double residual_tol, + bool use_mgs2 = false) +{ + raft::resources handle; + auto stream = resource::get_cuda_stream(handle); + + std::vector h_indptr(m + 1); + std::vector h_indices; + std::vector h_values; + int min_dim = std::min(m, n); + for (int row = 0; row < m; ++row) { + h_indptr[row] = static_cast(h_values.size()); + if (row < min_dim && row < static_cast(diagonal.size()) && diagonal[row] != 0.0) { + h_indices.push_back(row); + h_values.push_back(static_cast(diagonal[row])); + } + } + h_indptr[m] = static_cast(h_values.size()); + + auto nnz = static_cast(h_values.size()); + auto d_indptr = raft::make_device_vector(handle, m + 1); + auto d_indices = raft::make_device_vector(handle, nnz); + auto d_values = raft::make_device_vector(handle, nnz); + raft::update_device(d_indptr.data_handle(), h_indptr.data(), m + 1, stream); + raft::update_device(d_indices.data_handle(), h_indices.data(), nnz, stream); + raft::update_device(d_values.data_handle(), h_values.data(), nnz, stream); + + auto csr_structure = raft::make_device_compressed_structure_view( + d_indptr.data_handle(), d_indices.data_handle(), m, n, nnz); + auto csr_matrix = raft::make_device_csr_matrix_view( + d_values.data_handle(), csr_structure); + + sparse_lanczos_svd_config config; + config.n_components = k; + config.ncv = ncv; + config.tolerance = tolerance; + config.max_iterations = max_iterations; + config.seed = 2026; + config.use_mgs2_orthogonalization = use_mgs2; + + auto S = raft::make_device_vector(handle, k); + auto U = raft::make_device_matrix(handle, m, k); + auto Vt = raft::make_device_matrix(handle, k, n); + + sparse_lanczos_svd(handle, config, csr_matrix, S.view(), U.view(), Vt.view()); + + std::vector h_S(k); + std::vector h_U(static_cast(m) * k); + std::vector h_Vt(static_cast(k) * n); + raft::update_host(h_S.data(), S.data_handle(), k, stream); + raft::update_host(h_U.data(), U.data_handle(), h_U.size(), stream); + raft::update_host(h_Vt.data(), Vt.data_handle(), h_Vt.size(), stream); + resource::sync_stream(handle, stream); + + for (int i = 0; i < k; ++i) { + ASSERT_NEAR(static_cast(h_S[i]), diagonal[i], singular_value_tol); + } + check_orthonormal_columns(h_U, m, k, orthog_tol); + check_orthonormal_vt_rows(h_Vt, k, n, orthog_tol); + + auto diag_at = [&](int i) { + return (i < min_dim && i < static_cast(diagonal.size())) + ? static_cast(diagonal[i]) + : ValueType(0); + }; + + for (int comp = 0; comp < k; ++comp) { + double left_residual_sq = 0; + double right_residual_sq = 0; + for (int row = 0; row < m; ++row) { + ValueType av = + row < n ? diag_at(row) * h_Vt[static_cast(row) * k + comp] : ValueType(0); + ValueType su = h_S[comp] * h_U[static_cast(comp) * m + row]; + double diff = static_cast(av - su); + left_residual_sq += diff * diff; + } + for (int col = 0; col < n; ++col) { + ValueType atu = + col < m ? diag_at(col) * h_U[static_cast(comp) * m + col] : ValueType(0); + ValueType sv = h_S[comp] * h_Vt[static_cast(col) * k + comp]; + double diff = static_cast(atu - sv); + right_residual_sq += diff * diff; + } + ASSERT_LT(std::sqrt(left_residual_sq), residual_tol); + ASSERT_LT(std::sqrt(right_residual_sq), residual_tol); + } +} + +TEST(LanczosHardeningF, RepeatedSpectrum) +{ + run_diagonal_hardening_case(64, + 40, + {10.0, 10.0, 10.0, 9.0, 9.0, 8.0, 7.0, 6.0, 1.0, 0.5}, + 6, + 18, + 120, + 1e-5f, + 2e-3, + 2e-3, + 2e-2); +} + +TEST(LanczosHardeningF, RankDeficientTall) +{ + run_diagonal_hardening_case( + 96, 28, {12.0, 8.0, 6.0, 3.0, 1.0, 0.0, 0.0, 0.0}, 4, 12, 80, 1e-5f, 2e-3, 2e-3, 2e-2); +} + +TEST(LanczosHardeningF, WideMatrix) +{ + run_diagonal_hardening_case( + 24, 96, {12.0, 11.5, 8.0, 4.0, 2.0, 1.0, 0.5}, 4, 12, 80, 1e-5f, 2e-3, 2e-3, 2e-2); +} + +TEST(LanczosHardeningD, RankDeficientTall) +{ + run_diagonal_hardening_case( + 96, 28, {12.0, 8.0, 6.0, 3.0, 1.0, 0.0, 0.0, 0.0}, 4, 12, 80, 1e-10, 1e-8, 1e-10, 1e-5); +} + +TEST(LanczosHardeningF, ReportsNonConvergence) +{ + EXPECT_THROW(run_diagonal_hardening_case( + 64, 40, {10.0, 9.0, 8.0, 7.0, 6.0, 5.0}, 4, 6, 1, 0.0f, 2e-3, 2e-3, 2e-2), + raft::logic_error); +} + +template +void check_sparse_residuals(std::vector const& indptr, + std::vector const& indices, + std::vector const& values, + std::vector const& S, + std::vector const& U, + std::vector const& Vt, + int m, + int n, + int k, + double residual_tol) +{ + for (int comp = 0; comp < k; ++comp) { + std::vector av(m, 0.0); + std::vector atu(n, 0.0); + for (int row = 0; row < m; ++row) { + for (int nz = indptr[row]; nz < indptr[row + 1]; ++nz) { + int col = indices[nz]; + double value = static_cast(values[nz]); + av[row] += value * static_cast(Vt[static_cast(col) * k + comp]); + atu[col] += value * static_cast(U[static_cast(comp) * m + row]); + } + } + + double left_residual_sq = 0.0; + double right_residual_sq = 0.0; + for (int row = 0; row < m; ++row) { + double su = static_cast(S[comp]) * + static_cast(U[static_cast(comp) * m + row]); + double diff = av[row] - su; + left_residual_sq += diff * diff; + } + for (int col = 0; col < n; ++col) { + double sv = static_cast(S[comp]) * + static_cast(Vt[static_cast(col) * k + comp]); + double diff = atu[col] - sv; + right_residual_sq += diff * diff; + } + ASSERT_LT(std::sqrt(left_residual_sq), residual_tol); + ASSERT_LT(std::sqrt(right_residual_sq), residual_tol); + } +} + +template +void run_rotated_block_hardening_case(bool use_mgs2 = false) +{ + constexpr int m = 64; + constexpr int n = 48; + constexpr int k = 8; + std::vector expected_s = {10.0, 9.9995, 9.999, 9.5, 9.4995, 9.0, 8.5, 8.0}; + std::vector all_s = expected_s; + all_s.push_back(0.75); + all_s.push_back(0.25); + + std::vector h_indptr(m + 1); + std::vector h_indices; + std::vector h_values; + + auto add_block = [&](int row0, int col0, double s0, double s1, double theta, double phi) { + double cu = std::cos(theta); + double su = std::sin(theta); + double cv = std::cos(phi); + double sv = std::sin(phi); + double a00 = cu * s0 * cv + su * s1 * sv; + double a01 = cu * s0 * sv - su * s1 * cv; + double a10 = su * s0 * cv - cu * s1 * sv; + double a11 = su * s0 * sv + cu * s1 * cv; + h_indices.push_back(col0); + h_values.push_back(static_cast(a00)); + h_indices.push_back(col0 + 1); + h_values.push_back(static_cast(a01)); + h_indptr[row0 + 1] = static_cast(h_values.size()); + h_indices.push_back(col0); + h_values.push_back(static_cast(a10)); + h_indices.push_back(col0 + 1); + h_values.push_back(static_cast(a11)); + h_indptr[row0 + 2] = static_cast(h_values.size()); + }; + + for (int block = 0; block < static_cast(all_s.size() / 2); ++block) { + int row0 = 2 * block; + int col0 = 2 * block; + h_indptr[row0] = static_cast(h_values.size()); + add_block( + row0, col0, all_s[2 * block], all_s[2 * block + 1], 0.17 + 0.11 * block, 0.31 + 0.07 * block); + } + for (int row = static_cast(all_s.size()); row <= m; ++row) { + h_indptr[row] = static_cast(h_values.size()); + } + + raft::resources handle; + auto stream = resource::get_cuda_stream(handle); + auto nnz = static_cast(h_values.size()); + auto d_indptr = raft::make_device_vector(handle, m + 1); + auto d_indices = raft::make_device_vector(handle, nnz); + auto d_values = raft::make_device_vector(handle, nnz); + raft::update_device(d_indptr.data_handle(), h_indptr.data(), m + 1, stream); + raft::update_device(d_indices.data_handle(), h_indices.data(), nnz, stream); + raft::update_device(d_values.data_handle(), h_values.data(), nnz, stream); + + auto csr_structure = raft::make_device_compressed_structure_view( + d_indptr.data_handle(), d_indices.data_handle(), m, n, nnz); + auto csr_matrix = raft::make_device_csr_matrix_view( + d_values.data_handle(), csr_structure); + + sparse_lanczos_svd_config config; + config.n_components = k; + config.ncv = 20; + config.tolerance = std::is_same_v ? ValueType(1e-5) : ValueType(1e-10); + config.max_iterations = 120; + config.seed = 2027; + config.use_mgs2_orthogonalization = use_mgs2; + + auto S = raft::make_device_vector(handle, k); + auto U = raft::make_device_matrix(handle, m, k); + auto Vt = raft::make_device_matrix(handle, k, n); + sparse_lanczos_svd(handle, config, csr_matrix, S.view(), U.view(), Vt.view()); + + std::vector h_S(k); + std::vector h_U(static_cast(m) * k); + std::vector h_Vt(static_cast(k) * n); + raft::update_host(h_S.data(), S.data_handle(), k, stream); + raft::update_host(h_U.data(), U.data_handle(), h_U.size(), stream); + raft::update_host(h_Vt.data(), Vt.data_handle(), h_Vt.size(), stream); + resource::sync_stream(handle, stream); + + double sv_tol = std::is_same_v ? 2e-3 : 1e-8; + double orthog_tol = std::is_same_v ? 2e-3 : 1e-10; + double residual_tol = std::is_same_v ? 2e-2 : 1e-5; + for (int i = 0; i < k; ++i) { + ASSERT_NEAR(static_cast(h_S[i]), expected_s[i], sv_tol); + } + check_orthonormal_columns(h_U, m, k, orthog_tol); + check_orthonormal_vt_rows(h_Vt, k, n, orthog_tol); + check_sparse_residuals(h_indptr, h_indices, h_values, h_S, h_U, h_Vt, m, n, k, residual_tol); +} + +TEST(LanczosHardeningF, RotatedClusteredSpectrum) { run_rotated_block_hardening_case(); } + +TEST(LanczosHardeningF, RotatedClusteredSpectrumMgs2) +{ + run_rotated_block_hardening_case(true); +} + +TEST(LanczosHardeningD, RotatedClusteredSpectrum) { run_rotated_block_hardening_case(); } + +} // namespace raft::sparse::solver diff --git a/docs/source/cpp_api/sparse_solver.rst b/docs/source/cpp_api/sparse_solver.rst index e8abc37719..1285d06795 100644 --- a/docs/source/cpp_api/sparse_solver.rst +++ b/docs/source/cpp_api/sparse_solver.rst @@ -13,3 +13,11 @@ Sparse Randomized SVD :project: RAFT :members: :content-only: + +Sparse Lanczos SVD +------------------ + +.. doxygengroup:: sparse_lanczos_svd + :project: RAFT + :members: + :content-only: diff --git a/python/pylibraft/benchmarks/sparse_svds.py b/python/pylibraft/benchmarks/sparse_svds.py new file mode 100644 index 0000000000..f065df2990 --- /dev/null +++ b/python/pylibraft/benchmarks/sparse_svds.py @@ -0,0 +1,477 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import argparse +import importlib.util +import math +import time +from pathlib import Path + +import cupy as cp +import cupyx.scipy.sparse as cupy_sparse +import cupyx.scipy.sparse.linalg as cupy_sparse_linalg +import numpy as np +import scipy.sparse +import scipy.sparse.linalg +from anndata import read_h5ad + +CSR_FILE_MAGIC = 0x3152534354464152 + + +def _repo_root() -> Path: + return Path(__file__).resolve().parents[3] + + +def _default_data_path() -> Path: + root = _repo_root() + candidates = [ + root.parent / "rapids-singlecell-notebooks/h5/pca.h5ad", + root.parent / "rapids_singlecell/data/pbmc3k_raw.h5ad", + root.parent / "scanpy/src/scanpy/datasets/10x_pbmc68k_reduced.h5ad", + ] + for path in candidates: + if path.exists(): + return path + return candidates[0] + + +def _import_from_path(module_name: str, path: Path): + spec = importlib.util.spec_from_file_location(module_name, path) + if spec is None or spec.loader is None: + raise ImportError(f"Could not load {module_name} from {path}") + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +def _load_rsc_modules(rsc_source: Path): + sparse_pca = rsc_source / "src/rapids_singlecell/preprocessing/_sparse_pca" + lanczos = _import_from_path( + "rsc_svd_lanczos", sparse_pca / "_svd_lanczos.py" + ) + randomized = _import_from_path( + "rsc_block_lanczos", sparse_pca / "_block_lanczos.py" + ) + return lanczos.lanczos_svd, randomized.randomized_svd + + +def _load_raft_svds(): + try: + from pylibraft.sparse.linalg import svds + except Exception as exc: # noqa: BLE001 + raise RuntimeError( + f"Could not import pylibraft.sparse.linalg.svds: {exc}" + ) from exc + return svds + + +def _read_h5ad_csr(path: Path, *, layer: str | None, dtype: np.dtype): + adata = read_h5ad(path) + X = adata.layers[layer] if layer is not None else adata.X + if scipy.sparse.issparse(X): + X = X.tocsr() + else: + X = scipy.sparse.csr_matrix(X) + X.sort_indices() + if X.dtype != dtype: + X = X.astype(dtype) + return X + + +def _slice_matrix(X, *, max_rows: int | None, max_cols: int | None): + if max_rows is not None: + X = X[:max_rows, :] + if max_cols is not None: + X = X[:, :max_cols] + X = X.tocsr() + X.sort_indices() + return X + + +def _to_cupy_csr(X): + return cupy_sparse.csr_matrix( + (cp.asarray(X.data), cp.asarray(X.indices), cp.asarray(X.indptr)), + shape=X.shape, + ) + + +def _export_csr_bin(X, path: Path): + if X.dtype != np.float32: + raise TypeError( + f"CSR binary export currently supports float32 values, got {X.dtype}" + ) + dtype_code = 1 + + int32_max = np.iinfo(np.int32).max + if X.nnz > int32_max: + raise OverflowError(f"nnz={X.nnz} exceeds int32 max") + if X.shape[1] > int32_max: + raise OverflowError(f"n_cols={X.shape[1]} exceeds int32 max") + + path.parent.mkdir(parents=True, exist_ok=True) + header_dtype = np.dtype( + [ + ("magic", "9.3f} ms " + f"mean={row['mean_ms']:>9.3f} ms min={row['min_ms']:>9.3f} ms " + f"s0={row['s0']:.6g} " + f"u_orth={row['u_orth']:.3e} vt_orth={row['vt_orth']:.3e} " + f"residual={row['residual']:.3e} s[:5]={s_head}" + ) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Benchmark sparse SVD solvers on CSR/H5AD inputs." + ) + parser.add_argument("--data", type=Path, default=_default_data_path()) + parser.add_argument("--layer", default=None) + parser.add_argument( + "--rsc-source", + type=Path, + default=_repo_root().parent / "rapids_singlecell", + ) + parser.add_argument("--k", type=int, default=50) + parser.add_argument("--ncv", type=int, default=None) + parser.add_argument("--n-oversamples", type=int, default=10) + parser.add_argument("--n-power-iters", type=int, default=2) + parser.add_argument("--maxiter", type=int, default=100) + parser.add_argument("--tol", type=float, default=1e-4) + parser.add_argument("--seed", type=int, default=1234) + parser.add_argument( + "--dtype", choices=("float32", "float64"), default="float32" + ) + parser.add_argument("--max-rows", type=int, default=None) + parser.add_argument("--max-cols", type=int, default=None) + parser.add_argument("--export-csr-bin", type=Path, default=None) + parser.add_argument("--export-only", action="store_true") + parser.add_argument("--warmups", type=int, default=1) + parser.add_argument("--repeats", type=int, default=3) + parser.add_argument("--metrics", action="store_true") + parser.add_argument( + "--solvers", + default="raft_lanczos,raft_randomized", + help="Comma-separated subset of raft_lanczos, raft_lanczos_mgs2, " + "raft_randomized, rsc_lanczos, rsc_randomized, cupy_svds, " + "scipy_propack, scipy_lobpcg.", + ) + return parser.parse_args() + + +def main(): + args = parse_args() + dtype = np.dtype(args.dtype) + ncv = args.ncv + selected = { + name.strip() for name in args.solvers.split(",") if name.strip() + } + + X = _read_h5ad_csr(args.data, layer=args.layer, dtype=dtype) + X = _slice_matrix(X, max_rows=args.max_rows, max_cols=args.max_cols) + if args.export_csr_bin is not None: + _export_csr_bin(X, args.export_csr_bin) + print( + f"exported CSR binary to {args.export_csr_bin} " + f"shape={X.shape} nnz={X.nnz} dtype={X.dtype}" + ) + if args.export_only: + return + + needs_gpu = any( + name == "cupy_svds" + or name.startswith("raft_") + or name.startswith("rsc_") + for name in selected + ) + A = _to_cupy_csr(X) if needs_gpu else None + + print( + f"data={args.data} shape={X.shape} nnz={X.nnz} density={X.nnz / (X.shape[0] * X.shape[1]):.6g} " + f"dtype={X.dtype} k={args.k} ncv={ncv if ncv is not None else 'default'} repeats={args.repeats}" + ) + + needs_raft = any(name.startswith("raft_") for name in selected) + needs_rsc = any(name.startswith("rsc_") for name in selected) + + raft_svds = _load_raft_svds() if needs_raft else None + rsc_lanczos, rsc_randomized = ( + _load_rsc_modules(args.rsc_source) if needs_rsc else (None, None) + ) + + gpu_solvers = {} + cpu_solvers = { + "scipy_propack": lambda: _scipy_svds(X, args, solver="propack"), + "scipy_lobpcg": lambda: _scipy_svds(X, args, solver="lobpcg"), + } + if A is not None: + gpu_solvers["cupy_svds"] = lambda: _cupy_svds(A, args, ncv) + if raft_svds is not None: + gpu_solvers["raft_lanczos"] = lambda: raft_svds( + A, + k=args.k, + solver="lanczos", + ncv=ncv, + tol=args.tol, + maxiter=args.maxiter, + seed=args.seed, + orthogonalization="cgs2", + ) + gpu_solvers["raft_lanczos_mgs2"] = lambda: raft_svds( + A, + k=args.k, + solver="lanczos", + ncv=ncv, + tol=args.tol, + maxiter=args.maxiter, + seed=args.seed, + orthogonalization="mgs2", + ) + gpu_solvers["raft_randomized"] = lambda: raft_svds( + A, + k=args.k, + solver="randomized", + n_oversamples=args.n_oversamples, + n_power_iters=args.n_power_iters, + seed=args.seed, + ) + + if rsc_lanczos is not None: + gpu_solvers["rsc_lanczos"] = lambda: rsc_lanczos( + A, + k=args.k, + ncv=ncv, + tol=args.tol, + max_iter=args.maxiter, + random_state=args.seed, + refine_results=True, + ) + gpu_solvers["rsc_randomized"] = lambda: rsc_randomized( + A, + k=args.k, + n_oversamples=args.n_oversamples, + n_iter=args.n_power_iters, + random_state=args.seed, + ) + + solvers = {**gpu_solvers, **cpu_solvers} + unknown = selected.difference(solvers) + if unknown: + raise ValueError(f"Unknown or unavailable solvers: {sorted(unknown)}") + + solver_order = [ + "cupy_svds", + "raft_lanczos", + "raft_lanczos_mgs2", + "raft_randomized", + "rsc_lanczos", + "rsc_randomized", + "scipy_propack", + "scipy_lobpcg", + ] + for name in [name for name in solver_order if name in selected]: + try: + if name in gpu_solvers: + row = _run_solver( + name, + gpu_solvers[name], + A, + args, + compute_metrics=args.metrics, + ) + else: + row = _run_solver_cpu( + name, + cpu_solvers[name], + X, + args, + compute_metrics=args.metrics, + ) + print(_format_row(row)) + except Exception as exc: # noqa: BLE001 + print(f"{name:<22} failed: {type(exc).__name__}: {exc}") + + +if __name__ == "__main__": + main() diff --git a/python/pylibraft/pylibraft/sparse/linalg/svds.pyx b/python/pylibraft/pylibraft/sparse/linalg/svds.pyx index b2bbca41e2..6463da9b17 100644 --- a/python/pylibraft/pylibraft/sparse/linalg/svds.pyx +++ b/python/pylibraft/pylibraft/sparse/linalg/svds.pyx @@ -15,6 +15,7 @@ import numpy as np from cython.operator cimport dereference as deref from libc.stdint cimport int64_t, uint32_t, uint64_t, uintptr_t +from libcpp cimport bool from pylibraft.common import Handle, cai_wrapper, device_ndarray from pylibraft.common.handle import auto_sync_handle @@ -31,7 +32,7 @@ from pylibraft.common.cpp.optional cimport optional from pylibraft.common.handle cimport device_resources -cdef extern from "raft/sparse/solver/svds_config.hpp" \ +cdef extern from "raft/sparse/solver/solver_types.hpp" \ namespace "raft::sparse::solver" nogil: cdef cppclass sparse_svd_config[ValueTypeT]: @@ -40,6 +41,14 @@ cdef extern from "raft/sparse/solver/svds_config.hpp" \ int n_power_iters optional[uint64_t] seed + cdef cppclass sparse_lanczos_svd_config[ValueTypeT]: + int n_components + int ncv + ValueTypeT tolerance + int max_iterations + optional[uint64_t] seed + bool use_mgs2_orthogonalization + cdef extern from "raft_runtime/solver/randomized_svds.hpp" \ namespace "raft::runtime::solver" nogil: @@ -69,12 +78,42 @@ cdef extern from "raft_runtime/solver/randomized_svds.hpp" \ optional[device_matrix_view[double, uint32_t, col_major]] Vt) except + +cdef extern from "raft_runtime/solver/lanczos_svds.hpp" \ + namespace "raft::runtime::solver" nogil: + + cdef void sparse_lanczos_svd_f \ + "raft::runtime::solver::sparse_lanczos_svd"( + const device_resources &handle, + const sparse_lanczos_svd_config[float] &config, + device_vector_view[int, uint32_t] indptr, + device_vector_view[int, uint32_t] indices, + device_vector_view[float, uint32_t] data, + int n_rows, int n_cols, int nnz, + device_vector_view[float, uint32_t] singular_values, + optional[device_matrix_view[float, uint32_t, col_major]] U, + optional[device_matrix_view[float, uint32_t, col_major]] Vt) except + + + cdef void sparse_lanczos_svd_d \ + "raft::runtime::solver::sparse_lanczos_svd"( + const device_resources &handle, + const sparse_lanczos_svd_config[double] &config, + device_vector_view[int, uint32_t] indptr, + device_vector_view[int, uint32_t] indices, + device_vector_view[double, uint32_t] data, + int n_rows, int n_cols, int nnz, + device_vector_view[double, uint32_t] singular_values, + optional[device_matrix_view[double, uint32_t, col_major]] U, + optional[device_matrix_view[double, uint32_t, col_major]] Vt) except + + + @auto_sync_handle def svds(A, k=6, n_oversamples=10, n_power_iters=2, - seed=None, return_singular_vectors=True, handle=None): + seed=None, return_singular_vectors=True, handle=None, + solver="randomized", ncv=None, tol=1e-4, maxiter=None, + orthogonalization="cgs2"): """ Compute the largest ``k`` singular values and corresponding singular - vectors of a sparse matrix using randomized SVD. + vectors of a sparse matrix. Computes the truncated SVD: ``A ~ U @ diag(S) @ Vt``. @@ -86,10 +125,11 @@ def svds(A, k=6, n_oversamples=10, n_power_iters=2, ``1 <= k < min(m, n)``. Default 6. n_oversamples (int): Number of extra random vectors for better approximation. Total subspace dimension is ``k + n_oversamples``. - Default 10. + Used only when ``solver="randomized"``. Default 10. n_power_iters (int): Number of power iteration passes. More iterations improve accuracy for matrices with slowly decaying - singular values. Default 2. + singular values. Used only when ``solver="randomized"``. + Default 2. seed (int or None): Random seed for reproducibility. If ``None``, a non-deterministic seed is used. return_singular_vectors (bool or {{"u", "vh"}}): Controls which @@ -101,9 +141,28 @@ def svds(A, k=6, n_oversamples=10, n_power_iters=2, - ``"u"``: skip ``Vt``, return ``(U, S, None)``. - ``"vh"``: skip ``U``, return ``(None, S, Vt)``. - Skipping a side avoids the corresponding output buffer and - final matrix multiplication. + Skipping a side avoids the corresponding output buffer. Depending + on the selected solver, some intermediate vector products may + still be required for numerical refinement. handle: RAFT resource handle. If ``None``, a default is created. + solver ({{ "randomized", "lanczos" }}): Solver to use. The default + ``"randomized"`` preserves the existing approximate randomized SVD + behavior. ``"lanczos"`` uses Lanczos bidiagonalization and is the + higher-accuracy, ARPACK-like sparse SVD path. + ncv (int or None): Number of Lanczos vectors. Used only when + ``solver="lanczos"``. If ``None``, a default is selected. Larger + values can improve convergence margin and orthogonality for + difficult spectra, but increase sparse matrix-vector work. + tol (float): Lanczos residual tolerance. Used only when + ``solver="lanczos"``. + maxiter (int or None): Maximum Lanczos restart iterations. Used only + when ``solver="lanczos"``. If ``None``, defaults to 100. The + Lanczos solver raises an error if all requested singular triplets + do not converge within this limit. + orthogonalization ({{ "cgs2", "mgs2" }}): Lanczos reorthogonalization + method. ``"cgs2"`` is the GPU-efficient default. ``"mgs2"`` is + more launch-heavy and intended as an alternate path for difficult + float32 spectra. Returns: tuple: @@ -116,9 +175,11 @@ def svds(A, k=6, n_oversamples=10, n_power_iters=2, :func:`scipy.sparse.linalg.svds` .. note:: - This function uses randomized SVD (Halko et al. 2009) with - CholeskyQR2 orthogonalization (Tomas et al. 2024) for efficient - GPU execution. + With ``solver="randomized"``, this function uses randomized SVD + (Halko et al. 2009) with CholeskyQR2 orthogonalization (Tomas et al. + 2024). With ``solver="lanczos"``, it uses implicitly restarted + Lanczos bidiagonalization with full reorthogonalization and final + ``A @ V`` post-refinement of the returned left singular vectors. """ @@ -136,6 +197,17 @@ def svds(A, k=6, n_oversamples=10, n_power_iters=2, "Use A.tocsr() to convert." % type(A).__name__ ) + if solver not in ("randomized", "lanczos"): + raise ValueError( + "solver must be 'randomized' or 'lanczos', got %r" % (solver,) + ) + + if orthogonalization not in ("cgs2", "mgs2"): + raise ValueError( + "orthogonalization must be 'cgs2' or 'mgs2', got %r" + % (orthogonalization,) + ) + if not ( return_singular_vectors is True or return_singular_vectors is False @@ -157,19 +229,17 @@ def svds(A, k=6, n_oversamples=10, n_power_iters=2, ) # Extract CSR arrays and ensure int32 indices. raft's runtime layer takes - # `int` (int32) for nnz / indptr / indices, so any overflow on conversion - # cannot be recovered downstream — error out before the cast. - # TODO: parameterize the runtime FUNC_DECL on IndexType (mirror lanczos's - # int / int64_t overloads) so int64 indptr/indices are accepted natively - # — including mixed int64-indptr / int32-indices for matrices where - # nnz > INT32_MAX but column count fits in int32. + # `int` (int32) for nnz / indptr / indices, so overflow on conversion + # cannot be recovered downstream; error out before the cast. + # The runtime API is currently int32-only. Native int64 support should + # mirror the sparse Lanczos eigensolver overloads when it is added. indptr = A.indptr indices = A.indices data = A.data INT32_MAX = (1 << 31) - 1 - # Validate indptr and indices dtypes independently — CuPy CSR matrices + # Validate indptr and indices dtypes independently. CuPy CSR matrices # normally have matching dtypes, but don't assume it. for name, arr in (("indptr", indptr), ("indices", indices)): if arr.dtype != np.int32 and arr.dtype != np.int64: @@ -237,55 +307,98 @@ def svds(A, k=6, n_oversamples=10, n_power_iters=2, cdef sparse_svd_config[float] cfg_float cdef sparse_svd_config[double] cfg_double + cdef sparse_lanczos_svd_config[float] lanczos_cfg_float + cdef sparse_lanczos_svd_config[double] lanczos_cfg_double cdef optional[device_matrix_view[float, uint32_t, col_major]] U_opt_f cdef optional[device_matrix_view[float, uint32_t, col_major]] Vt_opt_f cdef optional[device_matrix_view[double, uint32_t, col_major]] U_opt_d cdef optional[device_matrix_view[double, uint32_t, col_major]] Vt_opt_d + cdef int ncv_value = 0 if ncv is None else ncv + cdef int maxiter_value = 100 if maxiter is None else maxiter + cdef bool use_mgs2_value = orthogonalization == "mgs2" if ValueType == np.float32: - cfg_float.n_components = k - cfg_float.n_oversamples = n_oversamples - cfg_float.n_power_iters = n_power_iters - cfg_float.seed = seed_opt if want_U: U_opt_f = make_device_matrix_view[float, uint32_t, col_major]( U_ptr, m, k) if want_Vt: Vt_opt_f = make_device_matrix_view[float, uint32_t, col_major]( Vt_ptr, k, n) - sparse_randomized_svd_f( - deref(h), - cfg_float, - make_device_vector_view(indptr_ptr, (m + 1)), - make_device_vector_view(indices_ptr, nnz), - make_device_vector_view(data_ptr, nnz), - m, n, nnz, - make_device_vector_view(S_ptr, k), - U_opt_f, - Vt_opt_f, - ) + if solver == "randomized": + cfg_float.n_components = k + cfg_float.n_oversamples = n_oversamples + cfg_float.n_power_iters = n_power_iters + cfg_float.seed = seed_opt + sparse_randomized_svd_f( + deref(h), + cfg_float, + make_device_vector_view(indptr_ptr, (m + 1)), + make_device_vector_view(indices_ptr, nnz), + make_device_vector_view(data_ptr, nnz), + m, n, nnz, + make_device_vector_view(S_ptr, k), + U_opt_f, + Vt_opt_f, + ) + else: + lanczos_cfg_float.n_components = k + lanczos_cfg_float.ncv = ncv_value + lanczos_cfg_float.tolerance = tol + lanczos_cfg_float.max_iterations = maxiter_value + lanczos_cfg_float.seed = seed_opt + lanczos_cfg_float.use_mgs2_orthogonalization = use_mgs2_value + sparse_lanczos_svd_f( + deref(h), + lanczos_cfg_float, + make_device_vector_view(indptr_ptr, (m + 1)), + make_device_vector_view(indices_ptr, nnz), + make_device_vector_view(data_ptr, nnz), + m, n, nnz, + make_device_vector_view(S_ptr, k), + U_opt_f, + Vt_opt_f, + ) elif ValueType == np.float64: - cfg_double.n_components = k - cfg_double.n_oversamples = n_oversamples - cfg_double.n_power_iters = n_power_iters - cfg_double.seed = seed_opt if want_U: U_opt_d = make_device_matrix_view[double, uint32_t, col_major]( U_ptr, m, k) if want_Vt: Vt_opt_d = make_device_matrix_view[double, uint32_t, col_major]( Vt_ptr, k, n) - sparse_randomized_svd_d( - deref(h), - cfg_double, - make_device_vector_view(indptr_ptr, (m + 1)), - make_device_vector_view(indices_ptr, nnz), - make_device_vector_view(data_ptr, nnz), - m, n, nnz, - make_device_vector_view(S_ptr, k), - U_opt_d, - Vt_opt_d, - ) + if solver == "randomized": + cfg_double.n_components = k + cfg_double.n_oversamples = n_oversamples + cfg_double.n_power_iters = n_power_iters + cfg_double.seed = seed_opt + sparse_randomized_svd_d( + deref(h), + cfg_double, + make_device_vector_view(indptr_ptr, (m + 1)), + make_device_vector_view(indices_ptr, nnz), + make_device_vector_view(data_ptr, nnz), + m, n, nnz, + make_device_vector_view(S_ptr, k), + U_opt_d, + Vt_opt_d, + ) + else: + lanczos_cfg_double.n_components = k + lanczos_cfg_double.ncv = ncv_value + lanczos_cfg_double.tolerance = tol + lanczos_cfg_double.max_iterations = maxiter_value + lanczos_cfg_double.seed = seed_opt + lanczos_cfg_double.use_mgs2_orthogonalization = use_mgs2_value + sparse_lanczos_svd_d( + deref(h), + lanczos_cfg_double, + make_device_vector_view(indptr_ptr, (m + 1)), + make_device_vector_view(indices_ptr, nnz), + make_device_vector_view(data_ptr, nnz), + m, n, nnz, + make_device_vector_view(S_ptr, k), + U_opt_d, + Vt_opt_d, + ) else: raise TypeError("dtype ValueType=%s not supported, " "expected float32 or float64" % ValueType) diff --git a/python/pylibraft/pylibraft/tests/test_sparse.py b/python/pylibraft/pylibraft/tests/test_sparse.py index b6b66721a4..c460f0f92a 100644 --- a/python/pylibraft/pylibraft/tests/test_sparse.py +++ b/python/pylibraft/pylibraft/tests/test_sparse.py @@ -225,6 +225,184 @@ def test_shapes(self): assert S.shape == (k,) assert Vt.shape == (k, n) + @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64]) + @pytest.mark.parametrize("orthogonalization", ["cgs2", "mgs2"]) + def test_lanczos_diagonal_spectrum(self, dtype, orthogonalization): + """Lanczos SVD must recover the leading singular triplets.""" + diag = cupy.asarray([8, 7, 6, 5, 4, 3, 2, 1], dtype=dtype) + A = sparse.diags(diag, offsets=0, shape=(12, 8), format="csr") + U, S, Vt = svds( + A, + k=3, + solver="lanczos", + ncv=6, + tol=1e-6, + maxiter=20, + seed=42, + orthogonalization=orthogonalization, + ) + + tol = 2e-3 if dtype == numpy.float32 else 1e-8 + assert cupy.allclose(S, diag[:3], atol=tol) + assert cupy.allclose(U.T @ U, cupy.eye(3, dtype=dtype), atol=tol) + assert cupy.allclose(Vt @ Vt.T, cupy.eye(3, dtype=dtype), atol=tol) + + def test_lanczos_return_singular_vectors_modes(self): + """Lanczos supports scipy-style vector return modes.""" + diag = cupy.asarray([8, 7, 6, 5, 4, 3, 2, 1], dtype=numpy.float32) + A = sparse.diags(diag, offsets=0, shape=(12, 8), format="csr") + + U_ref, S_ref, Vt_ref = svds(A, k=3, solver="lanczos", ncv=6, seed=42) + + U_f, S_f, Vt_f = svds( + A, + k=3, + solver="lanczos", + ncv=6, + seed=42, + return_singular_vectors=False, + ) + assert U_f is None + assert Vt_f is None + assert cupy.allclose(S_f, S_ref) + + U_u, S_u, Vt_u = svds( + A, + k=3, + solver="lanczos", + ncv=6, + seed=42, + return_singular_vectors="u", + ) + assert U_u.shape == U_ref.shape + assert Vt_u is None + assert cupy.allclose(S_u, S_ref) + + U_v, S_v, Vt_v = svds( + A, + k=3, + solver="lanczos", + ncv=6, + seed=42, + return_singular_vectors="vh", + ) + assert U_v is None + assert Vt_v.shape == Vt_ref.shape + assert cupy.allclose(S_v, S_ref) + + def test_lanczos_rotated_clustered_spectrum(self): + """Lanczos recovers clustered singular values for a non-diagonal sparse matrix.""" + m, n, k = 64, 48, 8 + expected = numpy.asarray( + [10.0, 9.9995, 9.999, 9.5, 9.4995, 9.0, 8.5, 8.0], + dtype=numpy.float32, + ) + all_s = numpy.concatenate( + [expected, numpy.asarray([0.75, 0.25], dtype=numpy.float32)] + ) + indptr = [0] + indices = [] + values = [] + for block in range(len(all_s) // 2): + col0 = 2 * block + s0, s1 = all_s[2 * block], all_s[2 * block + 1] + theta = 0.17 + 0.11 * block + phi = 0.31 + 0.07 * block + cu, su = numpy.cos(theta), numpy.sin(theta) + cv, sv = numpy.cos(phi), numpy.sin(phi) + block_values = [ + cu * s0 * cv + su * s1 * sv, + cu * s0 * sv - su * s1 * cv, + su * s0 * cv - cu * s1 * sv, + su * s0 * sv + cu * s1 * cv, + ] + indices.extend([col0, col0 + 1]) + values.extend(block_values[:2]) + indptr.append(len(values)) + indices.extend([col0, col0 + 1]) + values.extend(block_values[2:]) + indptr.append(len(values)) + indptr.extend([len(values)] * (m + 1 - len(indptr))) + + A = sparse.csr_matrix( + ( + cupy.asarray(values, dtype=numpy.float32), + cupy.asarray(indices, dtype=numpy.int32), + cupy.asarray(indptr, dtype=numpy.int32), + ), + shape=(m, n), + ) + + U, S, Vt = svds( + A, + k=k, + solver="lanczos", + ncv=20, + tol=1e-5, + maxiter=120, + seed=2027, + ) + + assert cupy.allclose(S, cupy.asarray(expected), atol=2e-3) + eye = cupy.eye(k, dtype=numpy.float64) + U64 = U.astype(cupy.float64, copy=False) + Vt64 = Vt.astype(cupy.float64, copy=False) + assert cupy.linalg.norm(U64.T @ U64 - eye) < 2e-3 + assert cupy.linalg.norm(Vt64 @ Vt64.T - eye) < 2e-3 + residual = cupy.linalg.norm(A @ Vt.T - U * S[None, :]) / S[0] + assert residual < 1e-4 + + @pytest.mark.parametrize("shape", [(96, 28), (24, 96)]) + def test_lanczos_rank_deficient_and_edge_shapes(self, shape): + """Lanczos handles rank deficiency and tall/wide sparse inputs.""" + diag = cupy.zeros(min(shape), dtype=numpy.float32) + diag[:8] = cupy.asarray([12, 8, 6, 3, 1, 0, 0, 0], dtype=diag.dtype) + A = sparse.diags(diag, offsets=0, shape=shape, format="csr") + + U, S, Vt = svds( + A, + k=4, + solver="lanczos", + ncv=12, + tol=1e-5, + maxiter=80, + seed=2026, + ) + + assert cupy.allclose(S, diag[:4], atol=2e-3) + eye = cupy.eye(4, dtype=numpy.float64) + U64 = U.astype(cupy.float64, copy=False) + Vt64 = Vt.astype(cupy.float64, copy=False) + assert cupy.linalg.norm(U64.T @ U64 - eye) < 2e-3 + assert cupy.linalg.norm(Vt64 @ Vt64.T - eye) < 2e-3 + residual = cupy.linalg.norm(A @ Vt.T - U * S[None, :]) / S[0] + assert residual < 1e-4 + + def test_lanczos_non_convergence_raises(self): + """Lanczos reports non-convergence instead of returning forced Ritz vectors.""" + diag = cupy.zeros(40, dtype=numpy.float32) + diag[:6] = cupy.asarray([10, 9, 8, 7, 6, 5], dtype=diag.dtype) + A = sparse.diags(diag, offsets=0, shape=(64, 40), format="csr") + + with pytest.raises(RuntimeError, match="failed to converge"): + svds( + A, + k=4, + solver="lanczos", + ncv=6, + tol=0, + maxiter=1, + seed=2026, + ) + + def test_invalid_solver(self): + """Unknown sparse SVD solvers must be rejected.""" + A = sparse.random(20, 15, density=0.3, format="csr") + with pytest.raises(ValueError, match="solver"): + svds(A, k=3, solver="bogus") + with pytest.raises(ValueError, match="orthogonalization"): + svds(A, k=3, solver="lanczos", orthogonalization="bogus") + def test_int64_indices(self): """int64 indices should be auto-converted and work, with a warning.""" m, n, k = 50, 30, 3