From eada0c5e48e4ee59fd6aed3ccb06e584c44100e8 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Tue, 23 Jun 2026 11:17:07 +0000 Subject: [PATCH 1/5] Add ivf_pq extend and debug sizes --- .../neighbors/detail/cagra/cagra_helpers.cpp | 53 ++++++++++++++++++- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_helpers.cpp b/cpp/src/neighbors/detail/cagra/cagra_helpers.cpp index 3f79df47dd..a517164722 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_helpers.cpp +++ b/cpp/src/neighbors/detail/cagra/cagra_helpers.cpp @@ -84,7 +84,13 @@ std::tuple optimize_workspace_size(size_t n_rows } size_t combine_dev = combine_dev_fixed; - size_t total_host = mst_host + combine_host; + size_t debug_host_size = 0; + if (raft::default_logger().should_log(rapids_logger::level_enum::debug)) { + debug_host_size = n_rows * graph_degree * sizeof(uint32_t) // host_copy_output_graph + + n_rows * sizeof(uint32_t) // in_edge_count + + graph_degree * sizeof(uint32_t); // hist + + size_t total_host = mst_host + combine_host + debug_host_size; size_t total_host_fixed = mst_host_fixed + combine_host_fixed; size_t total_dev = std::max(prune_dev, rev_dev + combine_dev); size_t total_dev_fixed = std::max(prune_dev_fixed, combine_dev_fixed); @@ -92,6 +98,43 @@ std::tuple optimize_workspace_size(size_t n_rows return std::make_tuple(total_host, total_dev, total_host_fixed, total_dev_fixed); } +inline size_t ivf_pq_extend_mem_usage(raft::matrix_extent dataset, + cuvs::neighbors::graph_build_params::ivf_pq_params params, + size_t dtype_size) +{ + constexpr size_t kReasonableMaxBatchSize = 65536; + constexpr size_t kSpecAlignMax = 1024; + + size_t n_rows = dataset.extent(0); + size_t dim = dataset.extent(1); + size_t pq_dim = params.build_params.pq_dim; + size_t pq_bits = params.build_params.pq_bits; + size_t rot_dim = raft::round_up_safe(dim, pq_dim); + size_t n_clusters = params.build_params.n_lists; + + size_t max_batch_size = std::min(n_rows, 
kReasonableMaxBatchSize); + size_t workspace_size = max_batch_size * dim * dtype_size // vec_batches + + max_batch_size * rot_dim * sizeof(float) // new_vectors_residual + + max_batch_size * dim * sizeof(float); // flat_compute_residuals_tmp + + // each row contains pq codes and index + size_t code_bytes_per_vec = pq_dim * pq_bits / 8; + size_t bytes_per_row = code_bytes_per_vec + sizeof(uint32_t); + + // estimate the "worst-case" for the number of placeholder rows and resize rows + // The worst-case (i.e. max) happens for INTERLEAVED (as opposed to FLAT) and when each row + // wastes n_cluster * alignment_size + size_t n_rows_placeholder = n_rows + ivf_pq::kIndexGroupSize * n_clusters; + size_t placeholder_dev = n_rows_placeholder * bytes_per_row; + size_t n_rows_resize_lists = n_rows + kSpecAlignMax * n_clusters; + size_t resize_lists_dev = n_rows_resize_lists * bytes_per_row; + + // Placeholder freed before resize_list + size_t device_size = std::max(placeholder_dev, resize_lists_dev); + + return device_size + workspace_size; +} + // All sizes are in bytes inline std::pair ivf_pq_build_mem_usage( raft::resources const& res, @@ -137,6 +180,12 @@ inline std::pair ivf_pq_build_mem_usage( size_t kmeans_pinned_host = 2 * pinned_rows * dim * dtype_size; // two staging double-buffers size_t kmeans_host_mem = kmeans_indices_host + kmeans_pinned_host; + // Extend phase + size_t extend_gpu_mem = 0; + if (pq_params.build_params.add_data_on_build) { + extend_gpu_mem = ivf_pq_extend_mem_usage(dataset, params, dtype_size); + } + // Search phase (build_knn_graph): constexpr size_t kWorkspaceRatio = 5; size_t top_k = intermediate_graph_degree + 1; @@ -155,7 +204,7 @@ inline std::pair ivf_pq_build_mem_usage( + (sizeof(float) + sizeof(int64_t)) * top_k); // refined_* // Phases run sequentially (train/extend -> search -> optimize) - size_t total_dev = std::max({kmeans_gpu_mem, extend_gpu_mem, search_phase_dev, gpu_workspace_size}); + size_t total_dev = std::max({kmeans_gpu_mem, extend_gpu_mem, 
search_phase_dev, gpu_workspace_size}); // The graph (and its optimize workspace) stays resident across phases size_t total_host = From e914251f5905978fa5c76f783486d18a6f03cf59 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Wed, 24 Jun 2026 07:34:26 -0700 Subject: [PATCH 2/5] Add device memory estimate when attaching graph to index on GPU --- .../neighbors/detail/cagra/cagra_helpers.cpp | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_helpers.cpp b/cpp/src/neighbors/detail/cagra/cagra_helpers.cpp index a517164722..edc515da0b 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_helpers.cpp +++ b/cpp/src/neighbors/detail/cagra/cagra_helpers.cpp @@ -89,6 +89,7 @@ std::tuple optimize_workspace_size(size_t n_rows debug_host_size = n_rows * graph_degree * sizeof(uint32_t) // host_copy_output_graph + n_rows * sizeof(uint32_t) // in_edge_count + graph_degree * sizeof(uint32_t); // hist + } size_t total_host = mst_host + combine_host + debug_host_size; size_t total_host_fixed = mst_host_fixed + combine_host_fixed; @@ -112,6 +113,8 @@ inline size_t ivf_pq_extend_mem_usage(raft::matrix_extent dataset, size_t rot_dim = raft::round_up_safe(dim, pq_dim); size_t n_clusters = params.build_params.n_lists; + RAFT_EXPECTS(pq_dim > 0, "pq_dim should not be 0"); + size_t max_batch_size = std::min(n_rows, kReasonableMaxBatchSize); size_t workspace_size = max_batch_size * dim * dtype_size // vec_batches + max_batch_size * rot_dim * sizeof(float) // new_vectors_residual @@ -143,7 +146,8 @@ inline std::pair ivf_pq_build_mem_usage( cuvs::neighbors::graph_build_params::ivf_pq_params params, size_t graph_degree, size_t intermediate_graph_degree, - bool guarantee_connectivity) + bool guarantee_connectivity, + bool attach_dataset_on_build) { size_t dtype_size = cuda_data_type_size(dtype); bool input_is_float = (dtype == CUDA_R_32F); @@ -181,10 +185,10 @@ inline std::pair ivf_pq_build_mem_usage( size_t kmeans_host_mem = 
kmeans_indices_host + kmeans_pinned_host; // Extend phase - size_t extend_gpu_mem = 0; - if (pq_params.build_params.add_data_on_build) { - extend_gpu_mem = ivf_pq_extend_mem_usage(dataset, params, dtype_size); - } + size_t extend_gpu_mem = params.build_params.add_data_on_build ? ivf_pq_extend_mem_usage(dataset, params, dtype_size) : 0; + + // Add graph to index on GPU + size_t attach_graph_gpu_mem = attach_dataset_on_build ? n_rows * graph_degree * sizeof(uint32_t) : 0; // Search phase (build_knn_graph): constexpr size_t kWorkspaceRatio = 5; @@ -204,7 +208,7 @@ inline std::pair ivf_pq_build_mem_usage( + (sizeof(float) + sizeof(int64_t)) * top_k); // refined_* // Phases run sequentially (train/extend -> search -> optimize) - size_t total_dev = std::max({kmeans_gpu_mem, extend_gpu_mem, search_phase_dev, gpu_workspace_size}); + size_t total_dev = std::max({kmeans_gpu_mem, extend_gpu_mem, attach_graph_gpu_mem, search_phase_dev, gpu_workspace_size}); // The graph (and its optimize workspace) stays resident across phases size_t total_host = @@ -258,7 +262,8 @@ std::pair cagra_build_mem_usage(raft::resources const& res, pq_params, cparams.graph_degree, cparams.intermediate_graph_degree, - cparams.guarantee_connectivity); + cparams.guarantee_connectivity, + cparams.attach_dataset_on_build); } else if (std::holds_alternative( cparams.graph_build_params)) { RAFT_LOG_INFO("Considering CAGRA in memory build with NN-descent"); From 45ce85e8f036febf63cf654dbc6964e800d20d00 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Thu, 25 Jun 2026 00:52:12 -0700 Subject: [PATCH 3/5] Add missing trainset tmp large workspace allocation when dtype != float --- cpp/src/neighbors/detail/cagra/cagra_helpers.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/neighbors/detail/cagra/cagra_helpers.cpp b/cpp/src/neighbors/detail/cagra/cagra_helpers.cpp index edc515da0b..548ad8072b 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_helpers.cpp +++ 
b/cpp/src/neighbors/detail/cagra/cagra_helpers.cpp @@ -172,6 +172,10 @@ inline std::pair ivf_pq_build_mem_usage( params.build_params.n_lists)); size_t kmeans_n_rows = n_rows / kmeans_trainset_ratio; size_t kmeans_gpu_mem = kmeans_n_rows * dim * sizeof(float); + if (dtype != CUDA_R_32F) { + // kmeans trainset tmp allocation + kmeans_gpu_mem += kmeans_n_rows * dim * dtype_size; + } // For non-float input, ivf_pq::build first samples into a temporary trainset of type T if (!input_is_float) { kmeans_gpu_mem += kmeans_n_rows * dim * dtype_size; } From 677f67ef80d6ac293a12ddafc8a7d085b38add60 Mon Sep 17 00:00:00 2001 From: huuanhhuyn Date: Mon, 29 Jun 2026 11:17:05 +0200 Subject: [PATCH 4/5] Apply suggestion from @achirkin Add comment to explain extra debug allocation Co-authored-by: Artem M. Chirkin <9253178+achirkin@users.noreply.github.com> --- cpp/src/neighbors/detail/cagra/cagra_helpers.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/neighbors/detail/cagra/cagra_helpers.cpp b/cpp/src/neighbors/detail/cagra/cagra_helpers.cpp index 548ad8072b..d2f2ec7b39 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_helpers.cpp +++ b/cpp/src/neighbors/detail/cagra/cagra_helpers.cpp @@ -86,6 +86,8 @@ std::tuple optimize_workspace_size(size_t n_rows size_t debug_host_size = 0; if (raft::default_logger().should_log(rapids_logger::level_enum::debug)) { + // cagra::detail::graph::optimize() allocates extra memory to calculate + // graph metrics when debug logging is enabled debug_host_size = n_rows * graph_degree * sizeof(uint32_t) // host_copy_output_graph + n_rows * sizeof(uint32_t) // in_edge_count + graph_degree * sizeof(uint32_t); // hist From 7440e5bddec1583245aa14877ca46980e4c301ad Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Tue, 30 Jun 2026 07:16:02 +0000 Subject: [PATCH 5/5] Run formatting --- .../neighbors/detail/cagra/cagra_helpers.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git 
a/cpp/src/neighbors/detail/cagra/cagra_helpers.cpp b/cpp/src/neighbors/detail/cagra/cagra_helpers.cpp index d2f2ec7b39..2255f91d92 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_helpers.cpp +++ b/cpp/src/neighbors/detail/cagra/cagra_helpers.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ @@ -86,8 +86,8 @@ std::tuple optimize_workspace_size(size_t n_rows size_t debug_host_size = 0; if (raft::default_logger().should_log(rapids_logger::level_enum::debug)) { - // cagra::detail::graph::optimize() allocates extra memory to calculate - // graph metrics when debug logging is enabled + // cagra::detail::graph::optimize() allocates extra memory to calculate + // graph metrics when debug logging is enabled debug_host_size = n_rows * graph_degree * sizeof(uint32_t) // host_copy_output_graph + n_rows * sizeof(uint32_t) // in_edge_count + graph_degree * sizeof(uint32_t); // hist @@ -191,10 +191,13 @@ inline std::pair ivf_pq_build_mem_usage( size_t kmeans_host_mem = kmeans_indices_host + kmeans_pinned_host; // Extend phase - size_t extend_gpu_mem = params.build_params.add_data_on_build ? ivf_pq_extend_mem_usage(dataset, params, dtype_size) : 0; - + size_t extend_gpu_mem = params.build_params.add_data_on_build + ? ivf_pq_extend_mem_usage(dataset, params, dtype_size) + : 0; + // Add graph to index on GPU - size_t attach_graph_gpu_mem = attach_dataset_on_build ? n_rows * graph_degree * sizeof(uint32_t) : 0; + size_t attach_graph_gpu_mem = + attach_dataset_on_build ? 
n_rows * graph_degree * sizeof(uint32_t) : 0; // Search phase (build_knn_graph): constexpr size_t kWorkspaceRatio = 5; @@ -214,7 +217,8 @@ inline std::pair ivf_pq_build_mem_usage( + (sizeof(float) + sizeof(int64_t)) * top_k); // refined_* // Phases run sequentially (train/extend -> search -> optimize) - size_t total_dev = std::max({kmeans_gpu_mem, extend_gpu_mem, attach_graph_gpu_mem, search_phase_dev, gpu_workspace_size}); + size_t total_dev = std::max( + {kmeans_gpu_mem, extend_gpu_mem, attach_graph_gpu_mem, search_phase_dev, gpu_workspace_size}); // The graph (and its optimize workspace) stays resident across phases size_t total_host =