Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 64 additions & 4 deletions cpp/src/neighbors/detail/cagra/cagra_helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,14 +84,62 @@ std::tuple<size_t, size_t, size_t, size_t> optimize_workspace_size(size_t n_rows
}
size_t combine_dev = combine_dev_fixed;

size_t total_host = mst_host + combine_host;
size_t debug_host_size = 0;
if (raft::default_logger().should_log(rapids_logger::level_enum::debug)) {
Comment thread
huuanhhuyn marked this conversation as resolved.
// cagra::detail::graph::optimize() allocates extra memory to calculate
// graph metrics when debug logging is enabled
debug_host_size = n_rows * graph_degree * sizeof(uint32_t) // host_copy_output_graph
+ n_rows * sizeof(uint32_t) // in_edge_count
+ graph_degree * sizeof(uint32_t); // hist
}

size_t total_host = mst_host + combine_host + debug_host_size;
size_t total_host_fixed = mst_host_fixed + combine_host_fixed;
size_t total_dev = std::max(prune_dev, rev_dev + combine_dev);
size_t total_dev_fixed = std::max(prune_dev_fixed, combine_dev_fixed);

return std::make_tuple(total_host, total_dev, total_host_fixed, total_dev_fixed);
}
Comment thread
huuanhhuyn marked this conversation as resolved.

/**
 * @brief Estimate the device memory (in bytes) used by the IVF-PQ extend phase.
 *
 * The estimate is the sum of:
 *  - the per-batch workspace (input batch, rotated residuals, temporary residuals), and
 *  - the larger of the two list-storage footprints (placeholder lists vs. resized lists);
 *    the placeholder is freed before the lists are resized, so only the max is counted.
 *
 * The result is intended as an upper bound rather than an exact figure: the batch size is
 * only capped by kReasonableMaxBatchSize here, and the list padding assumes the worst-case
 * alignment waste for every cluster.
 * NOTE(review): per the PR discussion the real code path may shrink the batch size further
 * based on free memory, which would make this an over-estimate -- confirm against the
 * extend implementation if a tighter bound is needed.
 *
 * @param dataset     extents of the dataset; extent(0) is the row count, extent(1) the dim
 * @param params      IVF-PQ parameters; build_params.pq_dim, pq_bits and n_lists are read
 * @param dtype_size  size in bytes of one dataset element
 * @return estimated device memory in bytes
 *
 * RAFT_EXPECTS fails if build_params.pq_dim is 0.
 */
inline size_t ivf_pq_extend_mem_usage(raft::matrix_extent<int64_t> dataset,
                                      cuvs::neighbors::graph_build_params::ivf_pq_params params,
                                      size_t dtype_size)
{
  constexpr size_t kReasonableMaxBatchSize = 65536;
  constexpr size_t kSpecAlignMax           = 1024;

  const size_t n_rows     = dataset.extent(0);
  const size_t dim        = dataset.extent(1);
  const size_t pq_dim     = params.build_params.pq_dim;
  const size_t pq_bits    = params.build_params.pq_bits;
  const size_t n_clusters = params.build_params.n_lists;

  // pq_dim is used as the rounding modulus just below, so it has to be validated
  // *before* the call to round_up_safe (a zero modulus must never reach it).
  RAFT_EXPECTS(pq_dim > 0, "pq_dim should not be 0");
  const size_t rot_dim = raft::round_up_safe<size_t>(dim, pq_dim);

  // Per-batch temporary buffers
  const size_t max_batch_size = std::min<size_t>(n_rows, kReasonableMaxBatchSize);
  const size_t workspace_size = max_batch_size * dim * dtype_size          // vec_batches
                                + max_batch_size * rot_dim * sizeof(float)  // new_vectors_residual
                                + max_batch_size * dim * sizeof(float);  // flat_compute_residuals_tmp

  // Each stored row contains the PQ codes and the source index
  const size_t code_bytes_per_vec = pq_dim * pq_bits / 8;
  const size_t bytes_per_row      = code_bytes_per_vec + sizeof(uint32_t);

  // Estimate the "worst-case" for the number of placeholder rows and resize rows.
  // The worst-case (i.e. max) happens for INTERLEAVED (as opposed to FLAT) and when each
  // cluster wastes a full alignment unit.
  const size_t n_rows_placeholder  = n_rows + ivf_pq::kIndexGroupSize * n_clusters;
  const size_t placeholder_dev     = n_rows_placeholder * bytes_per_row;
  const size_t n_rows_resize_lists = n_rows + kSpecAlignMax * n_clusters;
  const size_t resize_lists_dev    = n_rows_resize_lists * bytes_per_row;

  // Placeholder is freed before resize_list, so the two never coexist
  const size_t device_size = std::max(placeholder_dev, resize_lists_dev);

  return device_size + workspace_size;
}
Comment thread
huuanhhuyn marked this conversation as resolved.

// All sizes are in bytes
inline std::pair<size_t, size_t> ivf_pq_build_mem_usage(
raft::resources const& res,
Expand All @@ -100,7 +148,8 @@ inline std::pair<size_t, size_t> ivf_pq_build_mem_usage(
cuvs::neighbors::graph_build_params::ivf_pq_params params,
size_t graph_degree,
size_t intermediate_graph_degree,
bool guarantee_connectivity)
bool guarantee_connectivity,
bool attach_dataset_on_build)
{
size_t dtype_size = cuda_data_type_size(dtype);
bool input_is_float = (dtype == CUDA_R_32F);
Expand All @@ -125,6 +174,10 @@ inline std::pair<size_t, size_t> ivf_pq_build_mem_usage(
params.build_params.n_lists));
size_t kmeans_n_rows = n_rows / kmeans_trainset_ratio;
size_t kmeans_gpu_mem = kmeans_n_rows * dim * sizeof(float);
if (dtype != CUDA_R_32F) {
// kmeans trainset tmp allocation
kmeans_gpu_mem += kmeans_n_rows * dim * dtype_size;
}

// For non-float input, ivf_pq::build first samples into a temporary trainset of type T
if (!input_is_float) { kmeans_gpu_mem += kmeans_n_rows * dim * dtype_size; }
Expand All @@ -137,6 +190,12 @@ inline std::pair<size_t, size_t> ivf_pq_build_mem_usage(
size_t kmeans_pinned_host = 2 * pinned_rows * dim * dtype_size; // two staging double-buffers
size_t kmeans_host_mem = kmeans_indices_host + kmeans_pinned_host;

// Extend phase
size_t extend_gpu_mem = params.build_params.add_data_on_build ? ivf_pq_extend_mem_usage(dataset, params, dtype_size) : 0;

// Add graph to index on GPU
size_t attach_graph_gpu_mem = attach_dataset_on_build ? n_rows * graph_degree * sizeof(uint32_t) : 0;

// Search phase (build_knn_graph):
constexpr size_t kWorkspaceRatio = 5;
size_t top_k = intermediate_graph_degree + 1;
Expand All @@ -155,7 +214,7 @@ inline std::pair<size_t, size_t> ivf_pq_build_mem_usage(
+ (sizeof(float) + sizeof(int64_t)) * top_k); // refined_*

// Phases run sequentially (train/extend -> search -> optimize)
size_t total_dev = std::max({kmeans_gpu_mem, search_phase_dev, gpu_workspace_size});
size_t total_dev = std::max({kmeans_gpu_mem, extend_gpu_mem, attach_graph_gpu_mem, search_phase_dev, gpu_workspace_size});

// The graph (and its optimize workspace) stays resident across phases
size_t total_host =
Expand Down Expand Up @@ -209,7 +268,8 @@ std::pair<size_t, size_t> cagra_build_mem_usage(raft::resources const& res,
pq_params,
cparams.graph_degree,
cparams.intermediate_graph_degree,
cparams.guarantee_connectivity);
cparams.guarantee_connectivity,
cparams.attach_dataset_on_build);
} else if (std::holds_alternative<graph_build_params::nn_descent_params>(
cparams.graph_build_params)) {
RAFT_LOG_INFO("Considering CAGRA in memory build with NN-descent");
Expand Down
Loading