-
Notifications
You must be signed in to change notification settings - Fork 196
Add missing memory estimates #2255
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
eada0c5
e914251
45ce85e
677f67e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -84,14 +84,62 @@ std::tuple<size_t, size_t, size_t, size_t> optimize_workspace_size(size_t n_rows | |
| } | ||
| size_t combine_dev = combine_dev_fixed; | ||
|
|
||
| size_t total_host = mst_host + combine_host; | ||
| size_t debug_host_size = 0; | ||
| if (raft::default_logger().should_log(rapids_logger::level_enum::debug)) { | ||
| // cagra::detail::graph::optimize() allocates extra memory to calculate | ||
| // graph metrics when debug logging is enabled | ||
| debug_host_size = n_rows * graph_degree * sizeof(uint32_t) // host_copy_output_graph | ||
| + n_rows * sizeof(uint32_t) // in_edge_count | ||
| + graph_degree * sizeof(uint32_t); // hist | ||
| } | ||
|
|
||
| size_t total_host = mst_host + combine_host + debug_host_size; | ||
| size_t total_host_fixed = mst_host_fixed + combine_host_fixed; | ||
| size_t total_dev = std::max(prune_dev, rev_dev + combine_dev); | ||
| size_t total_dev_fixed = std::max(prune_dev_fixed, combine_dev_fixed); | ||
|
|
||
| return std::make_tuple(total_host, total_dev, total_host_fixed, total_dev_fixed); | ||
| } | ||
|
huuanhhuyn marked this conversation as resolved.
|
||
|
|
||
| inline size_t ivf_pq_extend_mem_usage(raft::matrix_extent<int64_t> dataset, | ||
| cuvs::neighbors::graph_build_params::ivf_pq_params params, | ||
| size_t dtype_size) | ||
| { | ||
| constexpr size_t kReasonableMaxBatchSize = 65536; | ||
| constexpr size_t kSpecAlignMax = 1024; | ||
|
|
||
| size_t n_rows = dataset.extent(0); | ||
| size_t dim = dataset.extent(1); | ||
| size_t pq_dim = params.build_params.pq_dim; | ||
| size_t pq_bits = params.build_params.pq_bits; | ||
| size_t rot_dim = raft::round_up_safe<size_t>(dim, pq_dim); | ||
| size_t n_clusters = params.build_params.n_lists; | ||
|
|
||
| RAFT_EXPECTS(pq_dim > 0, "pq_dim should not be 0"); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nitpick: I don't think this check is necessary - it doesn't guard against anything dangerous below (no divisions) and its valid values are checked in IVF-PQ during build anyway.
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The AI review suggested adding this check. |
||
|
|
||
| size_t max_batch_size = std::min<size_t>(n_rows, kReasonableMaxBatchSize); | ||
| size_t workspace_size = max_batch_size * dim * dtype_size // vec_batches | ||
| + max_batch_size * rot_dim * sizeof(float) // new_vectors_residual | ||
| + max_batch_size * dim * sizeof(float); // flat_compute_residuals_tmp | ||
|
|
||
| // each row contains pq codes and index | ||
| size_t code_bytes_per_vec = pq_dim * pq_bits / 8; | ||
| size_t bytes_per_row = code_bytes_per_vec + sizeof(uint32_t); | ||
|
|
||
| // estimate the "worst-case" for the number of placeholder rows and resize rows | ||
| // The worst-case (i.e. max) happens for INTERLEAVED (as opposed to FLAT) and when every | ||
| // cluster wastes a full alignment group of rows (n_clusters * alignment_size rows in total) | ||
| size_t n_rows_placeholder = n_rows + ivf_pq::kIndexGroupSize * n_clusters; | ||
| size_t placeholder_dev = n_rows_placeholder * bytes_per_row; | ||
| size_t n_rows_resize_lists = n_rows + kSpecAlignMax * n_clusters; | ||
| size_t resize_lists_dev = n_rows_resize_lists * bytes_per_row; | ||
|
|
||
| // Placeholder freed before resize_list | ||
| size_t device_size = std::max(placeholder_dev, resize_lists_dev); | ||
|
|
||
| return device_size + workspace_size; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think we need to be careful to not account for the same memory multiple times. As far as I remember, we add the whole workspace size as used somewhere above in the call chain. The intuition behind that is that the algorithms are always free to use all of the workspace size (e.g. by changing the batch size). Please make sure your changes are consistent with the rest of the logic in that regard.
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think we have any duplicated estimate for the placeholder and resize lists within our The estimate would be the upper-bound for the real allocation because the
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do you think I should add the shrinking logic based on the total free mem? This would reduce the over-estimation and it would still be an upper bound afterwards. With the current version, if the smallest free mem is 1 GiB and the largest dim is 4000 then the max_batch_size could be over-estimated by approx. 4x (the real batch size is shrunk to 16k). |
||
| } | ||
|
huuanhhuyn marked this conversation as resolved.
|
||
|
|
||
| // All sizes are in bytes | ||
| inline std::pair<size_t, size_t> ivf_pq_build_mem_usage( | ||
| raft::resources const& res, | ||
|
|
@@ -100,7 +148,8 @@ inline std::pair<size_t, size_t> ivf_pq_build_mem_usage( | |
| cuvs::neighbors::graph_build_params::ivf_pq_params params, | ||
| size_t graph_degree, | ||
| size_t intermediate_graph_degree, | ||
| bool guarantee_connectivity) | ||
| bool guarantee_connectivity, | ||
| bool attach_dataset_on_build) | ||
| { | ||
| size_t dtype_size = cuda_data_type_size(dtype); | ||
| bool input_is_float = (dtype == CUDA_R_32F); | ||
|
|
@@ -125,6 +174,10 @@ inline std::pair<size_t, size_t> ivf_pq_build_mem_usage( | |
| params.build_params.n_lists)); | ||
| size_t kmeans_n_rows = n_rows / kmeans_trainset_ratio; | ||
| size_t kmeans_gpu_mem = kmeans_n_rows * dim * sizeof(float); | ||
| if (dtype != CUDA_R_32F) { | ||
| // kmeans trainset tmp allocation | ||
| kmeans_gpu_mem += kmeans_n_rows * dim * dtype_size; | ||
| } | ||
|
|
||
| // For non-float input, ivf_pq::build first samples into a temporary trainset of type T | ||
| if (!input_is_float) { kmeans_gpu_mem += kmeans_n_rows * dim * dtype_size; } | ||
|
|
@@ -137,6 +190,12 @@ inline std::pair<size_t, size_t> ivf_pq_build_mem_usage( | |
| size_t kmeans_pinned_host = 2 * pinned_rows * dim * dtype_size; // two staging double-buffers | ||
| size_t kmeans_host_mem = kmeans_indices_host + kmeans_pinned_host; | ||
|
|
||
| // Extend phase | ||
| size_t extend_gpu_mem = params.build_params.add_data_on_build ? ivf_pq_extend_mem_usage(dataset, params, dtype_size) : 0; | ||
|
|
||
| // Add graph to index on GPU | ||
| size_t attach_graph_gpu_mem = attach_dataset_on_build ? n_rows * graph_degree * sizeof(uint32_t) : 0; | ||
|
|
||
| // Search phase (build_knn_graph): | ||
| constexpr size_t kWorkspaceRatio = 5; | ||
| size_t top_k = intermediate_graph_degree + 1; | ||
|
|
@@ -155,7 +214,7 @@ inline std::pair<size_t, size_t> ivf_pq_build_mem_usage( | |
| + (sizeof(float) + sizeof(int64_t)) * top_k); // refined_* | ||
|
|
||
| // Phases run sequentially (train/extend -> search -> optimize) | ||
| size_t total_dev = std::max({kmeans_gpu_mem, search_phase_dev, gpu_workspace_size}); | ||
| size_t total_dev = std::max({kmeans_gpu_mem, extend_gpu_mem, attach_graph_gpu_mem, search_phase_dev, gpu_workspace_size}); | ||
|
|
||
| // The graph (and its optimize workspace) stays resident across phases | ||
| size_t total_host = | ||
|
|
@@ -209,7 +268,8 @@ std::pair<size_t, size_t> cagra_build_mem_usage(raft::resources const& res, | |
| pq_params, | ||
| cparams.graph_degree, | ||
| cparams.intermediate_graph_degree, | ||
| cparams.guarantee_connectivity); | ||
| cparams.guarantee_connectivity, | ||
| cparams.attach_dataset_on_build); | ||
| } else if (std::holds_alternative<graph_build_params::nn_descent_params>( | ||
| cparams.graph_build_params)) { | ||
| RAFT_LOG_INFO("Considering CAGRA in memory build with NN-descent"); | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.