diff --git a/benchmarks/cagra_recall_throughput_bench.py b/benchmarks/cagra_recall_throughput_bench.py new file mode 100644 index 0000000000..2339008767 --- /dev/null +++ b/benchmarks/cagra_recall_throughput_bench.py @@ -0,0 +1,282 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +CAGRA Recall/Throughput Benchmark: SINGLE_CTA vs MULTI_CTA with search_width > 1 + +Measures recall@10 and throughput (QPS) across configurations to evaluate +the impact of the AUTO algorithm selector when search_width > 1. + +Configurations tested: + 1. MULTI_CTA (default max_iterations) — reference baseline + 2. SINGLE_CTA (default max_iterations) — shows the recall problem + 3. SINGLE_CTA (floor at 32 base iterations) — alternative fix + 4. AUTO (current code) — shows when the switch happens + +For each: sweep batch_size=[1, 64, 256, 512, 1024] x search_width=[1, 4, 8] +""" + +import time + +import numpy as np +from cuvs.neighbors import brute_force, cagra +from pylibraft.common import device_ndarray + + +# --- Dataset parameters --- +N_SAMPLES = 100_000 +N_QUERIES = 2048 +DIM = 128 +K = 10 +N_WARMUP = 3 +N_TIMED = 10 + +# --- Search configurations --- +BATCH_SIZES = [1, 64, 256, 512, 1024] +SEARCH_WIDTHS = [1, 4, 8] +ITOPK_SIZE = 64 +GRAPH_DEGREE = 64 + + +def compute_reachability_iters(dataset_size, graph_degree): + """Replicate the C++ reachability iteration calculation.""" + iters = 0 + reachable = 1 + while reachable < dataset_size: + reachable *= max(2, graph_degree // 2) + iters += 1 + return iters + + +def compute_max_iterations( + algo, itopk_size, search_width, dataset_size, graph_degree +): + """Replicate the C++ max_iterations calculation for each algorithm.""" + reach = compute_reachability_iters(dataset_size, graph_degree) + if algo == "multi_cta": + base = 32 # mc_itopk_size / mc_search_width = 32/1 + elif algo == "single_cta_default": + base = itopk_size // search_width + elif algo == "single_cta_floor32": + # Alternative fix: floor base iterations at 32 (match MULTI_CTA) + base = max(itopk_size // search_width, 32) + else: + return 0 # Let cuVS calculate (AUTO mode) + return base + reach + + +def compute_recall(results, ground_truth, k): + """Compute recall@k: fraction of true k-NN found in results.""" + n = results.shape[0] + recall = 0.0 + for i in range(n): + gt_set = set(ground_truth[i, :k].tolist()) + res_set = set(results[i, :k].tolist()) + recall += len(gt_set & res_set) / k + return recall / n + + +def run_benchmark(): + np.random.seed(42) + + print("Generating dataset...") + dataset = np.random.random((N_SAMPLES, DIM)).astype(np.float32) + queries = dataset[:N_QUERIES].copy() + + # Move dataset to device + dataset_device = device_ndarray(dataset) + queries_device = device_ndarray(queries) + + # Build brute-force ground truth + print("Computing brute-force ground truth...") + bf_index = brute_force.build(dataset_device) + gt_distances, gt_neighbors = brute_force.search( + bf_index, queries_device, k=K + ) + gt_neighbors_host = gt_neighbors.copy_to_host() + + # Build CAGRA index + print("Building CAGRA index...") + index_params = cagra.IndexParams( + intermediate_graph_degree=128, + graph_degree=GRAPH_DEGREE, + ) + cagra_index = cagra.build(index_params, dataset_device) + print( + f" Index built: {N_SAMPLES} vectors, dim={DIM}, graph_degree={GRAPH_DEGREE}" + ) + + reach_iters = compute_reachability_iters(N_SAMPLES, GRAPH_DEGREE) + print(f" Reachability iterations: {reach_iters}") + print() + + # Define test configurations + configs = [ + ("MULTI_CTA", "multi_cta"), + ("SINGLE_CTA (default)", "single_cta_default"), + ("SINGLE_CTA (floor@32)", "single_cta_floor32"), + ("AUTO", "auto"), + ] + + # Header + print( + f"{'Config':<25} {'sw':>3} {'batch':>6} {'max_iter':>9} " + f"{'recall@10':>10} {'QPS':>10}" + ) + print("-" * 75) + + results_table = [] + + for search_width in SEARCH_WIDTHS: + for batch_size in BATCH_SIZES: + batch_queries = queries[:batch_size] + batch_gt = gt_neighbors_host[:batch_size] + + for config_name, config_key in configs: + max_iter = compute_max_iterations( + config_key, + ITOPK_SIZE, + search_width, + N_SAMPLES, + GRAPH_DEGREE, + ) + + # Build SearchParams — algo must be set in constructor + if config_key == "multi_cta": + search_params = cagra.SearchParams( + algo="multi_cta", + itopk_size=ITOPK_SIZE, + search_width=search_width, + max_iterations=max_iter, + ) + elif config_key.startswith("single_cta"): + search_params = cagra.SearchParams( + algo="single_cta", + itopk_size=ITOPK_SIZE, + search_width=search_width, + max_iterations=max_iter, + ) + else: + # AUTO: let cuVS decide algorithm and max_iterations + search_params = cagra.SearchParams( + algo="auto", + itopk_size=ITOPK_SIZE, + search_width=search_width, + ) + + batch_device = device_ndarray(batch_queries) + + # Warmup + for _ in range(N_WARMUP): + _, neighbors = cagra.search( + search_params, cagra_index, batch_device, k=K + ) + + # Timed runs + elapsed = 0.0 + for _ in range(N_TIMED): + start = time.perf_counter() + _, neighbors = cagra.search( + search_params, cagra_index, batch_device, k=K + ) + elapsed += time.perf_counter() - start + + neighbors_host = neighbors.copy_to_host() + recall_val = compute_recall(neighbors_host, batch_gt, K) + + avg_time = elapsed / N_TIMED + qps = batch_size / avg_time if avg_time > 0 else 0 + + iter_str = str(max_iter) if max_iter > 0 else "auto" + print( + f"{config_name:<25} {search_width:>3} {batch_size:>6} " + f"{iter_str:>9} {recall_val:>10.4f} {qps:>10.0f}" + ) + + results_table.append( + { + "config": config_name, + "search_width": search_width, + "batch_size": batch_size, + "max_iterations": max_iter, + "recall": recall_val, + "qps": qps, + } + ) + + print() # Blank line between search_width groups + + # Summary analysis + print("\n" + "=" * 75) + print("ANALYSIS: Recall delta at algorithm switch point (batch_size=512)") + print("=" * 75) + for sw in SEARCH_WIDTHS: + multi = [ + r + for r in results_table + if r["config"] == "MULTI_CTA" + and r["search_width"] == sw + and r["batch_size"] == 512 + ] + single_def = [ + r + for r in results_table + if r["config"] == "SINGLE_CTA (default)" + and r["search_width"] == sw + and r["batch_size"] == 512 + ] + single_fix = [ + r + for r in results_table + if r["config"] == "SINGLE_CTA (floor@32)" + and r["search_width"] == sw + and r["batch_size"] == 512 + ] + auto = [ + r + for r in results_table + if r["config"] == "AUTO" + and r["search_width"] == sw + and r["batch_size"] == 512 + ] + + if multi and single_def and single_fix and auto: + m = multi[0] + sd = single_def[0] + sf = single_fix[0] + a = auto[0] + print(f"\n search_width={sw}:") + print( + f" MULTI_CTA: recall={m['recall']:.4f} QPS={m['qps']:.0f}" + ) + print( + f" SINGLE_CTA (default): recall={sd['recall']:.4f} QPS={sd['qps']:.0f} " + f"(recall delta: {sd['recall'] - m['recall']:+.4f})" + ) + print( + f" SINGLE_CTA (floor@32): recall={sf['recall']:.4f} QPS={sf['qps']:.0f} " + f"(recall delta: {sf['recall'] - m['recall']:+.4f})" + ) + print( + f" AUTO (current code): recall={a['recall']:.4f} QPS={a['qps']:.0f} " + f"(recall delta: {a['recall'] - m['recall']:+.4f})" + ) + + if sd["qps"] > 0 and m["qps"] > 0: + print( + f" Throughput gain (SINGLE_CTA default vs MULTI_CTA): " + f"{sd['qps'] / m['qps']:.2f}x" + ) + if sf["qps"] > 0 and m["qps"] > 0: + print( + f" Throughput gain (SINGLE_CTA floor@32 vs MULTI_CTA): " + f"{sf['qps'] / m['qps']:.2f}x" + ) + + print("\n\nKey question: Does SINGLE_CTA (floor@32) recover recall while") + print("preserving throughput advantage over MULTI_CTA?") + + +if __name__ == "__main__": + run_benchmark() diff --git a/c/include/cuvs/neighbors/cagra.h b/c/include/cuvs/neighbors/cagra.h index af6c66c9d4..9664383bc6 100644 --- a/c/include/cuvs/neighbors/cagra.h +++ b/c/include/cuvs/neighbors/cagra.h @@ -222,6 +222,19 @@ struct cuvsCagraIndexParams { * - Others: nullptr */ void* graph_build_params; + /** + * Whether to add the dataset content to the index after building the graph. + * + * - true (default): the index is filled with the dataset vectors and ready to search + * after build, but requires enough memory to hold an aligned copy of the dataset. + * - false: only the search graph is built. The user must call cuvsCagraUpdateDataset + * to attach the dataset before searching. This avoids duplicating the dataset in + * device memory during build, which is useful for memory-constrained scenarios. + * + * When compression is set, this parameter is ignored (compressed dataset is always + * added to the index). + */ + bool attach_dataset_on_build; }; typedef struct cuvsCagraIndexParams* cuvsCagraIndexParams_t; @@ -617,6 +630,26 @@ cuvsError_t cuvsCagraBuild(cuvsResources_t res, DLManagedTensor* dataset, cuvsCagraIndex_t index); +/** + * @brief Update (attach) a dataset to an existing CAGRA index. + * + * This is intended for use after building an index with attach_dataset_on_build = false. + * If the dataset rows are already aligned on 16 bytes and reside on the device, only a + * reference is stored (zero-copy). Otherwise, an aligned copy is made. + * + * It is the caller's responsibility to ensure that the same dataset used for building + * is supplied here. The dataset must remain valid for the lifetime of the index when + * zero-copy is used. + * + * @param[in] res cuvsResources_t opaque C handle + * @param[in] dataset DLManagedTensor* dataset to attach + * @param[in] index cuvsCagraIndex_t the index to update + * @return cuvsError_t + */ +cuvsError_t cuvsCagraUpdateDataset(cuvsResources_t res, + DLManagedTensor* dataset, + cuvsCagraIndex_t index); + /** * @} */ diff --git a/c/include/cuvs/neighbors/ivf_pq.h b/c/include/cuvs/neighbors/ivf_pq.h index c780ab8065..534b53eed8 100644 --- a/c/include/cuvs/neighbors/ivf_pq.h +++ b/c/include/cuvs/neighbors/ivf_pq.h @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -516,8 +517,9 @@ cuvsError_t cuvsIvfPqBuildPrecomputed(cuvsResources_t res, * cuvsError_t params_create_status = cuvsIvfPqSearchParamsCreate(&search_params); * * // Search the `index` built using `cuvsIvfPqBuild` + * cuvsFilter filter = {.addr = 0, .type = NO_FILTER}; * cuvsError_t search_status = cuvsIvfPqSearch(res, search_params, index, &queries, &neighbors, - * &distances); + * &distances, filter); * * // de-allocate `search_params` and `res` * cuvsError_t params_destroy_status = cuvsIvfPqSearchParamsDestroy(search_params); @@ -530,13 +532,15 @@ cuvsError_t cuvsIvfPqBuildPrecomputed(cuvsResources_t res, * @param[in] queries DLManagedTensor* queries dataset to search * @param[out] neighbors DLManagedTensor* output `k` neighbors for queries * @param[out] distances DLManagedTensor* output `k` distances for queries + * @param[in] filter cuvsFilter filter to apply to the search */ cuvsError_t cuvsIvfPqSearch(cuvsResources_t res, cuvsIvfPqSearchParams_t search_params, cuvsIvfPqIndex_t index, DLManagedTensor* queries, DLManagedTensor* neighbors, - DLManagedTensor* distances); + DLManagedTensor* distances, + cuvsFilter filter); /** * @} */ diff --git a/c/src/neighbors/cagra.cpp b/c/src/neighbors/cagra.cpp index 081179ca46..cb2acf88f5 100644 --- a/c/src/neighbors/cagra.cpp +++ b/c/src/neighbors/cagra.cpp @@ -128,6 +128,28 @@ void* _build(cuvsResources_t res, cuvsCagraIndexParams params, DLManagedTensor* return index; } +template +void _update_dataset(cuvsResources_t res, + cuvsCagraIndex index, + DLManagedTensor* dataset_tensor) +{ + auto dataset = dataset_tensor->dl_tensor; + auto res_ptr = reinterpret_cast(res); + auto index_ptr = reinterpret_cast*>(index.addr); + + if (cuvs::core::is_dlpack_device_compatible(dataset)) { + using mdspan_type = raft::device_matrix_view; + auto mds = cuvs::core::from_dlpack(dataset_tensor); + index_ptr->update_dataset(*res_ptr, mds); + } else if (cuvs::core::is_dlpack_host_compatible(dataset)) { + using mdspan_type = raft::host_matrix_view; + auto mds = cuvs::core::from_dlpack(dataset_tensor); + index_ptr->update_dataset(*res_ptr, mds); + } else { + RAFT_FAIL("Unsupported dataset DLtensor device type: %d", dataset.device.device_type); + } +} + template void* _from_args(cuvsResources_t res, cuvsDistanceType _metric, @@ -221,22 +243,26 @@ void _search(cuvsResources_t res, if (filter.type == NO_FILTER) { cuvs::neighbors::cagra::search( *res_ptr, search_params, *index_ptr, queries_mds, neighbors_mds, distances_mds); + } else if (filter.type == BITMAP) { + using filter_mdspan_type = raft::device_vector_view; + using filter_bmp_type = cuvs::core::bitmap_view; + auto filter_tensor = reinterpret_cast(filter.addr); + auto filter_mds = cuvs::core::from_dlpack(filter_tensor); + const auto bitmap_filter_obj = cuvs::neighbors::filtering::bitmap_filter( + filter_bmp_type((std::uint32_t*)filter_mds.data_handle(), queries_mds.extent(0), index_ptr->size())); + cuvs::neighbors::cagra::search( + *res_ptr, search_params, *index_ptr, queries_mds, neighbors_mds, distances_mds, bitmap_filter_obj); } else if (filter.type == BITSET) { - using filter_mdspan_type = raft::device_vector_view; - auto removed_indices_tensor = reinterpret_cast(filter.addr); - auto removed_indices = cuvs::core::from_dlpack(removed_indices_tensor); - cuvs::core::bitset_view removed_indices_bitset( - removed_indices, index_ptr->dataset().extent(0)); - auto bitset_filter_obj = cuvs::neighbors::filtering::bitset_filter(removed_indices_bitset); - cuvs::neighbors::cagra::search(*res_ptr, - search_params, - *index_ptr, - queries_mds, - neighbors_mds, - distances_mds, - bitset_filter_obj); + using filter_mdspan_type = raft::device_vector_view; + using filter_bst_type = cuvs::core::bitset_view; + auto filter_tensor = reinterpret_cast(filter.addr); + auto filter_mds = cuvs::core::from_dlpack(filter_tensor); + const auto bitset_filter_obj = cuvs::neighbors::filtering::bitset_filter( + filter_bst_type((std::uint32_t*)filter_mds.data_handle(), index_ptr->size())); + cuvs::neighbors::cagra::search( + *res_ptr, search_params, *index_ptr, queries_mds, neighbors_mds, distances_mds, bitset_filter_obj); } else { - RAFT_FAIL("Unsupported filter type: BITMAP"); + RAFT_FAIL("Unsupported filter type"); } } @@ -442,6 +468,7 @@ void convert_c_index_params(cuvsCagraIndexParams params, out->metric = static_cast((int)params.metric); out->intermediate_graph_degree = params.intermediate_graph_degree; out->graph_degree = params.graph_degree; + out->attach_dataset_on_build = params.attach_dataset_on_build; _set_graph_build_params(out->graph_build_params, params, params.build_algo, n_rows, dim); if (auto* cparams = params.compression; cparams != nullptr) { @@ -588,6 +615,27 @@ extern "C" cuvsError_t cuvsCagraBuild(cuvsResources_t res, }); } +extern "C" cuvsError_t cuvsCagraUpdateDataset(cuvsResources_t res, + DLManagedTensor* dataset_tensor, + cuvsCagraIndex_t index) +{ + return cuvs::core::translate_exceptions([=] { + if (index->dtype.code == kDLFloat && index->dtype.bits == 32) { + _update_dataset(res, *index, dataset_tensor); + } else if (index->dtype.code == kDLFloat && index->dtype.bits == 16) { + _update_dataset(res, *index, dataset_tensor); + } else if (index->dtype.code == kDLInt && index->dtype.bits == 8) { + _update_dataset(res, *index, dataset_tensor); + } else if (index->dtype.code == kDLUInt && index->dtype.bits == 8) { + _update_dataset(res, *index, dataset_tensor); + } else { + RAFT_FAIL("Unsupported index dtype: %d and bits: %d", + index->dtype.code, + index->dtype.bits); + } + }); +} + extern "C" cuvsError_t cuvsCagraIndexFromArgs(cuvsResources_t res, cuvsDistanceType metric, DLManagedTensor* graph_tensor, @@ -736,7 +784,10 @@ extern "C" cuvsError_t cuvsCagraIndexParamsCreate(cuvsCagraIndexParams_t* params .intermediate_graph_degree = 128, .graph_degree = 64, .build_algo = IVF_PQ, - .nn_descent_niter = 20}; + .nn_descent_niter = 20, + .compression = nullptr, + .graph_build_params = nullptr, + .attach_dataset_on_build = true}; (*params)->graph_build_params = new cuvsIvfPqParams{nullptr, nullptr, 1}; }); } diff --git a/c/src/neighbors/ivf_flat.cpp b/c/src/neighbors/ivf_flat.cpp index 56a3088e89..58accc7f81 100644 --- a/c/src/neighbors/ivf_flat.cpp +++ b/c/src/neighbors/ivf_flat.cpp @@ -90,23 +90,26 @@ void _search(cuvsResources_t res, if (filter.type == NO_FILTER) { cuvs::neighbors::ivf_flat::search( *res_ptr, search_params, *index_ptr, queries_mds, neighbors_mds, distances_mds); + } else if (filter.type == BITMAP) { + using filter_mdspan_type = raft::device_vector_view; + using filter_bmp_type = cuvs::core::bitmap_view; + auto filter_tensor = reinterpret_cast(filter.addr); + auto filter_mds = cuvs::core::from_dlpack(filter_tensor); + const auto bitmap_filter_obj = cuvs::neighbors::filtering::bitmap_filter( + filter_bmp_type((std::uint32_t*)filter_mds.data_handle(), queries_mds.extent(0), index_ptr->size())); + cuvs::neighbors::ivf_flat::search( + *res_ptr, search_params, *index_ptr, queries_mds, neighbors_mds, distances_mds, bitmap_filter_obj); } else if (filter.type == BITSET) { - using filter_mdspan_type = raft::device_vector_view; - auto removed_indices_tensor = reinterpret_cast(filter.addr); - auto removed_indices = cuvs::core::from_dlpack(removed_indices_tensor); - cuvs::core::bitset_view removed_indices_bitset(removed_indices, - index_ptr->size()); - auto bitset_filter_obj = cuvs::neighbors::filtering::bitset_filter(removed_indices_bitset); - cuvs::neighbors::ivf_flat::search(*res_ptr, - search_params, - *index_ptr, - queries_mds, - neighbors_mds, - distances_mds, - bitset_filter_obj); - + using filter_mdspan_type = raft::device_vector_view; + using filter_bst_type = cuvs::core::bitset_view; + auto filter_tensor = reinterpret_cast(filter.addr); + auto filter_mds = cuvs::core::from_dlpack(filter_tensor); + const auto bitset_filter_obj = cuvs::neighbors::filtering::bitset_filter( + filter_bst_type((std::uint32_t*)filter_mds.data_handle(), index_ptr->size())); + cuvs::neighbors::ivf_flat::search( + *res_ptr, search_params, *index_ptr, queries_mds, neighbors_mds, distances_mds, bitset_filter_obj); } else { - RAFT_FAIL("Unsupported filter type: BITMAP"); + RAFT_FAIL("Unsupported filter type"); } } diff --git a/c/src/neighbors/ivf_pq.cpp b/c/src/neighbors/ivf_pq.cpp index 8c444431e6..8835a94835 100644 --- a/c/src/neighbors/ivf_pq.cpp +++ b/c/src/neighbors/ivf_pq.cpp @@ -83,7 +83,8 @@ void _search(cuvsResources_t res, cuvsIvfPqIndex index, DLManagedTensor* queries_tensor, DLManagedTensor* neighbors_tensor, - DLManagedTensor* distances_tensor) + DLManagedTensor* distances_tensor, + cuvsFilter filter) { auto res_ptr = reinterpret_cast(res); auto index_ptr = reinterpret_cast*>(index.addr); @@ -98,8 +99,30 @@ void _search(cuvsResources_t res, auto neighbors_mds = cuvs::core::from_dlpack(neighbors_tensor); auto distances_mds = cuvs::core::from_dlpack(distances_tensor); - cuvs::neighbors::ivf_pq::search( - *res_ptr, search_params, *index_ptr, queries_mds, neighbors_mds, distances_mds); + if (filter.type == NO_FILTER) { + cuvs::neighbors::ivf_pq::search( + *res_ptr, search_params, *index_ptr, queries_mds, neighbors_mds, distances_mds); + } else if (filter.type == BITMAP) { + using filter_mdspan_type = raft::device_vector_view; + using filter_bmp_type = cuvs::core::bitmap_view; + auto filter_tensor = reinterpret_cast(filter.addr); + auto filter_mds = cuvs::core::from_dlpack(filter_tensor); + const auto bitmap_filter_obj = cuvs::neighbors::filtering::bitmap_filter( + filter_bmp_type((std::uint32_t*)filter_mds.data_handle(), queries_mds.extent(0), index_ptr->size())); + cuvs::neighbors::ivf_pq::search( + *res_ptr, search_params, *index_ptr, queries_mds, neighbors_mds, distances_mds, bitmap_filter_obj); + } else if (filter.type == BITSET) { + using filter_mdspan_type = raft::device_vector_view; + using filter_bst_type = cuvs::core::bitset_view; + auto filter_tensor = reinterpret_cast(filter.addr); + auto filter_mds = cuvs::core::from_dlpack(filter_tensor); + const auto bitset_filter_obj = cuvs::neighbors::filtering::bitset_filter( + filter_bst_type((std::uint32_t*)filter_mds.data_handle(), index_ptr->size())); + cuvs::neighbors::ivf_pq::search( + *res_ptr, search_params, *index_ptr, queries_mds, neighbors_mds, distances_mds, bitset_filter_obj); + } else { + RAFT_FAIL("Unsupported filter type"); + } } template @@ -305,7 +328,8 @@ extern "C" cuvsError_t cuvsIvfPqSearch(cuvsResources_t res, cuvsIvfPqIndex_t index_c_ptr, DLManagedTensor* queries_tensor, DLManagedTensor* neighbors_tensor, - DLManagedTensor* distances_tensor) + DLManagedTensor* distances_tensor, + cuvsFilter filter) { return cuvs::core::translate_exceptions([=] { auto queries = queries_tensor->dl_tensor; @@ -327,16 +351,16 @@ extern "C" cuvsError_t cuvsIvfPqSearch(cuvsResources_t res, auto index = *index_c_ptr; if (queries.dtype.code == kDLFloat && queries.dtype.bits == 32) { _search( - res, *params, index, queries_tensor, neighbors_tensor, distances_tensor); + res, *params, index, queries_tensor, neighbors_tensor, distances_tensor, filter); } else if (queries.dtype.code == kDLFloat && queries.dtype.bits == 16) { _search( - res, *params, index, queries_tensor, neighbors_tensor, distances_tensor); + res, *params, index, queries_tensor, neighbors_tensor, distances_tensor, filter); } else if (queries.dtype.code == kDLInt && queries.dtype.bits == 8) { _search( - res, *params, index, queries_tensor, neighbors_tensor, distances_tensor); + res, *params, index, queries_tensor, neighbors_tensor, distances_tensor, filter); } else if (queries.dtype.code == kDLUInt && queries.dtype.bits == 8) { _search( - res, *params, index, queries_tensor, neighbors_tensor, distances_tensor); + res, *params, index, queries_tensor, neighbors_tensor, distances_tensor, filter); } else { RAFT_FAIL("Unsupported queries DLtensor dtype: %d and bits: %d", queries.dtype.code, diff --git a/c/src/neighbors/tiered_index.cpp b/c/src/neighbors/tiered_index.cpp index 2a7d54b16d..1c21b0876a 100644 --- a/c/src/neighbors/tiered_index.cpp +++ b/c/src/neighbors/tiered_index.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -71,6 +71,9 @@ void* _build(cuvsResources_t res, cuvsTieredIndexParams params, DLManagedTensor* case CUVS_TIERED_INDEX_ALGO_CAGRA: { auto build_params = tiered_index::index_params(); convert_c_index_params(params, dataset.shape[0], dataset.shape[1], &build_params); + // The tiered index sub-CAGRA always needs the dataset attached for search. + // Force this in case the caller did not set the field (e.g. zero-initialized struct). + build_params.attach_dataset_on_build = true; return new tiered_index::index>( tiered_index::build(*res_ptr, build_params, mds)); } diff --git a/c/tests/neighbors/ann_cagra_c.cu b/c/tests/neighbors/ann_cagra_c.cu index 9c14bbea7d..5f1e0cfe33 100644 --- a/c/tests/neighbors/ann_cagra_c.cu +++ b/c/tests/neighbors/ann_cagra_c.cu @@ -337,7 +337,229 @@ TEST(CagraC, BuildExtendSearch) cuvsResourcesDestroy(res); } -TEST(CagraC, BuildSearchFiltered) +TEST(CagraC, BuildNoDatasetThenUpdateAndSearch) +{ + // Test the attach_dataset_on_build = false workflow: + // 1. Build index without attaching dataset (saves a full dataset copy) + // 2. Attach dataset via cuvsCagraUpdateDataset + // 3. Search and verify correctness + + // create cuvsResources_t + cuvsResources_t res; + cuvsResourcesCreate(&res); + cudaStream_t stream; + cuvsStreamGet(res, &stream); + + // create dataset DLTensor + DLManagedTensor dataset_tensor; + dataset_tensor.dl_tensor.data = dataset; + dataset_tensor.dl_tensor.device.device_type = kDLCPU; + dataset_tensor.dl_tensor.ndim = 2; + dataset_tensor.dl_tensor.dtype.code = kDLFloat; + dataset_tensor.dl_tensor.dtype.bits = 32; + dataset_tensor.dl_tensor.dtype.lanes = 1; + int64_t dataset_shape[2] = {4, 2}; + dataset_tensor.dl_tensor.shape = dataset_shape; + dataset_tensor.dl_tensor.strides = nullptr; + + // create index + cuvsCagraIndex_t index; + cuvsCagraIndexCreate(&index); + + // build index with attach_dataset_on_build = false + cuvsCagraIndexParams_t build_params; + cuvsCagraIndexParamsCreate(&build_params); + build_params->attach_dataset_on_build = false; + ASSERT_EQ(cuvsCagraBuild(res, build_params, &dataset_tensor, index), CUVS_SUCCESS); + + // now attach the dataset + ASSERT_EQ(cuvsCagraUpdateDataset(res, &dataset_tensor, index), CUVS_SUCCESS); + + // create queries DLTensor + rmm::device_uvector queries_d(4 * 2, stream); + raft::copy(queries_d.data(), (float*)queries, 4 * 2, stream); + + DLManagedTensor queries_tensor; + queries_tensor.dl_tensor.data = queries_d.data(); + queries_tensor.dl_tensor.device.device_type = kDLCUDA; + queries_tensor.dl_tensor.ndim = 2; + queries_tensor.dl_tensor.dtype.code = kDLFloat; + queries_tensor.dl_tensor.dtype.bits = 32; + queries_tensor.dl_tensor.dtype.lanes = 1; + int64_t queries_shape[2] = {4, 2}; + queries_tensor.dl_tensor.shape = queries_shape; + queries_tensor.dl_tensor.strides = nullptr; + + // create neighbors DLTensor + rmm::device_uvector neighbors_d(4, stream); + + DLManagedTensor neighbors_tensor; + neighbors_tensor.dl_tensor.data = neighbors_d.data(); + neighbors_tensor.dl_tensor.device.device_type = kDLCUDA; + neighbors_tensor.dl_tensor.ndim = 2; + neighbors_tensor.dl_tensor.dtype.code = kDLUInt; + neighbors_tensor.dl_tensor.dtype.bits = 32; + neighbors_tensor.dl_tensor.dtype.lanes = 1; + int64_t neighbors_shape[2] = {4, 1}; + neighbors_tensor.dl_tensor.shape = neighbors_shape; + neighbors_tensor.dl_tensor.strides = nullptr; + + // create distances DLTensor + rmm::device_uvector distances_d(4, stream); + + DLManagedTensor distances_tensor; + distances_tensor.dl_tensor.data = distances_d.data(); + distances_tensor.dl_tensor.device.device_type = kDLCUDA; + distances_tensor.dl_tensor.ndim = 2; + distances_tensor.dl_tensor.dtype.code = kDLFloat; + distances_tensor.dl_tensor.dtype.bits = 32; + distances_tensor.dl_tensor.dtype.lanes = 1; + int64_t distances_shape[2] = {4, 1}; + distances_tensor.dl_tensor.shape = distances_shape; + distances_tensor.dl_tensor.strides = nullptr; + + cuvsFilter filter; + filter.type = NO_FILTER; + filter.addr = (uintptr_t)NULL; + + // search index + cuvsCagraSearchParams_t search_params; + cuvsCagraSearchParamsCreate(&search_params); + cuvsCagraSearch( + res, search_params, index, &queries_tensor, &neighbors_tensor, &distances_tensor, filter); + + // verify output — should match the standard BuildSearch test results + ASSERT_TRUE( + cuvs::devArrMatchHost(neighbors_exp, neighbors_d.data(), 4, cuvs::Compare())); + ASSERT_TRUE(cuvs::devArrMatchHost( + distances_exp, distances_d.data(), 4, cuvs::CompareApprox(0.001f))); + + // de-allocate index and res + cuvsCagraSearchParamsDestroy(search_params); + cuvsCagraIndexParamsDestroy(build_params); + cuvsCagraIndexDestroy(index); + cuvsResourcesDestroy(res); +} + +TEST(CagraC, BuildNoDatasetThenUpdateDeviceAndSearch) +{ + // Test the motivating scenario: dataset already on device (kDLCUDA). + // Using attach_dataset_on_build = false avoids duplicating the device dataset, + // then cuvsCagraUpdateDataset attaches it (zero-copy when properly aligned). + + // create cuvsResources_t + cuvsResources_t res; + cuvsResourcesCreate(&res); + cudaStream_t stream; + cuvsStreamGet(res, &stream); + + // copy dataset to device memory (simulating a dataset that is already on GPU) + rmm::device_uvector dataset_d(4 * 2, stream); + raft::copy(dataset_d.data(), (float*)dataset, 4 * 2, stream); + + // create dataset DLTensor on CPU for building the graph + DLManagedTensor dataset_tensor; + dataset_tensor.dl_tensor.data = dataset; + dataset_tensor.dl_tensor.device.device_type = kDLCPU; + dataset_tensor.dl_tensor.ndim = 2; + dataset_tensor.dl_tensor.dtype.code = kDLFloat; + dataset_tensor.dl_tensor.dtype.bits = 32; + dataset_tensor.dl_tensor.dtype.lanes = 1; + int64_t dataset_shape[2] = {4, 2}; + dataset_tensor.dl_tensor.shape = dataset_shape; + dataset_tensor.dl_tensor.strides = nullptr; + + // create index + cuvsCagraIndex_t index; + cuvsCagraIndexCreate(&index); + + // build index with attach_dataset_on_build = false + cuvsCagraIndexParams_t build_params; + cuvsCagraIndexParamsCreate(&build_params); + build_params->attach_dataset_on_build = false; + ASSERT_EQ(cuvsCagraBuild(res, build_params, &dataset_tensor, index), CUVS_SUCCESS); + + // attach the device-resident dataset via cuvsCagraUpdateDataset (kDLCUDA path) + DLManagedTensor device_dataset_tensor; + device_dataset_tensor.dl_tensor.data = dataset_d.data(); + device_dataset_tensor.dl_tensor.device.device_type = kDLCUDA; + device_dataset_tensor.dl_tensor.device.device_id = 0; + device_dataset_tensor.dl_tensor.ndim = 2; + device_dataset_tensor.dl_tensor.dtype.code = kDLFloat; + device_dataset_tensor.dl_tensor.dtype.bits = 32; + device_dataset_tensor.dl_tensor.dtype.lanes = 1; + device_dataset_tensor.dl_tensor.shape = dataset_shape; + device_dataset_tensor.dl_tensor.strides = nullptr; + + ASSERT_EQ(cuvsCagraUpdateDataset(res, &device_dataset_tensor, index), CUVS_SUCCESS); + + // create queries DLTensor + rmm::device_uvector queries_d(4 * 2, stream); + raft::copy(queries_d.data(), (float*)queries, 4 * 2, stream); + + DLManagedTensor queries_tensor; + queries_tensor.dl_tensor.data = queries_d.data(); + queries_tensor.dl_tensor.device.device_type = kDLCUDA; + queries_tensor.dl_tensor.ndim = 2; + queries_tensor.dl_tensor.dtype.code = kDLFloat; + queries_tensor.dl_tensor.dtype.bits = 32; + queries_tensor.dl_tensor.dtype.lanes = 1; + int64_t queries_shape[2] = {4, 2}; + queries_tensor.dl_tensor.shape = queries_shape; + queries_tensor.dl_tensor.strides = nullptr; + + // create neighbors DLTensor + rmm::device_uvector neighbors_d(4, stream); + + DLManagedTensor neighbors_tensor; + neighbors_tensor.dl_tensor.data = neighbors_d.data(); + neighbors_tensor.dl_tensor.device.device_type = kDLCUDA; + neighbors_tensor.dl_tensor.ndim = 2; + neighbors_tensor.dl_tensor.dtype.code = kDLUInt; + neighbors_tensor.dl_tensor.dtype.bits = 32; + neighbors_tensor.dl_tensor.dtype.lanes = 1; + int64_t neighbors_shape[2] = {4, 1}; + neighbors_tensor.dl_tensor.shape = neighbors_shape; + neighbors_tensor.dl_tensor.strides = nullptr; + + // create distances DLTensor + rmm::device_uvector distances_d(4, stream); + + DLManagedTensor distances_tensor; + distances_tensor.dl_tensor.data = distances_d.data(); + distances_tensor.dl_tensor.device.device_type = kDLCUDA; + distances_tensor.dl_tensor.ndim = 2; + distances_tensor.dl_tensor.dtype.code = kDLFloat; + distances_tensor.dl_tensor.dtype.bits = 32; + distances_tensor.dl_tensor.dtype.lanes = 1; + int64_t distances_shape[2] = {4, 1}; + distances_tensor.dl_tensor.shape = distances_shape; + distances_tensor.dl_tensor.strides = nullptr; + + cuvsFilter filter; + filter.type = NO_FILTER; + filter.addr = (uintptr_t)NULL; + + // search index + cuvsCagraSearchParams_t search_params; + cuvsCagraSearchParamsCreate(&search_params); + cuvsCagraSearch( + res, search_params, index, &queries_tensor, &neighbors_tensor, &distances_tensor, filter); + + // verify output — should match the standard BuildSearch test results + ASSERT_TRUE( + cuvs::devArrMatchHost(neighbors_exp, neighbors_d.data(), 4, cuvs::Compare())); + ASSERT_TRUE(cuvs::devArrMatchHost( + distances_exp, distances_d.data(), 4, cuvs::CompareApprox(0.001f))); + + // de-allocate index and res + cuvsCagraSearchParamsDestroy(search_params); + cuvsCagraIndexParamsDestroy(build_params); + cuvsCagraIndexDestroy(index); + cuvsResourcesDestroy(res); +} + +TEST(CagraC, BuildSearchBitsetFiltered) { // create cuvsResources_t cuvsResources_t res; @@ -447,6 +669,142 @@ TEST(CagraC, BuildSearchFiltered) cuvsResourcesDestroy(res); } +TEST(CagraC, BuildSearchBitmapFiltered) +{ + int64_t n_rows = 100; + int64_t n_queries = 10; + int64_t n_dim = 16; + uint32_t n_neighbors = 4; + + raft::handle_t handle; + auto stream = raft::resource::get_cuda_stream(handle); + + // Generate data + rmm::device_uvector index_data(n_rows * n_dim, stream); + rmm::device_uvector query_data(n_queries * n_dim, stream); + raft::random::RngState r(1234ULL); + raft::random::uniform( + handle, r, index_data.data(), n_rows * n_dim, float(0.1), float(2.0)); + raft::random::uniform( + handle, r, query_data.data(), n_queries * n_dim, float(0.1), float(2.0)); + + // create cuvsResources_t + cuvsResources_t res; + cuvsResourcesCreate(&res); + + // create dataset DLTensor + DLManagedTensor dataset_tensor; + dataset_tensor.dl_tensor.data = index_data.data(); + dataset_tensor.dl_tensor.device.device_type = kDLCUDA; + dataset_tensor.dl_tensor.ndim = 2; + dataset_tensor.dl_tensor.dtype.code = kDLFloat; + dataset_tensor.dl_tensor.dtype.bits = 32; + dataset_tensor.dl_tensor.dtype.lanes = 1; + int64_t dataset_shape[2] = {n_rows, n_dim}; + dataset_tensor.dl_tensor.shape = dataset_shape; + dataset_tensor.dl_tensor.strides = nullptr; + + // create index + cuvsCagraIndex_t index; + cuvsCagraIndexCreate(&index); + + // build index + cuvsCagraIndexParams_t build_params; + cuvsCagraIndexParamsCreate(&build_params); + cuvsCagraBuild(res, build_params, &dataset_tensor, index); + + // create queries DLTensor + DLManagedTensor queries_tensor; + queries_tensor.dl_tensor.data = query_data.data(); + queries_tensor.dl_tensor.device.device_type = kDLCUDA; + queries_tensor.dl_tensor.ndim = 2; + queries_tensor.dl_tensor.dtype.code = kDLFloat; + queries_tensor.dl_tensor.dtype.bits = 32; + queries_tensor.dl_tensor.dtype.lanes = 1; + int64_t queries_shape[2] = {n_queries, n_dim}; + queries_tensor.dl_tensor.shape = queries_shape; + queries_tensor.dl_tensor.strides = nullptr; + + // create neighbors DLTensor + rmm::device_uvector neighbors_data(n_queries * n_neighbors, stream); + DLManagedTensor neighbors_tensor; + neighbors_tensor.dl_tensor.data = neighbors_data.data(); + neighbors_tensor.dl_tensor.device.device_type = kDLCUDA; + neighbors_tensor.dl_tensor.ndim = 2; + neighbors_tensor.dl_tensor.dtype.code = kDLUInt; + neighbors_tensor.dl_tensor.dtype.bits = 32; + neighbors_tensor.dl_tensor.dtype.lanes = 1; + int64_t neighbors_shape[2] = {n_queries, n_neighbors}; + neighbors_tensor.dl_tensor.shape = neighbors_shape; + neighbors_tensor.dl_tensor.strides = nullptr; + + // create distances DLTensor + rmm::device_uvector distances_data(n_queries * n_neighbors, stream); + DLManagedTensor distances_tensor; + distances_tensor.dl_tensor.data = distances_data.data(); + distances_tensor.dl_tensor.device.device_type = kDLCUDA; + distances_tensor.dl_tensor.ndim = 2; + distances_tensor.dl_tensor.dtype.code = kDLFloat; + distances_tensor.dl_tensor.dtype.bits = 32; + distances_tensor.dl_tensor.dtype.lanes = 1; + int64_t distances_shape[2] = {n_queries, n_neighbors}; + distances_tensor.dl_tensor.shape = distances_shape; + distances_tensor.dl_tensor.strides = nullptr; + + // Create bitmap filter - per query filter + // For each query, remove even indices + auto bitmap_size = n_queries * ((n_rows + 31) / 32); // n_queries x (bits for n_rows) + rmm::device_uvector filter_bitmap(bitmap_size, stream); + std::vector filter_bitmap_h(bitmap_size); + for (size_t q = 0; q < n_queries; ++q) { + for (size_t i = 0; i < (n_rows + 31) / 32; ++i) { + filter_bitmap_h[q * ((n_rows + 31) / 32) + i] = + 0xAAAAAAAA; // 10101010... pattern - removes even indices + } + } + raft::copy(filter_bitmap.data(), filter_bitmap_h.data(), bitmap_size, stream); + + DLManagedTensor filter_tensor; + filter_tensor.dl_tensor.data = filter_bitmap.data(); + filter_tensor.dl_tensor.device.device_type = kDLCUDA; + filter_tensor.dl_tensor.ndim = 1; + filter_tensor.dl_tensor.dtype.code = kDLUInt; + filter_tensor.dl_tensor.dtype.bits = 32; + filter_tensor.dl_tensor.dtype.lanes = 1; + int64_t filter_shape[1] = {bitmap_size}; + filter_tensor.dl_tensor.shape = filter_shape; + filter_tensor.dl_tensor.strides = nullptr; + + cuvsFilter filter; + filter.type = BITMAP; + filter.addr = (uintptr_t)&filter_tensor; + + // search index with bitmap filter + cuvsCagraSearchParams_t search_params; + cuvsCagraSearchParamsCreate(&search_params); + cuvsCagraSearch( + res, search_params, index, &queries_tensor, &neighbors_tensor, &distances_tensor, filter); + + // Verify all returned neighbors are odd indices (not filtered out) + std::vector neighbors_h(n_queries * n_neighbors); + raft::copy(neighbors_h.data(), neighbors_data.data(), n_queries * n_neighbors, stream); + raft::resource::sync_stream(handle); + + for (size_t i = 0; i < n_queries * n_neighbors; ++i) { + // All neighbors should be odd indices (since even indices are filtered) + // Note: uint32_t max value indicates no valid neighbor found + ASSERT_TRUE(neighbors_h[i] % 2 == 1 || neighbors_h[i] == std::numeric_limits::max()) + << "Neighbor at position " << i << " has value " << neighbors_h[i] + << " which is an even index (should be filtered)"; + } + + // de-allocate index and res + cuvsCagraSearchParamsDestroy(search_params); + cuvsCagraIndexParamsDestroy(build_params); + cuvsCagraIndexDestroy(index); + cuvsResourcesDestroy(res); +} + TEST(CagraC, BuildMergeSearch) { cuvsResources_t res; diff --git a/c/tests/neighbors/ann_ivf_flat_c.cu b/c/tests/neighbors/ann_ivf_flat_c.cu index 2039721d2f..f07706fa77 100644 --- a/c/tests/neighbors/ann_ivf_flat_c.cu +++ b/c/tests/neighbors/ann_ivf_flat_c.cu @@ -129,3 +129,276 @@ TEST(IvfFlatC, BuildSearch) n_probes, n_lists); } +TEST(IvfFlatC, BuildSearchBitsetFiltered) +{ + int64_t n_rows = 1000; + int64_t n_queries = 10; + int64_t n_dim = 16; + uint32_t n_neighbors = 10; + + raft::handle_t handle; + auto stream = raft::resource::get_cuda_stream(handle); + + cuvsDistanceType metric = L2Expanded; + size_t n_probes = 10; + size_t n_lists = 20; + + // Generate data + rmm::device_uvector index_data(n_rows * n_dim, stream); + rmm::device_uvector query_data(n_queries * n_dim, stream); + generate_random_data(index_data.data(), n_rows * n_dim); + generate_random_data(query_data.data(), n_queries * n_dim); + + // create cuvsResources_t + cuvsResources_t res; + cuvsResourcesCreate(&res); + + // create dataset DLTensor + DLManagedTensor dataset_tensor; + dataset_tensor.dl_tensor.data = index_data.data(); + dataset_tensor.dl_tensor.device.device_type = kDLCUDA; + dataset_tensor.dl_tensor.ndim = 2; + dataset_tensor.dl_tensor.dtype.code = kDLFloat; + dataset_tensor.dl_tensor.dtype.bits = 32; + dataset_tensor.dl_tensor.dtype.lanes = 1; + int64_t dataset_shape[2] = {n_rows, n_dim}; + dataset_tensor.dl_tensor.shape = dataset_shape; + dataset_tensor.dl_tensor.strides = NULL; + + // create index + cuvsIvfFlatIndex_t index; + cuvsIvfFlatIndexCreate(&index); + + // build index + cuvsIvfFlatIndexParams_t build_params; + cuvsIvfFlatIndexParamsCreate(&build_params); + build_params->metric = metric; + build_params->n_lists = n_lists; + cuvsIvfFlatBuild(res, build_params, &dataset_tensor, index); + + // create queries DLTensor + DLManagedTensor queries_tensor; + queries_tensor.dl_tensor.data = query_data.data(); + queries_tensor.dl_tensor.device.device_type = kDLCUDA; + queries_tensor.dl_tensor.ndim = 2; + queries_tensor.dl_tensor.dtype.code = kDLFloat; + queries_tensor.dl_tensor.dtype.bits = 32; + queries_tensor.dl_tensor.dtype.lanes = 1; + int64_t queries_shape[2] = {n_queries, n_dim}; + queries_tensor.dl_tensor.shape = queries_shape; + queries_tensor.dl_tensor.strides = NULL; + + // create neighbors DLTensor + rmm::device_uvector neighbors_data(n_queries * n_neighbors, stream); + DLManagedTensor neighbors_tensor; + neighbors_tensor.dl_tensor.data = neighbors_data.data(); + neighbors_tensor.dl_tensor.device.device_type = kDLCUDA; + neighbors_tensor.dl_tensor.ndim = 2; + neighbors_tensor.dl_tensor.dtype.code = kDLInt; + neighbors_tensor.dl_tensor.dtype.bits = 64; + neighbors_tensor.dl_tensor.dtype.lanes = 1; + int64_t neighbors_shape[2] = {n_queries, n_neighbors}; + neighbors_tensor.dl_tensor.shape = neighbors_shape; + neighbors_tensor.dl_tensor.strides = NULL; + + // create distances DLTensor + rmm::device_uvector distances_data(n_queries * n_neighbors, stream); + DLManagedTensor distances_tensor; + distances_tensor.dl_tensor.data = distances_data.data(); + distances_tensor.dl_tensor.device.device_type = kDLCUDA; + distances_tensor.dl_tensor.ndim = 2; + distances_tensor.dl_tensor.dtype.code = kDLFloat; + distances_tensor.dl_tensor.dtype.bits = 32; + distances_tensor.dl_tensor.dtype.lanes = 1; + int64_t distances_shape[2] = {n_queries, n_neighbors}; + distances_tensor.dl_tensor.shape = distances_shape; + distances_tensor.dl_tensor.strides = NULL; + + // Create bitset filter - remove every other index + auto bitset_size = (n_rows + 31) / 32; // number of uint32_t needed + rmm::device_uvector filter_bitset(bitset_size, stream); + std::vector filter_bitset_h(bitset_size); + for (size_t i = 0; i < bitset_size; ++i) { + filter_bitset_h[i] = 0xAAAAAAAA; // 10101010... pattern - removes even indices + } + raft::copy(filter_bitset.data(), filter_bitset_h.data(), bitset_size, stream); + + DLManagedTensor filter_tensor; + filter_tensor.dl_tensor.data = filter_bitset.data(); + filter_tensor.dl_tensor.device.device_type = kDLCUDA; + filter_tensor.dl_tensor.ndim = 1; + filter_tensor.dl_tensor.dtype.code = kDLUInt; + filter_tensor.dl_tensor.dtype.bits = 32; + filter_tensor.dl_tensor.dtype.lanes = 1; + int64_t filter_shape[1] = {bitset_size}; + filter_tensor.dl_tensor.shape = filter_shape; + filter_tensor.dl_tensor.strides = NULL; + + cuvsFilter filter; + filter.type = BITSET; + filter.addr = (uintptr_t)&filter_tensor; + + // search index with filter + cuvsIvfFlatSearchParams_t search_params; + cuvsIvfFlatSearchParamsCreate(&search_params); + search_params->n_probes = n_probes; + cuvsIvfFlatSearch( + res, search_params, index, &queries_tensor, &neighbors_tensor, &distances_tensor, filter); + + // Verify all returned neighbors are odd indices (not filtered out) + std::vector neighbors_h(n_queries * n_neighbors); + raft::copy(neighbors_h.data(), neighbors_data.data(), n_queries * n_neighbors, stream); + raft::resource::sync_stream(handle); + + for (size_t i = 0; i < n_queries * n_neighbors; ++i) { + // All neighbors should be odd indices (since even indices are filtered) + ASSERT_TRUE(neighbors_h[i] % 2 == 1 || neighbors_h[i] == -1) + << "Neighbor at position " << i << " has value " << neighbors_h[i] + << " which is an even index (should be filtered)"; + } + + // de-allocate index and res + cuvsIvfFlatSearchParamsDestroy(search_params); + cuvsIvfFlatIndexParamsDestroy(build_params); + cuvsIvfFlatIndexDestroy(index); + cuvsResourcesDestroy(res); +} + +TEST(IvfFlatC, BuildSearchBitmapFiltered) +{ + int64_t n_rows = 1000; + int64_t n_queries = 10; + int64_t n_dim = 16; + uint32_t n_neighbors = 10; + + raft::handle_t handle; + auto stream = raft::resource::get_cuda_stream(handle); + + cuvsDistanceType metric = L2Expanded; + size_t n_probes = 10; + size_t n_lists = 20; + + // Generate data + rmm::device_uvector index_data(n_rows * n_dim, stream); + rmm::device_uvector query_data(n_queries * n_dim, stream); + generate_random_data(index_data.data(), n_rows * n_dim); + generate_random_data(query_data.data(), n_queries * n_dim); + + // create cuvsResources_t + cuvsResources_t res; + cuvsResourcesCreate(&res); + + // create dataset DLTensor + DLManagedTensor dataset_tensor; + dataset_tensor.dl_tensor.data = index_data.data(); + dataset_tensor.dl_tensor.device.device_type = kDLCUDA; + dataset_tensor.dl_tensor.ndim = 2; + dataset_tensor.dl_tensor.dtype.code = kDLFloat; + dataset_tensor.dl_tensor.dtype.bits = 32; + dataset_tensor.dl_tensor.dtype.lanes = 1; + int64_t dataset_shape[2] = {n_rows, n_dim}; + dataset_tensor.dl_tensor.shape = dataset_shape; + dataset_tensor.dl_tensor.strides = NULL; + + // create index + cuvsIvfFlatIndex_t index; + cuvsIvfFlatIndexCreate(&index); + + // build index + cuvsIvfFlatIndexParams_t build_params; + cuvsIvfFlatIndexParamsCreate(&build_params); + build_params->metric = metric; + build_params->n_lists = n_lists; + cuvsIvfFlatBuild(res, build_params, &dataset_tensor, index); + + // create queries DLTensor + DLManagedTensor queries_tensor; + queries_tensor.dl_tensor.data = query_data.data(); + queries_tensor.dl_tensor.device.device_type = kDLCUDA; + queries_tensor.dl_tensor.ndim = 2; + queries_tensor.dl_tensor.dtype.code = kDLFloat; + queries_tensor.dl_tensor.dtype.bits = 32; + queries_tensor.dl_tensor.dtype.lanes = 1; + int64_t queries_shape[2] = {n_queries, n_dim}; + queries_tensor.dl_tensor.shape = queries_shape; + queries_tensor.dl_tensor.strides = NULL; + + // create neighbors DLTensor + rmm::device_uvector neighbors_data(n_queries * n_neighbors, stream); + DLManagedTensor neighbors_tensor; + neighbors_tensor.dl_tensor.data = neighbors_data.data(); + neighbors_tensor.dl_tensor.device.device_type = kDLCUDA; + neighbors_tensor.dl_tensor.ndim = 2; + neighbors_tensor.dl_tensor.dtype.code = kDLInt; + neighbors_tensor.dl_tensor.dtype.bits = 64; + neighbors_tensor.dl_tensor.dtype.lanes = 1; + int64_t neighbors_shape[2] = {n_queries, n_neighbors}; + neighbors_tensor.dl_tensor.shape = neighbors_shape; + neighbors_tensor.dl_tensor.strides = NULL; + + // create distances DLTensor + rmm::device_uvector distances_data(n_queries * n_neighbors, stream); + DLManagedTensor distances_tensor; + distances_tensor.dl_tensor.data = distances_data.data(); + distances_tensor.dl_tensor.device.device_type = kDLCUDA; + distances_tensor.dl_tensor.ndim = 2; + distances_tensor.dl_tensor.dtype.code = kDLFloat; + distances_tensor.dl_tensor.dtype.bits = 32; + distances_tensor.dl_tensor.dtype.lanes = 1; + int64_t distances_shape[2] = {n_queries, n_neighbors}; + distances_tensor.dl_tensor.shape = distances_shape; + distances_tensor.dl_tensor.strides = NULL; + + // Create bitmap filter - per query filter + // For each query, remove even indices + auto bitmap_size = n_queries * ((n_rows + 31) / 32); // n_queries x (bits for n_rows) + rmm::device_uvector filter_bitmap(bitmap_size, stream); + std::vector filter_bitmap_h(bitmap_size); + for (size_t q = 0; q < n_queries; ++q) { + for (size_t i = 0; i < (n_rows + 31) / 32; ++i) { + filter_bitmap_h[q * ((n_rows + 31) / 32) + i] = + 0xAAAAAAAA; // 10101010... pattern - removes even indices + } + } + raft::copy(filter_bitmap.data(), filter_bitmap_h.data(), bitmap_size, stream); + + DLManagedTensor filter_tensor; + filter_tensor.dl_tensor.data = filter_bitmap.data(); + filter_tensor.dl_tensor.device.device_type = kDLCUDA; + filter_tensor.dl_tensor.ndim = 1; + filter_tensor.dl_tensor.dtype.code = kDLUInt; + filter_tensor.dl_tensor.dtype.bits = 32; + filter_tensor.dl_tensor.dtype.lanes = 1; + int64_t filter_shape[1] = {bitmap_size}; + filter_tensor.dl_tensor.shape = filter_shape; + filter_tensor.dl_tensor.strides = NULL; + + cuvsFilter filter; + filter.type = BITMAP; + filter.addr = (uintptr_t)&filter_tensor; + + // search index with bitmap filter + cuvsIvfFlatSearchParams_t search_params; + cuvsIvfFlatSearchParamsCreate(&search_params); + search_params->n_probes = n_probes; + cuvsIvfFlatSearch( + res, search_params, index, &queries_tensor, &neighbors_tensor, &distances_tensor, filter); + + // Verify all returned neighbors are odd indices (not filtered out) + std::vector neighbors_h(n_queries * n_neighbors); + raft::copy(neighbors_h.data(), neighbors_data.data(), n_queries * n_neighbors, stream); + raft::resource::sync_stream(handle); + + for (size_t i = 0; i < n_queries * n_neighbors; ++i) { + // All neighbors should be odd indices (since even indices are filtered) + ASSERT_TRUE(neighbors_h[i] % 2 == 1 || neighbors_h[i] == -1) + << "Neighbor at position " << i << " has value " << neighbors_h[i] + << " which is an even index (should be filtered)"; + } + + // de-allocate index and res + cuvsIvfFlatSearchParamsDestroy(search_params); + cuvsIvfFlatIndexParamsDestroy(build_params); + cuvsIvfFlatIndexDestroy(index); + cuvsResourcesDestroy(res); +} diff --git a/c/tests/neighbors/ann_ivf_pq_c.cu b/c/tests/neighbors/ann_ivf_pq_c.cu index 06c2f7f6e1..9bfedf810a 100644 --- a/c/tests/neighbors/ann_ivf_pq_c.cu +++ b/c/tests/neighbors/ann_ivf_pq_c.cu @@ -129,3 +129,274 @@ TEST(IvfPqC, BuildSearch) n_probes, n_lists); } + +TEST(IvfPqC, BuildSearchBitsetFiltered) +{ + int64_t n_rows = 1000; + int64_t n_queries = 10; + int64_t n_dim = 16; + uint32_t n_neighbors = 10; + + raft::handle_t handle; + auto stream = raft::resource::get_cuda_stream(handle); + + cuvsDistanceType metric = L2Expanded; + size_t n_probes = 10; + size_t n_lists = 20; + + // Generate data + rmm::device_uvector index_data(n_rows * n_dim, stream); + rmm::device_uvector query_data(n_queries * n_dim, stream); + generate_random_data(index_data.data(), n_rows * n_dim); + generate_random_data(query_data.data(), n_queries * n_dim); + + // create cuvsResources_t + cuvsResources_t res; + cuvsResourcesCreate(&res); + + // create dataset DLTensor + DLManagedTensor dataset_tensor; + dataset_tensor.dl_tensor.data = index_data.data(); + dataset_tensor.dl_tensor.device.device_type = kDLCUDA; + dataset_tensor.dl_tensor.ndim = 2; + dataset_tensor.dl_tensor.dtype.code = kDLFloat; + dataset_tensor.dl_tensor.dtype.bits = 32; + dataset_tensor.dl_tensor.dtype.lanes = 1; + int64_t dataset_shape[2] = {n_rows, n_dim}; + dataset_tensor.dl_tensor.shape = dataset_shape; + dataset_tensor.dl_tensor.strides = NULL; + + // create index + cuvsIvfPqIndex_t index; + cuvsIvfPqIndexCreate(&index); + + // build index + cuvsIvfPqIndexParams_t build_params; + cuvsIvfPqIndexParamsCreate(&build_params); + build_params->metric = metric; + build_params->n_lists = n_lists; + cuvsIvfPqBuild(res, build_params, &dataset_tensor, index); + + // create queries DLTensor + DLManagedTensor queries_tensor; + queries_tensor.dl_tensor.data = query_data.data(); + queries_tensor.dl_tensor.device.device_type = kDLCUDA; + queries_tensor.dl_tensor.ndim = 2; + queries_tensor.dl_tensor.dtype.code = kDLFloat; + queries_tensor.dl_tensor.dtype.bits = 32; + queries_tensor.dl_tensor.dtype.lanes = 1; + int64_t queries_shape[2] = {n_queries, n_dim}; + queries_tensor.dl_tensor.shape = queries_shape; + queries_tensor.dl_tensor.strides = NULL; + + // create neighbors DLTensor + rmm::device_uvector neighbors_data(n_queries * n_neighbors, stream); + DLManagedTensor neighbors_tensor; + neighbors_tensor.dl_tensor.data = neighbors_data.data(); + neighbors_tensor.dl_tensor.device.device_type = kDLCUDA; + neighbors_tensor.dl_tensor.ndim = 2; + neighbors_tensor.dl_tensor.dtype.code = kDLInt; + neighbors_tensor.dl_tensor.dtype.bits = 64; + neighbors_tensor.dl_tensor.dtype.lanes = 1; + int64_t neighbors_shape[2] = {n_queries, n_neighbors}; + neighbors_tensor.dl_tensor.shape = neighbors_shape; + neighbors_tensor.dl_tensor.strides = NULL; + + // create distances DLTensor + rmm::device_uvector distances_data(n_queries * n_neighbors, stream); + DLManagedTensor distances_tensor; + distances_tensor.dl_tensor.data = distances_data.data(); + distances_tensor.dl_tensor.device.device_type = kDLCUDA; + distances_tensor.dl_tensor.ndim = 2; + distances_tensor.dl_tensor.dtype.code = kDLFloat; + distances_tensor.dl_tensor.dtype.bits = 32; + distances_tensor.dl_tensor.dtype.lanes = 1; + int64_t distances_shape[2] = {n_queries, n_neighbors}; + distances_tensor.dl_tensor.shape = distances_shape; + distances_tensor.dl_tensor.strides = NULL; + + // Create bitset filter - remove every other index + auto bitset_size = (n_rows + 31) / 32; // number of uint32_t needed + rmm::device_uvector filter_bitset(bitset_size, stream); + std::vector filter_bitset_h(bitset_size); + for (size_t i = 0; i < bitset_size; ++i) { + filter_bitset_h[i] = 0xAAAAAAAA; // 10101010... pattern - removes even indices + } + raft::copy(filter_bitset.data(), filter_bitset_h.data(), bitset_size, stream); + + DLManagedTensor filter_tensor; + filter_tensor.dl_tensor.data = filter_bitset.data(); + filter_tensor.dl_tensor.device.device_type = kDLCUDA; + filter_tensor.dl_tensor.ndim = 1; + filter_tensor.dl_tensor.dtype.code = kDLUInt; + filter_tensor.dl_tensor.dtype.bits = 32; + filter_tensor.dl_tensor.dtype.lanes = 1; + int64_t filter_shape[1] = {bitset_size}; + filter_tensor.dl_tensor.shape = filter_shape; + filter_tensor.dl_tensor.strides = NULL; + + cuvsFilter filter; + filter.type = BITSET; + filter.addr = (uintptr_t)&filter_tensor; + + // search index with filter + cuvsIvfPqSearchParams_t search_params; + cuvsIvfPqSearchParamsCreate(&search_params); + search_params->n_probes = n_probes; + cuvsIvfPqSearch(res, search_params, index, &queries_tensor, &neighbors_tensor, &distances_tensor, filter); + + // Verify all returned neighbors are odd indices (not filtered out) + std::vector neighbors_h(n_queries * n_neighbors); + raft::copy(neighbors_h.data(), neighbors_data.data(), n_queries * n_neighbors, stream); + raft::resource::sync_stream(handle); + + for (size_t i = 0; i < n_queries * n_neighbors; ++i) { + // All neighbors should be odd indices (since even indices are filtered) + ASSERT_TRUE(neighbors_h[i] % 2 == 1 || neighbors_h[i] == -1) + << "Neighbor at position " << i << " has value " << neighbors_h[i] + << " which is an even index (should be filtered)"; + } + + // de-allocate index and res + cuvsIvfPqSearchParamsDestroy(search_params); + cuvsIvfPqIndexParamsDestroy(build_params); + cuvsIvfPqIndexDestroy(index); + cuvsResourcesDestroy(res); +} + +TEST(IvfPqC, BuildSearchBitmapFiltered) +{ + int64_t n_rows = 1000; + int64_t n_queries = 10; + int64_t n_dim = 16; + uint32_t n_neighbors = 10; + + raft::handle_t handle; + auto stream = raft::resource::get_cuda_stream(handle); + + cuvsDistanceType metric = L2Expanded; + size_t n_probes = 10; + size_t n_lists = 20; + + // Generate data + rmm::device_uvector index_data(n_rows * n_dim, stream); + rmm::device_uvector query_data(n_queries * n_dim, stream); + generate_random_data(index_data.data(), n_rows * n_dim); + generate_random_data(query_data.data(), n_queries * n_dim); + + // create cuvsResources_t + cuvsResources_t res; + cuvsResourcesCreate(&res); + + // create dataset DLTensor + DLManagedTensor dataset_tensor; + dataset_tensor.dl_tensor.data = index_data.data(); + dataset_tensor.dl_tensor.device.device_type = kDLCUDA; + dataset_tensor.dl_tensor.ndim = 2; + dataset_tensor.dl_tensor.dtype.code = kDLFloat; + dataset_tensor.dl_tensor.dtype.bits = 32; + dataset_tensor.dl_tensor.dtype.lanes = 1; + int64_t dataset_shape[2] = {n_rows, n_dim}; + dataset_tensor.dl_tensor.shape = dataset_shape; + dataset_tensor.dl_tensor.strides = NULL; + + // create index + cuvsIvfPqIndex_t index; + cuvsIvfPqIndexCreate(&index); + + // build index + cuvsIvfPqIndexParams_t build_params; + cuvsIvfPqIndexParamsCreate(&build_params); + build_params->metric = metric; + build_params->n_lists = n_lists; + cuvsIvfPqBuild(res, build_params, &dataset_tensor, index); + + // create queries DLTensor + DLManagedTensor queries_tensor; + queries_tensor.dl_tensor.data = query_data.data(); + queries_tensor.dl_tensor.device.device_type = kDLCUDA; + queries_tensor.dl_tensor.ndim = 2; + queries_tensor.dl_tensor.dtype.code = kDLFloat; + queries_tensor.dl_tensor.dtype.bits = 32; + queries_tensor.dl_tensor.dtype.lanes = 1; + int64_t queries_shape[2] = {n_queries, n_dim}; + queries_tensor.dl_tensor.shape = queries_shape; + queries_tensor.dl_tensor.strides = NULL; + + // create neighbors DLTensor + rmm::device_uvector neighbors_data(n_queries * n_neighbors, stream); + DLManagedTensor neighbors_tensor; + neighbors_tensor.dl_tensor.data = neighbors_data.data(); + neighbors_tensor.dl_tensor.device.device_type = kDLCUDA; + neighbors_tensor.dl_tensor.ndim = 2; + neighbors_tensor.dl_tensor.dtype.code = kDLInt; + neighbors_tensor.dl_tensor.dtype.bits = 64; + neighbors_tensor.dl_tensor.dtype.lanes = 1; + int64_t neighbors_shape[2] = {n_queries, n_neighbors}; + neighbors_tensor.dl_tensor.shape = neighbors_shape; + neighbors_tensor.dl_tensor.strides = NULL; + + // create distances DLTensor + rmm::device_uvector distances_data(n_queries * n_neighbors, stream); + DLManagedTensor distances_tensor; + distances_tensor.dl_tensor.data = distances_data.data(); + distances_tensor.dl_tensor.device.device_type = kDLCUDA; + distances_tensor.dl_tensor.ndim = 2; + distances_tensor.dl_tensor.dtype.code = kDLFloat; + distances_tensor.dl_tensor.dtype.bits = 32; + distances_tensor.dl_tensor.dtype.lanes = 1; + int64_t distances_shape[2] = {n_queries, n_neighbors}; + distances_tensor.dl_tensor.shape = distances_shape; + distances_tensor.dl_tensor.strides = NULL; + + // Create bitmap filter - per query filter + // For each query, remove even indices + auto bitmap_size = n_queries * ((n_rows + 31) / 32); // n_queries x (bits for n_rows) + rmm::device_uvector filter_bitmap(bitmap_size, stream); + std::vector filter_bitmap_h(bitmap_size); + for (size_t q = 0; q < n_queries; ++q) { + for (size_t i = 0; i < (n_rows + 31) / 32; ++i) { + filter_bitmap_h[q * ((n_rows + 31) / 32) + i] = 0xAAAAAAAA; // 10101010... pattern - removes even indices + } + } + raft::copy(filter_bitmap.data(), filter_bitmap_h.data(), bitmap_size, stream); + + DLManagedTensor filter_tensor; + filter_tensor.dl_tensor.data = filter_bitmap.data(); + filter_tensor.dl_tensor.device.device_type = kDLCUDA; + filter_tensor.dl_tensor.ndim = 1; + filter_tensor.dl_tensor.dtype.code = kDLUInt; + filter_tensor.dl_tensor.dtype.bits = 32; + filter_tensor.dl_tensor.dtype.lanes = 1; + int64_t filter_shape[1] = {bitmap_size}; + filter_tensor.dl_tensor.shape = filter_shape; + filter_tensor.dl_tensor.strides = NULL; + + cuvsFilter filter; + filter.type = BITMAP; + filter.addr = (uintptr_t)&filter_tensor; + + // search index with filter + cuvsIvfPqSearchParams_t search_params; + cuvsIvfPqSearchParamsCreate(&search_params); + search_params->n_probes = n_probes; + cuvsIvfPqSearch(res, search_params, index, &queries_tensor, &neighbors_tensor, &distances_tensor, filter); + + // Verify all returned neighbors are odd indices (not filtered out) + std::vector neighbors_h(n_queries * n_neighbors); + raft::copy(neighbors_h.data(), neighbors_data.data(), n_queries * n_neighbors, stream); + raft::resource::sync_stream(handle); + + for (size_t i = 0; i < n_queries * n_neighbors; ++i) { + // All neighbors should be odd indices (since even indices are filtered) + ASSERT_TRUE(neighbors_h[i] % 2 == 1 || neighbors_h[i] == -1) + << "Neighbor at position " << i << " has value " << neighbors_h[i] + << " which is an even index (should be filtered)"; + } + + // de-allocate index and res + cuvsIvfPqSearchParamsDestroy(search_params); + cuvsIvfPqIndexParamsDestroy(build_params); + cuvsIvfPqIndexDestroy(index); + cuvsResourcesDestroy(res); +} diff --git a/c/tests/neighbors/run_ivf_pq_c.c b/c/tests/neighbors/run_ivf_pq_c.c index 64154fb3c0..1006f1e8a9 100644 --- a/c/tests/neighbors/run_ivf_pq_c.c +++ b/c/tests/neighbors/run_ivf_pq_c.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -80,11 +80,15 @@ void run_ivf_pq(int64_t n_rows, distances_tensor.dl_tensor.shape = distances_shape; distances_tensor.dl_tensor.strides = NULL; + cuvsFilter filter; + filter.type = NO_FILTER; + filter.addr = (uintptr_t)NULL; + // search index cuvsIvfPqSearchParams_t search_params; cuvsIvfPqSearchParamsCreate(&search_params); search_params->n_probes = n_probes; - cuvsIvfPqSearch(res, search_params, index, &queries_tensor, &neighbors_tensor, &distances_tensor); + cuvsIvfPqSearch(res, search_params, index, &queries_tensor, &neighbors_tensor, &distances_tensor, filter); // de-allocate index and res cuvsIvfPqSearchParamsDestroy(search_params); diff --git a/cpp/include/cuvs/cluster/agglomerative.hpp b/cpp/include/cuvs/cluster/agglomerative.hpp index 050f0cecdc..b0a65b4369 100644 --- a/cpp/include/cuvs/cluster/agglomerative.hpp +++ b/cpp/include/cuvs/cluster/agglomerative.hpp @@ -129,7 +129,7 @@ struct distance_params { /** Specialized parameters to build the Mutual Reachability graph */ struct mutual_reachability_params { /** this neighborhood will be selected for core distances. */ - int min_samples; + int min_samples = 5; /** weight applied when internal distance is chosen for mutual reachability (value of 1.0 disables * the weighting) */ diff --git a/cpp/include/cuvs/distance/distance.hpp b/cpp/include/cuvs/distance/distance.hpp index 13c8c7bd7e..7eb4527878 100644 --- a/cpp/include/cuvs/distance/distance.hpp +++ b/cpp/include/cuvs/distance/distance.hpp @@ -93,10 +93,10 @@ enum KernelType { LINEAR, POLYNOMIAL, RBF, TANH }; */ struct KernelParams { // Kernel function parameters - KernelType kernel; //!< Type of the kernel function - int degree; //!< Degree of polynomial kernel (ignored by others) - double gamma; //!< multiplier in the - double coef0; //!< additive constant in poly and tanh kernels + KernelType kernel = KernelType::LINEAR; //!< Type of the kernel function + int degree = 3; //!< Degree of polynomial kernel (ignored by others) + double gamma = 1.0; //!< multiplier in the + double coef0 = 0.0; //!< additive constant in poly and tanh kernels }; } // end namespace kernels diff --git a/cpp/include/cuvs/distance/grammian.hpp b/cpp/include/cuvs/distance/grammian.hpp index 4cb3aa47b6..a97316f9ce 100644 --- a/cpp/include/cuvs/distance/grammian.hpp +++ b/cpp/include/cuvs/distance/grammian.hpp @@ -36,15 +36,17 @@ using csr_input_matrix_view_t = raft::device_csr_matrix_view class GramMatrixBase { protected: - cublasHandle_t cublas_handle; + cublasHandle_t cublas_handle = nullptr; bool legacy_interface; public: - GramMatrixBase() : legacy_interface(false) {}; + GramMatrixBase() : legacy_interface(false) {} [[deprecated]] GramMatrixBase(cublasHandle_t cublas_handle) - : cublas_handle(cublas_handle), legacy_interface(true) {}; + : cublas_handle(cublas_handle), legacy_interface(true) + { + } - virtual ~GramMatrixBase() {}; + virtual ~GramMatrixBase() = default; /** Convenience function to evaluate the Gram matrix for two vector sets. * Vector sets are provided in Matrix format @@ -320,10 +322,14 @@ class PolynomialKernel : public GramMatrixBase { * @param offset */ PolynomialKernel(exp_t exponent, math_t gain, math_t offset) - : GramMatrixBase(), exponent(exponent), gain(gain), offset(offset) {}; + : GramMatrixBase(), exponent(exponent), gain(gain), offset(offset) + { + } [[deprecated]] PolynomialKernel(exp_t exponent, math_t gain, math_t offset, cublasHandle_t handle) - : GramMatrixBase(handle), exponent(exponent), gain(gain), offset(offset) {}; + : GramMatrixBase(handle), exponent(exponent), gain(gain), offset(offset) + { + } /** Evaluate kernel matrix using polynomial kernel. * @@ -436,7 +442,9 @@ class TanhKernel : public GramMatrixBase { TanhKernel(math_t gain, math_t offset) : GramMatrixBase(), gain(gain), offset(offset) {} [[deprecated]] TanhKernel(math_t gain, math_t offset, cublasHandle_t handle) - : GramMatrixBase(handle), gain(gain), offset(offset) {}; + : GramMatrixBase(handle), gain(gain), offset(offset) + { + } /** Evaluate kernel matrix using tanh kernel. * @@ -551,10 +559,12 @@ class RBFKernel : public GramMatrixBase { * @tparam math_t floating point type * @param gain */ - RBFKernel(math_t gain) : GramMatrixBase(), gain(gain) {}; + RBFKernel(math_t gain) : GramMatrixBase(), gain(gain) {} [[deprecated]] RBFKernel(math_t gain, cublasHandle_t handle) - : GramMatrixBase(handle), gain(gain) {}; + : GramMatrixBase(handle), gain(gain) + { + } void matrixRowNormL2(raft::resources const& handle, dense_input_matrix_view_t matrix, diff --git a/cpp/include/cuvs/neighbors/ball_cover.hpp b/cpp/include/cuvs/neighbors/ball_cover.hpp index 17fa94bfd7..77a1bc1d7c 100644 --- a/cpp/include/cuvs/neighbors/ball_cover.hpp +++ b/cpp/include/cuvs/neighbors/ball_cover.hpp @@ -135,7 +135,7 @@ struct index : cuvs::neighbors::index { raft::device_matrix X_reordered; protected: - bool index_trained; + bool index_trained = false; }; /** @} */ diff --git a/cpp/include/cuvs/neighbors/common.hpp b/cpp/include/cuvs/neighbors/common.hpp index 3909098398..d9cefea4d9 100644 --- a/cpp/include/cuvs/neighbors/common.hpp +++ b/cpp/include/cuvs/neighbors/common.hpp @@ -498,7 +498,7 @@ namespace filtering { enum class FilterType { None, Bitmap, Bitset }; struct base_filter { - ~base_filter() = default; + virtual ~base_filter() = default; virtual FilterType get_filter_type() const = 0; }; @@ -972,7 +972,7 @@ struct mg_index { auto operator=(mg_index&&) -> mg_index& = default; distribution_mode mode_; - int num_ranks_; + int num_ranks_ = 0; std::vector> ann_interfaces_; // for load balancing mechanism diff --git a/cpp/include/cuvs/neighbors/hnsw.hpp b/cpp/include/cuvs/neighbors/hnsw.hpp index 7ee91f18ba..26137d6669 100644 --- a/cpp/include/cuvs/neighbors/hnsw.hpp +++ b/cpp/include/cuvs/neighbors/hnsw.hpp @@ -738,9 +738,9 @@ void extend(raft::resources const& res, */ struct search_params : cuvs::neighbors::search_params { - int ef; // size of the candidate list - int num_threads = 0; // number of host threads to use for concurrent searches. Value of 0 - // automatically maximizes parallelism + int ef = 200; // size of the candidate list + int num_threads = 0; // number of host threads to use for concurrent searches. Value of 0 + // automatically maximizes parallelism }; /** diff --git a/cpp/include/cuvs/neighbors/vamana.hpp b/cpp/include/cuvs/neighbors/vamana.hpp index c3ba86d5b6..1c34c1f801 100644 --- a/cpp/include/cuvs/neighbors/vamana.hpp +++ b/cpp/include/cuvs/neighbors/vamana.hpp @@ -33,8 +33,8 @@ namespace cuvs::neighbors::vamana { */ template struct codebook_params { - int pq_codebook_size; - int pq_dim; + int pq_codebook_size = 0; + int pq_dim = 0; std::vector pq_encoding_table; std::vector rotation_matrix; }; diff --git a/cpp/include/cuvs/preprocessing/spectral_embedding.hpp b/cpp/include/cuvs/preprocessing/spectral_embedding.hpp index e7a578d2ab..5575b1bb38 100644 --- a/cpp/include/cuvs/preprocessing/spectral_embedding.hpp +++ b/cpp/include/cuvs/preprocessing/spectral_embedding.hpp @@ -24,10 +24,10 @@ namespace cuvs::preprocessing::spectral_embedding { */ struct params { /** @brief The number of components to reduce the data to. */ - int n_components; + int n_components = 2; /** @brief The number of neighbors to use for the nearest neighbors graph. */ - int n_neighbors; + int n_neighbors = 15; /** * @brief Whether to normalize the Laplacian matrix. @@ -36,7 +36,7 @@ struct params { * If false, uses the unnormalized graph Laplacian (L = D - W). * Normalized Laplacian often leads to better results for clustering tasks. */ - bool norm_laplacian; + bool norm_laplacian = true; /** * @brief Whether to drop the first eigenvector. @@ -45,7 +45,7 @@ struct params { * uninformative. Setting this to true drops it from the embedding. * This is typically set to true when norm_laplacian is true. */ - bool drop_first; + bool drop_first = true; /** * @brief Tolerance for the eigenvalue solver. diff --git a/cpp/src/neighbors/detail/ann_utils.cuh b/cpp/src/neighbors/detail/ann_utils.cuh index 82bd6e755a..e3c7e120da 100644 --- a/cpp/src/neighbors/detail/ann_utils.cuh +++ b/cpp/src/neighbors/detail/ann_utils.cuh @@ -136,7 +136,7 @@ struct config { }; template <> struct config { - using value_t = half; + using value_t = float; static constexpr double kDivisor = 1.0; }; template <> diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 9d3198eb74..415ebc635c 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -2080,7 +2080,7 @@ auto iterative_build_graph( curr_itopk_size = curr_topk + 32; } - RAFT_LOG_INFO( + RAFT_LOG_DEBUG( "# graph_size = %lu (%.3lf), graph_degree = %lu, query_size = %lu, itopk = %lu, topk = %lu", (uint64_t)cagra_graph.extent(0), (double)cagra_graph.extent(0) / final_graph_size, @@ -2146,7 +2146,7 @@ auto iterative_build_graph( auto end = std::chrono::high_resolution_clock::now(); auto elapsed_ms = std::chrono::duration_cast(end - start).count(); - RAFT_LOG_INFO("# elapsed time: %.3lf sec", (double)elapsed_ms / 1000); + RAFT_LOG_DEBUG("# elapsed time: %.3lf sec", (double)elapsed_ms / 1000); if (flag_last) { break; } flag_last = (curr_graph_size == final_graph_size); diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp index f9974fa3df..22b325fca5 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -235,6 +235,7 @@ struct dataset_descriptor_host { std::mutex mutex; std::atomic ready; // Not sure if std::holds_alternative is thread-safe std::variant value; + cudaEvent_t init_event{nullptr}; template state(InitF init, size_t size) : ready{false}, value{std::make_tuple(init, size)} @@ -247,6 +248,7 @@ struct dataset_descriptor_host { auto& [ptr, stream] = std::get(value); RAFT_CUDA_TRY_NO_THROW(cudaFreeAsync(ptr, stream)); } + if (init_event != nullptr) { RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(init_event)); } } void eval(rmm::cuda_stream_view stream) @@ -255,8 +257,12 @@ struct dataset_descriptor_host { if (std::holds_alternative(value)) { auto& [fun, size] = std::get(value); dev_descriptor_t* ptr = nullptr; + RAFT_CUDA_TRY(cudaEventCreateWithFlags(&init_event, cudaEventDisableTiming)); RAFT_CUDA_TRY(cudaMallocAsync(&ptr, size, stream)); fun(ptr, stream); + // Record an event after initialization so that other streams can establish + // a GPU-side dependency without expensive host synchronization. + RAFT_CUDA_TRY(cudaEventRecord(init_event, stream)); value = std::make_tuple(ptr, stream); ready.store(true, std::memory_order_release); } @@ -265,6 +271,11 @@ struct dataset_descriptor_host { auto get(rmm::cuda_stream_view stream) -> dev_descriptor_t* { if (!ready.load(std::memory_order_acquire)) { eval(stream); } + // Make the caller's stream wait for the init to complete. This is a + // lightweight GPU-side dependency with no host blocking. On the same + // stream that performed the init (or after the event has already + // completed) this is essentially a no-op. + if (init_event != nullptr) { RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, init_event)); } return std::get<0>(std::get(value)); } }; diff --git a/cpp/src/neighbors/detail/cagra/search_plan.cuh b/cpp/src/neighbors/detail/cagra/search_plan.cuh index 9cc6aeb353..b67044ce36 100644 --- a/cpp/src/neighbors/detail/cagra/search_plan.cuh +++ b/cpp/src/neighbors/detail/cagra/search_plan.cuh @@ -110,7 +110,13 @@ struct search_plan_impl_base : public search_params { { if (algo == search_algo::AUTO) { const size_t num_sm = raft::getMultiProcessorCount(); - if (itopk_size <= 512 && search_params::max_queries >= num_sm * 2lu) { + // SINGLE_CTA computes max_iterations as itopk_size / search_width. + // When search_width > 1, this results in significantly fewer iterations + // compared to MULTI_CTA (which uses its own internal mc_search_width=1). + // This causes recall degradation when AUTO switches to SINGLE_CTA at + // large batch sizes. Only select SINGLE_CTA when search_width <= 1 to + // ensure consistent recall across batch sizes. (See #1187) + if (itopk_size <= 512 && search_width <= 1 && search_params::max_queries >= num_sm * 2lu) { algo = search_algo::SINGLE_CTA; RAFT_LOG_DEBUG("Auto strategy: selecting single-cta"); } else { diff --git a/cpp/src/neighbors/detail/smem_utils.cuh b/cpp/src/neighbors/detail/smem_utils.cuh index 41c95c0ccd..ebc0e94495 100644 --- a/cpp/src/neighbors/detail/smem_utils.cuh +++ b/cpp/src/neighbors/detail/smem_utils.cuh @@ -5,7 +5,6 @@ #pragma once #include - #include #include #include @@ -14,14 +13,19 @@ namespace cuvs::neighbors::detail { /** * @brief (Thread-)Safely invoke a kernel with a maximum dynamic shared memory size. - * This is required because the sequence `cudaFuncSetAttribute` + kernel launch is not executed - * atomically. * - * Used this way, the cudaFuncAttributeMaxDynamicSharedMemorySize can only grow and thus - * guarantees that the kernel is safe to launch. + * Maintains a monotonically growing high-water mark for + * `cudaFuncAttributeMaxDynamicSharedMemorySize`. When the kernel function pointer changes, the new + * kernel is brought up to the current high-water mark; when smem_size exceeds the high-water mark, + * it is grown for the current kernel. This guarantees every kernel's attribute is always >= + * smem_size at the time of launch. + * + * NB: cudaFuncSetAttribute is per kernel function pointer value, not per type. Multiple kernel + * template instantiations may share the same KernelT type (e.g. function pointers with the same + * signature), so we track the kernel identity alongside the smem high-water mark. * * @tparam KernelT The type of the kernel. - * @tparam InvocationT The type of the invocation function. + * @tparam KernelLauncherT The type of the launch function/lambda. * @param kernel The kernel function address (for whom the smem-size is specified). * @param smem_size The size of the dynamic shared memory to be set. * @param launch The kernel launch function/lambda. @@ -31,31 +35,41 @@ void safely_launch_kernel_with_smem_size(KernelT const& kernel, uint32_t smem_size, KernelLauncherT const& launch) { - // the last smem size is parameterized by the kernel thanks to the template parameter. - static std::atomic current_smem_size{0}; - auto last_smem_size = current_smem_size.load(std::memory_order_relaxed); - if (smem_size > last_smem_size) { - // We still need a mutex for the critical section: actualize last_smem_size and set the - // attribute. - static auto mutex = std::mutex{}; - auto guard = std::lock_guard{mutex}; - if (!current_smem_size.compare_exchange_strong( - last_smem_size, smem_size, std::memory_order_relaxed, std::memory_order_relaxed)) { - // The value has been updated by another thread between the load and the mutex acquisition. - if (smem_size > last_smem_size) { - current_smem_size.store(smem_size, std::memory_order_relaxed); - } - } - // Only update if the last seen value is smaller than the new one. - if (smem_size > last_smem_size) { - auto launch_status = - cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); - RAFT_EXPECTS(launch_status == cudaSuccess, - "Failed to set max dynamic shared memory size to %u bytes", - smem_size); - } + // last_smem_size is a monotonically growing high-water mark across all kernel pointers. + // last_kernel tracks which kernel pointer was last used. + static std::atomic last_smem_size{0}; + static std::atomic last_kernel{KernelT{}}; + static std::mutex mutex; + bool updated_needed = false; + // When the kernel function pointer changes, bring the new kernel up to the global high-water + // mark. This is necessary because cudaFuncSetAttribute applies to a specific function pointer, + // not to the pointer type — different template instantiations may share the same KernelT. + if (kernel != last_kernel.load(std::memory_order_relaxed)) + { + last_kernel.store(kernel, std::memory_order_relaxed); + updated_needed = true; + } + // Since we first read the kernel pointer, and the shem_size can only grow, + // reading an inconsistent state is safe. At worst we will use a larger smem_size + uint32_t cur_smem_size = last_smem_size.load(std::memory_order_relaxed); + if (smem_size > cur_smem_size) + { + last_smem_size.store(smem_size, std::memory_order_relaxed); + cur_smem_size = smem_size; + updated_needed = true; + } + // Mutex-protected cudaFuncSetAttribute + if (updated_needed) + { + std::lock_guard guard(mutex); + auto launch_status = + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, cur_smem_size); + RAFT_EXPECTS(launch_status == cudaSuccess, + "Failed to set max dynamic shared memory size to %u bytes", + cur_smem_size); } - // We don't need to guard the kernel launch because the smem_size can only grow. + // The kernel launch is outside the lock: any concurrent cudaFuncSetAttribute can only increase + // the limit, so the launch is always safe. return launch(kernel); } diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan.cuh b/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan.cuh index 4c0bb3644a..4e6c96f68c 100644 --- a/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan.cuh +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -122,8 +122,8 @@ struct loadAndComputeDist { for (int k = 0; k < Veclen; ++k) { compute_dist(dist, queryRegs[k], encV[k]); if constexpr (ComputeNorm) { - norm_query += queryRegs[k] * queryRegs[k]; - norm_data += encV[k] * encV[k]; + norm_query += (AccT)(queryRegs[k] * queryRegs[k]); + norm_data += (AccT)(encV[k] * encV[k]); } } } @@ -157,8 +157,8 @@ struct loadAndComputeDist { T q = raft::shfl(queryReg, d + k, raft::WarpSize); compute_dist(dist, q, encV[k]); if constexpr (ComputeNorm) { - norm_query += q * q; - norm_data += encV[k] * encV[k]; + norm_query += (AccT)(q * q); + norm_data += (AccT)(encV[k] * encV[k]); } } } @@ -183,8 +183,8 @@ struct loadAndComputeDist { T q = raft::shfl(queryReg, d + k, raft::WarpSize); compute_dist(dist, q, enc[k]); if constexpr (ComputeNorm) { - norm_query += q * q; - norm_data += enc[k] * enc[k]; + norm_query += (AccT)(q * q); + norm_data += (AccT)(enc[k] * enc[k]); } } } @@ -875,7 +875,10 @@ RAFT_KERNEL __launch_bounds__(kThreadsPerBlock) uint32_t sample_offset = 0; if (probe_id > 0) { sample_offset = chunk_indices[probe_id - 1]; } assert(list_length == chunk_indices[probe_id] - sample_offset); - assert(sample_offset + list_length <= max_samples); + if constexpr (!kManageLocalTopK) { + // max_samples is zero/unused in the kManageLocalTopK mode + assert(sample_offset + list_length <= max_samples); + } constexpr int kUnroll = raft::WarpSize / Veclen; constexpr uint32_t kNumWarps = kThreadsPerBlock / raft::WarpSize; diff --git a/cpp/src/neighbors/ivf_flat/jit_lto_kernels/ivf_flat_interleaved_scan_kernel.cuh b/cpp/src/neighbors/ivf_flat/jit_lto_kernels/ivf_flat_interleaved_scan_kernel.cuh index bf86fbb118..d48a6578fd 100644 --- a/cpp/src/neighbors/ivf_flat/jit_lto_kernels/ivf_flat_interleaved_scan_kernel.cuh +++ b/cpp/src/neighbors/ivf_flat/jit_lto_kernels/ivf_flat_interleaved_scan_kernel.cuh @@ -833,7 +833,10 @@ __device__ __forceinline__ void interleaved_scan_kernel_impl(const uint32_t quer uint32_t sample_offset = 0; if (probe_id > 0) { sample_offset = chunk_indices[probe_id - 1]; } assert(list_length == chunk_indices[probe_id] - sample_offset); - assert(sample_offset + list_length <= max_samples); + if constexpr (!kManageLocalTopK) { + // max_samples is zero/unused in the kManageLocalTopK mode + assert(sample_offset + list_length <= max_samples); + } constexpr int kUnroll = raft::WarpSize / Veclen; constexpr uint32_t kNumWarps = kThreadsPerBlock / raft::WarpSize; diff --git a/examples/c/src/ivf_pq_c_example.c b/examples/c/src/ivf_pq_c_example.c index 9ec8221431..3620543015 100644 --- a/examples/c/src/ivf_pq_c_example.c +++ b/examples/c/src/ivf_pq_c_example.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -57,9 +57,14 @@ void ivf_pq_build_search(cuvsResources_t* res, search_params->internal_distance_dtype = CUDA_R_16F; search_params->lut_dtype = CUDA_R_16F; + // Create filter (no filtering) + cuvsFilter filter; + filter.type = NO_FILTER; + filter.addr = (uintptr_t)NULL; + // Search the `index` built using `cuvsIvfPqBuild` CHECK_CUVS(cuvsIvfPqSearch( - *res, search_params, index, queries_tensor, &neighbors_tensor, &distances_tensor)); + *res, search_params, index, queries_tensor, &neighbors_tensor, &distances_tensor, filter)); int64_t* neighbors = (int64_t*)malloc(n_queries * topk * sizeof(int64_t)); float* distances = (float*)malloc(n_queries * topk * sizeof(float)); diff --git a/go/ivf_pq/ivf_pq.go b/go/ivf_pq/ivf_pq.go index cbbec629d1..c6bf5852b3 100644 --- a/go/ivf_pq/ivf_pq.go +++ b/go/ivf_pq/ivf_pq.go @@ -68,6 +68,10 @@ func SearchIndex[T any](Resources cuvs.Resource, params *SearchParams, index *Iv if !index.trained { return errors.New("index needs to be built before calling search") } + prefilter := C.cuvsFilter{ + addr: 0, + _type: C.NO_FILTER, + } - return cuvs.CheckCuvs(cuvs.CuvsError(C.cuvsIvfPqSearch(C.cuvsResources_t(Resources.Resource), params.params, index.index, (*C.DLManagedTensor)(unsafe.Pointer(queries.C_tensor)), (*C.DLManagedTensor)(unsafe.Pointer(neighbors.C_tensor)), (*C.DLManagedTensor)(unsafe.Pointer(distances.C_tensor))))) + return cuvs.CheckCuvs(cuvs.CuvsError(C.cuvsIvfPqSearch(C.cuvsResources_t(Resources.Resource), params.params, index.index, (*C.DLManagedTensor)(unsafe.Pointer(queries.C_tensor)), (*C.DLManagedTensor)(unsafe.Pointer(neighbors.C_tensor)), (*C.DLManagedTensor)(unsafe.Pointer(distances.C_tensor)), prefilter))) } diff --git a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pxd b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pxd index 24bfcc62d7..caaf660460 100644 --- a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pxd +++ b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pxd @@ -10,6 +10,7 @@ from libcpp cimport bool from cuvs.common.c_api cimport cuvsError_t, cuvsResources_t from cuvs.common.cydlpack cimport DLDataType, DLManagedTensor from cuvs.distance_type cimport cuvsDistanceType +from cuvs.neighbors.filters.filters cimport cuvsFilter cdef extern from "library_types.h": @@ -141,7 +142,8 @@ cdef extern from "cuvs/neighbors/ivf_pq.h" nogil: cuvsIvfPqIndex_t index, DLManagedTensor* queries, DLManagedTensor* neighbors, - DLManagedTensor* distances) + DLManagedTensor* distances, + cuvsFilter filter) cuvsError_t cuvsIvfPqSerialize(cuvsResources_t res, const char * filename, diff --git a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx index 3bc96b44eb..509df6801e 100644 --- a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx +++ b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx @@ -35,6 +35,7 @@ from libc.stdint cimport ( ) from cuvs.common.exceptions import check_cuvs +from cuvs.neighbors.filters import no_filter cdef class IndexParams: @@ -748,7 +749,8 @@ def search(SearchParams search_params, k, neighbors=None, distances=None, - resources=None): + resources=None, + filter=None): """ Find the k nearest neighbors for each query. @@ -769,6 +771,7 @@ def search(SearchParams search_params, (n_queries, k) If supplied, the distances to the neighbors will be written here in-place. (default None) {resources_docstring} + filter : Optional cuvs.neighbors.filters.Filter for prefiltering Examples -------- @@ -826,6 +829,9 @@ def search(SearchParams search_params, cydlpack.dlpack_c(distances_cai) cdef cuvsResources_t res = resources.get_c_obj() + if filter is None: + filter = no_filter() + with cuda_interruptible(): check_cuvs(cuvsIvfPqSearch( res, @@ -833,7 +839,8 @@ def search(SearchParams search_params, index.index, queries_dlpack, neighbors_dlpack, - distances_dlpack + distances_dlpack, + filter.prefilter )) return (distances, neighbors) diff --git a/python/cuvs/cuvs/tests/test_cagra_batch_recall.py b/python/cuvs/cuvs/tests/test_cagra_batch_recall.py new file mode 100644 index 0000000000..08755d73e0 --- /dev/null +++ b/python/cuvs/cuvs/tests/test_cagra_batch_recall.py @@ -0,0 +1,202 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# +# Regression tests for CAGRA search recall consistency across query batch sizes. +# See https://github.com/rapidsai/cuvs/issues/1187 +# +# When the AUTO algorithm selector switches from MULTI_CTA to SINGLE_CTA +# at larger batch sizes, SINGLE_CTA computes max_iterations = itopk_size / +# search_width. With search_width > 1, this gives far fewer iterations than +# MULTI_CTA (which uses internal mc_search_width=1), causing a recall cliff. +# The fix prevents SINGLE_CTA selection when search_width > 1. + +import numpy as np +import pytest +from pylibraft.common import device_ndarray + +from cuvs.neighbors import cagra + + +def calc_recall(found_indices, ground_truth, k): + """Calculate recall@k between found indices and ground truth.""" + n_queries = found_indices.shape[0] + correct = 0 + total = n_queries * k + for i in range(n_queries): + correct += len( + set(found_indices[i, :k].tolist()) + & set(ground_truth[i, :k].tolist()) + ) + return correct / total + + +def search_in_batches(search_params, index, queries, k, batch_size): + """Search the index in fixed-size batches, return host neighbors array.""" + n_queries = queries.shape[0] + all_neighbors = np.zeros((n_queries, k), dtype=np.uint32) + for start in range(0, n_queries, batch_size): + end = min(start + batch_size, n_queries) + q = device_ndarray(queries[start:end]) + _, neighbors = cagra.search(search_params, index, q, k=k) + neighbors_host = neighbors.copy_to_host() + all_neighbors[start:end] = neighbors_host + return all_neighbors + + +@pytest.fixture(scope="module") +def cagra_test_data(): + """Build CAGRA index and brute-force ground truth once for all tests.""" + np.random.seed(42) + n_samples = 50000 + n_queries = 512 + dim = 64 + k = 10 + + dataset = np.random.randn(n_samples, dim).astype(np.float32) + queries = np.random.randn(n_queries, dim).astype(np.float32) + dataset_device = device_ndarray(dataset) + + build_params = cagra.IndexParams( + graph_degree=32, intermediate_graph_degree=64 + ) + index = cagra.build(build_params, dataset_device) + + # Brute-force ground truth via NumPy (L2 distance), chunked to avoid OOM. + # Each chunk creates a (chunk_size x n_samples x dim) array. + # chunk_size=16 => 16 * 50000 * 64 * 4 bytes = ~200MB per chunk. + gt_neighbors = np.zeros((n_queries, k), dtype=np.uint32) + chunk_size = 16 + for start in range(0, n_queries, chunk_size): + end = min(start + chunk_size, n_queries) + q_chunk = queries[start:end] + dists = np.sum( + (q_chunk[:, None, :] - dataset[None, :, :]) ** 2, axis=2 + ) + gt_neighbors[start:end] = np.argsort(dists, axis=1)[:, :k].astype( + np.uint32 + ) + del dists + + return { + "index": index, + "dataset_device": dataset_device, # keep alive for index search + "queries": queries, + "gt_neighbors": gt_neighbors, + "n_queries": n_queries, + "k": k, + } + + +@pytest.mark.parametrize("search_width", [1, 4, 8]) +def test_cagra_batch_recall_consistency(cagra_test_data, search_width): + """ + Recall must be consistent across batch sizes regardless of search_width. + + The AUTO selector may switch algorithms at larger batch sizes. This test + verifies that the switch does not cause a recall cliff. Without the fix, + search_width=8 drops from ~0.87 recall (MULTI_CTA) to ~0.65 (SINGLE_CTA) + at batch_size >= 256 (on machines with >= 128 SMs, fewer for smaller GPUs). + """ + data = cagra_test_data + search_params = cagra.SearchParams( + itopk_size=64, search_width=search_width + ) + + batch_sizes = [32, 64, 128, 256, 512] + recalls = [] + + for batch_size in batch_sizes: + neighbors = search_in_batches( + search_params, + data["index"], + data["queries"], + data["k"], + batch_size, + ) + recall = calc_recall(neighbors, data["gt_neighbors"], data["k"]) + recalls.append(recall) + + recall_std = np.std(recalls) + recall_range = max(recalls) - min(recalls) + + assert recall_std < 0.02, ( + f"search_width={search_width}: recall varies too much across batch " + f"sizes (std={recall_std:.4f}). " + + ", ".join(f"bs={bs}:{r:.4f}" for bs, r in zip(batch_sizes, recalls)) + ) + assert recall_range < 0.05, ( + f"search_width={search_width}: recall range too wide " + f"({recall_range:.4f}). " + + ", ".join(f"bs={bs}:{r:.4f}" for bs, r in zip(batch_sizes, recalls)) + ) + + +@pytest.mark.parametrize( + "search_width,min_expected_recall", + [(1, 0.4), (4, 0.6), (8, 0.7)], +) +def test_cagra_search_width_recall_quality( + cagra_test_data, search_width, min_expected_recall +): + """ + Higher search_width must yield higher recall, not lower. + + SINGLE_CTA with search_width > 1 previously got fewer iterations, + producing LOWER recall with HIGHER search_width — the opposite of + expected behavior. This test catches that inversion. + """ + data = cagra_test_data + search_params = cagra.SearchParams( + itopk_size=64, search_width=search_width + ) + + # Use batch_size=512 to trigger AUTO algorithm selection + neighbors = search_in_batches( + search_params, + data["index"], + data["queries"], + data["k"], + batch_size=512, + ) + recall = calc_recall(neighbors, data["gt_neighbors"], data["k"]) + + assert recall >= min_expected_recall, ( + f"search_width={search_width}: recall={recall:.4f} is below " + f"minimum expected {min_expected_recall:.4f} at batch_size=512. " + f"This suggests SINGLE_CTA may be selected with too few iterations." + ) + + +def test_cagra_search_width_monotonicity(cagra_test_data): + """ + Recall must increase (or stay flat) as search_width increases. + + Wider search explores more neighbors per iteration. If recall decreases + with higher search_width, the algorithm is getting fewer effective + iterations — the exact bug this fix addresses. + """ + data = cagra_test_data + search_widths = [1, 2, 4, 8] + recalls = [] + + for sw in search_widths: + search_params = cagra.SearchParams(itopk_size=64, search_width=sw) + neighbors = search_in_batches( + search_params, + data["index"], + data["queries"], + data["k"], + batch_size=512, + ) + recall = calc_recall(neighbors, data["gt_neighbors"], data["k"]) + recalls.append(recall) + + # Each subsequent search_width should have recall >= previous - tolerance + tolerance = 0.02 + for i in range(1, len(search_widths)): + assert recalls[i] >= recalls[i - 1] - tolerance, ( + f"Recall decreased from sw={search_widths[i - 1]} " + f"({recalls[i - 1]:.4f}) to sw={search_widths[i]} " + f"({recalls[i]:.4f}) at batch_size=512. " + f"Higher search_width should not reduce recall." + ) diff --git a/rust/cuvs/examples/cagra.rs b/rust/cuvs/examples/cagra.rs index b118c3abc5..80d866112f 100644 --- a/rust/cuvs/examples/cagra.rs +++ b/rust/cuvs/examples/cagra.rs @@ -10,7 +10,11 @@ use ndarray::s; use ndarray_rand::rand_distr::Uniform; use ndarray_rand::RandomExt; -/// Example showing how to index and search data with CAGRA +/// Example showing how to index and search data with CAGRA using the validated builder API. +/// +/// `IndexParams::builder()` validates parameters before any GPU allocation, surfacing +/// misconfiguration immediately with a clear error message instead of an opaque CUDA +/// assertion 1-2 seconds into `Index::build()`. fn cagra_example() -> Result<()> { let res = Resources::new()?; @@ -20,8 +24,14 @@ fn cagra_example() -> Result<()> { let dataset = ndarray::Array::::random((n_datapoints, n_features), Uniform::new(0., 1.0)); - // build the cagra index - let build_params = IndexParams::new()?; + // Build the CAGRA index using the validated builder. + // Parameters are checked in Rust before any FFI call — invalid values (e.g. + // graph_degree=0) produce an error here, not inside Index::build(). + let build_params = IndexParams::builder() + .graph_degree(32) + .intermediate_graph_degree(64) + .nn_descent_niter(20) + .build()?; let index = Index::build(&res, &build_params, &dataset)?; println!( "Indexed {}x{} datapoints into cagra index", @@ -47,7 +57,7 @@ fn cagra_example() -> Result<()> { let search_params = SearchParams::new()?; - index.search(&res, &search_params, &queries, &neighbors, &distances)?; + index.search(&res, &search_params, &queries, &neighbors, &distances, None)?; // Copy back to host memory distances.to_host(&res, &mut distances_host)?; diff --git a/rust/cuvs/src/brute_force.rs b/rust/cuvs/src/brute_force.rs index 1440bb3205..9437f42c26 100644 --- a/rust/cuvs/src/brute_force.rs +++ b/rust/cuvs/src/brute_force.rs @@ -5,18 +5,68 @@ //! Brute Force KNN use std::io::{stderr, Write}; +use std::marker::PhantomData; use crate::distance_type::DistanceType; -use crate::dlpack::ManagedTensor; +use crate::dlpack::{DatasetOwnership, ManagedTensor}; use crate::error::{check_cuvs, Result}; use crate::resources::Resources; /// Brute Force KNN Index +/// +/// The brute force C library stores a non-owning view into the original dataset. +/// The lifetime parameter `'a` ensures the dataset outlives the index when built +/// with [`Index::build`]. Use [`Index::build_owned`] for a self-contained index +/// that owns its dataset (e.g., after [`ManagedTensor::to_device`]). +/// +/// # Examples +/// +/// ## Borrowed dataset (compiler enforces lifetime) +/// +/// ```no_run +/// # use cuvs::{ManagedTensor, Resources}; +/// # use cuvs::brute_force::Index; +/// # use cuvs::distance_type::DistanceType; +/// let res = Resources::new().unwrap(); +/// let arr = ndarray::Array::::zeros((64, 8)); +/// let tensor = ManagedTensor::from(&arr); +/// let index = Index::build(&res, DistanceType::L2Expanded, None, &tensor).unwrap(); +/// // arr and tensor must remain alive while index is in use +/// ``` +/// +/// ## Owned dataset ('static lifetime) +/// +/// ```no_run +/// # use cuvs::{ManagedTensor, Resources}; +/// # use cuvs::brute_force::Index; +/// # use cuvs::distance_type::DistanceType; +/// let res = Resources::new().unwrap(); +/// let arr = ndarray::Array::::zeros((64, 8)); +/// let device_tensor = ManagedTensor::from(&arr).to_device(&res).unwrap(); +/// let index = Index::build_owned(&res, DistanceType::L2Expanded, None, device_tensor).unwrap(); +/// drop(arr); // Fine — index owns the device copy +/// ``` #[derive(Debug)] -pub struct Index(ffi::cuvsBruteForceIndex_t); +pub struct Index<'a> { + inner: ffi::cuvsBruteForceIndex_t, + _data: DatasetOwnership<'a>, +} -impl Index { - /// Builds a new Brute Force KNN Index from the dataset for efficient search. +impl<'a> Index<'a> { + /// Creates a new FFI index handle. + fn create_handle() -> Result { + unsafe { + let mut index = std::mem::MaybeUninit::::uninit(); + check_cuvs(ffi::cuvsBruteForceIndexCreate(index.as_mut_ptr()))?; + Ok(index.assume_init()) + } + } + + /// Builds a new Brute Force KNN Index from a borrowed dataset. + /// + /// The compiler enforces that `dataset` outlives the returned index, + /// preventing use-after-free when the C library dereferences its + /// internal view of the data. /// /// # Arguments /// @@ -24,33 +74,34 @@ impl Index { /// * `metric` - DistanceType to use for building the index /// * `metric_arg` - Optional value of `p` for Minkowski distances /// * `dataset` - A row-major matrix on either the host or device to index - pub fn build>( + pub fn build( res: &Resources, metric: DistanceType, metric_arg: Option, - dataset: T, - ) -> Result { - let dataset: ManagedTensor = dataset.into(); - let index = Index::new()?; + dataset: &'a ManagedTensor, + ) -> Result> { + let inner = Self::create_handle()?; unsafe { check_cuvs(ffi::cuvsBruteForceBuild( res.0, dataset.as_ptr(), metric, metric_arg.unwrap_or(2.0), - index.0, + inner, ))?; } - Ok(index) + Ok(Index { + inner, + _data: DatasetOwnership::Borrowed(PhantomData), + }) } /// Creates a new empty index - pub fn new() -> Result { - unsafe { - let mut index = std::mem::MaybeUninit::::uninit(); - check_cuvs(ffi::cuvsBruteForceIndexCreate(index.as_mut_ptr()))?; - Ok(Index(index.assume_init())) - } + pub fn new() -> Result> { + Ok(Index { + inner: Self::create_handle()?, + _data: DatasetOwnership::Borrowed(PhantomData), + }) } /// Perform a Nearest Neighbors search on the Index @@ -76,7 +127,7 @@ impl Index { check_cuvs(ffi::cuvsBruteForceSearch( res.0, - self.0, + self.inner, queries.as_ptr(), neighbors.as_ptr(), distances.as_ptr(), @@ -86,9 +137,46 @@ impl Index { } } -impl Drop for Index { +impl Index<'static> { + /// Builds a new Brute Force KNN Index from an owned dataset. + /// + /// The index takes ownership of `dataset`, making it self-contained + /// with a `'static` lifetime. This is useful when the dataset is a + /// device copy (from [`ManagedTensor::to_device`]) that should live + /// as long as the index. + /// + /// # Arguments + /// + /// * `res` - Resources to use + /// * `metric` - DistanceType to use for building the index + /// * `metric_arg` - Optional value of `p` for Minkowski distances + /// * `dataset` - A row-major matrix to index (ownership transferred to the index) + pub fn build_owned( + res: &Resources, + metric: DistanceType, + metric_arg: Option, + dataset: ManagedTensor, + ) -> Result> { + let inner = Self::create_handle()?; + unsafe { + check_cuvs(ffi::cuvsBruteForceBuild( + res.0, + dataset.as_ptr(), + metric, + metric_arg.unwrap_or(2.0), + inner, + ))?; + } + Ok(Index { + inner, + _data: DatasetOwnership::Owned(dataset), + }) + } +} + +impl Drop for Index<'_> { fn drop(&mut self) { - if let Err(e) = check_cuvs(unsafe { ffi::cuvsBruteForceIndexDestroy(self.0) }) { + if let Err(e) = check_cuvs(unsafe { ffi::cuvsBruteForceIndexDestroy(self.inner) }) { write!(stderr(), "failed to call bruteForceIndexDestroy {:?}", e) .expect("failed to write to stderr"); } @@ -116,9 +204,9 @@ mod tests { println!("dataset {:#?}", dataset_host); - // build the brute force index - let index = - Index::build(&res, metric, None, dataset).expect("failed to create brute force index"); + // build the brute force index (owned — device copy lives in the index) + let index = Index::build_owned(&res, metric, None, dataset) + .expect("failed to create brute force index"); res.sync_stream().unwrap(); @@ -173,10 +261,104 @@ mod tests { test_bfknn(DistanceType::L2Expanded); } - // NOTE: brute_force multiple-search test is omitted here because the C++ - // brute_force::index stores a non-owning view into the dataset. Building - // from device data via `build()` drops the ManagedTensor after the call, - // leaving a dangling pointer. A follow-up PR will add dataset lifetime - // enforcement (DatasetOwnership<'a>) to make this safe. - // See: https://github.com/rapidsai/cuvs/issues/1838 + /// Test that an index built with build_owned can be searched multiple times. + #[test] + fn test_brute_force_multiple_searches() { + let res = Resources::new().unwrap(); + + // Create a random dataset + let n_datapoints = 64; + let n_features = 8; + let dataset = + ndarray::Array::::random((n_datapoints, n_features), Uniform::new(0., 1.0)); + + // Build the brute force index with owned device memory + let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap(); + let index = Index::build_owned(&res, DistanceType::L2Expanded, None, dataset_device) + .expect("failed to create brute force index"); + + res.sync_stream().unwrap(); + + let k = 4; + + // Perform multiple searches on the same index + for search_iter in 0..3 { + let n_queries = 4; + let queries = dataset.slice(s![0..n_queries, ..]); + let queries = ManagedTensor::from(&queries).to_device(&res).unwrap(); + + let mut neighbors_host = ndarray::Array::::zeros((n_queries, k)); + let neighbors = ManagedTensor::from(&neighbors_host) + .to_device(&res) + .unwrap(); + + let mut distances_host = ndarray::Array::::zeros((n_queries, k)); + let distances = ManagedTensor::from(&distances_host) + .to_device(&res) + .unwrap(); + + index + .search(&res, &queries, &neighbors, &distances) + .unwrap_or_else(|e| panic!("search iteration {} failed: {}", search_iter, e)); + + // Copy back to host memory + distances.to_host(&res, &mut distances_host).unwrap(); + neighbors.to_host(&res, &mut neighbors_host).unwrap(); + res.sync_stream().unwrap(); + + // Verify results are consistent + assert_eq!( + neighbors_host[[0, 0]], + 0, + "iteration {}: first query should find itself", + search_iter + ); + } + } + + /// Test that an index built with build (borrowed) ties the dataset lifetime. + #[test] + fn test_brute_force_borrowed_build() { + let res = Resources::new().unwrap(); + + let n_datapoints = 64; + let n_features = 8; + let dataset_host = + ndarray::Array::::random((n_datapoints, n_features), Uniform::new(0., 1.0)); + + // Create a device tensor and borrow it for the index + let dataset_device = ManagedTensor::from(&dataset_host).to_device(&res).unwrap(); + let index = Index::build(&res, DistanceType::L2Expanded, None, &dataset_device) + .expect("failed to create brute force index"); + + res.sync_stream().unwrap(); + + // Search while the borrowed dataset is still alive + let n_queries = 4; + let k = 4; + let queries = dataset_host.slice(s![0..n_queries, ..]); + let queries = ManagedTensor::from(&queries).to_device(&res).unwrap(); + + let mut neighbors_host = ndarray::Array::::zeros((n_queries, k)); + let neighbors = ManagedTensor::from(&neighbors_host) + .to_device(&res) + .unwrap(); + + let mut distances_host = ndarray::Array::::zeros((n_queries, k)); + let distances = ManagedTensor::from(&distances_host) + .to_device(&res) + .unwrap(); + + index + .search(&res, &queries, &neighbors, &distances) + .unwrap(); + + distances.to_host(&res, &mut distances_host).unwrap(); + neighbors.to_host(&res, &mut neighbors_host).unwrap(); + res.sync_stream().unwrap(); + + assert_eq!(neighbors_host[[0, 0]], 0); + assert_eq!(neighbors_host[[1, 0]], 1); + // dataset_device is still alive here — compiler ensures it + } } diff --git a/rust/cuvs/src/cagra/index.rs b/rust/cuvs/src/cagra/index.rs index 789f72b603..341db36cfd 100644 --- a/rust/cuvs/src/cagra/index.rs +++ b/rust/cuvs/src/cagra/index.rs @@ -3,50 +3,102 @@ * SPDX-License-Identifier: Apache-2.0 */ +use std::ffi::CString; use std::io::{stderr, Write}; +use std::marker::PhantomData; use crate::cagra::{IndexParams, SearchParams}; -use crate::dlpack::ManagedTensor; +use crate::dlpack::{DatasetOwnership, ManagedTensor}; use crate::error::{check_cuvs, Result}; +use crate::filters::{Filter, NoFilter}; use crate::resources::Resources; /// CAGRA ANN Index +/// +/// The lifetime parameter `'a` ensures the dataset outlives the index when built +/// with [`Index::build`]. Use [`Index::build_owned`] for a self-contained index +/// that owns its dataset (e.g., after [`ManagedTensor::to_device`]). +/// +/// # Examples +/// +/// ## Borrowed dataset (compiler enforces lifetime) +/// +/// ```no_run +/// # use cuvs::{ManagedTensor, Resources}; +/// # use cuvs::cagra::{Index, IndexParams}; +/// let res = Resources::new().unwrap(); +/// let arr = ndarray::Array::::zeros((256, 16)); +/// let params = IndexParams::new().unwrap(); +/// let tensor = ManagedTensor::from(&arr); +/// let index = Index::build(&res, ¶ms, &tensor).unwrap(); +/// // arr and tensor must remain alive while index is in use +/// ``` +/// +/// ## Owned dataset ('static lifetime) +/// +/// ```no_run +/// # use cuvs::{ManagedTensor, Resources}; +/// # use cuvs::cagra::{Index, IndexParams}; +/// let res = Resources::new().unwrap(); +/// let arr = ndarray::Array::::zeros((256, 16)); +/// let params = IndexParams::new().unwrap(); +/// let device_tensor = ManagedTensor::from(&arr).to_device(&res).unwrap(); +/// let index = Index::build_owned(&res, ¶ms, device_tensor).unwrap(); +/// drop(arr); // Fine — index owns the device copy +/// ``` #[derive(Debug)] -pub struct Index(ffi::cuvsCagraIndex_t); +pub struct Index<'a> { + inner: ffi::cuvsCagraIndex_t, + _data: DatasetOwnership<'a>, +} + +impl<'a> Index<'a> { + /// Creates a new FFI index handle. + fn create_handle() -> Result { + unsafe { + let mut index = std::mem::MaybeUninit::::uninit(); + check_cuvs(ffi::cuvsCagraIndexCreate(index.as_mut_ptr()))?; + Ok(index.assume_init()) + } + } -impl Index { - /// Builds a new Index from the dataset for efficient search. + /// Builds a new CAGRA Index from a borrowed dataset. + /// + /// The compiler enforces that `dataset` outlives the returned index, + /// preventing use-after-free when the C library dereferences its + /// internal view of the data. /// /// # Arguments /// /// * `res` - Resources to use /// * `params` - Parameters for building the index /// * `dataset` - A row-major matrix on either the host or device to index - pub fn build>( + pub fn build( res: &Resources, params: &IndexParams, - dataset: T, - ) -> Result { - let dataset: ManagedTensor = dataset.into(); - let index = Index::new()?; + dataset: &'a ManagedTensor, + ) -> Result> { + let inner = Self::create_handle()?; unsafe { check_cuvs(ffi::cuvsCagraBuild( res.0, params.0, dataset.as_ptr(), - index.0, + inner, ))?; } - Ok(index) + Ok(Index { + inner, + _data: DatasetOwnership::Borrowed(PhantomData), + }) } /// Creates a new empty index - pub fn new() -> Result { - unsafe { - let mut index = std::mem::MaybeUninit::::uninit(); - check_cuvs(ffi::cuvsCagraIndexCreate(index.as_mut_ptr()))?; - Ok(Index(index.assume_init())) - } + pub fn new() -> Result> { + Ok(Index { + inner: Self::create_handle()?, + _data: DatasetOwnership::Borrowed(PhantomData), + }) } /// Perform a Approximate Nearest Neighbors search on the Index @@ -58,6 +110,7 @@ impl Index { /// * `queries` - A matrix in device memory to query for /// * `neighbors` - Matrix in device memory that receives the indices of the nearest neighbors /// * `distances` - Matrix in device memory that receives the distances of the nearest neighbors + /// * `filter` - Optional filter to apply to the search (defaults to NoFilter if not provided) pub fn search( &self, res: &Resources, @@ -65,29 +118,130 @@ impl Index { queries: &ManagedTensor, neighbors: &ManagedTensor, distances: &ManagedTensor, + filter: Option<&dyn Filter>, ) -> Result<()> { unsafe { - let prefilter = ffi::cuvsFilter { - addr: 0, - type_: ffi::cuvsFilterType::NO_FILTER, - }; + let filter_ffi = filter + .map(|f| f.into_ffi()) + .unwrap_or_else(|| NoFilter.into_ffi()); check_cuvs(ffi::cuvsCagraSearch( res.0, params.0, - self.0, + self.inner, queries.as_ptr(), neighbors.as_ptr(), distances.as_ptr(), - prefilter, + filter_ffi, )) } } + + /// Save the CAGRA index to file. + /// + /// Experimental, both the API and the serialization format are subject to change. + /// + /// # Arguments + /// + /// * `res` - Resources to use + /// * `filename` - The file name for saving the index + /// * `include_dataset` - Whether to write out the dataset to the file + pub fn serialize(&self, res: &Resources, filename: &str, include_dataset: bool) -> Result<()> { + let c_filename = CString::new(filename).expect("filename contains null byte"); + unsafe { + check_cuvs(ffi::cuvsCagraSerialize( + res.0, + c_filename.as_ptr(), + self.inner, + include_dataset, + )) + } + } + + /// Save the CAGRA index to file in hnswlib format. + /// + /// NOTE: The saved index can only be read by the hnswlib wrapper in cuVS, + /// as the serialization format is not compatible with the original hnswlib. + /// + /// Experimental, both the API and the serialization format are subject to change. + /// + /// # Arguments + /// + /// * `res` - Resources to use + /// * `filename` - The file name for saving the index + pub fn serialize_to_hnswlib(&self, res: &Resources, filename: &str) -> Result<()> { + let c_filename = CString::new(filename).expect("filename contains null byte"); + unsafe { + check_cuvs(ffi::cuvsCagraSerializeToHnswlib( + res.0, + c_filename.as_ptr(), + self.inner, + )) + } + } + + /// Load a CAGRA index from file. + /// + /// Experimental, both the API and the serialization format are subject to change. + /// + /// # Arguments + /// + /// * `res` - Resources to use + /// * `filename` - The name of the file that stores the index + pub fn deserialize(res: &Resources, filename: &str) -> Result> { + let c_filename = CString::new(filename).expect("filename contains null byte"); + let inner = Self::create_handle()?; + unsafe { + check_cuvs(ffi::cuvsCagraDeserialize( + res.0, + c_filename.as_ptr(), + inner, + ))?; + } + Ok(Index { + inner, + _data: DatasetOwnership::Borrowed(PhantomData), + }) + } +} + +impl Index<'static> { + /// Builds a new CAGRA Index from an owned dataset. + /// + /// The index takes ownership of `dataset`, making it self-contained + /// with a `'static` lifetime. This is useful when the dataset is a + /// device copy (from [`ManagedTensor::to_device`]) that should live + /// as long as the index. + /// + /// # Arguments + /// + /// * `res` - Resources to use + /// * `params` - Parameters for building the index + /// * `dataset` - A row-major matrix to index (ownership transferred to the index) + pub fn build_owned( + res: &Resources, + params: &IndexParams, + dataset: ManagedTensor, + ) -> Result> { + let inner = Self::create_handle()?; + unsafe { + check_cuvs(ffi::cuvsCagraBuild( + res.0, + params.0, + dataset.as_ptr(), + inner, + ))?; + } + Ok(Index { + inner, + _data: DatasetOwnership::Owned(dataset), + }) + } } -impl Drop for Index { +impl Drop for Index<'_> { fn drop(&mut self) { - if let Err(e) = check_cuvs(unsafe { ffi::cuvsCagraIndexDestroy(self.0) }) { + if let Err(e) = check_cuvs(unsafe { ffi::cuvsCagraIndexDestroy(self.inner) }) { write!(stderr(), "failed to call cagraIndexDestroy {:?}", e) .expect("failed to write to stderr"); } @@ -110,9 +264,10 @@ mod tests { let dataset = ndarray::Array::::random((n_datapoints, n_features), Uniform::new(0., 1.0)); - // build the cagra index + // build the cagra index (borrowed — dataset tensor must outlive index) + let tensor = ManagedTensor::from(&dataset); let index = - Index::build(&res, &build_params, &dataset).expect("failed to create cagra index"); + Index::build(&res, &build_params, &tensor).expect("failed to create cagra index"); // use the first 4 points from the dataset as queries : will test that we get them back // as their own nearest neighbor @@ -138,7 +293,7 @@ mod tests { let search_params = SearchParams::new().unwrap(); index - .search(&res, &search_params, &queries, &neighbors, &distances) + .search(&res, &search_params, &queries, &neighbors, &distances, None) .unwrap(); // Copy back to host memory @@ -181,9 +336,10 @@ mod tests { let dataset = ndarray::Array::::random((n_datapoints, n_features), Uniform::new(0., 1.0)); - // Build the index once + // Build the index once (borrowed) + let tensor = ManagedTensor::from(&dataset); let index = - Index::build(&res, &build_params, &dataset).expect("failed to create cagra index"); + Index::build(&res, &build_params, &tensor).expect("failed to create cagra index"); let search_params = SearchParams::new().unwrap(); let k = 5; @@ -206,8 +362,8 @@ mod tests { // This should work on every iteration because search() takes &self index - .search(&res, &search_params, &queries, &neighbors, &distances) - .expect(&format!("search iteration {} failed", search_iter)); + .search(&res, &search_params, &queries, &neighbors, &distances, None) + .unwrap_or_else(|e| panic!("search iteration {} failed: {}", search_iter, e)); // Copy back to host memory distances.to_host(&res, &mut distances_host).unwrap(); @@ -222,4 +378,128 @@ mod tests { ); } } + + #[test] + fn test_cagra_serialize_deserialize() { + let res = Resources::new().unwrap(); + + let n_datapoints = 256; + let n_features = 16; + let dataset = + ndarray::Array::::random((n_datapoints, n_features), Uniform::new(0., 1.0)); + + let build_params = IndexParams::new().unwrap(); + let tensor = ManagedTensor::from(&dataset); + let index = + Index::build(&res, &build_params, &tensor).expect("failed to create cagra index"); + + let dir = std::env::temp_dir(); + let filepath = dir.join("test_cagra_index.bin"); + let filepath_str = filepath.to_str().unwrap(); + index + .serialize(&res, filepath_str, true) + .expect("failed to serialize cagra index"); + + assert!(filepath.exists()); + assert!(std::fs::metadata(&filepath).unwrap().len() > 0); + + let loaded_index: Index<'_> = + Index::deserialize(&res, filepath_str).expect("failed to deserialize cagra index"); + + let n_queries = 4; + let k = 10; + let queries = dataset.slice(s![0..n_queries, ..]); + let queries = ManagedTensor::from(&queries).to_device(&res).unwrap(); + let mut neighbors_host = ndarray::Array::::zeros((n_queries, k)); + let neighbors = ManagedTensor::from(&neighbors_host) + .to_device(&res) + .unwrap(); + let mut distances_host = ndarray::Array::::zeros((n_queries, k)); + let distances = ManagedTensor::from(&distances_host) + .to_device(&res) + .unwrap(); + + let search_params = SearchParams::new().unwrap(); + loaded_index + .search(&res, &search_params, &queries, &neighbors, &distances, None) + .expect("failed to search deserialized index"); + + distances.to_host(&res, &mut distances_host).unwrap(); + neighbors.to_host(&res, &mut neighbors_host).unwrap(); + + assert_eq!(neighbors_host[[0, 0]], 0); + assert_eq!(neighbors_host[[1, 0]], 1); + + let _ = std::fs::remove_file(&filepath); + } + + #[test] + fn test_cagra_serialize_to_hnswlib() { + let res = Resources::new().unwrap(); + + let n_datapoints = 256; + let n_features = 16; + let dataset = + ndarray::Array::::random((n_datapoints, n_features), Uniform::new(0., 1.0)); + + let build_params = IndexParams::new().unwrap(); + let tensor = ManagedTensor::from(&dataset); + let index = + Index::build(&res, &build_params, &tensor).expect("failed to create cagra index"); + + let dir = std::env::temp_dir(); + let filepath = dir.join("test_cagra_index_hnsw.bin"); + let filepath_str = filepath.to_str().unwrap(); + index + .serialize_to_hnswlib(&res, filepath_str) + .expect("failed to serialize to hnswlib format"); + + assert!(filepath.exists()); + assert!(std::fs::metadata(&filepath).unwrap().len() > 0); + + let _ = std::fs::remove_file(&filepath); + } + + /// Test that an index built with build_owned can be searched after the + /// original host data is dropped. + #[test] + fn test_cagra_build_owned() { + let res = Resources::new().unwrap(); + let build_params = IndexParams::new().unwrap(); + + let n_datapoints = 256; + let n_features = 16; + let dataset = + ndarray::Array::::random((n_datapoints, n_features), Uniform::new(0., 1.0)); + + // Build with owned device memory + let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap(); + let index = Index::build_owned(&res, &build_params, dataset_device) + .expect("failed to create cagra index"); + + let search_params = SearchParams::new().unwrap(); + let k = 5; + let n_queries = 4; + let queries = dataset.slice(s![0..n_queries, ..]); + let queries = ManagedTensor::from(&queries).to_device(&res).unwrap(); + + let mut neighbors_host = ndarray::Array::::zeros((n_queries, k)); + let neighbors = ManagedTensor::from(&neighbors_host) + .to_device(&res) + .unwrap(); + + let mut distances_host = ndarray::Array::::zeros((n_queries, k)); + let distances = ManagedTensor::from(&distances_host) + .to_device(&res) + .unwrap(); + + index + .search(&res, &search_params, &queries, &neighbors, &distances, None) + .unwrap(); + + distances.to_host(&res, &mut distances_host).unwrap(); + neighbors.to_host(&res, &mut neighbors_host).unwrap(); + + assert_eq!(neighbors_host[[0, 0]], 0); + } } diff --git a/rust/cuvs/src/cagra/index_params.rs b/rust/cuvs/src/cagra/index_params.rs index ea34959147..6655b24741 100644 --- a/rust/cuvs/src/cagra/index_params.rs +++ b/rust/cuvs/src/cagra/index_params.rs @@ -137,6 +137,31 @@ impl IndexParams { } } +impl IndexParams { + /// Returns a builder for constructing [`IndexParams`] with validated parameters. + /// + /// Unlike the `IndexParams::new()?.set_*()` setter chain, [`IndexParamsBuilder::build`] + /// validates all parameters in Rust before any FFI allocation. Invalid values produce a + /// clear error message naming the offending field and its valid range, before any GPU + /// work begins. + /// + /// # Example + /// + /// ```no_run + /// use cuvs::cagra::IndexParams; + /// + /// let params = IndexParams::builder() + /// .graph_degree(32) + /// .intermediate_graph_degree(64) + /// .nn_descent_niter(20) + /// .build() + /// .unwrap(); + /// ``` + pub fn builder() -> IndexParamsBuilder { + IndexParamsBuilder::default() + } +} + impl fmt::Debug for IndexParams { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // custom debug trait here, default value will show the pointer address @@ -177,6 +202,115 @@ impl Drop for CompressionParams { } } +/// Builder for [`IndexParams`] with pre-validated parameters. +/// +/// Construct via [`IndexParams::builder()`]. Call [`IndexParamsBuilder::build`] to +/// validate all parameters and allocate the FFI struct in one step. +/// +/// Defaults match the cuVS C API defaults: `graph_degree=64`, +/// `intermediate_graph_degree=128`, `nn_descent_niter=20`. +pub struct IndexParamsBuilder { + graph_degree: usize, + intermediate_graph_degree: usize, + nn_descent_niter: usize, + build_algo: Option, + compression: Option, +} + +impl Default for IndexParamsBuilder { + fn default() -> Self { + Self { + graph_degree: 64, + intermediate_graph_degree: 128, + nn_descent_niter: 20, + build_algo: None, + compression: None, + } + } +} + +impl IndexParamsBuilder { + /// Degree of output graph. + /// + /// Must be > 0. Values that are multiples of 32 are preferred for warp alignment. + pub fn graph_degree(mut self, v: usize) -> Self { + self.graph_degree = v; + self + } + + /// Degree of input graph for pruning. + /// + /// Must be >= `graph_degree`. + pub fn intermediate_graph_degree(mut self, v: usize) -> Self { + self.intermediate_graph_degree = v; + self + } + + /// Number of iterations to run if building with NN_DESCENT. + /// + /// Must be > 0. + pub fn nn_descent_niter(mut self, v: usize) -> Self { + self.nn_descent_niter = v; + self + } + + /// ANN algorithm to build knn graph. + pub fn build_algo(mut self, v: BuildAlgo) -> Self { + self.build_algo = Some(v); + self + } + + /// Vector compression parameters. + pub fn compression(mut self, v: CompressionParams) -> Self { + self.compression = Some(v); + self + } + + /// Validate all parameters without allocating any GPU resources. + /// + /// Returns `Ok(())` if all parameters are valid, or `Err` with a message naming + /// the offending field and its valid range. + pub fn validate(&self) -> crate::error::Result<()> { + if self.graph_degree == 0 { + return Err(format!("graph_degree must be > 0; got {}", self.graph_degree).into()); + } + if self.intermediate_graph_degree < self.graph_degree { + return Err(format!( + "intermediate_graph_degree ({}) must be >= graph_degree ({})", + self.intermediate_graph_degree, self.graph_degree + ) + .into()); + } + if self.nn_descent_niter == 0 { + return Err(format!( + "nn_descent_niter must be > 0; got {}", + self.nn_descent_niter + ) + .into()); + } + Ok(()) + } + + /// Validate all parameters and allocate the FFI struct. + /// + /// Returns `Err` with a message naming the offending field and its valid range + /// before any GPU work begins. + pub fn build(self) -> crate::error::Result { + self.validate()?; + let mut params = IndexParams::new()? + .set_graph_degree(self.graph_degree) + .set_intermediate_graph_degree(self.intermediate_graph_degree) + .set_nn_descent_niter(self.nn_descent_niter); + if let Some(algo) = self.build_algo { + params = params.set_build_algo(algo); + } + if let Some(compression) = self.compression { + params = params.set_compression(compression); + } + Ok(params) + } +} + #[cfg(test)] mod tests { use super::*; @@ -206,4 +340,95 @@ mod tests { assert_eq!((*(*params.0).compression).pq_bits, 4); } } + + // --- IndexParamsBuilder tests --- + + #[test] + fn builder_rejects_zero_graph_degree() { + let err = IndexParams::builder() + .graph_degree(0) + .validate() + .unwrap_err(); + assert!( + err.to_string().contains("graph_degree"), + "error message should name the field: {err}" + ); + } + + #[test] + fn builder_rejects_invalid_intermediate_degree() { + let err = IndexParams::builder() + .graph_degree(32) + .intermediate_graph_degree(16) + .validate() + .unwrap_err(); + assert!( + err.to_string().contains("intermediate_graph_degree"), + "error message should name the field: {err}" + ); + } + + #[test] + fn builder_rejects_zero_niter() { + let err = IndexParams::builder() + .nn_descent_niter(0) + .validate() + .unwrap_err(); + assert!( + err.to_string().contains("nn_descent_niter"), + "error message should name the field: {err}" + ); + } + + #[test] + fn builder_accepts_valid_params() { + assert!(IndexParams::builder() + .graph_degree(32) + .intermediate_graph_degree(64) + .nn_descent_niter(20) + .validate() + .is_ok()); + } + + #[test] + fn builder_round_trips_to_ffi() { + // Built params must produce the same FFI struct values as the manual setter chain. + let via_builder = IndexParams::builder() + .graph_degree(32) + .intermediate_graph_degree(64) + .nn_descent_niter(20) + .build() + .unwrap(); + let via_setters = IndexParams::new() + .unwrap() + .set_graph_degree(32) + .set_intermediate_graph_degree(64) + .set_nn_descent_niter(20); + unsafe { + assert_eq!((*via_builder.0).graph_degree, (*via_setters.0).graph_degree); + assert_eq!( + (*via_builder.0).intermediate_graph_degree, + (*via_setters.0).intermediate_graph_degree + ); + assert_eq!( + (*via_builder.0).nn_descent_niter, + (*via_setters.0).nn_descent_niter + ); + } + } + + #[test] + fn existing_setter_api_unchanged() { + // Ensure the original API still compiles and sets values correctly. + let params = IndexParams::new() + .unwrap() + .set_graph_degree(32) + .set_intermediate_graph_degree(64) + .set_nn_descent_niter(20); + unsafe { + assert_eq!((*params.0).graph_degree, 32); + assert_eq!((*params.0).intermediate_graph_degree, 64); + assert_eq!((*params.0).nn_descent_niter, 20); + } + } } diff --git a/rust/cuvs/src/cagra/mod.rs b/rust/cuvs/src/cagra/mod.rs index d39de752cf..b7954c2f9f 100644 --- a/rust/cuvs/src/cagra/mod.rs +++ b/rust/cuvs/src/cagra/mod.rs @@ -27,7 +27,8 @@ //! //! // build the cagra index //! let build_params = IndexParams::new()?; -//! let index = Index::build(&res, &build_params, &dataset)?; +//! let tensor = ManagedTensor::from(&dataset); +//! let index = Index::build(&res, &build_params, &tensor)?; //! println!( //! "Indexed {}x{} datapoints into cagra index", //! n_datapoints, n_features @@ -71,5 +72,5 @@ mod index_params; mod search_params; pub use index::Index; -pub use index_params::{BuildAlgo, CompressionParams, IndexParams}; -pub use search_params::{HashMode, SearchAlgo, SearchParams}; +pub use index_params::{BuildAlgo, CompressionParams, IndexParams, IndexParamsBuilder}; +pub use search_params::{HashMode, SearchAlgo, SearchParams, SearchParamsBuilder}; diff --git a/rust/cuvs/src/cagra/search_params.rs b/rust/cuvs/src/cagra/search_params.rs index 59537d7718..7914b1dbc2 100644 --- a/rust/cuvs/src/cagra/search_params.rs +++ b/rust/cuvs/src/cagra/search_params.rs @@ -122,6 +122,15 @@ impl SearchParams { } } +impl SearchParams { + /// Returns a builder for constructing [`SearchParams`] with validated parameters. + /// + /// See [`SearchParamsBuilder`] for details. + pub fn builder() -> SearchParamsBuilder { + SearchParamsBuilder::default() + } +} + impl fmt::Debug for SearchParams { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // custom debug trait here, default value will show the pointer address @@ -143,6 +152,180 @@ impl Drop for SearchParams { } } +/// Builder for [`SearchParams`] with pre-validated parameters. +/// +/// Construct via [`SearchParams::builder()`]. Call [`SearchParamsBuilder::build`] to +/// validate all parameters and allocate the FFI struct in one step. +pub struct SearchParamsBuilder { + itopk_size: usize, + max_queries: usize, + max_iterations: usize, + min_iterations: usize, + team_size: usize, + thread_block_size: usize, + hashmap_max_fill_rate: f32, + hashmap_min_bitlen: usize, + num_random_samplings: u32, + rand_xor_mask: u64, + algo: Option, + hashmap_mode: Option, +} + +impl Default for SearchParamsBuilder { + fn default() -> Self { + Self { + itopk_size: 64, + max_queries: 0, + max_iterations: 0, + min_iterations: 0, + team_size: 0, + thread_block_size: 0, + hashmap_max_fill_rate: 0.5, + hashmap_min_bitlen: 0, + num_random_samplings: 1, + rand_xor_mask: 0x128394, + algo: None, + hashmap_mode: None, + } + } +} + +impl SearchParamsBuilder { + /// Number of intermediate search results retained during the search. + /// + /// Must be a power of 2 (or 0 to use the cuVS default). + pub fn itopk_size(mut self, v: usize) -> Self { + self.itopk_size = v; + self + } + + /// Maximum number of queries to search at the same time. 0 = auto. + pub fn max_queries(mut self, v: usize) -> Self { + self.max_queries = v; + self + } + + /// Upper limit of search iterations. 0 = auto. + pub fn max_iterations(mut self, v: usize) -> Self { + self.max_iterations = v; + self + } + + /// Lower limit of search iterations. + pub fn min_iterations(mut self, v: usize) -> Self { + self.min_iterations = v; + self + } + + /// Number of threads used to calculate a single distance. + /// + /// Must be 0 (auto), 4, 8, 16, or 32. + pub fn team_size(mut self, v: usize) -> Self { + self.team_size = v; + self + } + + /// Thread block size. 0 (auto), 64, 128, 256, 512, or 1024. + pub fn thread_block_size(mut self, v: usize) -> Self { + self.thread_block_size = v; + self + } + + /// Upper limit of hashmap fill rate. + /// + /// Must be in the exclusive range (0.1, 0.9). + pub fn hashmap_max_fill_rate(mut self, v: f32) -> Self { + self.hashmap_max_fill_rate = v; + self + } + + /// Lower limit of hashmap bit length. + pub fn hashmap_min_bitlen(mut self, v: usize) -> Self { + self.hashmap_min_bitlen = v; + self + } + + /// Number of iterations of initial random seed node selection. + pub fn num_random_samplings(mut self, v: u32) -> Self { + self.num_random_samplings = v; + self + } + + /// Bit mask used for initial random seed node selection. + pub fn rand_xor_mask(mut self, v: u64) -> Self { + self.rand_xor_mask = v; + self + } + + /// Which search implementation to use. + pub fn algo(mut self, v: SearchAlgo) -> Self { + self.algo = Some(v); + self + } + + /// Hashmap type. + pub fn hashmap_mode(mut self, v: HashMode) -> Self { + self.hashmap_mode = Some(v); + self + } + + /// Validate all parameters without allocating any GPU resources. + /// + /// Returns `Ok(())` if all parameters are valid, or `Err` with a message naming + /// the offending field and its valid range. + pub fn validate(&self) -> crate::error::Result<()> { + if self.itopk_size != 0 && !self.itopk_size.is_power_of_two() { + return Err(format!( + "itopk_size must be a power of 2 or 0 (auto); got {}", + self.itopk_size + ) + .into()); + } + const VALID_TEAM_SIZES: &[usize] = &[0, 4, 8, 16, 32]; + if !VALID_TEAM_SIZES.contains(&self.team_size) { + return Err(format!( + "team_size must be one of {{0, 4, 8, 16, 32}}; got {}", + self.team_size + ) + .into()); + } + if self.hashmap_max_fill_rate <= 0.1 || self.hashmap_max_fill_rate >= 0.9 { + return Err(format!( + "hashmap_max_fill_rate must be in (0.1, 0.9); got {}", + self.hashmap_max_fill_rate + ) + .into()); + } + Ok(()) + } + + /// Validate all parameters and allocate the FFI struct. + /// + /// Returns `Err` with a message naming the offending field and its valid range + /// before any GPU work begins. + pub fn build(self) -> crate::error::Result { + self.validate()?; + let mut params = SearchParams::new()? + .set_itopk_size(self.itopk_size) + .set_max_queries(self.max_queries) + .set_max_iterations(self.max_iterations) + .set_min_iterations(self.min_iterations) + .set_team_size(self.team_size) + .set_thread_block_size(self.thread_block_size) + .set_hashmap_max_fill_rate(self.hashmap_max_fill_rate) + .set_hashmap_min_bitlen(self.hashmap_min_bitlen) + .set_num_random_samplings(self.num_random_samplings) + .set_rand_xor_mask(self.rand_xor_mask); + if let Some(algo) = self.algo { + params = params.set_algo(algo); + } + if let Some(mode) = self.hashmap_mode { + params = params.set_hashmap_mode(mode); + } + Ok(params) + } +} + #[cfg(test)] mod tests { use super::*; @@ -155,4 +338,67 @@ mod tests { assert_eq!((*params.0).itopk_size, 128); } } + + // --- SearchParamsBuilder tests --- + + #[test] + fn builder_rejects_non_power_of_two_itopk() { + let err = SearchParams::builder() + .itopk_size(100) + .validate() + .unwrap_err(); + assert!( + err.to_string().contains("itopk_size"), + "error message should name the field: {err}" + ); + } + + #[test] + fn builder_rejects_invalid_team_size() { + let err = SearchParams::builder().team_size(7).validate().unwrap_err(); + assert!( + err.to_string().contains("team_size"), + "error message should name the field: {err}" + ); + } + + #[test] + fn builder_rejects_fill_rate_too_high() { + let err = SearchParams::builder() + .hashmap_max_fill_rate(0.95) + .validate() + .unwrap_err(); + assert!( + err.to_string().contains("hashmap_max_fill_rate"), + "error message should name the field: {err}" + ); + } + + #[test] + fn builder_rejects_fill_rate_too_low() { + let err = SearchParams::builder() + .hashmap_max_fill_rate(0.05) + .validate() + .unwrap_err(); + assert!( + err.to_string().contains("hashmap_max_fill_rate"), + "error message should name the field: {err}" + ); + } + + #[test] + fn builder_accepts_valid_params() { + assert!(SearchParams::builder() + .itopk_size(64) + .team_size(8) + .hashmap_max_fill_rate(0.5) + .validate() + .is_ok()); + } + + #[test] + fn builder_accepts_zero_itopk_as_auto() { + // itopk_size=0 means "auto select" in cuVS — should be valid + assert!(SearchParams::builder().itopk_size(0).validate().is_ok()); + } } diff --git a/rust/cuvs/src/dlpack.rs b/rust/cuvs/src/dlpack.rs index d60b0acf6a..b0232c24f0 100644 --- a/rust/cuvs/src/dlpack.rs +++ b/rust/cuvs/src/dlpack.rs @@ -4,10 +4,25 @@ */ use std::convert::From; +use std::marker::PhantomData; use crate::error::{check_cuda, check_cuvs, Result}; use crate::resources::Resources; +/// Tracks whether an Index borrows or owns its dataset. +/// +/// When an index is built from a borrowed `ManagedTensor`, the compiler enforces +/// that the dataset outlives the index. When built from an owned `ManagedTensor` +/// (e.g., a device copy from [`ManagedTensor::to_device`]), the index is +/// self-contained and has a `'static` lifetime. +#[derive(Debug)] +pub(crate) enum DatasetOwnership<'a> { + /// Dataset is borrowed — caller must keep it alive. + Borrowed(PhantomData<&'a ()>), + /// Dataset is owned by the index (e.g., a device copy from `to_device()`). + Owned(ManagedTensor), +} + /// ManagedTensor is a wrapper around a dlpack DLManagedTensor object. /// This lets you pass matrices in device or host memory into cuvs. #[derive(Debug)] diff --git a/rust/cuvs/src/error.rs b/rust/cuvs/src/error.rs index 74cb1037f6..ab906c399b 100644 --- a/rust/cuvs/src/error.rs +++ b/rust/cuvs/src/error.rs @@ -60,3 +60,12 @@ pub fn check_cuda(err: ffi::cudaError_t) -> Result<()> { _ => Err(Error::CudaError(err)), } } + +impl From for Error { + fn from(text: String) -> Self { + Error::CuvsError(CuvsError { + code: ffi::cuvsError_t::CUVS_ERROR, + text, + }) + } +} diff --git a/rust/cuvs/src/filters.rs b/rust/cuvs/src/filters.rs new file mode 100644 index 0000000000..b315c1954d --- /dev/null +++ b/rust/cuvs/src/filters.rs @@ -0,0 +1,605 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +//! Filters for approximate nearest neighbor search +//! +//! This module provides filtering functionality for ANN search operations, +//! allowing you to exclude certain vectors from search results. +//! +//! # Filter Types +//! +//! - **No Filter**: Default behavior, includes all vectors +//! - **Bitset**: Global filter applied to all queries +//! - **Bitmap**: Per-query filter for batch operations +//! +//! # Examples +//! +//! ## Creating a Bitset Filter (Exclude Specific Vectors) +//! +//! ```no_run +//! use cuvs::filters::{Bitset, bitset_from_excluded_indices}; +//! use cuvs::Resources; +//! +//! let res = Resources::new().unwrap(); +//! let n_samples = 1000; +//! +//! // Exclude specific vector indices from search +//! let excluded = vec![0, 5, 10, 15, 20]; +//! let tensor = bitset_from_excluded_indices(n_samples, &excluded); +//! let device_tensor = tensor.to_device(&res).unwrap(); +//! let filter = Bitset::new(&device_tensor); +//! +//! // Use with search: +//! // index.search(&res, ¶ms, &queries, &neighbors, &distances, Some(&filter)); +//! ``` +//! +//! ## Creating a Bitset Filter (Include Only Specific Vectors) +//! +//! ```no_run +//! use cuvs::filters::{Bitset, bitset_from_included_indices}; +//! use cuvs::Resources; +//! +//! let res = Resources::new().unwrap(); +//! let n_samples = 1000; +//! +//! // Only search these specific vectors +//! let included = vec![100, 200, 300]; +//! let tensor = bitset_from_included_indices(n_samples, &included); +//! let device_tensor = tensor.to_device(&res).unwrap(); +//! let filter = Bitset::new(&device_tensor); +//! ``` +//! +//! ## Creating a Bitmap Filter (Per-Query Exclusions) +//! +//! ```no_run +//! use cuvs::filters::{Bitmap, bitmap_from_excluded_indices}; +//! use cuvs::Resources; +//! +//! let res = Resources::new().unwrap(); +//! let n_queries = 10; +//! let n_samples = 1000; +//! +//! // Different exclusions for each query +//! let excluded_per_query = vec![ +//! vec![0, 1, 2], // Query 0 excludes these +//! vec![10, 20, 30], // Query 1 excludes these +//! vec![5], // Query 2 excludes this +//! // ... one per query +//! ]; +//! let tensor = bitmap_from_excluded_indices(n_queries, n_samples, &excluded_per_query); +//! let device_tensor = tensor.to_device(&res).unwrap(); +//! let filter = Bitmap::new(&device_tensor); +//! ``` +//! +//! ## Manual Construction (Advanced) +//! +//! For fine-grained control, you can manually construct the bitset: +//! +//! ```no_run +//! use cuvs::filters::Bitset; +//! use cuvs::{Resources, ManagedTensor}; +//! use ndarray::Array1; +//! +//! let res = Resources::new().unwrap(); +//! let n_samples = 1000; +//! let bitset_size = (n_samples + 31) / 32; +//! +//! // Create bitset manually with custom bit patterns +//! let mut bitset_data = Array1::::from_elem(bitset_size, 0xFFFFFFFF); +//! bitset_data[0] = 0xAAAAAAAA; // Custom pattern for first 32 vectors +//! +//! let bitset_tensor = ManagedTensor::from(&bitset_data).to_device(&res).unwrap(); +//! let filter = Bitset::new(&bitset_tensor); +//! ``` + +use crate::dlpack::ManagedTensor; + +pub type FilterType = ffi::cuvsFilterType; + +/// Base trait for all filter types +pub trait Filter { + /// Convert this filter into a C FFI filter struct + fn into_ffi(&self) -> ffi::cuvsFilter; +} + +/// No filter - includes all vectors in search results +/// +/// This is the default behavior when no filter is specified. +#[derive(Debug)] +pub struct NoFilter; + +impl Filter for NoFilter { + fn into_ffi(&self) -> ffi::cuvsFilter { + ffi::cuvsFilter { + addr: 0, + type_: ffi::cuvsFilterType::NO_FILTER, + } + } +} + +/// Bitset filter - applies the same filter to all queries +/// +/// A bitset is a compact representation where each bit indicates whether +/// a vector should be included (1) or excluded (0) from search results. +/// This filter type applies the same filtering to all queries in a batch. +/// +/// # Tensor Format +/// +/// The tensor must be a 1D array of `uint32` elements: +/// - **Shape**: `[(n_samples + 31) / 32]` +/// - **Type**: `uint32` +/// - **Device**: Must be in device (GPU) memory +/// - Each bit represents one vector in the dataset +/// - Bit value 1: vector is included in search +/// - Bit value 0: vector is excluded from search +/// +/// The bitset uses little-endian bit ordering within each uint32 element. +/// +/// # Example +/// +/// ```no_run +/// use cuvs::filters::{Bitset, bitset_from_excluded_indices}; +/// use cuvs::Resources; +/// +/// let res = Resources::new().unwrap(); +/// let n_samples = 1000; +/// let excluded = vec![0, 5, 10]; +/// let tensor = bitset_from_excluded_indices(n_samples, &excluded); +/// let device_tensor = tensor.to_device(&res).unwrap(); +/// let filter = Bitset::new(&device_tensor); +/// ``` +#[derive(Debug)] +pub struct Bitset<'a> { + tensor: &'a ManagedTensor, +} + +impl<'a> Bitset<'a> { + /// Create a new bitset filter from a tensor + /// + /// Use [`bitset_from_excluded_indices`] or [`bitset_from_included_indices`] + /// to create the tensor from index lists. + /// + /// # Arguments + /// + /// * `tensor` - Device tensor containing bitset data as uint32 elements. + /// Must have shape `[(n_samples + 31) / 32]` where `n_samples` + /// is the number of vectors in the dataset being filtered. + pub fn new(tensor: &'a ManagedTensor) -> Self { + Bitset { tensor } + } +} + +impl<'a> Filter for Bitset<'a> { + fn into_ffi(&self) -> ffi::cuvsFilter { + ffi::cuvsFilter { + addr: self.tensor.as_ptr() as uintptr_t, + type_: ffi::cuvsFilterType::BITSET, + } + } +} + +/// Bitmap filter - applies different filters for each query +/// +/// A bitmap allows per-query filtering in batch search operations. +/// Each query can have its own set of allowed/disallowed vectors. +/// +/// # Tensor Format +/// +/// The tensor must be a 1D array of `uint32` elements: +/// - **Shape**: `[n_queries * ((n_samples + 31) / 32)]` +/// - **Type**: `uint32` +/// - **Device**: Must be in device (GPU) memory +/// - Layout: Row-major, where each row is one query's bitset +/// - Each query has its own bitset of size `(n_samples + 31) / 32` +/// - Bit value 1: vector is included for this query +/// - Bit value 0: vector is excluded for this query +/// +/// The bitmap uses little-endian bit ordering within each uint32 element. +/// +/// # Example +/// +/// ```no_run +/// use cuvs::filters::{Bitmap, bitmap_from_excluded_indices}; +/// use cuvs::Resources; +/// +/// let res = Resources::new().unwrap(); +/// let n_queries = 10; +/// let n_samples = 1000; +/// let excluded_per_query = vec![vec![0, 1, 2], vec![5, 10]]; +/// let tensor = bitmap_from_excluded_indices(n_queries, n_samples, &excluded_per_query); +/// let device_tensor = tensor.to_device(&res).unwrap(); +/// let filter = Bitmap::new(&device_tensor); +/// ``` +#[derive(Debug)] +pub struct Bitmap<'a> { + tensor: &'a ManagedTensor, +} + +impl<'a> Bitmap<'a> { + /// Create a new bitmap filter from a tensor + /// + /// Use [`bitmap_from_excluded_indices`] or [`bitmap_from_included_indices`] + /// to create the tensor from index lists. + /// + /// # Arguments + /// + /// * `tensor` - Device tensor containing bitmap data as uint32 elements. + /// Must have shape `[n_queries * ((n_samples + 31) / 32)]` where + /// `n_queries` is the number of queries in the batch and `n_samples` + /// is the number of vectors in the dataset being filtered. + pub fn new(tensor: &'a ManagedTensor) -> Self { + Bitmap { tensor } + } +} + +impl<'a> Filter for Bitmap<'a> { + fn into_ffi(&self) -> ffi::cuvsFilter { + ffi::cuvsFilter { + addr: self.tensor.as_ptr() as uintptr_t, + type_: ffi::cuvsFilterType::BITMAP, + } + } +} + +// Re-export for convenience +use ffi::cuvs_sys as ffi; +type uintptr_t = usize; + +/// Create a bitmap tensor by excluding specific indices per query +/// +/// Creates a bitmap tensor in host memory where each query can have its own set of excluded vectors. +/// All vectors are included by default, and specified indices are excluded. +/// Call `.to_device()` on the returned tensor before using it with a filter. +/// +/// # Arguments +/// +/// * `n_queries` - Number of queries in the batch +/// * `n_samples` - Total number of vectors in the dataset +/// * `excluded_indices_per_query` - Slice of vectors, one per query, containing indices to exclude +/// +/// # Returns +/// +/// A managed tensor in host memory. Use `.to_device(&res)` to move it to GPU before creating a filter. +/// +/// # Example +/// +/// ```no_run +/// use cuvs::filters::{Bitmap, bitmap_from_excluded_indices}; +/// use cuvs::Resources; +/// +/// let res = Resources::new().unwrap(); +/// let excluded_per_query = vec![ +/// vec![0, 1, 2], // Query 0 excludes these +/// vec![10, 20, 30], // Query 1 excludes these +/// ]; +/// let tensor = bitmap_from_excluded_indices(2, 1000, &excluded_per_query); +/// let device_tensor = tensor.to_device(&res).unwrap(); +/// let filter = Bitmap::new(&device_tensor); +/// ``` +pub fn bitmap_from_excluded_indices( + n_queries: usize, + n_samples: usize, + excluded_indices_per_query: &[Vec], +) -> ManagedTensor { + use ndarray::Array1; + + let bits_per_query = (n_samples + 31) / 32; + let bitmap_size = n_queries * bits_per_query; + let mut bitmap_data = Array1::::from_elem(bitmap_size, 0xFFFFFFFF); + + // Process each query's exclusion list + for (query_idx, excluded_indices) in excluded_indices_per_query.iter().enumerate() { + if query_idx >= n_queries { + break; + } + let offset = query_idx * bits_per_query; + for &idx in excluded_indices { + if idx < n_samples { + let word_idx = offset + (idx / 32); + let bit_idx = idx % 32; + bitmap_data[word_idx] &= !(1u32 << bit_idx); + } + } + } + + ManagedTensor::from(&bitmap_data) +} + +/// Create a bitmap tensor by including only specific indices per query +/// +/// Creates a bitmap tensor in host memory where each query specifies only the vectors to include. +/// All vectors are excluded by default, and only specified indices are included. +/// Call `.to_device()` on the returned tensor before using it with a filter. +/// +/// # Arguments +/// +/// * `n_queries` - Number of queries in the batch +/// * `n_samples` - Total number of vectors in the dataset +/// * `included_indices_per_query` - Slice of vectors, one per query, containing indices to include +/// +/// # Returns +/// +/// A managed tensor in host memory. Use `.to_device(&res)` to move it to GPU before creating a filter. +/// +/// # Example +/// +/// ```no_run +/// use cuvs::filters::{Bitmap, bitmap_from_included_indices}; +/// use cuvs::Resources; +/// +/// let res = Resources::new().unwrap(); +/// let included_per_query = vec![ +/// vec![0, 1, 2], // Query 0 only searches these +/// vec![10, 20, 30], // Query 1 only searches these +/// ]; +/// let tensor = bitmap_from_included_indices(2, 1000, &included_per_query); +/// let device_tensor = tensor.to_device(&res).unwrap(); +/// let filter = Bitmap::new(&device_tensor); +/// ``` +pub fn bitmap_from_included_indices( + n_queries: usize, + n_samples: usize, + included_indices_per_query: &[Vec], +) -> ManagedTensor { + use ndarray::Array1; + + let bits_per_query = (n_samples + 31) / 32; + let bitmap_size = n_queries * bits_per_query; + let mut bitmap_data = Array1::::zeros(bitmap_size); + + // Process each query's inclusion list + for (query_idx, included_indices) in included_indices_per_query.iter().enumerate() { + if query_idx >= n_queries { + break; + } + let offset = query_idx * bits_per_query; + for &idx in included_indices { + if idx < n_samples { + let word_idx = offset + (idx / 32); + let bit_idx = idx % 32; + bitmap_data[word_idx] |= 1u32 << bit_idx; + } + } + } + + ManagedTensor::from(&bitmap_data) +} + +/// Create a bitset tensor by excluding specific indices +/// +/// Creates a bitset tensor in host memory where all vectors are included except those specified. +/// This is a special case of bitmap with a single query. +/// Call `.to_device()` on the returned tensor before using it with a filter. +/// +/// # Arguments +/// +/// * `n_samples` - Total number of vectors in the dataset +/// * `excluded_indices` - Slice of vector indices to exclude from search +/// +/// # Returns +/// +/// A managed tensor in host memory. Use `.to_device(&res)` to move it to GPU before creating a filter. +/// +/// # Example +/// +/// ```no_run +/// use cuvs::filters::{Bitset, bitset_from_excluded_indices}; +/// use cuvs::Resources; +/// +/// let res = Resources::new().unwrap(); +/// let excluded = vec![0, 5, 10, 15]; +/// let tensor = bitset_from_excluded_indices(1000, &excluded); +/// let device_tensor = tensor.to_device(&res).unwrap(); +/// let filter = Bitset::new(&device_tensor); +/// ``` +pub fn bitset_from_excluded_indices(n_samples: usize, excluded_indices: &[usize]) -> ManagedTensor { + // Bitset is a special case of bitmap with n_queries = 1 + bitmap_from_excluded_indices(1, n_samples, &[excluded_indices.to_vec()]) +} + +/// Create a bitset tensor by including only specific indices +/// +/// Creates a bitset tensor in host memory where only specified vectors are included. +/// This is a special case of bitmap with a single query. +/// Call `.to_device()` on the returned tensor before using it with a filter. +/// +/// # Arguments +/// +/// * `n_samples` - Total number of vectors in the dataset +/// * `included_indices` - Slice of vector indices to include in search +/// +/// # Returns +/// +/// A managed tensor in host memory. Use `.to_device(&res)` to move it to GPU before creating a filter. +/// +/// # Example +/// +/// ```no_run +/// use cuvs::filters::{Bitset, bitset_from_included_indices}; +/// use cuvs::Resources; +/// +/// let res = Resources::new().unwrap(); +/// let included = vec![0, 5, 10, 15]; +/// let tensor = bitset_from_included_indices(1000, &included); +/// let device_tensor = tensor.to_device(&res).unwrap(); +/// let filter = Bitset::new(&device_tensor); +/// ``` +pub fn bitset_from_included_indices(n_samples: usize, included_indices: &[usize]) -> ManagedTensor { + // Bitset is a special case of bitmap with n_queries = 1 + bitmap_from_included_indices(1, n_samples, &[included_indices.to_vec()]) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_no_filter() { + let filter = NoFilter; + let ffi_filter = filter.into_ffi(); + + assert_eq!(ffi_filter.addr, 0); + assert_eq!(ffi_filter.type_, ffi::cuvsFilterType::NO_FILTER); + } + + #[test] + fn test_bitset_filter() { + let arr = ndarray::Array::::zeros(32); + let tensor = ManagedTensor::from(&arr); + let filter = Bitset::new(&tensor); + let ffi_filter = filter.into_ffi(); + + assert_eq!(ffi_filter.addr, tensor.as_ptr() as uintptr_t); + assert_eq!(ffi_filter.type_, ffi::cuvsFilterType::BITSET); + } + + #[test] + fn test_bitmap_filter() { + let arr = ndarray::Array::::zeros(320); + let tensor = ManagedTensor::from(&arr); + let filter = Bitmap::new(&tensor); + let ffi_filter = filter.into_ffi(); + + assert_eq!(ffi_filter.addr, tensor.as_ptr() as uintptr_t); + assert_eq!(ffi_filter.type_, ffi::cuvsFilterType::BITMAP); + } + + #[test] + fn test_bitset_from_excluded_indices() { + use ndarray::Array1; + + let n_samples = 100; + let excluded = vec![0, 5, 10, 99]; + let bitset_size = (n_samples + 31) / 32; + + // Create manually for comparison + let mut expected = Array1::::from_elem(bitset_size, 0xFFFFFFFF); + for &idx in &excluded { + let word_idx = idx / 32; + let bit_idx = idx % 32; + expected[word_idx] &= !(1u32 << bit_idx); + } + + // Create using from_excluded_indices (host version for testing) + let mut actual = Array1::::from_elem(bitset_size, 0xFFFFFFFF); + for &idx in &excluded { + if idx < n_samples { + let word_idx = idx / 32; + let bit_idx = idx % 32; + actual[word_idx] &= !(1u32 << bit_idx); + } + } + + assert_eq!(actual, expected); + + // Verify specific bits are cleared + assert_eq!(actual[0] & 1, 0); // index 0 + assert_eq!(actual[0] & (1 << 5), 0); // index 5 + assert_eq!(actual[0] & (1 << 10), 0); // index 10 + assert_eq!(actual[3] & (1 << 3), 0); // index 99 (word 3, bit 3) + } + + #[test] + fn test_bitset_from_included_indices() { + use ndarray::Array1; + + let n_samples = 100; + let included = vec![0, 5, 10, 99]; + let bitset_size = (n_samples + 31) / 32; + + // Create using from_included_indices logic (host version for testing) + let mut actual = Array1::::zeros(bitset_size); + for &idx in &included { + if idx < n_samples { + let word_idx = idx / 32; + let bit_idx = idx % 32; + actual[word_idx] |= 1u32 << bit_idx; + } + } + + // Verify specific bits are set + assert_eq!(actual[0] & 1, 1); // index 0 + assert_eq!(actual[0] & (1 << 5), 1 << 5); // index 5 + assert_eq!(actual[0] & (1 << 10), 1 << 10); // index 10 + assert_eq!(actual[3] & (1 << 3), 1 << 3); // index 99 (word 3, bit 3) + + // Verify other bits are not set + assert_eq!(actual[0] & (1 << 1), 0); // index 1 + assert_eq!(actual[0] & (1 << 2), 0); // index 2 + } + + #[test] + fn test_bitmap_from_excluded_indices() { + use ndarray::Array1; + + let n_queries = 3; + let n_samples = 100; + let bits_per_query = (n_samples + 31) / 32; + let bitmap_size = n_queries * bits_per_query; + + let excluded_per_query = vec![vec![0, 1], vec![50], vec![99]]; + + // Create using from_excluded_indices logic (host version for testing) + let mut actual = Array1::::from_elem(bitmap_size, 0xFFFFFFFF); + for (query_idx, excluded_indices) in excluded_per_query.iter().enumerate() { + let offset = query_idx * bits_per_query; + for &idx in excluded_indices { + if idx < n_samples { + let word_idx = offset + (idx / 32); + let bit_idx = idx % 32; + actual[word_idx] &= !(1u32 << bit_idx); + } + } + } + + // Verify specific bits are cleared + // Query 0, index 0 + assert_eq!(actual[0] & 1, 0); + // Query 0, index 1 + assert_eq!(actual[0] & 2, 0); + // Query 1, index 50 (word bits_per_query + 1, bit 18) + let word_idx = bits_per_query + 50 / 32; + let bit_idx = 50 % 32; + assert_eq!(actual[word_idx] & (1 << bit_idx), 0); + } + + #[test] + fn test_bitmap_from_included_indices() { + use ndarray::Array1; + + let n_queries = 3; + let n_samples = 100; + let bits_per_query = (n_samples + 31) / 32; + let bitmap_size = n_queries * bits_per_query; + + let included_per_query = vec![vec![0, 1], vec![50], vec![99]]; + + // Create using from_included_indices logic (host version for testing) + let mut actual = Array1::::zeros(bitmap_size); + for (query_idx, included_indices) in included_per_query.iter().enumerate() { + let offset = query_idx * bits_per_query; + for &idx in included_indices { + if idx < n_samples { + let word_idx = offset + (idx / 32); + let bit_idx = idx % 32; + actual[word_idx] |= 1u32 << bit_idx; + } + } + } + + // Verify specific bits are set + // Query 0, index 0 + assert_eq!(actual[0] & 1, 1); + // Query 0, index 1 + assert_eq!(actual[0] & 2, 2); + // Query 1, index 50 (word bits_per_query + 1, bit 18) + let word_idx = bits_per_query + 50 / 32; + let bit_idx = 50 % 32; + assert_eq!(actual[word_idx] & (1 << bit_idx), 1 << bit_idx); + + // Verify other bits are not set (Query 0, index 2) + assert_eq!(actual[0] & 4, 0); + } +} diff --git a/rust/cuvs/src/ivf_flat/index.rs b/rust/cuvs/src/ivf_flat/index.rs index c38be828de..474dd486c2 100644 --- a/rust/cuvs/src/ivf_flat/index.rs +++ b/rust/cuvs/src/ivf_flat/index.rs @@ -3,50 +3,102 @@ * SPDX-License-Identifier: Apache-2.0 */ +use std::ffi::CString; use std::io::{stderr, Write}; +use std::marker::PhantomData; -use crate::dlpack::ManagedTensor; +use crate::dlpack::{DatasetOwnership, ManagedTensor}; use crate::error::{check_cuvs, Result}; +use crate::filters::{Filter, NoFilter}; use crate::ivf_flat::{IndexParams, SearchParams}; use crate::resources::Resources; /// Ivf-Flat ANN Index +/// +/// The lifetime parameter `'a` ensures the dataset outlives the index when built +/// with [`Index::build`]. Use [`Index::build_owned`] for a self-contained index +/// that owns its dataset (e.g., after [`ManagedTensor::to_device`]). +/// +/// # Examples +/// +/// ## Borrowed dataset (compiler enforces lifetime) +/// +/// ```no_run +/// # use cuvs::{ManagedTensor, Resources}; +/// # use cuvs::ivf_flat::{Index, IndexParams}; +/// let res = Resources::new().unwrap(); +/// let arr = ndarray::Array::::zeros((1024, 16)); +/// let params = IndexParams::new().unwrap(); +/// let tensor = ManagedTensor::from(&arr); +/// let index = Index::build(&res, ¶ms, &tensor).unwrap(); +/// // arr and tensor must remain alive while index is in use +/// ``` +/// +/// ## Owned dataset ('static lifetime) +/// +/// ```no_run +/// # use cuvs::{ManagedTensor, Resources}; +/// # use cuvs::ivf_flat::{Index, IndexParams}; +/// let res = Resources::new().unwrap(); +/// let arr = ndarray::Array::::zeros((1024, 16)); +/// let params = IndexParams::new().unwrap(); +/// let device_tensor = ManagedTensor::from(&arr).to_device(&res).unwrap(); +/// let index = Index::build_owned(&res, ¶ms, device_tensor).unwrap(); +/// drop(arr); // Fine — index owns the device copy +/// ``` #[derive(Debug)] -pub struct Index(ffi::cuvsIvfFlatIndex_t); +pub struct Index<'a> { + inner: ffi::cuvsIvfFlatIndex_t, + _data: DatasetOwnership<'a>, +} + +impl<'a> Index<'a> { + /// Creates a new FFI index handle. + fn create_handle() -> Result { + unsafe { + let mut index = std::mem::MaybeUninit::::uninit(); + check_cuvs(ffi::cuvsIvfFlatIndexCreate(index.as_mut_ptr()))?; + Ok(index.assume_init()) + } + } -impl Index { - /// Builds a new Index from the dataset for efficient search. + /// Builds a new IVF-Flat Index from a borrowed dataset. + /// + /// The compiler enforces that `dataset` outlives the returned index, + /// preventing use-after-free when the C library dereferences its + /// internal view of the data. /// /// # Arguments /// /// * `res` - Resources to use /// * `params` - Parameters for building the index /// * `dataset` - A row-major matrix on either the host or device to index - pub fn build>( + pub fn build( res: &Resources, params: &IndexParams, - dataset: T, - ) -> Result { - let dataset: ManagedTensor = dataset.into(); - let index = Index::new()?; + dataset: &'a ManagedTensor, + ) -> Result> { + let inner = Self::create_handle()?; unsafe { check_cuvs(ffi::cuvsIvfFlatBuild( res.0, params.0, dataset.as_ptr(), - index.0, + inner, ))?; } - Ok(index) + Ok(Index { + inner, + _data: DatasetOwnership::Borrowed(PhantomData), + }) } /// Creates a new empty index - pub fn new() -> Result { - unsafe { - let mut index = std::mem::MaybeUninit::::uninit(); - check_cuvs(ffi::cuvsIvfFlatIndexCreate(index.as_mut_ptr()))?; - Ok(Index(index.assume_init())) - } + pub fn new() -> Result> { + Ok(Index { + inner: Self::create_handle()?, + _data: DatasetOwnership::Borrowed(PhantomData), + }) } /// Perform a Approximate Nearest Neighbors search on the Index @@ -58,6 +110,7 @@ impl Index { /// * `queries` - A matrix in device memory to query for /// * `neighbors` - Matrix in device memory that receives the indices of the nearest neighbors /// * `distances` - Matrix in device memory that receives the distances of the nearest neighbors + /// * `filter` - Optional filter to apply to the search (defaults to NoFilter if not provided) pub fn search( &self, res: &Resources, @@ -65,29 +118,106 @@ impl Index { queries: &ManagedTensor, neighbors: &ManagedTensor, distances: &ManagedTensor, + filter: Option<&dyn Filter>, ) -> Result<()> { unsafe { - let prefilter = ffi::cuvsFilter { - addr: 0, - type_: ffi::cuvsFilterType::NO_FILTER, - }; + let filter_ffi = filter + .map(|f| f.into_ffi()) + .unwrap_or_else(|| NoFilter.into_ffi()); check_cuvs(ffi::cuvsIvfFlatSearch( res.0, params.0, - self.0, + self.inner, queries.as_ptr(), neighbors.as_ptr(), distances.as_ptr(), - prefilter, + filter_ffi, + )) + } + } + + /// Save the IVF-Flat index to file. + /// + /// Experimental, both the API and the serialization format are subject to change. + /// + /// # Arguments + /// + /// * `res` - Resources to use + /// * `filename` - The file name for saving the index + pub fn serialize(&self, res: &Resources, filename: &str) -> Result<()> { + let c_filename = CString::new(filename).expect("filename contains null byte"); + unsafe { + check_cuvs(ffi::cuvsIvfFlatSerialize( + res.0, + c_filename.as_ptr(), + self.inner, )) } } + + /// Load an IVF-Flat index from file. + /// + /// Experimental, both the API and the serialization format are subject to change. + /// + /// # Arguments + /// + /// * `res` - Resources to use + /// * `filename` - The name of the file that stores the index + pub fn deserialize(res: &Resources, filename: &str) -> Result> { + let c_filename = CString::new(filename).expect("filename contains null byte"); + let inner = Self::create_handle()?; + unsafe { + check_cuvs(ffi::cuvsIvfFlatDeserialize( + res.0, + c_filename.as_ptr(), + inner, + ))?; + } + Ok(Index { + inner, + _data: DatasetOwnership::Borrowed(PhantomData), + }) + } } -impl Drop for Index { +impl Index<'static> { + /// Builds a new IVF-Flat Index from an owned dataset. + /// + /// The index takes ownership of `dataset`, making it self-contained + /// with a `'static` lifetime. This is useful when the dataset is a + /// device copy (from [`ManagedTensor::to_device`]) that should live + /// as long as the index. + /// + /// # Arguments + /// + /// * `res` - Resources to use + /// * `params` - Parameters for building the index + /// * `dataset` - A row-major matrix to index (ownership transferred to the index) + pub fn build_owned( + res: &Resources, + params: &IndexParams, + dataset: ManagedTensor, + ) -> Result> { + let inner = Self::create_handle()?; + unsafe { + check_cuvs(ffi::cuvsIvfFlatBuild( + res.0, + params.0, + dataset.as_ptr(), + inner, + ))?; + } + Ok(Index { + inner, + _data: DatasetOwnership::Owned(dataset), + }) + } +} + +impl Drop for Index<'_> { fn drop(&mut self) { - if let Err(e) = check_cuvs(unsafe { ffi::cuvsIvfFlatIndexDestroy(self.0) }) { + if let Err(e) = check_cuvs(unsafe { ffi::cuvsIvfFlatIndexDestroy(self.inner) }) { write!(stderr(), "failed to call cuvsIvfFlatIndexDestroy {:?}", e) .expect("failed to write to stderr"); } @@ -115,8 +245,8 @@ mod tests { let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap(); - // build the ivf-flat index - let index = Index::build(&res, &build_params, dataset_device) + // build the ivf-flat index (owned — device copy lives in the index) + let index = Index::build_owned(&res, &build_params, dataset_device) .expect("failed to create ivf-flat index"); // use the first 4 points from the dataset as queries : will test that we get them back @@ -143,7 +273,7 @@ mod tests { let search_params = SearchParams::new().unwrap(); index - .search(&res, &search_params, &queries, &neighbors, &distances) + .search(&res, &search_params, &queries, &neighbors, &distances, None) .unwrap(); // Copy back to host memory @@ -173,8 +303,8 @@ mod tests { let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap(); - // Build the index once - let index = Index::build(&res, &build_params, dataset_device) + // Build the index once (owned) + let index = Index::build_owned(&res, &build_params, dataset_device) .expect("failed to create ivf-flat index"); let search_params = SearchParams::new().unwrap(); @@ -198,8 +328,8 @@ mod tests { // This should work on every iteration because search() takes &self index - .search(&res, &search_params, &queries, &neighbors, &distances) - .expect(&format!("search iteration {} failed", search_iter)); + .search(&res, &search_params, &queries, &neighbors, &distances, None) + .unwrap_or_else(|e| panic!("search iteration {} failed: {}", search_iter, e)); // Copy back to host memory distances.to_host(&res, &mut distances_host).unwrap(); @@ -214,4 +344,50 @@ mod tests { ); } } + + /// Test that an index built with build (borrowed) ties the dataset lifetime. + #[test] + fn test_ivf_flat_borrowed_build() { + let res = Resources::new().unwrap(); + let build_params = IndexParams::new().unwrap().set_n_lists(64); + + let n_datapoints = 1024; + let n_features = 16; + let dataset = + ndarray::Array::::random((n_datapoints, n_features), Uniform::new(0., 1.0)); + + // Create a device tensor and borrow it for the index + let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap(); + let index = Index::build(&res, &build_params, &dataset_device) + .expect("failed to create ivf-flat index"); + + // Search while the borrowed dataset is still alive + let n_queries = 4; + let k = 5; + let queries = dataset.slice(s![0..n_queries, ..]); + let queries = ManagedTensor::from(&queries).to_device(&res).unwrap(); + + let mut neighbors_host = ndarray::Array::::zeros((n_queries, k)); + let neighbors = ManagedTensor::from(&neighbors_host) + .to_device(&res) + .unwrap(); + + let mut distances_host = ndarray::Array::::zeros((n_queries, k)); + let distances = ManagedTensor::from(&distances_host) + .to_device(&res) + .unwrap(); + + let search_params = SearchParams::new().unwrap(); + + index + .search(&res, &search_params, &queries, &neighbors, &distances, None) + .unwrap(); + + distances.to_host(&res, &mut distances_host).unwrap(); + neighbors.to_host(&res, &mut neighbors_host).unwrap(); + + assert_eq!(neighbors_host[[0, 0]], 0); + assert_eq!(neighbors_host[[1, 0]], 1); + // dataset_device is still alive here — compiler ensures it + } } diff --git a/rust/cuvs/src/ivf_flat/index_params.rs b/rust/cuvs/src/ivf_flat/index_params.rs index 523bc7619e..7eac14e413 100644 --- a/rust/cuvs/src/ivf_flat/index_params.rs +++ b/rust/cuvs/src/ivf_flat/index_params.rs @@ -73,6 +73,15 @@ impl IndexParams { } } +impl IndexParams { + /// Returns a builder for constructing [`IndexParams`] with validated parameters. + /// + /// See [`IndexParamsBuilder`] for details. + pub fn builder() -> IndexParamsBuilder { + IndexParamsBuilder::default() + } +} + impl fmt::Debug for IndexParams { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // custom debug trait here, default value will show the pointer address @@ -94,6 +103,101 @@ impl Drop for IndexParams { } } +/// Builder for IVF-Flat [`IndexParams`] with pre-validated parameters. +/// +/// Construct via [`IndexParams::builder()`]. Defaults match the cuVS C API defaults. +pub struct IndexParamsBuilder { + n_lists: u32, + metric: Option, + metric_arg: f32, + kmeans_n_iters: u32, + kmeans_trainset_fraction: f64, + add_data_on_build: bool, +} + +impl Default for IndexParamsBuilder { + fn default() -> Self { + Self { + n_lists: 1024, + metric: None, + metric_arg: 2.0, + kmeans_n_iters: 20, + kmeans_trainset_fraction: 0.5, + add_data_on_build: true, + } + } +} + +impl IndexParamsBuilder { + /// The number of clusters used in the coarse quantizer. + /// + /// Must be > 0. + pub fn n_lists(mut self, v: u32) -> Self { + self.n_lists = v; + self + } + + /// DistanceType to use for building the index. + pub fn metric(mut self, v: DistanceType) -> Self { + self.metric = Some(v); + self + } + + /// Metric argument (e.g. p for Minkowski distance). + pub fn metric_arg(mut self, v: f32) -> Self { + self.metric_arg = v; + self + } + + /// Number of iterations searching for kmeans centers during index building. + pub fn kmeans_n_iters(mut self, v: u32) -> Self { + self.kmeans_n_iters = v; + self + } + + /// Fraction of dataset used for kmeans training. Must be in (0, 1]. + pub fn kmeans_trainset_fraction(mut self, v: f64) -> Self { + self.kmeans_trainset_fraction = v; + self + } + + /// Populate the index with the dataset during build. When false, use `extend`. + pub fn add_data_on_build(mut self, v: bool) -> Self { + self.add_data_on_build = v; + self + } + + /// Validate all parameters without allocating any GPU resources. + pub fn validate(&self) -> crate::error::Result<()> { + if self.n_lists == 0 { + return Err(format!("n_lists must be > 0; got {}", self.n_lists).into()); + } + if self.kmeans_trainset_fraction <= 0.0 || self.kmeans_trainset_fraction > 1.0 { + return Err(format!( + "kmeans_trainset_fraction must be in (0, 1]; got {}", + self.kmeans_trainset_fraction + ) + .into()); + } + Ok(()) + } + + /// Validate all parameters and allocate the FFI struct. + pub fn build(self) -> crate::error::Result { + self.validate()?; + let mut params = IndexParams::new()? + .set_n_lists(self.n_lists) + .set_metric_arg(self.metric_arg) + .set_kmeans_n_iters(self.kmeans_n_iters) + .set_kmeans_trainset_fraction(self.kmeans_trainset_fraction) + .set_add_data_on_build(self.add_data_on_build); + if let Some(metric) = self.metric { + params = params.set_metric(metric); + } + Ok(params) + } +} + #[cfg(test)] mod tests { use super::*; @@ -110,4 +214,22 @@ mod tests { assert_eq!((*params.0).add_data_on_build, false); } } + + #[test] + fn builder_rejects_zero_n_lists() { + let err = IndexParams::builder().n_lists(0).validate().unwrap_err(); + assert!( + err.to_string().contains("n_lists"), + "error message should name the field: {err}" + ); + } + + #[test] + fn builder_accepts_valid_params() { + assert!(IndexParams::builder() + .n_lists(256) + .kmeans_trainset_fraction(0.5) + .validate() + .is_ok()); + } } diff --git a/rust/cuvs/src/ivf_flat/mod.rs b/rust/cuvs/src/ivf_flat/mod.rs index 7417116965..954c0238d4 100644 --- a/rust/cuvs/src/ivf_flat/mod.rs +++ b/rust/cuvs/src/ivf_flat/mod.rs @@ -28,7 +28,8 @@ //! //! // build the ivf-flat index //! let build_params = IndexParams::new()?; -//! let index = Index::build(&res, &build_params, &dataset)?; +//! let tensor = ManagedTensor::from(&dataset); +//! let index = Index::build(&res, &build_params, &tensor)?; //! println!( //! "Indexed {}x{} datapoints into ivf-flat index", //! n_datapoints, n_features @@ -72,5 +73,5 @@ mod index_params; mod search_params; pub use index::Index; -pub use index_params::IndexParams; +pub use index_params::{IndexParams, IndexParamsBuilder}; pub use search_params::SearchParams; diff --git a/rust/cuvs/src/ivf_pq/index.rs b/rust/cuvs/src/ivf_pq/index.rs index f61e3b771a..9d5638d98b 100644 --- a/rust/cuvs/src/ivf_pq/index.rs +++ b/rust/cuvs/src/ivf_pq/index.rs @@ -3,50 +3,102 @@ * SPDX-License-Identifier: Apache-2.0 */ +use std::ffi::CString; use std::io::{stderr, Write}; +use std::marker::PhantomData; -use crate::dlpack::ManagedTensor; +use crate::dlpack::{DatasetOwnership, ManagedTensor}; use crate::error::{check_cuvs, Result}; +use crate::filters::{Filter, NoFilter}; use crate::ivf_pq::{IndexParams, SearchParams}; use crate::resources::Resources; /// Ivf-Pq ANN Index +/// +/// The lifetime parameter `'a` ensures the dataset outlives the index when built +/// with [`Index::build`]. Use [`Index::build_owned`] for a self-contained index +/// that owns its dataset (e.g., after [`ManagedTensor::to_device`]). +/// +/// # Examples +/// +/// ## Borrowed dataset (compiler enforces lifetime) +/// +/// ```no_run +/// # use cuvs::{ManagedTensor, Resources}; +/// # use cuvs::ivf_pq::{Index, IndexParams}; +/// let res = Resources::new().unwrap(); +/// let arr = ndarray::Array::::zeros((1024, 16)); +/// let params = IndexParams::new().unwrap(); +/// let tensor = ManagedTensor::from(&arr); +/// let index = Index::build(&res, ¶ms, &tensor).unwrap(); +/// // arr and tensor must remain alive while index is in use +/// ``` +/// +/// ## Owned dataset ('static lifetime) +/// +/// ```no_run +/// # use cuvs::{ManagedTensor, Resources}; +/// # use cuvs::ivf_pq::{Index, IndexParams}; +/// let res = Resources::new().unwrap(); +/// let arr = ndarray::Array::::zeros((1024, 16)); +/// let params = IndexParams::new().unwrap(); +/// let device_tensor = ManagedTensor::from(&arr).to_device(&res).unwrap(); +/// let index = Index::build_owned(&res, ¶ms, device_tensor).unwrap(); +/// drop(arr); // Fine — index owns the device copy +/// ``` #[derive(Debug)] -pub struct Index(ffi::cuvsIvfPqIndex_t); +pub struct Index<'a> { + inner: ffi::cuvsIvfPqIndex_t, + _data: DatasetOwnership<'a>, +} -impl Index { - /// Builds a new Index from the dataset for efficient search. +impl<'a> Index<'a> { + /// Creates a new FFI index handle. + fn create_handle() -> Result { + unsafe { + let mut index = std::mem::MaybeUninit::::uninit(); + check_cuvs(ffi::cuvsIvfPqIndexCreate(index.as_mut_ptr()))?; + Ok(index.assume_init()) + } + } + + /// Builds a new IVF-PQ Index from a borrowed dataset. + /// + /// The compiler enforces that `dataset` outlives the returned index, + /// preventing use-after-free when the C library dereferences its + /// internal view of the data. /// /// # Arguments /// /// * `res` - Resources to use /// * `params` - Parameters for building the index /// * `dataset` - A row-major matrix on either the host or device to index - pub fn build>( + pub fn build( res: &Resources, params: &IndexParams, - dataset: T, - ) -> Result { - let dataset: ManagedTensor = dataset.into(); - let index = Index::new()?; + dataset: &'a ManagedTensor, + ) -> Result> { + let inner = Self::create_handle()?; unsafe { check_cuvs(ffi::cuvsIvfPqBuild( res.0, params.0, dataset.as_ptr(), - index.0, + inner, ))?; } - Ok(index) + Ok(Index { + inner, + _data: DatasetOwnership::Borrowed(PhantomData), + }) } /// Creates a new empty index - pub fn new() -> Result { - unsafe { - let mut index = std::mem::MaybeUninit::::uninit(); - check_cuvs(ffi::cuvsIvfPqIndexCreate(index.as_mut_ptr()))?; - Ok(Index(index.assume_init())) - } + pub fn new() -> Result> { + Ok(Index { + inner: Self::create_handle()?, + _data: DatasetOwnership::Borrowed(PhantomData), + }) } /// Perform a Approximate Nearest Neighbors search on the Index @@ -58,6 +110,7 @@ impl Index { /// * `queries` - A matrix in device memory to query for /// * `neighbors` - Matrix in device memory that receives the indices of the nearest neighbors /// * `distances` - Matrix in device memory that receives the distances of the nearest neighbors + /// * `filter` - Optional filter to apply to the search (defaults to NoFilter if not provided) pub fn search( &self, res: &Resources, @@ -65,23 +118,106 @@ impl Index { queries: &ManagedTensor, neighbors: &ManagedTensor, distances: &ManagedTensor, + filter: Option<&dyn Filter>, ) -> Result<()> { unsafe { + let filter_ffi = filter + .map(|f| f.into_ffi()) + .unwrap_or_else(|| NoFilter.into_ffi()); + check_cuvs(ffi::cuvsIvfPqSearch( res.0, params.0, - self.0, + self.inner, queries.as_ptr(), neighbors.as_ptr(), distances.as_ptr(), + filter_ffi, )) } } + + /// Save the IVF-PQ index to file. + /// + /// Experimental, both the API and the serialization format are subject to change. + /// + /// # Arguments + /// + /// * `res` - Resources to use + /// * `filename` - The file name for saving the index + pub fn serialize(&self, res: &Resources, filename: &str) -> Result<()> { + let c_filename = CString::new(filename).expect("filename contains null byte"); + unsafe { + check_cuvs(ffi::cuvsIvfPqSerialize( + res.0, + c_filename.as_ptr(), + self.inner, + )) + } + } + + /// Load an IVF-PQ index from file. + /// + /// Experimental, both the API and the serialization format are subject to change. + /// + /// # Arguments + /// + /// * `res` - Resources to use + /// * `filename` - The name of the file that stores the index + pub fn deserialize(res: &Resources, filename: &str) -> Result> { + let c_filename = CString::new(filename).expect("filename contains null byte"); + let inner = Self::create_handle()?; + unsafe { + check_cuvs(ffi::cuvsIvfPqDeserialize( + res.0, + c_filename.as_ptr(), + inner, + ))?; + } + Ok(Index { + inner, + _data: DatasetOwnership::Borrowed(PhantomData), + }) + } } -impl Drop for Index { +impl Index<'static> { + /// Builds a new IVF-PQ Index from an owned dataset. + /// + /// The index takes ownership of `dataset`, making it self-contained + /// with a `'static` lifetime. This is useful when the dataset is a + /// device copy (from [`ManagedTensor::to_device`]) that should live + /// as long as the index. + /// + /// # Arguments + /// + /// * `res` - Resources to use + /// * `params` - Parameters for building the index + /// * `dataset` - A row-major matrix to index (ownership transferred to the index) + pub fn build_owned( + res: &Resources, + params: &IndexParams, + dataset: ManagedTensor, + ) -> Result> { + let inner = Self::create_handle()?; + unsafe { + check_cuvs(ffi::cuvsIvfPqBuild( + res.0, + params.0, + dataset.as_ptr(), + inner, + ))?; + } + Ok(Index { + inner, + _data: DatasetOwnership::Owned(dataset), + }) + } +} + +impl Drop for Index<'_> { fn drop(&mut self) { - if let Err(e) = check_cuvs(unsafe { ffi::cuvsIvfPqIndexDestroy(self.0) }) { + if let Err(e) = check_cuvs(unsafe { ffi::cuvsIvfPqIndexDestroy(self.inner) }) { write!(stderr(), "failed to call cuvsIvfPqIndexDestroy {:?}", e) .expect("failed to write to stderr"); } @@ -109,8 +245,8 @@ mod tests { let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap(); - // build the ivf-pq index - let index = Index::build(&res, &build_params, dataset_device) + // build the ivf-pq index (owned — device copy lives in the index) + let index = Index::build_owned(&res, &build_params, dataset_device) .expect("failed to create ivf-pq index"); // use the first 4 points from the dataset as queries : will test that we get them back @@ -137,7 +273,7 @@ mod tests { let search_params = SearchParams::new().unwrap(); index - .search(&res, &search_params, &queries, &neighbors, &distances) + .search(&res, &search_params, &queries, &neighbors, &distances, None) .unwrap(); // Copy back to host memory @@ -167,8 +303,8 @@ mod tests { let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap(); - // Build the index once - let index = Index::build(&res, &build_params, dataset_device) + // Build the index once (owned) + let index = Index::build_owned(&res, &build_params, dataset_device) .expect("failed to create ivf-pq index"); let search_params = SearchParams::new().unwrap(); @@ -192,8 +328,8 @@ mod tests { // This should work on every iteration because search() takes &self index - .search(&res, &search_params, &queries, &neighbors, &distances) - .expect(&format!("search iteration {} failed", search_iter)); + .search(&res, &search_params, &queries, &neighbors, &distances, None) + .unwrap_or_else(|e| panic!("search iteration {} failed: {}", search_iter, e)); // Copy back to host memory distances.to_host(&res, &mut distances_host).unwrap(); @@ -208,4 +344,104 @@ mod tests { ); } } + + #[test] + fn test_ivf_pq_serialize_deserialize() { + let res = Resources::new().unwrap(); + let build_params = IndexParams::new().unwrap().set_n_lists(64); + + let n_datapoints = 1024; + let n_features = 16; + let dataset = + ndarray::Array::::random((n_datapoints, n_features), Uniform::new(0., 1.0)); + + let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap(); + let index = Index::build_owned(&res, &build_params, dataset_device) + .expect("failed to create ivf-pq index"); + + let dir = std::env::temp_dir(); + let filepath = dir.join("test_ivf_pq_index.bin"); + let filepath_str = filepath.to_str().unwrap(); + index + .serialize(&res, filepath_str) + .expect("failed to serialize ivf-pq index"); + + assert!(filepath.exists()); + assert!(std::fs::metadata(&filepath).unwrap().len() > 0); + + let loaded_index: Index<'_> = + Index::deserialize(&res, filepath_str).expect("failed to deserialize ivf-pq index"); + + let n_queries = 4; + let k = 10; + let queries = dataset.slice(s![0..n_queries, ..]); + let queries = ManagedTensor::from(&queries).to_device(&res).unwrap(); + let mut neighbors_host = ndarray::Array::::zeros((n_queries, k)); + let neighbors = ManagedTensor::from(&neighbors_host) + .to_device(&res) + .unwrap(); + let mut distances_host = ndarray::Array::::zeros((n_queries, k)); + let distances = ManagedTensor::from(&distances_host) + .to_device(&res) + .unwrap(); + + let search_params = SearchParams::new().unwrap(); + loaded_index + .search(&res, &search_params, &queries, &neighbors, &distances, None) + .expect("failed to search deserialized index"); + + distances.to_host(&res, &mut distances_host).unwrap(); + neighbors.to_host(&res, &mut neighbors_host).unwrap(); + + assert_eq!(neighbors_host[[0, 0]], 0); + assert_eq!(neighbors_host[[1, 0]], 1); + + let _ = std::fs::remove_file(&filepath); + } + + /// Test that an index built with build (borrowed) ties the dataset lifetime. + #[test] + fn test_ivf_pq_borrowed_build() { + let res = Resources::new().unwrap(); + let build_params = IndexParams::new().unwrap().set_n_lists(64); + + let n_datapoints = 1024; + let n_features = 16; + let dataset = + ndarray::Array::::random((n_datapoints, n_features), Uniform::new(0., 1.0)); + + // Create a device tensor and borrow it for the index + let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap(); + let index = Index::build(&res, &build_params, &dataset_device) + .expect("failed to create ivf-pq index"); + + // Search while the borrowed dataset is still alive + let n_queries = 4; + let k = 5; + let queries = dataset.slice(s![0..n_queries, ..]); + let queries = ManagedTensor::from(&queries).to_device(&res).unwrap(); + + let mut neighbors_host = ndarray::Array::::zeros((n_queries, k)); + let neighbors = ManagedTensor::from(&neighbors_host) + .to_device(&res) + .unwrap(); + + let mut distances_host = ndarray::Array::::zeros((n_queries, k)); + let distances = ManagedTensor::from(&distances_host) + .to_device(&res) + .unwrap(); + + let search_params = SearchParams::new().unwrap(); + + index + .search(&res, &search_params, &queries, &neighbors, &distances, None) + .unwrap(); + + distances.to_host(&res, &mut distances_host).unwrap(); + neighbors.to_host(&res, &mut neighbors_host).unwrap(); + + assert_eq!(neighbors_host[[0, 0]], 0); + assert_eq!(neighbors_host[[1, 0]], 1); + // dataset_device is still alive here — compiler ensures it + } } diff --git a/rust/cuvs/src/ivf_pq/index_params.rs b/rust/cuvs/src/ivf_pq/index_params.rs index e1f2d53656..7e276d3d4a 100644 --- a/rust/cuvs/src/ivf_pq/index_params.rs +++ b/rust/cuvs/src/ivf_pq/index_params.rs @@ -149,6 +149,15 @@ impl IndexParams { } } +impl IndexParams { + /// Returns a builder for constructing [`IndexParams`] with validated parameters. + /// + /// See [`IndexParamsBuilder`] for details. + pub fn builder() -> IndexParamsBuilder { + IndexParamsBuilder::default() + } +} + impl fmt::Debug for IndexParams { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // custom debug trait here, default value will show the pointer address @@ -170,6 +179,159 @@ impl Drop for IndexParams { } } +/// Builder for IVF-PQ [`IndexParams`] with pre-validated parameters. +/// +/// Construct via [`IndexParams::builder()`]. Defaults match the cuVS C API defaults. +pub struct IndexParamsBuilder { + n_lists: u32, + metric: Option, + metric_arg: f32, + kmeans_n_iters: u32, + kmeans_trainset_fraction: f64, + pq_bits: u32, + pq_dim: u32, + codebook_kind: Option, + codes_layout: Option, + force_random_rotation: bool, + max_train_points_per_pq_code: u32, + add_data_on_build: bool, +} + +impl Default for IndexParamsBuilder { + fn default() -> Self { + Self { + n_lists: 1024, + metric: None, + metric_arg: 2.0, + kmeans_n_iters: 20, + kmeans_trainset_fraction: 0.5, + pq_bits: 8, + pq_dim: 0, + codebook_kind: None, + codes_layout: None, + force_random_rotation: false, + max_train_points_per_pq_code: 256, + add_data_on_build: true, + } + } +} + +impl IndexParamsBuilder { + /// The number of clusters used in the coarse quantizer. + /// + /// Must be > 0. + pub fn n_lists(mut self, v: u32) -> Self { + self.n_lists = v; + self + } + + /// DistanceType to use for building the index. + pub fn metric(mut self, v: crate::distance_type::DistanceType) -> Self { + self.metric = Some(v); + self + } + + /// Metric argument (e.g. p for Minkowski distance). + pub fn metric_arg(mut self, v: f32) -> Self { + self.metric_arg = v; + self + } + + /// Number of iterations searching for kmeans centers during index building. + pub fn kmeans_n_iters(mut self, v: u32) -> Self { + self.kmeans_n_iters = v; + self + } + + /// Fraction of dataset used for kmeans training. Must be in (0, 1]. + pub fn kmeans_trainset_fraction(mut self, v: f64) -> Self { + self.kmeans_trainset_fraction = v; + self + } + + /// Bit length of the vector element after quantization. Typically 4 or 8. + pub fn pq_bits(mut self, v: u32) -> Self { + self.pq_bits = v; + self + } + + /// Dimensionality of the vector after product quantization. 0 = auto. + pub fn pq_dim(mut self, v: u32) -> Self { + self.pq_dim = v; + self + } + + /// Codebook generation method. + pub fn codebook_kind(mut self, v: cuvsIvfPqCodebookGen) -> Self { + self.codebook_kind = Some(v); + self + } + + /// Memory layout of IVF-PQ list data. + pub fn codes_layout(mut self, v: cuvsIvfPqListLayout) -> Self { + self.codes_layout = Some(v); + self + } + + /// Apply a random rotation matrix on input data and queries. + pub fn force_random_rotation(mut self, v: bool) -> Self { + self.force_random_rotation = v; + self + } + + /// Max number of data points per PQ code during codebook training. + pub fn max_train_points_per_pq_code(mut self, v: u32) -> Self { + self.max_train_points_per_pq_code = v; + self + } + + /// Populate the index with the dataset during build. When false, use `extend`. + pub fn add_data_on_build(mut self, v: bool) -> Self { + self.add_data_on_build = v; + self + } + + /// Validate all parameters without allocating any GPU resources. + pub fn validate(&self) -> crate::error::Result<()> { + if self.n_lists == 0 { + return Err(format!("n_lists must be > 0; got {}", self.n_lists).into()); + } + if self.kmeans_trainset_fraction <= 0.0 || self.kmeans_trainset_fraction > 1.0 { + return Err(format!( + "kmeans_trainset_fraction must be in (0, 1]; got {}", + self.kmeans_trainset_fraction + ) + .into()); + } + Ok(()) + } + + /// Validate all parameters and allocate the FFI struct. + pub fn build(self) -> crate::error::Result { + self.validate()?; + let mut params = IndexParams::new()? + .set_n_lists(self.n_lists) + .set_metric_arg(self.metric_arg) + .set_kmeans_n_iters(self.kmeans_n_iters) + .set_kmeans_trainset_fraction(self.kmeans_trainset_fraction) + .set_pq_bits(self.pq_bits) + .set_pq_dim(self.pq_dim) + .set_force_random_rotation(self.force_random_rotation) + .set_max_train_points_per_pq_code(self.max_train_points_per_pq_code) + .set_add_data_on_build(self.add_data_on_build); + if let Some(metric) = self.metric { + params = params.set_metric(metric); + } + if let Some(kind) = self.codebook_kind { + params = params.set_codebook_kind(kind); + } + if let Some(layout) = self.codes_layout { + params = params.set_codes_layout(layout); + } + Ok(params) + } +} + #[cfg(test)] mod tests { use super::*; @@ -186,4 +348,34 @@ mod tests { assert_eq!((*params.0).add_data_on_build, false); } } + + #[test] + fn builder_rejects_zero_n_lists() { + let err = IndexParams::builder().n_lists(0).validate().unwrap_err(); + assert!( + err.to_string().contains("n_lists"), + "error message should name the field: {err}" + ); + } + + #[test] + fn builder_accepts_valid_params() { + assert!(IndexParams::builder() + .n_lists(256) + .kmeans_trainset_fraction(0.5) + .validate() + .is_ok()); + } + + #[test] + fn existing_setter_api_unchanged() { + let params = IndexParams::new() + .unwrap() + .set_n_lists(128) + .set_add_data_on_build(false); + unsafe { + assert_eq!((*params.0).n_lists, 128); + assert_eq!((*params.0).add_data_on_build, false); + } + } } diff --git a/rust/cuvs/src/ivf_pq/mod.rs b/rust/cuvs/src/ivf_pq/mod.rs index c4676cd1aa..bdabe0142e 100644 --- a/rust/cuvs/src/ivf_pq/mod.rs +++ b/rust/cuvs/src/ivf_pq/mod.rs @@ -25,7 +25,8 @@ //! //! // build the ivf-pq index //! let build_params = IndexParams::new()?; -//! let index = Index::build(&res, &build_params, &dataset)?; +//! let tensor = ManagedTensor::from(&dataset); +//! let index = Index::build(&res, &build_params, &tensor)?; //! println!( //! "Indexed {}x{} datapoints into ivf-pq index", //! n_datapoints, n_features @@ -69,5 +70,5 @@ mod index_params; mod search_params; pub use index::Index; -pub use index_params::IndexParams; +pub use index_params::{IndexParams, IndexParamsBuilder}; pub use search_params::SearchParams; diff --git a/rust/cuvs/src/lib.rs b/rust/cuvs/src/lib.rs index f085915680..a3c73e2e45 100644 --- a/rust/cuvs/src/lib.rs +++ b/rust/cuvs/src/lib.rs @@ -14,6 +14,7 @@ pub mod distance; pub mod distance_type; mod dlpack; mod error; +pub mod filters; pub mod ivf_flat; pub mod ivf_pq; mod resources; diff --git a/rust/cuvs/src/vamana/index_params.rs b/rust/cuvs/src/vamana/index_params.rs index c52c287238..3138390a96 100644 --- a/rust/cuvs/src/vamana/index_params.rs +++ b/rust/cuvs/src/vamana/index_params.rs @@ -96,6 +96,15 @@ impl IndexParams { } } +impl IndexParams { + /// Returns a builder for constructing [`IndexParams`] with validated parameters. + /// + /// See [`IndexParamsBuilder`] for details. + pub fn builder() -> IndexParamsBuilder { + IndexParamsBuilder::default() + } +} + impl fmt::Debug for IndexParams { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // custom debug trait here, default value will show the pointer address @@ -117,6 +126,135 @@ impl Drop for IndexParams { } } +/// Builder for Vamana [`IndexParams`] with pre-validated parameters. +/// +/// Construct via [`IndexParams::builder()`]. Defaults match the cuVS C API defaults. +pub struct IndexParamsBuilder { + graph_degree: u32, + visited_size: u32, + vamana_iters: f32, + alpha: f32, + max_fraction: f32, + batch_base: f32, + queue_size: u32, + reverse_batchsize: u32, + metric: Option, +} + +impl Default for IndexParamsBuilder { + fn default() -> Self { + Self { + graph_degree: 64, + visited_size: 75, + vamana_iters: 2.0, + alpha: 1.2, + max_fraction: 0.06, + batch_base: 2.0, + queue_size: 255, + reverse_batchsize: 1_000_000, + metric: None, + } + } +} + +impl IndexParamsBuilder { + /// Maximum degree of the output graph (R parameter in Vamana literature). + /// + /// Must be > 0. + pub fn graph_degree(mut self, v: u32) -> Self { + self.graph_degree = v; + self + } + + /// Maximum number of visited nodes per search (L parameter in Vamana literature). + /// + /// Must be >= `graph_degree`. + pub fn visited_size(mut self, v: u32) -> Self { + self.visited_size = v; + self + } + + /// Number of Vamana vector insertion iterations. + pub fn vamana_iters(mut self, v: f32) -> Self { + self.vamana_iters = v; + self + } + + /// Alpha for pruning parameter. + /// + /// Must be > 0. + pub fn alpha(mut self, v: f32) -> Self { + self.alpha = v; + self + } + + /// Maximum fraction of dataset inserted per batch. + pub fn max_fraction(mut self, v: f32) -> Self { + self.max_fraction = v; + self + } + + /// Base of growth rate of batch sizes. + pub fn batch_base(mut self, v: f32) -> Self { + self.batch_base = v; + self + } + + /// Size of candidate queue structure. + pub fn queue_size(mut self, v: u32) -> Self { + self.queue_size = v; + self + } + + /// Max batchsize of reverse edge processing. + pub fn reverse_batchsize(mut self, v: u32) -> Self { + self.reverse_batchsize = v; + self + } + + /// DistanceType to use for building the index. + pub fn metric(mut self, v: DistanceType) -> Self { + self.metric = Some(v); + self + } + + /// Validate all parameters without allocating any GPU resources. + pub fn validate(&self) -> crate::error::Result<()> { + if self.graph_degree == 0 { + return Err(format!("graph_degree must be > 0; got {}", self.graph_degree).into()); + } + if self.visited_size < self.graph_degree { + return Err(format!( + "visited_size ({}) must be >= graph_degree ({})", + self.visited_size, self.graph_degree + ) + .into()); + } + if self.alpha <= 0.0 { + return Err(format!("alpha must be > 0; got {}", self.alpha).into()); + } + Ok(()) + } + + /// Validate all parameters and allocate the FFI struct. + pub fn build(self) -> crate::error::Result { + self.validate()?; + let mut params = IndexParams::new()? + .set_graph_degree(self.graph_degree) + .set_visited_size(self.visited_size) + .set_vamana_iters(self.vamana_iters) + .set_alpha(self.alpha) + .set_max_fraction(self.max_fraction) + .set_batch_base(self.batch_base) + .set_queue_size(self.queue_size) + .set_reverse_batchsize(self.reverse_batchsize); + if let Some(metric) = self.metric { + params = params.set_metric(metric); + } + Ok(params) + } +} + #[cfg(test)] mod tests { use super::*; @@ -133,4 +271,39 @@ mod tests { assert_eq!((*params.0).visited_size, 128); } } + + #[test] + fn builder_rejects_zero_graph_degree() { + let err = IndexParams::builder() + .graph_degree(0) + .validate() + .unwrap_err(); + assert!( + err.to_string().contains("graph_degree"), + "error message should name the field: {err}" + ); + } + + #[test] + fn builder_rejects_visited_size_less_than_graph_degree() { + let err = IndexParams::builder() + .graph_degree(64) + .visited_size(32) + .validate() + .unwrap_err(); + assert!( + err.to_string().contains("visited_size"), + "error message should name the field: {err}" + ); + } + + #[test] + fn builder_accepts_valid_params() { + assert!(IndexParams::builder() + .graph_degree(32) + .visited_size(75) + .alpha(1.2) + .validate() + .is_ok()); + } } diff --git a/rust/cuvs/src/vamana/mod.rs b/rust/cuvs/src/vamana/mod.rs index a3ae4ee9ff..631dd37d0e 100644 --- a/rust/cuvs/src/vamana/mod.rs +++ b/rust/cuvs/src/vamana/mod.rs @@ -8,4 +8,4 @@ mod index; mod index_params; pub use index::Index; -pub use index_params::IndexParams; +pub use index_params::{IndexParams, IndexParamsBuilder};