diff --git a/benchmarks/cagra_recall_throughput_bench.py b/benchmarks/cagra_recall_throughput_bench.py
new file mode 100644
index 0000000000..2339008767
--- /dev/null
+++ b/benchmarks/cagra_recall_throughput_bench.py
@@ -0,0 +1,282 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+CAGRA Recall/Throughput Benchmark: SINGLE_CTA vs MULTI_CTA with search_width > 1
+
+Measures recall@10 and throughput (QPS) across configurations to evaluate
+the impact of the AUTO algorithm selector when search_width > 1.
+
+Configurations tested:
+  1. MULTI_CTA (default max_iterations) — reference baseline
+  2. SINGLE_CTA (default max_iterations) — shows the recall problem
+  3. SINGLE_CTA (floor at 32 base iterations) — alternative fix
+  4. AUTO (current code) — shows when the switch happens
+
+For each: sweep batch_size=[1, 64, 256, 512, 1024] x search_width=[1, 4, 8]
+"""
+
+import time
+
+import numpy as np
+from cuvs.neighbors import brute_force, cagra
+from pylibraft.common import device_ndarray
+
+
+# --- Dataset parameters ---
+N_SAMPLES = 100_000
+N_QUERIES = 2048
+DIM = 128
+K = 10
+N_WARMUP = 3
+N_TIMED = 10
+
+# --- Search configurations ---
+BATCH_SIZES = [1, 64, 256, 512, 1024]
+SEARCH_WIDTHS = [1, 4, 8]
+ITOPK_SIZE = 64
+GRAPH_DEGREE = 64
+
+
+def compute_reachability_iters(dataset_size, graph_degree):
+    """Replicate the C++ reachability iteration calculation."""
+    iters = 0
+    reachable = 1
+    while reachable < dataset_size:
+        reachable *= max(2, graph_degree // 2)
+        iters += 1
+    return iters
+
+
+def compute_max_iterations(
+    algo, itopk_size, search_width, dataset_size, graph_degree
+):
+    """Replicate the C++ max_iterations calculation for each algorithm."""
+    reach = compute_reachability_iters(dataset_size, graph_degree)
+    if algo == "multi_cta":
+        base = 32  # mc_itopk_size / mc_search_width = 32/1
+    elif algo == "single_cta_default":
+        base = itopk_size // search_width
+    elif algo == "single_cta_floor32":
+        # Alternative fix: floor base iterations at 32 (match MULTI_CTA)
+        base = max(itopk_size // search_width, 32)
+    else:
+        return 0  # Let cuVS calculate (AUTO mode)
+    return base + reach
+
+
+def compute_recall(results, ground_truth, k):
+    """Compute recall@k: fraction of true k-NN found in results."""
+    n = results.shape[0]
+    recall = 0.0
+    for i in range(n):
+        gt_set = set(ground_truth[i, :k].tolist())
+        res_set = set(results[i, :k].tolist())
+        recall += len(gt_set & res_set) / k
+    return recall / n
+
+
+def run_benchmark():
+    np.random.seed(42)
+
+    print("Generating dataset...")
+    dataset = np.random.random((N_SAMPLES, DIM)).astype(np.float32)
+    queries = dataset[:N_QUERIES].copy()
+
+    # Move dataset to device
+    dataset_device = device_ndarray(dataset)
+    queries_device = device_ndarray(queries)
+
+    # Build brute-force ground truth
+    print("Computing brute-force ground truth...")
+    bf_index = brute_force.build(dataset_device)
+    gt_distances, gt_neighbors = brute_force.search(
+        bf_index, queries_device, k=K
+    )
+    gt_neighbors_host = gt_neighbors.copy_to_host()
+
+    # Build CAGRA index
+    print("Building CAGRA index...")
+    index_params = cagra.IndexParams(
+        intermediate_graph_degree=128,
+        graph_degree=GRAPH_DEGREE,
+    )
+    cagra_index = cagra.build(index_params, dataset_device)
+    print(
+        f"  Index built: {N_SAMPLES} vectors, dim={DIM}, graph_degree={GRAPH_DEGREE}"
+    )
+
+    reach_iters = compute_reachability_iters(N_SAMPLES, GRAPH_DEGREE)
+    print(f"  Reachability iterations: {reach_iters}")
+    print()
+
+    # Define test configurations
+    configs = [
+        ("MULTI_CTA", "multi_cta"),
+        ("SINGLE_CTA (default)", "single_cta_default"),
+        ("SINGLE_CTA (floor@32)", "single_cta_floor32"),
+        ("AUTO", "auto"),
+    ]
+
+    # Header
+    print(
+        f"{'Config':<25} {'sw':>3} {'batch':>6} {'max_iter':>9} "
+        f"{'recall@10':>10} {'QPS':>10}"
+    )
+    print("-" * 75)
+
+    results_table = []
+
+    for search_width in SEARCH_WIDTHS:
+        for batch_size in BATCH_SIZES:
+            batch_queries = queries[:batch_size]
+            batch_gt = gt_neighbors_host[:batch_size]
+
+            for config_name, config_key in configs:
+                max_iter = compute_max_iterations(
+                    config_key,
+                    ITOPK_SIZE,
+                    search_width,
+                    N_SAMPLES,
+                    GRAPH_DEGREE,
+                )
+
+                # Build SearchParams — algo must be set in constructor
+                if config_key == "multi_cta":
+                    search_params = cagra.SearchParams(
+                        algo="multi_cta",
+                        itopk_size=ITOPK_SIZE,
+                        search_width=search_width,
+                        max_iterations=max_iter,
+                    )
+                elif config_key.startswith("single_cta"):
+                    search_params = cagra.SearchParams(
+                        algo="single_cta",
+                        itopk_size=ITOPK_SIZE,
+                        search_width=search_width,
+                        max_iterations=max_iter,
+                    )
+                else:
+                    # AUTO: let cuVS decide algorithm and max_iterations
+                    search_params = cagra.SearchParams(
+                        algo="auto",
+                        itopk_size=ITOPK_SIZE,
+                        search_width=search_width,
+                    )
+
+                batch_device = device_ndarray(batch_queries)
+
+                # Warmup
+                for _ in range(N_WARMUP):
+                    _, neighbors = cagra.search(
+                        search_params, cagra_index, batch_device, k=K
+                    )
+
+                # Timed runs
+                elapsed = 0.0
+                for _ in range(N_TIMED):
+                    start = time.perf_counter()
+                    _, neighbors = cagra.search(
+                        search_params, cagra_index, batch_device, k=K
+                    )
+                    elapsed += time.perf_counter() - start
+
+                neighbors_host = neighbors.copy_to_host()
+                recall_val = compute_recall(neighbors_host, batch_gt, K)
+
+                avg_time = elapsed / N_TIMED
+                qps = batch_size / avg_time if avg_time > 0 else 0
+
+                iter_str = str(max_iter) if max_iter > 0 else "auto"
+                print(
+                    f"{config_name:<25} {search_width:>3} {batch_size:>6} "
+                    f"{iter_str:>9} {recall_val:>10.4f} {qps:>10.0f}"
+                )
+
+                results_table.append(
+                    {
+                        "config": config_name,
+                        "search_width": search_width,
+                        "batch_size": batch_size,
+                        "max_iterations": max_iter,
+                        "recall": recall_val,
+                        "qps": qps,
+                    }
+                )
+
+        print()  # Blank line between search_width groups
+
+    # Summary analysis
+    print("\n" + "=" * 75)
+    print("ANALYSIS: Recall delta at algorithm switch point (batch_size=512)")
+    print("=" * 75)
+    for sw in SEARCH_WIDTHS:
+        multi = [
+            r
+            for r in results_table
+            if r["config"] == "MULTI_CTA"
+            and r["search_width"] == sw
+            and r["batch_size"] == 512
+        ]
+        single_def = [
+            r
+            for r in results_table
+            if r["config"] == "SINGLE_CTA (default)"
+            and r["search_width"] == sw
+            and r["batch_size"] == 512
+        ]
+        single_fix = [
+            r
+            for r in results_table
+            if r["config"] == "SINGLE_CTA (floor@32)"
+            and r["search_width"] == sw
+            and r["batch_size"] == 512
+        ]
+        auto = [
+            r
+            for r in results_table
+            if r["config"] == "AUTO"
+            and r["search_width"] == sw
+            and r["batch_size"] == 512
+        ]
+
+        if multi and single_def and single_fix and auto:
+            m = multi[0]
+            sd = single_def[0]
+            sf = single_fix[0]
+            a = auto[0]
+            print(f"\n  search_width={sw}:")
+            print(
+                f"    MULTI_CTA:              recall={m['recall']:.4f}  QPS={m['qps']:.0f}"
+            )
+            print(
+                f"    SINGLE_CTA (default):   recall={sd['recall']:.4f}  QPS={sd['qps']:.0f}  "
+                f"(recall delta: {sd['recall'] - m['recall']:+.4f})"
+            )
+            print(
+                f"    SINGLE_CTA (floor@32):  recall={sf['recall']:.4f}  QPS={sf['qps']:.0f}  "
+                f"(recall delta: {sf['recall'] - m['recall']:+.4f})"
+            )
+            print(
+                f"    AUTO (current code):    recall={a['recall']:.4f}  QPS={a['qps']:.0f}  "
+                f"(recall delta: {a['recall'] - m['recall']:+.4f})"
+            )
+
+            if sd["qps"] > 0 and m["qps"] > 0:
+                print(
+                    f"    Throughput gain (SINGLE_CTA default vs MULTI_CTA): "
+                    f"{sd['qps'] / m['qps']:.2f}x"
+                )
+            if sf["qps"] > 0 and m["qps"] > 0:
+                print(
+                    f"    Throughput gain (SINGLE_CTA floor@32 vs MULTI_CTA): "
+                    f"{sf['qps'] / m['qps']:.2f}x"
+                )
+
+    print("\n\nKey question: Does SINGLE_CTA (floor@32) recover recall while")
+    print("preserving throughput advantage over MULTI_CTA?")
+
+
+if __name__ == "__main__":
+    run_benchmark()
diff --git a/c/include/cuvs/neighbors/cagra.h b/c/include/cuvs/neighbors/cagra.h
index af6c66c9d4..9664383bc6 100644
--- a/c/include/cuvs/neighbors/cagra.h
+++ b/c/include/cuvs/neighbors/cagra.h
@@ -222,6 +222,19 @@ struct cuvsCagraIndexParams {
    * - Others: nullptr
    */
   void* graph_build_params;
+  /**
+   * Whether to add the dataset content to the index after building the graph.
+   *
+   *  - true (default): the index is filled with the dataset vectors and ready to search
+   *    after build, but requires enough memory to hold an aligned copy of the dataset.
+   *  - false: only the search graph is built. The user must call cuvsCagraUpdateDataset
+   *    to attach the dataset before searching. This avoids duplicating the dataset in
+   *    device memory during build, which is useful for memory-constrained scenarios.
+   *
+   * When compression is set, this parameter is ignored (compressed dataset is always
+   * added to the index).
+   */
+  bool attach_dataset_on_build;
 };
 
 typedef struct cuvsCagraIndexParams* cuvsCagraIndexParams_t;
@@ -617,6 +630,26 @@ cuvsError_t cuvsCagraBuild(cuvsResources_t res,
                            DLManagedTensor* dataset,
                            cuvsCagraIndex_t index);
 
+/**
+ * @brief Update (attach) a dataset to an existing CAGRA index.
+ *
+ * This is intended for use after building an index with attach_dataset_on_build = false.
+ * If the dataset rows are already aligned on 16 bytes and reside on the device, only a
+ * reference is stored (zero-copy). Otherwise, an aligned copy is made.
+ *
+ * It is the caller's responsibility to ensure that the same dataset used for building
+ * is supplied here. The dataset must remain valid for the lifetime of the index when
+ * zero-copy is used.
+ *
+ * @param[in] res cuvsResources_t opaque C handle
+ * @param[in] dataset DLManagedTensor* dataset to attach
+ * @param[in] index cuvsCagraIndex_t the index to update
+ * @return cuvsError_t
+ */
+cuvsError_t cuvsCagraUpdateDataset(cuvsResources_t res,
+                                   DLManagedTensor* dataset,
+                                   cuvsCagraIndex_t index);
+
 /**
  * @}
  */
diff --git a/c/include/cuvs/neighbors/ivf_pq.h b/c/include/cuvs/neighbors/ivf_pq.h
index c780ab8065..534b53eed8 100644
--- a/c/include/cuvs/neighbors/ivf_pq.h
+++ b/c/include/cuvs/neighbors/ivf_pq.h
@@ -7,6 +7,7 @@
 
 #include <cuvs/core/c_api.h>
 #include <cuvs/distance/distance.h>
+#include <cuvs/neighbors/common.h>
 #include <dlpack/dlpack.h>
 #include <stdbool.h>
 #include <stdint.h>
@@ -516,8 +517,9 @@ cuvsError_t cuvsIvfPqBuildPrecomputed(cuvsResources_t res,
  * cuvsError_t params_create_status = cuvsIvfPqSearchParamsCreate(&search_params);
  *
  * // Search the `index` built using `cuvsIvfPqBuild`
+ * cuvsFilter filter = {.addr = 0, .type = NO_FILTER};
  * cuvsError_t search_status = cuvsIvfPqSearch(res, search_params, index, &queries, &neighbors,
- * &distances);
+ * &distances, filter);
  *
  * // de-allocate `search_params` and `res`
  * cuvsError_t params_destroy_status = cuvsIvfPqSearchParamsDestroy(search_params);
@@ -530,13 +532,15 @@ cuvsError_t cuvsIvfPqBuildPrecomputed(cuvsResources_t res,
  * @param[in] queries DLManagedTensor* queries dataset to search
  * @param[out] neighbors DLManagedTensor* output `k` neighbors for queries
  * @param[out] distances DLManagedTensor* output `k` distances for queries
+ * @param[in] filter cuvsFilter filter to apply to the search
  */
 cuvsError_t cuvsIvfPqSearch(cuvsResources_t res,
                             cuvsIvfPqSearchParams_t search_params,
                             cuvsIvfPqIndex_t index,
                             DLManagedTensor* queries,
                             DLManagedTensor* neighbors,
-                            DLManagedTensor* distances);
+                            DLManagedTensor* distances,
+                            cuvsFilter filter);
 /**
  * @}
  */
diff --git a/c/src/neighbors/cagra.cpp b/c/src/neighbors/cagra.cpp
index 081179ca46..cb2acf88f5 100644
--- a/c/src/neighbors/cagra.cpp
+++ b/c/src/neighbors/cagra.cpp
@@ -128,6 +128,28 @@ void* _build(cuvsResources_t res, cuvsCagraIndexParams params, DLManagedTensor*
   return index;
 }
 
+template <typename T>
+void _update_dataset(cuvsResources_t res,
+                     cuvsCagraIndex index,
+                     DLManagedTensor* dataset_tensor)
+{
+  auto dataset   = dataset_tensor->dl_tensor;
+  auto res_ptr   = reinterpret_cast<raft::resources*>(res);
+  auto index_ptr = reinterpret_cast<cuvs::neighbors::cagra::index<T, uint32_t>*>(index.addr);
+
+  if (cuvs::core::is_dlpack_device_compatible(dataset)) {
+    using mdspan_type = raft::device_matrix_view<T const, int64_t, raft::row_major>;
+    auto mds          = cuvs::core::from_dlpack<mdspan_type>(dataset_tensor);
+    index_ptr->update_dataset(*res_ptr, mds);
+  } else if (cuvs::core::is_dlpack_host_compatible(dataset)) {
+    using mdspan_type = raft::host_matrix_view<T const, int64_t, raft::row_major>;
+    auto mds          = cuvs::core::from_dlpack<mdspan_type>(dataset_tensor);
+    index_ptr->update_dataset(*res_ptr, mds);
+  } else {
+    RAFT_FAIL("Unsupported dataset DLtensor device type: %d", dataset.device.device_type);
+  }
+}
+
 template <typename T>
 void* _from_args(cuvsResources_t res,
                  cuvsDistanceType _metric,
@@ -221,22 +243,26 @@ void _search(cuvsResources_t res,
   if (filter.type == NO_FILTER) {
     cuvs::neighbors::cagra::search(
       *res_ptr, search_params, *index_ptr, queries_mds, neighbors_mds, distances_mds);
+  } else if (filter.type == BITMAP) {
+    using filter_mdspan_type = raft::device_vector_view<std::uint32_t, int64_t>;
+    using filter_bmp_type    = cuvs::core::bitmap_view<std::uint32_t, int64_t>;
+    auto filter_tensor       = reinterpret_cast<DLManagedTensor*>(filter.addr);
+    auto filter_mds          = cuvs::core::from_dlpack<filter_mdspan_type>(filter_tensor);
+    const auto bitmap_filter_obj = cuvs::neighbors::filtering::bitmap_filter(
+      filter_bmp_type((std::uint32_t*)filter_mds.data_handle(), queries_mds.extent(0), index_ptr->size()));
+    cuvs::neighbors::cagra::search(
+      *res_ptr, search_params, *index_ptr, queries_mds, neighbors_mds, distances_mds, bitmap_filter_obj);
   } else if (filter.type == BITSET) {
-    using filter_mdspan_type    = raft::device_vector_view<std::uint32_t, int64_t, raft::row_major>;
-    auto removed_indices_tensor = reinterpret_cast<DLManagedTensor*>(filter.addr);
-    auto removed_indices = cuvs::core::from_dlpack<filter_mdspan_type>(removed_indices_tensor);
-    cuvs::core::bitset_view<std::uint32_t, int64_t> removed_indices_bitset(
-      removed_indices, index_ptr->dataset().extent(0));
-    auto bitset_filter_obj = cuvs::neighbors::filtering::bitset_filter(removed_indices_bitset);
-    cuvs::neighbors::cagra::search(*res_ptr,
-                                   search_params,
-                                   *index_ptr,
-                                   queries_mds,
-                                   neighbors_mds,
-                                   distances_mds,
-                                   bitset_filter_obj);
+    using filter_mdspan_type = raft::device_vector_view<std::uint32_t, int64_t>;
+    using filter_bst_type    = cuvs::core::bitset_view<std::uint32_t, int64_t>;
+    auto filter_tensor       = reinterpret_cast<DLManagedTensor*>(filter.addr);
+    auto filter_mds          = cuvs::core::from_dlpack<filter_mdspan_type>(filter_tensor);
+    const auto bitset_filter_obj = cuvs::neighbors::filtering::bitset_filter(
+      filter_bst_type((std::uint32_t*)filter_mds.data_handle(), index_ptr->size()));
+    cuvs::neighbors::cagra::search(
+      *res_ptr, search_params, *index_ptr, queries_mds, neighbors_mds, distances_mds, bitset_filter_obj);
   } else {
-    RAFT_FAIL("Unsupported filter type: BITMAP");
+    RAFT_FAIL("Unsupported filter type");
   }
 }
 
@@ -442,6 +468,7 @@ void convert_c_index_params(cuvsCagraIndexParams params,
   out->metric                    = static_cast<cuvs::distance::DistanceType>((int)params.metric);
   out->intermediate_graph_degree = params.intermediate_graph_degree;
   out->graph_degree              = params.graph_degree;
+  out->attach_dataset_on_build   = params.attach_dataset_on_build;
   _set_graph_build_params(out->graph_build_params, params, params.build_algo, n_rows, dim);
 
   if (auto* cparams = params.compression; cparams != nullptr) {
@@ -588,6 +615,27 @@ extern "C" cuvsError_t cuvsCagraBuild(cuvsResources_t res,
   });
 }
 
+extern "C" cuvsError_t cuvsCagraUpdateDataset(cuvsResources_t res,
+                                              DLManagedTensor* dataset_tensor,
+                                              cuvsCagraIndex_t index)
+{
+  return cuvs::core::translate_exceptions([=] {
+    if (index->dtype.code == kDLFloat && index->dtype.bits == 32) {
+      _update_dataset<float>(res, *index, dataset_tensor);
+    } else if (index->dtype.code == kDLFloat && index->dtype.bits == 16) {
+      _update_dataset<half>(res, *index, dataset_tensor);
+    } else if (index->dtype.code == kDLInt && index->dtype.bits == 8) {
+      _update_dataset<int8_t>(res, *index, dataset_tensor);
+    } else if (index->dtype.code == kDLUInt && index->dtype.bits == 8) {
+      _update_dataset<uint8_t>(res, *index, dataset_tensor);
+    } else {
+      RAFT_FAIL("Unsupported index dtype: %d and bits: %d",
+                index->dtype.code,
+                index->dtype.bits);
+    }
+  });
+}
+
 extern "C" cuvsError_t cuvsCagraIndexFromArgs(cuvsResources_t res,
                                               cuvsDistanceType metric,
                                               DLManagedTensor* graph_tensor,
@@ -736,7 +784,10 @@ extern "C" cuvsError_t cuvsCagraIndexParamsCreate(cuvsCagraIndexParams_t* params
                                                              .intermediate_graph_degree = 128,
                                                              .graph_degree              = 64,
                                                              .build_algo                = IVF_PQ,
-                                                             .nn_descent_niter          = 20};
+                                                             .nn_descent_niter          = 20,
+                                                             .compression               = nullptr,
+                                                             .graph_build_params        = nullptr,
+                                                             .attach_dataset_on_build   = true};
     (*params)->graph_build_params = new cuvsIvfPqParams{nullptr, nullptr, 1};
   });
 }
diff --git a/c/src/neighbors/ivf_flat.cpp b/c/src/neighbors/ivf_flat.cpp
index 56a3088e89..58accc7f81 100644
--- a/c/src/neighbors/ivf_flat.cpp
+++ b/c/src/neighbors/ivf_flat.cpp
@@ -90,23 +90,26 @@ void _search(cuvsResources_t res,
   if (filter.type == NO_FILTER) {
     cuvs::neighbors::ivf_flat::search(
       *res_ptr, search_params, *index_ptr, queries_mds, neighbors_mds, distances_mds);
+  } else if (filter.type == BITMAP) {
+    using filter_mdspan_type = raft::device_vector_view<std::uint32_t, int64_t>;
+    using filter_bmp_type    = cuvs::core::bitmap_view<std::uint32_t, int64_t>;
+    auto filter_tensor       = reinterpret_cast<DLManagedTensor*>(filter.addr);
+    auto filter_mds          = cuvs::core::from_dlpack<filter_mdspan_type>(filter_tensor);
+    const auto bitmap_filter_obj = cuvs::neighbors::filtering::bitmap_filter(
+      filter_bmp_type((std::uint32_t*)filter_mds.data_handle(), queries_mds.extent(0), index_ptr->size()));
+    cuvs::neighbors::ivf_flat::search(
+      *res_ptr, search_params, *index_ptr, queries_mds, neighbors_mds, distances_mds, bitmap_filter_obj);
   } else if (filter.type == BITSET) {
-    using filter_mdspan_type    = raft::device_vector_view<std::uint32_t, int64_t, raft::row_major>;
-    auto removed_indices_tensor = reinterpret_cast<DLManagedTensor*>(filter.addr);
-    auto removed_indices = cuvs::core::from_dlpack<filter_mdspan_type>(removed_indices_tensor);
-    cuvs::core::bitset_view<std::uint32_t, int64_t> removed_indices_bitset(removed_indices,
-                                                                           index_ptr->size());
-    auto bitset_filter_obj = cuvs::neighbors::filtering::bitset_filter(removed_indices_bitset);
-    cuvs::neighbors::ivf_flat::search(*res_ptr,
-                                      search_params,
-                                      *index_ptr,
-                                      queries_mds,
-                                      neighbors_mds,
-                                      distances_mds,
-                                      bitset_filter_obj);
-
+    using filter_mdspan_type = raft::device_vector_view<std::uint32_t, int64_t>;
+    using filter_bst_type    = cuvs::core::bitset_view<std::uint32_t, int64_t>;
+    auto filter_tensor       = reinterpret_cast<DLManagedTensor*>(filter.addr);
+    auto filter_mds          = cuvs::core::from_dlpack<filter_mdspan_type>(filter_tensor);
+    const auto bitset_filter_obj = cuvs::neighbors::filtering::bitset_filter(
+      filter_bst_type((std::uint32_t*)filter_mds.data_handle(), index_ptr->size()));
+    cuvs::neighbors::ivf_flat::search(
+      *res_ptr, search_params, *index_ptr, queries_mds, neighbors_mds, distances_mds, bitset_filter_obj);
   } else {
-    RAFT_FAIL("Unsupported filter type: BITMAP");
+    RAFT_FAIL("Unsupported filter type");
   }
 }
 
diff --git a/c/src/neighbors/ivf_pq.cpp b/c/src/neighbors/ivf_pq.cpp
index 8c444431e6..8835a94835 100644
--- a/c/src/neighbors/ivf_pq.cpp
+++ b/c/src/neighbors/ivf_pq.cpp
@@ -83,7 +83,8 @@ void _search(cuvsResources_t res,
              cuvsIvfPqIndex index,
              DLManagedTensor* queries_tensor,
              DLManagedTensor* neighbors_tensor,
-             DLManagedTensor* distances_tensor)
+             DLManagedTensor* distances_tensor,
+             cuvsFilter filter)
 {
   auto res_ptr   = reinterpret_cast<raft::resources*>(res);
   auto index_ptr = reinterpret_cast<cuvs::neighbors::ivf_pq::index<IdxT>*>(index.addr);
@@ -98,8 +99,30 @@ void _search(cuvsResources_t res,
   auto neighbors_mds          = cuvs::core::from_dlpack<neighbors_mdspan_type>(neighbors_tensor);
   auto distances_mds          = cuvs::core::from_dlpack<distances_mdspan_type>(distances_tensor);
 
-  cuvs::neighbors::ivf_pq::search(
-    *res_ptr, search_params, *index_ptr, queries_mds, neighbors_mds, distances_mds);
+  if (filter.type == NO_FILTER) {
+    cuvs::neighbors::ivf_pq::search(
+      *res_ptr, search_params, *index_ptr, queries_mds, neighbors_mds, distances_mds);
+  } else if (filter.type == BITMAP) {
+    using filter_mdspan_type = raft::device_vector_view<std::uint32_t, int64_t>;
+    using filter_bmp_type    = cuvs::core::bitmap_view<std::uint32_t, int64_t>;
+    auto filter_tensor       = reinterpret_cast<DLManagedTensor*>(filter.addr);
+    auto filter_mds          = cuvs::core::from_dlpack<filter_mdspan_type>(filter_tensor);
+    const auto bitmap_filter_obj = cuvs::neighbors::filtering::bitmap_filter(
+      filter_bmp_type((std::uint32_t*)filter_mds.data_handle(), queries_mds.extent(0), index_ptr->size()));
+    cuvs::neighbors::ivf_pq::search(
+      *res_ptr, search_params, *index_ptr, queries_mds, neighbors_mds, distances_mds, bitmap_filter_obj);
+  } else if (filter.type == BITSET) {
+    using filter_mdspan_type = raft::device_vector_view<std::uint32_t, int64_t>;
+    using filter_bst_type    = cuvs::core::bitset_view<std::uint32_t, int64_t>;
+    auto filter_tensor       = reinterpret_cast<DLManagedTensor*>(filter.addr);
+    auto filter_mds          = cuvs::core::from_dlpack<filter_mdspan_type>(filter_tensor);
+    const auto bitset_filter_obj = cuvs::neighbors::filtering::bitset_filter(
+      filter_bst_type((std::uint32_t*)filter_mds.data_handle(), index_ptr->size()));
+    cuvs::neighbors::ivf_pq::search(
+      *res_ptr, search_params, *index_ptr, queries_mds, neighbors_mds, distances_mds, bitset_filter_obj);
+  } else {
+    RAFT_FAIL("Unsupported filter type");
+  }
 }
 
 template <typename IdxT>
@@ -305,7 +328,8 @@ extern "C" cuvsError_t cuvsIvfPqSearch(cuvsResources_t res,
                                        cuvsIvfPqIndex_t index_c_ptr,
                                        DLManagedTensor* queries_tensor,
                                        DLManagedTensor* neighbors_tensor,
-                                       DLManagedTensor* distances_tensor)
+                                       DLManagedTensor* distances_tensor,
+                                       cuvsFilter filter)
 {
   return cuvs::core::translate_exceptions([=] {
     auto queries   = queries_tensor->dl_tensor;
@@ -327,16 +351,16 @@ extern "C" cuvsError_t cuvsIvfPqSearch(cuvsResources_t res,
     auto index = *index_c_ptr;
     if (queries.dtype.code == kDLFloat && queries.dtype.bits == 32) {
       _search<float, int64_t>(
-        res, *params, index, queries_tensor, neighbors_tensor, distances_tensor);
+        res, *params, index, queries_tensor, neighbors_tensor, distances_tensor, filter);
     } else if (queries.dtype.code == kDLFloat && queries.dtype.bits == 16) {
       _search<half, int64_t>(
-        res, *params, index, queries_tensor, neighbors_tensor, distances_tensor);
+        res, *params, index, queries_tensor, neighbors_tensor, distances_tensor, filter);
     } else if (queries.dtype.code == kDLInt && queries.dtype.bits == 8) {
       _search<int8_t, int64_t>(
-        res, *params, index, queries_tensor, neighbors_tensor, distances_tensor);
+        res, *params, index, queries_tensor, neighbors_tensor, distances_tensor, filter);
     } else if (queries.dtype.code == kDLUInt && queries.dtype.bits == 8) {
       _search<uint8_t, int64_t>(
-        res, *params, index, queries_tensor, neighbors_tensor, distances_tensor);
+        res, *params, index, queries_tensor, neighbors_tensor, distances_tensor, filter);
     } else {
       RAFT_FAIL("Unsupported queries DLtensor dtype: %d and bits: %d",
                 queries.dtype.code,
diff --git a/c/src/neighbors/tiered_index.cpp b/c/src/neighbors/tiered_index.cpp
index 2a7d54b16d..1c21b0876a 100644
--- a/c/src/neighbors/tiered_index.cpp
+++ b/c/src/neighbors/tiered_index.cpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -71,6 +71,9 @@ void* _build(cuvsResources_t res, cuvsTieredIndexParams params, DLManagedTensor*
     case CUVS_TIERED_INDEX_ALGO_CAGRA: {
       auto build_params = tiered_index::index_params<cagra::index_params>();
       convert_c_index_params(params, dataset.shape[0], dataset.shape[1], &build_params);
+      // The tiered index sub-CAGRA always needs the dataset attached for search.
+      // Force this in case the caller did not set the field (e.g. zero-initialized struct).
+      build_params.attach_dataset_on_build = true;
       return new tiered_index::index<cagra::index<T, uint32_t>>(
         tiered_index::build(*res_ptr, build_params, mds));
     }
diff --git a/c/tests/neighbors/ann_cagra_c.cu b/c/tests/neighbors/ann_cagra_c.cu
index 9c14bbea7d..5f1e0cfe33 100644
--- a/c/tests/neighbors/ann_cagra_c.cu
+++ b/c/tests/neighbors/ann_cagra_c.cu
@@ -337,7 +337,229 @@ TEST(CagraC, BuildExtendSearch)
   cuvsResourcesDestroy(res);
 }
 
-TEST(CagraC, BuildSearchFiltered)
+TEST(CagraC, BuildNoDatasetThenUpdateAndSearch)
+{
+  // Test the attach_dataset_on_build = false workflow:
+  // 1. Build index without attaching dataset (saves a full dataset copy)
+  // 2. Attach dataset via cuvsCagraUpdateDataset
+  // 3. Search and verify correctness
+
+  // create cuvsResources_t
+  cuvsResources_t res;
+  cuvsResourcesCreate(&res);
+  cudaStream_t stream;
+  cuvsStreamGet(res, &stream);
+
+  // create dataset DLTensor
+  DLManagedTensor dataset_tensor;
+  dataset_tensor.dl_tensor.data               = dataset;
+  dataset_tensor.dl_tensor.device.device_type = kDLCPU;
+  dataset_tensor.dl_tensor.ndim               = 2;
+  dataset_tensor.dl_tensor.dtype.code         = kDLFloat;
+  dataset_tensor.dl_tensor.dtype.bits         = 32;
+  dataset_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t dataset_shape[2]                    = {4, 2};
+  dataset_tensor.dl_tensor.shape              = dataset_shape;
+  dataset_tensor.dl_tensor.strides            = nullptr;
+
+  // create index
+  cuvsCagraIndex_t index;
+  cuvsCagraIndexCreate(&index);
+
+  // build index with attach_dataset_on_build = false
+  cuvsCagraIndexParams_t build_params;
+  cuvsCagraIndexParamsCreate(&build_params);
+  build_params->attach_dataset_on_build = false;
+  ASSERT_EQ(cuvsCagraBuild(res, build_params, &dataset_tensor, index), CUVS_SUCCESS);
+
+  // now attach the dataset
+  ASSERT_EQ(cuvsCagraUpdateDataset(res, &dataset_tensor, index), CUVS_SUCCESS);
+
+  // create queries DLTensor
+  rmm::device_uvector<float> queries_d(4 * 2, stream);
+  raft::copy(queries_d.data(), (float*)queries, 4 * 2, stream);
+
+  DLManagedTensor queries_tensor;
+  queries_tensor.dl_tensor.data               = queries_d.data();
+  queries_tensor.dl_tensor.device.device_type = kDLCUDA;
+  queries_tensor.dl_tensor.ndim               = 2;
+  queries_tensor.dl_tensor.dtype.code         = kDLFloat;
+  queries_tensor.dl_tensor.dtype.bits         = 32;
+  queries_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t queries_shape[2]                    = {4, 2};
+  queries_tensor.dl_tensor.shape              = queries_shape;
+  queries_tensor.dl_tensor.strides            = nullptr;
+
+  // create neighbors DLTensor
+  rmm::device_uvector<uint32_t> neighbors_d(4, stream);
+
+  DLManagedTensor neighbors_tensor;
+  neighbors_tensor.dl_tensor.data               = neighbors_d.data();
+  neighbors_tensor.dl_tensor.device.device_type = kDLCUDA;
+  neighbors_tensor.dl_tensor.ndim               = 2;
+  neighbors_tensor.dl_tensor.dtype.code         = kDLUInt;
+  neighbors_tensor.dl_tensor.dtype.bits         = 32;
+  neighbors_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t neighbors_shape[2]                    = {4, 1};
+  neighbors_tensor.dl_tensor.shape              = neighbors_shape;
+  neighbors_tensor.dl_tensor.strides            = nullptr;
+
+  // create distances DLTensor
+  rmm::device_uvector<float> distances_d(4, stream);
+
+  DLManagedTensor distances_tensor;
+  distances_tensor.dl_tensor.data               = distances_d.data();
+  distances_tensor.dl_tensor.device.device_type = kDLCUDA;
+  distances_tensor.dl_tensor.ndim               = 2;
+  distances_tensor.dl_tensor.dtype.code         = kDLFloat;
+  distances_tensor.dl_tensor.dtype.bits         = 32;
+  distances_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t distances_shape[2]                    = {4, 1};
+  distances_tensor.dl_tensor.shape              = distances_shape;
+  distances_tensor.dl_tensor.strides            = nullptr;
+
+  cuvsFilter filter;
+  filter.type = NO_FILTER;
+  filter.addr = (uintptr_t)NULL;
+
+  // search index
+  cuvsCagraSearchParams_t search_params;
+  cuvsCagraSearchParamsCreate(&search_params);
+  cuvsCagraSearch(
+    res, search_params, index, &queries_tensor, &neighbors_tensor, &distances_tensor, filter);
+
+  // verify output — should match the standard BuildSearch test results
+  ASSERT_TRUE(
+    cuvs::devArrMatchHost(neighbors_exp, neighbors_d.data(), 4, cuvs::Compare<uint32_t>()));
+  ASSERT_TRUE(cuvs::devArrMatchHost(
+    distances_exp, distances_d.data(), 4, cuvs::CompareApprox<float>(0.001f)));
+
+  // de-allocate index and res
+  cuvsCagraSearchParamsDestroy(search_params);
+  cuvsCagraIndexParamsDestroy(build_params);
+  cuvsCagraIndexDestroy(index);
+  cuvsResourcesDestroy(res);
+}
+
+TEST(CagraC, BuildNoDatasetThenUpdateDeviceAndSearch)
+{
+  // Test the motivating scenario: dataset already on device (kDLCUDA).
+  // Using attach_dataset_on_build = false avoids duplicating the device dataset,
+  // then cuvsCagraUpdateDataset attaches it (zero-copy when properly aligned).
+
+  // create cuvsResources_t
+  cuvsResources_t res;
+  cuvsResourcesCreate(&res);
+  cudaStream_t stream;
+  cuvsStreamGet(res, &stream);
+
+  // copy dataset to device memory (simulating a dataset that is already on GPU)
+  rmm::device_uvector<float> dataset_d(4 * 2, stream);
+  raft::copy(dataset_d.data(), (float*)dataset, 4 * 2, stream);
+
+  // create dataset DLTensor on CPU for building the graph
+  DLManagedTensor dataset_tensor;
+  dataset_tensor.dl_tensor.data               = dataset;
+  dataset_tensor.dl_tensor.device.device_type = kDLCPU;
+  dataset_tensor.dl_tensor.ndim               = 2;
+  dataset_tensor.dl_tensor.dtype.code         = kDLFloat;
+  dataset_tensor.dl_tensor.dtype.bits         = 32;
+  dataset_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t dataset_shape[2]                    = {4, 2};
+  dataset_tensor.dl_tensor.shape              = dataset_shape;
+  dataset_tensor.dl_tensor.strides            = nullptr;
+
+  // create index
+  cuvsCagraIndex_t index;
+  cuvsCagraIndexCreate(&index);
+
+  // build index with attach_dataset_on_build = false
+  cuvsCagraIndexParams_t build_params;
+  cuvsCagraIndexParamsCreate(&build_params);
+  build_params->attach_dataset_on_build = false;
+  ASSERT_EQ(cuvsCagraBuild(res, build_params, &dataset_tensor, index), CUVS_SUCCESS);
+
+  // attach the device-resident dataset via cuvsCagraUpdateDataset (kDLCUDA path)
+  DLManagedTensor device_dataset_tensor;
+  device_dataset_tensor.dl_tensor.data               = dataset_d.data();
+  device_dataset_tensor.dl_tensor.device.device_type = kDLCUDA;
+  device_dataset_tensor.dl_tensor.device.device_id   = 0;
+  device_dataset_tensor.dl_tensor.ndim               = 2;
+  device_dataset_tensor.dl_tensor.dtype.code         = kDLFloat;
+  device_dataset_tensor.dl_tensor.dtype.bits         = 32;
+  device_dataset_tensor.dl_tensor.dtype.lanes        = 1;
+  device_dataset_tensor.dl_tensor.shape              = dataset_shape;
+  device_dataset_tensor.dl_tensor.strides            = nullptr;
+
+  ASSERT_EQ(cuvsCagraUpdateDataset(res, &device_dataset_tensor, index), CUVS_SUCCESS);
+
+  // create queries DLTensor
+  rmm::device_uvector<float> queries_d(4 * 2, stream);
+  raft::copy(queries_d.data(), (float*)queries, 4 * 2, stream);
+
+  DLManagedTensor queries_tensor;
+  queries_tensor.dl_tensor.data               = queries_d.data();
+  queries_tensor.dl_tensor.device.device_type = kDLCUDA;
+  queries_tensor.dl_tensor.ndim               = 2;
+  queries_tensor.dl_tensor.dtype.code         = kDLFloat;
+  queries_tensor.dl_tensor.dtype.bits         = 32;
+  queries_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t queries_shape[2]                    = {4, 2};
+  queries_tensor.dl_tensor.shape              = queries_shape;
+  queries_tensor.dl_tensor.strides            = nullptr;
+
+  // create neighbors DLTensor
+  rmm::device_uvector<uint32_t> neighbors_d(4, stream);
+
+  DLManagedTensor neighbors_tensor;
+  neighbors_tensor.dl_tensor.data               = neighbors_d.data();
+  neighbors_tensor.dl_tensor.device.device_type = kDLCUDA;
+  neighbors_tensor.dl_tensor.ndim               = 2;
+  neighbors_tensor.dl_tensor.dtype.code         = kDLUInt;
+  neighbors_tensor.dl_tensor.dtype.bits         = 32;
+  neighbors_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t neighbors_shape[2]                    = {4, 1};
+  neighbors_tensor.dl_tensor.shape              = neighbors_shape;
+  neighbors_tensor.dl_tensor.strides            = nullptr;
+
+  // create distances DLTensor
+  rmm::device_uvector<float> distances_d(4, stream);
+
+  DLManagedTensor distances_tensor;
+  distances_tensor.dl_tensor.data               = distances_d.data();
+  distances_tensor.dl_tensor.device.device_type = kDLCUDA;
+  distances_tensor.dl_tensor.ndim               = 2;
+  distances_tensor.dl_tensor.dtype.code         = kDLFloat;
+  distances_tensor.dl_tensor.dtype.bits         = 32;
+  distances_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t distances_shape[2]                    = {4, 1};
+  distances_tensor.dl_tensor.shape              = distances_shape;
+  distances_tensor.dl_tensor.strides            = nullptr;
+
+  cuvsFilter filter;
+  filter.type = NO_FILTER;
+  filter.addr = (uintptr_t)NULL;
+
+  // search index
+  cuvsCagraSearchParams_t search_params;
+  cuvsCagraSearchParamsCreate(&search_params);
+  cuvsCagraSearch(
+    res, search_params, index, &queries_tensor, &neighbors_tensor, &distances_tensor, filter);
+
+  // verify output — should match the standard BuildSearch test results
+  ASSERT_TRUE(
+    cuvs::devArrMatchHost(neighbors_exp, neighbors_d.data(), 4, cuvs::Compare<uint32_t>()));
+  ASSERT_TRUE(cuvs::devArrMatchHost(
+    distances_exp, distances_d.data(), 4, cuvs::CompareApprox<float>(0.001f)));
+
+  // de-allocate index and res
+  cuvsCagraSearchParamsDestroy(search_params);
+  cuvsCagraIndexParamsDestroy(build_params);
+  cuvsCagraIndexDestroy(index);
+  cuvsResourcesDestroy(res);
+}
+
+TEST(CagraC, BuildSearchBitsetFiltered)
 {
   // create cuvsResources_t
   cuvsResources_t res;
@@ -447,6 +669,142 @@ TEST(CagraC, BuildSearchFiltered)
   cuvsResourcesDestroy(res);
 }
 
+TEST(CagraC, BuildSearchBitmapFiltered)
+{
+  int64_t n_rows       = 100;
+  int64_t n_queries    = 10;
+  int64_t n_dim        = 16;
+  uint32_t n_neighbors = 4;
+
+  raft::handle_t handle;
+  auto stream = raft::resource::get_cuda_stream(handle);
+
+  // Generate data
+  rmm::device_uvector<float> index_data(n_rows * n_dim, stream);
+  rmm::device_uvector<float> query_data(n_queries * n_dim, stream);
+  raft::random::RngState r(1234ULL);
+  raft::random::uniform(
+    handle, r, index_data.data(), n_rows * n_dim, float(0.1), float(2.0));
+  raft::random::uniform(
+    handle, r, query_data.data(), n_queries * n_dim, float(0.1), float(2.0));
+
+  // create cuvsResources_t
+  cuvsResources_t res;
+  cuvsResourcesCreate(&res);
+
+  // create dataset DLTensor
+  DLManagedTensor dataset_tensor;
+  dataset_tensor.dl_tensor.data               = index_data.data();
+  dataset_tensor.dl_tensor.device.device_type = kDLCUDA;
+  dataset_tensor.dl_tensor.ndim               = 2;
+  dataset_tensor.dl_tensor.dtype.code         = kDLFloat;
+  dataset_tensor.dl_tensor.dtype.bits         = 32;
+  dataset_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t dataset_shape[2]                    = {n_rows, n_dim};
+  dataset_tensor.dl_tensor.shape              = dataset_shape;
+  dataset_tensor.dl_tensor.strides            = nullptr;
+
+  // create index
+  cuvsCagraIndex_t index;
+  cuvsCagraIndexCreate(&index);
+
+  // build index
+  cuvsCagraIndexParams_t build_params;
+  cuvsCagraIndexParamsCreate(&build_params);
+  cuvsCagraBuild(res, build_params, &dataset_tensor, index);
+
+  // create queries DLTensor
+  DLManagedTensor queries_tensor;
+  queries_tensor.dl_tensor.data               = query_data.data();
+  queries_tensor.dl_tensor.device.device_type = kDLCUDA;
+  queries_tensor.dl_tensor.ndim               = 2;
+  queries_tensor.dl_tensor.dtype.code         = kDLFloat;
+  queries_tensor.dl_tensor.dtype.bits         = 32;
+  queries_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t queries_shape[2]                    = {n_queries, n_dim};
+  queries_tensor.dl_tensor.shape              = queries_shape;
+  queries_tensor.dl_tensor.strides            = nullptr;
+
+  // create neighbors DLTensor
+  rmm::device_uvector<uint32_t> neighbors_data(n_queries * n_neighbors, stream);
+  DLManagedTensor neighbors_tensor;
+  neighbors_tensor.dl_tensor.data               = neighbors_data.data();
+  neighbors_tensor.dl_tensor.device.device_type = kDLCUDA;
+  neighbors_tensor.dl_tensor.ndim               = 2;
+  neighbors_tensor.dl_tensor.dtype.code         = kDLUInt;
+  neighbors_tensor.dl_tensor.dtype.bits         = 32;
+  neighbors_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t neighbors_shape[2]                    = {n_queries, n_neighbors};
+  neighbors_tensor.dl_tensor.shape              = neighbors_shape;
+  neighbors_tensor.dl_tensor.strides            = nullptr;
+
+  // create distances DLTensor
+  rmm::device_uvector<float> distances_data(n_queries * n_neighbors, stream);
+  DLManagedTensor distances_tensor;
+  distances_tensor.dl_tensor.data               = distances_data.data();
+  distances_tensor.dl_tensor.device.device_type = kDLCUDA;
+  distances_tensor.dl_tensor.ndim               = 2;
+  distances_tensor.dl_tensor.dtype.code         = kDLFloat;
+  distances_tensor.dl_tensor.dtype.bits         = 32;
+  distances_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t distances_shape[2]                    = {n_queries, n_neighbors};
+  distances_tensor.dl_tensor.shape              = distances_shape;
+  distances_tensor.dl_tensor.strides            = nullptr;
+
+  // Create bitmap filter - per query filter
+  // For each query, remove even indices
+  auto bitmap_size = n_queries * ((n_rows + 31) / 32);  // n_queries x (bits for n_rows)
+  rmm::device_uvector<uint32_t> filter_bitmap(bitmap_size, stream);
+  std::vector<uint32_t> filter_bitmap_h(bitmap_size);
+  for (size_t q = 0; q < n_queries; ++q) {
+    for (size_t i = 0; i < (n_rows + 31) / 32; ++i) {
+      filter_bitmap_h[q * ((n_rows + 31) / 32) + i] =
+        0xAAAAAAAA;  // 10101010... pattern - removes even indices
+    }
+  }
+  raft::copy(filter_bitmap.data(), filter_bitmap_h.data(), bitmap_size, stream);
+
+  DLManagedTensor filter_tensor;
+  filter_tensor.dl_tensor.data               = filter_bitmap.data();
+  filter_tensor.dl_tensor.device.device_type = kDLCUDA;
+  filter_tensor.dl_tensor.ndim               = 1;
+  filter_tensor.dl_tensor.dtype.code         = kDLUInt;
+  filter_tensor.dl_tensor.dtype.bits         = 32;
+  filter_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t filter_shape[1]                    = {bitmap_size};
+  filter_tensor.dl_tensor.shape              = filter_shape;
+  filter_tensor.dl_tensor.strides            = nullptr;
+
+  cuvsFilter filter;
+  filter.type = BITMAP;
+  filter.addr = (uintptr_t)&filter_tensor;
+
+  // search index with bitmap filter
+  cuvsCagraSearchParams_t search_params;
+  cuvsCagraSearchParamsCreate(&search_params);
+  cuvsCagraSearch(
+    res, search_params, index, &queries_tensor, &neighbors_tensor, &distances_tensor, filter);
+
+  // Verify all returned neighbors are odd indices (not filtered out)
+  std::vector<uint32_t> neighbors_h(n_queries * n_neighbors);
+  raft::copy(neighbors_h.data(), neighbors_data.data(), n_queries * n_neighbors, stream);
+  raft::resource::sync_stream(handle);
+
+  for (size_t i = 0; i < n_queries * n_neighbors; ++i) {
+    // All neighbors should be odd indices (since even indices are filtered)
+    // Note: uint32_t max value indicates no valid neighbor found
+    ASSERT_TRUE(neighbors_h[i] % 2 == 1 || neighbors_h[i] == std::numeric_limits<uint32_t>::max())
+      << "Neighbor at position " << i << " has value " << neighbors_h[i]
+      << " which is an even index (should be filtered)";
+  }
+
+  // de-allocate index and res
+  cuvsCagraSearchParamsDestroy(search_params);
+  cuvsCagraIndexParamsDestroy(build_params);
+  cuvsCagraIndexDestroy(index);
+  cuvsResourcesDestroy(res);
+}
+
 TEST(CagraC, BuildMergeSearch)
 {
   cuvsResources_t res;
diff --git a/c/tests/neighbors/ann_ivf_flat_c.cu b/c/tests/neighbors/ann_ivf_flat_c.cu
index 2039721d2f..f07706fa77 100644
--- a/c/tests/neighbors/ann_ivf_flat_c.cu
+++ b/c/tests/neighbors/ann_ivf_flat_c.cu
@@ -129,3 +129,276 @@ TEST(IvfFlatC, BuildSearch)
               n_probes,
               n_lists);
 }
+TEST(IvfFlatC, BuildSearchBitsetFiltered)
+{
+  int64_t n_rows       = 1000;
+  int64_t n_queries    = 10;
+  int64_t n_dim        = 16;
+  uint32_t n_neighbors = 10;
+
+  raft::handle_t handle;
+  auto stream = raft::resource::get_cuda_stream(handle);
+
+  cuvsDistanceType metric = L2Expanded;
+  size_t n_probes         = 10;
+  size_t n_lists          = 20;
+
+  // Generate data
+  rmm::device_uvector<float> index_data(n_rows * n_dim, stream);
+  rmm::device_uvector<float> query_data(n_queries * n_dim, stream);
+  generate_random_data(index_data.data(), n_rows * n_dim);
+  generate_random_data(query_data.data(), n_queries * n_dim);
+
+  // create cuvsResources_t
+  cuvsResources_t res;
+  cuvsResourcesCreate(&res);
+
+  // create dataset DLTensor
+  DLManagedTensor dataset_tensor;
+  dataset_tensor.dl_tensor.data               = index_data.data();
+  dataset_tensor.dl_tensor.device.device_type = kDLCUDA;
+  dataset_tensor.dl_tensor.ndim               = 2;
+  dataset_tensor.dl_tensor.dtype.code         = kDLFloat;
+  dataset_tensor.dl_tensor.dtype.bits         = 32;
+  dataset_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t dataset_shape[2]                    = {n_rows, n_dim};
+  dataset_tensor.dl_tensor.shape              = dataset_shape;
+  dataset_tensor.dl_tensor.strides            = NULL;
+
+  // create index
+  cuvsIvfFlatIndex_t index;
+  cuvsIvfFlatIndexCreate(&index);
+
+  // build index
+  cuvsIvfFlatIndexParams_t build_params;
+  cuvsIvfFlatIndexParamsCreate(&build_params);
+  build_params->metric  = metric;
+  build_params->n_lists = n_lists;
+  cuvsIvfFlatBuild(res, build_params, &dataset_tensor, index);
+
+  // create queries DLTensor
+  DLManagedTensor queries_tensor;
+  queries_tensor.dl_tensor.data               = query_data.data();
+  queries_tensor.dl_tensor.device.device_type = kDLCUDA;
+  queries_tensor.dl_tensor.ndim               = 2;
+  queries_tensor.dl_tensor.dtype.code         = kDLFloat;
+  queries_tensor.dl_tensor.dtype.bits         = 32;
+  queries_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t queries_shape[2]                    = {n_queries, n_dim};
+  queries_tensor.dl_tensor.shape              = queries_shape;
+  queries_tensor.dl_tensor.strides            = NULL;
+
+  // create neighbors DLTensor
+  rmm::device_uvector<int64_t> neighbors_data(n_queries * n_neighbors, stream);
+  DLManagedTensor neighbors_tensor;
+  neighbors_tensor.dl_tensor.data               = neighbors_data.data();
+  neighbors_tensor.dl_tensor.device.device_type = kDLCUDA;
+  neighbors_tensor.dl_tensor.ndim               = 2;
+  neighbors_tensor.dl_tensor.dtype.code         = kDLInt;
+  neighbors_tensor.dl_tensor.dtype.bits         = 64;
+  neighbors_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t neighbors_shape[2]                    = {n_queries, n_neighbors};
+  neighbors_tensor.dl_tensor.shape              = neighbors_shape;
+  neighbors_tensor.dl_tensor.strides            = NULL;
+
+  // create distances DLTensor
+  rmm::device_uvector<float> distances_data(n_queries * n_neighbors, stream);
+  DLManagedTensor distances_tensor;
+  distances_tensor.dl_tensor.data               = distances_data.data();
+  distances_tensor.dl_tensor.device.device_type = kDLCUDA;
+  distances_tensor.dl_tensor.ndim               = 2;
+  distances_tensor.dl_tensor.dtype.code         = kDLFloat;
+  distances_tensor.dl_tensor.dtype.bits         = 32;
+  distances_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t distances_shape[2]                    = {n_queries, n_neighbors};
+  distances_tensor.dl_tensor.shape              = distances_shape;
+  distances_tensor.dl_tensor.strides            = NULL;
+
+  // Create bitset filter - remove every other index
+  auto bitset_size = (n_rows + 31) / 32;  // number of uint32_t needed
+  rmm::device_uvector<uint32_t> filter_bitset(bitset_size, stream);
+  std::vector<uint32_t> filter_bitset_h(bitset_size);
+  for (size_t i = 0; i < bitset_size; ++i) {
+    filter_bitset_h[i] = 0xAAAAAAAA;  // 10101010... pattern - removes even indices
+  }
+  raft::copy(filter_bitset.data(), filter_bitset_h.data(), bitset_size, stream);
+
+  DLManagedTensor filter_tensor;
+  filter_tensor.dl_tensor.data               = filter_bitset.data();
+  filter_tensor.dl_tensor.device.device_type = kDLCUDA;
+  filter_tensor.dl_tensor.ndim               = 1;
+  filter_tensor.dl_tensor.dtype.code         = kDLUInt;
+  filter_tensor.dl_tensor.dtype.bits         = 32;
+  filter_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t filter_shape[1]                    = {bitset_size};
+  filter_tensor.dl_tensor.shape              = filter_shape;
+  filter_tensor.dl_tensor.strides            = NULL;
+
+  cuvsFilter filter;
+  filter.type = BITSET;
+  filter.addr = (uintptr_t)&filter_tensor;
+
+  // search index with filter
+  cuvsIvfFlatSearchParams_t search_params;
+  cuvsIvfFlatSearchParamsCreate(&search_params);
+  search_params->n_probes = n_probes;
+  cuvsIvfFlatSearch(
+    res, search_params, index, &queries_tensor, &neighbors_tensor, &distances_tensor, filter);
+
+  // Verify all returned neighbors are odd indices (not filtered out)
+  std::vector<int64_t> neighbors_h(n_queries * n_neighbors);
+  raft::copy(neighbors_h.data(), neighbors_data.data(), n_queries * n_neighbors, stream);
+  raft::resource::sync_stream(handle);
+
+  for (size_t i = 0; i < n_queries * n_neighbors; ++i) {
+    // All neighbors should be odd indices (since even indices are filtered)
+    ASSERT_TRUE(neighbors_h[i] % 2 == 1 || neighbors_h[i] == -1)
+      << "Neighbor at position " << i << " has value " << neighbors_h[i]
+      << " which is an even index (should be filtered)";
+  }
+
+  // de-allocate index and res
+  cuvsIvfFlatSearchParamsDestroy(search_params);
+  cuvsIvfFlatIndexParamsDestroy(build_params);
+  cuvsIvfFlatIndexDestroy(index);
+  cuvsResourcesDestroy(res);
+}
+
+TEST(IvfFlatC, BuildSearchBitmapFiltered)
+{
+  int64_t n_rows       = 1000;
+  int64_t n_queries    = 10;
+  int64_t n_dim        = 16;
+  uint32_t n_neighbors = 10;
+
+  raft::handle_t handle;
+  auto stream = raft::resource::get_cuda_stream(handle);
+
+  cuvsDistanceType metric = L2Expanded;
+  size_t n_probes         = 10;
+  size_t n_lists          = 20;
+
+  // Generate data
+  rmm::device_uvector<float> index_data(n_rows * n_dim, stream);
+  rmm::device_uvector<float> query_data(n_queries * n_dim, stream);
+  generate_random_data(index_data.data(), n_rows * n_dim);
+  generate_random_data(query_data.data(), n_queries * n_dim);
+
+  // create cuvsResources_t
+  cuvsResources_t res;
+  cuvsResourcesCreate(&res);
+
+  // create dataset DLTensor
+  DLManagedTensor dataset_tensor;
+  dataset_tensor.dl_tensor.data               = index_data.data();
+  dataset_tensor.dl_tensor.device.device_type = kDLCUDA;
+  dataset_tensor.dl_tensor.ndim               = 2;
+  dataset_tensor.dl_tensor.dtype.code         = kDLFloat;
+  dataset_tensor.dl_tensor.dtype.bits         = 32;
+  dataset_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t dataset_shape[2]                    = {n_rows, n_dim};
+  dataset_tensor.dl_tensor.shape              = dataset_shape;
+  dataset_tensor.dl_tensor.strides            = NULL;
+
+  // create index
+  cuvsIvfFlatIndex_t index;
+  cuvsIvfFlatIndexCreate(&index);
+
+  // build index
+  cuvsIvfFlatIndexParams_t build_params;
+  cuvsIvfFlatIndexParamsCreate(&build_params);
+  build_params->metric  = metric;
+  build_params->n_lists = n_lists;
+  cuvsIvfFlatBuild(res, build_params, &dataset_tensor, index);
+
+  // create queries DLTensor
+  DLManagedTensor queries_tensor;
+  queries_tensor.dl_tensor.data               = query_data.data();
+  queries_tensor.dl_tensor.device.device_type = kDLCUDA;
+  queries_tensor.dl_tensor.ndim               = 2;
+  queries_tensor.dl_tensor.dtype.code         = kDLFloat;
+  queries_tensor.dl_tensor.dtype.bits         = 32;
+  queries_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t queries_shape[2]                    = {n_queries, n_dim};
+  queries_tensor.dl_tensor.shape              = queries_shape;
+  queries_tensor.dl_tensor.strides            = NULL;
+
+  // create neighbors DLTensor
+  rmm::device_uvector<int64_t> neighbors_data(n_queries * n_neighbors, stream);
+  DLManagedTensor neighbors_tensor;
+  neighbors_tensor.dl_tensor.data               = neighbors_data.data();
+  neighbors_tensor.dl_tensor.device.device_type = kDLCUDA;
+  neighbors_tensor.dl_tensor.ndim               = 2;
+  neighbors_tensor.dl_tensor.dtype.code         = kDLInt;
+  neighbors_tensor.dl_tensor.dtype.bits         = 64;
+  neighbors_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t neighbors_shape[2]                    = {n_queries, n_neighbors};
+  neighbors_tensor.dl_tensor.shape              = neighbors_shape;
+  neighbors_tensor.dl_tensor.strides            = NULL;
+
+  // create distances DLTensor
+  rmm::device_uvector<float> distances_data(n_queries * n_neighbors, stream);
+  DLManagedTensor distances_tensor;
+  distances_tensor.dl_tensor.data               = distances_data.data();
+  distances_tensor.dl_tensor.device.device_type = kDLCUDA;
+  distances_tensor.dl_tensor.ndim               = 2;
+  distances_tensor.dl_tensor.dtype.code         = kDLFloat;
+  distances_tensor.dl_tensor.dtype.bits         = 32;
+  distances_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t distances_shape[2]                    = {n_queries, n_neighbors};
+  distances_tensor.dl_tensor.shape              = distances_shape;
+  distances_tensor.dl_tensor.strides            = NULL;
+
+  // Create bitmap filter - per query filter
+  // For each query, remove even indices
+  auto bitmap_size = n_queries * ((n_rows + 31) / 32);  // n_queries x (bits for n_rows)
+  rmm::device_uvector<uint32_t> filter_bitmap(bitmap_size, stream);
+  std::vector<uint32_t> filter_bitmap_h(bitmap_size);
+  for (size_t q = 0; q < n_queries; ++q) {
+    for (size_t i = 0; i < (n_rows + 31) / 32; ++i) {
+      filter_bitmap_h[q * ((n_rows + 31) / 32) + i] =
+        0xAAAAAAAA;  // 10101010... pattern - removes even indices
+    }
+  }
+  raft::copy(filter_bitmap.data(), filter_bitmap_h.data(), bitmap_size, stream);
+
+  DLManagedTensor filter_tensor;
+  filter_tensor.dl_tensor.data               = filter_bitmap.data();
+  filter_tensor.dl_tensor.device.device_type = kDLCUDA;
+  filter_tensor.dl_tensor.ndim               = 1;
+  filter_tensor.dl_tensor.dtype.code         = kDLUInt;
+  filter_tensor.dl_tensor.dtype.bits         = 32;
+  filter_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t filter_shape[1]                    = {bitmap_size};
+  filter_tensor.dl_tensor.shape              = filter_shape;
+  filter_tensor.dl_tensor.strides            = NULL;
+
+  cuvsFilter filter;
+  filter.type = BITMAP;
+  filter.addr = (uintptr_t)&filter_tensor;
+
+  // search index with bitmap filter
+  cuvsIvfFlatSearchParams_t search_params;
+  cuvsIvfFlatSearchParamsCreate(&search_params);
+  search_params->n_probes = n_probes;
+  cuvsIvfFlatSearch(
+    res, search_params, index, &queries_tensor, &neighbors_tensor, &distances_tensor, filter);
+
+  // Verify all returned neighbors are odd indices (not filtered out)
+  std::vector<int64_t> neighbors_h(n_queries * n_neighbors);
+  raft::copy(neighbors_h.data(), neighbors_data.data(), n_queries * n_neighbors, stream);
+  raft::resource::sync_stream(handle);
+
+  for (size_t i = 0; i < n_queries * n_neighbors; ++i) {
+    // All neighbors should be odd indices (since even indices are filtered)
+    ASSERT_TRUE(neighbors_h[i] % 2 == 1 || neighbors_h[i] == -1)
+      << "Neighbor at position " << i << " has value " << neighbors_h[i]
+      << " which is an even index (should be filtered)";
+  }
+
+  // de-allocate index and res
+  cuvsIvfFlatSearchParamsDestroy(search_params);
+  cuvsIvfFlatIndexParamsDestroy(build_params);
+  cuvsIvfFlatIndexDestroy(index);
+  cuvsResourcesDestroy(res);
+}
diff --git a/c/tests/neighbors/ann_ivf_pq_c.cu b/c/tests/neighbors/ann_ivf_pq_c.cu
index 06c2f7f6e1..9bfedf810a 100644
--- a/c/tests/neighbors/ann_ivf_pq_c.cu
+++ b/c/tests/neighbors/ann_ivf_pq_c.cu
@@ -129,3 +129,274 @@ TEST(IvfPqC, BuildSearch)
               n_probes,
               n_lists);
 }
+
+TEST(IvfPqC, BuildSearchBitsetFiltered)
+{
+  int64_t n_rows       = 1000;
+  int64_t n_queries    = 10;
+  int64_t n_dim        = 16;
+  uint32_t n_neighbors = 10;
+
+  raft::handle_t handle;
+  auto stream = raft::resource::get_cuda_stream(handle);
+
+  cuvsDistanceType metric = L2Expanded;
+  size_t n_probes         = 10;
+  size_t n_lists          = 20;
+
+  // Generate data
+  rmm::device_uvector<float> index_data(n_rows * n_dim, stream);
+  rmm::device_uvector<float> query_data(n_queries * n_dim, stream);
+  generate_random_data(index_data.data(), n_rows * n_dim);
+  generate_random_data(query_data.data(), n_queries * n_dim);
+
+  // create cuvsResources_t
+  cuvsResources_t res;
+  cuvsResourcesCreate(&res);
+
+  // create dataset DLTensor
+  DLManagedTensor dataset_tensor;
+  dataset_tensor.dl_tensor.data               = index_data.data();
+  dataset_tensor.dl_tensor.device.device_type = kDLCUDA;
+  dataset_tensor.dl_tensor.ndim               = 2;
+  dataset_tensor.dl_tensor.dtype.code         = kDLFloat;
+  dataset_tensor.dl_tensor.dtype.bits         = 32;
+  dataset_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t dataset_shape[2]                    = {n_rows, n_dim};
+  dataset_tensor.dl_tensor.shape              = dataset_shape;
+  dataset_tensor.dl_tensor.strides            = NULL;
+
+  // create index
+  cuvsIvfPqIndex_t index;
+  cuvsIvfPqIndexCreate(&index);
+
+  // build index
+  cuvsIvfPqIndexParams_t build_params;
+  cuvsIvfPqIndexParamsCreate(&build_params);
+  build_params->metric  = metric;
+  build_params->n_lists = n_lists;
+  cuvsIvfPqBuild(res, build_params, &dataset_tensor, index);
+
+  // create queries DLTensor
+  DLManagedTensor queries_tensor;
+  queries_tensor.dl_tensor.data               = query_data.data();
+  queries_tensor.dl_tensor.device.device_type = kDLCUDA;
+  queries_tensor.dl_tensor.ndim               = 2;
+  queries_tensor.dl_tensor.dtype.code         = kDLFloat;
+  queries_tensor.dl_tensor.dtype.bits         = 32;
+  queries_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t queries_shape[2]                    = {n_queries, n_dim};
+  queries_tensor.dl_tensor.shape              = queries_shape;
+  queries_tensor.dl_tensor.strides            = NULL;
+
+  // create neighbors DLTensor
+  rmm::device_uvector<int64_t> neighbors_data(n_queries * n_neighbors, stream);
+  DLManagedTensor neighbors_tensor;
+  neighbors_tensor.dl_tensor.data               = neighbors_data.data();
+  neighbors_tensor.dl_tensor.device.device_type = kDLCUDA;
+  neighbors_tensor.dl_tensor.ndim               = 2;
+  neighbors_tensor.dl_tensor.dtype.code         = kDLInt;
+  neighbors_tensor.dl_tensor.dtype.bits         = 64;
+  neighbors_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t neighbors_shape[2]                    = {n_queries, n_neighbors};
+  neighbors_tensor.dl_tensor.shape              = neighbors_shape;
+  neighbors_tensor.dl_tensor.strides            = NULL;
+
+  // create distances DLTensor
+  rmm::device_uvector<float> distances_data(n_queries * n_neighbors, stream);
+  DLManagedTensor distances_tensor;
+  distances_tensor.dl_tensor.data               = distances_data.data();
+  distances_tensor.dl_tensor.device.device_type = kDLCUDA;
+  distances_tensor.dl_tensor.ndim               = 2;
+  distances_tensor.dl_tensor.dtype.code         = kDLFloat;
+  distances_tensor.dl_tensor.dtype.bits         = 32;
+  distances_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t distances_shape[2]                    = {n_queries, n_neighbors};
+  distances_tensor.dl_tensor.shape              = distances_shape;
+  distances_tensor.dl_tensor.strides            = NULL;
+
+  // Create bitset filter - remove every other index
+  auto bitset_size = (n_rows + 31) / 32;  // number of uint32_t needed
+  rmm::device_uvector<uint32_t> filter_bitset(bitset_size, stream);
+  std::vector<uint32_t> filter_bitset_h(bitset_size);
+  for (size_t i = 0; i < bitset_size; ++i) {
+    filter_bitset_h[i] = 0xAAAAAAAA;  // 10101010... pattern - removes even indices
+  }
+  raft::copy(filter_bitset.data(), filter_bitset_h.data(), bitset_size, stream);
+
+  DLManagedTensor filter_tensor;
+  filter_tensor.dl_tensor.data               = filter_bitset.data();
+  filter_tensor.dl_tensor.device.device_type = kDLCUDA;
+  filter_tensor.dl_tensor.ndim               = 1;
+  filter_tensor.dl_tensor.dtype.code         = kDLUInt;
+  filter_tensor.dl_tensor.dtype.bits         = 32;
+  filter_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t filter_shape[1]                    = {bitset_size};
+  filter_tensor.dl_tensor.shape              = filter_shape;
+  filter_tensor.dl_tensor.strides            = NULL;
+
+  cuvsFilter filter;
+  filter.type = BITSET;
+  filter.addr = (uintptr_t)&filter_tensor;
+
+  // search index with filter
+  cuvsIvfPqSearchParams_t search_params;
+  cuvsIvfPqSearchParamsCreate(&search_params);
+  search_params->n_probes = n_probes;
+  cuvsIvfPqSearch(res, search_params, index, &queries_tensor, &neighbors_tensor, &distances_tensor, filter);
+
+  // Verify all returned neighbors are odd indices (not filtered out)
+  std::vector<int64_t> neighbors_h(n_queries * n_neighbors);
+  raft::copy(neighbors_h.data(), neighbors_data.data(), n_queries * n_neighbors, stream);
+  raft::resource::sync_stream(handle);
+
+  for (size_t i = 0; i < n_queries * n_neighbors; ++i) {
+    // All neighbors should be odd indices (since even indices are filtered)
+    ASSERT_TRUE(neighbors_h[i] % 2 == 1 || neighbors_h[i] == -1)
+      << "Neighbor at position " << i << " has value " << neighbors_h[i]
+      << " which is an even index (should be filtered)";
+  }
+
+  // de-allocate index and res
+  cuvsIvfPqSearchParamsDestroy(search_params);
+  cuvsIvfPqIndexParamsDestroy(build_params);
+  cuvsIvfPqIndexDestroy(index);
+  cuvsResourcesDestroy(res);
+}
+
+TEST(IvfPqC, BuildSearchBitmapFiltered)
+{
+  int64_t n_rows       = 1000;
+  int64_t n_queries    = 10;
+  int64_t n_dim        = 16;
+  uint32_t n_neighbors = 10;
+
+  raft::handle_t handle;
+  auto stream = raft::resource::get_cuda_stream(handle);
+
+  cuvsDistanceType metric = L2Expanded;
+  size_t n_probes         = 10;
+  size_t n_lists          = 20;
+
+  // Generate data
+  rmm::device_uvector<float> index_data(n_rows * n_dim, stream);
+  rmm::device_uvector<float> query_data(n_queries * n_dim, stream);
+  generate_random_data(index_data.data(), n_rows * n_dim);
+  generate_random_data(query_data.data(), n_queries * n_dim);
+
+  // create cuvsResources_t
+  cuvsResources_t res;
+  cuvsResourcesCreate(&res);
+
+  // create dataset DLTensor
+  DLManagedTensor dataset_tensor;
+  dataset_tensor.dl_tensor.data               = index_data.data();
+  dataset_tensor.dl_tensor.device.device_type = kDLCUDA;
+  dataset_tensor.dl_tensor.ndim               = 2;
+  dataset_tensor.dl_tensor.dtype.code         = kDLFloat;
+  dataset_tensor.dl_tensor.dtype.bits         = 32;
+  dataset_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t dataset_shape[2]                    = {n_rows, n_dim};
+  dataset_tensor.dl_tensor.shape              = dataset_shape;
+  dataset_tensor.dl_tensor.strides            = NULL;
+
+  // create index
+  cuvsIvfPqIndex_t index;
+  cuvsIvfPqIndexCreate(&index);
+
+  // build index
+  cuvsIvfPqIndexParams_t build_params;
+  cuvsIvfPqIndexParamsCreate(&build_params);
+  build_params->metric  = metric;
+  build_params->n_lists = n_lists;
+  cuvsIvfPqBuild(res, build_params, &dataset_tensor, index);
+
+  // create queries DLTensor
+  DLManagedTensor queries_tensor;
+  queries_tensor.dl_tensor.data               = query_data.data();
+  queries_tensor.dl_tensor.device.device_type = kDLCUDA;
+  queries_tensor.dl_tensor.ndim               = 2;
+  queries_tensor.dl_tensor.dtype.code         = kDLFloat;
+  queries_tensor.dl_tensor.dtype.bits         = 32;
+  queries_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t queries_shape[2]                    = {n_queries, n_dim};
+  queries_tensor.dl_tensor.shape              = queries_shape;
+  queries_tensor.dl_tensor.strides            = NULL;
+
+  // create neighbors DLTensor
+  rmm::device_uvector<int64_t> neighbors_data(n_queries * n_neighbors, stream);
+  DLManagedTensor neighbors_tensor;
+  neighbors_tensor.dl_tensor.data               = neighbors_data.data();
+  neighbors_tensor.dl_tensor.device.device_type = kDLCUDA;
+  neighbors_tensor.dl_tensor.ndim               = 2;
+  neighbors_tensor.dl_tensor.dtype.code         = kDLInt;
+  neighbors_tensor.dl_tensor.dtype.bits         = 64;
+  neighbors_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t neighbors_shape[2]                    = {n_queries, n_neighbors};
+  neighbors_tensor.dl_tensor.shape              = neighbors_shape;
+  neighbors_tensor.dl_tensor.strides            = NULL;
+
+  // create distances DLTensor
+  rmm::device_uvector<float> distances_data(n_queries * n_neighbors, stream);
+  DLManagedTensor distances_tensor;
+  distances_tensor.dl_tensor.data               = distances_data.data();
+  distances_tensor.dl_tensor.device.device_type = kDLCUDA;
+  distances_tensor.dl_tensor.ndim               = 2;
+  distances_tensor.dl_tensor.dtype.code         = kDLFloat;
+  distances_tensor.dl_tensor.dtype.bits         = 32;
+  distances_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t distances_shape[2]                    = {n_queries, n_neighbors};
+  distances_tensor.dl_tensor.shape              = distances_shape;
+  distances_tensor.dl_tensor.strides            = NULL;
+
+  // Create bitmap filter - per query filter
+  // For each query, remove even indices
+  auto bitmap_size = n_queries * ((n_rows + 31) / 32);  // n_queries x (bits for n_rows)
+  rmm::device_uvector<uint32_t> filter_bitmap(bitmap_size, stream);
+  std::vector<uint32_t> filter_bitmap_h(bitmap_size);
+  for (size_t q = 0; q < n_queries; ++q) {
+    for (size_t i = 0; i < (n_rows + 31) / 32; ++i) {
+      filter_bitmap_h[q * ((n_rows + 31) / 32) + i] = 0xAAAAAAAA;  // 10101010... pattern - removes even indices
+    }
+  }
+  raft::copy(filter_bitmap.data(), filter_bitmap_h.data(), bitmap_size, stream);
+
+  DLManagedTensor filter_tensor;
+  filter_tensor.dl_tensor.data               = filter_bitmap.data();
+  filter_tensor.dl_tensor.device.device_type = kDLCUDA;
+  filter_tensor.dl_tensor.ndim               = 1;
+  filter_tensor.dl_tensor.dtype.code         = kDLUInt;
+  filter_tensor.dl_tensor.dtype.bits         = 32;
+  filter_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t filter_shape[1]                    = {bitmap_size};
+  filter_tensor.dl_tensor.shape              = filter_shape;
+  filter_tensor.dl_tensor.strides            = NULL;
+
+  cuvsFilter filter;
+  filter.type = BITMAP;
+  filter.addr = (uintptr_t)&filter_tensor;
+
+  // search index with filter
+  cuvsIvfPqSearchParams_t search_params;
+  cuvsIvfPqSearchParamsCreate(&search_params);
+  search_params->n_probes = n_probes;
+  cuvsIvfPqSearch(res, search_params, index, &queries_tensor, &neighbors_tensor, &distances_tensor, filter);
+
+  // Verify all returned neighbors are odd indices (not filtered out)
+  std::vector<int64_t> neighbors_h(n_queries * n_neighbors);
+  raft::copy(neighbors_h.data(), neighbors_data.data(), n_queries * n_neighbors, stream);
+  raft::resource::sync_stream(handle);
+
+  for (size_t i = 0; i < n_queries * n_neighbors; ++i) {
+    // All neighbors should be odd indices (since even indices are filtered)
+    ASSERT_TRUE(neighbors_h[i] % 2 == 1 || neighbors_h[i] == -1)
+      << "Neighbor at position " << i << " has value " << neighbors_h[i]
+      << " which is an even index (should be filtered)";
+  }
+
+  // de-allocate index and res
+  cuvsIvfPqSearchParamsDestroy(search_params);
+  cuvsIvfPqIndexParamsDestroy(build_params);
+  cuvsIvfPqIndexDestroy(index);
+  cuvsResourcesDestroy(res);
+}
diff --git a/c/tests/neighbors/run_ivf_pq_c.c b/c/tests/neighbors/run_ivf_pq_c.c
index 64154fb3c0..1006f1e8a9 100644
--- a/c/tests/neighbors/run_ivf_pq_c.c
+++ b/c/tests/neighbors/run_ivf_pq_c.c
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -80,11 +80,15 @@ void run_ivf_pq(int64_t n_rows,
   distances_tensor.dl_tensor.shape              = distances_shape;
   distances_tensor.dl_tensor.strides            = NULL;
 
+  cuvsFilter filter;
+  filter.type = NO_FILTER;
+  filter.addr = (uintptr_t)NULL;
+
   // search index
   cuvsIvfPqSearchParams_t search_params;
   cuvsIvfPqSearchParamsCreate(&search_params);
   search_params->n_probes = n_probes;
-  cuvsIvfPqSearch(res, search_params, index, &queries_tensor, &neighbors_tensor, &distances_tensor);
+  cuvsIvfPqSearch(res, search_params, index, &queries_tensor, &neighbors_tensor, &distances_tensor, filter);
 
   // de-allocate index and res
   cuvsIvfPqSearchParamsDestroy(search_params);
diff --git a/cpp/include/cuvs/cluster/agglomerative.hpp b/cpp/include/cuvs/cluster/agglomerative.hpp
index 050f0cecdc..b0a65b4369 100644
--- a/cpp/include/cuvs/cluster/agglomerative.hpp
+++ b/cpp/include/cuvs/cluster/agglomerative.hpp
@@ -129,7 +129,7 @@ struct distance_params {
 /** Specialized parameters to build the Mutual Reachability graph */
 struct mutual_reachability_params {
   /** this neighborhood will be selected for core distances. */
-  int min_samples;
+  int min_samples = 5;
 
   /** weight applied when internal distance is chosen for mutual reachability (value of 1.0 disables
    * the weighting) */
diff --git a/cpp/include/cuvs/distance/distance.hpp b/cpp/include/cuvs/distance/distance.hpp
index 13c8c7bd7e..7eb4527878 100644
--- a/cpp/include/cuvs/distance/distance.hpp
+++ b/cpp/include/cuvs/distance/distance.hpp
@@ -93,10 +93,10 @@ enum KernelType { LINEAR, POLYNOMIAL, RBF, TANH };
  */
 struct KernelParams {
   // Kernel function parameters
-  KernelType kernel;  //!< Type of the kernel function
-  int degree;         //!< Degree of polynomial kernel (ignored by others)
-  double gamma;       //!< multiplier in the
-  double coef0;       //!< additive constant in poly and tanh kernels
+  KernelType kernel = KernelType::LINEAR;  //!< Type of the kernel function
+  int degree        = 3;                   //!< Degree of polynomial kernel (ignored by others)
+  double gamma      = 1.0;                 //!< multiplier in the
+  double coef0      = 0.0;                 //!< additive constant in poly and tanh kernels
 };
 }  // end namespace kernels
 
diff --git a/cpp/include/cuvs/distance/grammian.hpp b/cpp/include/cuvs/distance/grammian.hpp
index 4cb3aa47b6..a97316f9ce 100644
--- a/cpp/include/cuvs/distance/grammian.hpp
+++ b/cpp/include/cuvs/distance/grammian.hpp
@@ -36,15 +36,17 @@ using csr_input_matrix_view_t = raft::device_csr_matrix_view<const math_t, int,
 template <typename math_t>
 class GramMatrixBase {
  protected:
-  cublasHandle_t cublas_handle;
+  cublasHandle_t cublas_handle = nullptr;
   bool legacy_interface;
 
  public:
-  GramMatrixBase() : legacy_interface(false) {};
+  GramMatrixBase() : legacy_interface(false) {}
   [[deprecated]] GramMatrixBase(cublasHandle_t cublas_handle)
-    : cublas_handle(cublas_handle), legacy_interface(true) {};
+    : cublas_handle(cublas_handle), legacy_interface(true)
+  {
+  }
 
-  virtual ~GramMatrixBase() {};
+  virtual ~GramMatrixBase() = default;
 
   /** Convenience function to evaluate the Gram matrix for two vector sets.
    *  Vector sets are provided in Matrix format
@@ -320,10 +322,14 @@ class PolynomialKernel : public GramMatrixBase<math_t> {
    * @param offset
    */
   PolynomialKernel(exp_t exponent, math_t gain, math_t offset)
-    : GramMatrixBase<math_t>(), exponent(exponent), gain(gain), offset(offset) {};
+    : GramMatrixBase<math_t>(), exponent(exponent), gain(gain), offset(offset)
+  {
+  }
 
   [[deprecated]] PolynomialKernel(exp_t exponent, math_t gain, math_t offset, cublasHandle_t handle)
-    : GramMatrixBase<math_t>(handle), exponent(exponent), gain(gain), offset(offset) {};
+    : GramMatrixBase<math_t>(handle), exponent(exponent), gain(gain), offset(offset)
+  {
+  }
 
   /** Evaluate kernel matrix using polynomial kernel.
    *
@@ -436,7 +442,9 @@ class TanhKernel : public GramMatrixBase<math_t> {
   TanhKernel(math_t gain, math_t offset) : GramMatrixBase<math_t>(), gain(gain), offset(offset) {}
 
   [[deprecated]] TanhKernel(math_t gain, math_t offset, cublasHandle_t handle)
-    : GramMatrixBase<math_t>(handle), gain(gain), offset(offset) {};
+    : GramMatrixBase<math_t>(handle), gain(gain), offset(offset)
+  {
+  }
 
   /** Evaluate kernel matrix using tanh kernel.
    *
@@ -551,10 +559,12 @@ class RBFKernel : public GramMatrixBase<math_t> {
    * @tparam math_t floating point type
    * @param gain
    */
-  RBFKernel(math_t gain) : GramMatrixBase<math_t>(), gain(gain) {};
+  RBFKernel(math_t gain) : GramMatrixBase<math_t>(), gain(gain) {}
 
   [[deprecated]] RBFKernel(math_t gain, cublasHandle_t handle)
-    : GramMatrixBase<math_t>(handle), gain(gain) {};
+    : GramMatrixBase<math_t>(handle), gain(gain)
+  {
+  }
 
   void matrixRowNormL2(raft::resources const& handle,
                        dense_input_matrix_view_t<math_t> matrix,
diff --git a/cpp/include/cuvs/neighbors/ball_cover.hpp b/cpp/include/cuvs/neighbors/ball_cover.hpp
index 17fa94bfd7..77a1bc1d7c 100644
--- a/cpp/include/cuvs/neighbors/ball_cover.hpp
+++ b/cpp/include/cuvs/neighbors/ball_cover.hpp
@@ -135,7 +135,7 @@ struct index : cuvs::neighbors::index {
   raft::device_matrix<float, int64_t, raft::row_major> X_reordered;
 
  protected:
-  bool index_trained;
+  bool index_trained = false;
 };
 
 /** @} */
diff --git a/cpp/include/cuvs/neighbors/common.hpp b/cpp/include/cuvs/neighbors/common.hpp
index 3909098398..d9cefea4d9 100644
--- a/cpp/include/cuvs/neighbors/common.hpp
+++ b/cpp/include/cuvs/neighbors/common.hpp
@@ -498,7 +498,7 @@ namespace filtering {
 enum class FilterType { None, Bitmap, Bitset };
 
 struct base_filter {
-  ~base_filter()                             = default;
+  virtual ~base_filter()                     = default;
   virtual FilterType get_filter_type() const = 0;
 };
 
@@ -972,7 +972,7 @@ struct mg_index {
   auto operator=(mg_index&&) -> mg_index&      = default;
 
   distribution_mode mode_;
-  int num_ranks_;
+  int num_ranks_ = 0;
   std::vector<iface<AnnIndexType, T, IdxT>> ann_interfaces_;
 
   // for load balancing mechanism
diff --git a/cpp/include/cuvs/neighbors/hnsw.hpp b/cpp/include/cuvs/neighbors/hnsw.hpp
index 7ee91f18ba..26137d6669 100644
--- a/cpp/include/cuvs/neighbors/hnsw.hpp
+++ b/cpp/include/cuvs/neighbors/hnsw.hpp
@@ -738,9 +738,9 @@ void extend(raft::resources const& res,
  */
 
 struct search_params : cuvs::neighbors::search_params {
-  int ef;               // size of the candidate list
-  int num_threads = 0;  // number of host threads to use for concurrent searches. Value of 0
-                        // automatically maximizes parallelism
+  int ef          = 200;  // size of the candidate list
+  int num_threads = 0;    // number of host threads to use for concurrent searches. Value of 0
+                          // automatically maximizes parallelism
 };
 
 /**
diff --git a/cpp/include/cuvs/neighbors/vamana.hpp b/cpp/include/cuvs/neighbors/vamana.hpp
index c3ba86d5b6..1c34c1f801 100644
--- a/cpp/include/cuvs/neighbors/vamana.hpp
+++ b/cpp/include/cuvs/neighbors/vamana.hpp
@@ -33,8 +33,8 @@ namespace cuvs::neighbors::vamana {
  */
 template <typename T = float>
 struct codebook_params {
-  int pq_codebook_size;
-  int pq_dim;
+  int pq_codebook_size = 0;
+  int pq_dim           = 0;
   std::vector<T> pq_encoding_table;
   std::vector<T> rotation_matrix;
 };
diff --git a/cpp/include/cuvs/preprocessing/spectral_embedding.hpp b/cpp/include/cuvs/preprocessing/spectral_embedding.hpp
index e7a578d2ab..5575b1bb38 100644
--- a/cpp/include/cuvs/preprocessing/spectral_embedding.hpp
+++ b/cpp/include/cuvs/preprocessing/spectral_embedding.hpp
@@ -24,10 +24,10 @@ namespace cuvs::preprocessing::spectral_embedding {
  */
 struct params {
   /** @brief The number of components to reduce the data to. */
-  int n_components;
+  int n_components = 2;
 
   /** @brief The number of neighbors to use for the nearest neighbors graph. */
-  int n_neighbors;
+  int n_neighbors = 15;
 
   /**
    * @brief Whether to normalize the Laplacian matrix.
@@ -36,7 +36,7 @@ struct params {
    * If false, uses the unnormalized graph Laplacian (L = D - W).
    * Normalized Laplacian often leads to better results for clustering tasks.
    */
-  bool norm_laplacian;
+  bool norm_laplacian = true;
 
   /**
    * @brief Whether to drop the first eigenvector.
@@ -45,7 +45,7 @@ struct params {
    * uninformative. Setting this to true drops it from the embedding.
    * This is typically set to true when norm_laplacian is true.
    */
-  bool drop_first;
+  bool drop_first = true;
 
   /**
    * @brief Tolerance for the eigenvalue solver.
diff --git a/cpp/src/neighbors/detail/ann_utils.cuh b/cpp/src/neighbors/detail/ann_utils.cuh
index 82bd6e755a..e3c7e120da 100644
--- a/cpp/src/neighbors/detail/ann_utils.cuh
+++ b/cpp/src/neighbors/detail/ann_utils.cuh
@@ -136,7 +136,7 @@ struct config<float> {
 };
 template <>
 struct config<half> {
-  using value_t                    = half;
+  using value_t                    = float;
   static constexpr double kDivisor = 1.0;
 };
 template <>
diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index 9d3198eb74..415ebc635c 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -2080,7 +2080,7 @@ auto iterative_build_graph(
       curr_itopk_size = curr_topk + 32;
     }
 
-    RAFT_LOG_INFO(
+    RAFT_LOG_DEBUG(
       "# graph_size = %lu (%.3lf), graph_degree = %lu, query_size = %lu, itopk = %lu, topk = %lu",
       (uint64_t)cagra_graph.extent(0),
       (double)cagra_graph.extent(0) / final_graph_size,
@@ -2146,7 +2146,7 @@ auto iterative_build_graph(
 
     auto end        = std::chrono::high_resolution_clock::now();
     auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
-    RAFT_LOG_INFO("# elapsed time: %.3lf sec", (double)elapsed_ms / 1000);
+    RAFT_LOG_DEBUG("# elapsed time: %.3lf sec", (double)elapsed_ms / 1000);
 
     if (flag_last) { break; }
     flag_last       = (curr_graph_size == final_graph_size);
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp
index f9974fa3df..22b325fca5 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp
+++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
@@ -235,6 +235,7 @@ struct dataset_descriptor_host {
     std::mutex mutex;
     std::atomic<bool> ready;  // Not sure if std::holds_alternative is thread-safe
     std::variant<ready_t, init_f> value;
+    cudaEvent_t init_event{nullptr};
 
     template <typename InitF>
     state(InitF init, size_t size) : ready{false}, value{std::make_tuple(init, size)}
@@ -247,6 +248,7 @@ struct dataset_descriptor_host {
         auto& [ptr, stream] = std::get<ready_t>(value);
         RAFT_CUDA_TRY_NO_THROW(cudaFreeAsync(ptr, stream));
       }
+      if (init_event != nullptr) { RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(init_event)); }
     }
 
     void eval(rmm::cuda_stream_view stream)
@@ -255,8 +257,12 @@ struct dataset_descriptor_host {
       if (std::holds_alternative<init_f>(value)) {
         auto& [fun, size]     = std::get<init_f>(value);
         dev_descriptor_t* ptr = nullptr;
+        RAFT_CUDA_TRY(cudaEventCreateWithFlags(&init_event, cudaEventDisableTiming));
         RAFT_CUDA_TRY(cudaMallocAsync(&ptr, size, stream));
         fun(ptr, stream);
+        // Record an event after initialization so that other streams can establish
+        // a GPU-side dependency without expensive host synchronization.
+        RAFT_CUDA_TRY(cudaEventRecord(init_event, stream));
         value = std::make_tuple(ptr, stream);
         ready.store(true, std::memory_order_release);
       }
@@ -265,6 +271,11 @@ struct dataset_descriptor_host {
     auto get(rmm::cuda_stream_view stream) -> dev_descriptor_t*
     {
       if (!ready.load(std::memory_order_acquire)) { eval(stream); }
+      // Make the caller's stream wait for the init to complete. This is a
+      // lightweight GPU-side dependency with no host blocking.  On the same
+      // stream that performed the init (or after the event has already
+      // completed) this is essentially a no-op.
+      if (init_event != nullptr) { RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, init_event)); }
       return std::get<0>(std::get<ready_t>(value));
     }
   };
diff --git a/cpp/src/neighbors/detail/cagra/search_plan.cuh b/cpp/src/neighbors/detail/cagra/search_plan.cuh
index 9cc6aeb353..b67044ce36 100644
--- a/cpp/src/neighbors/detail/cagra/search_plan.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_plan.cuh
@@ -110,7 +110,13 @@ struct search_plan_impl_base : public search_params {
   {
     if (algo == search_algo::AUTO) {
       const size_t num_sm = raft::getMultiProcessorCount();
-      if (itopk_size <= 512 && search_params::max_queries >= num_sm * 2lu) {
+      // SINGLE_CTA computes max_iterations as itopk_size / search_width.
+      // When search_width > 1, this results in significantly fewer iterations
+      // compared to MULTI_CTA (which uses its own internal mc_search_width=1).
+      // This causes recall degradation when AUTO switches to SINGLE_CTA at
+      // large batch sizes. Only select SINGLE_CTA when search_width <= 1 to
+      // ensure consistent recall across batch sizes. (See #1187)
+      if (itopk_size <= 512 && search_width <= 1 && search_params::max_queries >= num_sm * 2lu) {
         algo = search_algo::SINGLE_CTA;
         RAFT_LOG_DEBUG("Auto strategy: selecting single-cta");
       } else {
diff --git a/cpp/src/neighbors/detail/smem_utils.cuh b/cpp/src/neighbors/detail/smem_utils.cuh
index 41c95c0ccd..ebc0e94495 100644
--- a/cpp/src/neighbors/detail/smem_utils.cuh
+++ b/cpp/src/neighbors/detail/smem_utils.cuh
@@ -5,7 +5,6 @@
 #pragma once
 
 #include <raft/core/error.hpp>
-
 #include <atomic>
 #include <cstdint>
 #include <mutex>
@@ -14,14 +13,19 @@ namespace cuvs::neighbors::detail {
 
 /**
  * @brief (Thread-)Safely invoke a kernel with a maximum dynamic shared memory size.
- * This is required because the sequence `cudaFuncSetAttribute` + kernel launch is not executed
- * atomically.
  *
- * Used this way, the cudaFuncAttributeMaxDynamicSharedMemorySize can only grow and thus
- * guarantees that the kernel is safe to launch.
+ * Maintains a monotonically growing high-water mark for
+ * `cudaFuncAttributeMaxDynamicSharedMemorySize`. When the kernel function pointer changes, the new
+ * kernel is brought up to the current high-water mark; when smem_size exceeds the high-water mark,
+ * it is grown for the current kernel. This guarantees every kernel's attribute is always >=
+ * smem_size at the time of launch.
+ *
+ * NB: cudaFuncSetAttribute is per kernel function pointer value, not per type. Multiple kernel
+ * template instantiations may share the same KernelT type (e.g. function pointers with the same
+ * signature), so we track the kernel identity alongside the smem high-water mark.
  *
  * @tparam KernelT The type of the kernel.
- * @tparam InvocationT The type of the invocation function.
+ * @tparam KernelLauncherT The type of the launch function/lambda.
  * @param kernel The kernel function address (for whom the smem-size is specified).
  * @param smem_size The size of the dynamic shared memory to be set.
  * @param launch The kernel launch function/lambda.
@@ -31,31 +35,41 @@ void safely_launch_kernel_with_smem_size(KernelT const& kernel,
                                          uint32_t smem_size,
                                          KernelLauncherT const& launch)
 {
-  // the last smem size is parameterized by the kernel thanks to the template parameter.
-  static std::atomic<uint32_t> current_smem_size{0};
-  auto last_smem_size = current_smem_size.load(std::memory_order_relaxed);
-  if (smem_size > last_smem_size) {
-    // We still need a mutex for the critical section: actualize last_smem_size and set the
-    // attribute.
-    static auto mutex = std::mutex{};
-    auto guard        = std::lock_guard<std::mutex>{mutex};
-    if (!current_smem_size.compare_exchange_strong(
-          last_smem_size, smem_size, std::memory_order_relaxed, std::memory_order_relaxed)) {
-      // The value has been updated by another thread between the load and the mutex acquisition.
-      if (smem_size > last_smem_size) {
-        current_smem_size.store(smem_size, std::memory_order_relaxed);
-      }
-    }
-    // Only update if the last seen value is smaller than the new one.
-    if (smem_size > last_smem_size) {
-      auto launch_status =
-        cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
-      RAFT_EXPECTS(launch_status == cudaSuccess,
-                   "Failed to set max dynamic shared memory size to %u bytes",
-                   smem_size);
-    }
+  // last_smem_size is a monotonically growing high-water mark across all kernel pointers.
+  // last_kernel tracks which kernel pointer was last used.
+  static std::atomic<uint32_t> last_smem_size{0};
+  static std::atomic<KernelT> last_kernel{KernelT{}};
+  static std::mutex mutex;
+  bool updated_needed = false;
+  // When the kernel function pointer changes, bring the new kernel up to the global high-water
+  // mark. This is necessary because cudaFuncSetAttribute applies to a specific function pointer,
+  // not to the pointer type — different template instantiations may share the same KernelT.
+  if (kernel != last_kernel.load(std::memory_order_relaxed))
+  {
+    last_kernel.store(kernel, std::memory_order_relaxed);
+    updated_needed = true;
+  }
+  // Since we first read the kernel pointer, and the shem_size can only grow,
+  // reading an inconsistent state is safe. At worst we will use a larger smem_size
+  uint32_t cur_smem_size = last_smem_size.load(std::memory_order_relaxed);
+  if (smem_size > cur_smem_size)
+  {
+    last_smem_size.store(smem_size, std::memory_order_relaxed);
+    cur_smem_size = smem_size;
+    updated_needed = true;
+  }
+  // Mutex-protected cudaFuncSetAttribute
+  if (updated_needed)
+  {
+    std::lock_guard<std::mutex> guard(mutex);
+    auto launch_status =
+      cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, cur_smem_size);
+    RAFT_EXPECTS(launch_status == cudaSuccess,
+                 "Failed to set max dynamic shared memory size to %u bytes",
+                 cur_smem_size);
   }
-  // We don't need to guard the kernel launch because the smem_size can only grow.
+  // The kernel launch is outside the lock: any concurrent cudaFuncSetAttribute can only increase
+  // the limit, so the launch is always safe.
   return launch(kernel);
 }
 
diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan.cuh b/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan.cuh
index 4c0bb3644a..4e6c96f68c 100644
--- a/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan.cuh
+++ b/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan.cuh
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -122,8 +122,8 @@ struct loadAndComputeDist {
       for (int k = 0; k < Veclen; ++k) {
         compute_dist(dist, queryRegs[k], encV[k]);
         if constexpr (ComputeNorm) {
-          norm_query += queryRegs[k] * queryRegs[k];
-          norm_data += encV[k] * encV[k];
+          norm_query += (AccT)(queryRegs[k] * queryRegs[k]);
+          norm_data += (AccT)(encV[k] * encV[k]);
         }
       }
     }
@@ -157,8 +157,8 @@ struct loadAndComputeDist {
           T q = raft::shfl(queryReg, d + k, raft::WarpSize);
           compute_dist(dist, q, encV[k]);
           if constexpr (ComputeNorm) {
-            norm_query += q * q;
-            norm_data += encV[k] * encV[k];
+            norm_query += (AccT)(q * q);
+            norm_data += (AccT)(encV[k] * encV[k]);
           }
         }
       }
@@ -183,8 +183,8 @@ struct loadAndComputeDist {
         T q = raft::shfl(queryReg, d + k, raft::WarpSize);
         compute_dist(dist, q, enc[k]);
         if constexpr (ComputeNorm) {
-          norm_query += q * q;
-          norm_data += enc[k] * enc[k];
+          norm_query += (AccT)(q * q);
+          norm_data += (AccT)(enc[k] * enc[k]);
         }
       }
     }
@@ -875,7 +875,10 @@ RAFT_KERNEL __launch_bounds__(kThreadsPerBlock)
       uint32_t sample_offset = 0;
       if (probe_id > 0) { sample_offset = chunk_indices[probe_id - 1]; }
       assert(list_length == chunk_indices[probe_id] - sample_offset);
-      assert(sample_offset + list_length <= max_samples);
+      if constexpr (!kManageLocalTopK) {
+        // max_samples is zero/unused in the kManageLocalTopK mode
+        assert(sample_offset + list_length <= max_samples);
+      }
 
       constexpr int kUnroll        = raft::WarpSize / Veclen;
       constexpr uint32_t kNumWarps = kThreadsPerBlock / raft::WarpSize;
diff --git a/cpp/src/neighbors/ivf_flat/jit_lto_kernels/ivf_flat_interleaved_scan_kernel.cuh b/cpp/src/neighbors/ivf_flat/jit_lto_kernels/ivf_flat_interleaved_scan_kernel.cuh
index bf86fbb118..d48a6578fd 100644
--- a/cpp/src/neighbors/ivf_flat/jit_lto_kernels/ivf_flat_interleaved_scan_kernel.cuh
+++ b/cpp/src/neighbors/ivf_flat/jit_lto_kernels/ivf_flat_interleaved_scan_kernel.cuh
@@ -833,7 +833,10 @@ __device__ __forceinline__ void interleaved_scan_kernel_impl(const uint32_t quer
       uint32_t sample_offset = 0;
       if (probe_id > 0) { sample_offset = chunk_indices[probe_id - 1]; }
       assert(list_length == chunk_indices[probe_id] - sample_offset);
-      assert(sample_offset + list_length <= max_samples);
+      if constexpr (!kManageLocalTopK) {
+        // max_samples is zero/unused in the kManageLocalTopK mode
+        assert(sample_offset + list_length <= max_samples);
+      }
 
       constexpr int kUnroll        = raft::WarpSize / Veclen;
       constexpr uint32_t kNumWarps = kThreadsPerBlock / raft::WarpSize;
diff --git a/examples/c/src/ivf_pq_c_example.c b/examples/c/src/ivf_pq_c_example.c
index 9ec8221431..3620543015 100644
--- a/examples/c/src/ivf_pq_c_example.c
+++ b/examples/c/src/ivf_pq_c_example.c
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -57,9 +57,14 @@ void ivf_pq_build_search(cuvsResources_t* res,
   search_params->internal_distance_dtype = CUDA_R_16F;
   search_params->lut_dtype               = CUDA_R_16F;
 
+  // Create filter (no filtering)
+  cuvsFilter filter;
+  filter.type = NO_FILTER;
+  filter.addr = (uintptr_t)NULL;
+
   // Search the `index` built using `cuvsIvfPqBuild`
   CHECK_CUVS(cuvsIvfPqSearch(
-    *res, search_params, index, queries_tensor, &neighbors_tensor, &distances_tensor));
+    *res, search_params, index, queries_tensor, &neighbors_tensor, &distances_tensor, filter));
 
   int64_t* neighbors = (int64_t*)malloc(n_queries * topk * sizeof(int64_t));
   float* distances   = (float*)malloc(n_queries * topk * sizeof(float));
diff --git a/go/ivf_pq/ivf_pq.go b/go/ivf_pq/ivf_pq.go
index cbbec629d1..c6bf5852b3 100644
--- a/go/ivf_pq/ivf_pq.go
+++ b/go/ivf_pq/ivf_pq.go
@@ -68,6 +68,10 @@ func SearchIndex[T any](Resources cuvs.Resource, params *SearchParams, index *Iv
 	if !index.trained {
 		return errors.New("index needs to be built before calling search")
 	}
+	prefilter := C.cuvsFilter{
+		addr:  0,
+		_type: C.NO_FILTER,
+	}
 
-	return cuvs.CheckCuvs(cuvs.CuvsError(C.cuvsIvfPqSearch(C.cuvsResources_t(Resources.Resource), params.params, index.index, (*C.DLManagedTensor)(unsafe.Pointer(queries.C_tensor)), (*C.DLManagedTensor)(unsafe.Pointer(neighbors.C_tensor)), (*C.DLManagedTensor)(unsafe.Pointer(distances.C_tensor)))))
+	return cuvs.CheckCuvs(cuvs.CuvsError(C.cuvsIvfPqSearch(C.cuvsResources_t(Resources.Resource), params.params, index.index, (*C.DLManagedTensor)(unsafe.Pointer(queries.C_tensor)), (*C.DLManagedTensor)(unsafe.Pointer(neighbors.C_tensor)), (*C.DLManagedTensor)(unsafe.Pointer(distances.C_tensor)), prefilter)))
 }
diff --git a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pxd b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pxd
index 24bfcc62d7..caaf660460 100644
--- a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pxd
+++ b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pxd
@@ -10,6 +10,7 @@ from libcpp cimport bool
 from cuvs.common.c_api cimport cuvsError_t, cuvsResources_t
 from cuvs.common.cydlpack cimport DLDataType, DLManagedTensor
 from cuvs.distance_type cimport cuvsDistanceType
+from cuvs.neighbors.filters.filters cimport cuvsFilter
 
 
 cdef extern from "library_types.h":
@@ -141,7 +142,8 @@ cdef extern from "cuvs/neighbors/ivf_pq.h" nogil:
                                 cuvsIvfPqIndex_t index,
                                 DLManagedTensor* queries,
                                 DLManagedTensor* neighbors,
-                                DLManagedTensor* distances)
+                                DLManagedTensor* distances,
+                                cuvsFilter filter)
 
     cuvsError_t cuvsIvfPqSerialize(cuvsResources_t res,
                                    const char * filename,
diff --git a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx
index 3bc96b44eb..509df6801e 100644
--- a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx
+++ b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx
@@ -35,6 +35,7 @@ from libc.stdint cimport (
 )
 
 from cuvs.common.exceptions import check_cuvs
+from cuvs.neighbors.filters import no_filter
 
 
 cdef class IndexParams:
@@ -748,7 +749,8 @@ def search(SearchParams search_params,
            k,
            neighbors=None,
            distances=None,
-           resources=None):
+           resources=None,
+           filter=None):
     """
     Find the k nearest neighbors for each query.
 
@@ -769,6 +771,7 @@ def search(SearchParams search_params,
                 (n_queries, k) If supplied, the distances to the
                 neighbors will be written here in-place. (default None)
     {resources_docstring}
+    filter : Optional cuvs.neighbors.filters.Filter for prefiltering
 
     Examples
     --------
@@ -826,6 +829,9 @@ def search(SearchParams search_params,
         cydlpack.dlpack_c(distances_cai)
     cdef cuvsResources_t res = <cuvsResources_t>resources.get_c_obj()
 
+    if filter is None:
+        filter = no_filter()
+
     with cuda_interruptible():
         check_cuvs(cuvsIvfPqSearch(
             res,
@@ -833,7 +839,8 @@ def search(SearchParams search_params,
             index.index,
             queries_dlpack,
             neighbors_dlpack,
-            distances_dlpack
+            distances_dlpack,
+            filter.prefilter
         ))
 
     return (distances, neighbors)
diff --git a/python/cuvs/cuvs/tests/test_cagra_batch_recall.py b/python/cuvs/cuvs/tests/test_cagra_batch_recall.py
new file mode 100644
index 0000000000..08755d73e0
--- /dev/null
+++ b/python/cuvs/cuvs/tests/test_cagra_batch_recall.py
@@ -0,0 +1,202 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Regression tests for CAGRA search recall consistency across query batch sizes.
+# See https://github.com/rapidsai/cuvs/issues/1187
+#
+# When the AUTO algorithm selector switches from MULTI_CTA to SINGLE_CTA
+# at larger batch sizes, SINGLE_CTA computes max_iterations = itopk_size /
+# search_width. With search_width > 1, this gives far fewer iterations than
+# MULTI_CTA (which uses internal mc_search_width=1), causing a recall cliff.
+# The fix prevents SINGLE_CTA selection when search_width > 1.
+
+import numpy as np
+import pytest
+from pylibraft.common import device_ndarray
+
+from cuvs.neighbors import cagra
+
+
+def calc_recall(found_indices, ground_truth, k):
+    """Calculate recall@k between found indices and ground truth."""
+    n_queries = found_indices.shape[0]
+    correct = 0
+    total = n_queries * k
+    for i in range(n_queries):
+        correct += len(
+            set(found_indices[i, :k].tolist())
+            & set(ground_truth[i, :k].tolist())
+        )
+    return correct / total
+
+
+def search_in_batches(search_params, index, queries, k, batch_size):
+    """Search the index in fixed-size batches, return host neighbors array."""
+    n_queries = queries.shape[0]
+    all_neighbors = np.zeros((n_queries, k), dtype=np.uint32)
+    for start in range(0, n_queries, batch_size):
+        end = min(start + batch_size, n_queries)
+        q = device_ndarray(queries[start:end])
+        _, neighbors = cagra.search(search_params, index, q, k=k)
+        neighbors_host = neighbors.copy_to_host()
+        all_neighbors[start:end] = neighbors_host
+    return all_neighbors
+
+
+@pytest.fixture(scope="module")
+def cagra_test_data():
+    """Build CAGRA index and brute-force ground truth once for all tests."""
+    np.random.seed(42)
+    n_samples = 50000
+    n_queries = 512
+    dim = 64
+    k = 10
+
+    dataset = np.random.randn(n_samples, dim).astype(np.float32)
+    queries = np.random.randn(n_queries, dim).astype(np.float32)
+    dataset_device = device_ndarray(dataset)
+
+    build_params = cagra.IndexParams(
+        graph_degree=32, intermediate_graph_degree=64
+    )
+    index = cagra.build(build_params, dataset_device)
+
+    # Brute-force ground truth via NumPy (L2 distance), chunked to avoid OOM.
+    # Each chunk creates a (chunk_size x n_samples x dim) array.
+    # chunk_size=16 => 16 * 50000 * 64 * 4 bytes = ~200MB per chunk.
+    gt_neighbors = np.zeros((n_queries, k), dtype=np.uint32)
+    chunk_size = 16
+    for start in range(0, n_queries, chunk_size):
+        end = min(start + chunk_size, n_queries)
+        q_chunk = queries[start:end]
+        dists = np.sum(
+            (q_chunk[:, None, :] - dataset[None, :, :]) ** 2, axis=2
+        )
+        gt_neighbors[start:end] = np.argsort(dists, axis=1)[:, :k].astype(
+            np.uint32
+        )
+        del dists
+
+    return {
+        "index": index,
+        "dataset_device": dataset_device,  # keep alive for index search
+        "queries": queries,
+        "gt_neighbors": gt_neighbors,
+        "n_queries": n_queries,
+        "k": k,
+    }
+
+
+@pytest.mark.parametrize("search_width", [1, 4, 8])
+def test_cagra_batch_recall_consistency(cagra_test_data, search_width):
+    """
+    Recall must be consistent across batch sizes regardless of search_width.
+
+    The AUTO selector may switch algorithms at larger batch sizes. This test
+    verifies that the switch does not cause a recall cliff. Without the fix,
+    search_width=8 drops from ~0.87 recall (MULTI_CTA) to ~0.65 (SINGLE_CTA)
+    at batch_size >= 256 (on machines with >= 128 SMs, fewer for smaller GPUs).
+    """
+    data = cagra_test_data
+    search_params = cagra.SearchParams(
+        itopk_size=64, search_width=search_width
+    )
+
+    batch_sizes = [32, 64, 128, 256, 512]
+    recalls = []
+
+    for batch_size in batch_sizes:
+        neighbors = search_in_batches(
+            search_params,
+            data["index"],
+            data["queries"],
+            data["k"],
+            batch_size,
+        )
+        recall = calc_recall(neighbors, data["gt_neighbors"], data["k"])
+        recalls.append(recall)
+
+    recall_std = np.std(recalls)
+    recall_range = max(recalls) - min(recalls)
+
+    assert recall_std < 0.02, (
+        f"search_width={search_width}: recall varies too much across batch "
+        f"sizes (std={recall_std:.4f}). "
+        + ", ".join(f"bs={bs}:{r:.4f}" for bs, r in zip(batch_sizes, recalls))
+    )
+    assert recall_range < 0.05, (
+        f"search_width={search_width}: recall range too wide "
+        f"({recall_range:.4f}). "
+        + ", ".join(f"bs={bs}:{r:.4f}" for bs, r in zip(batch_sizes, recalls))
+    )
+
+
+@pytest.mark.parametrize(
+    "search_width,min_expected_recall",
+    [(1, 0.4), (4, 0.6), (8, 0.7)],
+)
+def test_cagra_search_width_recall_quality(
+    cagra_test_data, search_width, min_expected_recall
+):
+    """
+    Higher search_width must yield higher recall, not lower.
+
+    SINGLE_CTA with search_width > 1 previously got fewer iterations,
+    producing LOWER recall with HIGHER search_width — the opposite of
+    expected behavior. This test catches that inversion.
+    """
+    data = cagra_test_data
+    search_params = cagra.SearchParams(
+        itopk_size=64, search_width=search_width
+    )
+
+    # Use batch_size=512 to trigger AUTO algorithm selection
+    neighbors = search_in_batches(
+        search_params,
+        data["index"],
+        data["queries"],
+        data["k"],
+        batch_size=512,
+    )
+    recall = calc_recall(neighbors, data["gt_neighbors"], data["k"])
+
+    assert recall >= min_expected_recall, (
+        f"search_width={search_width}: recall={recall:.4f} is below "
+        f"minimum expected {min_expected_recall:.4f} at batch_size=512. "
+        f"This suggests SINGLE_CTA may be selected with too few iterations."
+    )
+
+
+def test_cagra_search_width_monotonicity(cagra_test_data):
+    """
+    Recall must increase (or stay flat) as search_width increases.
+
+    Wider search explores more neighbors per iteration. If recall decreases
+    with higher search_width, the algorithm is getting fewer effective
+    iterations — the exact bug this fix addresses.
+    """
+    data = cagra_test_data
+    search_widths = [1, 2, 4, 8]
+    recalls = []
+
+    for sw in search_widths:
+        search_params = cagra.SearchParams(itopk_size=64, search_width=sw)
+        neighbors = search_in_batches(
+            search_params,
+            data["index"],
+            data["queries"],
+            data["k"],
+            batch_size=512,
+        )
+        recall = calc_recall(neighbors, data["gt_neighbors"], data["k"])
+        recalls.append(recall)
+
+    # Each subsequent search_width should have recall >= previous - tolerance
+    tolerance = 0.02
+    for i in range(1, len(search_widths)):
+        assert recalls[i] >= recalls[i - 1] - tolerance, (
+            f"Recall decreased from sw={search_widths[i - 1]} "
+            f"({recalls[i - 1]:.4f}) to sw={search_widths[i]} "
+            f"({recalls[i]:.4f}) at batch_size=512. "
+            f"Higher search_width should not reduce recall."
+        )
diff --git a/rust/cuvs/examples/cagra.rs b/rust/cuvs/examples/cagra.rs
index b118c3abc5..80d866112f 100644
--- a/rust/cuvs/examples/cagra.rs
+++ b/rust/cuvs/examples/cagra.rs
@@ -10,7 +10,11 @@ use ndarray::s;
 use ndarray_rand::rand_distr::Uniform;
 use ndarray_rand::RandomExt;
 
-/// Example showing how to index and search data with CAGRA
+/// Example showing how to index and search data with CAGRA using the validated builder API.
+///
+/// `IndexParams::builder()` validates parameters before any GPU allocation, surfacing
+/// misconfiguration immediately with a clear error message instead of an opaque CUDA
+/// assertion 1-2 seconds into `Index::build()`.
 fn cagra_example() -> Result<()> {
     let res = Resources::new()?;
 
@@ -20,8 +24,14 @@ fn cagra_example() -> Result<()> {
     let dataset =
         ndarray::Array::<f32, _>::random((n_datapoints, n_features), Uniform::new(0., 1.0));
 
-    // build the cagra index
-    let build_params = IndexParams::new()?;
+    // Build the CAGRA index using the validated builder.
+    // Parameters are checked in Rust before any FFI call — invalid values (e.g.
+    // graph_degree=0) produce an error here, not inside Index::build().
+    let build_params = IndexParams::builder()
+        .graph_degree(32)
+        .intermediate_graph_degree(64)
+        .nn_descent_niter(20)
+        .build()?;
     let index = Index::build(&res, &build_params, &dataset)?;
     println!(
         "Indexed {}x{} datapoints into cagra index",
@@ -47,7 +57,7 @@ fn cagra_example() -> Result<()> {
 
     let search_params = SearchParams::new()?;
 
-    index.search(&res, &search_params, &queries, &neighbors, &distances)?;
+    index.search(&res, &search_params, &queries, &neighbors, &distances, None)?;
 
     // Copy back to host memory
     distances.to_host(&res, &mut distances_host)?;
diff --git a/rust/cuvs/src/brute_force.rs b/rust/cuvs/src/brute_force.rs
index 1440bb3205..9437f42c26 100644
--- a/rust/cuvs/src/brute_force.rs
+++ b/rust/cuvs/src/brute_force.rs
@@ -5,18 +5,68 @@
 //! Brute Force KNN
 
 use std::io::{stderr, Write};
+use std::marker::PhantomData;
 
 use crate::distance_type::DistanceType;
-use crate::dlpack::ManagedTensor;
+use crate::dlpack::{DatasetOwnership, ManagedTensor};
 use crate::error::{check_cuvs, Result};
 use crate::resources::Resources;
 
 /// Brute Force KNN Index
+///
+/// The brute force C library stores a non-owning view into the original dataset.
+/// The lifetime parameter `'a` ensures the dataset outlives the index when built
+/// with [`Index::build`]. Use [`Index::build_owned`] for a self-contained index
+/// that owns its dataset (e.g., after [`ManagedTensor::to_device`]).
+///
+/// # Examples
+///
+/// ## Borrowed dataset (compiler enforces lifetime)
+///
+/// ```no_run
+/// # use cuvs::{ManagedTensor, Resources};
+/// # use cuvs::brute_force::Index;
+/// # use cuvs::distance_type::DistanceType;
+/// let res = Resources::new().unwrap();
+/// let arr = ndarray::Array::<f32, _>::zeros((64, 8));
+/// let tensor = ManagedTensor::from(&arr);
+/// let index = Index::build(&res, DistanceType::L2Expanded, None, &tensor).unwrap();
+/// // arr and tensor must remain alive while index is in use
+/// ```
+///
+/// ## Owned dataset ('static lifetime)
+///
+/// ```no_run
+/// # use cuvs::{ManagedTensor, Resources};
+/// # use cuvs::brute_force::Index;
+/// # use cuvs::distance_type::DistanceType;
+/// let res = Resources::new().unwrap();
+/// let arr = ndarray::Array::<f32, _>::zeros((64, 8));
+/// let device_tensor = ManagedTensor::from(&arr).to_device(&res).unwrap();
+/// let index = Index::build_owned(&res, DistanceType::L2Expanded, None, device_tensor).unwrap();
+/// drop(arr); // Fine — index owns the device copy
+/// ```
 #[derive(Debug)]
-pub struct Index(ffi::cuvsBruteForceIndex_t);
+pub struct Index<'a> {
+    inner: ffi::cuvsBruteForceIndex_t,
+    _data: DatasetOwnership<'a>,
+}
 
-impl Index {
-    /// Builds a new Brute Force KNN Index from the dataset for efficient search.
+impl<'a> Index<'a> {
+    /// Creates a new FFI index handle.
+    fn create_handle() -> Result<ffi::cuvsBruteForceIndex_t> {
+        unsafe {
+            let mut index = std::mem::MaybeUninit::<ffi::cuvsBruteForceIndex_t>::uninit();
+            check_cuvs(ffi::cuvsBruteForceIndexCreate(index.as_mut_ptr()))?;
+            Ok(index.assume_init())
+        }
+    }
+
+    /// Builds a new Brute Force KNN Index from a borrowed dataset.
+    ///
+    /// The compiler enforces that `dataset` outlives the returned index,
+    /// preventing use-after-free when the C library dereferences its
+    /// internal view of the data.
     ///
     /// # Arguments
     ///
@@ -24,33 +74,34 @@ impl Index {
     /// * `metric` - DistanceType to use for building the index
     /// * `metric_arg` - Optional value of `p` for Minkowski distances
     /// * `dataset` - A row-major matrix on either the host or device to index
-    pub fn build<T: Into<ManagedTensor>>(
+    pub fn build(
         res: &Resources,
         metric: DistanceType,
         metric_arg: Option<f32>,
-        dataset: T,
-    ) -> Result<Index> {
-        let dataset: ManagedTensor = dataset.into();
-        let index = Index::new()?;
+        dataset: &'a ManagedTensor,
+    ) -> Result<Index<'a>> {
+        let inner = Self::create_handle()?;
         unsafe {
             check_cuvs(ffi::cuvsBruteForceBuild(
                 res.0,
                 dataset.as_ptr(),
                 metric,
                 metric_arg.unwrap_or(2.0),
-                index.0,
+                inner,
             ))?;
         }
-        Ok(index)
+        Ok(Index {
+            inner,
+            _data: DatasetOwnership::Borrowed(PhantomData),
+        })
     }
 
     /// Creates a new empty index
-    pub fn new() -> Result<Index> {
-        unsafe {
-            let mut index = std::mem::MaybeUninit::<ffi::cuvsBruteForceIndex_t>::uninit();
-            check_cuvs(ffi::cuvsBruteForceIndexCreate(index.as_mut_ptr()))?;
-            Ok(Index(index.assume_init()))
-        }
+    pub fn new() -> Result<Index<'a>> {
+        Ok(Index {
+            inner: Self::create_handle()?,
+            _data: DatasetOwnership::Borrowed(PhantomData),
+        })
     }
 
     /// Perform a Nearest Neighbors search on the Index
@@ -76,7 +127,7 @@ impl Index {
 
             check_cuvs(ffi::cuvsBruteForceSearch(
                 res.0,
-                self.0,
+                self.inner,
                 queries.as_ptr(),
                 neighbors.as_ptr(),
                 distances.as_ptr(),
@@ -86,9 +137,46 @@ impl Index {
     }
 }
 
-impl Drop for Index {
+impl Index<'static> {
+    /// Builds a new Brute Force KNN Index from an owned dataset.
+    ///
+    /// The index takes ownership of `dataset`, making it self-contained
+    /// with a `'static` lifetime. This is useful when the dataset is a
+    /// device copy (from [`ManagedTensor::to_device`]) that should live
+    /// as long as the index.
+    ///
+    /// # Arguments
+    ///
+    /// * `res` - Resources to use
+    /// * `metric` - DistanceType to use for building the index
+    /// * `metric_arg` - Optional value of `p` for Minkowski distances
+    /// * `dataset` - A row-major matrix to index (ownership transferred to the index)
+    pub fn build_owned(
+        res: &Resources,
+        metric: DistanceType,
+        metric_arg: Option<f32>,
+        dataset: ManagedTensor,
+    ) -> Result<Index<'static>> {
+        let inner = Self::create_handle()?;
+        unsafe {
+            check_cuvs(ffi::cuvsBruteForceBuild(
+                res.0,
+                dataset.as_ptr(),
+                metric,
+                metric_arg.unwrap_or(2.0),
+                inner,
+            ))?;
+        }
+        Ok(Index {
+            inner,
+            _data: DatasetOwnership::Owned(dataset),
+        })
+    }
+}
+
+impl Drop for Index<'_> {
     fn drop(&mut self) {
-        if let Err(e) = check_cuvs(unsafe { ffi::cuvsBruteForceIndexDestroy(self.0) }) {
+        if let Err(e) = check_cuvs(unsafe { ffi::cuvsBruteForceIndexDestroy(self.inner) }) {
             write!(stderr(), "failed to call bruteForceIndexDestroy {:?}", e)
                 .expect("failed to write to stderr");
         }
@@ -116,9 +204,9 @@ mod tests {
 
         println!("dataset {:#?}", dataset_host);
 
-        // build the brute force index
-        let index =
-            Index::build(&res, metric, None, dataset).expect("failed to create brute force index");
+        // build the brute force index (owned — device copy lives in the index)
+        let index = Index::build_owned(&res, metric, None, dataset)
+            .expect("failed to create brute force index");
 
         res.sync_stream().unwrap();
 
@@ -173,10 +261,104 @@ mod tests {
         test_bfknn(DistanceType::L2Expanded);
     }
 
-    // NOTE: brute_force multiple-search test is omitted here because the C++
-    // brute_force::index stores a non-owning view into the dataset. Building
-    // from device data via `build()` drops the ManagedTensor after the call,
-    // leaving a dangling pointer. A follow-up PR will add dataset lifetime
-    // enforcement (DatasetOwnership<'a>) to make this safe.
-    // See: https://github.com/rapidsai/cuvs/issues/1838
+    /// Test that an index built with build_owned can be searched multiple times.
+    #[test]
+    fn test_brute_force_multiple_searches() {
+        let res = Resources::new().unwrap();
+
+        // Create a random dataset
+        let n_datapoints = 64;
+        let n_features = 8;
+        let dataset =
+            ndarray::Array::<f32, _>::random((n_datapoints, n_features), Uniform::new(0., 1.0));
+
+        // Build the brute force index with owned device memory
+        let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap();
+        let index = Index::build_owned(&res, DistanceType::L2Expanded, None, dataset_device)
+            .expect("failed to create brute force index");
+
+        res.sync_stream().unwrap();
+
+        let k = 4;
+
+        // Perform multiple searches on the same index
+        for search_iter in 0..3 {
+            let n_queries = 4;
+            let queries = dataset.slice(s![0..n_queries, ..]);
+            let queries = ManagedTensor::from(&queries).to_device(&res).unwrap();
+
+            let mut neighbors_host = ndarray::Array::<i64, _>::zeros((n_queries, k));
+            let neighbors = ManagedTensor::from(&neighbors_host)
+                .to_device(&res)
+                .unwrap();
+
+            let mut distances_host = ndarray::Array::<f32, _>::zeros((n_queries, k));
+            let distances = ManagedTensor::from(&distances_host)
+                .to_device(&res)
+                .unwrap();
+
+            index
+                .search(&res, &queries, &neighbors, &distances)
+                .unwrap_or_else(|e| panic!("search iteration {} failed: {}", search_iter, e));
+
+            // Copy back to host memory
+            distances.to_host(&res, &mut distances_host).unwrap();
+            neighbors.to_host(&res, &mut neighbors_host).unwrap();
+            res.sync_stream().unwrap();
+
+            // Verify results are consistent
+            assert_eq!(
+                neighbors_host[[0, 0]],
+                0,
+                "iteration {}: first query should find itself",
+                search_iter
+            );
+        }
+    }
+
+    /// Test that an index built with build (borrowed) ties the dataset lifetime.
+    #[test]
+    fn test_brute_force_borrowed_build() {
+        let res = Resources::new().unwrap();
+
+        let n_datapoints = 64;
+        let n_features = 8;
+        let dataset_host =
+            ndarray::Array::<f32, _>::random((n_datapoints, n_features), Uniform::new(0., 1.0));
+
+        // Create a device tensor and borrow it for the index
+        let dataset_device = ManagedTensor::from(&dataset_host).to_device(&res).unwrap();
+        let index = Index::build(&res, DistanceType::L2Expanded, None, &dataset_device)
+            .expect("failed to create brute force index");
+
+        res.sync_stream().unwrap();
+
+        // Search while the borrowed dataset is still alive
+        let n_queries = 4;
+        let k = 4;
+        let queries = dataset_host.slice(s![0..n_queries, ..]);
+        let queries = ManagedTensor::from(&queries).to_device(&res).unwrap();
+
+        let mut neighbors_host = ndarray::Array::<i64, _>::zeros((n_queries, k));
+        let neighbors = ManagedTensor::from(&neighbors_host)
+            .to_device(&res)
+            .unwrap();
+
+        let mut distances_host = ndarray::Array::<f32, _>::zeros((n_queries, k));
+        let distances = ManagedTensor::from(&distances_host)
+            .to_device(&res)
+            .unwrap();
+
+        index
+            .search(&res, &queries, &neighbors, &distances)
+            .unwrap();
+
+        distances.to_host(&res, &mut distances_host).unwrap();
+        neighbors.to_host(&res, &mut neighbors_host).unwrap();
+        res.sync_stream().unwrap();
+
+        assert_eq!(neighbors_host[[0, 0]], 0);
+        assert_eq!(neighbors_host[[1, 0]], 1);
+        // dataset_device is still alive here — compiler ensures it
+    }
 }
diff --git a/rust/cuvs/src/cagra/index.rs b/rust/cuvs/src/cagra/index.rs
index 789f72b603..341db36cfd 100644
--- a/rust/cuvs/src/cagra/index.rs
+++ b/rust/cuvs/src/cagra/index.rs
@@ -3,50 +3,102 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
+use std::ffi::CString;
 use std::io::{stderr, Write};
+use std::marker::PhantomData;
 
 use crate::cagra::{IndexParams, SearchParams};
-use crate::dlpack::ManagedTensor;
+use crate::dlpack::{DatasetOwnership, ManagedTensor};
 use crate::error::{check_cuvs, Result};
+use crate::filters::{Filter, NoFilter};
 use crate::resources::Resources;
 
 /// CAGRA ANN Index
+///
+/// The lifetime parameter `'a` ensures the dataset outlives the index when built
+/// with [`Index::build`]. Use [`Index::build_owned`] for a self-contained index
+/// that owns its dataset (e.g., after [`ManagedTensor::to_device`]).
+///
+/// # Examples
+///
+/// ## Borrowed dataset (compiler enforces lifetime)
+///
+/// ```no_run
+/// # use cuvs::{ManagedTensor, Resources};
+/// # use cuvs::cagra::{Index, IndexParams};
+/// let res = Resources::new().unwrap();
+/// let arr = ndarray::Array::<f32, _>::zeros((256, 16));
+/// let params = IndexParams::new().unwrap();
+/// let tensor = ManagedTensor::from(&arr);
+/// let index = Index::build(&res, &params, &tensor).unwrap();
+/// // arr and tensor must remain alive while index is in use
+/// ```
+///
+/// ## Owned dataset ('static lifetime)
+///
+/// ```no_run
+/// # use cuvs::{ManagedTensor, Resources};
+/// # use cuvs::cagra::{Index, IndexParams};
+/// let res = Resources::new().unwrap();
+/// let arr = ndarray::Array::<f32, _>::zeros((256, 16));
+/// let params = IndexParams::new().unwrap();
+/// let device_tensor = ManagedTensor::from(&arr).to_device(&res).unwrap();
+/// let index = Index::build_owned(&res, &params, device_tensor).unwrap();
+/// drop(arr); // Fine — index owns the device copy
+/// ```
 #[derive(Debug)]
-pub struct Index(ffi::cuvsCagraIndex_t);
+pub struct Index<'a> {
+    inner: ffi::cuvsCagraIndex_t,
+    _data: DatasetOwnership<'a>,
+}
+
+impl<'a> Index<'a> {
+    /// Creates a new FFI index handle.
+    fn create_handle() -> Result<ffi::cuvsCagraIndex_t> {
+        unsafe {
+            let mut index = std::mem::MaybeUninit::<ffi::cuvsCagraIndex_t>::uninit();
+            check_cuvs(ffi::cuvsCagraIndexCreate(index.as_mut_ptr()))?;
+            Ok(index.assume_init())
+        }
+    }
 
-impl Index {
-    /// Builds a new Index from the dataset for efficient search.
+    /// Builds a new CAGRA Index from a borrowed dataset.
+    ///
+    /// The compiler enforces that `dataset` outlives the returned index,
+    /// preventing use-after-free when the C library dereferences its
+    /// internal view of the data.
     ///
     /// # Arguments
     ///
     /// * `res` - Resources to use
     /// * `params` - Parameters for building the index
     /// * `dataset` - A row-major matrix on either the host or device to index
-    pub fn build<T: Into<ManagedTensor>>(
+    pub fn build(
         res: &Resources,
         params: &IndexParams,
-        dataset: T,
-    ) -> Result<Index> {
-        let dataset: ManagedTensor = dataset.into();
-        let index = Index::new()?;
+        dataset: &'a ManagedTensor,
+    ) -> Result<Index<'a>> {
+        let inner = Self::create_handle()?;
         unsafe {
             check_cuvs(ffi::cuvsCagraBuild(
                 res.0,
                 params.0,
                 dataset.as_ptr(),
-                index.0,
+                inner,
             ))?;
         }
-        Ok(index)
+        Ok(Index {
+            inner,
+            _data: DatasetOwnership::Borrowed(PhantomData),
+        })
     }
 
     /// Creates a new empty index
-    pub fn new() -> Result<Index> {
-        unsafe {
-            let mut index = std::mem::MaybeUninit::<ffi::cuvsCagraIndex_t>::uninit();
-            check_cuvs(ffi::cuvsCagraIndexCreate(index.as_mut_ptr()))?;
-            Ok(Index(index.assume_init()))
-        }
+    pub fn new() -> Result<Index<'a>> {
+        Ok(Index {
+            inner: Self::create_handle()?,
+            _data: DatasetOwnership::Borrowed(PhantomData),
+        })
     }
 
     /// Perform a Approximate Nearest Neighbors search on the Index
@@ -58,6 +110,7 @@ impl Index {
     /// * `queries` - A matrix in device memory to query for
     /// * `neighbors` - Matrix in device memory that receives the indices of the nearest neighbors
     /// * `distances` - Matrix in device memory that receives the distances of the nearest neighbors
+    /// * `filter` - Optional filter to apply to the search (defaults to NoFilter if not provided)
     pub fn search(
         &self,
         res: &Resources,
@@ -65,29 +118,130 @@ impl Index {
         queries: &ManagedTensor,
         neighbors: &ManagedTensor,
         distances: &ManagedTensor,
+        filter: Option<&dyn Filter>,
     ) -> Result<()> {
         unsafe {
-            let prefilter = ffi::cuvsFilter {
-                addr: 0,
-                type_: ffi::cuvsFilterType::NO_FILTER,
-            };
+            let filter_ffi = filter
+                .map(|f| f.into_ffi())
+                .unwrap_or_else(|| NoFilter.into_ffi());
 
             check_cuvs(ffi::cuvsCagraSearch(
                 res.0,
                 params.0,
-                self.0,
+                self.inner,
                 queries.as_ptr(),
                 neighbors.as_ptr(),
                 distances.as_ptr(),
-                prefilter,
+                filter_ffi,
             ))
         }
     }
+
+    /// Save the CAGRA index to file.
+    ///
+    /// Experimental, both the API and the serialization format are subject to change.
+    ///
+    /// # Arguments
+    ///
+    /// * `res` - Resources to use
+    /// * `filename` - The file name for saving the index
+    /// * `include_dataset` - Whether to write out the dataset to the file
+    pub fn serialize(&self, res: &Resources, filename: &str, include_dataset: bool) -> Result<()> {
+        let c_filename = CString::new(filename).expect("filename contains null byte");
+        unsafe {
+            check_cuvs(ffi::cuvsCagraSerialize(
+                res.0,
+                c_filename.as_ptr(),
+                self.inner,
+                include_dataset,
+            ))
+        }
+    }
+
+    /// Save the CAGRA index to file in hnswlib format.
+    ///
+    /// NOTE: The saved index can only be read by the hnswlib wrapper in cuVS,
+    /// as the serialization format is not compatible with the original hnswlib.
+    ///
+    /// Experimental, both the API and the serialization format are subject to change.
+    ///
+    /// # Arguments
+    ///
+    /// * `res` - Resources to use
+    /// * `filename` - The file name for saving the index
+    pub fn serialize_to_hnswlib(&self, res: &Resources, filename: &str) -> Result<()> {
+        let c_filename = CString::new(filename).expect("filename contains null byte");
+        unsafe {
+            check_cuvs(ffi::cuvsCagraSerializeToHnswlib(
+                res.0,
+                c_filename.as_ptr(),
+                self.inner,
+            ))
+        }
+    }
+
+    /// Load a CAGRA index from file.
+    ///
+    /// Experimental, both the API and the serialization format are subject to change.
+    ///
+    /// # Arguments
+    ///
+    /// * `res` - Resources to use
+    /// * `filename` - The name of the file that stores the index
+    pub fn deserialize(res: &Resources, filename: &str) -> Result<Index<'a>> {
+        let c_filename = CString::new(filename).expect("filename contains null byte");
+        let inner = Self::create_handle()?;
+        unsafe {
+            check_cuvs(ffi::cuvsCagraDeserialize(
+                res.0,
+                c_filename.as_ptr(),
+                inner,
+            ))?;
+        }
+        Ok(Index {
+            inner,
+            _data: DatasetOwnership::Borrowed(PhantomData),
+        })
+    }
+}
+
+impl Index<'static> {
+    /// Builds a new CAGRA Index from an owned dataset.
+    ///
+    /// The index takes ownership of `dataset`, making it self-contained
+    /// with a `'static` lifetime. This is useful when the dataset is a
+    /// device copy (from [`ManagedTensor::to_device`]) that should live
+    /// as long as the index.
+    ///
+    /// # Arguments
+    ///
+    /// * `res` - Resources to use
+    /// * `params` - Parameters for building the index
+    /// * `dataset` - A row-major matrix to index (ownership transferred to the index)
+    pub fn build_owned(
+        res: &Resources,
+        params: &IndexParams,
+        dataset: ManagedTensor,
+    ) -> Result<Index<'static>> {
+        let inner = Self::create_handle()?;
+        unsafe {
+            check_cuvs(ffi::cuvsCagraBuild(
+                res.0,
+                params.0,
+                dataset.as_ptr(),
+                inner,
+            ))?;
+        }
+        Ok(Index {
+            inner,
+            _data: DatasetOwnership::Owned(dataset),
+        })
+    }
 }
 
-impl Drop for Index {
+impl Drop for Index<'_> {
     fn drop(&mut self) {
-        if let Err(e) = check_cuvs(unsafe { ffi::cuvsCagraIndexDestroy(self.0) }) {
+        if let Err(e) = check_cuvs(unsafe { ffi::cuvsCagraIndexDestroy(self.inner) }) {
             write!(stderr(), "failed to call cagraIndexDestroy {:?}", e)
                 .expect("failed to write to stderr");
         }
@@ -110,9 +264,10 @@ mod tests {
         let dataset =
             ndarray::Array::<f32, _>::random((n_datapoints, n_features), Uniform::new(0., 1.0));
 
-        // build the cagra index
+        // build the cagra index (borrowed — dataset tensor must outlive index)
+        let tensor = ManagedTensor::from(&dataset);
         let index =
-            Index::build(&res, &build_params, &dataset).expect("failed to create cagra index");
+            Index::build(&res, &build_params, &tensor).expect("failed to create cagra index");
 
         // use the first 4 points from the dataset as queries : will test that we get them back
         // as their own nearest neighbor
@@ -138,7 +293,7 @@ mod tests {
         let search_params = SearchParams::new().unwrap();
 
         index
-            .search(&res, &search_params, &queries, &neighbors, &distances)
+            .search(&res, &search_params, &queries, &neighbors, &distances, None)
             .unwrap();
 
         // Copy back to host memory
@@ -181,9 +336,10 @@ mod tests {
         let dataset =
             ndarray::Array::<f32, _>::random((n_datapoints, n_features), Uniform::new(0., 1.0));
 
-        // Build the index once
+        // Build the index once (borrowed)
+        let tensor = ManagedTensor::from(&dataset);
         let index =
-            Index::build(&res, &build_params, &dataset).expect("failed to create cagra index");
+            Index::build(&res, &build_params, &tensor).expect("failed to create cagra index");
 
         let search_params = SearchParams::new().unwrap();
         let k = 5;
@@ -206,8 +362,8 @@ mod tests {
 
             // This should work on every iteration because search() takes &self
             index
-                .search(&res, &search_params, &queries, &neighbors, &distances)
-                .expect(&format!("search iteration {} failed", search_iter));
+                .search(&res, &search_params, &queries, &neighbors, &distances, None)
+                .unwrap_or_else(|e| panic!("search iteration {} failed: {}", search_iter, e));
 
             // Copy back to host memory
             distances.to_host(&res, &mut distances_host).unwrap();
@@ -222,4 +378,128 @@ mod tests {
             );
         }
     }
+
+    #[test]
+    fn test_cagra_serialize_deserialize() {
+        let res = Resources::new().unwrap();
+
+        let n_datapoints = 256;
+        let n_features = 16;
+        let dataset =
+            ndarray::Array::<f32, _>::random((n_datapoints, n_features), Uniform::new(0., 1.0));
+
+        let build_params = IndexParams::new().unwrap();
+        let tensor = ManagedTensor::from(&dataset);
+        let index =
+            Index::build(&res, &build_params, &tensor).expect("failed to create cagra index");
+
+        let dir = std::env::temp_dir();
+        let filepath = dir.join("test_cagra_index.bin");
+        let filepath_str = filepath.to_str().unwrap();
+        index
+            .serialize(&res, filepath_str, true)
+            .expect("failed to serialize cagra index");
+
+        assert!(filepath.exists());
+        assert!(std::fs::metadata(&filepath).unwrap().len() > 0);
+
+        let loaded_index: Index<'_> =
+            Index::deserialize(&res, filepath_str).expect("failed to deserialize cagra index");
+
+        let n_queries = 4;
+        let k = 10;
+        let queries = dataset.slice(s![0..n_queries, ..]);
+        let queries = ManagedTensor::from(&queries).to_device(&res).unwrap();
+        let mut neighbors_host = ndarray::Array::<u32, _>::zeros((n_queries, k));
+        let neighbors = ManagedTensor::from(&neighbors_host)
+            .to_device(&res)
+            .unwrap();
+        let mut distances_host = ndarray::Array::<f32, _>::zeros((n_queries, k));
+        let distances = ManagedTensor::from(&distances_host)
+            .to_device(&res)
+            .unwrap();
+
+        let search_params = SearchParams::new().unwrap();
+        loaded_index
+            .search(&res, &search_params, &queries, &neighbors, &distances, None)
+            .expect("failed to search deserialized index");
+
+        distances.to_host(&res, &mut distances_host).unwrap();
+        neighbors.to_host(&res, &mut neighbors_host).unwrap();
+
+        assert_eq!(neighbors_host[[0, 0]], 0);
+        assert_eq!(neighbors_host[[1, 0]], 1);
+
+        let _ = std::fs::remove_file(&filepath);
+    }
+
+    #[test]
+    fn test_cagra_serialize_to_hnswlib() {
+        let res = Resources::new().unwrap();
+
+        let n_datapoints = 256;
+        let n_features = 16;
+        let dataset =
+            ndarray::Array::<f32, _>::random((n_datapoints, n_features), Uniform::new(0., 1.0));
+
+        let build_params = IndexParams::new().unwrap();
+        let tensor = ManagedTensor::from(&dataset);
+        let index =
+            Index::build(&res, &build_params, &tensor).expect("failed to create cagra index");
+
+        let dir = std::env::temp_dir();
+        let filepath = dir.join("test_cagra_index_hnsw.bin");
+        let filepath_str = filepath.to_str().unwrap();
+        index
+            .serialize_to_hnswlib(&res, filepath_str)
+            .expect("failed to serialize to hnswlib format");
+
+        assert!(filepath.exists());
+        assert!(std::fs::metadata(&filepath).unwrap().len() > 0);
+
+        let _ = std::fs::remove_file(&filepath);
+    }
+
+    /// Test that an index built with build_owned can be searched after the
+    /// original host data is dropped.
+    #[test]
+    fn test_cagra_build_owned() {
+        let res = Resources::new().unwrap();
+        let build_params = IndexParams::new().unwrap();
+
+        let n_datapoints = 256;
+        let n_features = 16;
+        let dataset =
+            ndarray::Array::<f32, _>::random((n_datapoints, n_features), Uniform::new(0., 1.0));
+
+        // Build with owned device memory
+        let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap();
+        let index = Index::build_owned(&res, &build_params, dataset_device)
+            .expect("failed to create cagra index");
+
+        let search_params = SearchParams::new().unwrap();
+        let k = 5;
+        let n_queries = 4;
+        let queries = dataset.slice(s![0..n_queries, ..]);
+        let queries = ManagedTensor::from(&queries).to_device(&res).unwrap();
+
+        let mut neighbors_host = ndarray::Array::<u32, _>::zeros((n_queries, k));
+        let neighbors = ManagedTensor::from(&neighbors_host)
+            .to_device(&res)
+            .unwrap();
+
+        let mut distances_host = ndarray::Array::<f32, _>::zeros((n_queries, k));
+        let distances = ManagedTensor::from(&distances_host)
+            .to_device(&res)
+            .unwrap();
+
+        index
+            .search(&res, &search_params, &queries, &neighbors, &distances, None)
+            .unwrap();
+
+        distances.to_host(&res, &mut distances_host).unwrap();
+        neighbors.to_host(&res, &mut neighbors_host).unwrap();
+
+        assert_eq!(neighbors_host[[0, 0]], 0);
+    }
 }
diff --git a/rust/cuvs/src/cagra/index_params.rs b/rust/cuvs/src/cagra/index_params.rs
index ea34959147..6655b24741 100644
--- a/rust/cuvs/src/cagra/index_params.rs
+++ b/rust/cuvs/src/cagra/index_params.rs
@@ -137,6 +137,31 @@ impl IndexParams {
     }
 }
 
+impl IndexParams {
+    /// Returns a builder for constructing [`IndexParams`] with validated parameters.
+    ///
+    /// Unlike the `IndexParams::new()?.set_*()` setter chain, [`IndexParamsBuilder::build`]
+    /// validates all parameters in Rust before any FFI allocation. Invalid values produce a
+    /// clear error message naming the offending field and its valid range, before any GPU
+    /// work begins.
+    ///
+    /// # Example
+    ///
+    /// ```no_run
+    /// use cuvs::cagra::IndexParams;
+    ///
+    /// let params = IndexParams::builder()
+    ///     .graph_degree(32)
+    ///     .intermediate_graph_degree(64)
+    ///     .nn_descent_niter(20)
+    ///     .build()
+    ///     .unwrap();
+    /// ```
+    pub fn builder() -> IndexParamsBuilder {
+        IndexParamsBuilder::default()
+    }
+}
+
 impl fmt::Debug for IndexParams {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         // custom debug trait here, default value will show the pointer address
@@ -177,6 +202,115 @@ impl Drop for CompressionParams {
     }
 }
 
+/// Builder for [`IndexParams`] with pre-validated parameters.
+///
+/// Construct via [`IndexParams::builder()`]. Call [`IndexParamsBuilder::build`] to
+/// validate all parameters and allocate the FFI struct in one step.
+///
+/// Defaults match the cuVS C API defaults: `graph_degree=64`,
+/// `intermediate_graph_degree=128`, `nn_descent_niter=20`.
+pub struct IndexParamsBuilder {
+    graph_degree: usize,
+    intermediate_graph_degree: usize,
+    nn_descent_niter: usize,
+    build_algo: Option<BuildAlgo>,
+    compression: Option<CompressionParams>,
+}
+
+impl Default for IndexParamsBuilder {
+    fn default() -> Self {
+        Self {
+            graph_degree: 64,
+            intermediate_graph_degree: 128,
+            nn_descent_niter: 20,
+            build_algo: None,
+            compression: None,
+        }
+    }
+}
+
+impl IndexParamsBuilder {
+    /// Degree of output graph.
+    ///
+    /// Must be > 0. Values that are multiples of 32 are preferred for warp alignment.
+    pub fn graph_degree(mut self, v: usize) -> Self {
+        self.graph_degree = v;
+        self
+    }
+
+    /// Degree of input graph for pruning.
+    ///
+    /// Must be >= `graph_degree`.
+    pub fn intermediate_graph_degree(mut self, v: usize) -> Self {
+        self.intermediate_graph_degree = v;
+        self
+    }
+
+    /// Number of iterations to run if building with NN_DESCENT.
+    ///
+    /// Must be > 0.
+    pub fn nn_descent_niter(mut self, v: usize) -> Self {
+        self.nn_descent_niter = v;
+        self
+    }
+
+    /// ANN algorithm to build knn graph.
+    pub fn build_algo(mut self, v: BuildAlgo) -> Self {
+        self.build_algo = Some(v);
+        self
+    }
+
+    /// Vector compression parameters.
+    pub fn compression(mut self, v: CompressionParams) -> Self {
+        self.compression = Some(v);
+        self
+    }
+
+    /// Validate all parameters without allocating any GPU resources.
+    ///
+    /// Returns `Ok(())` if all parameters are valid, or `Err` with a message naming
+    /// the offending field and its valid range.
+    pub fn validate(&self) -> crate::error::Result<()> {
+        if self.graph_degree == 0 {
+            return Err(format!("graph_degree must be > 0; got {}", self.graph_degree).into());
+        }
+        if self.intermediate_graph_degree < self.graph_degree {
+            return Err(format!(
+                "intermediate_graph_degree ({}) must be >= graph_degree ({})",
+                self.intermediate_graph_degree, self.graph_degree
+            )
+            .into());
+        }
+        if self.nn_descent_niter == 0 {
+            return Err(format!(
+                "nn_descent_niter must be > 0; got {}",
+                self.nn_descent_niter
+            )
+            .into());
+        }
+        Ok(())
+    }
+
+    /// Validate all parameters and allocate the FFI struct.
+    ///
+    /// Returns `Err` with a message naming the offending field and its valid range
+    /// before any GPU work begins.
+    pub fn build(self) -> crate::error::Result<IndexParams> {
+        self.validate()?;
+        let mut params = IndexParams::new()?
+            .set_graph_degree(self.graph_degree)
+            .set_intermediate_graph_degree(self.intermediate_graph_degree)
+            .set_nn_descent_niter(self.nn_descent_niter);
+        if let Some(algo) = self.build_algo {
+            params = params.set_build_algo(algo);
+        }
+        if let Some(compression) = self.compression {
+            params = params.set_compression(compression);
+        }
+        Ok(params)
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -206,4 +340,95 @@ mod tests {
             assert_eq!((*(*params.0).compression).pq_bits, 4);
         }
     }
+
+    // --- IndexParamsBuilder tests ---
+
+    #[test]
+    fn builder_rejects_zero_graph_degree() {
+        let err = IndexParams::builder()
+            .graph_degree(0)
+            .validate()
+            .unwrap_err();
+        assert!(
+            err.to_string().contains("graph_degree"),
+            "error message should name the field: {err}"
+        );
+    }
+
+    #[test]
+    fn builder_rejects_invalid_intermediate_degree() {
+        let err = IndexParams::builder()
+            .graph_degree(32)
+            .intermediate_graph_degree(16)
+            .validate()
+            .unwrap_err();
+        assert!(
+            err.to_string().contains("intermediate_graph_degree"),
+            "error message should name the field: {err}"
+        );
+    }
+
+    #[test]
+    fn builder_rejects_zero_niter() {
+        let err = IndexParams::builder()
+            .nn_descent_niter(0)
+            .validate()
+            .unwrap_err();
+        assert!(
+            err.to_string().contains("nn_descent_niter"),
+            "error message should name the field: {err}"
+        );
+    }
+
+    #[test]
+    fn builder_accepts_valid_params() {
+        assert!(IndexParams::builder()
+            .graph_degree(32)
+            .intermediate_graph_degree(64)
+            .nn_descent_niter(20)
+            .validate()
+            .is_ok());
+    }
+
+    #[test]
+    fn builder_round_trips_to_ffi() {
+        // Built params must produce the same FFI struct values as the manual setter chain.
+        let via_builder = IndexParams::builder()
+            .graph_degree(32)
+            .intermediate_graph_degree(64)
+            .nn_descent_niter(20)
+            .build()
+            .unwrap();
+        let via_setters = IndexParams::new()
+            .unwrap()
+            .set_graph_degree(32)
+            .set_intermediate_graph_degree(64)
+            .set_nn_descent_niter(20);
+        unsafe {
+            assert_eq!((*via_builder.0).graph_degree, (*via_setters.0).graph_degree);
+            assert_eq!(
+                (*via_builder.0).intermediate_graph_degree,
+                (*via_setters.0).intermediate_graph_degree
+            );
+            assert_eq!(
+                (*via_builder.0).nn_descent_niter,
+                (*via_setters.0).nn_descent_niter
+            );
+        }
+    }
+
+    #[test]
+    fn existing_setter_api_unchanged() {
+        // Ensure the original API still compiles and sets values correctly.
+        let params = IndexParams::new()
+            .unwrap()
+            .set_graph_degree(32)
+            .set_intermediate_graph_degree(64)
+            .set_nn_descent_niter(20);
+        unsafe {
+            assert_eq!((*params.0).graph_degree, 32);
+            assert_eq!((*params.0).intermediate_graph_degree, 64);
+            assert_eq!((*params.0).nn_descent_niter, 20);
+        }
+    }
 }
diff --git a/rust/cuvs/src/cagra/mod.rs b/rust/cuvs/src/cagra/mod.rs
index d39de752cf..b7954c2f9f 100644
--- a/rust/cuvs/src/cagra/mod.rs
+++ b/rust/cuvs/src/cagra/mod.rs
@@ -27,7 +27,8 @@
 //!
 //!     // build the cagra index
 //!     let build_params = IndexParams::new()?;
-//!     let index = Index::build(&res, &build_params, &dataset)?;
+//!     let tensor = ManagedTensor::from(&dataset);
+//!     let index = Index::build(&res, &build_params, &tensor)?;
 //!     println!(
 //!         "Indexed {}x{} datapoints into cagra index",
 //!         n_datapoints, n_features
@@ -71,5 +72,5 @@ mod index_params;
 mod search_params;
 
 pub use index::Index;
-pub use index_params::{BuildAlgo, CompressionParams, IndexParams};
-pub use search_params::{HashMode, SearchAlgo, SearchParams};
+pub use index_params::{BuildAlgo, CompressionParams, IndexParams, IndexParamsBuilder};
+pub use search_params::{HashMode, SearchAlgo, SearchParams, SearchParamsBuilder};
diff --git a/rust/cuvs/src/cagra/search_params.rs b/rust/cuvs/src/cagra/search_params.rs
index 59537d7718..7914b1dbc2 100644
--- a/rust/cuvs/src/cagra/search_params.rs
+++ b/rust/cuvs/src/cagra/search_params.rs
@@ -122,6 +122,15 @@ impl SearchParams {
     }
 }
 
+impl SearchParams {
+    /// Returns a builder for constructing [`SearchParams`] with validated parameters.
+    ///
+    /// See [`SearchParamsBuilder`] for details.
+    pub fn builder() -> SearchParamsBuilder {
+        SearchParamsBuilder::default()
+    }
+}
+
 impl fmt::Debug for SearchParams {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         // custom debug trait here, default value will show the pointer address
@@ -143,6 +152,180 @@ impl Drop for SearchParams {
     }
 }
 
+/// Builder for [`SearchParams`] with pre-validated parameters.
+///
+/// Construct via [`SearchParams::builder()`]. Call [`SearchParamsBuilder::build`] to
+/// validate all parameters and allocate the FFI struct in one step.
+pub struct SearchParamsBuilder {
+    itopk_size: usize,
+    max_queries: usize,
+    max_iterations: usize,
+    min_iterations: usize,
+    team_size: usize,
+    thread_block_size: usize,
+    hashmap_max_fill_rate: f32,
+    hashmap_min_bitlen: usize,
+    num_random_samplings: u32,
+    rand_xor_mask: u64,
+    algo: Option<SearchAlgo>,
+    hashmap_mode: Option<HashMode>,
+}
+
+impl Default for SearchParamsBuilder {
+    fn default() -> Self {
+        Self {
+            itopk_size: 64,
+            max_queries: 0,
+            max_iterations: 0,
+            min_iterations: 0,
+            team_size: 0,
+            thread_block_size: 0,
+            hashmap_max_fill_rate: 0.5,
+            hashmap_min_bitlen: 0,
+            num_random_samplings: 1,
+            rand_xor_mask: 0x128394,
+            algo: None,
+            hashmap_mode: None,
+        }
+    }
+}
+
+impl SearchParamsBuilder {
+    /// Number of intermediate search results retained during the search.
+    ///
+    /// Must be a power of 2 (or 0 to use the cuVS default).
+    pub fn itopk_size(mut self, v: usize) -> Self {
+        self.itopk_size = v;
+        self
+    }
+
+    /// Maximum number of queries to search at the same time. 0 = auto.
+    pub fn max_queries(mut self, v: usize) -> Self {
+        self.max_queries = v;
+        self
+    }
+
+    /// Upper limit of search iterations. 0 = auto.
+    pub fn max_iterations(mut self, v: usize) -> Self {
+        self.max_iterations = v;
+        self
+    }
+
+    /// Lower limit of search iterations.
+    pub fn min_iterations(mut self, v: usize) -> Self {
+        self.min_iterations = v;
+        self
+    }
+
+    /// Number of threads used to calculate a single distance.
+    ///
+    /// Must be 0 (auto), 4, 8, 16, or 32.
+    pub fn team_size(mut self, v: usize) -> Self {
+        self.team_size = v;
+        self
+    }
+
+    /// Thread block size. 0 (auto), 64, 128, 256, 512, or 1024.
+    pub fn thread_block_size(mut self, v: usize) -> Self {
+        self.thread_block_size = v;
+        self
+    }
+
+    /// Upper limit of hashmap fill rate.
+    ///
+    /// Must be in the exclusive range (0.1, 0.9).
+    pub fn hashmap_max_fill_rate(mut self, v: f32) -> Self {
+        self.hashmap_max_fill_rate = v;
+        self
+    }
+
+    /// Lower limit of hashmap bit length.
+    pub fn hashmap_min_bitlen(mut self, v: usize) -> Self {
+        self.hashmap_min_bitlen = v;
+        self
+    }
+
+    /// Number of iterations of initial random seed node selection.
+    pub fn num_random_samplings(mut self, v: u32) -> Self {
+        self.num_random_samplings = v;
+        self
+    }
+
+    /// Bit mask used for initial random seed node selection.
+    pub fn rand_xor_mask(mut self, v: u64) -> Self {
+        self.rand_xor_mask = v;
+        self
+    }
+
+    /// Which search implementation to use.
+    pub fn algo(mut self, v: SearchAlgo) -> Self {
+        self.algo = Some(v);
+        self
+    }
+
+    /// Hashmap type.
+    pub fn hashmap_mode(mut self, v: HashMode) -> Self {
+        self.hashmap_mode = Some(v);
+        self
+    }
+
+    /// Validate all parameters without allocating any GPU resources.
+    ///
+    /// Returns `Ok(())` if all parameters are valid, or `Err` with a message naming
+    /// the offending field and its valid range.
+    pub fn validate(&self) -> crate::error::Result<()> {
+        if self.itopk_size != 0 && !self.itopk_size.is_power_of_two() {
+            return Err(format!(
+                "itopk_size must be a power of 2 or 0 (auto); got {}",
+                self.itopk_size
+            )
+            .into());
+        }
+        const VALID_TEAM_SIZES: &[usize] = &[0, 4, 8, 16, 32];
+        if !VALID_TEAM_SIZES.contains(&self.team_size) {
+            return Err(format!(
+                "team_size must be one of {{0, 4, 8, 16, 32}}; got {}",
+                self.team_size
+            )
+            .into());
+        }
+        if self.hashmap_max_fill_rate <= 0.1 || self.hashmap_max_fill_rate >= 0.9 {
+            return Err(format!(
+                "hashmap_max_fill_rate must be in (0.1, 0.9); got {}",
+                self.hashmap_max_fill_rate
+            )
+            .into());
+        }
+        Ok(())
+    }
+
+    /// Validate all parameters and allocate the FFI struct.
+    ///
+    /// Returns `Err` with a message naming the offending field and its valid range
+    /// before any GPU work begins.
+    pub fn build(self) -> crate::error::Result<SearchParams> {
+        self.validate()?;
+        let mut params = SearchParams::new()?
+            .set_itopk_size(self.itopk_size)
+            .set_max_queries(self.max_queries)
+            .set_max_iterations(self.max_iterations)
+            .set_min_iterations(self.min_iterations)
+            .set_team_size(self.team_size)
+            .set_thread_block_size(self.thread_block_size)
+            .set_hashmap_max_fill_rate(self.hashmap_max_fill_rate)
+            .set_hashmap_min_bitlen(self.hashmap_min_bitlen)
+            .set_num_random_samplings(self.num_random_samplings)
+            .set_rand_xor_mask(self.rand_xor_mask);
+        if let Some(algo) = self.algo {
+            params = params.set_algo(algo);
+        }
+        if let Some(mode) = self.hashmap_mode {
+            params = params.set_hashmap_mode(mode);
+        }
+        Ok(params)
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -155,4 +338,67 @@ mod tests {
             assert_eq!((*params.0).itopk_size, 128);
         }
     }
+
+    // --- SearchParamsBuilder tests ---
+
+    #[test]
+    fn builder_rejects_non_power_of_two_itopk() {
+        let err = SearchParams::builder()
+            .itopk_size(100)
+            .validate()
+            .unwrap_err();
+        assert!(
+            err.to_string().contains("itopk_size"),
+            "error message should name the field: {err}"
+        );
+    }
+
+    #[test]
+    fn builder_rejects_invalid_team_size() {
+        let err = SearchParams::builder().team_size(7).validate().unwrap_err();
+        assert!(
+            err.to_string().contains("team_size"),
+            "error message should name the field: {err}"
+        );
+    }
+
+    #[test]
+    fn builder_rejects_fill_rate_too_high() {
+        let err = SearchParams::builder()
+            .hashmap_max_fill_rate(0.95)
+            .validate()
+            .unwrap_err();
+        assert!(
+            err.to_string().contains("hashmap_max_fill_rate"),
+            "error message should name the field: {err}"
+        );
+    }
+
+    #[test]
+    fn builder_rejects_fill_rate_too_low() {
+        let err = SearchParams::builder()
+            .hashmap_max_fill_rate(0.05)
+            .validate()
+            .unwrap_err();
+        assert!(
+            err.to_string().contains("hashmap_max_fill_rate"),
+            "error message should name the field: {err}"
+        );
+    }
+
+    #[test]
+    fn builder_accepts_valid_params() {
+        assert!(SearchParams::builder()
+            .itopk_size(64)
+            .team_size(8)
+            .hashmap_max_fill_rate(0.5)
+            .validate()
+            .is_ok());
+    }
+
+    #[test]
+    fn builder_accepts_zero_itopk_as_auto() {
+        // itopk_size=0 means "auto select" in cuVS — should be valid
+        assert!(SearchParams::builder().itopk_size(0).validate().is_ok());
+    }
 }
diff --git a/rust/cuvs/src/dlpack.rs b/rust/cuvs/src/dlpack.rs
index d60b0acf6a..b0232c24f0 100644
--- a/rust/cuvs/src/dlpack.rs
+++ b/rust/cuvs/src/dlpack.rs
@@ -4,10 +4,25 @@
  */
 
 use std::convert::From;
+use std::marker::PhantomData;
 
 use crate::error::{check_cuda, check_cuvs, Result};
 use crate::resources::Resources;
 
+/// Tracks whether an Index borrows or owns its dataset.
+///
+/// When an index is built from a borrowed `ManagedTensor`, the compiler enforces
+/// that the dataset outlives the index. When built from an owned `ManagedTensor`
+/// (e.g., a device copy from [`ManagedTensor::to_device`]), the index is
+/// self-contained and has a `'static` lifetime.
+#[derive(Debug)]
+pub(crate) enum DatasetOwnership<'a> {
+    /// Dataset is borrowed — caller must keep it alive.
+    Borrowed(PhantomData<&'a ()>),
+    /// Dataset is owned by the index (e.g., a device copy from `to_device()`).
+    Owned(ManagedTensor),
+}
+
 /// ManagedTensor is a wrapper around a dlpack DLManagedTensor object.
 /// This lets you pass matrices in device or host memory into cuvs.
 #[derive(Debug)]
diff --git a/rust/cuvs/src/error.rs b/rust/cuvs/src/error.rs
index 74cb1037f6..ab906c399b 100644
--- a/rust/cuvs/src/error.rs
+++ b/rust/cuvs/src/error.rs
@@ -60,3 +60,12 @@ pub fn check_cuda(err: ffi::cudaError_t) -> Result<()> {
         _ => Err(Error::CudaError(err)),
     }
 }
+
+impl From<String> for Error {
+    fn from(text: String) -> Self {
+        Error::CuvsError(CuvsError {
+            code: ffi::cuvsError_t::CUVS_ERROR,
+            text,
+        })
+    }
+}
diff --git a/rust/cuvs/src/filters.rs b/rust/cuvs/src/filters.rs
new file mode 100644
index 0000000000..b315c1954d
--- /dev/null
+++ b/rust/cuvs/src/filters.rs
@@ -0,0 +1,605 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+//! Filters for approximate nearest neighbor search
+//!
+//! This module provides filtering functionality for ANN search operations,
+//! allowing you to exclude certain vectors from search results.
+//!
+//! # Filter Types
+//!
+//! - **No Filter**: Default behavior, includes all vectors
+//! - **Bitset**: Global filter applied to all queries
+//! - **Bitmap**: Per-query filter for batch operations
+//!
+//! # Examples
+//!
+//! ## Creating a Bitset Filter (Exclude Specific Vectors)
+//!
+//! ```no_run
+//! use cuvs::filters::{Bitset, bitset_from_excluded_indices};
+//! use cuvs::Resources;
+//!
+//! let res = Resources::new().unwrap();
+//! let n_samples = 1000;
+//!
+//! // Exclude specific vector indices from search
+//! let excluded = vec![0, 5, 10, 15, 20];
+//! let tensor = bitset_from_excluded_indices(n_samples, &excluded);
+//! let device_tensor = tensor.to_device(&res).unwrap();
+//! let filter = Bitset::new(&device_tensor);
+//!
+//! // Use with search:
+//! // index.search(&res, &params, &queries, &neighbors, &distances, Some(&filter));
+//! ```
+//!
+//! ## Creating a Bitset Filter (Include Only Specific Vectors)
+//!
+//! ```no_run
+//! use cuvs::filters::{Bitset, bitset_from_included_indices};
+//! use cuvs::Resources;
+//!
+//! let res = Resources::new().unwrap();
+//! let n_samples = 1000;
+//!
+//! // Only search these specific vectors
+//! let included = vec![100, 200, 300];
+//! let tensor = bitset_from_included_indices(n_samples, &included);
+//! let device_tensor = tensor.to_device(&res).unwrap();
+//! let filter = Bitset::new(&device_tensor);
+//! ```
+//!
+//! ## Creating a Bitmap Filter (Per-Query Exclusions)
+//!
+//! ```no_run
+//! use cuvs::filters::{Bitmap, bitmap_from_excluded_indices};
+//! use cuvs::Resources;
+//!
+//! let res = Resources::new().unwrap();
+//! let n_queries = 10;
+//! let n_samples = 1000;
+//!
+//! // Different exclusions for each query
+//! let excluded_per_query = vec![
+//!     vec![0, 1, 2],      // Query 0 excludes these
+//!     vec![10, 20, 30],   // Query 1 excludes these
+//!     vec![5],            // Query 2 excludes this
+//!     // ... one per query
+//! ];
+//! let tensor = bitmap_from_excluded_indices(n_queries, n_samples, &excluded_per_query);
+//! let device_tensor = tensor.to_device(&res).unwrap();
+//! let filter = Bitmap::new(&device_tensor);
+//! ```
+//!
+//! ## Manual Construction (Advanced)
+//!
+//! For fine-grained control, you can manually construct the bitset:
+//!
+//! ```no_run
+//! use cuvs::filters::Bitset;
+//! use cuvs::{Resources, ManagedTensor};
+//! use ndarray::Array1;
+//!
+//! let res = Resources::new().unwrap();
+//! let n_samples = 1000;
+//! let bitset_size = (n_samples + 31) / 32;
+//!
+//! // Create bitset manually with custom bit patterns
+//! let mut bitset_data = Array1::<u32>::from_elem(bitset_size, 0xFFFFFFFF);
+//! bitset_data[0] = 0xAAAAAAAA; // Custom pattern for first 32 vectors
+//!
+//! let bitset_tensor = ManagedTensor::from(&bitset_data).to_device(&res).unwrap();
+//! let filter = Bitset::new(&bitset_tensor);
+//! ```
+
+use crate::dlpack::ManagedTensor;
+
+pub type FilterType = ffi::cuvsFilterType;
+
+/// Base trait for all filter types
+pub trait Filter {
+    /// Convert this filter into a C FFI filter struct
+    fn into_ffi(&self) -> ffi::cuvsFilter;
+}
+
+/// No filter - includes all vectors in search results
+///
+/// This is the default behavior when no filter is specified.
+#[derive(Debug)]
+pub struct NoFilter;
+
+impl Filter for NoFilter {
+    fn into_ffi(&self) -> ffi::cuvsFilter {
+        ffi::cuvsFilter {
+            addr: 0,
+            type_: ffi::cuvsFilterType::NO_FILTER,
+        }
+    }
+}
+
+/// Bitset filter - applies the same filter to all queries
+///
+/// A bitset is a compact representation where each bit indicates whether
+/// a vector should be included (1) or excluded (0) from search results.
+/// This filter type applies the same filtering to all queries in a batch.
+///
+/// # Tensor Format
+///
+/// The tensor must be a 1D array of `uint32` elements:
+/// - **Shape**: `[(n_samples + 31) / 32]`
+/// - **Type**: `uint32`
+/// - **Device**: Must be in device (GPU) memory
+/// - Each bit represents one vector in the dataset
+/// - Bit value 1: vector is included in search
+/// - Bit value 0: vector is excluded from search
+///
+/// The bitset uses little-endian bit ordering within each uint32 element.
+///
+/// # Example
+///
+/// ```no_run
+/// use cuvs::filters::{Bitset, bitset_from_excluded_indices};
+/// use cuvs::Resources;
+///
+/// let res = Resources::new().unwrap();
+/// let n_samples = 1000;
+/// let excluded = vec![0, 5, 10];
+/// let tensor = bitset_from_excluded_indices(n_samples, &excluded);
+/// let device_tensor = tensor.to_device(&res).unwrap();
+/// let filter = Bitset::new(&device_tensor);
+/// ```
+#[derive(Debug)]
+pub struct Bitset<'a> {
+    tensor: &'a ManagedTensor,
+}
+
+impl<'a> Bitset<'a> {
+    /// Create a new bitset filter from a tensor
+    ///
+    /// Use [`bitset_from_excluded_indices`] or [`bitset_from_included_indices`]
+    /// to create the tensor from index lists.
+    ///
+    /// # Arguments
+    ///
+    /// * `tensor` - Device tensor containing bitset data as uint32 elements.
+    ///              Must have shape `[(n_samples + 31) / 32]` where `n_samples`
+    ///              is the number of vectors in the dataset being filtered.
+    pub fn new(tensor: &'a ManagedTensor) -> Self {
+        Bitset { tensor }
+    }
+}
+
+impl<'a> Filter for Bitset<'a> {
+    fn into_ffi(&self) -> ffi::cuvsFilter {
+        ffi::cuvsFilter {
+            addr: self.tensor.as_ptr() as uintptr_t,
+            type_: ffi::cuvsFilterType::BITSET,
+        }
+    }
+}
+
+/// Bitmap filter - applies different filters for each query
+///
+/// A bitmap allows per-query filtering in batch search operations.
+/// Each query can have its own set of allowed/disallowed vectors.
+///
+/// # Tensor Format
+///
+/// The tensor must be a 1D array of `uint32` elements:
+/// - **Shape**: `[n_queries * ((n_samples + 31) / 32)]`
+/// - **Type**: `uint32`
+/// - **Device**: Must be in device (GPU) memory
+/// - Layout: Row-major, where each row is one query's bitset
+/// - Each query has its own bitset of size `(n_samples + 31) / 32`
+/// - Bit value 1: vector is included for this query
+/// - Bit value 0: vector is excluded for this query
+///
+/// The bitmap uses little-endian bit ordering within each uint32 element.
+///
+/// # Example
+///
+/// ```no_run
+/// use cuvs::filters::{Bitmap, bitmap_from_excluded_indices};
+/// use cuvs::Resources;
+///
+/// let res = Resources::new().unwrap();
+/// let n_queries = 10;
+/// let n_samples = 1000;
+/// let excluded_per_query = vec![vec![0, 1, 2], vec![5, 10]];
+/// let tensor = bitmap_from_excluded_indices(n_queries, n_samples, &excluded_per_query);
+/// let device_tensor = tensor.to_device(&res).unwrap();
+/// let filter = Bitmap::new(&device_tensor);
+/// ```
+#[derive(Debug)]
+pub struct Bitmap<'a> {
+    tensor: &'a ManagedTensor,
+}
+
+impl<'a> Bitmap<'a> {
+    /// Create a new bitmap filter from a tensor
+    ///
+    /// Use [`bitmap_from_excluded_indices`] or [`bitmap_from_included_indices`]
+    /// to create the tensor from index lists.
+    ///
+    /// # Arguments
+    ///
+    /// * `tensor` - Device tensor containing bitmap data as uint32 elements.
+    ///              Must have shape `[n_queries * ((n_samples + 31) / 32)]` where
+    ///              `n_queries` is the number of queries in the batch and `n_samples`
+    ///              is the number of vectors in the dataset being filtered.
+    pub fn new(tensor: &'a ManagedTensor) -> Self {
+        Bitmap { tensor }
+    }
+}
+
+impl<'a> Filter for Bitmap<'a> {
+    fn into_ffi(&self) -> ffi::cuvsFilter {
+        ffi::cuvsFilter {
+            addr: self.tensor.as_ptr() as uintptr_t,
+            type_: ffi::cuvsFilterType::BITMAP,
+        }
+    }
+}
+
+// Re-export for convenience
+use ffi::cuvs_sys as ffi;
+type uintptr_t = usize;
+
+/// Create a bitmap tensor by excluding specific indices per query
+///
+/// Creates a bitmap tensor in host memory where each query can have its own set of excluded vectors.
+/// All vectors are included by default, and specified indices are excluded.
+/// Call `.to_device()` on the returned tensor before using it with a filter.
+///
+/// # Arguments
+///
+/// * `n_queries` - Number of queries in the batch
+/// * `n_samples` - Total number of vectors in the dataset
+/// * `excluded_indices_per_query` - Slice of vectors, one per query, containing indices to exclude
+///
+/// # Returns
+///
+/// A managed tensor in host memory. Use `.to_device(&res)` to move it to GPU before creating a filter.
+///
+/// # Example
+///
+/// ```no_run
+/// use cuvs::filters::{Bitmap, bitmap_from_excluded_indices};
+/// use cuvs::Resources;
+///
+/// let res = Resources::new().unwrap();
+/// let excluded_per_query = vec![
+///     vec![0, 1, 2],      // Query 0 excludes these
+///     vec![10, 20, 30],   // Query 1 excludes these
+/// ];
+/// let tensor = bitmap_from_excluded_indices(2, 1000, &excluded_per_query);
+/// let device_tensor = tensor.to_device(&res).unwrap();
+/// let filter = Bitmap::new(&device_tensor);
+/// ```
+pub fn bitmap_from_excluded_indices(
+    n_queries: usize,
+    n_samples: usize,
+    excluded_indices_per_query: &[Vec<usize>],
+) -> ManagedTensor {
+    use ndarray::Array1;
+
+    let bits_per_query = (n_samples + 31) / 32;
+    let bitmap_size = n_queries * bits_per_query;
+    let mut bitmap_data = Array1::<u32>::from_elem(bitmap_size, 0xFFFFFFFF);
+
+    // Process each query's exclusion list
+    for (query_idx, excluded_indices) in excluded_indices_per_query.iter().enumerate() {
+        if query_idx >= n_queries {
+            break;
+        }
+        let offset = query_idx * bits_per_query;
+        for &idx in excluded_indices {
+            if idx < n_samples {
+                let word_idx = offset + (idx / 32);
+                let bit_idx = idx % 32;
+                bitmap_data[word_idx] &= !(1u32 << bit_idx);
+            }
+        }
+    }
+
+    ManagedTensor::from(&bitmap_data)
+}
+
+/// Create a bitmap tensor by including only specific indices per query
+///
+/// Creates a bitmap tensor in host memory where each query specifies only the vectors to include.
+/// All vectors are excluded by default, and only specified indices are included.
+/// Call `.to_device()` on the returned tensor before using it with a filter.
+///
+/// # Arguments
+///
+/// * `n_queries` - Number of queries in the batch
+/// * `n_samples` - Total number of vectors in the dataset
+/// * `included_indices_per_query` - Slice of vectors, one per query, containing indices to include
+///
+/// # Returns
+///
+/// A managed tensor in host memory. Use `.to_device(&res)` to move it to GPU before creating a filter.
+///
+/// # Example
+///
+/// ```no_run
+/// use cuvs::filters::{Bitmap, bitmap_from_included_indices};
+/// use cuvs::Resources;
+///
+/// let res = Resources::new().unwrap();
+/// let included_per_query = vec![
+///     vec![0, 1, 2],      // Query 0 only searches these
+///     vec![10, 20, 30],   // Query 1 only searches these
+/// ];
+/// let tensor = bitmap_from_included_indices(2, 1000, &included_per_query);
+/// let device_tensor = tensor.to_device(&res).unwrap();
+/// let filter = Bitmap::new(&device_tensor);
+/// ```
+pub fn bitmap_from_included_indices(
+    n_queries: usize,
+    n_samples: usize,
+    included_indices_per_query: &[Vec<usize>],
+) -> ManagedTensor {
+    use ndarray::Array1;
+
+    let bits_per_query = (n_samples + 31) / 32;
+    let bitmap_size = n_queries * bits_per_query;
+    let mut bitmap_data = Array1::<u32>::zeros(bitmap_size);
+
+    // Process each query's inclusion list
+    for (query_idx, included_indices) in included_indices_per_query.iter().enumerate() {
+        if query_idx >= n_queries {
+            break;
+        }
+        let offset = query_idx * bits_per_query;
+        for &idx in included_indices {
+            if idx < n_samples {
+                let word_idx = offset + (idx / 32);
+                let bit_idx = idx % 32;
+                bitmap_data[word_idx] |= 1u32 << bit_idx;
+            }
+        }
+    }
+
+    ManagedTensor::from(&bitmap_data)
+}
+
+/// Create a bitset tensor by excluding specific indices
+///
+/// Creates a bitset tensor in host memory where all vectors are included except those specified.
+/// This is a special case of bitmap with a single query.
+/// Call `.to_device()` on the returned tensor before using it with a filter.
+///
+/// # Arguments
+///
+/// * `n_samples` - Total number of vectors in the dataset
+/// * `excluded_indices` - Slice of vector indices to exclude from search
+///
+/// # Returns
+///
+/// A managed tensor in host memory. Use `.to_device(&res)` to move it to GPU before creating a filter.
+///
+/// # Example
+///
+/// ```no_run
+/// use cuvs::filters::{Bitset, bitset_from_excluded_indices};
+/// use cuvs::Resources;
+///
+/// let res = Resources::new().unwrap();
+/// let excluded = vec![0, 5, 10, 15];
+/// let tensor = bitset_from_excluded_indices(1000, &excluded);
+/// let device_tensor = tensor.to_device(&res).unwrap();
+/// let filter = Bitset::new(&device_tensor);
+/// ```
+pub fn bitset_from_excluded_indices(n_samples: usize, excluded_indices: &[usize]) -> ManagedTensor {
+    // Bitset is a special case of bitmap with n_queries = 1
+    bitmap_from_excluded_indices(1, n_samples, &[excluded_indices.to_vec()])
+}
+
+/// Create a bitset tensor by including only specific indices
+///
+/// Creates a bitset tensor in host memory where only specified vectors are included.
+/// This is a special case of bitmap with a single query.
+/// Call `.to_device()` on the returned tensor before using it with a filter.
+///
+/// # Arguments
+///
+/// * `n_samples` - Total number of vectors in the dataset
+/// * `included_indices` - Slice of vector indices to include in search
+///
+/// # Returns
+///
+/// A managed tensor in host memory. Use `.to_device(&res)` to move it to GPU before creating a filter.
+///
+/// # Example
+///
+/// ```no_run
+/// use cuvs::filters::{Bitset, bitset_from_included_indices};
+/// use cuvs::Resources;
+///
+/// let res = Resources::new().unwrap();
+/// let included = vec![0, 5, 10, 15];
+/// let tensor = bitset_from_included_indices(1000, &included);
+/// let device_tensor = tensor.to_device(&res).unwrap();
+/// let filter = Bitset::new(&device_tensor);
+/// ```
+pub fn bitset_from_included_indices(n_samples: usize, included_indices: &[usize]) -> ManagedTensor {
+    // Bitset is a special case of bitmap with n_queries = 1
+    bitmap_from_included_indices(1, n_samples, &[included_indices.to_vec()])
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_no_filter() {
+        let filter = NoFilter;
+        let ffi_filter = filter.into_ffi();
+
+        assert_eq!(ffi_filter.addr, 0);
+        assert_eq!(ffi_filter.type_, ffi::cuvsFilterType::NO_FILTER);
+    }
+
+    #[test]
+    fn test_bitset_filter() {
+        let arr = ndarray::Array::<u32, _>::zeros(32);
+        let tensor = ManagedTensor::from(&arr);
+        let filter = Bitset::new(&tensor);
+        let ffi_filter = filter.into_ffi();
+
+        assert_eq!(ffi_filter.addr, tensor.as_ptr() as uintptr_t);
+        assert_eq!(ffi_filter.type_, ffi::cuvsFilterType::BITSET);
+    }
+
+    #[test]
+    fn test_bitmap_filter() {
+        let arr = ndarray::Array::<u32, _>::zeros(320);
+        let tensor = ManagedTensor::from(&arr);
+        let filter = Bitmap::new(&tensor);
+        let ffi_filter = filter.into_ffi();
+
+        assert_eq!(ffi_filter.addr, tensor.as_ptr() as uintptr_t);
+        assert_eq!(ffi_filter.type_, ffi::cuvsFilterType::BITMAP);
+    }
+
+    #[test]
+    fn test_bitset_from_excluded_indices() {
+        use ndarray::Array1;
+
+        let n_samples = 100;
+        let excluded = vec![0, 5, 10, 99];
+        let bitset_size = (n_samples + 31) / 32;
+
+        // Create manually for comparison
+        let mut expected = Array1::<u32>::from_elem(bitset_size, 0xFFFFFFFF);
+        for &idx in &excluded {
+            let word_idx = idx / 32;
+            let bit_idx = idx % 32;
+            expected[word_idx] &= !(1u32 << bit_idx);
+        }
+
+        // Create using from_excluded_indices (host version for testing)
+        let mut actual = Array1::<u32>::from_elem(bitset_size, 0xFFFFFFFF);
+        for &idx in &excluded {
+            if idx < n_samples {
+                let word_idx = idx / 32;
+                let bit_idx = idx % 32;
+                actual[word_idx] &= !(1u32 << bit_idx);
+            }
+        }
+
+        assert_eq!(actual, expected);
+
+        // Verify specific bits are cleared
+        assert_eq!(actual[0] & 1, 0); // index 0
+        assert_eq!(actual[0] & (1 << 5), 0); // index 5
+        assert_eq!(actual[0] & (1 << 10), 0); // index 10
+        assert_eq!(actual[3] & (1 << 3), 0); // index 99 (word 3, bit 3)
+    }
+
+    #[test]
+    fn test_bitset_from_included_indices() {
+        use ndarray::Array1;
+
+        let n_samples = 100;
+        let included = vec![0, 5, 10, 99];
+        let bitset_size = (n_samples + 31) / 32;
+
+        // Create using from_included_indices logic (host version for testing)
+        let mut actual = Array1::<u32>::zeros(bitset_size);
+        for &idx in &included {
+            if idx < n_samples {
+                let word_idx = idx / 32;
+                let bit_idx = idx % 32;
+                actual[word_idx] |= 1u32 << bit_idx;
+            }
+        }
+
+        // Verify specific bits are set
+        assert_eq!(actual[0] & 1, 1); // index 0
+        assert_eq!(actual[0] & (1 << 5), 1 << 5); // index 5
+        assert_eq!(actual[0] & (1 << 10), 1 << 10); // index 10
+        assert_eq!(actual[3] & (1 << 3), 1 << 3); // index 99 (word 3, bit 3)
+
+        // Verify other bits are not set
+        assert_eq!(actual[0] & (1 << 1), 0); // index 1
+        assert_eq!(actual[0] & (1 << 2), 0); // index 2
+    }
+
+    #[test]
+    fn test_bitmap_from_excluded_indices() {
+        use ndarray::Array1;
+
+        let n_queries = 3;
+        let n_samples = 100;
+        let bits_per_query = (n_samples + 31) / 32;
+        let bitmap_size = n_queries * bits_per_query;
+
+        let excluded_per_query = vec![vec![0, 1], vec![50], vec![99]];
+
+        // Create using from_excluded_indices logic (host version for testing)
+        let mut actual = Array1::<u32>::from_elem(bitmap_size, 0xFFFFFFFF);
+        for (query_idx, excluded_indices) in excluded_per_query.iter().enumerate() {
+            let offset = query_idx * bits_per_query;
+            for &idx in excluded_indices {
+                if idx < n_samples {
+                    let word_idx = offset + (idx / 32);
+                    let bit_idx = idx % 32;
+                    actual[word_idx] &= !(1u32 << bit_idx);
+                }
+            }
+        }
+
+        // Verify specific bits are cleared
+        // Query 0, index 0
+        assert_eq!(actual[0] & 1, 0);
+        // Query 0, index 1
+        assert_eq!(actual[0] & 2, 0);
+        // Query 1, index 50 (word bits_per_query + 1, bit 18)
+        let word_idx = bits_per_query + 50 / 32;
+        let bit_idx = 50 % 32;
+        assert_eq!(actual[word_idx] & (1 << bit_idx), 0);
+    }
+
+    #[test]
+    fn test_bitmap_from_included_indices() {
+        use ndarray::Array1;
+
+        let n_queries = 3;
+        let n_samples = 100;
+        let bits_per_query = (n_samples + 31) / 32;
+        let bitmap_size = n_queries * bits_per_query;
+
+        let included_per_query = vec![vec![0, 1], vec![50], vec![99]];
+
+        // Create using from_included_indices logic (host version for testing)
+        let mut actual = Array1::<u32>::zeros(bitmap_size);
+        for (query_idx, included_indices) in included_per_query.iter().enumerate() {
+            let offset = query_idx * bits_per_query;
+            for &idx in included_indices {
+                if idx < n_samples {
+                    let word_idx = offset + (idx / 32);
+                    let bit_idx = idx % 32;
+                    actual[word_idx] |= 1u32 << bit_idx;
+                }
+            }
+        }
+
+        // Verify specific bits are set
+        // Query 0, index 0
+        assert_eq!(actual[0] & 1, 1);
+        // Query 0, index 1
+        assert_eq!(actual[0] & 2, 2);
+        // Query 1, index 50 (word bits_per_query + 1, bit 18)
+        let word_idx = bits_per_query + 50 / 32;
+        let bit_idx = 50 % 32;
+        assert_eq!(actual[word_idx] & (1 << bit_idx), 1 << bit_idx);
+
+        // Verify other bits are not set (Query 0, index 2)
+        assert_eq!(actual[0] & 4, 0);
+    }
+}
diff --git a/rust/cuvs/src/ivf_flat/index.rs b/rust/cuvs/src/ivf_flat/index.rs
index c38be828de..474dd486c2 100644
--- a/rust/cuvs/src/ivf_flat/index.rs
+++ b/rust/cuvs/src/ivf_flat/index.rs
@@ -3,50 +3,102 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
+use std::ffi::CString;
 use std::io::{stderr, Write};
+use std::marker::PhantomData;
 
-use crate::dlpack::ManagedTensor;
+use crate::dlpack::{DatasetOwnership, ManagedTensor};
 use crate::error::{check_cuvs, Result};
+use crate::filters::{Filter, NoFilter};
 use crate::ivf_flat::{IndexParams, SearchParams};
 use crate::resources::Resources;
 
 /// Ivf-Flat ANN Index
+///
+/// The lifetime parameter `'a` ensures the dataset outlives the index when built
+/// with [`Index::build`]. Use [`Index::build_owned`] for a self-contained index
+/// that owns its dataset (e.g., after [`ManagedTensor::to_device`]).
+///
+/// # Examples
+///
+/// ## Borrowed dataset (compiler enforces lifetime)
+///
+/// ```no_run
+/// # use cuvs::{ManagedTensor, Resources};
+/// # use cuvs::ivf_flat::{Index, IndexParams};
+/// let res = Resources::new().unwrap();
+/// let arr = ndarray::Array::<f32, _>::zeros((1024, 16));
+/// let params = IndexParams::new().unwrap();
+/// let tensor = ManagedTensor::from(&arr);
+/// let index = Index::build(&res, &params, &tensor).unwrap();
+/// // arr and tensor must remain alive while index is in use
+/// ```
+///
+/// ## Owned dataset ('static lifetime)
+///
+/// ```no_run
+/// # use cuvs::{ManagedTensor, Resources};
+/// # use cuvs::ivf_flat::{Index, IndexParams};
+/// let res = Resources::new().unwrap();
+/// let arr = ndarray::Array::<f32, _>::zeros((1024, 16));
+/// let params = IndexParams::new().unwrap();
+/// let device_tensor = ManagedTensor::from(&arr).to_device(&res).unwrap();
+/// let index = Index::build_owned(&res, &params, device_tensor).unwrap();
+/// drop(arr); // Fine — index owns the device copy
+/// ```
 #[derive(Debug)]
-pub struct Index(ffi::cuvsIvfFlatIndex_t);
+pub struct Index<'a> {
+    inner: ffi::cuvsIvfFlatIndex_t,
+    _data: DatasetOwnership<'a>,
+}
+
+impl<'a> Index<'a> {
+    /// Creates a new FFI index handle.
+    fn create_handle() -> Result<ffi::cuvsIvfFlatIndex_t> {
+        unsafe {
+            let mut index = std::mem::MaybeUninit::<ffi::cuvsIvfFlatIndex_t>::uninit();
+            check_cuvs(ffi::cuvsIvfFlatIndexCreate(index.as_mut_ptr()))?;
+            Ok(index.assume_init())
+        }
+    }
 
-impl Index {
-    /// Builds a new Index from the dataset for efficient search.
+    /// Builds a new IVF-Flat Index from a borrowed dataset.
+    ///
+    /// The compiler enforces that `dataset` outlives the returned index,
+    /// preventing use-after-free when the C library dereferences its
+    /// internal view of the data.
     ///
     /// # Arguments
     ///
     /// * `res` - Resources to use
     /// * `params` - Parameters for building the index
     /// * `dataset` - A row-major matrix on either the host or device to index
-    pub fn build<T: Into<ManagedTensor>>(
+    pub fn build(
         res: &Resources,
         params: &IndexParams,
-        dataset: T,
-    ) -> Result<Index> {
-        let dataset: ManagedTensor = dataset.into();
-        let index = Index::new()?;
+        dataset: &'a ManagedTensor,
+    ) -> Result<Index<'a>> {
+        let inner = Self::create_handle()?;
         unsafe {
             check_cuvs(ffi::cuvsIvfFlatBuild(
                 res.0,
                 params.0,
                 dataset.as_ptr(),
-                index.0,
+                inner,
             ))?;
         }
-        Ok(index)
+        Ok(Index {
+            inner,
+            _data: DatasetOwnership::Borrowed(PhantomData),
+        })
     }
 
     /// Creates a new empty index
-    pub fn new() -> Result<Index> {
-        unsafe {
-            let mut index = std::mem::MaybeUninit::<ffi::cuvsIvfFlatIndex_t>::uninit();
-            check_cuvs(ffi::cuvsIvfFlatIndexCreate(index.as_mut_ptr()))?;
-            Ok(Index(index.assume_init()))
-        }
+    pub fn new() -> Result<Index<'a>> {
+        Ok(Index {
+            inner: Self::create_handle()?,
+            _data: DatasetOwnership::Borrowed(PhantomData),
+        })
     }
 
     /// Perform a Approximate Nearest Neighbors search on the Index
@@ -58,6 +110,7 @@ impl Index {
     /// * `queries` - A matrix in device memory to query for
     /// * `neighbors` - Matrix in device memory that receives the indices of the nearest neighbors
     /// * `distances` - Matrix in device memory that receives the distances of the nearest neighbors
+    /// * `filter` - Optional filter to apply to the search (defaults to NoFilter if not provided)
     pub fn search(
         &self,
         res: &Resources,
@@ -65,29 +118,106 @@ impl Index {
         queries: &ManagedTensor,
         neighbors: &ManagedTensor,
         distances: &ManagedTensor,
+        filter: Option<&dyn Filter>,
     ) -> Result<()> {
         unsafe {
-            let prefilter = ffi::cuvsFilter {
-                addr: 0,
-                type_: ffi::cuvsFilterType::NO_FILTER,
-            };
+            let filter_ffi = filter
+                .map(|f| f.into_ffi())
+                .unwrap_or_else(|| NoFilter.into_ffi());
 
             check_cuvs(ffi::cuvsIvfFlatSearch(
                 res.0,
                 params.0,
-                self.0,
+                self.inner,
                 queries.as_ptr(),
                 neighbors.as_ptr(),
                 distances.as_ptr(),
-                prefilter,
+                filter_ffi,
+            ))
+        }
+    }
+
+    /// Save the IVF-Flat index to file.
+    ///
+    /// Experimental, both the API and the serialization format are subject to change.
+    ///
+    /// # Arguments
+    ///
+    /// * `res` - Resources to use
+    /// * `filename` - The file name for saving the index
+    pub fn serialize(&self, res: &Resources, filename: &str) -> Result<()> {
+        let c_filename = CString::new(filename).expect("filename contains null byte");
+        unsafe {
+            check_cuvs(ffi::cuvsIvfFlatSerialize(
+                res.0,
+                c_filename.as_ptr(),
+                self.inner,
             ))
         }
     }
+
+    /// Load an IVF-Flat index from file.
+    ///
+    /// Experimental, both the API and the serialization format are subject to change.
+    ///
+    /// # Arguments
+    ///
+    /// * `res` - Resources to use
+    /// * `filename` - The name of the file that stores the index
+    pub fn deserialize(res: &Resources, filename: &str) -> Result<Index<'a>> {
+        let c_filename = CString::new(filename).expect("filename contains null byte");
+        let inner = Self::create_handle()?;
+        unsafe {
+            check_cuvs(ffi::cuvsIvfFlatDeserialize(
+                res.0,
+                c_filename.as_ptr(),
+                inner,
+            ))?;
+        }
+        Ok(Index {
+            inner,
+            _data: DatasetOwnership::Borrowed(PhantomData),
+        })
+    }
 }
 
-impl Drop for Index {
+impl Index<'static> {
+    /// Builds a new IVF-Flat Index from an owned dataset.
+    ///
+    /// The index takes ownership of `dataset`, making it self-contained
+    /// with a `'static` lifetime. This is useful when the dataset is a
+    /// device copy (from [`ManagedTensor::to_device`]) that should live
+    /// as long as the index.
+    ///
+    /// # Arguments
+    ///
+    /// * `res` - Resources to use
+    /// * `params` - Parameters for building the index
+    /// * `dataset` - A row-major matrix to index (ownership transferred to the index)
+    pub fn build_owned(
+        res: &Resources,
+        params: &IndexParams,
+        dataset: ManagedTensor,
+    ) -> Result<Index<'static>> {
+        let inner = Self::create_handle()?;
+        unsafe {
+            check_cuvs(ffi::cuvsIvfFlatBuild(
+                res.0,
+                params.0,
+                dataset.as_ptr(),
+                inner,
+            ))?;
+        }
+        Ok(Index {
+            inner,
+            _data: DatasetOwnership::Owned(dataset),
+        })
+    }
+}
+
+impl Drop for Index<'_> {
     fn drop(&mut self) {
-        if let Err(e) = check_cuvs(unsafe { ffi::cuvsIvfFlatIndexDestroy(self.0) }) {
+        if let Err(e) = check_cuvs(unsafe { ffi::cuvsIvfFlatIndexDestroy(self.inner) }) {
             write!(stderr(), "failed to call cuvsIvfFlatIndexDestroy {:?}", e)
                 .expect("failed to write to stderr");
         }
@@ -115,8 +245,8 @@ mod tests {
 
         let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap();
 
-        // build the ivf-flat index
-        let index = Index::build(&res, &build_params, dataset_device)
+        // build the ivf-flat index (owned — device copy lives in the index)
+        let index = Index::build_owned(&res, &build_params, dataset_device)
             .expect("failed to create ivf-flat index");
 
         // use the first 4 points from the dataset as queries : will test that we get them back
@@ -143,7 +273,7 @@ mod tests {
         let search_params = SearchParams::new().unwrap();
 
         index
-            .search(&res, &search_params, &queries, &neighbors, &distances)
+            .search(&res, &search_params, &queries, &neighbors, &distances, None)
             .unwrap();
 
         // Copy back to host memory
@@ -173,8 +303,8 @@ mod tests {
 
         let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap();
 
-        // Build the index once
-        let index = Index::build(&res, &build_params, dataset_device)
+        // Build the index once (owned)
+        let index = Index::build_owned(&res, &build_params, dataset_device)
             .expect("failed to create ivf-flat index");
 
         let search_params = SearchParams::new().unwrap();
@@ -198,8 +328,8 @@ mod tests {
 
             // This should work on every iteration because search() takes &self
             index
-                .search(&res, &search_params, &queries, &neighbors, &distances)
-                .expect(&format!("search iteration {} failed", search_iter));
+                .search(&res, &search_params, &queries, &neighbors, &distances, None)
+                .unwrap_or_else(|e| panic!("search iteration {} failed: {}", search_iter, e));
 
             // Copy back to host memory
             distances.to_host(&res, &mut distances_host).unwrap();
@@ -214,4 +344,50 @@ mod tests {
             );
         }
     }
+
+    /// Test that an index built with build (borrowed) ties the dataset lifetime.
+    #[test]
+    fn test_ivf_flat_borrowed_build() {
+        let res = Resources::new().unwrap();
+        let build_params = IndexParams::new().unwrap().set_n_lists(64);
+
+        let n_datapoints = 1024;
+        let n_features = 16;
+        let dataset =
+            ndarray::Array::<f32, _>::random((n_datapoints, n_features), Uniform::new(0., 1.0));
+
+        // Create a device tensor and borrow it for the index
+        let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap();
+        let index = Index::build(&res, &build_params, &dataset_device)
+            .expect("failed to create ivf-flat index");
+
+        // Search while the borrowed dataset is still alive
+        let n_queries = 4;
+        let k = 5;
+        let queries = dataset.slice(s![0..n_queries, ..]);
+        let queries = ManagedTensor::from(&queries).to_device(&res).unwrap();
+
+        let mut neighbors_host = ndarray::Array::<i64, _>::zeros((n_queries, k));
+        let neighbors = ManagedTensor::from(&neighbors_host)
+            .to_device(&res)
+            .unwrap();
+
+        let mut distances_host = ndarray::Array::<f32, _>::zeros((n_queries, k));
+        let distances = ManagedTensor::from(&distances_host)
+            .to_device(&res)
+            .unwrap();
+
+        let search_params = SearchParams::new().unwrap();
+
+        index
+            .search(&res, &search_params, &queries, &neighbors, &distances, None)
+            .unwrap();
+
+        distances.to_host(&res, &mut distances_host).unwrap();
+        neighbors.to_host(&res, &mut neighbors_host).unwrap();
+
+        assert_eq!(neighbors_host[[0, 0]], 0);
+        assert_eq!(neighbors_host[[1, 0]], 1);
+        // dataset_device is still alive here — compiler ensures it
+    }
 }
diff --git a/rust/cuvs/src/ivf_flat/index_params.rs b/rust/cuvs/src/ivf_flat/index_params.rs
index 523bc7619e..7eac14e413 100644
--- a/rust/cuvs/src/ivf_flat/index_params.rs
+++ b/rust/cuvs/src/ivf_flat/index_params.rs
@@ -73,6 +73,15 @@ impl IndexParams {
     }
 }
 
+impl IndexParams {
+    /// Returns a builder for constructing [`IndexParams`] with validated parameters.
+    ///
+    /// See [`IndexParamsBuilder`] for details.
+    pub fn builder() -> IndexParamsBuilder {
+        IndexParamsBuilder::default()
+    }
+}
+
 impl fmt::Debug for IndexParams {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         // custom debug trait here, default value will show the pointer address
@@ -94,6 +103,101 @@ impl Drop for IndexParams {
     }
 }
 
+/// Builder for IVF-Flat [`IndexParams`] with pre-validated parameters.
+///
+/// Construct via [`IndexParams::builder()`]. Defaults match the cuVS C API defaults.
+pub struct IndexParamsBuilder {
+    n_lists: u32,
+    metric: Option<DistanceType>,
+    metric_arg: f32,
+    kmeans_n_iters: u32,
+    kmeans_trainset_fraction: f64,
+    add_data_on_build: bool,
+}
+
+impl Default for IndexParamsBuilder {
+    fn default() -> Self {
+        Self {
+            n_lists: 1024,
+            metric: None,
+            metric_arg: 2.0,
+            kmeans_n_iters: 20,
+            kmeans_trainset_fraction: 0.5,
+            add_data_on_build: true,
+        }
+    }
+}
+
+impl IndexParamsBuilder {
+    /// The number of clusters used in the coarse quantizer.
+    ///
+    /// Must be > 0.
+    pub fn n_lists(mut self, v: u32) -> Self {
+        self.n_lists = v;
+        self
+    }
+
+    /// DistanceType to use for building the index.
+    pub fn metric(mut self, v: DistanceType) -> Self {
+        self.metric = Some(v);
+        self
+    }
+
+    /// Metric argument (e.g. p for Minkowski distance).
+    pub fn metric_arg(mut self, v: f32) -> Self {
+        self.metric_arg = v;
+        self
+    }
+
+    /// Number of iterations searching for kmeans centers during index building.
+    pub fn kmeans_n_iters(mut self, v: u32) -> Self {
+        self.kmeans_n_iters = v;
+        self
+    }
+
+    /// Fraction of dataset used for kmeans training. Must be in (0, 1].
+    pub fn kmeans_trainset_fraction(mut self, v: f64) -> Self {
+        self.kmeans_trainset_fraction = v;
+        self
+    }
+
+    /// Populate the index with the dataset during build. When false, use `extend`.
+    pub fn add_data_on_build(mut self, v: bool) -> Self {
+        self.add_data_on_build = v;
+        self
+    }
+
+    /// Validate all parameters without allocating any GPU resources.
+    pub fn validate(&self) -> crate::error::Result<()> {
+        if self.n_lists == 0 {
+            return Err(format!("n_lists must be > 0; got {}", self.n_lists).into());
+        }
+        if self.kmeans_trainset_fraction <= 0.0 || self.kmeans_trainset_fraction > 1.0 {
+            return Err(format!(
+                "kmeans_trainset_fraction must be in (0, 1]; got {}",
+                self.kmeans_trainset_fraction
+            )
+            .into());
+        }
+        Ok(())
+    }
+
+    /// Validate all parameters and allocate the FFI struct.
+    pub fn build(self) -> crate::error::Result<IndexParams> {
+        self.validate()?;
+        let mut params = IndexParams::new()?
+            .set_n_lists(self.n_lists)
+            .set_metric_arg(self.metric_arg)
+            .set_kmeans_n_iters(self.kmeans_n_iters)
+            .set_kmeans_trainset_fraction(self.kmeans_trainset_fraction)
+            .set_add_data_on_build(self.add_data_on_build);
+        if let Some(metric) = self.metric {
+            params = params.set_metric(metric);
+        }
+        Ok(params)
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -110,4 +214,22 @@ mod tests {
             assert_eq!((*params.0).add_data_on_build, false);
         }
     }
+
+    #[test]
+    fn builder_rejects_zero_n_lists() {
+        let err = IndexParams::builder().n_lists(0).validate().unwrap_err();
+        assert!(
+            err.to_string().contains("n_lists"),
+            "error message should name the field: {err}"
+        );
+    }
+
+    #[test]
+    fn builder_accepts_valid_params() {
+        assert!(IndexParams::builder()
+            .n_lists(256)
+            .kmeans_trainset_fraction(0.5)
+            .validate()
+            .is_ok());
+    }
 }
diff --git a/rust/cuvs/src/ivf_flat/mod.rs b/rust/cuvs/src/ivf_flat/mod.rs
index 7417116965..954c0238d4 100644
--- a/rust/cuvs/src/ivf_flat/mod.rs
+++ b/rust/cuvs/src/ivf_flat/mod.rs
@@ -28,7 +28,8 @@
 //!
 //!     // build the ivf-flat index
 //!     let build_params = IndexParams::new()?;
-//!     let index = Index::build(&res, &build_params, &dataset)?;
+//!     let tensor = ManagedTensor::from(&dataset);
+//!     let index = Index::build(&res, &build_params, &tensor)?;
 //!     println!(
 //!         "Indexed {}x{} datapoints into ivf-flat index",
 //!         n_datapoints, n_features
@@ -72,5 +73,5 @@ mod index_params;
 mod search_params;
 
 pub use index::Index;
-pub use index_params::IndexParams;
+pub use index_params::{IndexParams, IndexParamsBuilder};
 pub use search_params::SearchParams;
diff --git a/rust/cuvs/src/ivf_pq/index.rs b/rust/cuvs/src/ivf_pq/index.rs
index f61e3b771a..9d5638d98b 100644
--- a/rust/cuvs/src/ivf_pq/index.rs
+++ b/rust/cuvs/src/ivf_pq/index.rs
@@ -3,50 +3,102 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
+use std::ffi::CString;
 use std::io::{stderr, Write};
+use std::marker::PhantomData;
 
-use crate::dlpack::ManagedTensor;
+use crate::dlpack::{DatasetOwnership, ManagedTensor};
 use crate::error::{check_cuvs, Result};
+use crate::filters::{Filter, NoFilter};
 use crate::ivf_pq::{IndexParams, SearchParams};
 use crate::resources::Resources;
 
 /// Ivf-Pq ANN Index
+///
+/// The lifetime parameter `'a` ensures the dataset outlives the index when built
+/// with [`Index::build`]. Use [`Index::build_owned`] for a self-contained index
+/// that owns its dataset (e.g., after [`ManagedTensor::to_device`]).
+///
+/// # Examples
+///
+/// ## Borrowed dataset (compiler enforces lifetime)
+///
+/// ```no_run
+/// # use cuvs::{ManagedTensor, Resources};
+/// # use cuvs::ivf_pq::{Index, IndexParams};
+/// let res = Resources::new().unwrap();
+/// let arr = ndarray::Array::<f32, _>::zeros((1024, 16));
+/// let params = IndexParams::new().unwrap();
+/// let tensor = ManagedTensor::from(&arr);
+/// let index = Index::build(&res, &params, &tensor).unwrap();
+/// // arr and tensor must remain alive while index is in use
+/// ```
+///
+/// ## Owned dataset ('static lifetime)
+///
+/// ```no_run
+/// # use cuvs::{ManagedTensor, Resources};
+/// # use cuvs::ivf_pq::{Index, IndexParams};
+/// let res = Resources::new().unwrap();
+/// let arr = ndarray::Array::<f32, _>::zeros((1024, 16));
+/// let params = IndexParams::new().unwrap();
+/// let device_tensor = ManagedTensor::from(&arr).to_device(&res).unwrap();
+/// let index = Index::build_owned(&res, &params, device_tensor).unwrap();
+/// drop(arr); // Fine — index owns the device copy
+/// ```
 #[derive(Debug)]
-pub struct Index(ffi::cuvsIvfPqIndex_t);
+pub struct Index<'a> {
+    inner: ffi::cuvsIvfPqIndex_t,
+    _data: DatasetOwnership<'a>,
+}
 
-impl Index {
-    /// Builds a new Index from the dataset for efficient search.
+impl<'a> Index<'a> {
+    /// Creates a new FFI index handle.
+    fn create_handle() -> Result<ffi::cuvsIvfPqIndex_t> {
+        unsafe {
+            let mut index = std::mem::MaybeUninit::<ffi::cuvsIvfPqIndex_t>::uninit();
+            check_cuvs(ffi::cuvsIvfPqIndexCreate(index.as_mut_ptr()))?;
+            Ok(index.assume_init())
+        }
+    }
+
+    /// Builds a new IVF-PQ Index from a borrowed dataset.
+    ///
+    /// The compiler enforces that `dataset` outlives the returned index,
+    /// preventing use-after-free when the C library dereferences its
+    /// internal view of the data.
     ///
     /// # Arguments
     ///
     /// * `res` - Resources to use
     /// * `params` - Parameters for building the index
     /// * `dataset` - A row-major matrix on either the host or device to index
-    pub fn build<T: Into<ManagedTensor>>(
+    pub fn build(
         res: &Resources,
         params: &IndexParams,
-        dataset: T,
-    ) -> Result<Index> {
-        let dataset: ManagedTensor = dataset.into();
-        let index = Index::new()?;
+        dataset: &'a ManagedTensor,
+    ) -> Result<Index<'a>> {
+        let inner = Self::create_handle()?;
         unsafe {
             check_cuvs(ffi::cuvsIvfPqBuild(
                 res.0,
                 params.0,
                 dataset.as_ptr(),
-                index.0,
+                inner,
             ))?;
         }
-        Ok(index)
+        Ok(Index {
+            inner,
+            _data: DatasetOwnership::Borrowed(PhantomData),
+        })
     }
 
     /// Creates a new empty index
-    pub fn new() -> Result<Index> {
-        unsafe {
-            let mut index = std::mem::MaybeUninit::<ffi::cuvsIvfPqIndex_t>::uninit();
-            check_cuvs(ffi::cuvsIvfPqIndexCreate(index.as_mut_ptr()))?;
-            Ok(Index(index.assume_init()))
-        }
+    pub fn new() -> Result<Index<'a>> {
+        Ok(Index {
+            inner: Self::create_handle()?,
+            _data: DatasetOwnership::Borrowed(PhantomData),
+        })
     }
 
     /// Perform a Approximate Nearest Neighbors search on the Index
@@ -58,6 +110,7 @@ impl Index {
     /// * `queries` - A matrix in device memory to query for
     /// * `neighbors` - Matrix in device memory that receives the indices of the nearest neighbors
     /// * `distances` - Matrix in device memory that receives the distances of the nearest neighbors
+    /// * `filter` - Optional filter to apply to the search (defaults to NoFilter if not provided)
     pub fn search(
         &self,
         res: &Resources,
@@ -65,23 +118,106 @@ impl Index {
         queries: &ManagedTensor,
         neighbors: &ManagedTensor,
         distances: &ManagedTensor,
+        filter: Option<&dyn Filter>,
     ) -> Result<()> {
         unsafe {
+            let filter_ffi = filter
+                .map(|f| f.into_ffi())
+                .unwrap_or_else(|| NoFilter.into_ffi());
+
             check_cuvs(ffi::cuvsIvfPqSearch(
                 res.0,
                 params.0,
-                self.0,
+                self.inner,
                 queries.as_ptr(),
                 neighbors.as_ptr(),
                 distances.as_ptr(),
+                filter_ffi,
             ))
         }
     }
+
+    /// Save the IVF-PQ index to file.
+    ///
+    /// Experimental, both the API and the serialization format are subject to change.
+    ///
+    /// # Arguments
+    ///
+    /// * `res` - Resources to use
+    /// * `filename` - The file name for saving the index
+    pub fn serialize(&self, res: &Resources, filename: &str) -> Result<()> {
+        let c_filename = CString::new(filename).expect("filename contains null byte");
+        unsafe {
+            check_cuvs(ffi::cuvsIvfPqSerialize(
+                res.0,
+                c_filename.as_ptr(),
+                self.inner,
+            ))
+        }
+    }
+
+    /// Load an IVF-PQ index from file.
+    ///
+    /// Experimental, both the API and the serialization format are subject to change.
+    ///
+    /// # Arguments
+    ///
+    /// * `res` - Resources to use
+    /// * `filename` - The name of the file that stores the index
+    pub fn deserialize(res: &Resources, filename: &str) -> Result<Index<'a>> {
+        let c_filename = CString::new(filename).expect("filename contains null byte");
+        let inner = Self::create_handle()?;
+        unsafe {
+            check_cuvs(ffi::cuvsIvfPqDeserialize(
+                res.0,
+                c_filename.as_ptr(),
+                inner,
+            ))?;
+        }
+        Ok(Index {
+            inner,
+            _data: DatasetOwnership::Borrowed(PhantomData),
+        })
+    }
 }
 
-impl Drop for Index {
+impl Index<'static> {
+    /// Builds a new IVF-PQ Index from an owned dataset.
+    ///
+    /// The index takes ownership of `dataset`, making it self-contained
+    /// with a `'static` lifetime. This is useful when the dataset is a
+    /// device copy (from [`ManagedTensor::to_device`]) that should live
+    /// as long as the index.
+    ///
+    /// # Arguments
+    ///
+    /// * `res` - Resources to use
+    /// * `params` - Parameters for building the index
+    /// * `dataset` - A row-major matrix to index (ownership transferred to the index)
+    pub fn build_owned(
+        res: &Resources,
+        params: &IndexParams,
+        dataset: ManagedTensor,
+    ) -> Result<Index<'static>> {
+        let inner = Self::create_handle()?;
+        unsafe {
+            check_cuvs(ffi::cuvsIvfPqBuild(
+                res.0,
+                params.0,
+                dataset.as_ptr(),
+                inner,
+            ))?;
+        }
+        Ok(Index {
+            inner,
+            _data: DatasetOwnership::Owned(dataset),
+        })
+    }
+}
+
+impl Drop for Index<'_> {
     fn drop(&mut self) {
-        if let Err(e) = check_cuvs(unsafe { ffi::cuvsIvfPqIndexDestroy(self.0) }) {
+        if let Err(e) = check_cuvs(unsafe { ffi::cuvsIvfPqIndexDestroy(self.inner) }) {
             write!(stderr(), "failed to call cuvsIvfPqIndexDestroy {:?}", e)
                 .expect("failed to write to stderr");
         }
@@ -109,8 +245,8 @@ mod tests {
 
         let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap();
 
-        // build the ivf-pq index
-        let index = Index::build(&res, &build_params, dataset_device)
+        // build the ivf-pq index (owned — device copy lives in the index)
+        let index = Index::build_owned(&res, &build_params, dataset_device)
             .expect("failed to create ivf-pq index");
 
         // use the first 4 points from the dataset as queries : will test that we get them back
@@ -137,7 +273,7 @@ mod tests {
         let search_params = SearchParams::new().unwrap();
 
         index
-            .search(&res, &search_params, &queries, &neighbors, &distances)
+            .search(&res, &search_params, &queries, &neighbors, &distances, None)
             .unwrap();
 
         // Copy back to host memory
@@ -167,8 +303,8 @@ mod tests {
 
         let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap();
 
-        // Build the index once
-        let index = Index::build(&res, &build_params, dataset_device)
+        // Build the index once (owned)
+        let index = Index::build_owned(&res, &build_params, dataset_device)
             .expect("failed to create ivf-pq index");
 
         let search_params = SearchParams::new().unwrap();
@@ -192,8 +328,8 @@ mod tests {
 
             // This should work on every iteration because search() takes &self
             index
-                .search(&res, &search_params, &queries, &neighbors, &distances)
-                .expect(&format!("search iteration {} failed", search_iter));
+                .search(&res, &search_params, &queries, &neighbors, &distances, None)
+                .unwrap_or_else(|e| panic!("search iteration {} failed: {}", search_iter, e));
 
             // Copy back to host memory
             distances.to_host(&res, &mut distances_host).unwrap();
@@ -208,4 +344,104 @@ mod tests {
             );
         }
     }
+
+    #[test]
+    fn test_ivf_pq_serialize_deserialize() {
+        let res = Resources::new().unwrap();
+        let build_params = IndexParams::new().unwrap().set_n_lists(64);
+
+        let n_datapoints = 1024;
+        let n_features = 16;
+        let dataset =
+            ndarray::Array::<f32, _>::random((n_datapoints, n_features), Uniform::new(0., 1.0));
+
+        let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap();
+        let index = Index::build_owned(&res, &build_params, dataset_device)
+            .expect("failed to create ivf-pq index");
+
+        let dir = std::env::temp_dir();
+        let filepath = dir.join("test_ivf_pq_index.bin");
+        let filepath_str = filepath.to_str().unwrap();
+        index
+            .serialize(&res, filepath_str)
+            .expect("failed to serialize ivf-pq index");
+
+        assert!(filepath.exists());
+        assert!(std::fs::metadata(&filepath).unwrap().len() > 0);
+
+        let loaded_index: Index<'_> =
+            Index::deserialize(&res, filepath_str).expect("failed to deserialize ivf-pq index");
+
+        let n_queries = 4;
+        let k = 10;
+        let queries = dataset.slice(s![0..n_queries, ..]);
+        let queries = ManagedTensor::from(&queries).to_device(&res).unwrap();
+        let mut neighbors_host = ndarray::Array::<i64, _>::zeros((n_queries, k));
+        let neighbors = ManagedTensor::from(&neighbors_host)
+            .to_device(&res)
+            .unwrap();
+        let mut distances_host = ndarray::Array::<f32, _>::zeros((n_queries, k));
+        let distances = ManagedTensor::from(&distances_host)
+            .to_device(&res)
+            .unwrap();
+
+        let search_params = SearchParams::new().unwrap();
+        loaded_index
+            .search(&res, &search_params, &queries, &neighbors, &distances, None)
+            .expect("failed to search deserialized index");
+
+        distances.to_host(&res, &mut distances_host).unwrap();
+        neighbors.to_host(&res, &mut neighbors_host).unwrap();
+
+        assert_eq!(neighbors_host[[0, 0]], 0);
+        assert_eq!(neighbors_host[[1, 0]], 1);
+
+        let _ = std::fs::remove_file(&filepath);
+    }
+
+    /// Test that an index built with build (borrowed) ties the dataset lifetime.
+    #[test]
+    fn test_ivf_pq_borrowed_build() {
+        let res = Resources::new().unwrap();
+        let build_params = IndexParams::new().unwrap().set_n_lists(64);
+
+        let n_datapoints = 1024;
+        let n_features = 16;
+        let dataset =
+            ndarray::Array::<f32, _>::random((n_datapoints, n_features), Uniform::new(0., 1.0));
+
+        // Create a device tensor and borrow it for the index
+        let dataset_device = ManagedTensor::from(&dataset).to_device(&res).unwrap();
+        let index = Index::build(&res, &build_params, &dataset_device)
+            .expect("failed to create ivf-pq index");
+
+        // Search while the borrowed dataset is still alive
+        let n_queries = 4;
+        let k = 5;
+        let queries = dataset.slice(s![0..n_queries, ..]);
+        let queries = ManagedTensor::from(&queries).to_device(&res).unwrap();
+
+        let mut neighbors_host = ndarray::Array::<i64, _>::zeros((n_queries, k));
+        let neighbors = ManagedTensor::from(&neighbors_host)
+            .to_device(&res)
+            .unwrap();
+
+        let mut distances_host = ndarray::Array::<f32, _>::zeros((n_queries, k));
+        let distances = ManagedTensor::from(&distances_host)
+            .to_device(&res)
+            .unwrap();
+
+        let search_params = SearchParams::new().unwrap();
+
+        index
+            .search(&res, &search_params, &queries, &neighbors, &distances, None)
+            .unwrap();
+
+        distances.to_host(&res, &mut distances_host).unwrap();
+        neighbors.to_host(&res, &mut neighbors_host).unwrap();
+
+        assert_eq!(neighbors_host[[0, 0]], 0);
+        assert_eq!(neighbors_host[[1, 0]], 1);
+        // dataset_device is still alive here — compiler ensures it
+    }
 }
diff --git a/rust/cuvs/src/ivf_pq/index_params.rs b/rust/cuvs/src/ivf_pq/index_params.rs
index e1f2d53656..7e276d3d4a 100644
--- a/rust/cuvs/src/ivf_pq/index_params.rs
+++ b/rust/cuvs/src/ivf_pq/index_params.rs
@@ -149,6 +149,15 @@ impl IndexParams {
     }
 }
 
+impl IndexParams {
+    /// Returns a builder for constructing [`IndexParams`] with validated parameters.
+    ///
+    /// See [`IndexParamsBuilder`] for details.
+    pub fn builder() -> IndexParamsBuilder {
+        IndexParamsBuilder::default()
+    }
+}
+
 impl fmt::Debug for IndexParams {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         // custom debug trait here, default value will show the pointer address
@@ -170,6 +179,159 @@ impl Drop for IndexParams {
     }
 }
 
+/// Builder for IVF-PQ [`IndexParams`] with pre-validated parameters.
+///
+/// Construct via [`IndexParams::builder()`]. Defaults match the cuVS C API defaults.
+pub struct IndexParamsBuilder {
+    n_lists: u32,
+    metric: Option<crate::distance_type::DistanceType>,
+    metric_arg: f32,
+    kmeans_n_iters: u32,
+    kmeans_trainset_fraction: f64,
+    pq_bits: u32,
+    pq_dim: u32,
+    codebook_kind: Option<cuvsIvfPqCodebookGen>,
+    codes_layout: Option<cuvsIvfPqListLayout>,
+    force_random_rotation: bool,
+    max_train_points_per_pq_code: u32,
+    add_data_on_build: bool,
+}
+
+impl Default for IndexParamsBuilder {
+    fn default() -> Self {
+        Self {
+            n_lists: 1024,
+            metric: None,
+            metric_arg: 2.0,
+            kmeans_n_iters: 20,
+            kmeans_trainset_fraction: 0.5,
+            pq_bits: 8,
+            pq_dim: 0,
+            codebook_kind: None,
+            codes_layout: None,
+            force_random_rotation: false,
+            max_train_points_per_pq_code: 256,
+            add_data_on_build: true,
+        }
+    }
+}
+
+impl IndexParamsBuilder {
+    /// The number of clusters used in the coarse quantizer.
+    ///
+    /// Must be > 0.
+    pub fn n_lists(mut self, v: u32) -> Self {
+        self.n_lists = v;
+        self
+    }
+
+    /// DistanceType to use for building the index.
+    pub fn metric(mut self, v: crate::distance_type::DistanceType) -> Self {
+        self.metric = Some(v);
+        self
+    }
+
+    /// Metric argument (e.g. p for Minkowski distance).
+    pub fn metric_arg(mut self, v: f32) -> Self {
+        self.metric_arg = v;
+        self
+    }
+
+    /// Number of iterations searching for kmeans centers during index building.
+    pub fn kmeans_n_iters(mut self, v: u32) -> Self {
+        self.kmeans_n_iters = v;
+        self
+    }
+
+    /// Fraction of dataset used for kmeans training. Must be in (0, 1].
+    pub fn kmeans_trainset_fraction(mut self, v: f64) -> Self {
+        self.kmeans_trainset_fraction = v;
+        self
+    }
+
+    /// Bit length of the vector element after quantization. Typically 4 or 8.
+    pub fn pq_bits(mut self, v: u32) -> Self {
+        self.pq_bits = v;
+        self
+    }
+
+    /// Dimensionality of the vector after product quantization. 0 = auto.
+    pub fn pq_dim(mut self, v: u32) -> Self {
+        self.pq_dim = v;
+        self
+    }
+
+    /// Codebook generation method.
+    pub fn codebook_kind(mut self, v: cuvsIvfPqCodebookGen) -> Self {
+        self.codebook_kind = Some(v);
+        self
+    }
+
+    /// Memory layout of IVF-PQ list data.
+    pub fn codes_layout(mut self, v: cuvsIvfPqListLayout) -> Self {
+        self.codes_layout = Some(v);
+        self
+    }
+
+    /// Apply a random rotation matrix on input data and queries.
+    pub fn force_random_rotation(mut self, v: bool) -> Self {
+        self.force_random_rotation = v;
+        self
+    }
+
+    /// Max number of data points per PQ code during codebook training.
+    pub fn max_train_points_per_pq_code(mut self, v: u32) -> Self {
+        self.max_train_points_per_pq_code = v;
+        self
+    }
+
+    /// Populate the index with the dataset during build. When false, use `extend`.
+    pub fn add_data_on_build(mut self, v: bool) -> Self {
+        self.add_data_on_build = v;
+        self
+    }
+
+    /// Validate all parameters without allocating any GPU resources.
+    pub fn validate(&self) -> crate::error::Result<()> {
+        if self.n_lists == 0 {
+            return Err(format!("n_lists must be > 0; got {}", self.n_lists).into());
+        }
+        if self.kmeans_trainset_fraction <= 0.0 || self.kmeans_trainset_fraction > 1.0 {
+            return Err(format!(
+                "kmeans_trainset_fraction must be in (0, 1]; got {}",
+                self.kmeans_trainset_fraction
+            )
+            .into());
+        }
+        Ok(())
+    }
+
+    /// Validate all parameters and allocate the FFI struct.
+    pub fn build(self) -> crate::error::Result<IndexParams> {
+        self.validate()?;
+        let mut params = IndexParams::new()?
+            .set_n_lists(self.n_lists)
+            .set_metric_arg(self.metric_arg)
+            .set_kmeans_n_iters(self.kmeans_n_iters)
+            .set_kmeans_trainset_fraction(self.kmeans_trainset_fraction)
+            .set_pq_bits(self.pq_bits)
+            .set_pq_dim(self.pq_dim)
+            .set_force_random_rotation(self.force_random_rotation)
+            .set_max_train_points_per_pq_code(self.max_train_points_per_pq_code)
+            .set_add_data_on_build(self.add_data_on_build);
+        if let Some(metric) = self.metric {
+            params = params.set_metric(metric);
+        }
+        if let Some(kind) = self.codebook_kind {
+            params = params.set_codebook_kind(kind);
+        }
+        if let Some(layout) = self.codes_layout {
+            params = params.set_codes_layout(layout);
+        }
+        Ok(params)
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -186,4 +348,34 @@ mod tests {
             assert_eq!((*params.0).add_data_on_build, false);
         }
     }
+
+    #[test]
+    fn builder_rejects_zero_n_lists() {
+        let err = IndexParams::builder().n_lists(0).validate().unwrap_err();
+        assert!(
+            err.to_string().contains("n_lists"),
+            "error message should name the field: {err}"
+        );
+    }
+
+    #[test]
+    fn builder_accepts_valid_params() {
+        assert!(IndexParams::builder()
+            .n_lists(256)
+            .kmeans_trainset_fraction(0.5)
+            .validate()
+            .is_ok());
+    }
+
+    #[test]
+    fn existing_setter_api_unchanged() {
+        let params = IndexParams::new()
+            .unwrap()
+            .set_n_lists(128)
+            .set_add_data_on_build(false);
+        unsafe {
+            assert_eq!((*params.0).n_lists, 128);
+            assert_eq!((*params.0).add_data_on_build, false);
+        }
+    }
 }
diff --git a/rust/cuvs/src/ivf_pq/mod.rs b/rust/cuvs/src/ivf_pq/mod.rs
index c4676cd1aa..bdabe0142e 100644
--- a/rust/cuvs/src/ivf_pq/mod.rs
+++ b/rust/cuvs/src/ivf_pq/mod.rs
@@ -25,7 +25,8 @@
 //!
 //!     // build the ivf-pq index
 //!     let build_params = IndexParams::new()?;
-//!     let index = Index::build(&res, &build_params, &dataset)?;
+//!     let tensor = ManagedTensor::from(&dataset);
+//!     let index = Index::build(&res, &build_params, &tensor)?;
 //!     println!(
 //!         "Indexed {}x{} datapoints into ivf-pq index",
 //!         n_datapoints, n_features
@@ -69,5 +70,5 @@ mod index_params;
 mod search_params;
 
 pub use index::Index;
-pub use index_params::IndexParams;
+pub use index_params::{IndexParams, IndexParamsBuilder};
 pub use search_params::SearchParams;
diff --git a/rust/cuvs/src/lib.rs b/rust/cuvs/src/lib.rs
index f085915680..a3c73e2e45 100644
--- a/rust/cuvs/src/lib.rs
+++ b/rust/cuvs/src/lib.rs
@@ -14,6 +14,7 @@ pub mod distance;
 pub mod distance_type;
 mod dlpack;
 mod error;
+pub mod filters;
 pub mod ivf_flat;
 pub mod ivf_pq;
 mod resources;
diff --git a/rust/cuvs/src/vamana/index_params.rs b/rust/cuvs/src/vamana/index_params.rs
index c52c287238..3138390a96 100644
--- a/rust/cuvs/src/vamana/index_params.rs
+++ b/rust/cuvs/src/vamana/index_params.rs
@@ -96,6 +96,15 @@ impl IndexParams {
     }
 }
 
+impl IndexParams {
+    /// Returns a builder for constructing [`IndexParams`] with validated parameters.
+    ///
+    /// See [`IndexParamsBuilder`] for details.
+    pub fn builder() -> IndexParamsBuilder {
+        IndexParamsBuilder::default()
+    }
+}
+
 impl fmt::Debug for IndexParams {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         // custom debug trait here, default value will show the pointer address
@@ -117,6 +126,135 @@ impl Drop for IndexParams {
     }
 }
 
+/// Builder for Vamana [`IndexParams`] with pre-validated parameters.
+///
+/// Construct via [`IndexParams::builder()`]. Defaults match the cuVS C API defaults.
+pub struct IndexParamsBuilder {
+    graph_degree: u32,
+    visited_size: u32,
+    vamana_iters: f32,
+    alpha: f32,
+    max_fraction: f32,
+    batch_base: f32,
+    queue_size: u32,
+    reverse_batchsize: u32,
+    metric: Option<DistanceType>,
+}
+
+impl Default for IndexParamsBuilder {
+    fn default() -> Self {
+        Self {
+            graph_degree: 64,
+            visited_size: 75,
+            vamana_iters: 2.0,
+            alpha: 1.2,
+            max_fraction: 0.06,
+            batch_base: 2.0,
+            queue_size: 255,
+            reverse_batchsize: 1_000_000,
+            metric: None,
+        }
+    }
+}
+
+impl IndexParamsBuilder {
+    /// Maximum degree of the output graph (R parameter in Vamana literature).
+    ///
+    /// Must be > 0.
+    pub fn graph_degree(mut self, v: u32) -> Self {
+        self.graph_degree = v;
+        self
+    }
+
+    /// Maximum number of visited nodes per search (L parameter in Vamana literature).
+    ///
+    /// Must be >= `graph_degree`.
+    pub fn visited_size(mut self, v: u32) -> Self {
+        self.visited_size = v;
+        self
+    }
+
+    /// Number of Vamana vector insertion iterations.
+    pub fn vamana_iters(mut self, v: f32) -> Self {
+        self.vamana_iters = v;
+        self
+    }
+
+    /// Alpha for pruning parameter.
+    ///
+    /// Must be > 0.
+    pub fn alpha(mut self, v: f32) -> Self {
+        self.alpha = v;
+        self
+    }
+
+    /// Maximum fraction of dataset inserted per batch.
+    pub fn max_fraction(mut self, v: f32) -> Self {
+        self.max_fraction = v;
+        self
+    }
+
+    /// Base of growth rate of batch sizes.
+    pub fn batch_base(mut self, v: f32) -> Self {
+        self.batch_base = v;
+        self
+    }
+
+    /// Size of candidate queue structure.
+    pub fn queue_size(mut self, v: u32) -> Self {
+        self.queue_size = v;
+        self
+    }
+
+    /// Max batchsize of reverse edge processing.
+    pub fn reverse_batchsize(mut self, v: u32) -> Self {
+        self.reverse_batchsize = v;
+        self
+    }
+
+    /// DistanceType to use for building the index.
+    pub fn metric(mut self, v: DistanceType) -> Self {
+        self.metric = Some(v);
+        self
+    }
+
+    /// Validate all parameters without allocating any GPU resources.
+    pub fn validate(&self) -> crate::error::Result<()> {
+        if self.graph_degree == 0 {
+            return Err(format!("graph_degree must be > 0; got {}", self.graph_degree).into());
+        }
+        if self.visited_size < self.graph_degree {
+            return Err(format!(
+                "visited_size ({}) must be >= graph_degree ({})",
+                self.visited_size, self.graph_degree
+            )
+            .into());
+        }
+        if self.alpha <= 0.0 {
+            return Err(format!("alpha must be > 0; got {}", self.alpha).into());
+        }
+        Ok(())
+    }
+
+    /// Validate all parameters and allocate the FFI struct.
+    pub fn build(self) -> crate::error::Result<IndexParams> {
+        self.validate()?;
+        let mut params = IndexParams::new()?
+            .set_graph_degree(self.graph_degree)
+            .set_visited_size(self.visited_size)
+            .set_vamana_iters(self.vamana_iters)
+            .set_alpha(self.alpha)
+            .set_max_fraction(self.max_fraction)
+            .set_batch_base(self.batch_base)
+            .set_queue_size(self.queue_size)
+            .set_reverse_batchsize(self.reverse_batchsize);
+        if let Some(metric) = self.metric {
+            params = params.set_metric(metric);
+        }
+        Ok(params)
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -133,4 +271,39 @@ mod tests {
             assert_eq!((*params.0).visited_size, 128);
         }
     }
+
+    #[test]
+    fn builder_rejects_zero_graph_degree() {
+        let err = IndexParams::builder()
+            .graph_degree(0)
+            .validate()
+            .unwrap_err();
+        assert!(
+            err.to_string().contains("graph_degree"),
+            "error message should name the field: {err}"
+        );
+    }
+
+    #[test]
+    fn builder_rejects_visited_size_less_than_graph_degree() {
+        let err = IndexParams::builder()
+            .graph_degree(64)
+            .visited_size(32)
+            .validate()
+            .unwrap_err();
+        assert!(
+            err.to_string().contains("visited_size"),
+            "error message should name the field: {err}"
+        );
+    }
+
+    #[test]
+    fn builder_accepts_valid_params() {
+        assert!(IndexParams::builder()
+            .graph_degree(32)
+            .visited_size(75)
+            .alpha(1.2)
+            .validate()
+            .is_ok());
+    }
 }
diff --git a/rust/cuvs/src/vamana/mod.rs b/rust/cuvs/src/vamana/mod.rs
index a3ae4ee9ff..631dd37d0e 100644
--- a/rust/cuvs/src/vamana/mod.rs
+++ b/rust/cuvs/src/vamana/mod.rs
@@ -8,4 +8,4 @@ mod index;
 mod index_params;
 
 pub use index::Index;
-pub use index_params::IndexParams;
+pub use index_params::{IndexParams, IndexParamsBuilder};