From 75ceb6f66f411acb0edbf5b5fcc41ae616ad3c47 Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Fri, 27 Feb 2026 18:10:46 +0100
Subject: [PATCH 01/43] IVF-SQ

---
 cpp/CMakeLists.txt                            |   6 +
 cpp/bench/ann/CMakeLists.txt                  |  11 +-
 .../src/cuvs/cuvs_ann_bench_param_parser.h    |  25 +
 cpp/bench/ann/src/cuvs/cuvs_benchmark.cu      |  20 +-
 cpp/bench/ann/src/cuvs/cuvs_ivf_sq.cu         |  10 +
 cpp/bench/ann/src/cuvs/cuvs_ivf_sq_wrapper.h  | 141 ++++
 cpp/include/cuvs/neighbors/ivf_sq.hpp         | 336 +++++++++
 cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh     | 664 ++++++++++++++++++
 ...f_sq_build_extend_float_uint8_t_int64_t.cu |  89 +++
 ...vf_sq_build_extend_half_uint8_t_int64_t.cu |  89 +++
 cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh    | 549 +++++++++++++++
 .../ivf_sq_search_float_uint8_t_int64_t.cu    |  29 +
 .../ivf_sq_search_half_uint8_t_int64_t.cu     |  29 +
 cpp/src/neighbors/ivf_sq/ivf_sq_serialize.cuh | 161 +++++
 .../ivf_sq/ivf_sq_serialize_uint8_t.cu        |  16 +
 cpp/src/neighbors/ivf_sq_index.cpp            | 236 +++++++
 cpp/tests/CMakeLists.txt                      |   7 +
 cpp/tests/neighbors/ann_ivf_sq.cuh            | 457 ++++++++++++
 .../ann_ivf_sq/test_float_uint8_t.cu          |  21 +
 .../cuvs_bench/config/algorithms.yaml         |   3 +
 .../cuvs_bench/config/algos/cuvs_ivf_sq.yaml  |  16 +
 21 files changed, 2913 insertions(+), 2 deletions(-)
 create mode 100644 cpp/bench/ann/src/cuvs/cuvs_ivf_sq.cu
 create mode 100644 cpp/bench/ann/src/cuvs/cuvs_ivf_sq_wrapper.h
 create mode 100644 cpp/include/cuvs/neighbors/ivf_sq.hpp
 create mode 100644 cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
 create mode 100644 cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_float_uint8_t_int64_t.cu
 create mode 100644 cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_half_uint8_t_int64_t.cu
 create mode 100644 cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
 create mode 100644 cpp/src/neighbors/ivf_sq/ivf_sq_search_float_uint8_t_int64_t.cu
 create mode 100644 cpp/src/neighbors/ivf_sq/ivf_sq_search_half_uint8_t_int64_t.cu
 create mode 100644 cpp/src/neighbors/ivf_sq/ivf_sq_serialize.cuh
 create mode 100644 cpp/src/neighbors/ivf_sq/ivf_sq_serialize_uint8_t.cu
 create mode 100644 cpp/src/neighbors/ivf_sq_index.cpp
 create mode 100644 cpp/tests/neighbors/ann_ivf_sq.cuh
 create mode 100644 cpp/tests/neighbors/ann_ivf_sq/test_float_uint8_t.cu
 create mode 100644 python/cuvs_bench/cuvs_bench/config/algos/cuvs_ivf_sq.yaml

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index d90579812a..610b9eff3f 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -621,6 +621,12 @@ if(NOT BUILD_CPU_ONLY)
     src/neighbors/ivf_pq/detail/ivf_pq_transform_half_int64_t.cu
     src/neighbors/ivf_pq/detail/ivf_pq_transform_int8_t_int64_t.cu
     src/neighbors/ivf_pq/detail/ivf_pq_transform_uint8_t_int64_t.cu
+    src/neighbors/ivf_sq_index.cpp
+    src/neighbors/ivf_sq/ivf_sq_build_extend_float_uint8_t_int64_t.cu
+    src/neighbors/ivf_sq/ivf_sq_build_extend_half_uint8_t_int64_t.cu
+    src/neighbors/ivf_sq/ivf_sq_search_float_uint8_t_int64_t.cu
+    src/neighbors/ivf_sq/ivf_sq_search_half_uint8_t_int64_t.cu
+    src/neighbors/ivf_sq/ivf_sq_serialize_uint8_t.cu
     src/neighbors/knn_merge_parts.cu
     src/neighbors/nn_descent.cu
     src/neighbors/nn_descent_float.cu
diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt
index 8d254c0933..ae42abeb35 100644
--- a/cpp/bench/ann/CMakeLists.txt
+++ b/cpp/bench/ann/CMakeLists.txt
@@ -1,6 +1,6 @@
 # =============================================================================
 # cmake-format: off
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
 # SPDX-License-Identifier: Apache-2.0
 # cmake-format: on
 # =============================================================================
@@ -24,6 +24,7 @@ option(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT "Include faiss' cpu ivf flat algori
 option(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_PQ "Include faiss' cpu ivf pq algorithm in benchmark" ON)
 option(CUVS_ANN_BENCH_USE_FAISS_CPU_HNSW_FLAT "Include faiss' hnsw algorithm in benchmark" ON)
 option(CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT "Include cuVS ivf flat algorithm in benchmark" ON)
+option(CUVS_ANN_BENCH_USE_CUVS_IVF_SQ "Include cuVS ivf sq algorithm in benchmark" ON)
 option(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ "Include cuVS ivf pq algorithm in benchmark" ON)
 option(CUVS_ANN_BENCH_USE_CUVS_CAGRA "Include cuVS CAGRA in benchmark" ON)
 option(CUVS_ANN_BENCH_USE_CUVS_BRUTE_FORCE "Include cuVS brute force knn in benchmark" ON)
@@ -80,6 +81,7 @@ set(CUVS_USE_FAISS_STATIC ON)
 if(BUILD_CPU_ONLY)
   set(CUVS_FAISS_ENABLE_GPU OFF)
   set(CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT OFF)
+  set(CUVS_ANN_BENCH_USE_CUVS_IVF_SQ OFF)
   set(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ OFF)
   set(CUVS_ANN_BENCH_USE_CUVS_CAGRA OFF)
   set(CUVS_ANN_BENCH_USE_CUVS_BRUTE_FORCE OFF)
@@ -97,6 +99,7 @@ set(CUVS_ANN_BENCH_USE_CUVS OFF)
 if(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ
    OR CUVS_ANN_BENCH_USE_CUVS_BRUTE_FORCE
    OR CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT
+   OR CUVS_ANN_BENCH_USE_CUVS_IVF_SQ
    OR CUVS_ANN_BENCH_USE_CUVS_CAGRA
    OR CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB
    OR CUVS_KNN_BENCH_USE_CUVS_BRUTE_FORCE
@@ -242,6 +245,12 @@ if(CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT)
   )
 endif()
 
+if(CUVS_ANN_BENCH_USE_CUVS_IVF_SQ)
+  ConfigureAnnBench(
+    NAME CUVS_IVF_SQ PATH src/cuvs/cuvs_benchmark.cu src/cuvs/cuvs_ivf_sq.cu LINKS cuvs
+  )
+endif()
+
 if(CUVS_ANN_BENCH_USE_CUVS_BRUTE_FORCE)
   ConfigureAnnBench(NAME CUVS_BRUTE_FORCE PATH src/cuvs/cuvs_benchmark.cu LINKS cuvs)
 endif()
diff --git a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
index faa3345d1f..4bc7505dc4 100644
--- a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
+++ b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
@@ -35,6 +35,11 @@ extern template class cuvs::bench::cuvs_cagra<uint8_t, uint32_t>;
 extern template class cuvs::bench::cuvs_cagra<int8_t, uint32_t>;
 #endif
 
+#ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_SQ
+#include "cuvs_ivf_sq_wrapper.h"
+extern template class cuvs::bench::cuvs_ivf_sq<float>;
+extern template class cuvs::bench::cuvs_ivf_sq<half>;
+#endif
 #ifdef CUVS_ANN_BENCH_USE_CUVS_MG
 #include "cuvs_ivf_flat_wrapper.h"
 #include "cuvs_mg_ivf_flat_wrapper.h"
@@ -86,6 +91,26 @@ void parse_search_param(const nlohmann::json& conf,
 }
 #endif
 
+#ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_SQ
+template <typename T>
+void parse_build_param(const nlohmann::json& conf,
+                       typename cuvs::bench::cuvs_ivf_sq<T>::build_param& param)
+{
+  param.n_lists = conf.at("nlist");
+  if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); }
+  if (conf.contains("ratio")) {
+    param.kmeans_trainset_fraction = 1.0 / static_cast<double>(conf.at("ratio"));
+  }
+}
+
+template <typename T>
+void parse_search_param(const nlohmann::json& conf,
+                        typename cuvs::bench::cuvs_ivf_sq<T>::search_param& param)
+{
+  param.ivf_sq_params.n_probes = conf.at("nprobe");
+}
+#endif
+
 #if defined(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ) || defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA) ||   \
   defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB) || defined(CUVS_ANN_BENCH_USE_CUVS_MG) || \
   defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA_DISKANN)
diff --git a/cpp/bench/ann/src/cuvs/cuvs_benchmark.cu b/cpp/bench/ann/src/cuvs/cuvs_benchmark.cu
index aebac654c2..22aeb31c38 100644
--- a/cpp/bench/ann/src/cuvs/cuvs_benchmark.cu
+++ b/cpp/bench/ann/src/cuvs/cuvs_benchmark.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -84,6 +84,15 @@ auto create_algo(const std::string& algo_name,
     }
   }
 #endif
+#ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_SQ
+  if constexpr (std::is_same_v<T, float> || std::is_same_v<T, half>) {
+    if (algo_name == "cuvs_ivf_sq") {
+      typename cuvs::bench::cuvs_ivf_sq<T>::build_param param;
+      parse_build_param<T>(conf, param);
+      a = std::make_unique<cuvs::bench::cuvs_ivf_sq<T>>(metric, dim, param);
+    }
+  }
+#endif
 #ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_PQ
   if (algo_name == "raft_ivf_pq" || algo_name == "cuvs_ivf_pq") {
     typename cuvs::bench::cuvs_ivf_pq<T, int64_t>::build_param param;
@@ -151,6 +160,15 @@ auto create_search_param(const std::string& algo_name, const nlohmann::json& con
     }
   }
 #endif
+#ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_SQ
+  if constexpr (std::is_same_v<T, float> || std::is_same_v<T, half>) {
+    if (algo_name == "cuvs_ivf_sq") {
+      auto param = std::make_unique<typename cuvs::bench::cuvs_ivf_sq<T>::search_param>();
+      parse_search_param<T>(conf, *param);
+      return param;
+    }
+  }
+#endif
 #ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_PQ
   if (algo_name == "raft_ivf_pq" || algo_name == "cuvs_ivf_pq") {
     auto param = std::make_unique<typename cuvs::bench::cuvs_ivf_pq<T, int64_t>::search_param>();
diff --git a/cpp/bench/ann/src/cuvs/cuvs_ivf_sq.cu b/cpp/bench/ann/src/cuvs/cuvs_ivf_sq.cu
new file mode 100644
index 0000000000..ec41324c8d
--- /dev/null
+++ b/cpp/bench/ann/src/cuvs/cuvs_ivf_sq.cu
@@ -0,0 +1,10 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "cuvs_ivf_sq_wrapper.h"
+
+namespace cuvs::bench {
+template class cuvs_ivf_sq<float>;
+template class cuvs_ivf_sq<half>;
+}  // namespace cuvs::bench
diff --git a/cpp/bench/ann/src/cuvs/cuvs_ivf_sq_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_ivf_sq_wrapper.h
new file mode 100644
index 0000000000..1503e6bb84
--- /dev/null
+++ b/cpp/bench/ann/src/cuvs/cuvs_ivf_sq_wrapper.h
@@ -0,0 +1,141 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+
+#include "../common/ann_types.hpp"
+#include "cuvs_ann_bench_utils.h"
+
+#include <cuvs/distance/distance.hpp>
+#include <cuvs/neighbors/ivf_sq.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/logger.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/util/cudart_utils.hpp>
+#include <rmm/cuda_stream_pool.hpp>
+
+#include <cassert>
+#include <memory>
+#include <string>
+#include <type_traits>
+
+namespace cuvs::bench {
+
+template <typename T>
+class cuvs_ivf_sq : public algo<T>, public algo_gpu {
+ public:
+  using search_param_base = typename algo<T>::search_param;
+
+  struct search_param : public search_param_base {
+    cuvs::neighbors::ivf_sq::search_params ivf_sq_params;
+  };
+
+  using build_param = cuvs::neighbors::ivf_sq::index_params;
+
+  cuvs_ivf_sq(Metric metric, int dim, const build_param& param)
+    : algo<T>(metric, dim), index_params_(param), dimension_(dim)
+  {
+    index_params_.metric                         = parse_metric_type(metric);
+    index_params_.conservative_memory_allocation = true;
+    RAFT_CUDA_TRY(cudaGetDevice(&device_));
+  }
+
+  void build(const T* dataset, size_t nrow) final;
+
+  void set_search_param(const search_param_base& param, const void* filter_bitset) override;
+
+  void search(const T* queries,
+              int batch_size,
+              int k,
+              algo_base::index_type* neighbors,
+              float* distances) const override;
+
+  [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override
+  {
+    return handle_.get_sync_stream();
+  }
+
+  [[nodiscard]] auto get_preference() const -> algo_property override
+  {
+    algo_property property;
+    property.dataset_memory_type = MemoryType::kHostMmap;
+    property.query_memory_type   = MemoryType::kDevice;
+    return property;
+  }
+
+  void save(const std::string& file) const override;
+  void load(const std::string&) override;
+  std::unique_ptr<algo<T>> copy() override;
+
+ private:
+  configured_raft_resources handle_{};
+  build_param index_params_;
+  cuvs::neighbors::ivf_sq::search_params search_params_;
+  std::shared_ptr<cuvs::neighbors::ivf_sq::index<uint8_t>> index_;
+  int device_;
+  int dimension_;
+
+  std::shared_ptr<cuvs::neighbors::filtering::base_filter> filter_;
+};
+
+template <typename T>
+void cuvs_ivf_sq<T>::build(const T* dataset, size_t nrow)
+{
+  size_t n_streams = 1;
+  raft::resource::set_cuda_stream_pool(handle_, std::make_shared<rmm::cuda_stream_pool>(n_streams));
+  index_ = std::make_shared<cuvs::neighbors::ivf_sq::index<uint8_t>>(
+    std::move(cuvs::neighbors::ivf_sq::build(
+      handle_,
+      index_params_,
+      raft::make_host_matrix_view<const T, int64_t>(dataset, nrow, dimension_))));
+}
+
+template <typename T>
+void cuvs_ivf_sq<T>::set_search_param(const search_param_base& param, const void* filter_bitset)
+{
+  filter_        = make_cuvs_filter(filter_bitset, index_->size());
+  auto sp        = dynamic_cast<const search_param&>(param);
+  search_params_ = sp.ivf_sq_params;
+  assert(search_params_.n_probes <= index_params_.n_lists);
+}
+
+template <typename T>
+void cuvs_ivf_sq<T>::save(const std::string& file) const
+{
+  cuvs::neighbors::ivf_sq::serialize(handle_, file, *index_);
+}
+
+template <typename T>
+void cuvs_ivf_sq<T>::load(const std::string& file)
+{
+  index_ =
+    std::make_shared<cuvs::neighbors::ivf_sq::index<uint8_t>>(handle_, index_params_, this->dim_);
+  cuvs::neighbors::ivf_sq::deserialize(handle_, file, index_.get());
+}
+
+template <typename T>
+std::unique_ptr<algo<T>> cuvs_ivf_sq<T>::copy()
+{
+  return std::make_unique<cuvs_ivf_sq<T>>(*this);
+}
+
+template <typename T>
+void cuvs_ivf_sq<T>::search(
+  const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const
+{
+  static_assert(sizeof(algo_base::index_type) == sizeof(int64_t));
+
+  cuvs::neighbors::ivf_sq::search(
+    handle_,
+    search_params_,
+    *index_,
+    raft::make_device_matrix_view<const T, int64_t>(queries, batch_size, index_->dim()),
+    raft::make_device_matrix_view<int64_t, int64_t>(
+      reinterpret_cast<int64_t*>(neighbors), batch_size, k),
+    raft::make_device_matrix_view<float, int64_t>(distances, batch_size, k),
+    *filter_);
+}
+
+}  // namespace cuvs::bench
diff --git a/cpp/include/cuvs/neighbors/ivf_sq.hpp b/cpp/include/cuvs/neighbors/ivf_sq.hpp
new file mode 100644
index 0000000000..2f09751e95
--- /dev/null
+++ b/cpp/include/cuvs/neighbors/ivf_sq.hpp
@@ -0,0 +1,336 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "common.hpp"
+#include <cstdint>
+#include <cuvs/distance/distance.hpp>
+#include <cuvs/neighbors/common.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/host_mdspan.hpp>
+#include <raft/core/mdspan.hpp>
+
+namespace cuvs::neighbors::ivf_sq {
+
+/**
+ * @defgroup ivf_sq_cpp_index_params IVF-SQ index build parameters
+ * @{
+ */
+
+constexpr static uint32_t kIndexGroupSize = 32;
+
+struct index_params : cuvs::neighbors::index_params {
+  uint32_t n_lists                    = 1024;
+  uint32_t kmeans_n_iters             = 20;
+  double kmeans_trainset_fraction     = 0.5;
+  bool adaptive_centers               = false;
+  bool conservative_memory_allocation = false;
+  bool add_data_on_build              = true;
+};
+
+struct search_params : cuvs::neighbors::search_params {
+  uint32_t n_probes = 20;
+};
+
+static_assert(std::is_aggregate_v<index_params>);
+static_assert(std::is_aggregate_v<search_params>);
+
+/**
+ * @}
+ */
+
+/**
+ * @defgroup ivf_sq_cpp_list_spec IVF-SQ list storage spec
+ * @{
+ */
+
+template <typename SizeT, typename IdxT, typename ExtT>
+struct list_spec {
+  static_assert(std::is_same_v<IdxT, uint8_t>, "IVF-SQ code type IdxT must be uint8_t");
+
+  using value_type   = IdxT;
+  using list_extents = raft::matrix_extent<SizeT>;
+  using index_type   = ExtT;
+
+  SizeT align_max;
+  SizeT align_min;
+  uint32_t dim;
+
+  constexpr list_spec(uint32_t dim, bool conservative_memory_allocation)
+    : dim(dim),
+      align_min(kIndexGroupSize),
+      align_max(conservative_memory_allocation ? kIndexGroupSize : 1024)
+  {
+  }
+
+  template <typename OtherSizeT>
+  constexpr explicit list_spec(const list_spec<OtherSizeT, IdxT, ExtT>& other_spec)
+    : dim{other_spec.dim}, align_min{other_spec.align_min}, align_max{other_spec.align_max}
+  {
+  }
+
+  static constexpr uint32_t kVecLen = 16;
+
+  constexpr auto make_list_extents(SizeT n_rows) const -> list_extents
+  {
+    uint32_t padded = ((dim + kVecLen - 1) / kVecLen) * kVecLen;
+    return raft::make_extents<SizeT>(n_rows, padded);
+  }
+};
+
+template <typename IdxT, typename ExtT, typename SizeT = uint32_t>
+using list_data = ivf::list<list_spec, SizeT, IdxT, ExtT>;
+
+/**
+ * @}
+ */
+
+/**
+ * @defgroup ivf_sq_cpp_index IVF-SQ index
+ * @{
+ */
+
+/**
+ * @brief IVF-SQ index.
+ *
+ * @tparam IdxT  SQ code type. Only uint8_t (8-bit, codes in [0,255]) for now.
+ *
+ * No member depends on the raw data type T (float, half). T appears only
+ * in the free-function signatures (build, search, extend) where input data
+ * is consumed, following the IVF-PQ pattern.
+ */
+template <typename IdxT>
+struct index : cuvs::neighbors::index {
+  static_assert(std::is_same_v<IdxT, uint8_t>, "IVF-SQ code type IdxT must be uint8_t for now.");
+
+  using index_params_type  = ivf_sq::index_params;
+  using search_params_type = ivf_sq::search_params;
+  using code_type          = IdxT;
+
+  static constexpr uint32_t sq_bits = sizeof(IdxT) * 8;
+
+ public:
+  index(const index&)            = delete;
+  index(index&&)                 = default;
+  index& operator=(const index&) = delete;
+  index& operator=(index&&)      = default;
+  ~index()                       = default;
+
+  index(raft::resources const& res);
+  index(raft::resources const& res, const index_params& params, uint32_t dim);
+  index(raft::resources const& res,
+        cuvs::distance::DistanceType metric,
+        uint32_t n_lists,
+        uint32_t dim,
+        bool adaptive_centers,
+        bool conservative_memory_allocation);
+
+  cuvs::distance::DistanceType metric() const noexcept;
+  bool adaptive_centers() const noexcept;
+  int64_t size() const noexcept;
+  uint32_t dim() const noexcept;
+  uint32_t n_lists() const noexcept;
+  bool conservative_memory_allocation() const noexcept;
+
+  raft::device_vector_view<uint32_t, uint32_t> list_sizes() noexcept;
+  raft::device_vector_view<const uint32_t, uint32_t> list_sizes() const noexcept;
+
+  raft::device_matrix_view<float, uint32_t, raft::row_major> centers() noexcept;
+  raft::device_matrix_view<const float, uint32_t, raft::row_major> centers() const noexcept;
+
+  std::optional<raft::device_vector_view<float, uint32_t>> center_norms() noexcept;
+  std::optional<raft::device_vector_view<const float, uint32_t>> center_norms() const noexcept;
+  void allocate_center_norms(raft::resources const& res);
+
+  raft::device_vector_view<float, uint32_t> sq_vmin() noexcept;
+  raft::device_vector_view<const float, uint32_t> sq_vmin() const noexcept;
+
+  raft::device_vector_view<float, uint32_t> sq_delta() noexcept;
+  raft::device_vector_view<const float, uint32_t> sq_delta() const noexcept;
+
+  raft::host_vector_view<int64_t, uint32_t> accum_sorted_sizes() noexcept;
+  [[nodiscard]] raft::host_vector_view<const int64_t, uint32_t> accum_sorted_sizes() const noexcept;
+
+  raft::device_vector_view<IdxT*, uint32_t> data_ptrs() noexcept;
+  raft::device_vector_view<IdxT* const, uint32_t> data_ptrs() const noexcept;
+
+  raft::device_vector_view<int64_t*, uint32_t> inds_ptrs() noexcept;
+  raft::device_vector_view<int64_t* const, uint32_t> inds_ptrs() const noexcept;
+
+  std::vector<std::shared_ptr<list_data<IdxT, int64_t>>>& lists() noexcept;
+  const std::vector<std::shared_ptr<list_data<IdxT, int64_t>>>& lists() const noexcept;
+
+  void check_consistency();
+
+ private:
+  cuvs::distance::DistanceType metric_;
+  bool adaptive_centers_;
+  bool conservative_memory_allocation_;
+
+  std::vector<std::shared_ptr<list_data<IdxT, int64_t>>> lists_;
+  raft::device_vector<uint32_t, uint32_t> list_sizes_;
+  raft::device_matrix<float, uint32_t, raft::row_major> centers_;
+  std::optional<raft::device_vector<float, uint32_t>> center_norms_;
+  raft::device_vector<float, uint32_t> sq_vmin_;
+  raft::device_vector<float, uint32_t> sq_delta_;
+
+  raft::device_vector<IdxT*, uint32_t> data_ptrs_;
+  raft::device_vector<int64_t*, uint32_t> inds_ptrs_;
+  raft::host_vector<int64_t, uint32_t> accum_sorted_sizes_;
+};
+
+/**
+ * @}
+ */
+
+/**
+ * @defgroup ivf_sq_cpp_index_build IVF-SQ index build
+ * @{
+ */
+
+auto build(raft::resources const& handle,
+           const cuvs::neighbors::ivf_sq::index_params& index_params,
+           raft::device_matrix_view<const float, int64_t, raft::row_major> dataset)
+  -> cuvs::neighbors::ivf_sq::index<uint8_t>;
+
+void build(raft::resources const& handle,
+           const cuvs::neighbors::ivf_sq::index_params& index_params,
+           raft::device_matrix_view<const float, int64_t, raft::row_major> dataset,
+           cuvs::neighbors::ivf_sq::index<uint8_t>& idx);
+
+auto build(raft::resources const& handle,
+           const cuvs::neighbors::ivf_sq::index_params& index_params,
+           raft::device_matrix_view<const half, int64_t, raft::row_major> dataset)
+  -> cuvs::neighbors::ivf_sq::index<uint8_t>;
+
+void build(raft::resources const& handle,
+           const cuvs::neighbors::ivf_sq::index_params& index_params,
+           raft::device_matrix_view<const half, int64_t, raft::row_major> dataset,
+           cuvs::neighbors::ivf_sq::index<uint8_t>& idx);
+
+auto build(raft::resources const& handle,
+           const cuvs::neighbors::ivf_sq::index_params& index_params,
+           raft::host_matrix_view<const float, int64_t, raft::row_major> dataset)
+  -> cuvs::neighbors::ivf_sq::index<uint8_t>;
+
+void build(raft::resources const& handle,
+           const cuvs::neighbors::ivf_sq::index_params& index_params,
+           raft::host_matrix_view<const float, int64_t, raft::row_major> dataset,
+           cuvs::neighbors::ivf_sq::index<uint8_t>& idx);
+
+auto build(raft::resources const& handle,
+           const cuvs::neighbors::ivf_sq::index_params& index_params,
+           raft::host_matrix_view<const half, int64_t, raft::row_major> dataset)
+  -> cuvs::neighbors::ivf_sq::index<uint8_t>;
+
+void build(raft::resources const& handle,
+           const cuvs::neighbors::ivf_sq::index_params& index_params,
+           raft::host_matrix_view<const half, int64_t, raft::row_major> dataset,
+           cuvs::neighbors::ivf_sq::index<uint8_t>& idx);
+
+/**
+ * @}
+ */
+
+/**
+ * @defgroup ivf_sq_cpp_index_extend IVF-SQ index extend
+ * @{
+ */
+
+auto extend(raft::resources const& handle,
+            raft::device_matrix_view<const float, int64_t, raft::row_major> new_vectors,
+            std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,
+            const cuvs::neighbors::ivf_sq::index<uint8_t>& orig_index)
+  -> cuvs::neighbors::ivf_sq::index<uint8_t>;
+
+void extend(raft::resources const& handle,
+            raft::device_matrix_view<const float, int64_t, raft::row_major> new_vectors,
+            std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,
+            cuvs::neighbors::ivf_sq::index<uint8_t>* idx);
+
+auto extend(raft::resources const& handle,
+            raft::device_matrix_view<const half, int64_t, raft::row_major> new_vectors,
+            std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,
+            const cuvs::neighbors::ivf_sq::index<uint8_t>& orig_index)
+  -> cuvs::neighbors::ivf_sq::index<uint8_t>;
+
+void extend(raft::resources const& handle,
+            raft::device_matrix_view<const half, int64_t, raft::row_major> new_vectors,
+            std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,
+            cuvs::neighbors::ivf_sq::index<uint8_t>* idx);
+
+auto extend(raft::resources const& handle,
+            raft::host_matrix_view<const float, int64_t, raft::row_major> new_vectors,
+            std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,
+            const cuvs::neighbors::ivf_sq::index<uint8_t>& orig_index)
+  -> cuvs::neighbors::ivf_sq::index<uint8_t>;
+
+void extend(raft::resources const& handle,
+            raft::host_matrix_view<const float, int64_t, raft::row_major> new_vectors,
+            std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,
+            cuvs::neighbors::ivf_sq::index<uint8_t>* idx);
+
+auto extend(raft::resources const& handle,
+            raft::host_matrix_view<const half, int64_t, raft::row_major> new_vectors,
+            std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,
+            const cuvs::neighbors::ivf_sq::index<uint8_t>& orig_index)
+  -> cuvs::neighbors::ivf_sq::index<uint8_t>;
+
+void extend(raft::resources const& handle,
+            raft::host_matrix_view<const half, int64_t, raft::row_major> new_vectors,
+            std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,
+            cuvs::neighbors::ivf_sq::index<uint8_t>* idx);
+
+/**
+ * @}
+ */
+
+/**
+ * @defgroup ivf_sq_cpp_index_search IVF-SQ index search
+ * @{
+ */
+
+void search(raft::resources const& handle,
+            const cuvs::neighbors::ivf_sq::search_params& params,
+            const cuvs::neighbors::ivf_sq::index<uint8_t>& index,
+            raft::device_matrix_view<const float, int64_t, raft::row_major> queries,
+            raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors,
+            raft::device_matrix_view<float, int64_t, raft::row_major> distances,
+            const cuvs::neighbors::filtering::base_filter& sample_filter =
+              cuvs::neighbors::filtering::none_sample_filter{});
+
+void search(raft::resources const& handle,
+            const cuvs::neighbors::ivf_sq::search_params& params,
+            const cuvs::neighbors::ivf_sq::index<uint8_t>& index,
+            raft::device_matrix_view<const half, int64_t, raft::row_major> queries,
+            raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors,
+            raft::device_matrix_view<float, int64_t, raft::row_major> distances,
+            const cuvs::neighbors::filtering::base_filter& sample_filter =
+              cuvs::neighbors::filtering::none_sample_filter{});
+
+/**
+ * @}
+ */
+
+/**
+ * @defgroup ivf_sq_cpp_index_serialize IVF-SQ index serialize
+ * @{
+ */
+
+void serialize(raft::resources const& handle,
+               const std::string& filename,
+               const cuvs::neighbors::ivf_sq::index<uint8_t>& index);
+
+void deserialize(raft::resources const& handle,
+                 const std::string& filename,
+                 cuvs::neighbors::ivf_sq::index<uint8_t>* index);
+
+/**
+ * @}
+ */
+
+}  // namespace cuvs::neighbors::ivf_sq
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
new file mode 100644
index 0000000000..6c46a20e65
--- /dev/null
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
@@ -0,0 +1,664 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "../../core/nvtx.hpp"
+#include "../ivf_common.cuh"
+#include "../ivf_list.cuh"
+
+#include <cuvs/cluster/kmeans.hpp>
+#include <cuvs/neighbors/common.hpp>
+#include <cuvs/neighbors/ivf_sq.hpp>
+
+#include "../../cluster/kmeans_balanced.cuh"
+#include "../detail/ann_utils.cuh"
+#include <cuvs/distance/distance.hpp>
+#include <raft/core/logger.hpp>
+#include <raft/core/mdarray.hpp>
+#include <raft/core/operators.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/cuda_stream_pool.hpp>
+#include <raft/core/resource/device_memory_resource.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/linalg/add.cuh>
+#include <raft/linalg/map.cuh>
+#include <raft/linalg/norm.cuh>
+#include <raft/stats/histogram.cuh>
+#include <raft/util/pow2_utils.cuh>
+
+#include <cub/block/block_reduce.cuh>
+
+#include <rmm/cuda_stream_view.hpp>
+
+#include <cstdint>
+#include <limits>
+
+namespace cuvs::neighbors::ivf_sq {
+using namespace cuvs::spatial::knn::detail;  // NOLINT
+
+namespace detail {
+
+struct ColMinMaxPair {
+  float min_val;
+  float max_val;
+};
+
+struct ColMinMaxOp {
+  __device__ __forceinline__ ColMinMaxPair operator()(const ColMinMaxPair& a,
+                                                      const ColMinMaxPair& b) const
+  {
+    return {fminf(a.min_val, b.min_val), fmaxf(a.max_val, b.max_val)};
+  }
+};
+
+/**
+ * Fused per-column min+max in a single pass (2x less DRAM traffic than two
+ * separate reductions).  One thread block per column; threads stride over
+ * rows and feed CUB BlockReduce with a combined min/max pair.
+ *
+ * Row-loop is manually 4x-unrolled so the compiler can overlap four
+ * independent __ldg requests in the memory pipeline.
+ */
+template <int BlockSize>
+__launch_bounds__(BlockSize) RAFT_KERNEL fused_column_minmax_kernel(const float* __restrict__ data,
+                                                                    float* __restrict__ col_min,
+                                                                    float* __restrict__ col_max,
+                                                                    int64_t n_rows,
+                                                                    uint32_t dim)
+{
+  using BlockReduce = cub::BlockReduce<ColMinMaxPair, BlockSize>;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+
+  const uint32_t col = blockIdx.x;
+  if (col >= dim) return;
+
+  ColMinMaxPair agg = {std::numeric_limits<float>::max(), std::numeric_limits<float>::lowest()};
+
+  const int64_t stride = static_cast<int64_t>(BlockSize);
+  int64_t row          = static_cast<int64_t>(threadIdx.x);
+
+  for (; row + 3 * stride < n_rows; row += 4 * stride) {
+    float v0    = __ldg(&data[row * dim + col]);
+    float v1    = __ldg(&data[(row + stride) * dim + col]);
+    float v2    = __ldg(&data[(row + 2 * stride) * dim + col]);
+    float v3    = __ldg(&data[(row + 3 * stride) * dim + col]);
+    agg.min_val = fminf(agg.min_val, fminf(fminf(v0, v1), fminf(v2, v3)));
+    agg.max_val = fmaxf(agg.max_val, fmaxf(fmaxf(v0, v1), fmaxf(v2, v3)));
+  }
+  for (; row < n_rows; row += stride) {
+    float val   = __ldg(&data[row * dim + col]);
+    agg.min_val = fminf(agg.min_val, val);
+    agg.max_val = fmaxf(agg.max_val, val);
+  }
+
+  agg = BlockReduce(temp_storage).Reduce(agg, ColMinMaxOp());
+
+  if (threadIdx.x == 0) {
+    col_min[col] = agg.min_val;
+    col_max[col] = agg.max_val;
+  }
+}
+
+template <typename IdxT>
+auto clone(const raft::resources& res, const index<IdxT>& source) -> index<IdxT>
+{
+  auto stream = raft::resource::get_cuda_stream(res);
+
+  index<IdxT> target(res,
+                     source.metric(),
+                     source.n_lists(),
+                     source.dim(),
+                     source.adaptive_centers(),
+                     source.conservative_memory_allocation());
+
+  raft::copy(target.list_sizes().data_handle(),
+             source.list_sizes().data_handle(),
+             source.list_sizes().size(),
+             stream);
+  raft::copy(target.centers().data_handle(),
+             source.centers().data_handle(),
+             source.centers().size(),
+             stream);
+  if (source.center_norms().has_value()) {
+    target.allocate_center_norms(res);
+    raft::copy(target.center_norms()->data_handle(),
+               source.center_norms()->data_handle(),
+               source.center_norms()->size(),
+               stream);
+  }
+  raft::copy(target.sq_vmin().data_handle(),
+             source.sq_vmin().data_handle(),
+             source.sq_vmin().size(),
+             stream);
+  raft::copy(target.sq_delta().data_handle(),
+             source.sq_delta().data_handle(),
+             source.sq_delta().size(),
+             stream);
+  target.lists() = source.lists();
+  ivf::detail::recompute_internal_state(res, target);
+  return target;
+}
+
+/**
+ * Kernel to encode float residuals to uint8_t SQ codes and write them interleaved.
+ *
+ * Uses warp-per-vector parallelism: each warp cooperatively encodes one vector
+ * so that reads from residuals/vmin/delta are coalesced across the 32 lanes.
+ * Lane 0 handles the atomic position assignment and the index write.
+ */
+template <int BlockSize>
+__launch_bounds__(BlockSize) RAFT_KERNEL encode_and_fill_kernel(const uint32_t* labels,
+                                                                const float* residuals,
+                                                                const int64_t* source_ixs,
+                                                                uint8_t** list_data_ptrs,
+                                                                int64_t** list_index_ptrs,
+                                                                uint32_t* list_sizes_ptr,
+                                                                const float* vmin,
+                                                                const float* delta,
+                                                                int64_t n_rows,
+                                                                uint32_t dim,
+                                                                int64_t batch_offset)
+{
+  constexpr uint32_t kWarpSize      = kIndexGroupSize;
+  constexpr uint32_t kWarpsPerBlock = BlockSize / kWarpSize;
+
+  const uint32_t lane_id = threadIdx.x % kWarpSize;
+  const int64_t row_id =
+    int64_t(threadIdx.x / kWarpSize) + int64_t(blockIdx.x) * int64_t(kWarpsPerBlock);
+  if (row_id >= n_rows) return;
+
+  uint32_t list_id   = 0;
+  uint32_t inlist_id = 0;
+  if (lane_id == 0) {
+    auto source_ix = source_ixs == nullptr ? row_id + batch_offset : source_ixs[row_id];
+    list_id        = labels[row_id];
+    inlist_id      = atomicAdd(list_sizes_ptr + list_id, 1);
+    list_index_ptrs[list_id][inlist_id] = source_ix;
+  }
+  list_id   = __shfl_sync(0xFFFFFFFF, list_id, 0);
+  inlist_id = __shfl_sync(0xFFFFFFFF, inlist_id, 0);
+
+  using interleaved_group = raft::Pow2<kIndexGroupSize>;
+  auto group_offset       = interleaved_group::roundDown(inlist_id);
+  auto ingroup_id         = interleaved_group::mod(inlist_id);
+
+  constexpr uint32_t veclen = list_spec<uint32_t, uint8_t, int64_t>::kVecLen;
+  uint32_t padded_dim       = ((dim + veclen - 1) / veclen) * veclen;
+  auto* list_dat   = list_data_ptrs[list_id] + static_cast<size_t>(group_offset) * padded_dim;
+  const float* src = residuals + row_id * dim;
+
+  for (uint32_t d = lane_id; d < padded_dim; d += kWarpSize) {
+    uint8_t out;
+    if (d < dim) {
+      float val  = src[d];
+      float dv   = delta[d];
+      float v    = vmin[d];
+      float code = (dv > 0.0f) ? roundf((val - v) / dv) : 0.0f;
+      out        = static_cast<uint8_t>(fminf(fmaxf(code, 0.0f), 255.0f));
+    } else {
+      out = 0;
+    }
+    uint32_t l                                              = (d / veclen) * veclen;
+    uint32_t j                                              = d % veclen;
+    list_dat[l * kIndexGroupSize + ingroup_id * veclen + j] = out;
+  }
+}
+
+/**
+ * Compute residuals: residual[i] = cast<float>(x_i) - centers[labels[i]]
+ */
+template <typename T>
+RAFT_KERNEL compute_residuals_kernel(const T* dataset,
+                                     const float* centers,
+                                     const uint32_t* labels,
+                                     float* residuals,
+                                     int64_t n_rows,
+                                     uint32_t dim)
+{
+  int64_t i  = int64_t(blockIdx.x) * blockDim.x + threadIdx.x;
+  uint32_t j = blockIdx.y * blockDim.y + threadIdx.y;
+  if (i >= n_rows || j >= dim) return;
+
+  float val              = utils::mapping<float>{}(dataset[i * dim + j]);
+  uint32_t c             = labels[i];
+  residuals[i * dim + j] = val - centers[c * dim + j];
+}
+
+template <typename T, typename IdxT>
+void extend(raft::resources const& handle,
+            index<IdxT>* index,
+            const T* new_vectors,
+            const int64_t* new_indices,
+            int64_t n_rows)
+{
+  using LabelT = uint32_t;
+  RAFT_EXPECTS(index != nullptr, "index cannot be empty.");
+  if (n_rows == 0) return;
+
+  auto stream  = raft::resource::get_cuda_stream(handle);
+  auto n_lists = index->n_lists();
+  auto dim     = index->dim();
+  list_spec<uint32_t, IdxT, int64_t> list_device_spec{index->dim(),
+                                                      index->conservative_memory_allocation()};
+  cuvs::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
+    "ivf_sq::extend(%zu, %u)", size_t(n_rows), dim);
+
+  RAFT_EXPECTS(new_indices != nullptr || index->size() == 0,
+               "You must pass data indices when the index is non-empty.");
+
+  auto new_labels =
+    raft::make_device_mdarray<LabelT>(handle,
+                                      raft::resource::get_large_workspace_resource(handle),
+                                      raft::make_extents<int64_t>(n_rows));
+  cuvs::cluster::kmeans::balanced_params kmeans_params;
+  kmeans_params.metric     = index->metric();
+  auto orig_centroids_view = raft::make_device_matrix_view<const float, int64_t>(
+    index->centers().data_handle(), n_lists, dim);
+
+  constexpr size_t kReasonableMaxBatchSize = 65536;
+  size_t max_batch_size                    = std::min<size_t>(n_rows, kReasonableMaxBatchSize);
+
+  auto copy_stream     = raft::resource::get_cuda_stream(handle);
+  bool enable_prefetch = false;
+  if (handle.has_resource_factory(raft::resource::resource_type::CUDA_STREAM_POOL)) {
+    if (raft::resource::get_stream_pool_size(handle) >= 1) {
+      enable_prefetch = true;
+      copy_stream     = raft::resource::get_stream_from_stream_pool(handle);
+    }
+  }
+
+  utils::batch_load_iterator<T> vec_batches(new_vectors,
+                                            n_rows,
+                                            index->dim(),
+                                            max_batch_size,
+                                            copy_stream,
+                                            raft::resource::get_workspace_resource(handle),
+                                            enable_prefetch);
+  vec_batches.prefetch_next_batch();
+
+  for (const auto& batch : vec_batches) {
+    auto batch_data_view =
+      raft::make_device_matrix_view<const T, int64_t>(batch.data(), batch.size(), index->dim());
+    auto batch_labels_view = raft::make_device_vector_view<LabelT, int64_t>(
+      new_labels.data_handle() + batch.offset(), batch.size());
+    cuvs::cluster::kmeans::predict(
+      handle, kmeans_params, batch_data_view, orig_centroids_view, batch_labels_view);
+    vec_batches.prefetch_next_batch();
+    raft::resource::sync_stream(handle);
+  }
+
+  auto* list_sizes_ptr    = index->list_sizes().data_handle();
+  auto old_list_sizes_dev = raft::make_device_mdarray<uint32_t>(
+    handle, raft::resource::get_workspace_resource(handle), raft::make_extents<int64_t>(n_lists));
+  raft::copy(old_list_sizes_dev.data_handle(), list_sizes_ptr, n_lists, stream);
+
+  if (index->adaptive_centers()) {
+    auto centroids_view = raft::make_device_matrix_view<float, int64_t>(
+      index->centers().data_handle(), index->centers().extent(0), index->centers().extent(1));
+    auto list_sizes_view =
+      raft::make_device_vector_view<std::remove_pointer_t<decltype(list_sizes_ptr)>, int64_t>(
+        list_sizes_ptr, n_lists);
+    for (const auto& batch : vec_batches) {
+      auto batch_data_view =
+        raft::make_device_matrix_view<const T, int64_t>(batch.data(), batch.size(), index->dim());
+      auto batch_labels_view = raft::make_device_vector_view<const LabelT, int64_t>(
+        new_labels.data_handle() + batch.offset(), batch.size());
+      cuvs::cluster::kmeans_balanced::helpers::calc_centers_and_sizes(handle,
+                                                                      batch_data_view,
+                                                                      batch_labels_view,
+                                                                      centroids_view,
+                                                                      list_sizes_view,
+                                                                      false,
+                                                                      utils::mapping<float>{});
+    }
+  } else {
+    raft::stats::histogram<uint32_t, int64_t>(raft::stats::HistTypeAuto,
+                                              reinterpret_cast<int32_t*>(list_sizes_ptr),
+                                              int64_t(n_lists),
+                                              new_labels.data_handle(),
+                                              n_rows,
+                                              1,
+                                              stream);
+    raft::linalg::add(
+      list_sizes_ptr, list_sizes_ptr, old_list_sizes_dev.data_handle(), n_lists, stream);
+  }
+
+  std::vector<uint32_t> new_list_sizes(n_lists);
+  std::vector<uint32_t> old_list_sizes(n_lists);
+  {
+    raft::copy(old_list_sizes.data(), old_list_sizes_dev.data_handle(), n_lists, stream);
+    raft::copy(new_list_sizes.data(), list_sizes_ptr, n_lists, stream);
+    raft::resource::sync_stream(handle);
+    auto& lists = index->lists();
+    for (uint32_t label = 0; label < n_lists; label++) {
+      ivf::resize_list(handle,
+                       lists[label],
+                       list_device_spec,
+                       new_list_sizes[label],
+                       raft::Pow2<kIndexGroupSize>::roundUp(old_list_sizes[label]));
+    }
+  }
+  ivf::detail::recompute_internal_state(handle, *index);
+  raft::copy(list_sizes_ptr, old_list_sizes_dev.data_handle(), n_lists, stream);
+
+  utils::batch_load_iterator<int64_t> vec_indices(
+    new_indices, n_rows, 1, max_batch_size, stream, raft::resource::get_workspace_resource(handle));
+  vec_batches.reset();
+  vec_batches.prefetch_next_batch();
+  utils::batch_load_iterator<int64_t> idx_batch = vec_indices.begin();
+
+  auto residuals_buf = raft::make_device_vector<float>(handle, max_batch_size * dim);
+
+  size_t next_report_offset = 0;
+  size_t d_report_offset    = n_rows * 5 / 100;
+
+  for (const auto& batch : vec_batches) {
+    int64_t bs = batch.size();
+
+    {
+      dim3 threads(32, 8);
+      dim3 blocks(raft::ceildiv<int64_t>(bs, threads.x), raft::ceildiv<uint32_t>(dim, threads.y));
+      compute_residuals_kernel<T>
+        <<<blocks, threads, 0, stream>>>(batch.data(),
+                                         index->centers().data_handle(),
+                                         new_labels.data_handle() + batch.offset(),
+                                         residuals_buf.data_handle(),
+                                         bs,
+                                         dim);
+      RAFT_CUDA_TRY(cudaPeekAtLastError());
+    }
+
+    {
+      constexpr int kEncodeBlockSize   = 256;
+      constexpr int kEncodeWarpsPerBlk = kEncodeBlockSize / kIndexGroupSize;
+      const dim3 block_dim(kEncodeBlockSize);
+      const dim3 grid_dim(raft::ceildiv<int64_t>(bs, int64_t(kEncodeWarpsPerBlk)));
+      encode_and_fill_kernel<kEncodeBlockSize>
+        <<<grid_dim, block_dim, 0, stream>>>(new_labels.data_handle() + batch.offset(),
+                                             residuals_buf.data_handle(),
+                                             idx_batch->data(),
+                                             index->data_ptrs().data_handle(),
+                                             index->inds_ptrs().data_handle(),
+                                             list_sizes_ptr,
+                                             index->sq_vmin().data_handle(),
+                                             index->sq_delta().data_handle(),
+                                             bs,
+                                             dim,
+                                             batch.offset());
+      RAFT_CUDA_TRY(cudaPeekAtLastError());
+    }
+
+    vec_batches.prefetch_next_batch();
+    raft::resource::sync_stream(handle);
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
+
+    if (batch.offset() > next_report_offset) {
+      float progress = batch.offset() * 100.0f / n_rows;
+      RAFT_LOG_DEBUG("ivf_sq::extend added vectors %zu, %6.1f%% complete",
+                     static_cast<size_t>(batch.offset()),
+                     progress);
+      next_report_offset += d_report_offset;
+    }
+    ++idx_batch;
+  }
+
+  auto compute_center_norms = [&]() {
+    if (index->metric() == cuvs::distance::DistanceType::CosineExpanded) {
+      raft::linalg::rowNorm<raft::linalg::L2Norm, true>(index->center_norms()->data_handle(),
+                                                        index->centers().data_handle(),
+                                                        dim,
+                                                        n_lists,
+                                                        stream,
+                                                        raft::sqrt_op{});
+    } else {
+      raft::linalg::rowNorm<raft::linalg::L2Norm, true>(
+        index->center_norms()->data_handle(), index->centers().data_handle(), dim, n_lists, stream);
+    }
+  };
+
+  if (!index->center_norms().has_value()) {
+    index->allocate_center_norms(handle);
+    if (index->center_norms().has_value()) { compute_center_norms(); }
+  } else if (index->adaptive_centers()) {
+    compute_center_norms();
+  }
+}
+
+template <typename T, typename IdxT>
+auto extend(raft::resources const& handle,
+            const index<IdxT>& orig_index,
+            const T* new_vectors,
+            const int64_t* new_indices,
+            int64_t n_rows) -> index<IdxT>
+{
+  auto ext_index = clone(handle, orig_index);
+  detail::extend(handle, &ext_index, new_vectors, new_indices, n_rows);
+  return ext_index;
+}
+
+template <typename T, typename IdxT>
+inline auto build(raft::resources const& handle,
+                  const index_params& params,
+                  const T* dataset,
+                  int64_t n_rows,
+                  uint32_t dim) -> index<IdxT>
+{
+  auto stream = raft::resource::get_cuda_stream(handle);
+  cuvs::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
+    "ivf_sq::build(%zu, %u)", size_t(n_rows), dim);
+  static_assert(std::is_same_v<T, float> || std::is_same_v<T, half>, "unsupported data type");
+  RAFT_EXPECTS(n_rows > 0 && dim > 0, "empty dataset");
+  RAFT_EXPECTS(n_rows >= params.n_lists, "number of rows can't be less than n_lists");
+  RAFT_EXPECTS(params.metric != cuvs::distance::DistanceType::CosineExpanded || dim > 1,
+               "Cosine metric requires more than one dim");
+
+  index<IdxT> idx(handle, params, dim);
+  utils::memzero(idx.accum_sorted_sizes().data_handle(), idx.accum_sorted_sizes().size(), stream);
+  utils::memzero(idx.list_sizes().data_handle(), idx.list_sizes().size(), stream);
+  utils::memzero(idx.data_ptrs().data_handle(), idx.data_ptrs().size(), stream);
+  utils::memzero(idx.inds_ptrs().data_handle(), idx.inds_ptrs().size(), stream);
+
+  // Train k-means centroids and SQ parameters on the same training subset.
+  // This mirrors IVF-PQ, which also trains its codebook on a subset of the data.
+  {
+    auto trainset_ratio = std::max<size_t>(
+      1, n_rows / std::max<size_t>(params.kmeans_trainset_fraction * n_rows, idx.n_lists()));
+    auto n_rows_train = n_rows / trainset_ratio;
+    rmm::device_uvector<T> trainset(
+      n_rows_train * idx.dim(), stream, raft::resource::get_large_workspace_resource(handle));
+    RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset.data(),
+                                    sizeof(T) * idx.dim(),
+                                    dataset,
+                                    sizeof(T) * idx.dim() * trainset_ratio,
+                                    sizeof(T) * idx.dim(),
+                                    n_rows_train,
+                                    cudaMemcpyDefault,
+                                    stream));
+    auto trainset_const_view =
+      raft::make_device_matrix_view<const T, int64_t>(trainset.data(), n_rows_train, idx.dim());
+    auto centers_view = raft::make_device_matrix_view<float, int64_t>(
+      idx.centers().data_handle(), idx.n_lists(), idx.dim());
+    cuvs::cluster::kmeans::balanced_params kmeans_params;
+    kmeans_params.n_iters = params.kmeans_n_iters;
+    kmeans_params.metric  = idx.metric();
+    cuvs::cluster::kmeans::fit(handle, kmeans_params, trainset_const_view, centers_view);
+    raft::resource::sync_stream(handle);
+
+    // Train SQ: predict labels for the training subset, compute its residuals,
+    // and derive per-dimension vmin/delta from them.
+    auto train_labels = raft::make_device_vector<uint32_t, int64_t>(handle, n_rows_train);
+    {
+      cuvs::cluster::kmeans::balanced_params pred_params;
+      pred_params.metric      = idx.metric();
+      auto centers_const_view = raft::make_device_matrix_view<const float, int64_t>(
+        idx.centers().data_handle(), idx.n_lists(), dim);
+      cuvs::cluster::kmeans::predict(
+        handle, pred_params, trainset_const_view, centers_const_view, train_labels.view());
+      raft::resource::sync_stream(handle);
+    }
+
+    rmm::device_uvector<float> residuals(
+      n_rows_train * dim, stream, raft::resource::get_large_workspace_resource(handle));
+    {
+      dim3 threads(32, 8);
+      dim3 blocks(raft::ceildiv<int64_t>(n_rows_train, threads.x),
+                  raft::ceildiv<uint32_t>(dim, threads.y));
+      compute_residuals_kernel<T><<<blocks, threads, 0, stream>>>(trainset.data(),
+                                                                  idx.centers().data_handle(),
+                                                                  train_labels.data_handle(),
+                                                                  residuals.data(),
+                                                                  n_rows_train,
+                                                                  dim);
+      RAFT_CUDA_TRY(cudaPeekAtLastError());
+    }
+
+    {
+      auto vmax_buf  = raft::make_device_vector<float, uint32_t>(handle, dim);
+      auto* vmin_ptr = idx.sq_vmin().data_handle();
+      auto* vmax_ptr = vmax_buf.data_handle();
+
+      constexpr int kMinMaxBlockSize = 256;
+      fused_column_minmax_kernel<kMinMaxBlockSize><<<dim, kMinMaxBlockSize, 0, stream>>>(
+        residuals.data(), vmin_ptr, vmax_ptr, n_rows_train, dim);
+      RAFT_CUDA_TRY(cudaPeekAtLastError());
+
+      // Expand the observed range by a small margin to reduce clipping on unseen data,
+      // since the SQ parameters are trained on a subset rather than the full dataset.
+      constexpr float kMargin = 0.05f;
+      auto* delta_ptr         = idx.sq_delta().data_handle();
+      raft::linalg::map_offset(
+        handle, idx.sq_vmin(), [vmin_ptr, vmax_ptr, delta_ptr, kMargin] __device__(uint32_t j) {
+          float range  = vmax_ptr[j] - vmin_ptr[j];
+          float margin = range * kMargin;
+          delta_ptr[j] = (range > 0.0f) ? (range + 2.0f * margin) / 255.0f : 1.0f;
+          return vmin_ptr[j] - margin;
+        });
+    }
+  }
+
+  if (params.add_data_on_build) { detail::extend<T, IdxT>(handle, &idx, dataset, nullptr, n_rows); }
+
+  return idx;
+}
+
+template <typename T, typename IdxT>
+auto build(raft::resources const& handle,
+           const index_params& params,
+           raft::device_matrix_view<const T, int64_t, raft::row_major> dataset) -> index<IdxT>
+{
+  int64_t n_rows = dataset.extent(0);
+  uint32_t dim   = dataset.extent(1);
+  return build<T, IdxT>(handle, params, dataset.data_handle(), n_rows, dim);
+}
+
+template <typename T, typename IdxT>
+auto build(raft::resources const& handle,
+           const index_params& params,
+           raft::host_matrix_view<const T, int64_t, raft::row_major> dataset) -> index<IdxT>
+{
+  int64_t n_rows = dataset.extent(0);
+  uint32_t dim   = dataset.extent(1);
+  return build<T, IdxT>(handle, params, dataset.data_handle(), n_rows, dim);
+}
+
+template <typename T, typename IdxT>
+void build(raft::resources const& handle,
+           const index_params& params,
+           raft::device_matrix_view<const T, int64_t, raft::row_major> dataset,
+           index<IdxT>& idx)
+{
+  idx = build<T, IdxT>(handle, params, dataset);
+}
+
+template <typename T, typename IdxT>
+void build(raft::resources const& handle,
+           const index_params& params,
+           raft::host_matrix_view<const T, int64_t, raft::row_major> dataset,
+           index<IdxT>& idx)
+{
+  idx = build<T, IdxT>(handle, params, dataset);
+}
+
+template <typename T, typename IdxT>
+auto extend(raft::resources const& handle,
+            raft::device_matrix_view<const T, int64_t, raft::row_major> new_vectors,
+            std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,
+            const index<IdxT>& orig_index) -> index<IdxT>
+{
+  RAFT_EXPECTS(new_vectors.extent(1) == orig_index.dim(),
+               "new_vectors should have the same dimension as the index");
+  if (new_indices.has_value()) {
+    RAFT_EXPECTS(new_indices.value().extent(0) == new_vectors.extent(0),
+                 "new_vectors and new_indices have different number of rows");
+  }
+  int64_t n_rows = new_vectors.extent(0);
+  return extend<T, IdxT>(handle,
+                         orig_index,
+                         new_vectors.data_handle(),
+                         new_indices.has_value() ? new_indices.value().data_handle() : nullptr,
+                         n_rows);
+}
+
+template <typename T, typename IdxT>
+auto extend(raft::resources const& handle,
+            raft::host_matrix_view<const T, int64_t, raft::row_major> new_vectors,
+            std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,
+            const index<IdxT>& orig_index) -> index<IdxT>
+{
+  RAFT_EXPECTS(new_vectors.extent(1) == orig_index.dim(),
+               "new_vectors should have the same dimension as the index");
+  if (new_indices.has_value()) {
+    RAFT_EXPECTS(new_indices.value().extent(0) == new_vectors.extent(0),
+                 "new_vectors and new_indices have different number of rows");
+  }
+  int64_t n_rows = new_vectors.extent(0);
+  return extend<T, IdxT>(handle,
+                         orig_index,
+                         new_vectors.data_handle(),
+                         new_indices.has_value() ? new_indices.value().data_handle() : nullptr,
+                         n_rows);
+}
+
+template <typename T, typename IdxT>
+void extend(raft::resources const& handle,
+            raft::device_matrix_view<const T, int64_t, raft::row_major> new_vectors,
+            std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,
+            index<IdxT>* idx)
+{
+  RAFT_EXPECTS(new_vectors.extent(1) == idx->dim(),
+               "new_vectors should have the same dimension as the index");
+  if (new_indices.has_value()) {
+    RAFT_EXPECTS(new_indices.value().extent(0) == new_vectors.extent(0),
+                 "new_vectors and new_indices have different number of rows");
+  }
+  detail::extend(handle,
+                 idx,
+                 new_vectors.data_handle(),
+                 new_indices.has_value() ? new_indices.value().data_handle() : nullptr,
+                 new_vectors.extent(0));
+}
+
+template <typename T, typename IdxT>
+void extend(raft::resources const& handle,
+            raft::host_matrix_view<const T, int64_t, raft::row_major> new_vectors,
+            std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,
+            index<IdxT>* idx)
+{
+  RAFT_EXPECTS(new_vectors.extent(1) == idx->dim(),
+               "new_vectors should have the same dimension as the index");
+  if (new_indices.has_value()) {
+    RAFT_EXPECTS(new_indices.value().extent(0) == new_vectors.extent(0),
+                 "new_vectors and new_indices have different number of rows");
+  }
+  detail::extend(handle,
+                 idx,
+                 new_vectors.data_handle(),
+                 new_indices.has_value() ? new_indices.value().data_handle() : nullptr,
+                 new_vectors.extent(0));
+}
+
+}  // namespace detail
+}  // namespace cuvs::neighbors::ivf_sq
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_float_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_float_uint8_t_int64_t.cu
new file mode 100644
index 0000000000..a97aebb11c
--- /dev/null
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_float_uint8_t_int64_t.cu
@@ -0,0 +1,89 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <cuvs/neighbors/ivf_sq.hpp>
+
+#include "ivf_sq_build.cuh"
+
+namespace cuvs::neighbors::ivf_sq {
+
+#define CUVS_INST_IVF_SQ_BUILD_EXTEND(T, IdxT)                                               \
+  auto build(raft::resources const& handle,                                                  \
+             const cuvs::neighbors::ivf_sq::index_params& params,                            \
+             raft::device_matrix_view<const T, int64_t, raft::row_major> dataset)            \
+    -> cuvs::neighbors::ivf_sq::index<IdxT>                                                  \
+  {                                                                                          \
+    return cuvs::neighbors::ivf_sq::index<IdxT>(                                             \
+      std::move(cuvs::neighbors::ivf_sq::detail::build<T, IdxT>(handle, params, dataset)));  \
+  }                                                                                          \
+                                                                                             \
+  void build(raft::resources const& handle,                                                  \
+             const cuvs::neighbors::ivf_sq::index_params& params,                            \
+             raft::device_matrix_view<const T, int64_t, raft::row_major> dataset,            \
+             cuvs::neighbors::ivf_sq::index<IdxT>& idx)                                      \
+  {                                                                                          \
+    cuvs::neighbors::ivf_sq::detail::build<T, IdxT>(handle, params, dataset, idx);           \
+  }                                                                                          \
+                                                                                             \
+  auto build(raft::resources const& handle,                                                  \
+             const cuvs::neighbors::ivf_sq::index_params& params,                            \
+             raft::host_matrix_view<const T, int64_t, raft::row_major> dataset)              \
+    -> cuvs::neighbors::ivf_sq::index<IdxT>                                                  \
+  {                                                                                          \
+    return cuvs::neighbors::ivf_sq::index<IdxT>(                                             \
+      std::move(cuvs::neighbors::ivf_sq::detail::build<T, IdxT>(handle, params, dataset)));  \
+  }                                                                                          \
+                                                                                             \
+  void build(raft::resources const& handle,                                                  \
+             const cuvs::neighbors::ivf_sq::index_params& params,                            \
+             raft::host_matrix_view<const T, int64_t, raft::row_major> dataset,              \
+             cuvs::neighbors::ivf_sq::index<IdxT>& idx)                                      \
+  {                                                                                          \
+    cuvs::neighbors::ivf_sq::detail::build<T, IdxT>(handle, params, dataset, idx);           \
+  }                                                                                          \
+                                                                                             \
+  auto extend(raft::resources const& handle,                                                 \
+              raft::device_matrix_view<const T, int64_t, raft::row_major> new_vectors,       \
+              std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,   \
+              const cuvs::neighbors::ivf_sq::index<IdxT>& orig_index)                        \
+    -> cuvs::neighbors::ivf_sq::index<IdxT>                                                  \
+  {                                                                                          \
+    return cuvs::neighbors::ivf_sq::index<IdxT>(                                             \
+      std::move(cuvs::neighbors::ivf_sq::detail::extend<T, IdxT>(                            \
+        handle, new_vectors, new_indices, orig_index)));                                     \
+  }                                                                                          \
+                                                                                             \
+  void extend(raft::resources const& handle,                                                 \
+              raft::device_matrix_view<const T, int64_t, raft::row_major> new_vectors,       \
+              std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,   \
+              cuvs::neighbors::ivf_sq::index<IdxT>* idx)                                     \
+  {                                                                                          \
+    cuvs::neighbors::ivf_sq::detail::extend<T, IdxT>(handle, new_vectors, new_indices, idx); \
+  }                                                                                          \
+                                                                                             \
+  auto extend(raft::resources const& handle,                                                 \
+              raft::host_matrix_view<const T, int64_t, raft::row_major> new_vectors,         \
+              std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,     \
+              const cuvs::neighbors::ivf_sq::index<IdxT>& orig_index)                        \
+    -> cuvs::neighbors::ivf_sq::index<IdxT>                                                  \
+  {                                                                                          \
+    return cuvs::neighbors::ivf_sq::index<IdxT>(                                             \
+      std::move(cuvs::neighbors::ivf_sq::detail::extend<T, IdxT>(                            \
+        handle, new_vectors, new_indices, orig_index)));                                     \
+  }                                                                                          \
+                                                                                             \
+  void extend(raft::resources const& handle,                                                 \
+              raft::host_matrix_view<const T, int64_t, raft::row_major> new_vectors,         \
+              std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,     \
+              cuvs::neighbors::ivf_sq::index<IdxT>* idx)                                     \
+  {                                                                                          \
+    cuvs::neighbors::ivf_sq::detail::extend<T, IdxT>(handle, new_vectors, new_indices, idx); \
+  }
+
+CUVS_INST_IVF_SQ_BUILD_EXTEND(float, uint8_t);
+
+#undef CUVS_INST_IVF_SQ_BUILD_EXTEND
+
+}  // namespace cuvs::neighbors::ivf_sq
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_half_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_half_uint8_t_int64_t.cu
new file mode 100644
index 0000000000..9148e5c328
--- /dev/null
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_half_uint8_t_int64_t.cu
@@ -0,0 +1,89 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <cuvs/neighbors/ivf_sq.hpp>
+
+#include "ivf_sq_build.cuh"
+
+namespace cuvs::neighbors::ivf_sq {
+
+#define CUVS_INST_IVF_SQ_BUILD_EXTEND(T, IdxT)                                               \
+  auto build(raft::resources const& handle,                                                  \
+             const cuvs::neighbors::ivf_sq::index_params& params,                            \
+             raft::device_matrix_view<const T, int64_t, raft::row_major> dataset)            \
+    -> cuvs::neighbors::ivf_sq::index<IdxT>                                                  \
+  {                                                                                          \
+    return cuvs::neighbors::ivf_sq::index<IdxT>(                                             \
+      std::move(cuvs::neighbors::ivf_sq::detail::build<T, IdxT>(handle, params, dataset)));  \
+  }                                                                                          \
+                                                                                             \
+  void build(raft::resources const& handle,                                                  \
+             const cuvs::neighbors::ivf_sq::index_params& params,                            \
+             raft::device_matrix_view<const T, int64_t, raft::row_major> dataset,            \
+             cuvs::neighbors::ivf_sq::index<IdxT>& idx)                                      \
+  {                                                                                          \
+    cuvs::neighbors::ivf_sq::detail::build<T, IdxT>(handle, params, dataset, idx);           \
+  }                                                                                          \
+                                                                                             \
+  auto build(raft::resources const& handle,                                                  \
+             const cuvs::neighbors::ivf_sq::index_params& params,                            \
+             raft::host_matrix_view<const T, int64_t, raft::row_major> dataset)              \
+    -> cuvs::neighbors::ivf_sq::index<IdxT>                                                  \
+  {                                                                                          \
+    return cuvs::neighbors::ivf_sq::index<IdxT>(                                             \
+      std::move(cuvs::neighbors::ivf_sq::detail::build<T, IdxT>(handle, params, dataset)));  \
+  }                                                                                          \
+                                                                                             \
+  void build(raft::resources const& handle,                                                  \
+             const cuvs::neighbors::ivf_sq::index_params& params,                            \
+             raft::host_matrix_view<const T, int64_t, raft::row_major> dataset,              \
+             cuvs::neighbors::ivf_sq::index<IdxT>& idx)                                      \
+  {                                                                                          \
+    cuvs::neighbors::ivf_sq::detail::build<T, IdxT>(handle, params, dataset, idx);           \
+  }                                                                                          \
+                                                                                             \
+  auto extend(raft::resources const& handle,                                                 \
+              raft::device_matrix_view<const T, int64_t, raft::row_major> new_vectors,       \
+              std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,   \
+              const cuvs::neighbors::ivf_sq::index<IdxT>& orig_index)                        \
+    -> cuvs::neighbors::ivf_sq::index<IdxT>                                                  \
+  {                                                                                          \
+    return cuvs::neighbors::ivf_sq::index<IdxT>(                                             \
+      std::move(cuvs::neighbors::ivf_sq::detail::extend<T, IdxT>(                            \
+        handle, new_vectors, new_indices, orig_index)));                                     \
+  }                                                                                          \
+                                                                                             \
+  void extend(raft::resources const& handle,                                                 \
+              raft::device_matrix_view<const T, int64_t, raft::row_major> new_vectors,       \
+              std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,   \
+              cuvs::neighbors::ivf_sq::index<IdxT>* idx)                                     \
+  {                                                                                          \
+    cuvs::neighbors::ivf_sq::detail::extend<T, IdxT>(handle, new_vectors, new_indices, idx); \
+  }                                                                                          \
+                                                                                             \
+  auto extend(raft::resources const& handle,                                                 \
+              raft::host_matrix_view<const T, int64_t, raft::row_major> new_vectors,         \
+              std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,     \
+              const cuvs::neighbors::ivf_sq::index<IdxT>& orig_index)                        \
+    -> cuvs::neighbors::ivf_sq::index<IdxT>                                                  \
+  {                                                                                          \
+    return cuvs::neighbors::ivf_sq::index<IdxT>(                                             \
+      std::move(cuvs::neighbors::ivf_sq::detail::extend<T, IdxT>(                            \
+        handle, new_vectors, new_indices, orig_index)));                                     \
+  }                                                                                          \
+                                                                                             \
+  void extend(raft::resources const& handle,                                                 \
+              raft::host_matrix_view<const T, int64_t, raft::row_major> new_vectors,         \
+              std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,     \
+              cuvs::neighbors::ivf_sq::index<IdxT>* idx)                                     \
+  {                                                                                          \
+    cuvs::neighbors::ivf_sq::detail::extend<T, IdxT>(handle, new_vectors, new_indices, idx); \
+  }
+
+CUVS_INST_IVF_SQ_BUILD_EXTEND(half, uint8_t);
+
+#undef CUVS_INST_IVF_SQ_BUILD_EXTEND
+
+}  // namespace cuvs::neighbors::ivf_sq
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
new file mode 100644
index 0000000000..39c653b048
--- /dev/null
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
@@ -0,0 +1,549 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "../../core/nvtx.hpp"
+#include "../detail/ann_utils.cuh"
+#include "../ivf_common.cuh"
+#include "../sample_filter.cuh"
+#include <cuvs/neighbors/common.hpp>
+#include <cuvs/neighbors/ivf_sq.hpp>
+
+#include <cuvs/distance/distance.hpp>
+#include <cuvs/selection/select_k.hpp>
+#include <raft/core/error.hpp>
+#include <raft/core/logger.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/device_memory_resource.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/linalg/gemm.cuh>
+#include <raft/linalg/norm.cuh>
+#include <raft/linalg/unary_op.cuh>
+#include <raft/matrix/detail/select_warpsort.cuh>
+
+#include <rmm/resource_ref.hpp>
+
+#include <thrust/fill.h>
+
+namespace cuvs::neighbors::ivf_sq::detail {
+
+using namespace cuvs::spatial::knn::detail;  // NOLINT
+
+enum class SqScanMetric { kL2, kIP, kCosine };
+
+/**
+ * Per-probe scan kernel for IVF-SQ search.
+ *
+ * Grid: (n_queries, n_probes).  Each block handles one (query, probe) pair.
+ * Within a block, each warp processes one interleaved group of kIndexGroupSize
+ * (=32) vectors at a time, with each lane responsible for one vector.
+ * Dimension blocks of veclen=16 bytes are loaded as coalesced uint4 reads
+ * across the warp (32 lanes x 16 bytes = 512 bytes = 4 cache lines), giving
+ * full memory-bandwidth utilisation.
+ *
+ * Per-dimension constants that are invariant across rows are precomputed into
+ * shared memory so the hot loop only reads from smem + one uint4 per dim-block:
+ *
+ *   L2 / L2Sqrt:
+ *     s_query_term[d] = query[d] - centroid[d] - sq_vmin[d]
+ *     dist += (s_query_term[d] - code * s_sq_scale[d])^2
+ *
+ *   InnerProduct / Cosine:
+ *     s_query_term[d] = query[d]
+ *     s_recon_base[d] = centroid[d] + sq_vmin[d]
+ *     v_d   = s_recon_base[d] + code * s_sq_scale[d]
+ *     dist += s_query_term[d] * v_d
+ *
+ * Shared-memory layout adapts to the metric to avoid waste:
+ *   L2 / L2Sqrt       : [s_query_term | s_sq_scale]                (2 * dim floats)
+ *   InnerProduct/Cosine: [s_query_term | s_recon_base | s_sq_scale] (3 * dim floats)
+ */
+template <int BlockDim, SqScanMetric Metric, typename IdxT, typename IvfSampleFilterT>
+__launch_bounds__(BlockDim) RAFT_KERNEL ivf_sq_scan_kernel(const uint8_t* const* data_ptrs,
+                                                           const uint32_t* list_sizes,
+                                                           const uint32_t* coarse_indices,
+                                                           const float* queries_float,
+                                                           const float* centers,
+                                                           const float* sq_vmin,
+                                                           const float* sq_delta,
+                                                           const float* query_norms,
+                                                           uint32_t n_probes,
+                                                           uint32_t dim,
+                                                           uint32_t max_samples,
+                                                           const uint32_t* chunk_indices,
+                                                           float* out_distances,
+                                                           uint32_t* out_indices,
+                                                           IvfSampleFilterT sample_filter)
+{
+  static_assert(kIndexGroupSize == raft::WarpSize,
+                "Warp-coalesced scan requires kIndexGroupSize == WarpSize");
+
+  extern __shared__ float smem[];
+
+  constexpr bool kIsL2     = (Metric == SqScanMetric::kL2);
+  constexpr bool kIsCosine = (Metric == SqScanMetric::kCosine);
+
+  float* s_query_term = smem;
+  float* s_recon_base = smem + dim;
+  float* s_sq_scale   = kIsL2 ? (smem + dim) : (smem + 2 * dim);
+
+  const uint32_t query_ix = blockIdx.x;
+  const uint32_t probe_ix = blockIdx.y;
+
+  const uint32_t* my_coarse = coarse_indices + query_ix * n_probes;
+  const uint32_t cluster_id = my_coarse[probe_ix];
+  const uint32_t cluster_sz = list_sizes[cluster_id];
+  if (cluster_sz == 0) return;
+
+  const uint8_t* codes  = data_ptrs[cluster_id];
+  const float* query    = queries_float + query_ix * dim;
+  const float* centroid = centers + cluster_id * dim;
+
+  for (uint32_t d = threadIdx.x; d < dim; d += BlockDim) {
+    float vmin_d  = sq_vmin[d];
+    s_sq_scale[d] = sq_delta[d];
+    if constexpr (kIsL2) {
+      s_query_term[d] = query[d] - centroid[d] - vmin_d;
+    } else {
+      s_query_term[d] = query[d];
+      s_recon_base[d] = centroid[d] + vmin_d;
+    }
+  }
+  __syncthreads();
+
+  const uint32_t* my_chunk = chunk_indices + query_ix * n_probes;
+  uint32_t out_base        = (probe_ix > 0) ? my_chunk[probe_ix - 1] : 0;
+
+  constexpr uint32_t veclen         = 16;
+  constexpr uint32_t kWarpsPerBlock = BlockDim / raft::WarpSize;
+  const uint32_t warp_id            = threadIdx.x / raft::WarpSize;
+  const uint32_t lane_id            = threadIdx.x % raft::WarpSize;
+
+  uint32_t padded_dim   = ((dim + veclen - 1) / veclen) * veclen;
+  uint32_t n_dim_blocks = padded_dim / veclen;
+
+  for (uint32_t group = warp_id * kIndexGroupSize; group < cluster_sz;
+       group += kWarpsPerBlock * kIndexGroupSize) {
+    const uint32_t row = group + lane_id;
+    const bool valid   = (row < cluster_sz) && sample_filter(query_ix, cluster_id, row);
+
+    float dist      = 0.0f;
+    float v_norm_sq = 0.0f;
+
+    const uint8_t* group_data = codes + size_t(group) * padded_dim;
+
+    for (uint32_t bl = 0; bl < n_dim_blocks; bl++) {
+      uint8_t codes_local[veclen];
+      *reinterpret_cast<uint4*>(codes_local) = *reinterpret_cast<const uint4*>(
+        group_data + bl * (veclen * kIndexGroupSize) + lane_id * veclen);
+
+      const uint32_t l = bl * veclen;
+#pragma unroll
+      for (uint32_t j = 0; j < veclen; j++) {
+        if (l + j < dim) {
+          float recon = float(codes_local[j]) * s_sq_scale[l + j];
+
+          if constexpr (kIsL2) {
+            float diff = s_query_term[l + j] - recon;
+            dist += diff * diff;
+          } else {
+            float v_d = s_recon_base[l + j] + recon;
+            dist += s_query_term[l + j] * v_d;
+            if constexpr (kIsCosine) { v_norm_sq += v_d * v_d; }
+          }
+        }
+      }
+    }
+
+    if constexpr (kIsCosine) {
+      float denom = query_norms[query_ix] * sqrtf(v_norm_sq);
+      dist        = (denom > 0.0f) ? 1.0f - dist / denom : 0.0f;
+    }
+
+    if (valid) {
+      uint32_t out_idx       = query_ix * max_samples + out_base + row;
+      out_distances[out_idx] = dist;
+      out_indices[out_idx]   = out_base + row;
+    }
+  }
+}
+
+template <typename IdxT, typename IvfSampleFilterT>
+void ivf_sq_scan(raft::resources const& handle,
+                 const index<IdxT>& idx,
+                 const float* queries_float,
+                 const float* query_norms,
+                 uint32_t n_queries,
+                 uint32_t n_probes,
+                 uint32_t max_samples,
+                 const uint32_t* coarse_indices,
+                 const uint32_t* chunk_indices,
+                 float* out_distances,
+                 uint32_t* out_indices,
+                 IvfSampleFilterT sample_filter,
+                 rmm::cuda_stream_view stream)
+{
+  constexpr int kThreads = 256;
+  dim3 grid(n_queries, n_probes);
+  dim3 block(kThreads);
+  uint32_t dim = idx.dim();
+
+  auto do_launch = [&](auto kernel_ptr, size_t smem) {
+    RAFT_CUDA_TRY(
+      cudaFuncSetAttribute(kernel_ptr, cudaFuncAttributeMaxDynamicSharedMemorySize, smem));
+    kernel_ptr<<<grid, block, smem, stream>>>(idx.data_ptrs().data_handle(),
+                                              idx.list_sizes().data_handle(),
+                                              coarse_indices,
+                                              queries_float,
+                                              idx.centers().data_handle(),
+                                              idx.sq_vmin().data_handle(),
+                                              idx.sq_delta().data_handle(),
+                                              query_norms,
+                                              n_probes,
+                                              dim,
+                                              max_samples,
+                                              chunk_indices,
+                                              out_distances,
+                                              out_indices,
+                                              sample_filter);
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
+  };
+
+  switch (idx.metric()) {
+    case cuvs::distance::DistanceType::L2Expanded:
+    case cuvs::distance::DistanceType::L2SqrtExpanded:
+      do_launch(ivf_sq_scan_kernel<kThreads, SqScanMetric::kL2, IdxT, IvfSampleFilterT>,
+                2 * dim * sizeof(float));
+      break;
+    case cuvs::distance::DistanceType::InnerProduct:
+      do_launch(ivf_sq_scan_kernel<kThreads, SqScanMetric::kIP, IdxT, IvfSampleFilterT>,
+                3 * dim * sizeof(float));
+      break;
+    case cuvs::distance::DistanceType::CosineExpanded:
+      do_launch(ivf_sq_scan_kernel<kThreads, SqScanMetric::kCosine, IdxT, IvfSampleFilterT>,
+                3 * dim * sizeof(float));
+      break;
+    default: RAFT_FAIL("Unsupported metric type for IVF-SQ scan.");
+  }
+}
+
+template <typename T, typename IdxT, typename IvfSampleFilterT>
+void search_impl(raft::resources const& handle,
+                 const index<IdxT>& index,
+                 const T* queries,
+                 uint32_t n_queries,
+                 uint32_t k,
+                 uint32_t n_probes,
+                 bool select_min,
+                 int64_t* neighbors,
+                 float* distances,
+                 rmm::device_async_resource_ref search_mr,
+                 IvfSampleFilterT sample_filter)
+{
+  auto stream = raft::resource::get_cuda_stream(handle);
+  auto dim    = index.dim();
+
+  std::size_t n_queries_probes = std::size_t(n_queries) * std::size_t(n_probes);
+
+  rmm::device_uvector<float> query_norm_dev(n_queries, stream, search_mr);
+  rmm::device_uvector<float> distance_buffer_dev(n_queries * index.n_lists(), stream, search_mr);
+  rmm::device_uvector<float> coarse_distances_dev(n_queries_probes, stream, search_mr);
+  rmm::device_uvector<uint32_t> coarse_indices_dev(n_queries_probes, stream, search_mr);
+
+  size_t float_query_size;
+  if constexpr (std::is_same_v<T, float>) {
+    float_query_size = 0;
+  } else {
+    float_query_size = n_queries * dim;
+  }
+  rmm::device_uvector<float> converted_queries_dev(float_query_size, stream, search_mr);
+  float* converted_queries_ptr = converted_queries_dev.data();
+
+  if constexpr (std::is_same_v<T, float>) {
+    converted_queries_ptr = const_cast<float*>(queries);
+  } else {
+    raft::linalg::unaryOp(
+      converted_queries_ptr, queries, n_queries * dim, utils::mapping<float>{}, stream);
+  }
+
+  auto distance_buffer_dev_view = raft::make_device_matrix_view<float, int64_t>(
+    distance_buffer_dev.data(), n_queries, index.n_lists());
+
+  RAFT_EXPECTS(index.metric() == cuvs::distance::DistanceType::InnerProduct ||
+                 index.center_norms().has_value(),
+               "Center norms are required for search with L2 or Cosine metric. "
+               "Rebuild the index with add_data_on_build=true or call extend() first.");
+
+  float alpha = 1.0f;
+  float beta  = 0.0f;
+  switch (index.metric()) {
+    case cuvs::distance::DistanceType::L2Expanded:
+    case cuvs::distance::DistanceType::L2SqrtExpanded: {
+      alpha = -2.0f;
+      beta  = 1.0f;
+      raft::linalg::rowNorm<raft::linalg::L2Norm, true>(query_norm_dev.data(),
+                                                        converted_queries_ptr,
+                                                        static_cast<int64_t>(dim),
+                                                        static_cast<int64_t>(n_queries),
+                                                        stream);
+      utils::outer_add(query_norm_dev.data(),
+                       (int64_t)n_queries,
+                       index.center_norms()->data_handle(),
+                       (int64_t)index.n_lists(),
+                       distance_buffer_dev.data(),
+                       stream);
+      break;
+    }
+    case cuvs::distance::DistanceType::CosineExpanded: {
+      raft::linalg::rowNorm<raft::linalg::L2Norm, true>(query_norm_dev.data(),
+                                                        converted_queries_ptr,
+                                                        static_cast<int64_t>(dim),
+                                                        static_cast<int64_t>(n_queries),
+                                                        stream,
+                                                        raft::sqrt_op{});
+      alpha = -1.0f;
+      beta  = 0.0f;
+      break;
+    }
+    case cuvs::distance::DistanceType::InnerProduct: {
+      alpha = 1.0f;
+      beta  = 0.0f;
+      break;
+    }
+    default: RAFT_FAIL("Unsupported metric type for IVF-SQ search.");
+  }
+
+  raft::linalg::gemm(handle,
+                     true,
+                     false,
+                     index.n_lists(),
+                     n_queries,
+                     dim,
+                     &alpha,
+                     index.centers().data_handle(),
+                     dim,
+                     converted_queries_ptr,
+                     dim,
+                     &beta,
+                     distance_buffer_dev.data(),
+                     index.n_lists(),
+                     stream);
+
+  if (index.metric() == cuvs::distance::DistanceType::CosineExpanded) {
+    auto n_lists_local          = index.n_lists();
+    const auto* q_norm_ptr      = query_norm_dev.data();
+    const auto* center_norm_ptr = index.center_norms()->data_handle();
+    raft::linalg::map_offset(
+      handle,
+      distance_buffer_dev_view,
+      [=] __device__(const uint32_t idx, const float dist) {
+        const auto query   = idx / n_lists_local;
+        const auto cluster = idx % n_lists_local;
+        float denom        = q_norm_ptr[query] * center_norm_ptr[cluster];
+        return (denom > 0.0f) ? dist / denom : 0.0f;
+      },
+      raft::make_const_mdspan(distance_buffer_dev_view));
+  }
+
+  cuvs::selection::select_k(
+    handle,
+    raft::make_const_mdspan(distance_buffer_dev_view),
+    std::nullopt,
+    raft::make_device_matrix_view<float, int64_t>(coarse_distances_dev.data(), n_queries, n_probes),
+    raft::make_device_matrix_view<uint32_t, int64_t>(
+      coarse_indices_dev.data(), n_queries, n_probes),
+    select_min);
+
+  rmm::device_uvector<uint32_t> num_samples(n_queries, stream, search_mr);
+  rmm::device_uvector<uint32_t> chunk_index(n_queries_probes, stream, search_mr);
+
+  ivf::detail::calc_chunk_indices::configure(n_probes, n_queries)(index.list_sizes().data_handle(),
+                                                                  coarse_indices_dev.data(),
+                                                                  chunk_index.data(),
+                                                                  num_samples.data(),
+                                                                  stream);
+
+  uint32_t max_samples =
+    std::max<uint32_t>(static_cast<uint32_t>(index.accum_sorted_sizes()(n_probes)), k);
+
+  rmm::device_uvector<float> all_distances(std::size_t(n_queries) * max_samples, stream, search_mr);
+  rmm::device_uvector<uint32_t> all_indices(
+    std::size_t(n_queries) * max_samples, stream, search_mr);
+
+  float init_val =
+    select_min ? std::numeric_limits<float>::max() : std::numeric_limits<float>::lowest();
+  thrust::fill_n(raft::resource::get_thrust_policy(handle),
+                 all_distances.data(),
+                 std::size_t(n_queries) * max_samples,
+                 init_val);
+  thrust::fill_n(raft::resource::get_thrust_policy(handle),
+                 all_indices.data(),
+                 std::size_t(n_queries) * max_samples,
+                 uint32_t(0xFFFFFFFF));
+
+  auto filter_adapter = cuvs::neighbors::filtering::ivf_to_sample_filter(
+    index.inds_ptrs().data_handle(), sample_filter);
+
+  ivf_sq_scan(handle,
+              index,
+              converted_queries_ptr,
+              query_norm_dev.data(),
+              n_queries,
+              n_probes,
+              max_samples,
+              coarse_indices_dev.data(),
+              chunk_index.data(),
+              all_distances.data(),
+              all_indices.data(),
+              filter_adapter,
+              stream);
+
+  rmm::device_uvector<uint32_t> neighbors_uint32(0, stream, search_mr);
+  uint32_t* neighbors_uint32_ptr = nullptr;
+  if constexpr (sizeof(int64_t) == sizeof(uint32_t)) {
+    neighbors_uint32_ptr = reinterpret_cast<uint32_t*>(neighbors);
+  } else {
+    neighbors_uint32.resize(std::size_t(n_queries) * k, stream);
+    neighbors_uint32_ptr = neighbors_uint32.data();
+  }
+
+  auto num_samples_view =
+    raft::make_device_vector_view<const uint32_t>(num_samples.data(), n_queries);
+
+  cuvs::selection::select_k(
+    handle,
+    raft::make_device_matrix_view<const float, int64_t>(
+      all_distances.data(), n_queries, max_samples),
+    raft::make_device_matrix_view<const uint32_t, int64_t>(
+      all_indices.data(), n_queries, max_samples),
+    raft::make_device_matrix_view<float, int64_t>(distances, n_queries, k),
+    raft::make_device_matrix_view<uint32_t, int64_t>(neighbors_uint32_ptr, n_queries, k),
+    select_min,
+    false,
+    cuvs::selection::SelectAlgo::kAuto,
+    num_samples_view);
+
+  ivf::detail::postprocess_distances(
+    distances, distances, index.metric(), n_queries, k, 1.0, false, stream);
+
+  ivf::detail::postprocess_neighbors(neighbors,
+                                     neighbors_uint32_ptr,
+                                     index.inds_ptrs().data_handle(),
+                                     coarse_indices_dev.data(),
+                                     chunk_index.data(),
+                                     n_queries,
+                                     n_probes,
+                                     k,
+                                     stream);
+}
+
+template <typename T,
+          typename IdxT,
+          typename IvfSampleFilterT = cuvs::neighbors::filtering::none_sample_filter>
+inline void search_with_filtering(raft::resources const& handle,
+                                  const search_params& params,
+                                  const index<IdxT>& index,
+                                  const T* queries,
+                                  uint32_t n_queries,
+                                  uint32_t k,
+                                  int64_t* neighbors,
+                                  float* distances,
+                                  IvfSampleFilterT sample_filter = IvfSampleFilterT())
+{
+  cuvs::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
+    "ivf_sq::search(k = %u, n_queries = %u, dim = %zu)", k, n_queries, index.dim());
+
+  RAFT_EXPECTS(params.n_probes > 0,
+               "n_probes (number of clusters to probe in the search) must be positive.");
+  auto n_probes = std::min<uint32_t>(params.n_probes, index.n_lists());
+
+  uint32_t max_samples =
+    std::max<uint32_t>(static_cast<uint32_t>(index.accum_sorted_sizes()(n_probes)), k);
+
+  constexpr uint64_t kExpectedWsSize = 1024ull * 1024 * 1024;
+  uint64_t max_ws_size =
+    std::min<uint64_t>(raft::resource::get_workspace_free_bytes(handle), kExpectedWsSize);
+
+  uint64_t converted_query_floats = std::is_same_v<T, float> ? 0 : index.dim();
+  uint64_t ws_per_query = sizeof(float) * (uint64_t(index.n_lists()) + n_probes + 1 + max_samples +
+                                           converted_query_floats) +
+                          sizeof(uint32_t) * (uint64_t(n_probes) * 2 + 1 + max_samples + k);
+
+  const uint32_t max_queries =
+    std::min<uint32_t>(n_queries, std::max<uint64_t>(1, max_ws_size / ws_per_query));
+
+  for (uint32_t offset_q = 0; offset_q < n_queries; offset_q += max_queries) {
+    uint32_t queries_batch = std::min(max_queries, n_queries - offset_q);
+
+    search_impl<T, IdxT, IvfSampleFilterT>(handle,
+                                           index,
+                                           queries + std::size_t(offset_q) * index.dim(),
+                                           queries_batch,
+                                           k,
+                                           n_probes,
+                                           cuvs::distance::is_min_close(index.metric()),
+                                           neighbors + std::size_t(offset_q) * k,
+                                           distances + std::size_t(offset_q) * k,
+                                           raft::resource::get_workspace_resource(handle),
+                                           sample_filter);
+  }
+}
+
+template <typename T, typename IdxT, typename IvfSampleFilterT>
+void search_with_filtering(raft::resources const& handle,
+                           const search_params& params,
+                           const index<IdxT>& index,
+                           raft::device_matrix_view<const T, int64_t, raft::row_major> queries,
+                           raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors,
+                           raft::device_matrix_view<float, int64_t, raft::row_major> distances,
+                           IvfSampleFilterT sample_filter = IvfSampleFilterT())
+{
+  RAFT_EXPECTS(
+    queries.extent(0) == neighbors.extent(0) && queries.extent(0) == distances.extent(0),
+    "Number of rows in output neighbors and distances matrices must equal the number of queries.");
+  RAFT_EXPECTS(neighbors.extent(1) == distances.extent(1),
+               "Number of columns in output neighbors and distances matrices must be equal");
+  RAFT_EXPECTS(queries.extent(1) == index.dim(),
+               "Number of query dimensions should equal number of dimensions in the index.");
+
+  search_with_filtering(handle,
+                        params,
+                        index,
+                        queries.data_handle(),
+                        static_cast<std::uint32_t>(queries.extent(0)),
+                        static_cast<std::uint32_t>(neighbors.extent(1)),
+                        neighbors.data_handle(),
+                        distances.data_handle(),
+                        sample_filter);
+}
+
+template <typename T, typename IdxT>
+void search(raft::resources const& handle,
+            const search_params& params,
+            const index<IdxT>& idx,
+            raft::device_matrix_view<const T, int64_t, raft::row_major> queries,
+            raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors,
+            raft::device_matrix_view<float, int64_t, raft::row_major> distances,
+            const cuvs::neighbors::filtering::base_filter& sample_filter_ref)
+{
+  try {
+    auto& sample_filter =
+      dynamic_cast<const cuvs::neighbors::filtering::none_sample_filter&>(sample_filter_ref);
+    return search_with_filtering(handle, params, idx, queries, neighbors, distances, sample_filter);
+  } catch (const std::bad_cast&) {
+  }
+
+  try {
+    auto& sample_filter =
+      dynamic_cast<const cuvs::neighbors::filtering::bitset_filter<uint32_t, int64_t>&>(
+        sample_filter_ref);
+    return search_with_filtering(handle, params, idx, queries, neighbors, distances, sample_filter);
+  } catch (const std::bad_cast&) {
+    RAFT_FAIL("Unsupported sample filter type");
+  }
+}
+
+}  // namespace cuvs::neighbors::ivf_sq::detail
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_search_float_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_sq/ivf_sq_search_float_uint8_t_int64_t.cu
new file mode 100644
index 0000000000..60d95a153f
--- /dev/null
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_search_float_uint8_t_int64_t.cu
@@ -0,0 +1,29 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <cuvs/neighbors/ivf_sq.hpp>
+
+#include "ivf_sq_search.cuh"
+
+namespace cuvs::neighbors::ivf_sq {
+
+#define CUVS_INST_IVF_SQ_SEARCH(T, IdxT)                                             \
+  void search(raft::resources const& handle,                                         \
+              const cuvs::neighbors::ivf_sq::search_params& params,                  \
+              const cuvs::neighbors::ivf_sq::index<IdxT>& index,                     \
+              raft::device_matrix_view<const T, int64_t, raft::row_major> queries,   \
+              raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors, \
+              raft::device_matrix_view<float, int64_t, raft::row_major> distances,   \
+              const cuvs::neighbors::filtering::base_filter& sample_filter)          \
+  {                                                                                  \
+    cuvs::neighbors::ivf_sq::detail::search(                                         \
+      handle, params, index, queries, neighbors, distances, sample_filter);          \
+  }
+
+CUVS_INST_IVF_SQ_SEARCH(float, uint8_t);
+
+#undef CUVS_INST_IVF_SQ_SEARCH
+
+}  // namespace cuvs::neighbors::ivf_sq
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_search_half_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_sq/ivf_sq_search_half_uint8_t_int64_t.cu
new file mode 100644
index 0000000000..fbed3fd432
--- /dev/null
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_search_half_uint8_t_int64_t.cu
@@ -0,0 +1,29 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <cuvs/neighbors/ivf_sq.hpp>
+
+#include "ivf_sq_search.cuh"
+
+namespace cuvs::neighbors::ivf_sq {
+
+#define CUVS_INST_IVF_SQ_SEARCH(T, IdxT)                                             \
+  void search(raft::resources const& handle,                                         \
+              const cuvs::neighbors::ivf_sq::search_params& params,                  \
+              const cuvs::neighbors::ivf_sq::index<IdxT>& index,                     \
+              raft::device_matrix_view<const T, int64_t, raft::row_major> queries,   \
+              raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors, \
+              raft::device_matrix_view<float, int64_t, raft::row_major> distances,   \
+              const cuvs::neighbors::filtering::base_filter& sample_filter)          \
+  {                                                                                  \
+    cuvs::neighbors::ivf_sq::detail::search(                                         \
+      handle, params, index, queries, neighbors, distances, sample_filter);          \
+  }
+
+CUVS_INST_IVF_SQ_SEARCH(half, uint8_t);
+
+#undef CUVS_INST_IVF_SQ_SEARCH
+
+}  // namespace cuvs::neighbors::ivf_sq
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_serialize.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_serialize.cuh
new file mode 100644
index 0000000000..b95e63ee33
--- /dev/null
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_serialize.cuh
@@ -0,0 +1,161 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "../ivf_common.cuh"
+#include "../ivf_list.cuh"
+#include <cuvs/neighbors/common.hpp>
+#include <cuvs/neighbors/ivf_sq.hpp>
+
+#include <raft/core/detail/mdspan_numpy_serializer.hpp>
+#include <raft/core/mdarray.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/serialize.hpp>
+#include <raft/util/pow2_utils.cuh>
+
+#include <fstream>
+
+namespace cuvs::neighbors::ivf_sq::detail {
+
+constexpr int serialization_version = 1;
+
+template <typename IdxT>
+void serialize(raft::resources const& handle, std::ostream& os, const index<IdxT>& index_)
+{
+  RAFT_LOG_DEBUG(
+    "Saving IVF-SQ index, size %zu, dim %u", static_cast<size_t>(index_.size()), index_.dim());
+
+  std::string dtype_string = raft::detail::numpy_serializer::get_numpy_dtype<IdxT>().to_string();
+  dtype_string.resize(4);
+  os << dtype_string;
+
+  serialize_scalar(handle, os, serialization_version);
+  serialize_scalar(handle, os, index_.size());
+  serialize_scalar(handle, os, index_.dim());
+  serialize_scalar(handle, os, index_.n_lists());
+  serialize_scalar(handle, os, index_.metric());
+  serialize_scalar(handle, os, index_.adaptive_centers());
+  serialize_scalar(handle, os, index_.conservative_memory_allocation());
+  serialize_mdspan(handle, os, index_.centers());
+
+  if (index_.center_norms()) {
+    bool has_norms = true;
+    serialize_scalar(handle, os, has_norms);
+    serialize_mdspan(handle, os, *index_.center_norms());
+  } else {
+    bool has_norms = false;
+    serialize_scalar(handle, os, has_norms);
+  }
+
+  serialize_mdspan(handle, os, index_.sq_vmin());
+  serialize_mdspan(handle, os, index_.sq_delta());
+
+  auto sizes_host = raft::make_host_vector<uint32_t, uint32_t>(index_.list_sizes().extent(0));
+  raft::copy(sizes_host.data_handle(),
+             index_.list_sizes().data_handle(),
+             sizes_host.size(),
+             raft::resource::get_cuda_stream(handle));
+  raft::resource::sync_stream(handle);
+  serialize_mdspan(handle, os, sizes_host.view());
+
+  list_spec<uint32_t, IdxT, int64_t> list_store_spec{index_.dim(), true};
+  for (uint32_t label = 0; label < index_.n_lists(); label++) {
+    ivf::serialize_list(handle,
+                        os,
+                        index_.lists()[label],
+                        list_store_spec,
+                        raft::Pow2<kIndexGroupSize>::roundUp(sizes_host(label)));
+  }
+  raft::resource::sync_stream(handle);
+}
+
+template <typename IdxT>
+void serialize(raft::resources const& handle,
+               const std::string& filename,
+               const index<IdxT>& index_)
+{
+  std::ofstream of(filename, std::ios::out | std::ios::binary);
+  if (!of) { RAFT_FAIL("Cannot open file %s", filename.c_str()); }
+  detail::serialize(handle, of, index_);
+  of.close();
+  if (!of) { RAFT_FAIL("Error writing output %s", filename.c_str()); }
+}
+
+template <typename IdxT>
+auto deserialize(raft::resources const& handle, std::istream& is) -> index<IdxT>
+{
+  char dtype_string[4];
+  is.read(dtype_string, 4);
+
+  auto ver = raft::deserialize_scalar<int>(handle, is);
+  if (ver != serialization_version) {
+    RAFT_FAIL("serialization version mismatch, expected %d, got %d ", serialization_version, ver);
+  }
+  auto n_rows           = raft::deserialize_scalar<int64_t>(handle, is);
+  auto dim              = raft::deserialize_scalar<uint32_t>(handle, is);
+  auto n_lists          = raft::deserialize_scalar<uint32_t>(handle, is);
+  auto metric           = raft::deserialize_scalar<cuvs::distance::DistanceType>(handle, is);
+  bool adaptive_centers = raft::deserialize_scalar<bool>(handle, is);
+  bool cma              = raft::deserialize_scalar<bool>(handle, is);
+
+  index<IdxT> index_ = index<IdxT>(handle, metric, n_lists, dim, adaptive_centers, cma);
+
+  deserialize_mdspan(handle, is, index_.centers());
+
+  bool has_norms = raft::deserialize_scalar<bool>(handle, is);
+  if (has_norms) {
+    index_.allocate_center_norms(handle);
+    if (!index_.center_norms()) {
+      RAFT_FAIL("Error inconsistent center norms");
+    } else {
+      auto center_norms = index_.center_norms().value();
+      deserialize_mdspan(handle, is, center_norms);
+    }
+  }
+
+  deserialize_mdspan(handle, is, index_.sq_vmin());
+  deserialize_mdspan(handle, is, index_.sq_delta());
+
+  deserialize_mdspan(handle, is, index_.list_sizes());
+
+  list_spec<uint32_t, IdxT, int64_t> list_device_spec{index_.dim(), cma};
+  list_spec<uint32_t, IdxT, int64_t> list_store_spec{index_.dim(), true};
+  for (uint32_t label = 0; label < index_.n_lists(); label++) {
+    ivf::deserialize_list(handle, is, index_.lists()[label], list_store_spec, list_device_spec);
+  }
+  raft::resource::sync_stream(handle);
+
+  ivf::detail::recompute_internal_state(handle, index_);
+
+  return index_;
+}
+
+template <typename IdxT>
+auto deserialize(raft::resources const& handle, const std::string& filename) -> index<IdxT>
+{
+  std::ifstream is(filename, std::ios::in | std::ios::binary);
+  if (!is) { RAFT_FAIL("Cannot open file %s", filename.c_str()); }
+  auto index = detail::deserialize<IdxT>(handle, is);
+  is.close();
+  return index;
+}
+
+}  // namespace cuvs::neighbors::ivf_sq::detail
+
+#define CUVS_INST_IVF_SQ_SERIALIZE(IdxT)                                           \
+  void serialize(raft::resources const& handle,                                    \
+                 const std::string& filename,                                      \
+                 const cuvs::neighbors::ivf_sq::index<IdxT>& index)                \
+  {                                                                                \
+    cuvs::neighbors::ivf_sq::detail::serialize(handle, filename, index);           \
+  }                                                                                \
+                                                                                   \
+  void deserialize(raft::resources const& handle,                                  \
+                   const std::string& filename,                                    \
+                   cuvs::neighbors::ivf_sq::index<IdxT>* index)                    \
+  {                                                                                \
+    *index = cuvs::neighbors::ivf_sq::detail::deserialize<IdxT>(handle, filename); \
+  }
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_serialize_uint8_t.cu b/cpp/src/neighbors/ivf_sq/ivf_sq_serialize_uint8_t.cu
new file mode 100644
index 0000000000..c2351ed8c3
--- /dev/null
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_serialize_uint8_t.cu
@@ -0,0 +1,16 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <cuvs/neighbors/ivf_sq.hpp>
+
+#include "ivf_sq_serialize.cuh"
+
+namespace cuvs::neighbors::ivf_sq {
+
+CUVS_INST_IVF_SQ_SERIALIZE(uint8_t);
+
+#undef CUVS_INST_IVF_SQ_SERIALIZE
+
+}  // namespace cuvs::neighbors::ivf_sq
diff --git a/cpp/src/neighbors/ivf_sq_index.cpp b/cpp/src/neighbors/ivf_sq_index.cpp
new file mode 100644
index 0000000000..d97ace7dcb
--- /dev/null
+++ b/cpp/src/neighbors/ivf_sq_index.cpp
@@ -0,0 +1,236 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <cuvs/neighbors/ivf_sq.hpp>
+
+namespace cuvs::neighbors::ivf_sq {
+
+template <typename IdxT>
+index<IdxT>::index(raft::resources const& res)
+  : index(res, cuvs::distance::DistanceType::L2Expanded, 0, 0, false, false)
+{
+}
+
+template <typename IdxT>
+index<IdxT>::index(raft::resources const& res, const index_params& params, uint32_t dim)
+  : index(res,
+          params.metric,
+          params.n_lists,
+          dim,
+          params.adaptive_centers,
+          params.conservative_memory_allocation)
+{
+}
+
+template <typename IdxT>
+index<IdxT>::index(raft::resources const& res,
+                   cuvs::distance::DistanceType metric,
+                   uint32_t n_lists,
+                   uint32_t dim,
+                   bool adaptive_centers,
+                   bool conservative_memory_allocation)
+  : cuvs::neighbors::index(),
+    metric_(metric),
+    adaptive_centers_(adaptive_centers),
+    conservative_memory_allocation_(conservative_memory_allocation),
+    lists_{n_lists},
+    list_sizes_{raft::make_device_vector<uint32_t, uint32_t>(res, n_lists)},
+    centers_(raft::make_device_matrix<float, uint32_t>(res, n_lists, dim)),
+    center_norms_(std::nullopt),
+    sq_vmin_{raft::make_device_vector<float, uint32_t>(res, dim)},
+    sq_delta_{raft::make_device_vector<float, uint32_t>(res, dim)},
+    data_ptrs_{raft::make_device_vector<IdxT*, uint32_t>(res, n_lists)},
+    inds_ptrs_{raft::make_device_vector<int64_t*, uint32_t>(res, n_lists)},
+    accum_sorted_sizes_{raft::make_host_vector<int64_t, uint32_t>(n_lists + 1)}
+{
+  check_consistency();
+  accum_sorted_sizes_(n_lists) = 0;
+}
+
+template <typename IdxT>
+cuvs::distance::DistanceType index<IdxT>::metric() const noexcept
+{
+  return metric_;
+}
+
+template <typename IdxT>
+bool index<IdxT>::adaptive_centers() const noexcept
+{
+  return adaptive_centers_;
+}
+
+template <typename IdxT>
+int64_t index<IdxT>::size() const noexcept
+{
+  return accum_sorted_sizes()(n_lists());
+}
+
+template <typename IdxT>
+uint32_t index<IdxT>::dim() const noexcept
+{
+  return centers_.extent(1);
+}
+
+template <typename IdxT>
+uint32_t index<IdxT>::n_lists() const noexcept
+{
+  return lists_.size();
+}
+
+template <typename IdxT>
+bool index<IdxT>::conservative_memory_allocation() const noexcept
+{
+  return conservative_memory_allocation_;
+}
+
+template <typename IdxT>
+raft::device_vector_view<uint32_t, uint32_t> index<IdxT>::list_sizes() noexcept
+{
+  return list_sizes_.view();
+}
+
+template <typename IdxT>
+raft::device_vector_view<const uint32_t, uint32_t> index<IdxT>::list_sizes() const noexcept
+{
+  return list_sizes_.view();
+}
+
+template <typename IdxT>
+raft::device_matrix_view<float, uint32_t, raft::row_major> index<IdxT>::centers() noexcept
+{
+  return centers_.view();
+}
+
+template <typename IdxT>
+raft::device_matrix_view<const float, uint32_t, raft::row_major> index<IdxT>::centers()
+  const noexcept
+{
+  return centers_.view();
+}
+
+template <typename IdxT>
+std::optional<raft::device_vector_view<float, uint32_t>> index<IdxT>::center_norms() noexcept
+{
+  if (center_norms_.has_value()) {
+    return std::make_optional<raft::device_vector_view<float, uint32_t>>(center_norms_->view());
+  } else {
+    return std::nullopt;
+  }
+}
+
+template <typename IdxT>
+std::optional<raft::device_vector_view<const float, uint32_t>> index<IdxT>::center_norms()
+  const noexcept
+{
+  if (center_norms_.has_value()) {
+    return std::make_optional<raft::device_vector_view<const float, uint32_t>>(
+      center_norms_->view());
+  } else {
+    return std::nullopt;
+  }
+}
+
+template <typename IdxT>
+void index<IdxT>::allocate_center_norms(raft::resources const& res)
+{
+  switch (metric_) {
+    case cuvs::distance::DistanceType::L2Expanded:
+    case cuvs::distance::DistanceType::L2SqrtExpanded:
+    case cuvs::distance::DistanceType::L2Unexpanded:
+    case cuvs::distance::DistanceType::L2SqrtUnexpanded:
+    case cuvs::distance::DistanceType::CosineExpanded:
+      center_norms_ = raft::make_device_vector<float, uint32_t>(res, n_lists());
+      break;
+    default: center_norms_ = std::nullopt;
+  }
+}
+
+template <typename IdxT>
+raft::device_vector_view<float, uint32_t> index<IdxT>::sq_vmin() noexcept
+{
+  return sq_vmin_.view();
+}
+
+template <typename IdxT>
+raft::device_vector_view<const float, uint32_t> index<IdxT>::sq_vmin() const noexcept
+{
+  return sq_vmin_.view();
+}
+
+template <typename IdxT>
+raft::device_vector_view<float, uint32_t> index<IdxT>::sq_delta() noexcept
+{
+  return sq_delta_.view();
+}
+
+template <typename IdxT>
+raft::device_vector_view<const float, uint32_t> index<IdxT>::sq_delta() const noexcept
+{
+  return sq_delta_.view();
+}
+
+template <typename IdxT>
+raft::host_vector_view<int64_t, uint32_t> index<IdxT>::accum_sorted_sizes() noexcept
+{
+  return accum_sorted_sizes_.view();
+}
+
+template <typename IdxT>
+raft::host_vector_view<const int64_t, uint32_t> index<IdxT>::accum_sorted_sizes() const noexcept
+{
+  return accum_sorted_sizes_.view();
+}
+
+template <typename IdxT>
+raft::device_vector_view<IdxT*, uint32_t> index<IdxT>::data_ptrs() noexcept
+{
+  return data_ptrs_.view();
+}
+
+template <typename IdxT>
+raft::device_vector_view<IdxT* const, uint32_t> index<IdxT>::data_ptrs() const noexcept
+{
+  return data_ptrs_.view();
+}
+
+template <typename IdxT>
+raft::device_vector_view<int64_t*, uint32_t> index<IdxT>::inds_ptrs() noexcept
+{
+  return inds_ptrs_.view();
+}
+
+template <typename IdxT>
+raft::device_vector_view<int64_t* const, uint32_t> index<IdxT>::inds_ptrs() const noexcept
+{
+  return inds_ptrs_.view();
+}
+
+template <typename IdxT>
+std::vector<std::shared_ptr<list_data<IdxT, int64_t>>>& index<IdxT>::lists() noexcept
+{
+  return lists_;
+}
+
+template <typename IdxT>
+const std::vector<std::shared_ptr<list_data<IdxT, int64_t>>>& index<IdxT>::lists() const noexcept
+{
+  return lists_;
+}
+
+template <typename IdxT>
+void index<IdxT>::check_consistency()
+{
+  auto n_lists = lists_.size();
+  RAFT_EXPECTS(list_sizes_.extent(0) == n_lists, "inconsistent list size");
+  RAFT_EXPECTS(data_ptrs_.extent(0) == n_lists, "inconsistent list size");
+  RAFT_EXPECTS(inds_ptrs_.extent(0) == n_lists, "inconsistent list size");
+  RAFT_EXPECTS((centers_.extent(0) == list_sizes_.extent(0)) &&
+                 (!center_norms_.has_value() || centers_.extent(0) == center_norms_->extent(0)),
+               "inconsistent number of lists (clusters)");
+}
+
+template struct index<uint8_t>;
+
+}  // namespace cuvs::neighbors::ivf_sq
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 35794adf9b..208c330de7 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -131,6 +131,13 @@ ConfigureTest(
   PERCENT 100
 )
 
+ConfigureTest(
+  NAME NEIGHBORS_ANN_IVF_SQ_TEST
+  PATH neighbors/ann_ivf_sq/test_float_uint8_t.cu
+  GPUS 1
+  PERCENT 100
+)
+
 ConfigureTest(
   NAME NEIGHBORS_ANN_IVF_PQ_TEST
   PATH neighbors/ann_ivf_pq/test_float_int64_t.cu neighbors/ann_ivf_pq/test_int8_t_int64_t.cu
diff --git a/cpp/tests/neighbors/ann_ivf_sq.cuh b/cpp/tests/neighbors/ann_ivf_sq.cuh
new file mode 100644
index 0000000000..a7e02315e4
--- /dev/null
+++ b/cpp/tests/neighbors/ann_ivf_sq.cuh
@@ -0,0 +1,457 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+
+#include "../test_utils.cuh"
+#include "ann_utils.cuh"
+#include "naive_knn.cuh"
+
+#include <cuvs/neighbors/ivf_sq.hpp>
+#include <raft/core/bitset.cuh>
+#include <raft/linalg/add.cuh>
+#include <raft/linalg/normalize.cuh>
+
+#include <raft/core/resource/cuda_stream_pool.hpp>
+#include <rmm/cuda_stream_pool.hpp>
+
+namespace cuvs::neighbors::ivf_sq {
+
+struct test_ivf_sample_filter {
+  static constexpr unsigned offset = 300;
+};
+
+template <typename IdxT>
+struct AnnIvfSqInputs {
+  IdxT num_queries;
+  IdxT num_db_vecs;
+  IdxT dim;
+  IdxT k;
+  IdxT nprobe;
+  IdxT nlist;
+  cuvs::distance::DistanceType metric;
+  bool adaptive_centers;
+};
+
+template <typename IdxT>
+::std::ostream& operator<<(::std::ostream& os, const AnnIvfSqInputs<IdxT>& p)
+{
+  os << "{ " << p.num_queries << ", " << p.num_db_vecs << ", " << p.dim << ", " << p.k << ", "
+     << p.nprobe << ", " << p.nlist << ", "
+     << cuvs::neighbors::print_metric{static_cast<cuvs::distance::DistanceType>((int)p.metric)}
+     << ", " << p.adaptive_centers << '}' << std::endl;
+  return os;
+}
+
+template <typename T, typename DataT, typename IdxT>
+class AnnIVFSQTest : public ::testing::TestWithParam<AnnIvfSqInputs<IdxT>> {
+ public:
+  AnnIVFSQTest()
+    : stream_(raft::resource::get_cuda_stream(handle_)),
+      ps(::testing::TestWithParam<AnnIvfSqInputs<IdxT>>::GetParam()),
+      database(0, stream_),
+      search_queries(0, stream_)
+  {
+  }
+
+  void testIVFSQ()
+  {
+    size_t queries_size = ps.num_queries * ps.k;
+    std::vector<IdxT> indices_ivfsq(queries_size);
+    std::vector<IdxT> indices_naive(queries_size);
+    std::vector<T> distances_ivfsq(queries_size);
+    std::vector<T> distances_naive(queries_size);
+
+    {
+      rmm::device_uvector<T> distances_naive_dev(queries_size, stream_);
+      rmm::device_uvector<IdxT> indices_naive_dev(queries_size, stream_);
+      cuvs::neighbors::naive_knn<T, DataT, IdxT>(handle_,
+                                                 distances_naive_dev.data(),
+                                                 indices_naive_dev.data(),
+                                                 search_queries.data(),
+                                                 database.data(),
+                                                 ps.num_queries,
+                                                 ps.num_db_vecs,
+                                                 ps.dim,
+                                                 ps.k,
+                                                 ps.metric);
+      raft::update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_);
+      raft::update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_);
+      raft::resource::sync_stream(handle_);
+    }
+
+    {
+      double min_recall =
+        std::min(1.0, static_cast<double>(ps.nprobe) / static_cast<double>(ps.nlist));
+
+      rmm::device_uvector<T> distances_ivfsq_dev(queries_size, stream_);
+      rmm::device_uvector<IdxT> indices_ivfsq_dev(queries_size, stream_);
+
+      {
+        cuvs::neighbors::ivf_sq::index_params index_params;
+        cuvs::neighbors::ivf_sq::search_params search_params;
+        index_params.n_lists          = ps.nlist;
+        index_params.metric           = ps.metric;
+        index_params.adaptive_centers = ps.adaptive_centers;
+        search_params.n_probes        = ps.nprobe;
+
+        index_params.add_data_on_build        = true;
+        index_params.kmeans_trainset_fraction = 0.5;
+
+        auto database_view = raft::make_device_matrix_view<const DataT, IdxT>(
+          (const DataT*)database.data(), ps.num_db_vecs, ps.dim);
+
+        auto idx = cuvs::neighbors::ivf_sq::build(handle_, index_params, database_view);
+
+        // Test extend: build without data, then extend
+        cuvs::neighbors::ivf_sq::index_params index_params_no_add;
+        index_params_no_add.n_lists                  = ps.nlist;
+        index_params_no_add.metric                   = ps.metric;
+        index_params_no_add.adaptive_centers         = ps.adaptive_centers;
+        index_params_no_add.add_data_on_build        = false;
+        index_params_no_add.kmeans_trainset_fraction = 0.5;
+
+        auto idx_empty =
+          cuvs::neighbors::ivf_sq::build(handle_, index_params_no_add, database_view);
+
+        auto vector_indices = raft::make_device_vector<IdxT, IdxT>(handle_, ps.num_db_vecs);
+        raft::linalg::map_offset(handle_, vector_indices.view(), raft::identity_op{});
+        raft::resource::sync_stream(handle_);
+
+        auto indices_view = raft::make_device_vector_view<const IdxT, IdxT>(
+          vector_indices.data_handle(), ps.num_db_vecs);
+        cuvs::neighbors::ivf_sq::extend(
+          handle_,
+          database_view,
+          std::make_optional<raft::device_vector_view<const IdxT, IdxT>>(indices_view),
+          &idx_empty);
+
+        // Serialize / deserialize round-trip
+        tmp_index_file index_file;
+        cuvs::neighbors::ivf_sq::serialize(handle_, index_file.filename, idx);
+        cuvs::neighbors::ivf_sq::index<uint8_t> index_loaded(handle_);
+        cuvs::neighbors::ivf_sq::deserialize(handle_, index_file.filename, &index_loaded);
+        ASSERT_EQ(idx.size(), index_loaded.size());
+        ASSERT_EQ(idx.dim(), index_loaded.dim());
+        ASSERT_EQ(idx.n_lists(), index_loaded.n_lists());
+
+        auto search_queries_view = raft::make_device_matrix_view<const DataT, IdxT>(
+          search_queries.data(), ps.num_queries, ps.dim);
+        auto indices_out_view =
+          raft::make_device_matrix_view<IdxT, IdxT>(indices_ivfsq_dev.data(), ps.num_queries, ps.k);
+        auto dists_out_view =
+          raft::make_device_matrix_view<T, IdxT>(distances_ivfsq_dev.data(), ps.num_queries, ps.k);
+
+        cuvs::neighbors::ivf_sq::search(handle_,
+                                        search_params,
+                                        index_loaded,
+                                        search_queries_view,
+                                        indices_out_view,
+                                        dists_out_view);
+
+        raft::update_host(
+          distances_ivfsq.data(), distances_ivfsq_dev.data(), queries_size, stream_);
+        raft::update_host(indices_ivfsq.data(), indices_ivfsq_dev.data(), queries_size, stream_);
+        raft::resource::sync_stream(handle_);
+      }
+      // SQ introduces quantization error, so we relax the distance epsilon
+      float eps = 0.1;
+      ASSERT_TRUE(eval_neighbours(indices_naive,
+                                  indices_ivfsq,
+                                  distances_naive,
+                                  distances_ivfsq,
+                                  ps.num_queries,
+                                  ps.k,
+                                  eps,
+                                  min_recall));
+    }
+  }
+
+  void testFilter()
+  {
+    if (ps.num_db_vecs <= static_cast<IdxT>(test_ivf_sample_filter::offset)) {
+      GTEST_SKIP() << "Skipping filter test: num_db_vecs <= filter offset";
+    }
+
+    size_t queries_size = ps.num_queries * ps.k;
+    std::vector<IdxT> indices_ivfsq(queries_size);
+    std::vector<IdxT> indices_naive(queries_size);
+    std::vector<T> distances_ivfsq(queries_size);
+    std::vector<T> distances_naive(queries_size);
+
+    {
+      rmm::device_uvector<T> distances_naive_dev(queries_size, stream_);
+      rmm::device_uvector<IdxT> indices_naive_dev(queries_size, stream_);
+      auto* database_filtered_ptr = database.data() + test_ivf_sample_filter::offset * ps.dim;
+      cuvs::neighbors::naive_knn<T, DataT, IdxT>(handle_,
+                                                 distances_naive_dev.data(),
+                                                 indices_naive_dev.data(),
+                                                 search_queries.data(),
+                                                 database_filtered_ptr,
+                                                 ps.num_queries,
+                                                 ps.num_db_vecs - test_ivf_sample_filter::offset,
+                                                 ps.dim,
+                                                 ps.k,
+                                                 ps.metric);
+      raft::linalg::addScalar(indices_naive_dev.data(),
+                              indices_naive_dev.data(),
+                              IdxT(test_ivf_sample_filter::offset),
+                              queries_size,
+                              stream_);
+      raft::update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_);
+      raft::update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_);
+      raft::resource::sync_stream(handle_);
+    }
+
+    {
+      double min_recall =
+        std::min(1.0, static_cast<double>(ps.nprobe) / static_cast<double>(ps.nlist));
+
+      rmm::device_uvector<T> distances_ivfsq_dev(queries_size, stream_);
+      rmm::device_uvector<IdxT> indices_ivfsq_dev(queries_size, stream_);
+
+      {
+        cuvs::neighbors::ivf_sq::index_params index_params;
+        cuvs::neighbors::ivf_sq::search_params search_params;
+        index_params.n_lists          = ps.nlist;
+        index_params.metric           = ps.metric;
+        index_params.adaptive_centers = ps.adaptive_centers;
+        search_params.n_probes        = ps.nprobe;
+
+        index_params.add_data_on_build        = true;
+        index_params.kmeans_trainset_fraction = 0.5;
+
+        auto database_view = raft::make_device_matrix_view<const DataT, IdxT>(
+          (const DataT*)database.data(), ps.num_db_vecs, ps.dim);
+        auto index = cuvs::neighbors::ivf_sq::build(handle_, index_params, database_view);
+
+        auto removed_indices =
+          raft::make_device_vector<IdxT, int64_t>(handle_, test_ivf_sample_filter::offset);
+        raft::linalg::map_offset(handle_, removed_indices.view(), raft::identity_op{});
+        raft::resource::sync_stream(handle_);
+
+        cuvs::core::bitset<std::uint32_t, IdxT> removed_indices_bitset(
+          handle_, removed_indices.view(), ps.num_db_vecs);
+        auto bitset_filter_obj =
+          cuvs::neighbors::filtering::bitset_filter(removed_indices_bitset.view());
+
+        auto search_queries_view = raft::make_device_matrix_view<const DataT, IdxT>(
+          search_queries.data(), ps.num_queries, ps.dim);
+        auto indices_out_view =
+          raft::make_device_matrix_view<IdxT, IdxT>(indices_ivfsq_dev.data(), ps.num_queries, ps.k);
+        auto dists_out_view =
+          raft::make_device_matrix_view<T, IdxT>(distances_ivfsq_dev.data(), ps.num_queries, ps.k);
+
+        cuvs::neighbors::ivf_sq::search(handle_,
+                                        search_params,
+                                        index,
+                                        search_queries_view,
+                                        indices_out_view,
+                                        dists_out_view,
+                                        bitset_filter_obj);
+
+        raft::update_host(
+          distances_ivfsq.data(), distances_ivfsq_dev.data(), queries_size, stream_);
+        raft::update_host(indices_ivfsq.data(), indices_ivfsq_dev.data(), queries_size, stream_);
+        raft::resource::sync_stream(handle_);
+      }
+      float eps = 0.1;
+      ASSERT_TRUE(eval_neighbours(indices_naive,
+                                  indices_ivfsq,
+                                  distances_naive,
+                                  distances_ivfsq,
+                                  ps.num_queries,
+                                  ps.k,
+                                  eps,
+                                  min_recall));
+    }
+  }
+
+  void SetUp() override
+  {
+    database.resize(ps.num_db_vecs * ps.dim, stream_);
+    search_queries.resize(ps.num_queries * ps.dim, stream_);
+
+    raft::random::RngState r(1234ULL);
+    if constexpr (std::is_same_v<DataT, float> || std::is_same_v<DataT, half>) {
+      raft::random::uniform(
+        handle_, r, database.data(), ps.num_db_vecs * ps.dim, DataT(0.1), DataT(2.0));
+      raft::random::uniform(
+        handle_, r, search_queries.data(), ps.num_queries * ps.dim, DataT(0.1), DataT(2.0));
+    }
+    raft::resource::sync_stream(handle_);
+  }
+
+  void TearDown() override
+  {
+    raft::resource::sync_stream(handle_);
+    database.resize(0, stream_);
+    search_queries.resize(0, stream_);
+  }
+
+ private:
+  raft::resources handle_;
+  rmm::cuda_stream_view stream_;
+  AnnIvfSqInputs<IdxT> ps;
+  rmm::device_uvector<DataT> database;
+  rmm::device_uvector<DataT> search_queries;
+};
+
+const std::vector<AnnIvfSqInputs<int64_t>> inputs = {
+  // num_queries, num_db_vecs, dim, k, nprobe, nlist, metric, adaptive_centers
+
+  // ===== Dimension edge cases (all four metrics) =====
+  // dim=1 (CosineExpanded excluded: requires dim > 1)
+  {1000, 10000, 1, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 1, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct, false},
+  {1000, 10000, 1, 10, 40, 1024, cuvs::distance::DistanceType::L2SqrtExpanded, false},
+  // dim=2,3,4,5 (unaligned)
+  {1000, 10000, 2, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 2, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+  {1000, 10000, 3, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded, true},
+  {1000, 10000, 3, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, true},
+  {1000, 10000, 4, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 4, 16, 40, 1024, cuvs::distance::DistanceType::InnerProduct, false},
+  {1000, 10000, 5, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 5, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+  // dim=7,8 (around veclen=16 boundary, not a multiple of veclen)
+  {1000, 10000, 7, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 7, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+  {1000, 10000, 8, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 8, 16, 40, 1024, cuvs::distance::DistanceType::InnerProduct, true},
+  {1000, 10000, 8, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, true},
+  // dim=15,16,17 (around veclen=16 boundary)
+  {1000, 10000, 15, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 15, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct, false},
+  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2SqrtExpanded, false},
+  {1000, 10000, 17, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 17, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+  // dim=31,32,33 (around 2*veclen boundary)
+  {1000, 10000, 31, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 31, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+  {1000, 10000, 32, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 32, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct, false},
+  {1000, 10000, 32, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+  {1000, 10000, 33, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 33, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct, false},
+  // medium dims
+  {1000, 10000, 64, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 64, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+  {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct, false},
+  {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+  {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::L2SqrtExpanded, false},
+  {1000, 10000, 256, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 256, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct, false},
+  // large dims (may exceed shared memory limits)
+  {1000, 10000, 2048, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 2048, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+  {1000, 10000, 2049, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 2049, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+  {1000, 10000, 2050, 16, 40, 1024, cuvs::distance::DistanceType::InnerProduct, false},
+  {1000, 10000, 2050, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+  {1000, 10000, 4096, 20, 50, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 4096, 20, 50, 1024, cuvs::distance::DistanceType::InnerProduct, false},
+  {1000, 10000, 4096, 20, 50, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+
+  // ===== k edge cases =====
+  {1000, 10000, 16, 1, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 16, 1, 40, 1024, cuvs::distance::DistanceType::InnerProduct, false},
+  {1000, 10000, 16, 1, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+  {1000, 10000, 16, 2, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 16, 5, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 16, 20, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 16, 20, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+  {1000, 10000, 16, 50, 100, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 16, 100, 200, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 16, 100, 200, 1024, cuvs::distance::DistanceType::InnerProduct, false},
+
+  // ===== nprobe / nlist edge cases =====
+  // nprobe == nlist (exhaustive probe)
+  {1000, 10000, 16, 10, 64, 64, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 16, 10, 64, 64, cuvs::distance::DistanceType::InnerProduct, false},
+  {1000, 10000, 16, 10, 64, 64, cuvs::distance::DistanceType::CosineExpanded, false},
+  // nprobe == 1 (minimal probe)
+  {1000, 10000, 16, 10, 1, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 16, 10, 1, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+  // nprobe > nlist (clamped to nlist)
+  {1000, 10000, 16, 10, 2048, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 16, 10, 2048, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+  // various nprobe
+  {1000, 10000, 16, 10, 50, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 16, 10, 70, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 16, 10, 50, 1024, cuvs::distance::DistanceType::InnerProduct, false},
+  {1000, 10000, 16, 10, 70, 1024, cuvs::distance::DistanceType::InnerProduct, false},
+  {1000, 10000, 16, 10, 50, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+  {1000, 10000, 16, 10, 70, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+  {1000, 10000, 16, 10, 50, 1024, cuvs::distance::DistanceType::L2SqrtExpanded, false},
+  {1000, 10000, 16, 10, 70, 1024, cuvs::distance::DistanceType::L2SqrtExpanded, false},
+  // very small nlist
+  {100, 10000, 16, 10, 8, 8, cuvs::distance::DistanceType::L2Expanded, false},
+  {100, 10000, 16, 10, 8, 8, cuvs::distance::DistanceType::CosineExpanded, false},
+  // smaller nlist
+  {100, 10000, 16, 10, 20, 512, cuvs::distance::DistanceType::L2Expanded, false},
+  {100, 10000, 16, 10, 20, 512, cuvs::distance::DistanceType::InnerProduct, false},
+  {100, 10000, 16, 10, 20, 512, cuvs::distance::DistanceType::CosineExpanded, false},
+  {100, 10000, 16, 10, 20, 512, cuvs::distance::DistanceType::L2SqrtExpanded, false},
+
+  // ===== Dataset size edge cases =====
+  // single query
+  {1, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {1, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+  // very few queries
+  {2, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {5, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+  // very few db vectors (nlist reduced to fit)
+  {100, 500, 16, 10, 40, 256, cuvs::distance::DistanceType::L2Expanded, false},
+  {100, 500, 16, 10, 40, 256, cuvs::distance::DistanceType::CosineExpanded, false},
+  // small db with many empty clusters
+  {100, 100, 16, 5, 20, 64, cuvs::distance::DistanceType::L2Expanded, false},
+  {100, 100, 16, 5, 20, 64, cuvs::distance::DistanceType::CosineExpanded, false},
+  // larger datasets
+  {20, 100000, 16, 10, 20, 1024, cuvs::distance::DistanceType::L2Expanded, true},
+  {20, 100000, 16, 10, 20, 1024, cuvs::distance::DistanceType::CosineExpanded, true},
+  {1000, 100000, 16, 10, 20, 1024, cuvs::distance::DistanceType::L2Expanded, true},
+  {1000, 100000, 16, 10, 20, 1024, cuvs::distance::DistanceType::CosineExpanded, true},
+  {10000, 131072, 8, 10, 20, 1024, cuvs::distance::DistanceType::L2Expanded, false},
+  {10000, 131072, 8, 10, 20, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+  {10000, 131072, 8, 10, 50, 1024, cuvs::distance::DistanceType::InnerProduct, true},
+  {10000, 131072, 8, 10, 50, 1024, cuvs::distance::DistanceType::L2SqrtExpanded, false},
+
+  // ===== Large query batches (gridDim.x > 65535) =====
+  {100000, 1024, 32, 10, 64, 64, cuvs::distance::DistanceType::L2Expanded, false},
+  {100000, 1024, 32, 10, 64, 64, cuvs::distance::DistanceType::InnerProduct, false},
+  {100000, 1024, 32, 10, 64, 64, cuvs::distance::DistanceType::CosineExpanded, false},
+  {100000, 1024, 32, 10, 64, 64, cuvs::distance::DistanceType::L2SqrtExpanded, false},
+  {100000, 8712, 3, 10, 51, 66, cuvs::distance::DistanceType::L2Expanded, false},
+  {100000, 8712, 3, 10, 51, 66, cuvs::distance::DistanceType::CosineExpanded, false},
+  // just above the old 65535 limit
+  {65536, 1024, 16, 10, 32, 64, cuvs::distance::DistanceType::L2Expanded, false},
+  {65536, 1024, 16, 10, 32, 64, cuvs::distance::DistanceType::CosineExpanded, false},
+
+  // ===== Adaptive centers (all four metrics, multiple dims) =====
+  {1000, 10000, 8, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, true},
+  {1000, 10000, 8, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct, true},
+  {1000, 10000, 8, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, true},
+  {1000, 10000, 8, 10, 40, 1024, cuvs::distance::DistanceType::L2SqrtExpanded, true},
+  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, true},
+  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct, true},
+  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, true},
+  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2SqrtExpanded, true},
+  {1000, 10000, 32, 10, 50, 1024, cuvs::distance::DistanceType::L2Expanded, true},
+  {1000, 10000, 32, 10, 50, 1024, cuvs::distance::DistanceType::InnerProduct, true},
+  {1000, 10000, 32, 10, 50, 1024, cuvs::distance::DistanceType::CosineExpanded, true},
+  {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, true},
+  {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, true},
+
+  // ===== Recall-stability: same data, different query counts =====
+  {20000, 8712, 3, 10, 51, 66, cuvs::distance::DistanceType::L2Expanded, false},
+  {50000, 8712, 3, 10, 51, 66, cuvs::distance::DistanceType::L2Expanded, false},
+};
+
+}  // namespace cuvs::neighbors::ivf_sq
diff --git a/cpp/tests/neighbors/ann_ivf_sq/test_float_uint8_t.cu b/cpp/tests/neighbors/ann_ivf_sq/test_float_uint8_t.cu
new file mode 100644
index 0000000000..02ec8a7dfc
--- /dev/null
+++ b/cpp/tests/neighbors/ann_ivf_sq/test_float_uint8_t.cu
@@ -0,0 +1,21 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <gtest/gtest.h>
+
+#include "../ann_ivf_sq.cuh"
+
+namespace cuvs::neighbors::ivf_sq {
+
+typedef AnnIVFSQTest<float, float, int64_t> AnnIVFSQTestF_float;
+TEST_P(AnnIVFSQTestF_float, AnnIVFSQ)
+{
+  this->testIVFSQ();
+  this->testFilter();
+}
+
+INSTANTIATE_TEST_CASE_P(AnnIVFSQTest, AnnIVFSQTestF_float, ::testing::ValuesIn(inputs));
+
+}  // namespace cuvs::neighbors::ivf_sq
diff --git a/python/cuvs_bench/cuvs_bench/config/algorithms.yaml b/python/cuvs_bench/cuvs_bench/config/algorithms.yaml
index fa2195fc61..3a787f65ab 100644
--- a/python/cuvs_bench/cuvs_bench/config/algorithms.yaml
+++ b/python/cuvs_bench/cuvs_bench/config/algorithms.yaml
@@ -34,6 +34,9 @@ cuvs_ivf_flat:
 cuvs_ivf_pq:
   executable: CUVS_IVF_PQ_ANN_BENCH
   requires_gpu: true
+cuvs_ivf_sq:
+  executable: CUVS_IVF_SQ_ANN_BENCH
+  requires_gpu: true
 cuvs_cagra:
   executable: CUVS_CAGRA_ANN_BENCH
   requires_gpu: true
diff --git a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_ivf_sq.yaml b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_ivf_sq.yaml
new file mode 100644
index 0000000000..711f3e8ce8
--- /dev/null
+++ b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_ivf_sq.yaml
@@ -0,0 +1,16 @@
+name: cuvs_ivf_sq
+groups:
+  base:
+    build:
+      nlist: [1024, 2048, 4096, 8192, 16384, 32000, 64000]
+      ratio: [1, 2, 4]
+      niter: [20, 25]
+    search:
+      nprobe: [1, 5, 10, 50, 100, 200, 500, 1000, 2000]
+  test:
+    build:
+      nlist: [1024]
+      ratio: [1]
+      niter: [20]
+    search:
+      nprobe: [1, 5]

From cf19a8629c3377426a2a6bfa3b3e7d900044b42a Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Mon, 2 Mar 2026 11:25:03 +0100
Subject: [PATCH 02/43] add IVF-SQ bench constraints

---
 .../cuvs_bench/config/algos/constraints/__init__.py       | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/python/cuvs_bench/cuvs_bench/config/algos/constraints/__init__.py b/python/cuvs_bench/cuvs_bench/config/algos/constraints/__init__.py
index 9111bdc3b9..ea2afe351e 100644
--- a/python/cuvs_bench/cuvs_bench/config/algos/constraints/__init__.py
+++ b/python/cuvs_bench/cuvs_bench/config/algos/constraints/__init__.py
@@ -1,5 +1,5 @@
 #
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
 # SPDX-License-Identifier: Apache-2.0
 
 
@@ -50,6 +50,12 @@ def cuvs_cagra_search(params, build_params, k, batch_size):
     return True
 
 
+def cuvs_ivf_sq_search(params, build_params, k, batch_size):
+    if "nlist" in build_params and "nprobe" in params:
+        return build_params["nlist"] >= params["nprobe"]
+    return True
+
+
 ###############################################################################
 #                              FAISS constraints                              #
 ###############################################################################

From 6a95e8a8215016a01127d9c62e97cbb2fffa1cac Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Mon, 2 Mar 2026 12:21:49 +0100
Subject: [PATCH 03/43] Update default IVF-SQ benchmark config

---
 .../cuvs_bench/config/algos/cuvs_ivf_sq.yaml    | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_ivf_sq.yaml b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_ivf_sq.yaml
index 711f3e8ce8..adaad54e04 100644
--- a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_ivf_sq.yaml
+++ b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_ivf_sq.yaml
@@ -1,12 +1,21 @@
 name: cuvs_ivf_sq
+constraints:
+  search: cuvs_bench.config.algos.constraints.cuvs_ivf_sq_search
 groups:
   base:
     build:
-      nlist: [1024, 2048, 4096, 8192, 16384, 32000, 64000]
-      ratio: [1, 2, 4]
-      niter: [20, 25]
+      nlist: [1024, 2048, 4096, 8192]
+      ratio: [1, 2]
+      niter: [25]
     search:
-      nprobe: [1, 5, 10, 50, 100, 200, 500, 1000, 2000]
+      nprobe: [1, 5, 10, 20, 50, 100, 200, 500]
+  large:
+    build:
+      nlist: [8192, 16384, 32000, 64000]
+      ratio: [2, 4]
+      niter: [20]
+    search:
+      nprobe: [10, 20, 50, 100, 200, 500, 1000, 2000]
   test:
     build:
       nlist: [1024]

From b652160cd37cd03218dfc90a0f9266f4624ca076 Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Wed, 11 Mar 2026 16:32:42 +0100
Subject: [PATCH 04/43] IVF-SQ C API

---
 c/CMakeLists.txt                           |   1 +
 c/include/cuvs/core/all.h                  |   1 +
 c/include/cuvs/neighbors/ivf_sq.h          | 373 +++++++++++++++++++++
 c/src/neighbors/ivf_sq.cpp                 | 363 ++++++++++++++++++++
 c/src/neighbors/ivf_sq.hpp                 |  14 +
 c/tests/CMakeLists.txt                     |   3 +-
 c/tests/neighbors/ann_ivf_sq_c.cu          | 130 +++++++
 c/tests/neighbors/c_api.c                  |  13 +-
 c/tests/neighbors/run_ivf_sq_c.c           |  86 +++++
 cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh |   2 +-
 10 files changed, 983 insertions(+), 3 deletions(-)
 create mode 100644 c/include/cuvs/neighbors/ivf_sq.h
 create mode 100644 c/src/neighbors/ivf_sq.cpp
 create mode 100644 c/src/neighbors/ivf_sq.hpp
 create mode 100644 c/tests/neighbors/ann_ivf_sq_c.cu
 create mode 100644 c/tests/neighbors/run_ivf_sq_c.c

diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt
index 22a25c24d0..980cb78f29 100644
--- a/c/CMakeLists.txt
+++ b/c/CMakeLists.txt
@@ -89,6 +89,7 @@ add_library(
   src/neighbors/brute_force.cpp
   src/neighbors/ivf_flat.cpp
   src/neighbors/ivf_pq.cpp
+  src/neighbors/ivf_sq.cpp
   src/neighbors/cagra.cpp
   $<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:src/neighbors/hnsw.cpp>
   $<$<BOOL:${BUILD_MG_ALGOS}>:src/neighbors/mg_ivf_pq.cpp>
diff --git a/c/include/cuvs/core/all.h b/c/include/cuvs/core/all.h
index cc83684925..0847cc63da 100644
--- a/c/include/cuvs/core/all.h
+++ b/c/include/cuvs/core/all.h
@@ -22,6 +22,7 @@
 #include <cuvs/neighbors/common.h>
 #include <cuvs/neighbors/ivf_flat.h>
 #include <cuvs/neighbors/ivf_pq.h>
+#include <cuvs/neighbors/ivf_sq.h>
 #include <cuvs/neighbors/nn_descent.h>
 #include <cuvs/neighbors/refine.h>
 #include <cuvs/neighbors/tiered_index.h>
diff --git a/c/include/cuvs/neighbors/ivf_sq.h b/c/include/cuvs/neighbors/ivf_sq.h
new file mode 100644
index 0000000000..6b312443b0
--- /dev/null
+++ b/c/include/cuvs/neighbors/ivf_sq.h
@@ -0,0 +1,373 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <cuvs/core/c_api.h>
+#include <cuvs/distance/distance.h>
+#include <cuvs/neighbors/common.h>
+#include <dlpack/dlpack.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @defgroup ivf_sq_c_index_params IVF-SQ index build parameters
+ * @{
+ */
+/**
+ * @brief Supplemental parameters to build IVF-SQ Index
+ *
+ */
+struct cuvsIvfSqIndexParams {
+  /** Distance type. */
+  cuvsDistanceType metric;
+  /** The argument used by some distance metrics. */
+  float metric_arg;
+  /**
+   * Whether to add the dataset content to the index, i.e.:
+   *
+   *  - `true` means the index is filled with the dataset vectors and ready to search after calling
+   * `build`.
+   *  - `false` means `build` only trains the underlying model (e.g. quantizer or clustering), but
+   * the index is left empty; you'd need to call `extend` on the index afterwards to populate it.
+   */
+  bool add_data_on_build;
+  /** The number of inverted lists (clusters) */
+  uint32_t n_lists;
+  /** The number of iterations searching for kmeans centers (index building). */
+  uint32_t kmeans_n_iters;
+  /** The fraction of data to use during iterative kmeans building. */
+  double kmeans_trainset_fraction;
+  /**
+   * By default (adaptive_centers = false), the cluster centers are trained in `ivf_sq::build`,
+   * and never modified in `ivf_sq::extend`. As a result, you may need to retrain the index
+   * from scratch after invoking (`ivf_sq::extend`) a few times with new data, the distribution of
+   * which is no longer representative of the original training set.
+   *
+   * The alternative behavior (adaptive_centers = true) is to update the cluster centers for new
+   * data when it is added. In this case, `index.centers()` are always exactly the centroids of the
+   * data in the corresponding clusters. The drawback of this behavior is that the centroids depend
+   * on the order of adding new data (through the classification of the added data); that is,
+   * `index.centers()` "drift" together with the changing distribution of the newly added data.
+   */
+  bool adaptive_centers;
+  /**
+   * By default, the algorithm allocates more space than necessary for individual clusters
+   * (`list_data`). This allows to amortize the cost of memory allocation and reduce the number of
+   * data copies during repeated calls to `extend` (extending the database).
+   *
+   * The alternative is the conservative allocation behavior; when enabled, the algorithm always
+   * allocates the minimum amount of memory required to store the given number of records. Set this
+   * flag to `true` if you prefer to use as little GPU memory for the database as possible.
+   */
+  bool conservative_memory_allocation;
+};
+
+typedef struct cuvsIvfSqIndexParams* cuvsIvfSqIndexParams_t;
+
+/**
+ * @brief Allocate IVF-SQ Index params, and populate with default values
+ *
+ * @param[in] index_params cuvsIvfSqIndexParams_t to allocate
+ * @return cuvsError_t
+ */
+cuvsError_t cuvsIvfSqIndexParamsCreate(cuvsIvfSqIndexParams_t* index_params);
+
+/**
+ * @brief De-allocate IVF-SQ Index params
+ *
+ * @param[in] index_params
+ * @return cuvsError_t
+ */
+cuvsError_t cuvsIvfSqIndexParamsDestroy(cuvsIvfSqIndexParams_t index_params);
+/**
+ * @}
+ */
+
+/**
+ * @defgroup ivf_sq_c_search_params IVF-SQ index search parameters
+ * @{
+ */
+/**
+ * @brief Supplemental parameters to search IVF-SQ index
+ *
+ */
+struct cuvsIvfSqSearchParams {
+  /** The number of clusters to search. */
+  uint32_t n_probes;
+};
+
+typedef struct cuvsIvfSqSearchParams* cuvsIvfSqSearchParams_t;
+
+/**
+ * @brief Allocate IVF-SQ search params, and populate with default values
+ *
+ * @param[in] params cuvsIvfSqSearchParams_t to allocate
+ * @return cuvsError_t
+ */
+cuvsError_t cuvsIvfSqSearchParamsCreate(cuvsIvfSqSearchParams_t* params);
+
+/**
+ * @brief De-allocate IVF-SQ search params
+ *
+ * @param[in] params
+ * @return cuvsError_t
+ */
+cuvsError_t cuvsIvfSqSearchParamsDestroy(cuvsIvfSqSearchParams_t params);
+/**
+ * @}
+ */
+
+/**
+ * @defgroup ivf_sq_c_index IVF-SQ index
+ * @{
+ */
+/**
+ * @brief Struct to hold address of cuvs::neighbors::ivf_sq::index and its active trained dtype
+ *
+ */
+typedef struct {
+  uintptr_t addr;
+  DLDataType dtype;
+} cuvsIvfSqIndex;
+
+typedef cuvsIvfSqIndex* cuvsIvfSqIndex_t;
+
+/**
+ * @brief Allocate IVF-SQ index
+ *
+ * @param[in] index cuvsIvfSqIndex_t to allocate
+ * @return cuvsError_t
+ */
+cuvsError_t cuvsIvfSqIndexCreate(cuvsIvfSqIndex_t* index);
+
+/**
+ * @brief De-allocate IVF-SQ index
+ *
+ * @param[in] index cuvsIvfSqIndex_t to de-allocate
+ */
+cuvsError_t cuvsIvfSqIndexDestroy(cuvsIvfSqIndex_t index);
+
+/** Get the number of clusters/inverted lists */
+cuvsError_t cuvsIvfSqIndexGetNLists(cuvsIvfSqIndex_t index, int64_t* n_lists);
+
+/** Get the dimensionality of the data */
+cuvsError_t cuvsIvfSqIndexGetDim(cuvsIvfSqIndex_t index, int64_t* dim);
+
+/** Get the size of the index */
+cuvsError_t cuvsIvfSqIndexGetSize(cuvsIvfSqIndex_t index, int64_t* size);
+
+/**
+ * @brief Get the cluster centers corresponding to the lists [n_lists, dim]
+ *
+ * @param[in] index cuvsIvfSqIndex_t Built Ivf-SQ Index
+ * @param[out] centers Preallocated array on host or device memory to store output, [n_lists, dim]
+ * @return cuvsError_t
+ */
+cuvsError_t cuvsIvfSqIndexGetCenters(cuvsIvfSqIndex_t index, DLManagedTensor* centers);
+
+/**
+ * @}
+ */
+
+/**
+ * @defgroup ivf_sq_c_index_build IVF-SQ index build
+ * @{
+ */
+/**
+ * @brief Build an IVF-SQ index with a `DLManagedTensor` which has underlying
+ *        `DLDeviceType` equal to `kDLCUDA`, `kDLCUDAHost`, `kDLCUDAManaged`,
+ *        or `kDLCPU`. Also, acceptable underlying types are:
+ *        1. `kDLDataType.code == kDLFloat` and `kDLDataType.bits = 32`
+ *        2. `kDLDataType.code == kDLFloat` and `kDLDataType.bits = 16`
+ *
+ * @code {.c}
+ * #include <cuvs/core/c_api.h>
+ * #include <cuvs/neighbors/ivf_sq.h>
+ *
+ * // Create cuvsResources_t
+ * cuvsResources_t res;
+ * cuvsError_t res_create_status = cuvsResourcesCreate(&res);
+ *
+ * // Assume a populated `DLManagedTensor` type here
+ * DLManagedTensor dataset;
+ *
+ * // Create default index params
+ * cuvsIvfSqIndexParams_t index_params;
+ * cuvsError_t params_create_status = cuvsIvfSqIndexParamsCreate(&index_params);
+ *
+ * // Create IVF-SQ index
+ * cuvsIvfSqIndex_t index;
+ * cuvsError_t index_create_status = cuvsIvfSqIndexCreate(&index);
+ *
+ * // Build the IVF-SQ Index
+ * cuvsError_t build_status = cuvsIvfSqBuild(res, index_params, &dataset, index);
+ *
+ * // de-allocate `index_params`, `index` and `res`
+ * cuvsError_t params_destroy_status = cuvsIvfSqIndexParamsDestroy(index_params);
+ * cuvsError_t index_destroy_status = cuvsIvfSqIndexDestroy(index);
+ * cuvsError_t res_destroy_status = cuvsResourcesDestroy(res);
+ * @endcode
+ *
+ * @param[in] res cuvsResources_t opaque C handle
+ * @param[in] index_params cuvsIvfSqIndexParams_t used to build IVF-SQ index
+ * @param[in] dataset DLManagedTensor* training dataset
+ * @param[out] index cuvsIvfSqIndex_t Newly built IVF-SQ index
+ * @return cuvsError_t
+ */
+cuvsError_t cuvsIvfSqBuild(cuvsResources_t res,
+                           cuvsIvfSqIndexParams_t index_params,
+                           DLManagedTensor* dataset,
+                           cuvsIvfSqIndex_t index);
+/**
+ * @}
+ */
+
+/**
+ * @defgroup ivf_sq_c_index_search IVF-SQ index search
+ * @{
+ */
+/**
+ * @brief Search an IVF-SQ index with a `DLManagedTensor` which has underlying
+ *        `DLDeviceType` equal to `kDLCUDA`, `kDLCUDAHost`, `kDLCUDAManaged`.
+ *        Types for input are:
+ *        1. `queries`: `kDLDataType.code == kDLFloat` and `kDLDataType.bits = 32` or 16
+ *        2. `neighbors`: `kDLDataType.code == kDLInt` and `kDLDataType.bits = 64`
+ *        3. `distances`: `kDLDataType.code == kDLFloat` and `kDLDataType.bits = 32`
+ *
+ * @code {.c}
+ * #include <cuvs/core/c_api.h>
+ * #include <cuvs/neighbors/ivf_sq.h>
+ *
+ * // Create cuvsResources_t
+ * cuvsResources_t res;
+ * cuvsError_t res_create_status = cuvsResourcesCreate(&res);
+ *
+ * // Assume a populated `DLManagedTensor` type here
+ * DLManagedTensor queries;
+ * DLManagedTensor neighbors;
+ * DLManagedTensor distances;
+ *
+ * // Create default search params
+ * cuvsIvfSqSearchParams_t search_params;
+ * cuvsError_t params_create_status = cuvsIvfSqSearchParamsCreate(&search_params);
+ *
+ * // Search the `index` built using `cuvsIvfSqBuild`
+ * cuvsError_t search_status = cuvsIvfSqSearch(res, search_params, index, &queries, &neighbors,
+ * &distances);
+ *
+ * // de-allocate `search_params` and `res`
+ * cuvsError_t params_destroy_status = cuvsIvfSqSearchParamsDestroy(search_params);
+ * cuvsError_t res_destroy_status = cuvsResourcesDestroy(res);
+ * @endcode
+ *
+ * @param[in] res cuvsResources_t opaque C handle
+ * @param[in] search_params cuvsIvfSqSearchParams_t used to search IVF-SQ index
+ * @param[in] index ivfSqIndex which has been returned by `cuvsIvfSqBuild`
+ * @param[in] queries DLManagedTensor* queries dataset to search
+ * @param[out] neighbors DLManagedTensor* output `k` neighbors for queries
+ * @param[out] distances DLManagedTensor* output `k` distances for queries
+ */
+cuvsError_t cuvsIvfSqSearch(cuvsResources_t res,
+                            cuvsIvfSqSearchParams_t search_params,
+                            cuvsIvfSqIndex_t index,
+                            DLManagedTensor* queries,
+                            DLManagedTensor* neighbors,
+                            DLManagedTensor* distances);
+
+/**
+ * @brief Search an IVF-SQ index with filtering.
+ *
+ * Same as cuvsIvfSqSearch, but applies a pre-filter to exclude vectors during search.
+ *
+ * @param[in] res cuvsResources_t opaque C handle
+ * @param[in] search_params cuvsIvfSqSearchParams_t used to search IVF-SQ index
+ * @param[in] index ivfSqIndex which has been returned by `cuvsIvfSqBuild`
+ * @param[in] queries DLManagedTensor* queries dataset to search
+ * @param[out] neighbors DLManagedTensor* output `k` neighbors for queries
+ * @param[out] distances DLManagedTensor* output `k` distances for queries
+ * @param[in] filter cuvsFilter to filter neighbors based on the given bitset
+ */
+cuvsError_t cuvsIvfSqSearchWithFilter(cuvsResources_t res,
+                                      cuvsIvfSqSearchParams_t search_params,
+                                      cuvsIvfSqIndex_t index,
+                                      DLManagedTensor* queries,
+                                      DLManagedTensor* neighbors,
+                                      DLManagedTensor* distances,
+                                      cuvsFilter filter);
+
+/**
+ * @}
+ */
+
+/**
+ * @defgroup ivf_sq_c_index_serialize IVF-SQ C-API serialize functions
+ * @{
+ */
+/**
+ * Save the index to file.
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @code{.c}
+ * #include <cuvs/neighbors/ivf_sq.h>
+ *
+ * // Create cuvsResources_t
+ * cuvsResources_t res;
+ * cuvsError_t res_create_status = cuvsResourcesCreate(&res);
+ *
+ * // create an index with `cuvsIvfSqBuild`
+ * cuvsIvfSqSerialize(res, "/path/to/index", index);
+ * @endcode
+ *
+ * @param[in] res cuvsResources_t opaque C handle
+ * @param[in] filename the file name for saving the index
+ * @param[in] index IVF-SQ index
+ */
+cuvsError_t cuvsIvfSqSerialize(cuvsResources_t res, const char* filename, cuvsIvfSqIndex_t index);
+
+/**
+ * Load index from file.
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @param[in] res cuvsResources_t opaque C handle
+ * @param[in] filename the name of the file that stores the index
+ * @param[out] index IVF-SQ index loaded from disk
+ */
+cuvsError_t cuvsIvfSqDeserialize(cuvsResources_t res,
+                                 const char* filename,
+                                 cuvsIvfSqIndex_t index);
+/**
+ * @}
+ */
+
+/**
+ * @defgroup ivf_sq_c_index_extend IVF-SQ index extend
+ * @{
+ */
+/**
+ * @brief Extend the index with the new data.
+ *
+ * @param[in] res cuvsResources_t opaque C handle
+ * @param[in] new_vectors DLManagedTensor* the new vectors to add to the index
+ * @param[in] new_indices DLManagedTensor* vector of new indices for the new vectors
+ * @param[inout] index IVF-SQ index to be extended
+ * @return cuvsError_t
+ */
+cuvsError_t cuvsIvfSqExtend(cuvsResources_t res,
+                            DLManagedTensor* new_vectors,
+                            DLManagedTensor* new_indices,
+                            cuvsIvfSqIndex_t index);
+/**
+ * @}
+ */
+#ifdef __cplusplus
+}
+#endif
diff --git a/c/src/neighbors/ivf_sq.cpp b/c/src/neighbors/ivf_sq.cpp
new file mode 100644
index 0000000000..eadf84a299
--- /dev/null
+++ b/c/src/neighbors/ivf_sq.cpp
@@ -0,0 +1,363 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <cstdint>
+#include <dlpack/dlpack.h>
+
+#include <raft/core/error.hpp>
+#include <raft/core/mdspan_types.hpp>
+#include <raft/core/resources.hpp>
+
+#include <cuvs/core/c_api.h>
+#include <cuvs/neighbors/ivf_sq.h>
+#include <cuvs/neighbors/ivf_sq.hpp>
+
+#include "../core/exceptions.hpp"
+#include "../core/interop.hpp"
+
+namespace cuvs::neighbors::ivf_sq {
+void convert_c_index_params(cuvsIvfSqIndexParams params,
+                            cuvs::neighbors::ivf_sq::index_params* out)
+{
+  out->metric                        = static_cast<cuvs::distance::DistanceType>((int)params.metric);
+  out->metric_arg                    = params.metric_arg;
+  out->add_data_on_build             = params.add_data_on_build;
+  out->n_lists                       = params.n_lists;
+  out->kmeans_n_iters                = params.kmeans_n_iters;
+  out->kmeans_trainset_fraction      = params.kmeans_trainset_fraction;
+  out->adaptive_centers              = params.adaptive_centers;
+  out->conservative_memory_allocation = params.conservative_memory_allocation;
+}
+void convert_c_search_params(cuvsIvfSqSearchParams params,
+                             cuvs::neighbors::ivf_sq::search_params* out)
+{
+  out->n_probes = params.n_probes;
+}
+}  // namespace cuvs::neighbors::ivf_sq
+
+namespace {
+
+template <typename T>
+void* _build(cuvsResources_t res, cuvsIvfSqIndexParams params, DLManagedTensor* dataset_tensor)
+{
+  auto res_ptr = reinterpret_cast<raft::resources*>(res);
+
+  auto build_params = cuvs::neighbors::ivf_sq::index_params();
+  cuvs::neighbors::ivf_sq::convert_c_index_params(params, &build_params);
+
+  auto dataset = dataset_tensor->dl_tensor;
+  auto dim     = dataset.shape[1];
+
+  auto index = new cuvs::neighbors::ivf_sq::index<uint8_t>(*res_ptr, build_params, dim);
+
+  if (cuvs::core::is_dlpack_device_compatible(dataset)) {
+    using mdspan_type = raft::device_matrix_view<const T, int64_t, raft::row_major>;
+    auto mds          = cuvs::core::from_dlpack<mdspan_type>(dataset_tensor);
+    cuvs::neighbors::ivf_sq::build(*res_ptr, build_params, mds, *index);
+  } else {
+    using mdspan_type = raft::host_matrix_view<T const, int64_t, raft::row_major>;
+    auto mds          = cuvs::core::from_dlpack<mdspan_type>(dataset_tensor);
+    cuvs::neighbors::ivf_sq::build(*res_ptr, build_params, mds, *index);
+  }
+
+  return index;
+}
+
+template <typename T>
+void _search(cuvsResources_t res,
+             cuvsIvfSqSearchParams params,
+             cuvsIvfSqIndex index,
+             DLManagedTensor* queries_tensor,
+             DLManagedTensor* neighbors_tensor,
+             DLManagedTensor* distances_tensor,
+             cuvsFilter* filter)
+{
+  auto res_ptr   = reinterpret_cast<raft::resources*>(res);
+  auto index_ptr = reinterpret_cast<cuvs::neighbors::ivf_sq::index<uint8_t>*>(index.addr);
+
+  auto search_params = cuvs::neighbors::ivf_sq::search_params();
+  cuvs::neighbors::ivf_sq::convert_c_search_params(params, &search_params);
+
+  using queries_mdspan_type   = raft::device_matrix_view<const T, int64_t, raft::row_major>;
+  using neighbors_mdspan_type = raft::device_matrix_view<int64_t, int64_t, raft::row_major>;
+  using distances_mdspan_type = raft::device_matrix_view<float, int64_t, raft::row_major>;
+  auto queries_mds            = cuvs::core::from_dlpack<queries_mdspan_type>(queries_tensor);
+  auto neighbors_mds          = cuvs::core::from_dlpack<neighbors_mdspan_type>(neighbors_tensor);
+  auto distances_mds          = cuvs::core::from_dlpack<distances_mdspan_type>(distances_tensor);
+
+  if (filter == nullptr || filter->type == NO_FILTER) {
+    cuvs::neighbors::ivf_sq::search(
+      *res_ptr, search_params, *index_ptr, queries_mds, neighbors_mds, distances_mds);
+  } else if (filter->type == BITSET) {
+    using filter_mdspan_type    = raft::device_vector_view<std::uint32_t, int64_t, raft::row_major>;
+    auto removed_indices_tensor = reinterpret_cast<DLManagedTensor*>(filter->addr);
+    auto removed_indices = cuvs::core::from_dlpack<filter_mdspan_type>(removed_indices_tensor);
+    cuvs::core::bitset_view<std::uint32_t, int64_t> removed_indices_bitset(removed_indices,
+                                                                           index_ptr->size());
+    auto bitset_filter_obj = cuvs::neighbors::filtering::bitset_filter(removed_indices_bitset);
+    cuvs::neighbors::ivf_sq::search(*res_ptr,
+                                    search_params,
+                                    *index_ptr,
+                                    queries_mds,
+                                    neighbors_mds,
+                                    distances_mds,
+                                    bitset_filter_obj);
+  } else {
+    RAFT_FAIL("Unsupported filter type: BITMAP");
+  }
+}
+
+void _serialize(cuvsResources_t res, const char* filename, cuvsIvfSqIndex index)
+{
+  auto res_ptr   = reinterpret_cast<raft::resources*>(res);
+  auto index_ptr = reinterpret_cast<cuvs::neighbors::ivf_sq::index<uint8_t>*>(index.addr);
+  cuvs::neighbors::ivf_sq::serialize(*res_ptr, std::string(filename), *index_ptr);
+}
+
+void* _deserialize(cuvsResources_t res, const char* filename)
+{
+  auto res_ptr = reinterpret_cast<raft::resources*>(res);
+  auto index   = new cuvs::neighbors::ivf_sq::index<uint8_t>(*res_ptr);
+  cuvs::neighbors::ivf_sq::deserialize(*res_ptr, std::string(filename), index);
+  return index;
+}
+
+template <typename T>
+void _extend(cuvsResources_t res,
+             DLManagedTensor* new_vectors,
+             DLManagedTensor* new_indices,
+             cuvsIvfSqIndex index)
+{
+  auto res_ptr   = reinterpret_cast<raft::resources*>(res);
+  auto index_ptr = reinterpret_cast<cuvs::neighbors::ivf_sq::index<uint8_t>*>(index.addr);
+
+  bool on_device = cuvs::core::is_dlpack_device_compatible(new_vectors->dl_tensor);
+  if (on_device != cuvs::core::is_dlpack_device_compatible(new_indices->dl_tensor)) {
+    RAFT_FAIL("extend inputs must both either be on device memory or host memory");
+  }
+
+  if (on_device) {
+    using vectors_mdspan_type = raft::device_matrix_view<const T, int64_t, raft::row_major>;
+    using indices_mdspan_type = raft::device_vector_view<int64_t, int64_t>;
+    auto vectors_mds          = cuvs::core::from_dlpack<vectors_mdspan_type>(new_vectors);
+    auto indices_mds          = cuvs::core::from_dlpack<indices_mdspan_type>(new_indices);
+    cuvs::neighbors::ivf_sq::extend(*res_ptr, vectors_mds, indices_mds, index_ptr);
+  } else {
+    using vectors_mdspan_type = raft::host_matrix_view<const T, int64_t, raft::row_major>;
+    using indices_mdspan_type = raft::host_vector_view<int64_t, int64_t>;
+    auto vectors_mds          = cuvs::core::from_dlpack<vectors_mdspan_type>(new_vectors);
+    auto indices_mds          = cuvs::core::from_dlpack<indices_mdspan_type>(new_indices);
+    cuvs::neighbors::ivf_sq::extend(*res_ptr, vectors_mds, indices_mds, index_ptr);
+  }
+}
+
+void _get_centers(cuvsIvfSqIndex index, DLManagedTensor* centers)
+{
+  auto index_ptr = reinterpret_cast<cuvs::neighbors::ivf_sq::index<uint8_t>*>(index.addr);
+  cuvs::core::to_dlpack(index_ptr->centers(), centers);
+}
+}  // namespace
+
+extern "C" cuvsError_t cuvsIvfSqIndexCreate(cuvsIvfSqIndex_t* index)
+{
+  return cuvs::core::translate_exceptions([=] { *index = new cuvsIvfSqIndex{}; });
+}
+
+extern "C" cuvsError_t cuvsIvfSqIndexDestroy(cuvsIvfSqIndex_t index_c_ptr)
+{
+  return cuvs::core::translate_exceptions([=] {
+    auto index     = *index_c_ptr;
+    auto index_ptr = reinterpret_cast<cuvs::neighbors::ivf_sq::index<uint8_t>*>(index.addr);
+    delete index_ptr;
+    delete index_c_ptr;
+  });
+}
+
+extern "C" cuvsError_t cuvsIvfSqBuild(cuvsResources_t res,
+                                      cuvsIvfSqIndexParams_t params,
+                                      DLManagedTensor* dataset_tensor,
+                                      cuvsIvfSqIndex_t index)
+{
+  return cuvs::core::translate_exceptions([=] {
+    auto dataset = dataset_tensor->dl_tensor;
+
+    index->dtype.code = dataset.dtype.code;
+    index->dtype.bits = dataset.dtype.bits;
+
+    if (dataset.dtype.code == kDLFloat && dataset.dtype.bits == 32) {
+      index->addr = reinterpret_cast<uintptr_t>(_build<float>(res, *params, dataset_tensor));
+    } else if (dataset.dtype.code == kDLFloat && dataset.dtype.bits == 16) {
+      index->addr = reinterpret_cast<uintptr_t>(_build<half>(res, *params, dataset_tensor));
+    } else {
+      RAFT_FAIL("Unsupported dataset DLtensor dtype: %d and bits: %d",
+                dataset.dtype.code,
+                dataset.dtype.bits);
+    }
+  });
+}
+
+static cuvsError_t _cuvsIvfSqSearchImpl(cuvsResources_t res,
+                                        cuvsIvfSqSearchParams_t params,
+                                        cuvsIvfSqIndex_t index_c_ptr,
+                                        DLManagedTensor* queries_tensor,
+                                        DLManagedTensor* neighbors_tensor,
+                                        DLManagedTensor* distances_tensor,
+                                        cuvsFilter* filter)
+{
+  return cuvs::core::translate_exceptions([=] {
+    auto queries   = queries_tensor->dl_tensor;
+    auto neighbors = neighbors_tensor->dl_tensor;
+    auto distances = distances_tensor->dl_tensor;
+
+    RAFT_EXPECTS(cuvs::core::is_dlpack_device_compatible(queries),
+                 "queries should have device compatible memory");
+    RAFT_EXPECTS(cuvs::core::is_dlpack_device_compatible(neighbors),
+                 "neighbors should have device compatible memory");
+    RAFT_EXPECTS(cuvs::core::is_dlpack_device_compatible(distances),
+                 "distances should have device compatible memory");
+
+    RAFT_EXPECTS(neighbors.dtype.code == kDLInt && neighbors.dtype.bits == 64,
+                 "neighbors should be of type int64_t");
+    RAFT_EXPECTS(distances.dtype.code == kDLFloat && distances.dtype.bits == 32,
+                 "distances should be of type float32");
+
+    auto index = *index_c_ptr;
+    if (queries.dtype.code == kDLFloat && queries.dtype.bits == 32) {
+      _search<float>(
+        res, *params, index, queries_tensor, neighbors_tensor, distances_tensor, filter);
+    } else if (queries.dtype.code == kDLFloat && queries.dtype.bits == 16) {
+      _search<half>(
+        res, *params, index, queries_tensor, neighbors_tensor, distances_tensor, filter);
+    } else {
+      RAFT_FAIL("Unsupported queries DLtensor dtype: %d and bits: %d",
+                queries.dtype.code,
+                queries.dtype.bits);
+    }
+  });
+}
+
+extern "C" cuvsError_t cuvsIvfSqSearch(cuvsResources_t res,
+                                       cuvsIvfSqSearchParams_t params,
+                                       cuvsIvfSqIndex_t index_c_ptr,
+                                       DLManagedTensor* queries_tensor,
+                                       DLManagedTensor* neighbors_tensor,
+                                       DLManagedTensor* distances_tensor)
+{
+  return _cuvsIvfSqSearchImpl(
+    res, params, index_c_ptr, queries_tensor, neighbors_tensor, distances_tensor, nullptr);
+}
+
+extern "C" cuvsError_t cuvsIvfSqSearchWithFilter(cuvsResources_t res,
+                                                  cuvsIvfSqSearchParams_t params,
+                                                  cuvsIvfSqIndex_t index_c_ptr,
+                                                  DLManagedTensor* queries_tensor,
+                                                  DLManagedTensor* neighbors_tensor,
+                                                  DLManagedTensor* distances_tensor,
+                                                  cuvsFilter filter)
+{
+  return _cuvsIvfSqSearchImpl(
+    res, params, index_c_ptr, queries_tensor, neighbors_tensor, distances_tensor, &filter);
+}
+
+extern "C" cuvsError_t cuvsIvfSqIndexParamsCreate(cuvsIvfSqIndexParams_t* params)
+{
+  return cuvs::core::translate_exceptions([=] {
+    *params = new cuvsIvfSqIndexParams{.metric                         = L2Expanded,
+                                       .metric_arg                     = 2.0f,
+                                       .add_data_on_build              = true,
+                                       .n_lists                        = 1024,
+                                       .kmeans_n_iters                 = 20,
+                                       .kmeans_trainset_fraction       = 0.5,
+                                       .adaptive_centers               = false,
+                                       .conservative_memory_allocation = false};
+  });
+}
+
+extern "C" cuvsError_t cuvsIvfSqIndexParamsDestroy(cuvsIvfSqIndexParams_t params)
+{
+  return cuvs::core::translate_exceptions([=] { delete params; });
+}
+
+extern "C" cuvsError_t cuvsIvfSqSearchParamsCreate(cuvsIvfSqSearchParams_t* params)
+{
+  return cuvs::core::translate_exceptions(
+    [=] { *params = new cuvsIvfSqSearchParams{.n_probes = 20}; });
+}
+
+extern "C" cuvsError_t cuvsIvfSqSearchParamsDestroy(cuvsIvfSqSearchParams_t params)
+{
+  return cuvs::core::translate_exceptions([=] { delete params; });
+}
+
+extern "C" cuvsError_t cuvsIvfSqDeserialize(cuvsResources_t res,
+                                            const char* filename,
+                                            cuvsIvfSqIndex_t index)
+{
+  return cuvs::core::translate_exceptions(
+    [=] { index->addr = reinterpret_cast<uintptr_t>(_deserialize(res, filename)); });
+}
+
+extern "C" cuvsError_t cuvsIvfSqSerialize(cuvsResources_t res,
+                                          const char* filename,
+                                          cuvsIvfSqIndex_t index)
+{
+  return cuvs::core::translate_exceptions([=] { _serialize(res, filename, *index); });
+}
+
+extern "C" cuvsError_t cuvsIvfSqExtend(cuvsResources_t res,
+                                       DLManagedTensor* new_vectors,
+                                       DLManagedTensor* new_indices,
+                                       cuvsIvfSqIndex_t index)
+{
+  return cuvs::core::translate_exceptions([=] {
+    auto vectors = new_vectors->dl_tensor;
+
+    if (index->dtype.code == 0 && index->dtype.bits == 0) {
+      index->dtype.code = vectors.dtype.code;
+      index->dtype.bits = vectors.dtype.bits;
+    }
+
+    if (vectors.dtype.code == kDLFloat && vectors.dtype.bits == 32) {
+      _extend<float>(res, new_vectors, new_indices, *index);
+    } else if (vectors.dtype.code == kDLFloat && vectors.dtype.bits == 16) {
+      _extend<half>(res, new_vectors, new_indices, *index);
+    } else {
+      RAFT_FAIL(
+        "Unsupported vectors DLtensor dtype: %d and bits: %d", vectors.dtype.code, vectors.dtype.bits);
+    }
+  });
+}
+
+extern "C" cuvsError_t cuvsIvfSqIndexGetNLists(cuvsIvfSqIndex_t index, int64_t* n_lists)
+{
+  return cuvs::core::translate_exceptions([=] {
+    auto index_ptr =
+      reinterpret_cast<cuvs::neighbors::ivf_sq::index<uint8_t>*>(index->addr);
+    *n_lists = index_ptr->n_lists();
+  });
+}
+
+extern "C" cuvsError_t cuvsIvfSqIndexGetDim(cuvsIvfSqIndex_t index, int64_t* dim)
+{
+  return cuvs::core::translate_exceptions([=] {
+    auto index_ptr =
+      reinterpret_cast<cuvs::neighbors::ivf_sq::index<uint8_t>*>(index->addr);
+    *dim = index_ptr->dim();
+  });
+}
+
+extern "C" cuvsError_t cuvsIvfSqIndexGetSize(cuvsIvfSqIndex_t index, int64_t* size)
+{
+  return cuvs::core::translate_exceptions([=] {
+    auto index_ptr =
+      reinterpret_cast<cuvs::neighbors::ivf_sq::index<uint8_t>*>(index->addr);
+    *size = index_ptr->size();
+  });
+}
+
+extern "C" cuvsError_t cuvsIvfSqIndexGetCenters(cuvsIvfSqIndex_t index, DLManagedTensor* centers)
+{
+  return cuvs::core::translate_exceptions([=] { _get_centers(*index, centers); });
+}
diff --git a/c/src/neighbors/ivf_sq.hpp b/c/src/neighbors/ivf_sq.hpp
new file mode 100644
index 0000000000..3a08bc689a
--- /dev/null
+++ b/c/src/neighbors/ivf_sq.hpp
@@ -0,0 +1,14 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include <cuvs/neighbors/ivf_sq.h>
+#include <cuvs/neighbors/ivf_sq.hpp>
+
+namespace cuvs::neighbors::ivf_sq {
+/// Converts a cuvsIvfSqIndexParams struct (c) to a ivf_sq::index_params (C++) struct
+void convert_c_index_params(cuvsIvfSqIndexParams params,
+                            cuvs::neighbors::ivf_sq::index_params* out);
+void convert_c_search_params(cuvsIvfSqSearchParams params,
+                             cuvs::neighbors::ivf_sq::search_params* out);
+}  // namespace cuvs::neighbors::ivf_sq
diff --git a/c/tests/CMakeLists.txt b/c/tests/CMakeLists.txt
index 6d52e5b174..9c96fc4120 100644
--- a/c/tests/CMakeLists.txt
+++ b/c/tests/CMakeLists.txt
@@ -1,6 +1,6 @@
 # =============================================================================
 # cmake-format: off
-# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION.
 # SPDX-License-Identifier: Apache-2.0
 # cmake-format: on
 # =============================================================================
@@ -77,6 +77,7 @@ ConfigureTest(
 ConfigureTest(NAME BRUTEFORCE_C_TEST PATH neighbors/run_brute_force_c.c neighbors/brute_force_c.cu)
 ConfigureTest(NAME IVF_FLAT_C_TEST PATH neighbors/run_ivf_flat_c.c neighbors/ann_ivf_flat_c.cu)
 ConfigureTest(NAME IVF_PQ_C_TEST PATH neighbors/run_ivf_pq_c.c neighbors/ann_ivf_pq_c.cu)
+ConfigureTest(NAME IVF_SQ_C_TEST PATH neighbors/run_ivf_sq_c.c neighbors/ann_ivf_sq_c.cu)
 ConfigureTest(NAME CAGRA_C_TEST PATH neighbors/ann_cagra_c.cu)
 ConfigureTest(NAME MG_C_TEST PATH neighbors/run_mg_c.c neighbors/ann_mg_c.cu)
 ConfigureTest(
diff --git a/c/tests/neighbors/ann_ivf_sq_c.cu b/c/tests/neighbors/ann_ivf_sq_c.cu
new file mode 100644
index 0000000000..c36786e45a
--- /dev/null
+++ b/c/tests/neighbors/ann_ivf_sq_c.cu
@@ -0,0 +1,130 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <cuda.h>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/handle.hpp>
+#include <raft/random/rng.cuh>
+
+#include "neighbors/ann_utils.cuh"
+#include <cuvs/neighbors/ivf_sq.h>
+
+extern "C" void run_ivf_sq(int64_t n_rows,
+                           int64_t n_queries,
+                           int64_t n_dim,
+                           uint32_t n_neighbors,
+                           float* index_data,
+                           float* query_data,
+                           float* distances_data,
+                           int64_t* neighbors_data,
+                           cuvsDistanceType metric,
+                           size_t n_probes,
+                           size_t n_lists);
+
+template <typename T>
+void generate_random_data(T* devPtr, size_t size)
+{
+  raft::handle_t handle;
+  raft::random::RngState r(1234ULL);
+  raft::random::uniform(handle, r, devPtr, size, T(0.1), T(2.0));
+};
+
+template <typename T, typename IdxT>
+void recall_eval(T* query_data,
+                 T* index_data,
+                 IdxT* neighbors,
+                 T* distances,
+                 size_t n_queries,
+                 size_t n_rows,
+                 size_t n_dim,
+                 size_t n_neighbors,
+                 cuvsDistanceType metric,
+                 size_t n_probes,
+                 size_t n_lists)
+{
+  raft::handle_t handle;
+  auto distances_ref = raft::make_device_matrix<T, IdxT>(handle, n_queries, n_neighbors);
+  auto neighbors_ref = raft::make_device_matrix<IdxT, IdxT>(handle, n_queries, n_neighbors);
+  cuvs::neighbors::naive_knn<T, T, IdxT>(
+    handle,
+    distances_ref.data_handle(),
+    neighbors_ref.data_handle(),
+    query_data,
+    index_data,
+    n_queries,
+    n_rows,
+    n_dim,
+    n_neighbors,
+    static_cast<cuvs::distance::DistanceType>((uint16_t)metric));
+
+  size_t size = n_queries * n_neighbors;
+  std::vector<IdxT> neighbors_h(size);
+  std::vector<T> distances_h(size);
+  std::vector<IdxT> neighbors_ref_h(size);
+  std::vector<T> distances_ref_h(size);
+
+  auto stream = raft::resource::get_cuda_stream(handle);
+  raft::copy(neighbors_h.data(), neighbors, size, stream);
+  raft::copy(distances_h.data(), distances, size, stream);
+  raft::copy(neighbors_ref_h.data(), neighbors_ref.data_handle(), size, stream);
+  raft::copy(distances_ref_h.data(), distances_ref.data_handle(), size, stream);
+
+  double min_recall = static_cast<double>(n_probes) / static_cast<double>(n_lists);
+  ASSERT_TRUE(cuvs::neighbors::eval_neighbours(neighbors_ref_h,
+                                               neighbors_h,
+                                               distances_ref_h,
+                                               distances_h,
+                                               n_queries,
+                                               n_neighbors,
+                                               0.001,
+                                               min_recall));
+};
+
+TEST(IvfSqC, BuildSearch)
+{
+  int64_t n_rows       = 8096;
+  int64_t n_queries    = 128;
+  int64_t n_dim        = 32;
+  uint32_t n_neighbors = 8;
+
+  raft::handle_t handle;
+  auto stream = raft::resource::get_cuda_stream(handle);
+
+  cuvsDistanceType metric = L2Expanded;
+  size_t n_probes         = 20;
+  size_t n_lists          = 1024;
+
+  rmm::device_uvector<float> index_data(n_rows * n_dim, stream);
+  rmm::device_uvector<float> query_data(n_queries * n_dim, stream);
+  rmm::device_uvector<int64_t> neighbors_data(n_queries * n_neighbors, stream);
+  rmm::device_uvector<float> distances_data(n_queries * n_neighbors, stream);
+
+  generate_random_data(index_data.data(), n_rows * n_dim);
+  generate_random_data(query_data.data(), n_queries * n_dim);
+
+  run_ivf_sq(n_rows,
+             n_queries,
+             n_dim,
+             n_neighbors,
+             index_data.data(),
+             query_data.data(),
+             distances_data.data(),
+             neighbors_data.data(),
+             metric,
+             n_probes,
+             n_lists);
+
+  recall_eval(query_data.data(),
+              index_data.data(),
+              neighbors_data.data(),
+              distances_data.data(),
+              n_queries,
+              n_rows,
+              n_dim,
+              n_neighbors,
+              metric,
+              n_probes,
+              n_lists);
+}
diff --git a/c/tests/neighbors/c_api.c b/c/tests/neighbors/c_api.c
index 6988aaf618..86108ea703 100644
--- a/c/tests/neighbors/c_api.c
+++ b/c/tests/neighbors/c_api.c
@@ -1,11 +1,12 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
 #include <cuvs/core/c_api.h>
 #include <cuvs/neighbors/all_neighbors.h>
 #include <cuvs/neighbors/cagra.h>
+#include <cuvs/neighbors/ivf_sq.h>
 #include <cuvs/neighbors/tiered_index.h>
 
 #include <dlpack/dlpack.h>
@@ -47,6 +48,15 @@ void test_compile_tiered_index()
   cuvsTieredIndexExtend(resources, &dataset, tiered_index);
 }
 
+void test_compile_ivf_sq()
+{
+  assert(!"test_compile_ivf_sq is not meant to be run");
+
+  cuvsIvfSqIndex_t index;
+  cuvsIvfSqIndexCreate(&index);
+  cuvsIvfSqIndexDestroy(index);
+}
+
 void test_compile_all_neighbors()
 {
   // Smoke test to ensure that the all_neighbors.h API compiles correctly
@@ -66,6 +76,7 @@ int main()
   // These are smoke tests that check that the C-APIs compile with a C compiler.
   // These are not meant to be run.
   test_compile_cagra();
+  test_compile_ivf_sq();
   test_compile_tiered_index();
   test_compile_all_neighbors();
 
diff --git a/c/tests/neighbors/run_ivf_sq_c.c b/c/tests/neighbors/run_ivf_sq_c.c
new file mode 100644
index 0000000000..d07502abd6
--- /dev/null
+++ b/c/tests/neighbors/run_ivf_sq_c.c
@@ -0,0 +1,86 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <cuvs/neighbors/ivf_sq.h>
+
+void run_ivf_sq(int64_t n_rows,
+                int64_t n_queries,
+                int64_t n_dim,
+                uint32_t n_neighbors,
+                float* index_data,
+                float* query_data,
+                float* distances_data,
+                int64_t* neighbors_data,
+                cuvsDistanceType metric,
+                size_t n_probes,
+                size_t n_lists)
+{
+  cuvsResources_t res;
+  cuvsResourcesCreate(&res);
+
+  DLManagedTensor dataset_tensor;
+  dataset_tensor.dl_tensor.data              = index_data;
+  dataset_tensor.dl_tensor.device.device_type = kDLCUDA;
+  dataset_tensor.dl_tensor.ndim              = 2;
+  dataset_tensor.dl_tensor.dtype.code        = kDLFloat;
+  dataset_tensor.dl_tensor.dtype.bits        = 32;
+  dataset_tensor.dl_tensor.dtype.lanes       = 1;
+  int64_t dataset_shape[2]                   = {n_rows, n_dim};
+  dataset_tensor.dl_tensor.shape             = dataset_shape;
+  dataset_tensor.dl_tensor.strides           = NULL;
+
+  cuvsIvfSqIndex_t index;
+  cuvsIvfSqIndexCreate(&index);
+
+  cuvsIvfSqIndexParams_t build_params;
+  cuvsIvfSqIndexParamsCreate(&build_params);
+  build_params->metric  = metric;
+  build_params->n_lists = n_lists;
+  cuvsIvfSqBuild(res, build_params, &dataset_tensor, index);
+
+  DLManagedTensor queries_tensor;
+  queries_tensor.dl_tensor.data              = (void*)query_data;
+  queries_tensor.dl_tensor.device.device_type = kDLCUDA;
+  queries_tensor.dl_tensor.ndim              = 2;
+  queries_tensor.dl_tensor.dtype.code        = kDLFloat;
+  queries_tensor.dl_tensor.dtype.bits        = 32;
+  queries_tensor.dl_tensor.dtype.lanes       = 1;
+  int64_t queries_shape[2]                   = {n_queries, n_dim};
+  queries_tensor.dl_tensor.shape             = queries_shape;
+  queries_tensor.dl_tensor.strides           = NULL;
+
+  DLManagedTensor neighbors_tensor;
+  neighbors_tensor.dl_tensor.data              = (void*)neighbors_data;
+  neighbors_tensor.dl_tensor.device.device_type = kDLCUDA;
+  neighbors_tensor.dl_tensor.ndim              = 2;
+  neighbors_tensor.dl_tensor.dtype.code        = kDLInt;
+  neighbors_tensor.dl_tensor.dtype.bits        = 64;
+  neighbors_tensor.dl_tensor.dtype.lanes       = 1;
+  int64_t neighbors_shape[2]                   = {n_queries, n_neighbors};
+  neighbors_tensor.dl_tensor.shape             = neighbors_shape;
+  neighbors_tensor.dl_tensor.strides           = NULL;
+
+  DLManagedTensor distances_tensor;
+  distances_tensor.dl_tensor.data              = (void*)distances_data;
+  distances_tensor.dl_tensor.device.device_type = kDLCUDA;
+  distances_tensor.dl_tensor.ndim              = 2;
+  distances_tensor.dl_tensor.dtype.code        = kDLFloat;
+  distances_tensor.dl_tensor.dtype.bits        = 32;
+  distances_tensor.dl_tensor.dtype.lanes       = 1;
+  int64_t distances_shape[2]                   = {n_queries, n_neighbors};
+  distances_tensor.dl_tensor.shape             = distances_shape;
+  distances_tensor.dl_tensor.strides           = NULL;
+
+  cuvsIvfSqSearchParams_t search_params;
+  cuvsIvfSqSearchParamsCreate(&search_params);
+  search_params->n_probes = n_probes;
+  cuvsIvfSqSearch(
+    res, search_params, index, &queries_tensor, &neighbors_tensor, &distances_tensor);
+
+  cuvsIvfSqSearchParamsDestroy(search_params);
+  cuvsIvfSqIndexParamsDestroy(build_params);
+  cuvsIvfSqIndexDestroy(index);
+  cuvsResourcesDestroy(res);
+}
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
index 39c653b048..a17992ff19 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
@@ -427,7 +427,7 @@ void search_impl(raft::resources const& handle,
     num_samples_view);
 
   ivf::detail::postprocess_distances(
-    distances, distances, index.metric(), n_queries, k, 1.0, false, stream);
+    handle, distances, distances, index.metric(), n_queries, k, 1.0, false);
 
   ivf::detail::postprocess_neighbors(neighbors,
                                      neighbors_uint32_ptr,

From 83b8c6351239cd03aaff97a761eca6a49e6f08f5 Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Thu, 12 Mar 2026 11:40:17 +0100
Subject: [PATCH 05/43] Update postprocess_neighbors signature

---
 cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
index 39c653b048..a17992ff19 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
@@ -427,7 +427,7 @@ void search_impl(raft::resources const& handle,
     num_samples_view);
 
   ivf::detail::postprocess_distances(
-    distances, distances, index.metric(), n_queries, k, 1.0, false, stream);
+    handle, distances, distances, index.metric(), n_queries, k, 1.0, false);
 
   ivf::detail::postprocess_neighbors(neighbors,
                                      neighbors_uint32_ptr,

From 1050debec9dc64c52f766a5e6e69ea9569789430 Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Thu, 12 Mar 2026 12:01:52 +0100
Subject: [PATCH 06/43] update testing

---
 cpp/tests/neighbors/ann_ivf_sq.cuh | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/cpp/tests/neighbors/ann_ivf_sq.cuh b/cpp/tests/neighbors/ann_ivf_sq.cuh
index a7e02315e4..25abc82740 100644
--- a/cpp/tests/neighbors/ann_ivf_sq.cuh
+++ b/cpp/tests/neighbors/ann_ivf_sq.cuh
@@ -410,9 +410,6 @@ const std::vector<AnnIvfSqInputs<int64_t>> inputs = {
   // very few db vectors (nlist reduced to fit)
   {100, 500, 16, 10, 40, 256, cuvs::distance::DistanceType::L2Expanded, false},
   {100, 500, 16, 10, 40, 256, cuvs::distance::DistanceType::CosineExpanded, false},
-  // small db with many empty clusters
-  {100, 100, 16, 5, 20, 64, cuvs::distance::DistanceType::L2Expanded, false},
-  {100, 100, 16, 5, 20, 64, cuvs::distance::DistanceType::CosineExpanded, false},
   // larger datasets
   {20, 100000, 16, 10, 20, 1024, cuvs::distance::DistanceType::L2Expanded, true},
   {20, 100000, 16, 10, 20, 1024, cuvs::distance::DistanceType::CosineExpanded, true},

From 928830a9a0677d4d0a3bc1283f5b0beb421cdbe0 Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Thu, 12 Mar 2026 16:24:26 +0100
Subject: [PATCH 07/43] Add C documentation

---
 docs/source/c_api/neighbors.rst          |  1 +
 docs/source/c_api/neighbors_ivf_sq_c.rst | 66 ++++++++++++++++++++++++
 2 files changed, 67 insertions(+)
 create mode 100644 docs/source/c_api/neighbors_ivf_sq_c.rst

diff --git a/docs/source/c_api/neighbors.rst b/docs/source/c_api/neighbors.rst
index 09bd47e2c8..b950aa8227 100644
--- a/docs/source/c_api/neighbors.rst
+++ b/docs/source/c_api/neighbors.rst
@@ -12,6 +12,7 @@ Nearest Neighbors
    neighbors_bruteforce_c.rst
    neighbors_ivf_flat_c.rst
    neighbors_ivf_pq_c.rst
+   neighbors_ivf_sq_c.rst
    neighbors_cagra_c.rst
    neighbors_hnsw_c.rst
    neighbors_mg.rst
diff --git a/docs/source/c_api/neighbors_ivf_sq_c.rst b/docs/source/c_api/neighbors_ivf_sq_c.rst
new file mode 100644
index 0000000000..be903fcf97
--- /dev/null
+++ b/docs/source/c_api/neighbors_ivf_sq_c.rst
@@ -0,0 +1,66 @@
+IVF-SQ
+======
+
+The IVF-SQ method is an ANN algorithm. It uses an inverted file index (IVF) with scalar quantization (SQ) to compress the vectors. This algorithm provides knobs to reduce the overall search space and memory footprint, and to trade-off accuracy for speed.
+
+.. role:: py(code)
+   :language: c
+   :class: highlight
+
+``#include <cuvs/neighbors/ivf_sq.h>``
+
+Index build parameters
+----------------------
+
+.. doxygengroup:: ivf_sq_c_index_params
+    :project: cuvs
+    :members:
+    :content-only:
+
+Index search parameters
+-----------------------
+
+.. doxygengroup:: ivf_sq_c_search_params
+    :project: cuvs
+    :members:
+    :content-only:
+
+Index
+-----
+
+.. doxygengroup:: ivf_sq_c_index
+    :project: cuvs
+    :members:
+    :content-only:
+
+Index build
+-----------
+
+.. doxygengroup:: ivf_sq_c_index_build
+    :project: cuvs
+    :members:
+    :content-only:
+
+Index search
+------------
+
+.. doxygengroup:: ivf_sq_c_index_search
+    :project: cuvs
+    :members:
+    :content-only:
+
+Index extend
+------------
+
+.. doxygengroup:: ivf_sq_c_index_extend
+    :project: cuvs
+    :members:
+    :content-only:
+
+Index serialize
+---------------
+
+.. doxygengroup:: ivf_sq_c_index_serialize
+    :project: cuvs
+    :members:
+    :content-only:

From 3a911d887a19471137085bcc19f6ab881af5f76f Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Fri, 13 Mar 2026 11:14:58 +0100
Subject: [PATCH 08/43] documentation

---
 cpp/include/cuvs/neighbors/ivf_sq.hpp    | 554 ++++++++++++++++++++++-
 docs/source/cpp_api/neighbors.rst        |   1 +
 docs/source/cpp_api/neighbors_ivf_sq.rst |  68 +++
 3 files changed, 614 insertions(+), 9 deletions(-)
 create mode 100644 docs/source/cpp_api/neighbors_ivf_sq.rst

diff --git a/cpp/include/cuvs/neighbors/ivf_sq.hpp b/cpp/include/cuvs/neighbors/ivf_sq.hpp
index 2f09751e95..042ecda10a 100644
--- a/cpp/include/cuvs/neighbors/ivf_sq.hpp
+++ b/cpp/include/cuvs/neighbors/ivf_sq.hpp
@@ -24,19 +24,62 @@ namespace cuvs::neighbors::ivf_sq {
 constexpr static uint32_t kIndexGroupSize = 32;
 
 struct index_params : cuvs::neighbors::index_params {
-  uint32_t n_lists                    = 1024;
-  uint32_t kmeans_n_iters             = 20;
-  double kmeans_trainset_fraction     = 0.5;
-  bool adaptive_centers               = false;
+  /** The number of inverted lists (clusters) */
+  uint32_t n_lists = 1024;
+  /** The number of iterations searching for kmeans centers (index building). */
+  uint32_t kmeans_n_iters = 20;
+  /** The fraction of data to use during iterative kmeans building. */
+  double kmeans_trainset_fraction = 0.5;
+  /**
+   * By default (adaptive_centers = false), the cluster centers are trained in `ivf_sq::build`,
+   * and never modified in `ivf_sq::extend`. As a result, you may need to retrain the index
+   * from scratch after invoking (`ivf_sq::extend`) a few times with new data, the distribution of
+   * which is no longer representative of the original training set.
+   *
+   * The alternative behavior (adaptive_centers = true) is to update the cluster centers for new
+   * data when it is added. In this case, `index.centers()` are always exactly the centroids of the
+   * data in the corresponding clusters. The drawback of this behavior is that the centroids depend
+   * on the order of adding new data (through the classification of the added data); that is,
+   * `index.centers()` "drift" together with the changing distribution of the newly added data.
+   */
+  bool adaptive_centers = false;
+  /**
+   * By default, the algorithm allocates more space than necessary for individual clusters
+   * (`list_data`). This allows to amortize the cost of memory allocation and reduce the number of
+   * data copies during repeated calls to `extend` (extending the database).
+   *
+   * The alternative is the conservative allocation behavior; when enabled, the algorithm always
+   * allocates the minimum amount of memory required to store the given number of records. Set this
+   * flag to `true` if you prefer to use as little GPU memory for the database as possible.
+   */
   bool conservative_memory_allocation = false;
-  bool add_data_on_build              = true;
+  /**
+   * Whether to add the dataset content to the index, i.e.:
+   *
+   *  - `true` means the index is filled with the dataset vectors and ready to search after calling
+   * `build`.
+   *  - `false` means `build` only trains the underlying model (e.g. quantizer or clustering), but
+   * the index is left empty; you'd need to call `extend` on the index afterwards to populate it.
+   */
+  bool add_data_on_build = true;
 };
 
+static_assert(std::is_aggregate_v<index_params>);
+
+/**
+ * @}
+ */
+
+/**
+ * @defgroup ivf_sq_cpp_search_params IVF-SQ index search parameters
+ * @{
+ */
+
 struct search_params : cuvs::neighbors::search_params {
+  /** The number of clusters to search. */
   uint32_t n_probes = 20;
 };
 
-static_assert(std::is_aggregate_v<index_params>);
 static_assert(std::is_aggregate_v<search_params>);
 
 /**
@@ -97,11 +140,23 @@ using list_data = ivf::list<list_spec, SizeT, IdxT, ExtT>;
 /**
  * @brief IVF-SQ index.
  *
+ * In the IVF-SQ index, a database vector is first assigned to the nearest cluster center
+ * using an inverted file (IVF) structure, and then compressed using scalar quantization (SQ).
+ *
+ * Scalar quantization independently maps each dimension of the vector to a fixed-width integer
+ * code. For 8-bit quantization (uint8_t), each floating-point component is linearly mapped to
+ * an integer in [0, 255] using learned per-dimension minimum (`sq_vmin`) and range (`sq_delta`)
+ * values:
+ *
+ *   code_i = round((x_i - vmin_i) / delta_i * 255)
+ *
+ * This provides a compact representation (1 byte per dimension) while preserving the relative
+ * distances between vectors with high fidelity, offering a good trade-off between index size,
+ * search speed, and recall compared to flat (uncompressed) and product-quantized (PQ)
+ * representations.
+ *
  * @tparam IdxT  SQ code type. Only uint8_t (8-bit, codes in [0,255]) for now.
  *
- * No member depends on the raw data type T (float, half). T appears only
- * in the free-function signatures (build, search, extend) where input data
- * is consumed, following the IVF-PQ pattern.
  */
 template <typename IdxT>
 struct index : cuvs::neighbors::index {
@@ -192,41 +247,219 @@ struct index : cuvs::neighbors::index {
  * @{
  */
 
+/**
+ * @brief Build the index from the dataset for efficient search.
+ *
+ * NB: Currently, the following distance metrics are supported:
+ * - L2Expanded
+ * - L2Unexpanded
+ * - InnerProduct
+ * - CosineExpanded
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace cuvs::neighbors;
+ *   // use default index parameters
+ *   ivf_sq::index_params index_params;
+ *   // create and fill the index from a [N, D] dataset
+ *   auto index = ivf_sq::build(handle, index_params, dataset);
+ * @endcode
+ *
+ * @param[in] handle
+ * @param[in] index_params configure the index building
+ * @param[in] dataset a device pointer to a row-major matrix [n_rows, dim]
+ *
+ * @return the constructed ivf-sq index
+ */
 auto build(raft::resources const& handle,
            const cuvs::neighbors::ivf_sq::index_params& index_params,
            raft::device_matrix_view<const float, int64_t, raft::row_major> dataset)
   -> cuvs::neighbors::ivf_sq::index<uint8_t>;
 
+/**
+ * @brief Build the index from the dataset for efficient search.
+ *
+ * NB: Currently, the following distance metrics are supported:
+ * - L2Expanded
+ * - L2Unexpanded
+ * - InnerProduct
+ * - CosineExpanded
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace cuvs::neighbors;
+ *   // use default index parameters
+ *   ivf_sq::index_params index_params;
+ *   // create and fill the index from a [N, D] dataset
+ *   ivf_sq::index<uint8_t> index;
+ *   ivf_sq::build(handle, index_params, dataset, index);
+ * @endcode
+ *
+ * @param[in] handle
+ * @param[in] index_params configure the index building
+ * @param[in] dataset raft::device_matrix_view to a row-major matrix [n_rows, dim]
+ * @param[out] idx reference to ivf_sq::index
+ *
+ */
 void build(raft::resources const& handle,
            const cuvs::neighbors::ivf_sq::index_params& index_params,
            raft::device_matrix_view<const float, int64_t, raft::row_major> dataset,
            cuvs::neighbors::ivf_sq::index<uint8_t>& idx);
 
+/**
+ * @brief Build the index from the dataset for efficient search.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace cuvs::neighbors;
+ *   // use default index parameters
+ *   ivf_sq::index_params index_params;
+ *   // create and fill the index from a [N, D] dataset
+ *   auto index = ivf_sq::build(handle, index_params, dataset);
+ * @endcode
+ *
+ * @param[in] handle
+ * @param[in] index_params configure the index building
+ * @param[in] dataset a device pointer to a row-major matrix [n_rows, dim]
+ *
+ * @return the constructed ivf-sq index
+ */
 auto build(raft::resources const& handle,
            const cuvs::neighbors::ivf_sq::index_params& index_params,
            raft::device_matrix_view<const half, int64_t, raft::row_major> dataset)
   -> cuvs::neighbors::ivf_sq::index<uint8_t>;
 
+/**
+ * @brief Build the index from the dataset for efficient search.
+ *
+ * NB: Currently, the following distance metrics are supported:
+ * - L2Expanded
+ * - L2Unexpanded
+ * - InnerProduct
+ * - CosineExpanded
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace cuvs::neighbors;
+ *   // use default index parameters
+ *   ivf_sq::index_params index_params;
+ *   // create and fill the index from a [N, D] dataset
+ *   ivf_sq::index<uint8_t> index;
+ *   ivf_sq::build(handle, index_params, dataset, index);
+ * @endcode
+ *
+ * @param[in] handle
+ * @param[in] index_params configure the index building
+ * @param[in] dataset raft::device_matrix_view to a row-major matrix [n_rows, dim]
+ * @param[out] idx reference to ivf_sq::index
+ *
+ */
 void build(raft::resources const& handle,
            const cuvs::neighbors::ivf_sq::index_params& index_params,
            raft::device_matrix_view<const half, int64_t, raft::row_major> dataset,
            cuvs::neighbors::ivf_sq::index<uint8_t>& idx);
 
+/**
+ * @brief Build the index from the dataset for efficient search.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace cuvs::neighbors;
+ *   // use default index parameters
+ *   ivf_sq::index_params index_params;
+ *   // create and fill the index from a [N, D] dataset
+ *   auto index = ivf_sq::build(handle, index_params, dataset);
+ * @endcode
+ *
+ * @param[in] handle
+ * @param[in] index_params configure the index building
+ * @param[in] dataset a host pointer to a row-major matrix [n_rows, dim]
+ *
+ * @return the constructed ivf-sq index
+ */
 auto build(raft::resources const& handle,
            const cuvs::neighbors::ivf_sq::index_params& index_params,
            raft::host_matrix_view<const float, int64_t, raft::row_major> dataset)
   -> cuvs::neighbors::ivf_sq::index<uint8_t>;
 
+/**
+ * @brief Build the index from the dataset for efficient search.
+ *
+ * NB: Currently, the following distance metrics are supported:
+ * - L2Expanded
+ * - L2Unexpanded
+ * - InnerProduct
+ * - CosineExpanded
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace cuvs::neighbors;
+ *   // use default index parameters
+ *   ivf_sq::index_params index_params;
+ *   // create and fill the index from a [N, D] dataset
+ *   ivf_sq::index<uint8_t> index;
+ *   ivf_sq::build(handle, index_params, dataset, index);
+ * @endcode
+ *
+ * @param[in] handle
+ * @param[in] index_params configure the index building
+ * @param[in] dataset raft::host_matrix_view to a row-major matrix [n_rows, dim]
+ * @param[out] idx reference to ivf_sq::index
+ *
+ */
 void build(raft::resources const& handle,
            const cuvs::neighbors::ivf_sq::index_params& index_params,
            raft::host_matrix_view<const float, int64_t, raft::row_major> dataset,
            cuvs::neighbors::ivf_sq::index<uint8_t>& idx);
 
+/**
+ * @brief Build the index from the dataset for efficient search.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace cuvs::neighbors;
+ *   // use default index parameters
+ *   ivf_sq::index_params index_params;
+ *   // create and fill the index from a [N, D] dataset
+ *   auto index = ivf_sq::build(handle, index_params, dataset);
+ * @endcode
+ *
+ * @param[in] handle
+ * @param[in] index_params configure the index building
+ * @param[in] dataset a host pointer to a row-major matrix [n_rows, dim]
+ *
+ * @return the constructed ivf-sq index
+ */
 auto build(raft::resources const& handle,
            const cuvs::neighbors::ivf_sq::index_params& index_params,
            raft::host_matrix_view<const half, int64_t, raft::row_major> dataset)
   -> cuvs::neighbors::ivf_sq::index<uint8_t>;
 
+/**
+ * @brief Build the index from the dataset for efficient search.
+ *
+ * NB: Currently, the following distance metrics are supported:
+ * - L2Expanded
+ * - L2Unexpanded
+ * - InnerProduct
+ * - CosineExpanded
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace cuvs::neighbors;
+ *   // use default index parameters
+ *   ivf_sq::index_params index_params;
+ *   // create and fill the index from a [N, D] dataset
+ *   ivf_sq::index<uint8_t> index;
+ *   ivf_sq::build(handle, index_params, dataset, index);
+ * @endcode
+ *
+ * @param[in] handle
+ * @param[in] index_params configure the index building
+ * @param[in] dataset raft::host_matrix_view to a row-major matrix [n_rows, dim]
+ * @param[out] idx reference to ivf_sq::index
+ *
+ */
 void build(raft::resources const& handle,
            const cuvs::neighbors::ivf_sq::index_params& index_params,
            raft::host_matrix_view<const half, int64_t, raft::row_major> dataset,
@@ -241,45 +474,237 @@ void build(raft::resources const& handle,
  * @{
  */
 
+/**
+ * @brief Extend the index with the new data.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace cuvs::neighbors;
+ *   ivf_sq::index_params index_params;
+ *   index_params.add_data_on_build = false;      // don't populate index on build
+ *   index_params.kmeans_trainset_fraction = 1.0;  // use whole dataset for kmeans training
+ *   // train the index from a [N, D] dataset
+ *   auto index_empty = ivf_sq::build(handle, index_params, dataset);
+ *   // fill the index with the data
+ *   std::optional<raft::device_vector_view<const int64_t, int64_t>> no_op = std::nullopt;
+ *   auto index = ivf_sq::extend(handle, new_vectors, no_op, index_empty);
+ * @endcode
+ *
+ * @param[in] handle
+ * @param[in] new_vectors a device matrix view to a row-major matrix [n_rows, idx.dim()]
+ * @param[in] new_indices a device vector view to a vector of indices [n_rows].
+ *    If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt`
+ *    here to imply a continuous range `[0...n_rows)`.
+ * @param[in] orig_index the original index
+ *
+ * @return the constructed extended ivf-sq index
+ */
 auto extend(raft::resources const& handle,
             raft::device_matrix_view<const float, int64_t, raft::row_major> new_vectors,
             std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,
             const cuvs::neighbors::ivf_sq::index<uint8_t>& orig_index)
   -> cuvs::neighbors::ivf_sq::index<uint8_t>;
 
+/**
+ * @brief Extend the index with the new data.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace cuvs::neighbors;
+ *   ivf_sq::index_params index_params;
+ *   index_params.add_data_on_build = false;      // don't populate index on build
+ *   index_params.kmeans_trainset_fraction = 1.0;  // use whole dataset for kmeans training
+ *   // train the index from a [N, D] dataset
+ *   auto index_empty = ivf_sq::build(handle, index_params, dataset);
+ *   // fill the index with the data
+ *   std::optional<raft::device_vector_view<const int64_t, int64_t>> no_op = std::nullopt;
+ *   ivf_sq::extend(handle, new_vectors, no_op, &index_empty);
+ * @endcode
+ *
+ * @param[in] handle
+ * @param[in] new_vectors a device matrix view to a row-major matrix [n_rows, idx.dim()]
+ * @param[in] new_indices a device vector view to a vector of indices [n_rows].
+ *    If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt`
+ *    here to imply a continuous range `[0...n_rows)`.
+ * @param[inout] idx pointer to ivf_sq::index
+ */
 void extend(raft::resources const& handle,
             raft::device_matrix_view<const float, int64_t, raft::row_major> new_vectors,
             std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,
             cuvs::neighbors::ivf_sq::index<uint8_t>* idx);
 
+/**
+ * @brief Extend the index with the new data.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace cuvs::neighbors;
+ *   ivf_sq::index_params index_params;
+ *   index_params.add_data_on_build = false;      // don't populate index on build
+ *   index_params.kmeans_trainset_fraction = 1.0;  // use whole dataset for kmeans training
+ *   // train the index from a [N, D] dataset
+ *   auto index_empty = ivf_sq::build(handle, index_params, dataset);
+ *   // fill the index with the data
+ *   std::optional<raft::device_vector_view<const int64_t, int64_t>> no_op = std::nullopt;
+ *   auto index = ivf_sq::extend(handle, new_vectors, no_op, index_empty);
+ * @endcode
+ *
+ * @param[in] handle
+ * @param[in] new_vectors a device matrix view to a row-major matrix [n_rows, idx.dim()]
+ * @param[in] new_indices a device vector view to a vector of indices [n_rows].
+ *    If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt`
+ *    here to imply a continuous range `[0...n_rows)`.
+ * @param[in] orig_index the original index
+ *
+ * @return the constructed extended ivf-sq index
+ */
 auto extend(raft::resources const& handle,
             raft::device_matrix_view<const half, int64_t, raft::row_major> new_vectors,
             std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,
             const cuvs::neighbors::ivf_sq::index<uint8_t>& orig_index)
   -> cuvs::neighbors::ivf_sq::index<uint8_t>;
 
+/**
+ * @brief Extend the index with the new data.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace cuvs::neighbors;
+ *   ivf_sq::index_params index_params;
+ *   index_params.add_data_on_build = false;      // don't populate index on build
+ *   index_params.kmeans_trainset_fraction = 1.0;  // use whole dataset for kmeans training
+ *   // train the index from a [N, D] dataset
+ *   auto index_empty = ivf_sq::build(handle, index_params, dataset);
+ *   // fill the index with the data
+ *   std::optional<raft::device_vector_view<const int64_t, int64_t>> no_op = std::nullopt;
+ *   ivf_sq::extend(handle, new_vectors, no_op, &index_empty);
+ * @endcode
+ *
+ * @param[in] handle
+ * @param[in] new_vectors a device matrix view to a row-major matrix [n_rows, idx.dim()]
+ * @param[in] new_indices a device vector view to a vector of indices [n_rows].
+ *    If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt`
+ *    here to imply a continuous range `[0...n_rows)`.
+ * @param[inout] idx pointer to ivf_sq::index
+ */
 void extend(raft::resources const& handle,
             raft::device_matrix_view<const half, int64_t, raft::row_major> new_vectors,
             std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,
             cuvs::neighbors::ivf_sq::index<uint8_t>* idx);
 
+/**
+ * @brief Extend the index with the new data.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace cuvs::neighbors;
+ *   ivf_sq::index_params index_params;
+ *   index_params.add_data_on_build = false;      // don't populate index on build
+ *   index_params.kmeans_trainset_fraction = 1.0;  // use whole dataset for kmeans training
+ *   // train the index from a [N, D] dataset
+ *   auto index_empty = ivf_sq::build(handle, index_params, dataset);
+ *   // fill the index with the data
+ *   std::optional<raft::host_vector_view<const int64_t, int64_t>> no_op = std::nullopt;
+ *   auto index = ivf_sq::extend(handle, new_vectors, no_op, index_empty);
+ * @endcode
+ *
+ * @param[in] handle
+ * @param[in] new_vectors a host matrix view to a row-major matrix [n_rows, idx.dim()]
+ * @param[in] new_indices a host vector view to a vector of indices [n_rows].
+ *    If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt`
+ *    here to imply a continuous range `[0...n_rows)`.
+ * @param[in] orig_index the original index
+ *
+ * @return the constructed extended ivf-sq index
+ */
 auto extend(raft::resources const& handle,
             raft::host_matrix_view<const float, int64_t, raft::row_major> new_vectors,
             std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,
             const cuvs::neighbors::ivf_sq::index<uint8_t>& orig_index)
   -> cuvs::neighbors::ivf_sq::index<uint8_t>;
 
+/**
+ * @brief Extend the index with the new data.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace cuvs::neighbors;
+ *   ivf_sq::index_params index_params;
+ *   index_params.add_data_on_build = false;      // don't populate index on build
+ *   index_params.kmeans_trainset_fraction = 1.0;  // use whole dataset for kmeans training
+ *   // train the index from a [N, D] dataset
+ *   auto index_empty = ivf_sq::build(handle, index_params, dataset);
+ *   // fill the index with the data
+ *   std::optional<raft::host_vector_view<const int64_t, int64_t>> no_op = std::nullopt;
+ *   ivf_sq::extend(handle, new_vectors, no_op, &index_empty);
+ * @endcode
+ *
+ * @param[in] handle
+ * @param[in] new_vectors a host matrix view to a row-major matrix [n_rows, idx.dim()]
+ * @param[in] new_indices a host vector view to a vector of indices [n_rows].
+ *    If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt`
+ *    here to imply a continuous range `[0...n_rows)`.
+ * @param[inout] idx pointer to ivf_sq::index
+ */
 void extend(raft::resources const& handle,
             raft::host_matrix_view<const float, int64_t, raft::row_major> new_vectors,
             std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,
             cuvs::neighbors::ivf_sq::index<uint8_t>* idx);
 
+/**
+ * @brief Extend the index with the new data.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace cuvs::neighbors;
+ *   ivf_sq::index_params index_params;
+ *   index_params.add_data_on_build = false;      // don't populate index on build
+ *   index_params.kmeans_trainset_fraction = 1.0;  // use whole dataset for kmeans training
+ *   // train the index from a [N, D] dataset
+ *   auto index_empty = ivf_sq::build(handle, index_params, dataset);
+ *   // fill the index with the data
+ *   std::optional<raft::host_vector_view<const int64_t, int64_t>> no_op = std::nullopt;
+ *   auto index = ivf_sq::extend(handle, new_vectors, no_op, index_empty);
+ * @endcode
+ *
+ * @param[in] handle
+ * @param[in] new_vectors a host matrix view to a row-major matrix [n_rows, idx.dim()]
+ * @param[in] new_indices a host vector view to a vector of indices [n_rows].
+ *    If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt`
+ *    here to imply a continuous range `[0...n_rows)`.
+ * @param[in] orig_index the original index
+ *
+ * @return the constructed extended ivf-sq index
+ */
 auto extend(raft::resources const& handle,
             raft::host_matrix_view<const half, int64_t, raft::row_major> new_vectors,
             std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,
             const cuvs::neighbors::ivf_sq::index<uint8_t>& orig_index)
   -> cuvs::neighbors::ivf_sq::index<uint8_t>;
 
+/**
+ * @brief Extend the index with the new data.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace cuvs::neighbors;
+ *   ivf_sq::index_params index_params;
+ *   index_params.add_data_on_build = false;      // don't populate index on build
+ *   index_params.kmeans_trainset_fraction = 1.0;  // use whole dataset for kmeans training
+ *   // train the index from a [N, D] dataset
+ *   auto index_empty = ivf_sq::build(handle, index_params, dataset);
+ *   // fill the index with the data
+ *   std::optional<raft::host_vector_view<const int64_t, int64_t>> no_op = std::nullopt;
+ *   ivf_sq::extend(handle, new_vectors, no_op, &index_empty);
+ * @endcode
+ *
+ * @param[in] handle
+ * @param[in] new_vectors a host matrix view to a row-major matrix [n_rows, idx.dim()]
+ * @param[in] new_indices a host vector view to a vector of indices [n_rows].
+ *    If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt`
+ *    here to imply a continuous range `[0...n_rows)`.
+ * @param[inout] idx pointer to ivf_sq::index
+ */
 void extend(raft::resources const& handle,
             raft::host_matrix_view<const half, int64_t, raft::row_major> new_vectors,
             std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,
@@ -294,6 +719,41 @@ void extend(raft::resources const& handle,
  * @{
  */
 
+/**
+ * @brief Search ANN using the constructed index.
+ *
+ * See the [ivf_sq::build](#ivf_sq::build) documentation for a usage example.
+ *
+ * Note, this function requires a temporary buffer to store intermediate results between cuda kernel
+ * calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can
+ * pass a pool memory resource or a large enough pre-allocated memory resource to reduce or
+ * eliminate entirely allocations happening within `search`.
+ * The exact size of the temporary buffer depends on multiple factors and is an implementation
+ * detail. However, you can safely specify a small initial size for the memory pool, so that only a
+ * few allocations happen to grow it during the first invocations of the `search`.
+ *
+ * @code{.cpp}
+ *   using namespace cuvs::neighbors;
+ *   // use default search parameters
+ *   ivf_sq::search_params search_params;
+ *   // Use the same allocator across multiple searches to reduce the number of
+ *   // cuda memory allocations
+ *   ivf_sq::search(handle, search_params, index, queries1, out_inds1, out_dists1);
+ *   ivf_sq::search(handle, search_params, index, queries2, out_inds2, out_dists2);
+ *   ivf_sq::search(handle, search_params, index, queries3, out_inds3, out_dists3);
+ * @endcode
+ *
+ * @param[in] handle
+ * @param[in] params configure the search
+ * @param[in] index ivf-sq constructed index
+ * @param[in] queries raft::device_matrix_view to a row-major matrix [n_queries, index->dim()]
+ * @param[out] neighbors raft::device_matrix_view to the indices of the neighbors in the source
+ * dataset [n_queries, k]
+ * @param[out] distances raft::device_matrix_view to the distances to the selected neighbors
+ * [n_queries, k]
+ * @param[in] sample_filter an optional device filter function object that greenlights samples
+ * for a given query. (none_sample_filter for no filtering)
+ */
 void search(raft::resources const& handle,
             const cuvs::neighbors::ivf_sq::search_params& params,
             const cuvs::neighbors::ivf_sq::index<uint8_t>& index,
@@ -303,6 +763,38 @@ void search(raft::resources const& handle,
             const cuvs::neighbors::filtering::base_filter& sample_filter =
               cuvs::neighbors::filtering::none_sample_filter{});
 
+/**
+ * @brief Search ANN using the constructed index with half-precision queries.
+ *
+ * See the [ivf_sq::build](#ivf_sq::build) documentation for a usage example.
+ *
+ * Note, this function requires a temporary buffer to store intermediate results between cuda kernel
+ * calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can
+ * pass a pool memory resource or a large enough pre-allocated memory resource to reduce or
+ * eliminate entirely allocations happening within `search`.
+ *
+ * @code{.cpp}
+ *   using namespace cuvs::neighbors;
+ *   // use default search parameters
+ *   ivf_sq::search_params search_params;
+ *   // Use the same allocator across multiple searches to reduce the number of
+ *   // cuda memory allocations
+ *   ivf_sq::search(handle, search_params, index, queries1, out_inds1, out_dists1);
+ *   ivf_sq::search(handle, search_params, index, queries2, out_inds2, out_dists2);
+ *   ivf_sq::search(handle, search_params, index, queries3, out_inds3, out_dists3);
+ * @endcode
+ *
+ * @param[in] handle
+ * @param[in] params configure the search
+ * @param[in] index ivf-sq constructed index
+ * @param[in] queries raft::device_matrix_view to a row-major matrix [n_queries, index->dim()]
+ * @param[out] neighbors raft::device_matrix_view to the indices of the neighbors in the source
+ * dataset [n_queries, k]
+ * @param[out] distances raft::device_matrix_view to the distances to the selected neighbors
+ * [n_queries, k]
+ * @param[in] sample_filter an optional device filter function object that greenlights samples
+ * for a given query. (none_sample_filter for no filtering)
+ */
 void search(raft::resources const& handle,
             const cuvs::neighbors::ivf_sq::search_params& params,
             const cuvs::neighbors::ivf_sq::index<uint8_t>& index,
@@ -321,10 +813,54 @@ void search(raft::resources const& handle,
  * @{
  */
 
+/**
+ * Save the index to file.
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @code{.cpp}
+ * #include <raft/core/resources.hpp>
+ * #include <cuvs/neighbors/ivf_sq.hpp>
+ *
+ * raft::resources handle;
+ *
+ * // create a string with a filepath
+ * std::string filename("/path/to/index");
+ * // create an index with `auto index = ivf_sq::build(...);`
+ * cuvs::neighbors::ivf_sq::serialize(handle, filename, index);
+ * @endcode
+ *
+ * @param[in] handle the raft handle
+ * @param[in] filename the file name for saving the index
+ * @param[in] index IVF-SQ index
+ *
+ */
 void serialize(raft::resources const& handle,
                const std::string& filename,
                const cuvs::neighbors::ivf_sq::index<uint8_t>& index);
 
+/**
+ * Load index from file.
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @code{.cpp}
+ * #include <raft/core/resources.hpp>
+ * #include <cuvs/neighbors/ivf_sq.hpp>
+ *
+ * raft::resources handle;
+ *
+ * // create a string with a filepath
+ * std::string filename("/path/to/index");
+ * // create an empty index with `ivf_sq::index<uint8_t> index(handle);`
+ * cuvs::neighbors::ivf_sq::deserialize(handle, filename, &index);
+ * @endcode
+ *
+ * @param[in] handle the raft handle
+ * @param[in] filename the name of the file that stores the index
+ * @param[out] index IVF-SQ index
+ *
+ */
 void deserialize(raft::resources const& handle,
                  const std::string& filename,
                  cuvs::neighbors::ivf_sq::index<uint8_t>* index);
diff --git a/docs/source/cpp_api/neighbors.rst b/docs/source/cpp_api/neighbors.rst
index 0c6e9cfd86..1c266bb902 100644
--- a/docs/source/cpp_api/neighbors.rst
+++ b/docs/source/cpp_api/neighbors.rst
@@ -18,6 +18,7 @@ Nearest Neighbors
    neighbors_hnsw.rst
    neighbors_ivf_flat.rst
    neighbors_ivf_pq.rst
+   neighbors_ivf_sq.rst
    neighbors_nn_descent.rst
    neighbors_refine.rst
    neighbors_mg.rst
diff --git a/docs/source/cpp_api/neighbors_ivf_sq.rst b/docs/source/cpp_api/neighbors_ivf_sq.rst
new file mode 100644
index 0000000000..d0554f926a
--- /dev/null
+++ b/docs/source/cpp_api/neighbors_ivf_sq.rst
@@ -0,0 +1,68 @@
+IVF-SQ
+======
+
+The IVF-SQ method is an ANN algorithm. Like IVF-Flat, IVF-SQ splits the points into a number of clusters (also specified by a parameter called n_lists) and searches the closest clusters to compute the nearest neighbors (also specified by a parameter called n_probes), but it shrinks the sizes of the vectors using scalar quantization, independently mapping each dimension to a fixed-width integer code.
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+``#include <cuvs/neighbors/ivf_sq.hpp>``
+
+namespace *cuvs::neighbors::ivf_sq*
+
+Index build parameters
+----------------------
+
+.. doxygengroup:: ivf_sq_cpp_index_params
+    :project: cuvs
+    :members:
+    :content-only:
+
+Index search parameters
+-----------------------
+
+.. doxygengroup:: ivf_sq_cpp_search_params
+    :project: cuvs
+    :members:
+    :content-only:
+
+Index
+-----
+
+.. doxygengroup:: ivf_sq_cpp_index
+    :project: cuvs
+    :members:
+    :content-only:
+
+Index build
+-----------
+
+.. doxygengroup:: ivf_sq_cpp_index_build
+    :project: cuvs
+    :members:
+    :content-only:
+
+Index extend
+------------
+
+.. doxygengroup:: ivf_sq_cpp_index_extend
+    :project: cuvs
+    :members:
+    :content-only:
+
+Index search
+------------
+
+.. doxygengroup:: ivf_sq_cpp_index_search
+    :project: cuvs
+    :members:
+    :content-only:
+
+Index serialize
+---------------
+
+.. doxygengroup:: ivf_sq_cpp_index_serialize
+    :project: cuvs
+    :members:
+    :content-only:

From b1246284aef228e2e5a255e862149523716e1238 Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Fri, 13 Mar 2026 14:03:36 +0100
Subject: [PATCH 09/43] memset in index constructor

---
 cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh |  4 ----
 cpp/src/neighbors/ivf_sq_index.cpp        | 14 +++++++++++++-
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
index 6c46a20e65..13838556ae 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
@@ -456,10 +456,6 @@ inline auto build(raft::resources const& handle,
                "Cosine metric requires more than one dim");
 
   index<IdxT> idx(handle, params, dim);
-  utils::memzero(idx.accum_sorted_sizes().data_handle(), idx.accum_sorted_sizes().size(), stream);
-  utils::memzero(idx.list_sizes().data_handle(), idx.list_sizes().size(), stream);
-  utils::memzero(idx.data_ptrs().data_handle(), idx.data_ptrs().size(), stream);
-  utils::memzero(idx.inds_ptrs().data_handle(), idx.inds_ptrs().size(), stream);
 
   // Train k-means centroids and SQ parameters on the same training subset.
   // This mirrors IVF-PQ, which also trains its codebook on a subset of the data.
diff --git a/cpp/src/neighbors/ivf_sq_index.cpp b/cpp/src/neighbors/ivf_sq_index.cpp
index d97ace7dcb..ffa54bd2e9 100644
--- a/cpp/src/neighbors/ivf_sq_index.cpp
+++ b/cpp/src/neighbors/ivf_sq_index.cpp
@@ -5,6 +5,11 @@
 
 #include <cuvs/neighbors/ivf_sq.hpp>
 
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/util/cuda_rt_essentials.hpp>
+
+#include <cstring>
+
 namespace cuvs::neighbors::ivf_sq {
 
 template <typename IdxT>
@@ -46,7 +51,14 @@ index<IdxT>::index(raft::resources const& res,
     accum_sorted_sizes_{raft::make_host_vector<int64_t, uint32_t>(n_lists + 1)}
 {
   check_consistency();
-  accum_sorted_sizes_(n_lists) = 0;
+  auto stream = raft::resource::get_cuda_stream(res);
+  std::memset(accum_sorted_sizes_.data_handle(), 0, accum_sorted_sizes_.size() * sizeof(int64_t));
+  RAFT_CUDA_TRY(
+    cudaMemsetAsync(list_sizes_.data_handle(), 0, list_sizes_.size() * sizeof(uint32_t), stream));
+  RAFT_CUDA_TRY(
+    cudaMemsetAsync(data_ptrs_.data_handle(), 0, data_ptrs_.size() * sizeof(IdxT*), stream));
+  RAFT_CUDA_TRY(
+    cudaMemsetAsync(inds_ptrs_.data_handle(), 0, inds_ptrs_.size() * sizeof(int64_t*), stream));
 }
 
 template <typename IdxT>

From 641c6ca34babd5d2914d6c1b6c7bbe47aec92980 Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Fri, 13 Mar 2026 14:35:56 +0100
Subject: [PATCH 10/43] random sampling

---
 cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh | 26 +++++++++++------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
index 13838556ae..e7c0f734b4 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
@@ -26,6 +26,8 @@
 #include <raft/linalg/add.cuh>
 #include <raft/linalg/map.cuh>
 #include <raft/linalg/norm.cuh>
+#include <raft/matrix/sample_rows.cuh>
+#include <raft/random/rng.cuh>
 #include <raft/stats/histogram.cuh>
 #include <raft/util/pow2_utils.cuh>
 
@@ -460,22 +462,18 @@ inline auto build(raft::resources const& handle,
   // Train k-means centroids and SQ parameters on the same training subset.
   // This mirrors IVF-PQ, which also trains its codebook on a subset of the data.
   {
+    raft::random::RngState random_state{137};
     auto trainset_ratio = std::max<size_t>(
       1, n_rows / std::max<size_t>(params.kmeans_trainset_fraction * n_rows, idx.n_lists()));
     auto n_rows_train = n_rows / trainset_ratio;
-    rmm::device_uvector<T> trainset(
-      n_rows_train * idx.dim(), stream, raft::resource::get_large_workspace_resource(handle));
-    RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset.data(),
-                                    sizeof(T) * idx.dim(),
-                                    dataset,
-                                    sizeof(T) * idx.dim() * trainset_ratio,
-                                    sizeof(T) * idx.dim(),
-                                    n_rows_train,
-                                    cudaMemcpyDefault,
-                                    stream));
-    auto trainset_const_view =
-      raft::make_device_matrix_view<const T, int64_t>(trainset.data(), n_rows_train, idx.dim());
-    auto centers_view = raft::make_device_matrix_view<float, int64_t>(
+    auto trainset =
+      raft::make_device_mdarray<T>(handle,
+                                   raft::resource::get_large_workspace_resource(handle),
+                                   raft::make_extents<int64_t>(n_rows_train, idx.dim()));
+    auto dataset_view = raft::make_device_matrix_view<const T, int64_t>(dataset, n_rows, idx.dim());
+    raft::matrix::sample_rows<T, int64_t>(handle, random_state, dataset_view, trainset.view());
+    auto trainset_const_view = raft::make_const_mdspan(trainset.view());
+    auto centers_view        = raft::make_device_matrix_view<float, int64_t>(
       idx.centers().data_handle(), idx.n_lists(), idx.dim());
     cuvs::cluster::kmeans::balanced_params kmeans_params;
     kmeans_params.n_iters = params.kmeans_n_iters;
@@ -502,7 +500,7 @@ inline auto build(raft::resources const& handle,
       dim3 threads(32, 8);
       dim3 blocks(raft::ceildiv<int64_t>(n_rows_train, threads.x),
                   raft::ceildiv<uint32_t>(dim, threads.y));
-      compute_residuals_kernel<T><<<blocks, threads, 0, stream>>>(trainset.data(),
+      compute_residuals_kernel<T><<<blocks, threads, 0, stream>>>(trainset.data_handle(),
                                                                   idx.centers().data_handle(),
                                                                   train_labels.data_handle(),
                                                                   residuals.data(),

From 70ca00a331cd96138b0eab3a57bb135858882341 Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Fri, 13 Mar 2026 15:16:41 +0100
Subject: [PATCH 11/43] inplace residuals

---
 cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh | 53 ++++++++++++++---------
 1 file changed, 33 insertions(+), 20 deletions(-)

diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
index e7c0f734b4..e11f5dd6b5 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
@@ -64,8 +64,8 @@ struct ColMinMaxOp {
  * Row-loop is manually 4x-unrolled so the compiler can overlap four
  * independent __ldg requests in the memory pipeline.
  */
-template <int BlockSize>
-__launch_bounds__(BlockSize) RAFT_KERNEL fused_column_minmax_kernel(const float* __restrict__ data,
+template <int BlockSize, typename T>
+__launch_bounds__(BlockSize) RAFT_KERNEL fused_column_minmax_kernel(const T* __restrict__ data,
                                                                     float* __restrict__ col_min,
                                                                     float* __restrict__ col_max,
                                                                     int64_t n_rows,
@@ -83,15 +83,15 @@ __launch_bounds__(BlockSize) RAFT_KERNEL fused_column_minmax_kernel(const float*
   int64_t row          = static_cast<int64_t>(threadIdx.x);
 
   for (; row + 3 * stride < n_rows; row += 4 * stride) {
-    float v0    = __ldg(&data[row * dim + col]);
-    float v1    = __ldg(&data[(row + stride) * dim + col]);
-    float v2    = __ldg(&data[(row + 2 * stride) * dim + col]);
-    float v3    = __ldg(&data[(row + 3 * stride) * dim + col]);
+    float v0    = float(data[row * dim + col]);
+    float v1    = float(data[(row + stride) * dim + col]);
+    float v2    = float(data[(row + 2 * stride) * dim + col]);
+    float v3    = float(data[(row + 3 * stride) * dim + col]);
     agg.min_val = fminf(agg.min_val, fminf(fminf(v0, v1), fminf(v2, v3)));
     agg.max_val = fmaxf(agg.max_val, fmaxf(fmaxf(v0, v1), fmaxf(v2, v3)));
   }
   for (; row < n_rows; row += stride) {
-    float val   = __ldg(&data[row * dim + col]);
+    float val   = float(data[row * dim + col]);
     agg.min_val = fminf(agg.min_val, val);
     agg.max_val = fmaxf(agg.max_val, val);
   }
@@ -229,6 +229,20 @@ RAFT_KERNEL compute_residuals_kernel(const T* dataset,
   residuals[i * dim + j] = val - centers[c * dim + j];
 }
 
+/** In-place variant: dataset[i] = cast<T>(cast<float>(dataset[i]) - centers[labels[i]]) */
+template <typename T>
+RAFT_KERNEL compute_residuals_inplace_kernel(
+  T* dataset, const float* centers, const uint32_t* labels, int64_t n_rows, uint32_t dim)
+{
+  int64_t i  = int64_t(blockIdx.x) * blockDim.x + threadIdx.x;
+  uint32_t j = blockIdx.y * blockDim.y + threadIdx.y;
+  if (i >= n_rows || j >= dim) return;
+
+  float val            = utils::mapping<float>{}(dataset[i * dim + j]);
+  uint32_t c           = labels[i];
+  dataset[i * dim + j] = utils::mapping<T>{}(val - centers[c * dim + j]);
+}
+
 template <typename T, typename IdxT>
 void extend(raft::resources const& handle,
             index<IdxT>* index,
@@ -481,10 +495,10 @@ inline auto build(raft::resources const& handle,
     cuvs::cluster::kmeans::fit(handle, kmeans_params, trainset_const_view, centers_view);
     raft::resource::sync_stream(handle);
 
-    // Train SQ: predict labels for the training subset, compute its residuals,
+    // Train SQ: predict labels for the training subset, compute residuals in-place,
     // and derive per-dimension vmin/delta from them.
-    auto train_labels = raft::make_device_vector<uint32_t, int64_t>(handle, n_rows_train);
     {
+      auto train_labels = raft::make_device_vector<uint32_t, int64_t>(handle, n_rows_train);
       cuvs::cluster::kmeans::balanced_params pred_params;
       pred_params.metric      = idx.metric();
       auto centers_const_view = raft::make_device_matrix_view<const float, int64_t>(
@@ -492,23 +506,22 @@ inline auto build(raft::resources const& handle,
       cuvs::cluster::kmeans::predict(
         handle, pred_params, trainset_const_view, centers_const_view, train_labels.view());
       raft::resource::sync_stream(handle);
-    }
 
-    rmm::device_uvector<float> residuals(
-      n_rows_train * dim, stream, raft::resource::get_large_workspace_resource(handle));
-    {
       dim3 threads(32, 8);
       dim3 blocks(raft::ceildiv<int64_t>(n_rows_train, threads.x),
                   raft::ceildiv<uint32_t>(dim, threads.y));
-      compute_residuals_kernel<T><<<blocks, threads, 0, stream>>>(trainset.data_handle(),
-                                                                  idx.centers().data_handle(),
-                                                                  train_labels.data_handle(),
-                                                                  residuals.data(),
-                                                                  n_rows_train,
-                                                                  dim);
+      compute_residuals_inplace_kernel<T>
+        <<<blocks, threads, 0, stream>>>(trainset.data_handle(),
+                                         idx.centers().data_handle(),
+                                         train_labels.data_handle(),
+                                         n_rows_train,
+                                         dim);
       RAFT_CUDA_TRY(cudaPeekAtLastError());
     }
 
+    // After the in-place kernel, trainset now contains residuals.
+    auto& residuals = trainset;
+
     {
       auto vmax_buf  = raft::make_device_vector<float, uint32_t>(handle, dim);
       auto* vmin_ptr = idx.sq_vmin().data_handle();
@@ -516,7 +529,7 @@ inline auto build(raft::resources const& handle,
 
       constexpr int kMinMaxBlockSize = 256;
       fused_column_minmax_kernel<kMinMaxBlockSize><<<dim, kMinMaxBlockSize, 0, stream>>>(
-        residuals.data(), vmin_ptr, vmax_ptr, n_rows_train, dim);
+        residuals.data_handle(), vmin_ptr, vmax_ptr, n_rows_train, dim);
       RAFT_CUDA_TRY(cudaPeekAtLastError());
 
       // Expand the observed range by a small margin to reduce clipping on unseen data,

From e7d660cddcc9e197c12dacae0db54a978d75fb1e Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Fri, 13 Mar 2026 15:27:29 +0100
Subject: [PATCH 12/43] improved kernel layout for residuals computation

---
 cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh | 28 +++++++++++------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
index e11f5dd6b5..617d01179a 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
@@ -234,13 +234,13 @@ template <typename T>
 RAFT_KERNEL compute_residuals_inplace_kernel(
   T* dataset, const float* centers, const uint32_t* labels, int64_t n_rows, uint32_t dim)
 {
-  int64_t i  = int64_t(blockIdx.x) * blockDim.x + threadIdx.x;
-  uint32_t j = blockIdx.y * blockDim.y + threadIdx.y;
-  if (i >= n_rows || j >= dim) return;
-
-  float val            = utils::mapping<float>{}(dataset[i * dim + j]);
-  uint32_t c           = labels[i];
-  dataset[i * dim + j] = utils::mapping<T>{}(val - centers[c * dim + j]);
+  int64_t i = blockIdx.x;
+  if (i >= n_rows) return;
+  uint32_t c = labels[i];
+  for (uint32_t j = threadIdx.x; j < dim; j += blockDim.x) {
+    float val            = utils::mapping<float>{}(dataset[i * dim + j]);
+    dataset[i * dim + j] = utils::mapping<T>{}(val - centers[c * dim + j]);
+  }
 }
 
 template <typename T, typename IdxT>
@@ -507,15 +507,13 @@ inline auto build(raft::resources const& handle,
         handle, pred_params, trainset_const_view, centers_const_view, train_labels.view());
       raft::resource::sync_stream(handle);
 
-      dim3 threads(32, 8);
-      dim3 blocks(raft::ceildiv<int64_t>(n_rows_train, threads.x),
-                  raft::ceildiv<uint32_t>(dim, threads.y));
+      constexpr int kResidualBlockSize = 256;
       compute_residuals_inplace_kernel<T>
-        <<<blocks, threads, 0, stream>>>(trainset.data_handle(),
-                                         idx.centers().data_handle(),
-                                         train_labels.data_handle(),
-                                         n_rows_train,
-                                         dim);
+        <<<n_rows_train, kResidualBlockSize, 0, stream>>>(trainset.data_handle(),
+                                                          idx.centers().data_handle(),
+                                                          train_labels.data_handle(),
+                                                          n_rows_train,
+                                                          dim);
       RAFT_CUDA_TRY(cudaPeekAtLastError());
     }
 

From 96b28db42cbb0aae730fefffc13ba0da2c763f59 Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Fri, 13 Mar 2026 15:40:27 +0100
Subject: [PATCH 13/43] raft::device_vector

---
 cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
index 617d01179a..3691b2306d 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
@@ -307,8 +307,7 @@ void extend(raft::resources const& handle,
   }
 
   auto* list_sizes_ptr    = index->list_sizes().data_handle();
-  auto old_list_sizes_dev = raft::make_device_mdarray<uint32_t>(
-    handle, raft::resource::get_workspace_resource(handle), raft::make_extents<int64_t>(n_lists));
+  auto old_list_sizes_dev = raft::make_device_vector<uint32_t, int64_t>(handle, n_lists);
   raft::copy(old_list_sizes_dev.data_handle(), list_sizes_ptr, n_lists, stream);
 
   if (index->adaptive_centers()) {

From 206cb2e612538afadbb3017d90c668f742b5f7e1 Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Fri, 13 Mar 2026 17:32:06 +0100
Subject: [PATCH 14/43] drop adaptative_centers feature

---
 cpp/include/cuvs/neighbors/ivf_sq.hpp         |  16 --
 cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh     |  50 +---
 cpp/src/neighbors/ivf_sq/ivf_sq_serialize.cuh |  16 +-
 cpp/src/neighbors/ivf_sq_index.cpp            |  17 +-
 cpp/tests/neighbors/ann_ivf_sq.cuh            | 243 ++++++++----------
 5 files changed, 132 insertions(+), 210 deletions(-)

diff --git a/cpp/include/cuvs/neighbors/ivf_sq.hpp b/cpp/include/cuvs/neighbors/ivf_sq.hpp
index 042ecda10a..10d9a4c856 100644
--- a/cpp/include/cuvs/neighbors/ivf_sq.hpp
+++ b/cpp/include/cuvs/neighbors/ivf_sq.hpp
@@ -30,19 +30,6 @@ struct index_params : cuvs::neighbors::index_params {
   uint32_t kmeans_n_iters = 20;
   /** The fraction of data to use during iterative kmeans building. */
   double kmeans_trainset_fraction = 0.5;
-  /**
-   * By default (adaptive_centers = false), the cluster centers are trained in `ivf_sq::build`,
-   * and never modified in `ivf_sq::extend`. As a result, you may need to retrain the index
-   * from scratch after invoking (`ivf_sq::extend`) a few times with new data, the distribution of
-   * which is no longer representative of the original training set.
-   *
-   * The alternative behavior (adaptive_centers = true) is to update the cluster centers for new
-   * data when it is added. In this case, `index.centers()` are always exactly the centroids of the
-   * data in the corresponding clusters. The drawback of this behavior is that the centroids depend
-   * on the order of adding new data (through the classification of the added data); that is,
-   * `index.centers()` "drift" together with the changing distribution of the newly added data.
-   */
-  bool adaptive_centers = false;
   /**
    * By default, the algorithm allocates more space than necessary for individual clusters
    * (`list_data`). This allows to amortize the cost of memory allocation and reduce the number of
@@ -181,11 +168,9 @@ struct index : cuvs::neighbors::index {
         cuvs::distance::DistanceType metric,
         uint32_t n_lists,
         uint32_t dim,
-        bool adaptive_centers,
         bool conservative_memory_allocation);
 
   cuvs::distance::DistanceType metric() const noexcept;
-  bool adaptive_centers() const noexcept;
   int64_t size() const noexcept;
   uint32_t dim() const noexcept;
   uint32_t n_lists() const noexcept;
@@ -223,7 +208,6 @@ struct index : cuvs::neighbors::index {
 
  private:
   cuvs::distance::DistanceType metric_;
-  bool adaptive_centers_;
   bool conservative_memory_allocation_;
 
   std::vector<std::shared_ptr<list_data<IdxT, int64_t>>> lists_;
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
index 3691b2306d..e4a373a80b 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
@@ -13,7 +13,6 @@
 #include <cuvs/neighbors/common.hpp>
 #include <cuvs/neighbors/ivf_sq.hpp>
 
-#include "../../cluster/kmeans_balanced.cuh"
 #include "../detail/ann_utils.cuh"
 #include <cuvs/distance/distance.hpp>
 #include <raft/core/logger.hpp>
@@ -109,12 +108,8 @@ auto clone(const raft::resources& res, const index<IdxT>& source) -> index<IdxT>
 {
   auto stream = raft::resource::get_cuda_stream(res);
 
-  index<IdxT> target(res,
-                     source.metric(),
-                     source.n_lists(),
-                     source.dim(),
-                     source.adaptive_centers(),
-                     source.conservative_memory_allocation());
+  index<IdxT> target(
+    res, source.metric(), source.n_lists(), source.dim(), source.conservative_memory_allocation());
 
   raft::copy(target.list_sizes().data_handle(),
              source.list_sizes().data_handle(),
@@ -310,36 +305,15 @@ void extend(raft::resources const& handle,
   auto old_list_sizes_dev = raft::make_device_vector<uint32_t, int64_t>(handle, n_lists);
   raft::copy(old_list_sizes_dev.data_handle(), list_sizes_ptr, n_lists, stream);
 
-  if (index->adaptive_centers()) {
-    auto centroids_view = raft::make_device_matrix_view<float, int64_t>(
-      index->centers().data_handle(), index->centers().extent(0), index->centers().extent(1));
-    auto list_sizes_view =
-      raft::make_device_vector_view<std::remove_pointer_t<decltype(list_sizes_ptr)>, int64_t>(
-        list_sizes_ptr, n_lists);
-    for (const auto& batch : vec_batches) {
-      auto batch_data_view =
-        raft::make_device_matrix_view<const T, int64_t>(batch.data(), batch.size(), index->dim());
-      auto batch_labels_view = raft::make_device_vector_view<const LabelT, int64_t>(
-        new_labels.data_handle() + batch.offset(), batch.size());
-      cuvs::cluster::kmeans_balanced::helpers::calc_centers_and_sizes(handle,
-                                                                      batch_data_view,
-                                                                      batch_labels_view,
-                                                                      centroids_view,
-                                                                      list_sizes_view,
-                                                                      false,
-                                                                      utils::mapping<float>{});
-    }
-  } else {
-    raft::stats::histogram<uint32_t, int64_t>(raft::stats::HistTypeAuto,
-                                              reinterpret_cast<int32_t*>(list_sizes_ptr),
-                                              int64_t(n_lists),
-                                              new_labels.data_handle(),
-                                              n_rows,
-                                              1,
-                                              stream);
-    raft::linalg::add(
-      list_sizes_ptr, list_sizes_ptr, old_list_sizes_dev.data_handle(), n_lists, stream);
-  }
+  raft::stats::histogram<uint32_t, int64_t>(raft::stats::HistTypeAuto,
+                                            reinterpret_cast<int32_t*>(list_sizes_ptr),
+                                            int64_t(n_lists),
+                                            new_labels.data_handle(),
+                                            n_rows,
+                                            1,
+                                            stream);
+  raft::linalg::add(
+    list_sizes_ptr, list_sizes_ptr, old_list_sizes_dev.data_handle(), n_lists, stream);
 
   std::vector<uint32_t> new_list_sizes(n_lists);
   std::vector<uint32_t> old_list_sizes(n_lists);
@@ -437,8 +411,6 @@ void extend(raft::resources const& handle,
   if (!index->center_norms().has_value()) {
     index->allocate_center_norms(handle);
     if (index->center_norms().has_value()) { compute_center_norms(); }
-  } else if (index->adaptive_centers()) {
-    compute_center_norms();
   }
 }
 
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_serialize.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_serialize.cuh
index b95e63ee33..8aa1f12e04 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_serialize.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_serialize.cuh
@@ -37,7 +37,6 @@ void serialize(raft::resources const& handle, std::ostream& os, const index<IdxT
   serialize_scalar(handle, os, index_.dim());
   serialize_scalar(handle, os, index_.n_lists());
   serialize_scalar(handle, os, index_.metric());
-  serialize_scalar(handle, os, index_.adaptive_centers());
   serialize_scalar(handle, os, index_.conservative_memory_allocation());
   serialize_mdspan(handle, os, index_.centers());
 
@@ -94,14 +93,13 @@ auto deserialize(raft::resources const& handle, std::istream& is) -> index<IdxT>
   if (ver != serialization_version) {
     RAFT_FAIL("serialization version mismatch, expected %d, got %d ", serialization_version, ver);
   }
-  auto n_rows           = raft::deserialize_scalar<int64_t>(handle, is);
-  auto dim              = raft::deserialize_scalar<uint32_t>(handle, is);
-  auto n_lists          = raft::deserialize_scalar<uint32_t>(handle, is);
-  auto metric           = raft::deserialize_scalar<cuvs::distance::DistanceType>(handle, is);
-  bool adaptive_centers = raft::deserialize_scalar<bool>(handle, is);
-  bool cma              = raft::deserialize_scalar<bool>(handle, is);
-
-  index<IdxT> index_ = index<IdxT>(handle, metric, n_lists, dim, adaptive_centers, cma);
+  auto n_rows  = raft::deserialize_scalar<int64_t>(handle, is);
+  auto dim     = raft::deserialize_scalar<uint32_t>(handle, is);
+  auto n_lists = raft::deserialize_scalar<uint32_t>(handle, is);
+  auto metric  = raft::deserialize_scalar<cuvs::distance::DistanceType>(handle, is);
+  bool cma     = raft::deserialize_scalar<bool>(handle, is);
+
+  index<IdxT> index_ = index<IdxT>(handle, metric, n_lists, dim, cma);
 
   deserialize_mdspan(handle, is, index_.centers());
 
diff --git a/cpp/src/neighbors/ivf_sq_index.cpp b/cpp/src/neighbors/ivf_sq_index.cpp
index ffa54bd2e9..8b4de55f54 100644
--- a/cpp/src/neighbors/ivf_sq_index.cpp
+++ b/cpp/src/neighbors/ivf_sq_index.cpp
@@ -14,18 +14,13 @@ namespace cuvs::neighbors::ivf_sq {
 
 template <typename IdxT>
 index<IdxT>::index(raft::resources const& res)
-  : index(res, cuvs::distance::DistanceType::L2Expanded, 0, 0, false, false)
+  : index(res, cuvs::distance::DistanceType::L2Expanded, 0, 0, false)
 {
 }
 
 template <typename IdxT>
 index<IdxT>::index(raft::resources const& res, const index_params& params, uint32_t dim)
-  : index(res,
-          params.metric,
-          params.n_lists,
-          dim,
-          params.adaptive_centers,
-          params.conservative_memory_allocation)
+  : index(res, params.metric, params.n_lists, dim, params.conservative_memory_allocation)
 {
 }
 
@@ -34,11 +29,9 @@ index<IdxT>::index(raft::resources const& res,
                    cuvs::distance::DistanceType metric,
                    uint32_t n_lists,
                    uint32_t dim,
-                   bool adaptive_centers,
                    bool conservative_memory_allocation)
   : cuvs::neighbors::index(),
     metric_(metric),
-    adaptive_centers_(adaptive_centers),
     conservative_memory_allocation_(conservative_memory_allocation),
     lists_{n_lists},
     list_sizes_{raft::make_device_vector<uint32_t, uint32_t>(res, n_lists)},
@@ -67,12 +60,6 @@ cuvs::distance::DistanceType index<IdxT>::metric() const noexcept
   return metric_;
 }
 
-template <typename IdxT>
-bool index<IdxT>::adaptive_centers() const noexcept
-{
-  return adaptive_centers_;
-}
-
 template <typename IdxT>
 int64_t index<IdxT>::size() const noexcept
 {
diff --git a/cpp/tests/neighbors/ann_ivf_sq.cuh b/cpp/tests/neighbors/ann_ivf_sq.cuh
index 25abc82740..d90ec66959 100644
--- a/cpp/tests/neighbors/ann_ivf_sq.cuh
+++ b/cpp/tests/neighbors/ann_ivf_sq.cuh
@@ -31,7 +31,6 @@ struct AnnIvfSqInputs {
   IdxT nprobe;
   IdxT nlist;
   cuvs::distance::DistanceType metric;
-  bool adaptive_centers;
 };
 
 template <typename IdxT>
@@ -40,7 +39,7 @@ template <typename IdxT>
   os << "{ " << p.num_queries << ", " << p.num_db_vecs << ", " << p.dim << ", " << p.k << ", "
      << p.nprobe << ", " << p.nlist << ", "
      << cuvs::neighbors::print_metric{static_cast<cuvs::distance::DistanceType>((int)p.metric)}
-     << ", " << p.adaptive_centers << '}' << std::endl;
+     << '}' << std::endl;
   return os;
 }
 
@@ -91,10 +90,9 @@ class AnnIVFSQTest : public ::testing::TestWithParam<AnnIvfSqInputs<IdxT>> {
       {
         cuvs::neighbors::ivf_sq::index_params index_params;
         cuvs::neighbors::ivf_sq::search_params search_params;
-        index_params.n_lists          = ps.nlist;
-        index_params.metric           = ps.metric;
-        index_params.adaptive_centers = ps.adaptive_centers;
-        search_params.n_probes        = ps.nprobe;
+        index_params.n_lists   = ps.nlist;
+        index_params.metric    = ps.metric;
+        search_params.n_probes = ps.nprobe;
 
         index_params.add_data_on_build        = true;
         index_params.kmeans_trainset_fraction = 0.5;
@@ -108,7 +106,6 @@ class AnnIVFSQTest : public ::testing::TestWithParam<AnnIvfSqInputs<IdxT>> {
         cuvs::neighbors::ivf_sq::index_params index_params_no_add;
         index_params_no_add.n_lists                  = ps.nlist;
         index_params_no_add.metric                   = ps.metric;
-        index_params_no_add.adaptive_centers         = ps.adaptive_centers;
         index_params_no_add.add_data_on_build        = false;
         index_params_no_add.kmeans_trainset_fraction = 0.5;
 
@@ -214,10 +211,9 @@ class AnnIVFSQTest : public ::testing::TestWithParam<AnnIvfSqInputs<IdxT>> {
       {
         cuvs::neighbors::ivf_sq::index_params index_params;
         cuvs::neighbors::ivf_sq::search_params search_params;
-        index_params.n_lists          = ps.nlist;
-        index_params.metric           = ps.metric;
-        index_params.adaptive_centers = ps.adaptive_centers;
-        search_params.n_probes        = ps.nprobe;
+        index_params.n_lists   = ps.nlist;
+        index_params.metric    = ps.metric;
+        search_params.n_probes = ps.nprobe;
 
         index_params.add_data_on_build        = true;
         index_params.kmeans_trainset_fraction = 0.5;
@@ -299,156 +295,141 @@ class AnnIVFSQTest : public ::testing::TestWithParam<AnnIvfSqInputs<IdxT>> {
 };
 
 const std::vector<AnnIvfSqInputs<int64_t>> inputs = {
-  // num_queries, num_db_vecs, dim, k, nprobe, nlist, metric, adaptive_centers
+  // num_queries, num_db_vecs, dim, k, nprobe, nlist, metric
 
   // ===== Dimension edge cases (all four metrics) =====
   // dim=1 (CosineExpanded excluded: requires dim > 1)
-  {1000, 10000, 1, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 1, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct, false},
-  {1000, 10000, 1, 10, 40, 1024, cuvs::distance::DistanceType::L2SqrtExpanded, false},
+  {1000, 10000, 1, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 1, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct},
+  {1000, 10000, 1, 10, 40, 1024, cuvs::distance::DistanceType::L2SqrtExpanded},
   // dim=2,3,4,5 (unaligned)
-  {1000, 10000, 2, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 2, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
-  {1000, 10000, 3, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded, true},
-  {1000, 10000, 3, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, true},
-  {1000, 10000, 4, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 4, 16, 40, 1024, cuvs::distance::DistanceType::InnerProduct, false},
-  {1000, 10000, 5, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 5, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+  {1000, 10000, 2, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 2, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded},
+  {1000, 10000, 3, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 3, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded},
+  {1000, 10000, 4, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 4, 16, 40, 1024, cuvs::distance::DistanceType::InnerProduct},
+  {1000, 10000, 5, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 5, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded},
   // dim=7,8 (around veclen=16 boundary, not a multiple of veclen)
-  {1000, 10000, 7, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 7, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
-  {1000, 10000, 8, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 8, 16, 40, 1024, cuvs::distance::DistanceType::InnerProduct, true},
-  {1000, 10000, 8, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, true},
+  {1000, 10000, 7, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 7, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded},
+  {1000, 10000, 8, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 8, 16, 40, 1024, cuvs::distance::DistanceType::InnerProduct},
+  {1000, 10000, 8, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded},
   // dim=15,16,17 (around veclen=16 boundary)
-  {1000, 10000, 15, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 15, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
-  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct, false},
-  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
-  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2SqrtExpanded, false},
-  {1000, 10000, 17, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 17, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+  {1000, 10000, 15, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 15, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded},
+  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct},
+  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded},
+  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2SqrtExpanded},
+  {1000, 10000, 17, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 17, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded},
   // dim=31,32,33 (around 2*veclen boundary)
-  {1000, 10000, 31, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 31, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
-  {1000, 10000, 32, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 32, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct, false},
-  {1000, 10000, 32, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
-  {1000, 10000, 33, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 33, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct, false},
+  {1000, 10000, 31, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 31, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded},
+  {1000, 10000, 32, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 32, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct},
+  {1000, 10000, 32, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded},
+  {1000, 10000, 33, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 33, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct},
   // medium dims
-  {1000, 10000, 64, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 64, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
-  {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct, false},
-  {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
-  {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::L2SqrtExpanded, false},
-  {1000, 10000, 256, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 256, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct, false},
+  {1000, 10000, 64, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 64, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded},
+  {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct},
+  {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded},
+  {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::L2SqrtExpanded},
+  {1000, 10000, 256, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 256, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct},
   // large dims (may exceed shared memory limits)
-  {1000, 10000, 2048, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 2048, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
-  {1000, 10000, 2049, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 2049, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
-  {1000, 10000, 2050, 16, 40, 1024, cuvs::distance::DistanceType::InnerProduct, false},
-  {1000, 10000, 2050, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
-  {1000, 10000, 4096, 20, 50, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 4096, 20, 50, 1024, cuvs::distance::DistanceType::InnerProduct, false},
-  {1000, 10000, 4096, 20, 50, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+  {1000, 10000, 2048, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 2048, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded},
+  {1000, 10000, 2049, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 2049, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded},
+  {1000, 10000, 2050, 16, 40, 1024, cuvs::distance::DistanceType::InnerProduct},
+  {1000, 10000, 2050, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded},
+  {1000, 10000, 4096, 20, 50, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 4096, 20, 50, 1024, cuvs::distance::DistanceType::InnerProduct},
+  {1000, 10000, 4096, 20, 50, 1024, cuvs::distance::DistanceType::CosineExpanded},
 
   // ===== k edge cases =====
-  {1000, 10000, 16, 1, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 16, 1, 40, 1024, cuvs::distance::DistanceType::InnerProduct, false},
-  {1000, 10000, 16, 1, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
-  {1000, 10000, 16, 2, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 16, 5, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 16, 20, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 16, 20, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
-  {1000, 10000, 16, 50, 100, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 16, 100, 200, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 16, 100, 200, 1024, cuvs::distance::DistanceType::InnerProduct, false},
+  {1000, 10000, 16, 1, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 16, 1, 40, 1024, cuvs::distance::DistanceType::InnerProduct},
+  {1000, 10000, 16, 1, 40, 1024, cuvs::distance::DistanceType::CosineExpanded},
+  {1000, 10000, 16, 2, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 16, 5, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 16, 20, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 16, 20, 40, 1024, cuvs::distance::DistanceType::CosineExpanded},
+  {1000, 10000, 16, 50, 100, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 16, 100, 200, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 16, 100, 200, 1024, cuvs::distance::DistanceType::InnerProduct},
 
   // ===== nprobe / nlist edge cases =====
   // nprobe == nlist (exhaustive probe)
-  {1000, 10000, 16, 10, 64, 64, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 16, 10, 64, 64, cuvs::distance::DistanceType::InnerProduct, false},
-  {1000, 10000, 16, 10, 64, 64, cuvs::distance::DistanceType::CosineExpanded, false},
+  {1000, 10000, 16, 10, 64, 64, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 16, 10, 64, 64, cuvs::distance::DistanceType::InnerProduct},
+  {1000, 10000, 16, 10, 64, 64, cuvs::distance::DistanceType::CosineExpanded},
   // nprobe == 1 (minimal probe)
-  {1000, 10000, 16, 10, 1, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 16, 10, 1, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+  {1000, 10000, 16, 10, 1, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 16, 10, 1, 1024, cuvs::distance::DistanceType::CosineExpanded},
   // nprobe > nlist (clamped to nlist)
-  {1000, 10000, 16, 10, 2048, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 16, 10, 2048, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+  {1000, 10000, 16, 10, 2048, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 16, 10, 2048, 1024, cuvs::distance::DistanceType::CosineExpanded},
   // various nprobe
-  {1000, 10000, 16, 10, 50, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 16, 10, 70, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 16, 10, 50, 1024, cuvs::distance::DistanceType::InnerProduct, false},
-  {1000, 10000, 16, 10, 70, 1024, cuvs::distance::DistanceType::InnerProduct, false},
-  {1000, 10000, 16, 10, 50, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
-  {1000, 10000, 16, 10, 70, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
-  {1000, 10000, 16, 10, 50, 1024, cuvs::distance::DistanceType::L2SqrtExpanded, false},
-  {1000, 10000, 16, 10, 70, 1024, cuvs::distance::DistanceType::L2SqrtExpanded, false},
+  {1000, 10000, 16, 10, 50, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 16, 10, 70, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 16, 10, 50, 1024, cuvs::distance::DistanceType::InnerProduct},
+  {1000, 10000, 16, 10, 70, 1024, cuvs::distance::DistanceType::InnerProduct},
+  {1000, 10000, 16, 10, 50, 1024, cuvs::distance::DistanceType::CosineExpanded},
+  {1000, 10000, 16, 10, 70, 1024, cuvs::distance::DistanceType::CosineExpanded},
+  {1000, 10000, 16, 10, 50, 1024, cuvs::distance::DistanceType::L2SqrtExpanded},
+  {1000, 10000, 16, 10, 70, 1024, cuvs::distance::DistanceType::L2SqrtExpanded},
   // very small nlist
-  {100, 10000, 16, 10, 8, 8, cuvs::distance::DistanceType::L2Expanded, false},
-  {100, 10000, 16, 10, 8, 8, cuvs::distance::DistanceType::CosineExpanded, false},
+  {100, 10000, 16, 10, 8, 8, cuvs::distance::DistanceType::L2Expanded},
+  {100, 10000, 16, 10, 8, 8, cuvs::distance::DistanceType::CosineExpanded},
   // smaller nlist
-  {100, 10000, 16, 10, 20, 512, cuvs::distance::DistanceType::L2Expanded, false},
-  {100, 10000, 16, 10, 20, 512, cuvs::distance::DistanceType::InnerProduct, false},
-  {100, 10000, 16, 10, 20, 512, cuvs::distance::DistanceType::CosineExpanded, false},
-  {100, 10000, 16, 10, 20, 512, cuvs::distance::DistanceType::L2SqrtExpanded, false},
+  {100, 10000, 16, 10, 20, 512, cuvs::distance::DistanceType::L2Expanded},
+  {100, 10000, 16, 10, 20, 512, cuvs::distance::DistanceType::InnerProduct},
+  {100, 10000, 16, 10, 20, 512, cuvs::distance::DistanceType::CosineExpanded},
+  {100, 10000, 16, 10, 20, 512, cuvs::distance::DistanceType::L2SqrtExpanded},
 
   // ===== Dataset size edge cases =====
   // single query
-  {1, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {1, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+  {1, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded},
   // very few queries
-  {2, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {5, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
+  {2, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {5, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded},
   // very few db vectors (nlist reduced to fit)
-  {100, 500, 16, 10, 40, 256, cuvs::distance::DistanceType::L2Expanded, false},
-  {100, 500, 16, 10, 40, 256, cuvs::distance::DistanceType::CosineExpanded, false},
+  {100, 500, 16, 10, 40, 256, cuvs::distance::DistanceType::L2Expanded},
+  {100, 500, 16, 10, 40, 256, cuvs::distance::DistanceType::CosineExpanded},
   // larger datasets
-  {20, 100000, 16, 10, 20, 1024, cuvs::distance::DistanceType::L2Expanded, true},
-  {20, 100000, 16, 10, 20, 1024, cuvs::distance::DistanceType::CosineExpanded, true},
-  {1000, 100000, 16, 10, 20, 1024, cuvs::distance::DistanceType::L2Expanded, true},
-  {1000, 100000, 16, 10, 20, 1024, cuvs::distance::DistanceType::CosineExpanded, true},
-  {10000, 131072, 8, 10, 20, 1024, cuvs::distance::DistanceType::L2Expanded, false},
-  {10000, 131072, 8, 10, 20, 1024, cuvs::distance::DistanceType::CosineExpanded, false},
-  {10000, 131072, 8, 10, 50, 1024, cuvs::distance::DistanceType::InnerProduct, true},
-  {10000, 131072, 8, 10, 50, 1024, cuvs::distance::DistanceType::L2SqrtExpanded, false},
+  {20, 100000, 16, 10, 20, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {20, 100000, 16, 10, 20, 1024, cuvs::distance::DistanceType::CosineExpanded},
+  {1000, 100000, 16, 10, 20, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 100000, 16, 10, 20, 1024, cuvs::distance::DistanceType::CosineExpanded},
+  {10000, 131072, 8, 10, 20, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {10000, 131072, 8, 10, 20, 1024, cuvs::distance::DistanceType::CosineExpanded},
+  {10000, 131072, 8, 10, 50, 1024, cuvs::distance::DistanceType::InnerProduct},
+  {10000, 131072, 8, 10, 50, 1024, cuvs::distance::DistanceType::L2SqrtExpanded},
 
   // ===== Large query batches (gridDim.x > 65535) =====
-  {100000, 1024, 32, 10, 64, 64, cuvs::distance::DistanceType::L2Expanded, false},
-  {100000, 1024, 32, 10, 64, 64, cuvs::distance::DistanceType::InnerProduct, false},
-  {100000, 1024, 32, 10, 64, 64, cuvs::distance::DistanceType::CosineExpanded, false},
-  {100000, 1024, 32, 10, 64, 64, cuvs::distance::DistanceType::L2SqrtExpanded, false},
-  {100000, 8712, 3, 10, 51, 66, cuvs::distance::DistanceType::L2Expanded, false},
-  {100000, 8712, 3, 10, 51, 66, cuvs::distance::DistanceType::CosineExpanded, false},
+  {100000, 1024, 32, 10, 64, 64, cuvs::distance::DistanceType::L2Expanded},
+  {100000, 1024, 32, 10, 64, 64, cuvs::distance::DistanceType::InnerProduct},
+  {100000, 1024, 32, 10, 64, 64, cuvs::distance::DistanceType::CosineExpanded},
+  {100000, 1024, 32, 10, 64, 64, cuvs::distance::DistanceType::L2SqrtExpanded},
+  {100000, 8712, 3, 10, 51, 66, cuvs::distance::DistanceType::L2Expanded},
+  {100000, 8712, 3, 10, 51, 66, cuvs::distance::DistanceType::CosineExpanded},
   // just above the old 65535 limit
-  {65536, 1024, 16, 10, 32, 64, cuvs::distance::DistanceType::L2Expanded, false},
-  {65536, 1024, 16, 10, 32, 64, cuvs::distance::DistanceType::CosineExpanded, false},
-
-  // ===== Adaptive centers (all four metrics, multiple dims) =====
-  {1000, 10000, 8, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, true},
-  {1000, 10000, 8, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct, true},
-  {1000, 10000, 8, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, true},
-  {1000, 10000, 8, 10, 40, 1024, cuvs::distance::DistanceType::L2SqrtExpanded, true},
-  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, true},
-  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct, true},
-  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, true},
-  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2SqrtExpanded, true},
-  {1000, 10000, 32, 10, 50, 1024, cuvs::distance::DistanceType::L2Expanded, true},
-  {1000, 10000, 32, 10, 50, 1024, cuvs::distance::DistanceType::InnerProduct, true},
-  {1000, 10000, 32, 10, 50, 1024, cuvs::distance::DistanceType::CosineExpanded, true},
-  {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, true},
-  {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, true},
+  {65536, 1024, 16, 10, 32, 64, cuvs::distance::DistanceType::L2Expanded},
+  {65536, 1024, 16, 10, 32, 64, cuvs::distance::DistanceType::CosineExpanded},
 
   // ===== Recall-stability: same data, different query counts =====
-  {20000, 8712, 3, 10, 51, 66, cuvs::distance::DistanceType::L2Expanded, false},
-  {50000, 8712, 3, 10, 51, 66, cuvs::distance::DistanceType::L2Expanded, false},
+  {20000, 8712, 3, 10, 51, 66, cuvs::distance::DistanceType::L2Expanded},
+  {50000, 8712, 3, 10, 51, 66, cuvs::distance::DistanceType::L2Expanded},
 };
 
 }  // namespace cuvs::neighbors::ivf_sq

From e34bdd8178000a92aa4ee6083edf71b1e36e74a9 Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Mon, 16 Mar 2026 11:19:03 +0100
Subject: [PATCH 15/43] Add IVF-SQ FAISS benchmark

---
 cpp/bench/ann/CMakeLists.txt                  | 15 ++++++++++
 .../cuvs_bench/config/algorithms.yaml         |  5 +++-
 .../config/algos/constraints/__init__.py      | 12 ++++++++
 .../cuvs_bench/config/algos/cuvs_ivf_sq.yaml  | 14 +++++-----
 .../config/algos/faiss_cpu_ivf_sq.yaml        | 28 +++++++++++++++++++
 .../config/algos/faiss_gpu_ivf_sq.yaml        | 28 +++++++++++++++++++
 6 files changed, 94 insertions(+), 8 deletions(-)
 create mode 100644 python/cuvs_bench/cuvs_bench/config/algos/faiss_cpu_ivf_sq.yaml
 create mode 100644 python/cuvs_bench/cuvs_bench/config/algos/faiss_gpu_ivf_sq.yaml

diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt
index c377b64ec6..0755a15a2f 100644
--- a/cpp/bench/ann/CMakeLists.txt
+++ b/cpp/bench/ann/CMakeLists.txt
@@ -13,6 +13,7 @@ list(APPEND CMAKE_MODULE_PATH "${CUVS_SOURCE_DIR}")
 option(CUVS_ANN_BENCH_USE_FAISS_GPU_FLAT "Include faiss' brute-force knn algorithm in benchmark" ON)
 option(CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" ON)
 option(CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" ON)
+option(CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_SQ "Include faiss' ivf sq algorithm in benchmark" ON)
 option(CUVS_ANN_BENCH_USE_FAISS_GPU_CAGRA "Include faiss' cagra algorithm in benchmark" ON)
 option(CUVS_ANN_BENCH_USE_FAISS_GPU_CAGRA_HNSW
        "Include faiss' cagra algorithm for build and hnsw for search in benchmark" ON
@@ -22,6 +23,7 @@ option(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT "Include faiss' cpu ivf flat algori
        ON
 )
 option(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_PQ "Include faiss' cpu ivf pq algorithm in benchmark" ON)
+option(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_SQ "Include faiss' cpu ivf sq algorithm in benchmark" ON)
 option(CUVS_ANN_BENCH_USE_FAISS_CPU_HNSW_FLAT "Include faiss' hnsw algorithm in benchmark" ON)
 option(CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT "Include cuVS ivf flat algorithm in benchmark" ON)
 option(CUVS_ANN_BENCH_USE_CUVS_IVF_SQ "Include cuVS ivf sq algorithm in benchmark" ON)
@@ -318,6 +320,12 @@ if(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_PQ)
   )
 endif()
 
+if(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_SQ)
+  ConfigureAnnBench(
+    NAME FAISS_CPU_IVF_SQ PATH src/faiss/faiss_cpu_benchmark.cpp LINKS ${CUVS_FAISS_TARGETS}
+  )
+endif()
+
 if(CUVS_ANN_BENCH_USE_FAISS_CPU_HNSW_FLAT)
   ConfigureAnnBench(
     NAME FAISS_CPU_HNSW_FLAT PATH src/faiss/faiss_cpu_benchmark.cpp LINKS ${CUVS_FAISS_TARGETS}
@@ -338,6 +346,13 @@ if(CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_PQ AND CUVS_FAISS_ENABLE_GPU)
   )
 endif()
 
+if(CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_SQ AND CUVS_FAISS_ENABLE_GPU)
+  ConfigureAnnBench(
+    NAME FAISS_GPU_IVF_SQ PATH src/faiss/faiss_gpu_benchmark.cu LINKS ${CUVS_FAISS_TARGETS}
+    raft::raft
+  )
+endif()
+
 if(CUVS_ANN_BENCH_USE_FAISS_GPU_FLAT AND CUVS_FAISS_ENABLE_GPU)
   ConfigureAnnBench(
     NAME FAISS_GPU_FLAT PATH src/faiss/faiss_gpu_benchmark.cu LINKS ${CUVS_FAISS_TARGETS}
diff --git a/python/cuvs_bench/cuvs_bench/config/algorithms.yaml b/python/cuvs_bench/cuvs_bench/config/algorithms.yaml
index 3a787f65ab..f181e549eb 100644
--- a/python/cuvs_bench/cuvs_bench/config/algorithms.yaml
+++ b/python/cuvs_bench/cuvs_bench/config/algorithms.yaml
@@ -8,7 +8,7 @@ faiss_gpu_ivf_pq:
   executable: FAISS_GPU_IVF_PQ_ANN_BENCH
   requires_gpu: true
 faiss_gpu_ivf_sq:
-  executable: FAISS_GPU_IVF_PQ_ANN_BENCH
+  executable: FAISS_GPU_IVF_SQ_ANN_BENCH
   requires_gpu: true
 faiss_gpu_cagra:
   executable: FAISS_GPU_CAGRA_ANN_BENCH
@@ -25,6 +25,9 @@ faiss_cpu_ivf_flat:
 faiss_cpu_ivf_pq:
   executable: FAISS_CPU_IVF_PQ_ANN_BENCH
   requires_gpu: false
+faiss_cpu_ivf_sq:
+  executable: FAISS_CPU_IVF_SQ_ANN_BENCH
+  requires_gpu: false
 faiss_cpu_hnsw_flat:
   executable: FAISS_CPU_HNSW_FLAT_ANN_BENCH
   requires_gpu: false
diff --git a/python/cuvs_bench/cuvs_bench/config/algos/constraints/__init__.py b/python/cuvs_bench/cuvs_bench/config/algos/constraints/__init__.py
index ea2afe351e..f22852f0ae 100644
--- a/python/cuvs_bench/cuvs_bench/config/algos/constraints/__init__.py
+++ b/python/cuvs_bench/cuvs_bench/config/algos/constraints/__init__.py
@@ -61,6 +61,18 @@ def cuvs_ivf_sq_search(params, build_params, k, batch_size):
 ###############################################################################
 
 
+def faiss_gpu_ivf_sq_search(params, build_params, k, batch_size):
+    if "nlist" in build_params and "nprobe" in params:
+        return build_params["nlist"] >= params["nprobe"]
+    return True
+
+
+def faiss_cpu_ivf_sq_search(params, build_params, k, batch_size):
+    if "nlist" in build_params and "nprobe" in params:
+        return build_params["nlist"] >= params["nprobe"]
+    return True
+
+
 def faiss_gpu_ivf_pq_build(params, dims):
     ret = True
     # M must be defined
diff --git a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_ivf_sq.yaml b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_ivf_sq.yaml
index adaad54e04..af59493eef 100644
--- a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_ivf_sq.yaml
+++ b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_ivf_sq.yaml
@@ -4,22 +4,22 @@ constraints:
 groups:
   base:
     build:
-      nlist: [1024, 2048, 4096, 8192]
-      ratio: [1, 2]
+      nlist: [1024, 2048, 4096]
+      ratio: [2, 4]
       niter: [25]
     search:
-      nprobe: [1, 5, 10, 20, 50, 100, 200, 500]
+      nprobe: [1, 5, 10, 20, 50, 100, 200]
   large:
     build:
-      nlist: [8192, 16384, 32000, 64000]
-      ratio: [2, 4]
+      nlist: [8192, 16384, 32768]
+      ratio: [4]
       niter: [20]
     search:
-      nprobe: [10, 20, 50, 100, 200, 500, 1000, 2000]
+      nprobe: [10, 20, 50, 100, 200, 500, 1000]
   test:
     build:
       nlist: [1024]
-      ratio: [1]
+      ratio: [2]
       niter: [20]
     search:
       nprobe: [1, 5]
diff --git a/python/cuvs_bench/cuvs_bench/config/algos/faiss_cpu_ivf_sq.yaml b/python/cuvs_bench/cuvs_bench/config/algos/faiss_cpu_ivf_sq.yaml
new file mode 100644
index 0000000000..ce237f280d
--- /dev/null
+++ b/python/cuvs_bench/cuvs_bench/config/algos/faiss_cpu_ivf_sq.yaml
@@ -0,0 +1,28 @@
+name: faiss_cpu_ivf_sq
+constraints:
+  search: cuvs_bench.config.algos.constraints.faiss_cpu_ivf_sq_search
+groups:
+  base:
+    build:
+      nlist: [1024, 2048, 4096]
+      ratio: [2, 4]
+      quantizer_type: [int8]
+    search:
+      nprobe: [1, 5, 10, 20, 50, 100, 200]
+      refine_ratio: [1]
+  large:
+    build:
+      nlist: [8192, 16384, 32768]
+      ratio: [4]
+      quantizer_type: [int8]
+    search:
+      nprobe: [10, 20, 50, 100, 200, 500, 1000]
+      refine_ratio: [1, 2]
+  test:
+    build:
+      nlist: [1024]
+      ratio: [2]
+      quantizer_type: [int8]
+    search:
+      nprobe: [1, 5]
+      refine_ratio: [1]
diff --git a/python/cuvs_bench/cuvs_bench/config/algos/faiss_gpu_ivf_sq.yaml b/python/cuvs_bench/cuvs_bench/config/algos/faiss_gpu_ivf_sq.yaml
new file mode 100644
index 0000000000..d49df8116d
--- /dev/null
+++ b/python/cuvs_bench/cuvs_bench/config/algos/faiss_gpu_ivf_sq.yaml
@@ -0,0 +1,28 @@
+name: faiss_gpu_ivf_sq
+constraints:
+  search: cuvs_bench.config.algos.constraints.faiss_gpu_ivf_sq_search
+groups:
+  base:
+    build:
+      nlist: [1024, 2048, 4096]
+      ratio: [2, 4]
+      quantizer_type: [int8]
+    search:
+      nprobe: [1, 5, 10, 20, 50, 100, 200]
+      refine_ratio: [1]
+  large:
+    build:
+      nlist: [8192, 16384, 32768]
+      ratio: [4]
+      quantizer_type: [int8]
+    search:
+      nprobe: [10, 20, 50, 100, 200, 500, 1000]
+      refine_ratio: [1, 2]
+  test:
+    build:
+      nlist: [1024]
+      ratio: [2]
+      quantizer_type: [int8]
+    search:
+      nprobe: [1, 5]
+      refine_ratio: [1]

From 9bd7bc05aa49e7049a120c3764e81bce6709ef71 Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Thu, 19 Mar 2026 14:36:02 +0100
Subject: [PATCH 16/43] Adressing review

---
 cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh |  7 +++++++
 cpp/src/neighbors/ivf_sq_index.cpp         | 10 ++++++++++
 2 files changed, 17 insertions(+)

diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
index a17992ff19..88c1b71970 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
@@ -459,6 +459,13 @@ inline void search_with_filtering(raft::resources const& handle,
   RAFT_EXPECTS(params.n_probes > 0,
                "n_probes (number of clusters to probe in the search) must be positive.");
   auto n_probes = std::min<uint32_t>(params.n_probes, index.n_lists());
+  if (n_probes < params.n_probes) {
+    RAFT_LOG_WARN(
+      "n_probes (%u) is larger than the number of lists in the index (%u), clamping to %u.",
+      params.n_probes,
+      index.n_lists(),
+      n_probes);
+  }
 
   uint32_t max_samples =
     std::max<uint32_t>(static_cast<uint32_t>(index.accum_sorted_sizes()(n_probes)), k);
diff --git a/cpp/src/neighbors/ivf_sq_index.cpp b/cpp/src/neighbors/ivf_sq_index.cpp
index 8b4de55f54..91eb86704f 100644
--- a/cpp/src/neighbors/ivf_sq_index.cpp
+++ b/cpp/src/neighbors/ivf_sq_index.cpp
@@ -43,6 +43,8 @@ index<IdxT>::index(raft::resources const& res,
     inds_ptrs_{raft::make_device_vector<int64_t*, uint32_t>(res, n_lists)},
     accum_sorted_sizes_{raft::make_host_vector<int64_t, uint32_t>(n_lists + 1)}
 {
+  RAFT_EXPECTS(n_lists > 0, "n_lists must be positive.");
+  RAFT_EXPECTS(dim > 0, "dim must be positive.");
   check_consistency();
   auto stream = raft::resource::get_cuda_stream(res);
   std::memset(accum_sorted_sizes_.data_handle(), 0, accum_sorted_sizes_.size() * sizeof(int64_t));
@@ -228,6 +230,14 @@ void index<IdxT>::check_consistency()
   RAFT_EXPECTS((centers_.extent(0) == list_sizes_.extent(0)) &&
                  (!center_norms_.has_value() || centers_.extent(0) == center_norms_->extent(0)),
                "inconsistent number of lists (clusters)");
+  RAFT_EXPECTS(sq_vmin_.extent(0) == centers_.extent(1),
+               "sq_vmin size (%u) does not match dim (%u)",
+               static_cast<uint32_t>(sq_vmin_.extent(0)),
+               static_cast<uint32_t>(centers_.extent(1)));
+  RAFT_EXPECTS(sq_delta_.extent(0) == centers_.extent(1),
+               "sq_delta size (%u) does not match dim (%u)",
+               static_cast<uint32_t>(sq_delta_.extent(0)),
+               static_cast<uint32_t>(centers_.extent(1)));
 }
 
 template struct index<uint8_t>;

From 0ce16416ff1364bba28440abd1da7ce4e5c339e5 Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Fri, 20 Mar 2026 13:55:58 +0100
Subject: [PATCH 17/43] Addressing review

---
 cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh | 48 ++++++++---------------
 cpp/src/neighbors/ivf_sq_index.cpp        |  2 -
 2 files changed, 16 insertions(+), 34 deletions(-)

diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
index e4a373a80b..5c2f6808f1 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
@@ -204,26 +204,6 @@ __launch_bounds__(BlockSize) RAFT_KERNEL encode_and_fill_kernel(const uint32_t*
   }
 }
 
-/**
- * Compute residuals: residual[i] = cast<float>(x_i) - centers[labels[i]]
- */
-template <typename T>
-RAFT_KERNEL compute_residuals_kernel(const T* dataset,
-                                     const float* centers,
-                                     const uint32_t* labels,
-                                     float* residuals,
-                                     int64_t n_rows,
-                                     uint32_t dim)
-{
-  int64_t i  = int64_t(blockIdx.x) * blockDim.x + threadIdx.x;
-  uint32_t j = blockIdx.y * blockDim.y + threadIdx.y;
-  if (i >= n_rows || j >= dim) return;
-
-  float val              = utils::mapping<float>{}(dataset[i * dim + j]);
-  uint32_t c             = labels[i];
-  residuals[i * dim + j] = val - centers[c * dim + j];
-}
-
 /** In-place variant: dataset[i] = cast<T>(cast<float>(dataset[i]) - centers[labels[i]]) */
 template <typename T>
 RAFT_KERNEL compute_residuals_inplace_kernel(
@@ -348,16 +328,22 @@ void extend(raft::resources const& handle,
     int64_t bs = batch.size();
 
     {
-      dim3 threads(32, 8);
-      dim3 blocks(raft::ceildiv<int64_t>(bs, threads.x), raft::ceildiv<uint32_t>(dim, threads.y));
-      compute_residuals_kernel<T>
-        <<<blocks, threads, 0, stream>>>(batch.data(),
-                                         index->centers().data_handle(),
-                                         new_labels.data_handle() + batch.offset(),
-                                         residuals_buf.data_handle(),
-                                         bs,
-                                         dim);
-      RAFT_CUDA_TRY(cudaPeekAtLastError());
+      auto batch_view = raft::make_device_matrix_view<const T, int64_t>(batch.data(), bs, dim);
+      auto residuals_view =
+        raft::make_device_matrix_view<float, int64_t>(residuals_buf.data_handle(), bs, dim);
+
+      const float* centers_ptr   = index->centers().data_handle();
+      const uint32_t* labels_ptr = new_labels.data_handle() + batch.offset();
+
+      raft::linalg::map_offset(
+        handle,
+        residuals_view,
+        [centers_ptr, labels_ptr, dim] __device__(auto idx, T x) {
+          auto i = idx / dim;
+          auto j = idx % dim;
+          return utils::mapping<float>{}(x)-centers_ptr[labels_ptr[i] * dim + j];
+        },
+        batch_view);
     }
 
     {
@@ -464,7 +450,6 @@ inline auto build(raft::resources const& handle,
     kmeans_params.n_iters = params.kmeans_n_iters;
     kmeans_params.metric  = idx.metric();
     cuvs::cluster::kmeans::fit(handle, kmeans_params, trainset_const_view, centers_view);
-    raft::resource::sync_stream(handle);
 
     // Train SQ: predict labels for the training subset, compute residuals in-place,
     // and derive per-dimension vmin/delta from them.
@@ -476,7 +461,6 @@ inline auto build(raft::resources const& handle,
         idx.centers().data_handle(), idx.n_lists(), dim);
       cuvs::cluster::kmeans::predict(
         handle, pred_params, trainset_const_view, centers_const_view, train_labels.view());
-      raft::resource::sync_stream(handle);
 
       constexpr int kResidualBlockSize = 256;
       compute_residuals_inplace_kernel<T>
diff --git a/cpp/src/neighbors/ivf_sq_index.cpp b/cpp/src/neighbors/ivf_sq_index.cpp
index 91eb86704f..bf5b6df288 100644
--- a/cpp/src/neighbors/ivf_sq_index.cpp
+++ b/cpp/src/neighbors/ivf_sq_index.cpp
@@ -43,8 +43,6 @@ index<IdxT>::index(raft::resources const& res,
     inds_ptrs_{raft::make_device_vector<int64_t*, uint32_t>(res, n_lists)},
     accum_sorted_sizes_{raft::make_host_vector<int64_t, uint32_t>(n_lists + 1)}
 {
-  RAFT_EXPECTS(n_lists > 0, "n_lists must be positive.");
-  RAFT_EXPECTS(dim > 0, "dim must be positive.");
   check_consistency();
   auto stream = raft::resource::get_cuda_stream(res);
   std::memset(accum_sorted_sizes_.data_handle(), 0, accum_sorted_sizes_.size() * sizeof(int64_t));

From 77c4a7996d2a7dd9c7e9fbfef0c4cea0a72259db Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Thu, 2 Apr 2026 14:33:49 +0200
Subject: [PATCH 18/43] Fix issue with host data + half testing

---
 cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh     |  44 +++----
 cpp/tests/neighbors/ann_ivf_sq.cuh            | 116 ++++++++++++++----
 .../ann_ivf_sq/test_float_uint8_t.cu          |   9 ++
 3 files changed, 118 insertions(+), 51 deletions(-)

diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
index 5c2f6808f1..8fbaf30240 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
@@ -17,6 +17,7 @@
 #include <cuvs/distance/distance.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/core/mdarray.hpp>
+#include <raft/core/mdspan.hpp>
 #include <raft/core/operators.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/cuda_stream_pool.hpp>
@@ -412,14 +413,16 @@ auto extend(raft::resources const& handle,
   return ext_index;
 }
 
-template <typename T, typename IdxT>
-inline auto build(raft::resources const& handle,
-                  const index_params& params,
-                  const T* dataset,
-                  int64_t n_rows,
-                  uint32_t dim) -> index<IdxT>
+template <typename T, typename IdxT, typename accessor>
+inline auto build(
+  raft::resources const& handle,
+  const index_params& params,
+  raft::mdspan<const T, raft::matrix_extent<int64_t>, raft::row_major, accessor> dataset)
+  -> index<IdxT>
 {
-  auto stream = raft::resource::get_cuda_stream(handle);
+  int64_t n_rows = dataset.extent(0);
+  uint32_t dim   = dataset.extent(1);
+  auto stream    = raft::resource::get_cuda_stream(handle);
   cuvs::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
     "ivf_sq::build(%zu, %u)", size_t(n_rows), dim);
   static_assert(std::is_same_v<T, float> || std::is_same_v<T, half>, "unsupported data type");
@@ -441,8 +444,7 @@ inline auto build(raft::resources const& handle,
       raft::make_device_mdarray<T>(handle,
                                    raft::resource::get_large_workspace_resource(handle),
                                    raft::make_extents<int64_t>(n_rows_train, idx.dim()));
-    auto dataset_view = raft::make_device_matrix_view<const T, int64_t>(dataset, n_rows, idx.dim());
-    raft::matrix::sample_rows<T, int64_t>(handle, random_state, dataset_view, trainset.view());
+    raft::matrix::sample_rows<T, int64_t>(handle, random_state, dataset, trainset.view());
     auto trainset_const_view = raft::make_const_mdspan(trainset.view());
     auto centers_view        = raft::make_device_matrix_view<float, int64_t>(
       idx.centers().data_handle(), idx.n_lists(), idx.dim());
@@ -499,31 +501,13 @@ inline auto build(raft::resources const& handle,
     }
   }
 
-  if (params.add_data_on_build) { detail::extend<T, IdxT>(handle, &idx, dataset, nullptr, n_rows); }
+  if (params.add_data_on_build) {
+    detail::extend<T, IdxT>(handle, &idx, dataset.data_handle(), nullptr, n_rows);
+  }
 
   return idx;
 }
 
-template <typename T, typename IdxT>
-auto build(raft::resources const& handle,
-           const index_params& params,
-           raft::device_matrix_view<const T, int64_t, raft::row_major> dataset) -> index<IdxT>
-{
-  int64_t n_rows = dataset.extent(0);
-  uint32_t dim   = dataset.extent(1);
-  return build<T, IdxT>(handle, params, dataset.data_handle(), n_rows, dim);
-}
-
-template <typename T, typename IdxT>
-auto build(raft::resources const& handle,
-           const index_params& params,
-           raft::host_matrix_view<const T, int64_t, raft::row_major> dataset) -> index<IdxT>
-{
-  int64_t n_rows = dataset.extent(0);
-  uint32_t dim   = dataset.extent(1);
-  return build<T, IdxT>(handle, params, dataset.data_handle(), n_rows, dim);
-}
-
 template <typename T, typename IdxT>
 void build(raft::resources const& handle,
            const index_params& params,
diff --git a/cpp/tests/neighbors/ann_ivf_sq.cuh b/cpp/tests/neighbors/ann_ivf_sq.cuh
index d90ec66959..c4f3c9de74 100644
--- a/cpp/tests/neighbors/ann_ivf_sq.cuh
+++ b/cpp/tests/neighbors/ann_ivf_sq.cuh
@@ -16,6 +16,8 @@
 #include <raft/core/resource/cuda_stream_pool.hpp>
 #include <rmm/cuda_stream_pool.hpp>
 
+#include <numeric>
+
 namespace cuvs::neighbors::ivf_sq {
 
 struct test_ivf_sample_filter {
@@ -31,6 +33,7 @@ struct AnnIvfSqInputs {
   IdxT nprobe;
   IdxT nlist;
   cuvs::distance::DistanceType metric;
+  bool host_dataset = false;
 };
 
 template <typename IdxT>
@@ -39,7 +42,7 @@ template <typename IdxT>
   os << "{ " << p.num_queries << ", " << p.num_db_vecs << ", " << p.dim << ", " << p.k << ", "
      << p.nprobe << ", " << p.nlist << ", "
      << cuvs::neighbors::print_metric{static_cast<cuvs::distance::DistanceType>((int)p.metric)}
-     << '}' << std::endl;
+     << ", " << (p.host_dataset ? "host" : "device") << '}' << std::endl;
   return os;
 }
 
@@ -97,32 +100,60 @@ class AnnIVFSQTest : public ::testing::TestWithParam<AnnIvfSqInputs<IdxT>> {
         index_params.add_data_on_build        = true;
         index_params.kmeans_trainset_fraction = 0.5;
 
-        auto database_view = raft::make_device_matrix_view<const DataT, IdxT>(
-          (const DataT*)database.data(), ps.num_db_vecs, ps.dim);
-
-        auto idx = cuvs::neighbors::ivf_sq::build(handle_, index_params, database_view);
-
-        // Test extend: build without data, then extend
         cuvs::neighbors::ivf_sq::index_params index_params_no_add;
         index_params_no_add.n_lists                  = ps.nlist;
         index_params_no_add.metric                   = ps.metric;
         index_params_no_add.add_data_on_build        = false;
         index_params_no_add.kmeans_trainset_fraction = 0.5;
 
-        auto idx_empty =
-          cuvs::neighbors::ivf_sq::build(handle_, index_params_no_add, database_view);
-
-        auto vector_indices = raft::make_device_vector<IdxT, IdxT>(handle_, ps.num_db_vecs);
-        raft::linalg::map_offset(handle_, vector_indices.view(), raft::identity_op{});
-        raft::resource::sync_stream(handle_);
-
-        auto indices_view = raft::make_device_vector_view<const IdxT, IdxT>(
-          vector_indices.data_handle(), ps.num_db_vecs);
-        cuvs::neighbors::ivf_sq::extend(
-          handle_,
-          database_view,
-          std::make_optional<raft::device_vector_view<const IdxT, IdxT>>(indices_view),
-          &idx_empty);
+        cuvs::neighbors::ivf_sq::index<uint8_t> idx(handle_);
+        cuvs::neighbors::ivf_sq::index<uint8_t> idx_empty(handle_);
+
+        if (!ps.host_dataset) {
+          auto database_view = raft::make_device_matrix_view<const DataT, IdxT>(
+            (const DataT*)database.data(), ps.num_db_vecs, ps.dim);
+
+          idx = cuvs::neighbors::ivf_sq::build(handle_, index_params, database_view);
+
+          idx_empty = cuvs::neighbors::ivf_sq::build(handle_, index_params_no_add, database_view);
+
+          auto vector_indices = raft::make_device_vector<IdxT, IdxT>(handle_, ps.num_db_vecs);
+          raft::linalg::map_offset(handle_, vector_indices.view(), raft::identity_op{});
+          raft::resource::sync_stream(handle_);
+
+          auto indices_view = raft::make_device_vector_view<const IdxT, IdxT>(
+            vector_indices.data_handle(), ps.num_db_vecs);
+          cuvs::neighbors::ivf_sq::extend(
+            handle_,
+            database_view,
+            std::make_optional<raft::device_vector_view<const IdxT, IdxT>>(indices_view),
+            &idx_empty);
+        } else {
+          auto host_database = raft::make_host_matrix<DataT, IdxT>(ps.num_db_vecs, ps.dim);
+          raft::copy(
+            host_database.data_handle(), database.data(), ps.num_db_vecs * ps.dim, stream_);
+          raft::resource::sync_stream(handle_);
+
+          idx = cuvs::neighbors::ivf_sq::build(
+            handle_, index_params, raft::make_const_mdspan(host_database.view()));
+
+          idx_empty = cuvs::neighbors::ivf_sq::build(
+            handle_, index_params_no_add, raft::make_const_mdspan(host_database.view()));
+
+          auto vector_indices = raft::make_host_vector<IdxT>(handle_, ps.num_db_vecs);
+          std::iota(
+            vector_indices.data_handle(), vector_indices.data_handle() + ps.num_db_vecs, IdxT(0));
+
+          auto indices_view = raft::make_host_vector_view<const IdxT, IdxT>(
+            vector_indices.data_handle(), ps.num_db_vecs);
+          auto host_database_view = raft::make_host_matrix_view<const DataT, IdxT>(
+            host_database.data_handle(), ps.num_db_vecs, ps.dim);
+          cuvs::neighbors::ivf_sq::extend(
+            handle_,
+            host_database_view,
+            std::make_optional<raft::host_vector_view<const IdxT, IdxT>>(indices_view),
+            &idx_empty);
+        }
 
         // Serialize / deserialize round-trip
         tmp_index_file index_file;
@@ -430,6 +461,49 @@ const std::vector<AnnIvfSqInputs<int64_t>> inputs = {
   // ===== Recall-stability: same data, different query counts =====
   {20000, 8712, 3, 10, 51, 66, cuvs::distance::DistanceType::L2Expanded},
   {50000, 8712, 3, 10, 51, 66, cuvs::distance::DistanceType::L2Expanded},
+
+  // ===== Host dataset: build + extend from host_matrix_view =====
+  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, true},
+  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct, true},
+  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, true},
+  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2SqrtExpanded, true},
+  {1000, 10000, 3, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, true},
+  {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, true},
+  {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, true},
+  {100, 10000, 64, 10, 20, 512, cuvs::distance::DistanceType::InnerProduct, true},
+};
+
+const std::vector<AnnIvfSqInputs<int64_t>> inputs_half = {
+  // num_queries, num_db_vecs, dim, k, nprobe, nlist, metric, host_dataset
+
+  // All four metrics at a standard dimension
+  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct},
+  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded},
+  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2SqrtExpanded},
+
+  // Unaligned and small dimensions
+  {1000, 10000, 3, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 7, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded},
+
+  // Medium / larger dimensions
+  {1000, 10000, 64, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct},
+  {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded},
+  {1000, 10000, 256, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+
+  // k edge cases
+  {1000, 10000, 16, 1, 40, 1024, cuvs::distance::DistanceType::L2Expanded},
+  {1000, 10000, 16, 50, 100, 1024, cuvs::distance::DistanceType::L2Expanded},
+
+  // nprobe / nlist edge cases
+  {1000, 10000, 16, 10, 64, 64, cuvs::distance::DistanceType::L2Expanded},
+  {100, 10000, 16, 10, 20, 512, cuvs::distance::DistanceType::CosineExpanded},
+
+  // Host dataset
+  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, true},
+  {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, true},
+  {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct, true},
 };
 
 }  // namespace cuvs::neighbors::ivf_sq
diff --git a/cpp/tests/neighbors/ann_ivf_sq/test_float_uint8_t.cu b/cpp/tests/neighbors/ann_ivf_sq/test_float_uint8_t.cu
index 02ec8a7dfc..f136b6f41c 100644
--- a/cpp/tests/neighbors/ann_ivf_sq/test_float_uint8_t.cu
+++ b/cpp/tests/neighbors/ann_ivf_sq/test_float_uint8_t.cu
@@ -18,4 +18,13 @@ TEST_P(AnnIVFSQTestF_float, AnnIVFSQ)
 
 INSTANTIATE_TEST_CASE_P(AnnIVFSQTest, AnnIVFSQTestF_float, ::testing::ValuesIn(inputs));
 
+typedef AnnIVFSQTest<float, half, int64_t> AnnIVFSQTestF_half;
+TEST_P(AnnIVFSQTestF_half, AnnIVFSQ)
+{
+  this->testIVFSQ();
+  this->testFilter();
+}
+
+INSTANTIATE_TEST_CASE_P(AnnIVFSQTest, AnnIVFSQTestF_half, ::testing::ValuesIn(inputs_half));
+
 }  // namespace cuvs::neighbors::ivf_sq

From b46ea79f377360cdd1e9d3bc7f505e910dcccf63 Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Thu, 2 Apr 2026 16:39:14 +0200
Subject: [PATCH 19/43] Update metric in doc

---
 cpp/include/cuvs/neighbors/ivf_sq.hpp     | 10 +++++-----
 cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/cpp/include/cuvs/neighbors/ivf_sq.hpp b/cpp/include/cuvs/neighbors/ivf_sq.hpp
index 10d9a4c856..bb29b3fef1 100644
--- a/cpp/include/cuvs/neighbors/ivf_sq.hpp
+++ b/cpp/include/cuvs/neighbors/ivf_sq.hpp
@@ -236,7 +236,7 @@ struct index : cuvs::neighbors::index {
  *
  * NB: Currently, the following distance metrics are supported:
  * - L2Expanded
- * - L2Unexpanded
+ * - L2SqrtExpanded
  * - InnerProduct
  * - CosineExpanded
  *
@@ -265,7 +265,7 @@ auto build(raft::resources const& handle,
  *
  * NB: Currently, the following distance metrics are supported:
  * - L2Expanded
- * - L2Unexpanded
+ * - L2SqrtExpanded
  * - InnerProduct
  * - CosineExpanded
  *
@@ -318,7 +318,7 @@ auto build(raft::resources const& handle,
  *
  * NB: Currently, the following distance metrics are supported:
  * - L2Expanded
- * - L2Unexpanded
+ * - L2SqrtExpanded
  * - InnerProduct
  * - CosineExpanded
  *
@@ -371,7 +371,7 @@ auto build(raft::resources const& handle,
  *
  * NB: Currently, the following distance metrics are supported:
  * - L2Expanded
- * - L2Unexpanded
+ * - L2SqrtExpanded
  * - InnerProduct
  * - CosineExpanded
  *
@@ -424,7 +424,7 @@ auto build(raft::resources const& handle,
  *
  * NB: Currently, the following distance metrics are supported:
  * - L2Expanded
- * - L2Unexpanded
+ * - L2SqrtExpanded
  * - InnerProduct
  * - CosineExpanded
  *
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
index 8fbaf30240..7934460fb5 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
@@ -62,7 +62,7 @@ struct ColMinMaxOp {
  * rows and feed CUB BlockReduce with a combined min/max pair.
  *
  * Row-loop is manually 4x-unrolled so the compiler can overlap four
- * independent __ldg requests in the memory pipeline.
+ * independent read-only loads in the memory pipeline.
  */
 template <int BlockSize, typename T>
 __launch_bounds__(BlockSize) RAFT_KERNEL fused_column_minmax_kernel(const T* __restrict__ data,

From 44c5f0a3826408dd764fc8f048acbb5bf052be53 Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Thu, 2 Apr 2026 16:42:40 +0200
Subject: [PATCH 20/43] Fix manage_local_topk / Capacity mismatch in IVF-SQ
 search

---
 cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh | 705 +++++++++++++++------
 1 file changed, 516 insertions(+), 189 deletions(-)

diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
index 88c1b71970..c79e043e20 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
@@ -23,6 +23,7 @@
 #include <raft/linalg/norm.cuh>
 #include <raft/linalg/unary_op.cuh>
 #include <raft/matrix/detail/select_warpsort.cuh>
+#include <raft/util/integer_utils.hpp>
 
 #include <rmm/resource_ref.hpp>
 
@@ -34,143 +35,369 @@ using namespace cuvs::spatial::knn::detail;  // NOLINT
 
 enum class SqScanMetric { kL2, kIP, kCosine };
 
-/**
- * Per-probe scan kernel for IVF-SQ search.
- *
- * Grid: (n_queries, n_probes).  Each block handles one (query, probe) pair.
- * Within a block, each warp processes one interleaved group of kIndexGroupSize
- * (=32) vectors at a time, with each lane responsible for one vector.
- * Dimension blocks of veclen=16 bytes are loaded as coalesced uint4 reads
- * across the warp (32 lanes x 16 bytes = 512 bytes = 4 cache lines), giving
- * full memory-bandwidth utilisation.
- *
- * Per-dimension constants that are invariant across rows are precomputed into
- * shared memory so the hot loop only reads from smem + one uint4 per dim-block:
- *
- *   L2 / L2Sqrt:
- *     s_query_term[d] = query[d] - centroid[d] - sq_vmin[d]
- *     dist += (s_query_term[d] - code * s_sq_scale[d])^2
- *
- *   InnerProduct / Cosine:
- *     s_query_term[d] = query[d]
- *     s_recon_base[d] = centroid[d] + sq_vmin[d]
- *     v_d   = s_recon_base[d] + code * s_sq_scale[d]
- *     dist += s_query_term[d] * v_d
- *
- * Shared-memory layout adapts to the metric to avoid waste:
- *   L2 / L2Sqrt       : [s_query_term | s_sq_scale]                (2 * dim floats)
- *   InnerProduct/Cosine: [s_query_term | s_recon_base | s_sq_scale] (3 * dim floats)
- */
-template <int BlockDim, SqScanMetric Metric, typename IdxT, typename IvfSampleFilterT>
-__launch_bounds__(BlockDim) RAFT_KERNEL ivf_sq_scan_kernel(const uint8_t* const* data_ptrs,
-                                                           const uint32_t* list_sizes,
-                                                           const uint32_t* coarse_indices,
-                                                           const float* queries_float,
-                                                           const float* centers,
-                                                           const float* sq_vmin,
-                                                           const float* sq_delta,
-                                                           const float* query_norms,
-                                                           uint32_t n_probes,
-                                                           uint32_t dim,
-                                                           uint32_t max_samples,
-                                                           const uint32_t* chunk_indices,
-                                                           float* out_distances,
-                                                           uint32_t* out_indices,
-                                                           IvfSampleFilterT sample_filter)
+static constexpr int kSqScanThreads = 128;
+
+// Maximum fused top-k capacity we instantiate for the scan kernel.
+// Must match the highest Capacity case in ivf_sq_scan's switch.
+static constexpr int kMaxSqScanCapacity = 256;
+
+auto RAFT_WEAK_FUNCTION is_local_topk_feasible(uint32_t k) -> bool
+{
+  return k <= kMaxSqScanCapacity &&
+         k <= raft::matrix::detail::select::warpsort::kMaxCapacity;
+}
+
+// ---------------------------------------------------------------------------
+// block_sort type selection (fused top-k vs dummy for Capacity == 0)
+// ---------------------------------------------------------------------------
+template <int Capacity, bool Ascending>
+struct sq_block_sort {
+  using type = raft::matrix::detail::select::warpsort::block_sort<
+    raft::matrix::detail::select::warpsort::warp_sort_filtered,
+    Capacity,
+    Ascending,
+    float,
+    uint32_t>;
+};
+
+template <bool Ascending>
+struct sq_block_sort<0, Ascending> {
+  using type = ivf::detail::dummy_block_sort_t<float, uint32_t, Ascending>;
+};
+
+template <int Capacity, bool Ascending>
+using sq_block_sort_t = typename sq_block_sort<Capacity, Ascending>::type;
+
+// ---------------------------------------------------------------------------
+// configure_grid_dim_x: choose grid.x to saturate the GPU
+// ---------------------------------------------------------------------------
+inline uint32_t configure_grid_dim_x(uint32_t n_queries,
+                                     uint32_t n_probes,
+                                     int smem_size,
+                                     int block_size,
+                                     const void* kernel_ptr)
+{
+  int dev_id;
+  RAFT_CUDA_TRY(cudaGetDevice(&dev_id));
+  int num_sms;
+  RAFT_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
+  int num_blocks_per_sm = 0;
+  RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+    &num_blocks_per_sm, kernel_ptr, block_size, smem_size));
+
+  size_t min_grid_size = size_t(num_sms) * num_blocks_per_sm;
+  size_t min_grid_x    = raft::ceildiv<size_t>(min_grid_size, n_queries);
+  return std::min<uint32_t>(n_probes, static_cast<uint32_t>(min_grid_x));
+}
+
+// ---------------------------------------------------------------------------
+// IVF-SQ scan kernel with fused in-kernel top-k
+//
+// Grid layout:
+//   kManageLocalTopK (Capacity > 0):
+//     grid (grid_dim_x, n_queries) — each block loops over probes
+//   otherwise (Capacity == 0):
+//     grid (n_probes, n_queries) — one block per (query, probe)
+//
+// Shared-memory layout: [s_sq_scale(dim) | s_query_term(dim) | s_recon_base(dim)?]
+//   s_sq_scale is loaded once (invariant across probes).
+//   For L2, s_query_term is reloaded per probe.
+//   For IP/Cosine, s_query_term is loaded once and s_recon_base is reloaded per probe.
+//   After all probes are scanned, the smem is reused for block_sort merge.
+// ---------------------------------------------------------------------------
+template <int BlockDim, int Capacity, SqScanMetric Metric, typename IdxT, typename IvfSampleFilterT>
+__launch_bounds__(BlockDim) RAFT_KERNEL
+  ivf_sq_scan_kernel(const uint8_t* const* data_ptrs,
+                     const uint32_t* list_sizes,
+                     const uint32_t* coarse_indices,
+                     const float* queries_float,
+                     const float* centers,
+                     const float* sq_vmin,
+                     const float* sq_delta,
+                     const float* query_norms,
+                     uint32_t n_probes,
+                     uint32_t dim,
+                     uint32_t k,
+                     uint32_t max_samples,
+                     const uint32_t* chunk_indices,
+                     float* out_distances,
+                     uint32_t* out_indices,
+                     IvfSampleFilterT sample_filter)
 {
   static_assert(kIndexGroupSize == raft::WarpSize,
                 "Warp-coalesced scan requires kIndexGroupSize == WarpSize");
 
-  extern __shared__ float smem[];
+  constexpr bool kManageLocalTopK = (Capacity > 0);
+  constexpr bool kIsL2            = (Metric == SqScanMetric::kL2);
+  constexpr bool kIsCosine        = (Metric == SqScanMetric::kCosine);
+  constexpr bool kAscending       = (Metric != SqScanMetric::kIP);
 
-  constexpr bool kIsL2     = (Metric == SqScanMetric::kL2);
-  constexpr bool kIsCosine = (Metric == SqScanMetric::kCosine);
+  extern __shared__ __align__(256) uint8_t smem_buf[];
+  float* smem = reinterpret_cast<float*>(smem_buf);
 
-  float* s_query_term = smem;
-  float* s_recon_base = smem + dim;
-  float* s_sq_scale   = kIsL2 ? (smem + dim) : (smem + 2 * dim);
+  // smem layout: [s_sq_scale | s_query_term | (s_recon_base for IP/Cosine)]
+  float* s_sq_scale   = smem;
+  float* s_query_term = smem + dim;
+  float* s_recon_base = kIsL2 ? nullptr : (smem + 2 * dim);
 
-  const uint32_t query_ix = blockIdx.x;
-  const uint32_t probe_ix = blockIdx.y;
+  const uint32_t query_ix = blockIdx.y;
+  const float* query      = queries_float + query_ix * dim;
 
-  const uint32_t* my_coarse = coarse_indices + query_ix * n_probes;
-  const uint32_t cluster_id = my_coarse[probe_ix];
-  const uint32_t cluster_sz = list_sizes[cluster_id];
-  if (cluster_sz == 0) return;
-
-  const uint8_t* codes  = data_ptrs[cluster_id];
-  const float* query    = queries_float + query_ix * dim;
-  const float* centroid = centers + cluster_id * dim;
+  // Point output to this block's slice when using fused top-k
+  if constexpr (kManageLocalTopK) {
+    out_distances += uint64_t(query_ix) * k * gridDim.x + blockIdx.x * k;
+    out_indices += uint64_t(query_ix) * k * gridDim.x + blockIdx.x * k;
+  }
 
+  // --- Phase 1: load shared memory that is invariant across probes ---
   for (uint32_t d = threadIdx.x; d < dim; d += BlockDim) {
-    float vmin_d  = sq_vmin[d];
     s_sq_scale[d] = sq_delta[d];
-    if constexpr (kIsL2) {
-      s_query_term[d] = query[d] - centroid[d] - vmin_d;
-    } else {
-      s_query_term[d] = query[d];
-      s_recon_base[d] = centroid[d] + vmin_d;
-    }
+    if constexpr (!kIsL2) { s_query_term[d] = query[d]; }
   }
-  __syncthreads();
 
-  const uint32_t* my_chunk = chunk_indices + query_ix * n_probes;
-  uint32_t out_base        = (probe_ix > 0) ? my_chunk[probe_ix - 1] : 0;
+  using local_topk_t = sq_block_sort_t<Capacity, kAscending>;
+  local_topk_t queue(k);
+
+  const uint32_t* my_coarse = coarse_indices + query_ix * n_probes;
+  const uint32_t* my_chunk  = chunk_indices + query_ix * n_probes;
 
   constexpr uint32_t veclen         = 16;
   constexpr uint32_t kWarpsPerBlock = BlockDim / raft::WarpSize;
   const uint32_t warp_id            = threadIdx.x / raft::WarpSize;
   const uint32_t lane_id            = threadIdx.x % raft::WarpSize;
 
-  uint32_t padded_dim   = ((dim + veclen - 1) / veclen) * veclen;
-  uint32_t n_dim_blocks = padded_dim / veclen;
+  // --- Phase 2: loop over probes ---
+  for (uint32_t probe_ix = blockIdx.x; probe_ix < n_probes;
+       probe_ix += (kManageLocalTopK ? gridDim.x : uint32_t{1})) {
+    const uint32_t cluster_id = my_coarse[probe_ix];
+    const uint32_t cluster_sz = list_sizes[cluster_id];
+
+    // Load centroid-dependent shared memory terms
+    {
+      const float* centroid = centers + cluster_id * dim;
+      for (uint32_t d = threadIdx.x; d < dim; d += BlockDim) {
+        if constexpr (kIsL2) {
+          s_query_term[d] = query[d] - centroid[d] - sq_vmin[d];
+        } else {
+          s_recon_base[d] = centroid[d] + sq_vmin[d];
+        }
+      }
+    }
+    __syncthreads();
+
+    if (cluster_sz == 0) {
+      if constexpr (!kManageLocalTopK) break;
+      continue;
+    }
+
+    const uint8_t* codes    = data_ptrs[cluster_id];
+    uint32_t sample_offset  = (probe_ix > 0) ? my_chunk[probe_ix - 1] : 0;
+    uint32_t padded_dim     = ((dim + veclen - 1) / veclen) * veclen;
+    uint32_t n_dim_blocks   = padded_dim / veclen;
 
-  for (uint32_t group = warp_id * kIndexGroupSize; group < cluster_sz;
-       group += kWarpsPerBlock * kIndexGroupSize) {
-    const uint32_t row = group + lane_id;
-    const bool valid   = (row < cluster_sz) && sample_filter(query_ix, cluster_id, row);
+    for (uint32_t group = warp_id * kIndexGroupSize; group < cluster_sz;
+         group += kWarpsPerBlock * kIndexGroupSize) {
+      const uint32_t row = group + lane_id;
+      const bool valid   = (row < cluster_sz) && sample_filter(query_ix, cluster_id, row);
 
-    float dist      = 0.0f;
-    float v_norm_sq = 0.0f;
+      float dist      = 0.0f;
+      float v_norm_sq = 0.0f;
 
-    const uint8_t* group_data = codes + size_t(group) * padded_dim;
+      const uint8_t* group_data = codes + size_t(group) * padded_dim;
 
-    for (uint32_t bl = 0; bl < n_dim_blocks; bl++) {
-      uint8_t codes_local[veclen];
-      *reinterpret_cast<uint4*>(codes_local) = *reinterpret_cast<const uint4*>(
-        group_data + bl * (veclen * kIndexGroupSize) + lane_id * veclen);
+      for (uint32_t bl = 0; bl < n_dim_blocks; bl++) {
+        uint8_t codes_local[veclen];
+        *reinterpret_cast<uint4*>(codes_local) = *reinterpret_cast<const uint4*>(
+          group_data + bl * (veclen * kIndexGroupSize) + lane_id * veclen);
 
-      const uint32_t l = bl * veclen;
+        const uint32_t l = bl * veclen;
 #pragma unroll
-      for (uint32_t j = 0; j < veclen; j++) {
-        if (l + j < dim) {
-          float recon = float(codes_local[j]) * s_sq_scale[l + j];
-
-          if constexpr (kIsL2) {
-            float diff = s_query_term[l + j] - recon;
-            dist += diff * diff;
-          } else {
-            float v_d = s_recon_base[l + j] + recon;
-            dist += s_query_term[l + j] * v_d;
-            if constexpr (kIsCosine) { v_norm_sq += v_d * v_d; }
+        for (uint32_t j = 0; j < veclen; j++) {
+          if (l + j < dim) {
+            float recon = float(codes_local[j]) * s_sq_scale[l + j];
+
+            if constexpr (kIsL2) {
+              float diff = s_query_term[l + j] - recon;
+              dist += diff * diff;
+            } else {
+              float v_d = s_recon_base[l + j] + recon;
+              dist += s_query_term[l + j] * v_d;
+              if constexpr (kIsCosine) { v_norm_sq += v_d * v_d; }
+            }
           }
         }
       }
+
+      if constexpr (kIsCosine) {
+        float denom = query_norms[query_ix] * sqrtf(v_norm_sq);
+        dist        = (denom > 0.0f) ? 1.0f - dist / denom : 0.0f;
+      }
+
+      if constexpr (kManageLocalTopK) {
+        float val = valid ? dist : local_topk_t::queue_t::kDummy;
+        queue.add(val, sample_offset + row);
+      } else {
+        if (valid) {
+          uint32_t out_idx       = query_ix * max_samples + sample_offset + row;
+          out_distances[out_idx] = dist;
+          out_indices[out_idx]   = sample_offset + row;
+        }
+      }
     }
 
-    if constexpr (kIsCosine) {
-      float denom = query_norms[query_ix] * sqrtf(v_norm_sq);
-      dist        = (denom > 0.0f) ? 1.0f - dist / denom : 0.0f;
+    __syncthreads();
+    if constexpr (!kManageLocalTopK) break;
+  }
+
+  if constexpr (kManageLocalTopK) {
+    __syncthreads();
+    queue.done(smem_buf);
+    queue.store(out_distances, out_indices);
+
+    // block_sort initializes unused slots with (kDummy, idx=0). When the
+    // probed clusters have fewer than k total valid vectors, those slots
+    // survive into the output and share idx=0 with the real first vector,
+    // causing duplicates.  Mark them with an invalid index so
+    // postprocess_neighbors treats them as out-of-bounds.
+    // store() is a warp-0-only operation, restrict the fixup to the same warp.
+    if (threadIdx.x < raft::WarpSize) {
+      constexpr auto kDummyVal = local_topk_t::queue_t::kDummy;
+      for (uint32_t i = threadIdx.x; i < k; i += raft::WarpSize) {
+        if (out_distances[i] == kDummyVal) { out_indices[i] = uint32_t(0xFFFFFFFF); }
+      }
     }
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Compute shared-memory size for a given kernel configuration
+// ---------------------------------------------------------------------------
+inline size_t sq_scan_smem_size(uint32_t dim, SqScanMetric metric)
+{
+  return (metric == SqScanMetric::kL2 ? 2 : 3) * dim * sizeof(float);
+}
+
+template <int Capacity>
+size_t sq_scan_total_smem(uint32_t dim, uint32_t k, SqScanMetric metric)
+{
+  size_t scan_smem = sq_scan_smem_size(dim, metric);
+  if constexpr (Capacity > 0) {
+    constexpr int kSubwarpSize = std::min<int>(Capacity, raft::WarpSize);
+    int num_subwarps           = kSqScanThreads / kSubwarpSize;
+    size_t merge_smem =
+      raft::matrix::detail::select::warpsort::calc_smem_size_for_block_wide<float, uint32_t>(
+        num_subwarps, k);
+    return std::max(scan_smem, merge_smem);
+  }
+  return scan_smem;
+}
+
+// ---------------------------------------------------------------------------
+// Launch helper: dispatches on Metric, handles grid_dim_x query vs launch
+// ---------------------------------------------------------------------------
+template <int Capacity, typename IdxT, typename IvfSampleFilterT>
+void ivf_sq_scan_launch(const index<IdxT>& idx,
+                        const float* queries_float,
+                        const float* query_norms,
+                        uint32_t n_queries,
+                        uint32_t n_probes,
+                        uint32_t k,
+                        uint32_t max_samples,
+                        const uint32_t* coarse_indices,
+                        const uint32_t* chunk_indices,
+                        float* out_distances,
+                        uint32_t* out_indices,
+                        IvfSampleFilterT sample_filter,
+                        uint32_t& grid_dim_x,
+                        rmm::cuda_stream_view stream)
+{
+  constexpr bool kManageLocalTopK = (Capacity > 0);
+  constexpr int kThreads          = kSqScanThreads;
+  uint32_t dim                    = idx.dim();
+
+  constexpr uint32_t kMaxGridY = 32768;
 
-    if (valid) {
-      uint32_t out_idx       = query_ix * max_samples + out_base + row;
-      out_distances[out_idx] = dist;
-      out_indices[out_idx]   = out_base + row;
+  auto do_launch = [&](auto kernel_ptr, SqScanMetric metric_val) {
+    size_t smem = sq_scan_total_smem<Capacity>(dim, k, metric_val);
+
+    RAFT_CUDA_TRY(
+      cudaFuncSetAttribute(kernel_ptr, cudaFuncAttributeMaxDynamicSharedMemorySize, smem));
+
+    // If grid_dim_x == 0, compute the optimal value and return
+    if constexpr (kManageLocalTopK) {
+      if (grid_dim_x == 0) {
+        grid_dim_x = configure_grid_dim_x(
+          std::min(kMaxGridY, n_queries), n_probes, smem, kThreads,
+          reinterpret_cast<const void*>(kernel_ptr));
+        return;
+      }
     }
+
+    dim3 block(kThreads);
+
+    // Batch over queries to respect the gridDim.y limit (65535)
+    for (uint32_t query_offset = 0; query_offset < n_queries; query_offset += kMaxGridY) {
+      uint32_t batch = std::min(kMaxGridY, n_queries - query_offset);
+      dim3 grid      = kManageLocalTopK ? dim3(grid_dim_x, batch) : dim3(n_probes, batch);
+
+      auto q_ptr  = queries_float + uint64_t(query_offset) * dim;
+      auto qn_ptr = query_norms ? query_norms + query_offset : query_norms;
+      auto ci     = coarse_indices + uint64_t(query_offset) * n_probes;
+      auto ch     = chunk_indices + uint64_t(query_offset) * n_probes;
+      auto od     = out_distances;
+      auto oi     = out_indices;
+      if constexpr (kManageLocalTopK) {
+        od += uint64_t(query_offset) * grid_dim_x * k;
+        oi += uint64_t(query_offset) * grid_dim_x * k;
+      } else {
+        od += uint64_t(query_offset) * max_samples;
+        oi += uint64_t(query_offset) * max_samples;
+      }
+
+      kernel_ptr<<<grid, block, smem, stream>>>(idx.data_ptrs().data_handle(),
+                                                idx.list_sizes().data_handle(),
+                                                ci,
+                                                q_ptr,
+                                                idx.centers().data_handle(),
+                                                idx.sq_vmin().data_handle(),
+                                                idx.sq_delta().data_handle(),
+                                                qn_ptr,
+                                                n_probes,
+                                                dim,
+                                                k,
+                                                max_samples,
+                                                ch,
+                                                od,
+                                                oi,
+                                                sample_filter);
+      RAFT_CUDA_TRY(cudaPeekAtLastError());
+    }
+  };
+
+  switch (idx.metric()) {
+    case cuvs::distance::DistanceType::L2Expanded:
+    case cuvs::distance::DistanceType::L2SqrtExpanded:
+      do_launch(
+        ivf_sq_scan_kernel<kThreads, Capacity, SqScanMetric::kL2, IdxT, IvfSampleFilterT>,
+        SqScanMetric::kL2);
+      break;
+    case cuvs::distance::DistanceType::InnerProduct:
+      do_launch(
+        ivf_sq_scan_kernel<kThreads, Capacity, SqScanMetric::kIP, IdxT, IvfSampleFilterT>,
+        SqScanMetric::kIP);
+      break;
+    case cuvs::distance::DistanceType::CosineExpanded:
+      do_launch(
+        ivf_sq_scan_kernel<kThreads, Capacity, SqScanMetric::kCosine, IdxT, IvfSampleFilterT>,
+        SqScanMetric::kCosine);
+      break;
+    default: RAFT_FAIL("Unsupported metric type for IVF-SQ scan.");
   }
 }
 
+// ---------------------------------------------------------------------------
+// ivf_sq_scan: top-level scan dispatch with Capacity selection
+// ---------------------------------------------------------------------------
 template <typename IdxT, typename IvfSampleFilterT>
 void ivf_sq_scan(raft::resources const& handle,
                  const index<IdxT>& idx,
@@ -178,58 +405,57 @@ void ivf_sq_scan(raft::resources const& handle,
                  const float* query_norms,
                  uint32_t n_queries,
                  uint32_t n_probes,
+                 uint32_t k,
                  uint32_t max_samples,
                  const uint32_t* coarse_indices,
                  const uint32_t* chunk_indices,
                  float* out_distances,
                  uint32_t* out_indices,
                  IvfSampleFilterT sample_filter,
+                 uint32_t& grid_dim_x,
                  rmm::cuda_stream_view stream)
 {
-  constexpr int kThreads = 256;
-  dim3 grid(n_queries, n_probes);
-  dim3 block(kThreads);
-  uint32_t dim = idx.dim();
+  // Determine the fused top-k capacity (0 = disabled / fallback to materialization)
+  int capacity = is_local_topk_feasible(k) ? raft::bound_by_power_of_two(int(k)) : 0;
+
+  // Clamp to supported compile-time Capacity values.
+  // Using a limited set to avoid excessive template instantiations.
+  if (capacity > 0 && capacity <= 32) {
+    capacity = 32;
+  } else if (capacity > 32 && capacity <= 256) {
+    capacity = 256;
+  } else if (capacity > 256) {
+    capacity = 0;
+  }
 
-  auto do_launch = [&](auto kernel_ptr, size_t smem) {
-    RAFT_CUDA_TRY(
-      cudaFuncSetAttribute(kernel_ptr, cudaFuncAttributeMaxDynamicSharedMemorySize, smem));
-    kernel_ptr<<<grid, block, smem, stream>>>(idx.data_ptrs().data_handle(),
-                                              idx.list_sizes().data_handle(),
-                                              coarse_indices,
-                                              queries_float,
-                                              idx.centers().data_handle(),
-                                              idx.sq_vmin().data_handle(),
-                                              idx.sq_delta().data_handle(),
-                                              query_norms,
-                                              n_probes,
-                                              dim,
-                                              max_samples,
-                                              chunk_indices,
-                                              out_distances,
-                                              out_indices,
-                                              sample_filter);
-    RAFT_CUDA_TRY(cudaPeekAtLastError());
+  auto fwd = [&](auto cap_tag) {
+    ivf_sq_scan_launch<decltype(cap_tag)::value, IdxT, IvfSampleFilterT>(idx,
+                                                                         queries_float,
+                                                                         query_norms,
+                                                                         n_queries,
+                                                                         n_probes,
+                                                                         k,
+                                                                         max_samples,
+                                                                         coarse_indices,
+                                                                         chunk_indices,
+                                                                         out_distances,
+                                                                         out_indices,
+                                                                         sample_filter,
+                                                                         grid_dim_x,
+                                                                         stream);
   };
 
-  switch (idx.metric()) {
-    case cuvs::distance::DistanceType::L2Expanded:
-    case cuvs::distance::DistanceType::L2SqrtExpanded:
-      do_launch(ivf_sq_scan_kernel<kThreads, SqScanMetric::kL2, IdxT, IvfSampleFilterT>,
-                2 * dim * sizeof(float));
-      break;
-    case cuvs::distance::DistanceType::InnerProduct:
-      do_launch(ivf_sq_scan_kernel<kThreads, SqScanMetric::kIP, IdxT, IvfSampleFilterT>,
-                3 * dim * sizeof(float));
-      break;
-    case cuvs::distance::DistanceType::CosineExpanded:
-      do_launch(ivf_sq_scan_kernel<kThreads, SqScanMetric::kCosine, IdxT, IvfSampleFilterT>,
-                3 * dim * sizeof(float));
-      break;
-    default: RAFT_FAIL("Unsupported metric type for IVF-SQ scan.");
+  switch (capacity) {
+    case 0: fwd(std::integral_constant<int, 0>{}); break;
+    case 32: fwd(std::integral_constant<int, 32>{}); break;
+    case 256: fwd(std::integral_constant<int, 256>{}); break;
+    default: RAFT_FAIL("Unexpected capacity value %d", capacity);
   }
 }
 
+// ---------------------------------------------------------------------------
+// search_impl — host-side search logic
+// ---------------------------------------------------------------------------
 template <typename T, typename IdxT, typename IvfSampleFilterT>
 void search_impl(raft::resources const& handle,
                  const index<IdxT>& index,
@@ -366,41 +592,37 @@ void search_impl(raft::resources const& handle,
                                                                   num_samples.data(),
                                                                   stream);
 
-  uint32_t max_samples =
-    std::max<uint32_t>(static_cast<uint32_t>(index.accum_sorted_sizes()(n_probes)), k);
-
-  rmm::device_uvector<float> all_distances(std::size_t(n_queries) * max_samples, stream, search_mr);
-  rmm::device_uvector<uint32_t> all_indices(
-    std::size_t(n_queries) * max_samples, stream, search_mr);
-
-  float init_val =
-    select_min ? std::numeric_limits<float>::max() : std::numeric_limits<float>::lowest();
-  thrust::fill_n(raft::resource::get_thrust_policy(handle),
-                 all_distances.data(),
-                 std::size_t(n_queries) * max_samples,
-                 init_val);
-  thrust::fill_n(raft::resource::get_thrust_policy(handle),
-                 all_indices.data(),
-                 std::size_t(n_queries) * max_samples,
-                 uint32_t(0xFFFFFFFF));
-
   auto filter_adapter = cuvs::neighbors::filtering::ivf_to_sample_filter(
     index.inds_ptrs().data_handle(), sample_filter);
 
-  ivf_sq_scan(handle,
-              index,
-              converted_queries_ptr,
-              query_norm_dev.data(),
-              n_queries,
-              n_probes,
-              max_samples,
-              coarse_indices_dev.data(),
-              chunk_index.data(),
-              all_distances.data(),
-              all_indices.data(),
-              filter_adapter,
-              stream);
+  bool manage_local_topk = is_local_topk_feasible(k);
+
+  // Determine grid_dim_x for the fused path
+  uint32_t grid_dim_x = 0;
+  if (manage_local_topk) {
+    if (n_probes > 1) {
+      // Query the occupancy to compute optimal grid_dim_x (does not launch)
+      ivf_sq_scan(handle,
+                  index,
+                  converted_queries_ptr,
+                  query_norm_dev.data(),
+                  n_queries,
+                  n_probes,
+                  k,
+                  0,
+                  coarse_indices_dev.data(),
+                  chunk_index.data(),
+                  nullptr,
+                  nullptr,
+                  filter_adapter,
+                  grid_dim_x,
+                  stream);
+    } else {
+      grid_dim_x = 1;
+    }
+  }
 
+  // Prepare uint32 neighbors buffer for postprocessing
   rmm::device_uvector<uint32_t> neighbors_uint32(0, stream, search_mr);
   uint32_t* neighbors_uint32_ptr = nullptr;
   if constexpr (sizeof(int64_t) == sizeof(uint32_t)) {
@@ -410,21 +632,110 @@ void search_impl(raft::resources const& handle,
     neighbors_uint32_ptr = neighbors_uint32.data();
   }
 
-  auto num_samples_view =
-    raft::make_device_vector_view<const uint32_t>(num_samples.data(), n_queries);
+  if (manage_local_topk) {
+    // --- Fused top-k path ---
+    auto target_size = std::size_t(n_queries) * grid_dim_x * k;
+    rmm::device_uvector<float> distances_tmp(0, stream, search_mr);
+    rmm::device_uvector<uint32_t> indices_tmp(0, stream, search_mr);
 
-  cuvs::selection::select_k(
-    handle,
-    raft::make_device_matrix_view<const float, int64_t>(
-      all_distances.data(), n_queries, max_samples),
-    raft::make_device_matrix_view<const uint32_t, int64_t>(
-      all_indices.data(), n_queries, max_samples),
-    raft::make_device_matrix_view<float, int64_t>(distances, n_queries, k),
-    raft::make_device_matrix_view<uint32_t, int64_t>(neighbors_uint32_ptr, n_queries, k),
-    select_min,
-    false,
-    cuvs::selection::SelectAlgo::kAuto,
-    num_samples_view);
+    float* dist_out_ptr    = nullptr;
+    uint32_t* idx_out_ptr  = nullptr;
+
+    if (grid_dim_x > 1) {
+      distances_tmp.resize(target_size, stream);
+      indices_tmp.resize(target_size, stream);
+      dist_out_ptr = distances_tmp.data();
+      idx_out_ptr  = indices_tmp.data();
+    } else {
+      dist_out_ptr = distances;
+      idx_out_ptr  = neighbors_uint32_ptr;
+    }
+
+    ivf_sq_scan(handle,
+                index,
+                converted_queries_ptr,
+                query_norm_dev.data(),
+                n_queries,
+                n_probes,
+                k,
+                0,
+                coarse_indices_dev.data(),
+                chunk_index.data(),
+                dist_out_ptr,
+                idx_out_ptr,
+                filter_adapter,
+                grid_dim_x,
+                stream);
+
+    // Merge across blocks if needed
+    if (grid_dim_x > 1) {
+      auto cols = uint32_t(grid_dim_x) * k;
+      cuvs::selection::select_k(
+        handle,
+        raft::make_device_matrix_view<const float, int64_t>(
+          distances_tmp.data(), n_queries, cols),
+        raft::make_device_matrix_view<const uint32_t, int64_t>(
+          indices_tmp.data(), n_queries, cols),
+        raft::make_device_matrix_view<float, int64_t>(distances, n_queries, k),
+        raft::make_device_matrix_view<uint32_t, int64_t>(neighbors_uint32_ptr, n_queries, k),
+        select_min);
+    }
+  } else {
+    // --- Fallback: materialize all distances ---
+    uint32_t max_samples =
+      std::max<uint32_t>(static_cast<uint32_t>(index.accum_sorted_sizes()(n_probes)), k);
+
+    rmm::device_uvector<float> all_distances(
+      std::size_t(n_queries) * max_samples, stream, search_mr);
+    rmm::device_uvector<uint32_t> all_indices(
+      std::size_t(n_queries) * max_samples, stream, search_mr);
+
+    float init_val =
+      select_min ? std::numeric_limits<float>::max() : std::numeric_limits<float>::lowest();
+    thrust::fill_n(raft::resource::get_thrust_policy(handle),
+                   all_distances.data(),
+                   std::size_t(n_queries) * max_samples,
+                   init_val);
+    thrust::fill_n(raft::resource::get_thrust_policy(handle),
+                   all_indices.data(),
+                   std::size_t(n_queries) * max_samples,
+                   uint32_t(0xFFFFFFFF));
+
+    // grid_dim_x is unused for the non-fused path; set to n_probes so each
+    // block in the (n_probes, n_queries) grid processes exactly one probe
+    uint32_t gdx = n_probes;
+    ivf_sq_scan(handle,
+                index,
+                converted_queries_ptr,
+                query_norm_dev.data(),
+                n_queries,
+                n_probes,
+                k,
+                max_samples,
+                coarse_indices_dev.data(),
+                chunk_index.data(),
+                all_distances.data(),
+                all_indices.data(),
+                filter_adapter,
+                gdx,
+                stream);
+
+    auto num_samples_view =
+      raft::make_device_vector_view<const uint32_t>(num_samples.data(), n_queries);
+
+    cuvs::selection::select_k(
+      handle,
+      raft::make_device_matrix_view<const float, int64_t>(
+        all_distances.data(), n_queries, max_samples),
+      raft::make_device_matrix_view<const uint32_t, int64_t>(
+        all_indices.data(), n_queries, max_samples),
+      raft::make_device_matrix_view<float, int64_t>(distances, n_queries, k),
+      raft::make_device_matrix_view<uint32_t, int64_t>(neighbors_uint32_ptr, n_queries, k),
+      select_min,
+      false,
+      cuvs::selection::SelectAlgo::kAuto,
+      num_samples_view);
+  }
 
   ivf::detail::postprocess_distances(
     handle, distances, distances, index.metric(), n_queries, k, 1.0, false);
@@ -467,17 +778,33 @@ inline void search_with_filtering(raft::resources const& handle,
       n_probes);
   }
 
-  uint32_t max_samples =
-    std::max<uint32_t>(static_cast<uint32_t>(index.accum_sorted_sizes()(n_probes)), k);
+  bool manage_local_topk = is_local_topk_feasible(k);
+
+  uint32_t max_samples = 0;
+  if (!manage_local_topk) {
+    max_samples =
+      std::max<uint32_t>(static_cast<uint32_t>(index.accum_sorted_sizes()(n_probes)), k);
+  }
 
   constexpr uint64_t kExpectedWsSize = 1024ull * 1024 * 1024;
   uint64_t max_ws_size =
     std::min<uint64_t>(raft::resource::get_workspace_free_bytes(handle), kExpectedWsSize);
 
   uint64_t converted_query_floats = std::is_same_v<T, float> ? 0 : index.dim();
-  uint64_t ws_per_query = sizeof(float) * (uint64_t(index.n_lists()) + n_probes + 1 + max_samples +
-                                           converted_query_floats) +
-                          sizeof(uint32_t) * (uint64_t(n_probes) * 2 + 1 + max_samples + k);
+  uint64_t ws_per_query;
+  if (manage_local_topk) {
+    // Fused path: only small per-query buffers for coarse search + chunk indices
+    // (The scan output is at most grid_dim_x * k per query, which is small)
+    // Conservatively assume grid_dim_x <= n_probes for the workspace estimate
+    uint64_t fused_out = uint64_t(n_probes) * k;
+    ws_per_query = sizeof(float) * (uint64_t(index.n_lists()) + n_probes + 1 + fused_out +
+                                    converted_query_floats) +
+                   sizeof(uint32_t) * (uint64_t(n_probes) * 2 + 1 + fused_out + k);
+  } else {
+    ws_per_query = sizeof(float) * (uint64_t(index.n_lists()) + n_probes + 1 + max_samples +
+                                    converted_query_floats) +
+                   sizeof(uint32_t) * (uint64_t(n_probes) * 2 + 1 + max_samples + k);
+  }
 
   const uint32_t max_queries =
     std::min<uint32_t>(n_queries, std::max<uint64_t>(1, max_ws_size / ws_per_query));

From ef957f7373256be6a15721404387681028fad06a Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Thu, 2 Apr 2026 16:44:51 +0200
Subject: [PATCH 21/43] Add large-k tests for IVF-SQ materialized fallback path

---
 cpp/tests/neighbors/ann_ivf_sq.cuh | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/cpp/tests/neighbors/ann_ivf_sq.cuh b/cpp/tests/neighbors/ann_ivf_sq.cuh
index c4f3c9de74..5cc4e55075 100644
--- a/cpp/tests/neighbors/ann_ivf_sq.cuh
+++ b/cpp/tests/neighbors/ann_ivf_sq.cuh
@@ -398,6 +398,15 @@ const std::vector<AnnIvfSqInputs<int64_t>> inputs = {
   {1000, 10000, 16, 100, 200, 1024, cuvs::distance::DistanceType::L2Expanded},
   {1000, 10000, 16, 100, 200, 1024, cuvs::distance::DistanceType::InnerProduct},
 
+  // ===== Large k (beyond fused top-k kMaxSqScanCapacity=256, exercises materialized fallback) =====
+  // k=257: smallest k that forces the materialized path (Capacity clamped to 0)
+  {100, 10000, 32, 257, 100, 64, cuvs::distance::DistanceType::L2Expanded},
+  {100, 10000, 32, 257, 100, 64, cuvs::distance::DistanceType::InnerProduct},
+  {100, 10000, 32, 257, 100, 64, cuvs::distance::DistanceType::CosineExpanded},
+  // k=300: comfortably above the fused top-k threshold
+  {100, 10000, 32, 300, 64, 64, cuvs::distance::DistanceType::L2Expanded},
+  {100, 10000, 32, 300, 64, 64, cuvs::distance::DistanceType::InnerProduct},
+
   // ===== nprobe / nlist edge cases =====
   // nprobe == nlist (exhaustive probe)
   {1000, 10000, 16, 10, 64, 64, cuvs::distance::DistanceType::L2Expanded},

From 56ebfc9508caf20b0b592b1d8572577d21b965c7 Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Thu, 2 Apr 2026 16:53:06 +0200
Subject: [PATCH 22/43] Improve shared memory synchronization in IVF-SQ scan
 kernel

---
 cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
index c79e043e20..c7b9d65e20 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
@@ -154,6 +154,7 @@ __launch_bounds__(BlockDim) RAFT_KERNEL
     s_sq_scale[d] = sq_delta[d];
     if constexpr (!kIsL2) { s_query_term[d] = query[d]; }
   }
+  __syncthreads();
 
   using local_topk_t = sq_block_sort_t<Capacity, kAscending>;
   local_topk_t queue(k);
@@ -167,6 +168,15 @@ __launch_bounds__(BlockDim) RAFT_KERNEL
   const uint32_t lane_id            = threadIdx.x % raft::WarpSize;
 
   // --- Phase 2: loop over probes ---
+  // Synchronization protocol:
+  //  (a) __syncthreads after Phase 1 (above) ensures s_sq_scale / s_query_term
+  //      are visible before any probe iteration overwrites s_query_term or
+  //      s_recon_base.
+  //  (b) __syncthreads after per-probe smem writes (below) ensures
+  //      probe-specific values are visible before the distance computation.
+  //  (c) __syncthreads at the end of each iteration ensures all distance
+  //      computation reads are complete before the next iteration overwrites
+  //      the same smem regions.
   for (uint32_t probe_ix = blockIdx.x; probe_ix < n_probes;
        probe_ix += (kManageLocalTopK ? gridDim.x : uint32_t{1})) {
     const uint32_t cluster_id = my_coarse[probe_ix];
@@ -183,9 +193,11 @@ __launch_bounds__(BlockDim) RAFT_KERNEL
         }
       }
     }
-    __syncthreads();
+    __syncthreads();  // (b)
 
     if (cluster_sz == 0) {
+      // No distance computation reads happened, so no end-of-iteration
+      // barrier is needed; the next iteration's barrier (b) is sufficient.
       if constexpr (!kManageLocalTopK) break;
       continue;
     }
@@ -245,11 +257,15 @@ __launch_bounds__(BlockDim) RAFT_KERNEL
       }
     }
 
-    __syncthreads();
+    __syncthreads();  // (c)
     if constexpr (!kManageLocalTopK) break;
   }
 
   if constexpr (kManageLocalTopK) {
+    // All probe iterations are done; smem_buf is reused for block_sort merge.
+    // The loop's last (b) or (c) barrier ensures all prior smem accesses have
+    // completed, so this additional barrier is only needed to synchronize any
+    // register-level state across warps before the merge.
     __syncthreads();
     queue.done(smem_buf);
     queue.store(out_distances, out_indices);

From 15b2f158378251104d2598577ef033e300ee50ca Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Thu, 2 Apr 2026 18:41:05 +0200
Subject: [PATCH 23/43] IVF-SQ scan: reduce L2 global reads and refine fused
 top-k capacity selection

---
 cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh | 240 ++++++++++++---------
 1 file changed, 138 insertions(+), 102 deletions(-)

diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
index c7b9d65e20..17e6c4f9eb 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
@@ -40,11 +40,13 @@ static constexpr int kSqScanThreads = 128;
 // Maximum fused top-k capacity we instantiate for the scan kernel.
 // Must match the highest Capacity case in ivf_sq_scan's switch.
 static constexpr int kMaxSqScanCapacity = 256;
+static_assert(kMaxSqScanCapacity <= raft::matrix::detail::select::warpsort::kMaxCapacity,
+              "kMaxSqScanCapacity must not exceed the warpsort library's maximum supported "
+              "capacity; reduce kMaxSqScanCapacity or update the warpsort dependency.");
 
 auto RAFT_WEAK_FUNCTION is_local_topk_feasible(uint32_t k) -> bool
 {
-  return k <= kMaxSqScanCapacity &&
-         k <= raft::matrix::detail::select::warpsort::kMaxCapacity;
+  return k <= kMaxSqScanCapacity;
 }
 
 // ---------------------------------------------------------------------------
@@ -71,11 +73,8 @@ using sq_block_sort_t = typename sq_block_sort<Capacity, Ascending>::type;
 // ---------------------------------------------------------------------------
 // configure_grid_dim_x: choose grid.x to saturate the GPU
 // ---------------------------------------------------------------------------
-inline uint32_t configure_grid_dim_x(uint32_t n_queries,
-                                     uint32_t n_probes,
-                                     int smem_size,
-                                     int block_size,
-                                     const void* kernel_ptr)
+inline uint32_t configure_grid_dim_x(
+  uint32_t n_queries, uint32_t n_probes, int smem_size, int block_size, const void* kernel_ptr)
 {
   int dev_id;
   RAFT_CUDA_TRY(cudaGetDevice(&dev_id));
@@ -99,30 +98,42 @@ inline uint32_t configure_grid_dim_x(uint32_t n_queries,
 //   otherwise (Capacity == 0):
 //     grid (n_probes, n_queries) — one block per (query, probe)
 //
-// Shared-memory layout: [s_sq_scale(dim) | s_query_term(dim) | s_recon_base(dim)?]
-//   s_sq_scale is loaded once (invariant across probes).
-//   For L2, s_query_term is reloaded per probe.
-//   For IP/Cosine, s_query_term is loaded once and s_recon_base is reloaded per probe.
+// Shared-memory layout (always 3 × dim floats):
+//   [s_sq_scale(dim) | s_query_term(dim) | s_aux(dim)]
+//
+//   s_sq_scale = delta[d]  — SQ dequantization scale, invariant (Phase 1).
+//
+//   L2 path:
+//     Phase 1: s_aux[d] = query[d] - vmin[d]               (invariant)
+//     Phase 2: s_query_term[d] = s_aux[d] - centroid[d]     (per-probe)
+//     The full SQ reconstruction is centroid + vmin + code*delta, so
+//     query - reconstructed = (query - vmin - centroid) - code*delta
+//                           = s_query_term - code*s_sq_scale.
+//
+//   IP/Cosine path:
+//     Phase 1: s_query_term[d] = query[d]                   (invariant)
+//     Phase 2: s_aux[d] = centroid[d] + vmin[d]             (per-probe)
+//     Reconstructed vector component: s_aux[d] + code*s_sq_scale[d].
+//
 //   After all probes are scanned, the smem is reused for block_sort merge.
 // ---------------------------------------------------------------------------
 template <int BlockDim, int Capacity, SqScanMetric Metric, typename IdxT, typename IvfSampleFilterT>
-__launch_bounds__(BlockDim) RAFT_KERNEL
-  ivf_sq_scan_kernel(const uint8_t* const* data_ptrs,
-                     const uint32_t* list_sizes,
-                     const uint32_t* coarse_indices,
-                     const float* queries_float,
-                     const float* centers,
-                     const float* sq_vmin,
-                     const float* sq_delta,
-                     const float* query_norms,
-                     uint32_t n_probes,
-                     uint32_t dim,
-                     uint32_t k,
-                     uint32_t max_samples,
-                     const uint32_t* chunk_indices,
-                     float* out_distances,
-                     uint32_t* out_indices,
-                     IvfSampleFilterT sample_filter)
+__launch_bounds__(BlockDim) RAFT_KERNEL ivf_sq_scan_kernel(const uint8_t* const* data_ptrs,
+                                                           const uint32_t* list_sizes,
+                                                           const uint32_t* coarse_indices,
+                                                           const float* queries_float,
+                                                           const float* centers,
+                                                           const float* sq_vmin,
+                                                           const float* sq_delta,
+                                                           const float* query_norms,
+                                                           uint32_t n_probes,
+                                                           uint32_t dim,
+                                                           uint32_t k,
+                                                           uint32_t max_samples,
+                                                           const uint32_t* chunk_indices,
+                                                           float* out_distances,
+                                                           uint32_t* out_indices,
+                                                           IvfSampleFilterT sample_filter)
 {
   static_assert(kIndexGroupSize == raft::WarpSize,
                 "Warp-coalesced scan requires kIndexGroupSize == WarpSize");
@@ -135,10 +146,9 @@ __launch_bounds__(BlockDim) RAFT_KERNEL
   extern __shared__ __align__(256) uint8_t smem_buf[];
   float* smem = reinterpret_cast<float*>(smem_buf);
 
-  // smem layout: [s_sq_scale | s_query_term | (s_recon_base for IP/Cosine)]
   float* s_sq_scale   = smem;
   float* s_query_term = smem + dim;
-  float* s_recon_base = kIsL2 ? nullptr : (smem + 2 * dim);
+  float* s_aux        = smem + 2 * dim;
 
   const uint32_t query_ix = blockIdx.y;
   const float* query      = queries_float + query_ix * dim;
@@ -152,7 +162,11 @@ __launch_bounds__(BlockDim) RAFT_KERNEL
   // --- Phase 1: load shared memory that is invariant across probes ---
   for (uint32_t d = threadIdx.x; d < dim; d += BlockDim) {
     s_sq_scale[d] = sq_delta[d];
-    if constexpr (!kIsL2) { s_query_term[d] = query[d]; }
+    if constexpr (kIsL2) {
+      s_aux[d] = query[d] - sq_vmin[d];
+    } else {
+      s_query_term[d] = query[d];
+    }
   }
   __syncthreads();
 
@@ -169,14 +183,18 @@ __launch_bounds__(BlockDim) RAFT_KERNEL
 
   // --- Phase 2: loop over probes ---
   // Synchronization protocol:
-  //  (a) __syncthreads after Phase 1 (above) ensures s_sq_scale / s_query_term
-  //      are visible before any probe iteration overwrites s_query_term or
-  //      s_recon_base.
-  //  (b) __syncthreads after per-probe smem writes (below) ensures
-  //      probe-specific values are visible before the distance computation.
+  //  (a) __syncthreads after Phase 1 (above) ensures invariant smem arrays
+  //      (s_sq_scale, and L2: s_aux / IP-Cosine: s_query_term) are visible
+  //      before Phase 2 overwrites the per-probe array.
+  //  (b) __syncthreads after per-probe smem writes (L2: s_query_term /
+  //      IP-Cosine: s_aux) ensures probe-specific values are visible before
+  //      the distance computation.
   //  (c) __syncthreads at the end of each iteration ensures all distance
   //      computation reads are complete before the next iteration overwrites
-  //      the same smem regions.
+  //      the per-probe smem region.
+  //  When cluster_sz == 0, barrier (c) is skipped because no distance reads
+  //  occurred; all threads converge on the same branch uniformly, and the
+  //  next iteration's barrier (b) provides the needed ordering.
   for (uint32_t probe_ix = blockIdx.x; probe_ix < n_probes;
        probe_ix += (kManageLocalTopK ? gridDim.x : uint32_t{1})) {
     const uint32_t cluster_id = my_coarse[probe_ix];
@@ -187,9 +205,9 @@ __launch_bounds__(BlockDim) RAFT_KERNEL
       const float* centroid = centers + cluster_id * dim;
       for (uint32_t d = threadIdx.x; d < dim; d += BlockDim) {
         if constexpr (kIsL2) {
-          s_query_term[d] = query[d] - centroid[d] - sq_vmin[d];
+          s_query_term[d] = s_aux[d] - centroid[d];
         } else {
-          s_recon_base[d] = centroid[d] + sq_vmin[d];
+          s_aux[d] = centroid[d] + sq_vmin[d];
         }
       }
     }
@@ -202,10 +220,10 @@ __launch_bounds__(BlockDim) RAFT_KERNEL
       continue;
     }
 
-    const uint8_t* codes    = data_ptrs[cluster_id];
-    uint32_t sample_offset  = (probe_ix > 0) ? my_chunk[probe_ix - 1] : 0;
-    uint32_t padded_dim     = ((dim + veclen - 1) / veclen) * veclen;
-    uint32_t n_dim_blocks   = padded_dim / veclen;
+    const uint8_t* codes   = data_ptrs[cluster_id];
+    uint32_t sample_offset = (probe_ix > 0) ? my_chunk[probe_ix - 1] : 0;
+    uint32_t padded_dim    = ((dim + veclen - 1) / veclen) * veclen;
+    uint32_t n_dim_blocks  = padded_dim / veclen;
 
     for (uint32_t group = warp_id * kIndexGroupSize; group < cluster_sz;
          group += kWarpsPerBlock * kIndexGroupSize) {
@@ -232,7 +250,7 @@ __launch_bounds__(BlockDim) RAFT_KERNEL
               float diff = s_query_term[l + j] - recon;
               dist += diff * diff;
             } else {
-              float v_d = s_recon_base[l + j] + recon;
+              float v_d = s_aux[l + j] + recon;
               dist += s_query_term[l + j] * v_d;
               if constexpr (kIsCosine) { v_norm_sq += v_d * v_d; }
             }
@@ -288,15 +306,12 @@ __launch_bounds__(BlockDim) RAFT_KERNEL
 // ---------------------------------------------------------------------------
 // Compute shared-memory size for a given kernel configuration
 // ---------------------------------------------------------------------------
-inline size_t sq_scan_smem_size(uint32_t dim, SqScanMetric metric)
-{
-  return (metric == SqScanMetric::kL2 ? 2 : 3) * dim * sizeof(float);
-}
+inline size_t sq_scan_smem_size(uint32_t dim) { return 3 * dim * sizeof(float); }
 
 template <int Capacity>
-size_t sq_scan_total_smem(uint32_t dim, uint32_t k, SqScanMetric metric)
+size_t sq_scan_total_smem(uint32_t dim, uint32_t k)
 {
-  size_t scan_smem = sq_scan_smem_size(dim, metric);
+  size_t scan_smem = sq_scan_smem_size(dim);
   if constexpr (Capacity > 0) {
     constexpr int kSubwarpSize = std::min<int>(Capacity, raft::WarpSize);
     int num_subwarps           = kSqScanThreads / kSubwarpSize;
@@ -331,10 +346,25 @@ void ivf_sq_scan_launch(const index<IdxT>& idx,
   constexpr int kThreads          = kSqScanThreads;
   uint32_t dim                    = idx.dim();
 
-  constexpr uint32_t kMaxGridY = 32768;
+  constexpr uint32_t kMaxGridY = 65535;
+
+  auto do_launch = [&](auto kernel_ptr) {
+    size_t smem = sq_scan_total_smem<Capacity>(dim, k);
 
-  auto do_launch = [&](auto kernel_ptr, SqScanMetric metric_val) {
-    size_t smem = sq_scan_total_smem<Capacity>(dim, k, metric_val);
+    {
+      int dev_id;
+      RAFT_CUDA_TRY(cudaGetDevice(&dev_id));
+      int max_smem;
+      RAFT_CUDA_TRY(
+        cudaDeviceGetAttribute(&max_smem, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev_id));
+      RAFT_EXPECTS(smem <= size_t(max_smem),
+                   "IVF-SQ scan kernel requires %zu bytes of shared memory (dim=%u, k=%u), "
+                   "but the device supports at most %d bytes per block.",
+                   smem,
+                   dim,
+                   k,
+                   max_smem);
+    }
 
     RAFT_CUDA_TRY(
       cudaFuncSetAttribute(kernel_ptr, cudaFuncAttributeMaxDynamicSharedMemorySize, smem));
@@ -342,9 +372,11 @@ void ivf_sq_scan_launch(const index<IdxT>& idx,
     // If grid_dim_x == 0, compute the optimal value and return
     if constexpr (kManageLocalTopK) {
       if (grid_dim_x == 0) {
-        grid_dim_x = configure_grid_dim_x(
-          std::min(kMaxGridY, n_queries), n_probes, smem, kThreads,
-          reinterpret_cast<const void*>(kernel_ptr));
+        grid_dim_x = configure_grid_dim_x(std::min(kMaxGridY, n_queries),
+                                          n_probes,
+                                          smem,
+                                          kThreads,
+                                          reinterpret_cast<const void*>(kernel_ptr));
         return;
       }
     }
@@ -393,19 +425,14 @@ void ivf_sq_scan_launch(const index<IdxT>& idx,
   switch (idx.metric()) {
     case cuvs::distance::DistanceType::L2Expanded:
     case cuvs::distance::DistanceType::L2SqrtExpanded:
-      do_launch(
-        ivf_sq_scan_kernel<kThreads, Capacity, SqScanMetric::kL2, IdxT, IvfSampleFilterT>,
-        SqScanMetric::kL2);
+      do_launch(ivf_sq_scan_kernel<kThreads, Capacity, SqScanMetric::kL2, IdxT, IvfSampleFilterT>);
       break;
     case cuvs::distance::DistanceType::InnerProduct:
-      do_launch(
-        ivf_sq_scan_kernel<kThreads, Capacity, SqScanMetric::kIP, IdxT, IvfSampleFilterT>,
-        SqScanMetric::kIP);
+      do_launch(ivf_sq_scan_kernel<kThreads, Capacity, SqScanMetric::kIP, IdxT, IvfSampleFilterT>);
       break;
     case cuvs::distance::DistanceType::CosineExpanded:
       do_launch(
-        ivf_sq_scan_kernel<kThreads, Capacity, SqScanMetric::kCosine, IdxT, IvfSampleFilterT>,
-        SqScanMetric::kCosine);
+        ivf_sq_scan_kernel<kThreads, Capacity, SqScanMetric::kCosine, IdxT, IvfSampleFilterT>);
       break;
     default: RAFT_FAIL("Unsupported metric type for IVF-SQ scan.");
   }
@@ -434,13 +461,13 @@ void ivf_sq_scan(raft::resources const& handle,
   // Determine the fused top-k capacity (0 = disabled / fallback to materialization)
   int capacity = is_local_topk_feasible(k) ? raft::bound_by_power_of_two(int(k)) : 0;
 
-  // Clamp to supported compile-time Capacity values.
-  // Using a limited set to avoid excessive template instantiations.
-  if (capacity > 0 && capacity <= 32) {
+  // Snap to the nearest supported compile-time Capacity value (must be a
+  // power of two). Values up to 32 share one instantiation; 64, 128 and 256
+  // each get their own.  Beyond kMaxSqScanCapacity we fall back to the
+  // non-fused path (Capacity == 0).
+  if (capacity > 0 && capacity < 32) {
     capacity = 32;
-  } else if (capacity > 32 && capacity <= 256) {
-    capacity = 256;
-  } else if (capacity > 256) {
+  } else if (capacity > kMaxSqScanCapacity) {
     capacity = 0;
   }
 
@@ -464,6 +491,8 @@ void ivf_sq_scan(raft::resources const& handle,
   switch (capacity) {
     case 0: fwd(std::integral_constant<int, 0>{}); break;
     case 32: fwd(std::integral_constant<int, 32>{}); break;
+    case 64: fwd(std::integral_constant<int, 64>{}); break;
+    case 128: fwd(std::integral_constant<int, 128>{}); break;
     case 256: fwd(std::integral_constant<int, 256>{}); break;
     default: RAFT_FAIL("Unexpected capacity value %d", capacity);
   }
@@ -490,7 +519,8 @@ void search_impl(raft::resources const& handle,
 
   std::size_t n_queries_probes = std::size_t(n_queries) * std::size_t(n_probes);
 
-  rmm::device_uvector<float> query_norm_dev(n_queries, stream, search_mr);
+  bool needs_query_norms = index.metric() != cuvs::distance::DistanceType::InnerProduct;
+  rmm::device_uvector<float> query_norm_dev(needs_query_norms ? n_queries : 0, stream, search_mr);
   rmm::device_uvector<float> distance_buffer_dev(n_queries * index.n_lists(), stream, search_mr);
   rmm::device_uvector<float> coarse_distances_dev(n_queries_probes, stream, search_mr);
   rmm::device_uvector<uint32_t> coarse_indices_dev(n_queries_probes, stream, search_mr);
@@ -581,7 +611,7 @@ void search_impl(raft::resources const& handle,
     raft::linalg::map_offset(
       handle,
       distance_buffer_dev_view,
-      [=] __device__(const uint32_t idx, const float dist) {
+      [=] __device__(const int64_t idx, const float dist) {
         const auto query   = idx / n_lists_local;
         const auto cluster = idx % n_lists_local;
         float denom        = q_norm_ptr[query] * center_norm_ptr[cluster];
@@ -616,25 +646,29 @@ void search_impl(raft::resources const& handle,
   // Determine grid_dim_x for the fused path
   uint32_t grid_dim_x = 0;
   if (manage_local_topk) {
-    if (n_probes > 1) {
-      // Query the occupancy to compute optimal grid_dim_x (does not launch)
-      ivf_sq_scan(handle,
-                  index,
-                  converted_queries_ptr,
-                  query_norm_dev.data(),
-                  n_queries,
-                  n_probes,
-                  k,
-                  0,
-                  coarse_indices_dev.data(),
-                  chunk_index.data(),
-                  nullptr,
-                  nullptr,
-                  filter_adapter,
-                  grid_dim_x,
-                  stream);
-    } else {
-      grid_dim_x = 1;
+    // Query the occupancy to compute optimal grid_dim_x (does not launch)
+    ivf_sq_scan(handle,
+                index,
+                converted_queries_ptr,
+                query_norm_dev.data(),
+                n_queries,
+                n_probes,
+                k,
+                0,
+                coarse_indices_dev.data(),
+                chunk_index.data(),
+                nullptr,
+                nullptr,
+                filter_adapter,
+                grid_dim_x,
+                stream);
+    if (grid_dim_x == 0) {
+      manage_local_topk = false;
+      RAFT_LOG_WARN(
+        "IVF-SQ fused top-k kernel has zero occupancy (dim=%u, k=%u); "
+        "falling back to the non-fused scan path.",
+        index.dim(),
+        k);
     }
   }
 
@@ -654,8 +688,8 @@ void search_impl(raft::resources const& handle,
     rmm::device_uvector<float> distances_tmp(0, stream, search_mr);
     rmm::device_uvector<uint32_t> indices_tmp(0, stream, search_mr);
 
-    float* dist_out_ptr    = nullptr;
-    uint32_t* idx_out_ptr  = nullptr;
+    float* dist_out_ptr   = nullptr;
+    uint32_t* idx_out_ptr = nullptr;
 
     if (grid_dim_x > 1) {
       distances_tmp.resize(target_size, stream);
@@ -688,18 +722,18 @@ void search_impl(raft::resources const& handle,
       auto cols = uint32_t(grid_dim_x) * k;
       cuvs::selection::select_k(
         handle,
-        raft::make_device_matrix_view<const float, int64_t>(
-          distances_tmp.data(), n_queries, cols),
-        raft::make_device_matrix_view<const uint32_t, int64_t>(
-          indices_tmp.data(), n_queries, cols),
+        raft::make_device_matrix_view<const float, int64_t>(distances_tmp.data(), n_queries, cols),
+        raft::make_device_matrix_view<const uint32_t, int64_t>(indices_tmp.data(), n_queries, cols),
         raft::make_device_matrix_view<float, int64_t>(distances, n_queries, k),
         raft::make_device_matrix_view<uint32_t, int64_t>(neighbors_uint32_ptr, n_queries, k),
         select_min);
     }
   } else {
     // --- Fallback: materialize all distances ---
-    uint32_t max_samples =
-      std::max<uint32_t>(static_cast<uint32_t>(index.accum_sorted_sizes()(n_probes)), k);
+    int64_t ms = std::max<int64_t>(index.accum_sorted_sizes()(n_probes), k);
+    RAFT_EXPECTS(ms <= int64_t(std::numeric_limits<uint32_t>::max()),
+                 "The maximum sample size is too big.");
+    uint32_t max_samples = static_cast<uint32_t>(ms);
 
     rmm::device_uvector<float> all_distances(
       std::size_t(n_queries) * max_samples, stream, search_mr);
@@ -798,8 +832,10 @@ inline void search_with_filtering(raft::resources const& handle,
 
   uint32_t max_samples = 0;
   if (!manage_local_topk) {
-    max_samples =
-      std::max<uint32_t>(static_cast<uint32_t>(index.accum_sorted_sizes()(n_probes)), k);
+    int64_t ms = std::max<int64_t>(index.accum_sorted_sizes()(n_probes), k);
+    RAFT_EXPECTS(ms <= int64_t(std::numeric_limits<uint32_t>::max()),
+                 "The maximum sample size is too big.");
+    max_samples = static_cast<uint32_t>(ms);
   }
 
   constexpr uint64_t kExpectedWsSize = 1024ull * 1024 * 1024;
@@ -813,7 +849,7 @@ inline void search_with_filtering(raft::resources const& handle,
     // (The scan output is at most grid_dim_x * k per query, which is small)
     // Conservatively assume grid_dim_x <= n_probes for the workspace estimate
     uint64_t fused_out = uint64_t(n_probes) * k;
-    ws_per_query = sizeof(float) * (uint64_t(index.n_lists()) + n_probes + 1 + fused_out +
+    ws_per_query       = sizeof(float) * (uint64_t(index.n_lists()) + n_probes + 1 + fused_out +
                                     converted_query_floats) +
                    sizeof(uint32_t) * (uint64_t(n_probes) * 2 + 1 + fused_out + k);
   } else {

From 3a3427f5ca5de84086c920172f8f244c36702c7f Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Tue, 7 Apr 2026 14:06:49 +0200
Subject: [PATCH 24/43] Addressing review (tests updates)

---
 cpp/tests/CMakeLists.txt                      |   2 +-
 cpp/tests/neighbors/ann_ivf_sq.cuh            | 317 ++++++++++--------
 .../ann_ivf_sq/test_float_int64_t.cu          |  20 ++
 ..._float_uint8_t.cu => test_half_int64_t.cu} |  18 +-
 4 files changed, 207 insertions(+), 150 deletions(-)
 create mode 100644 cpp/tests/neighbors/ann_ivf_sq/test_float_int64_t.cu
 rename cpp/tests/neighbors/ann_ivf_sq/{test_float_uint8_t.cu => test_half_int64_t.cu} (55%)

diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 59a5f1fb70..eea4d7f775 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -135,7 +135,7 @@ ConfigureTest(
 
 ConfigureTest(
   NAME NEIGHBORS_ANN_IVF_SQ_TEST
-  PATH neighbors/ann_ivf_sq/test_float_uint8_t.cu
+  PATH neighbors/ann_ivf_sq/test_float_int64_t.cu neighbors/ann_ivf_sq/test_half_int64_t.cu
   GPUS 1
   PERCENT 100
 )
diff --git a/cpp/tests/neighbors/ann_ivf_sq.cuh b/cpp/tests/neighbors/ann_ivf_sq.cuh
index 5cc4e55075..f7311b75e4 100644
--- a/cpp/tests/neighbors/ann_ivf_sq.cuh
+++ b/cpp/tests/neighbors/ann_ivf_sq.cuh
@@ -57,143 +57,67 @@ class AnnIVFSQTest : public ::testing::TestWithParam<AnnIvfSqInputs<IdxT>> {
   {
   }
 
-  void testIVFSQ()
+  void testSearch()
   {
-    size_t queries_size = ps.num_queries * ps.k;
-    std::vector<IdxT> indices_ivfsq(queries_size);
-    std::vector<IdxT> indices_naive(queries_size);
-    std::vector<T> distances_ivfsq(queries_size);
-    std::vector<T> distances_naive(queries_size);
-
-    {
-      rmm::device_uvector<T> distances_naive_dev(queries_size, stream_);
-      rmm::device_uvector<IdxT> indices_naive_dev(queries_size, stream_);
-      cuvs::neighbors::naive_knn<T, DataT, IdxT>(handle_,
-                                                 distances_naive_dev.data(),
-                                                 indices_naive_dev.data(),
-                                                 search_queries.data(),
-                                                 database.data(),
-                                                 ps.num_queries,
-                                                 ps.num_db_vecs,
-                                                 ps.dim,
-                                                 ps.k,
-                                                 ps.metric);
-      raft::update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_);
-      raft::update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_);
-      raft::resource::sync_stream(handle_);
-    }
-
-    {
-      double min_recall =
-        std::min(1.0, static_cast<double>(ps.nprobe) / static_cast<double>(ps.nlist));
-
-      rmm::device_uvector<T> distances_ivfsq_dev(queries_size, stream_);
-      rmm::device_uvector<IdxT> indices_ivfsq_dev(queries_size, stream_);
-
-      {
-        cuvs::neighbors::ivf_sq::index_params index_params;
-        cuvs::neighbors::ivf_sq::search_params search_params;
-        index_params.n_lists   = ps.nlist;
-        index_params.metric    = ps.metric;
-        search_params.n_probes = ps.nprobe;
-
-        index_params.add_data_on_build        = true;
-        index_params.kmeans_trainset_fraction = 0.5;
-
-        cuvs::neighbors::ivf_sq::index_params index_params_no_add;
-        index_params_no_add.n_lists                  = ps.nlist;
-        index_params_no_add.metric                   = ps.metric;
-        index_params_no_add.add_data_on_build        = false;
-        index_params_no_add.kmeans_trainset_fraction = 0.5;
-
-        cuvs::neighbors::ivf_sq::index<uint8_t> idx(handle_);
-        cuvs::neighbors::ivf_sq::index<uint8_t> idx_empty(handle_);
-
-        if (!ps.host_dataset) {
-          auto database_view = raft::make_device_matrix_view<const DataT, IdxT>(
-            (const DataT*)database.data(), ps.num_db_vecs, ps.dim);
-
-          idx = cuvs::neighbors::ivf_sq::build(handle_, index_params, database_view);
-
-          idx_empty = cuvs::neighbors::ivf_sq::build(handle_, index_params_no_add, database_view);
-
-          auto vector_indices = raft::make_device_vector<IdxT, IdxT>(handle_, ps.num_db_vecs);
-          raft::linalg::map_offset(handle_, vector_indices.view(), raft::identity_op{});
-          raft::resource::sync_stream(handle_);
-
-          auto indices_view = raft::make_device_vector_view<const IdxT, IdxT>(
-            vector_indices.data_handle(), ps.num_db_vecs);
-          cuvs::neighbors::ivf_sq::extend(
-            handle_,
-            database_view,
-            std::make_optional<raft::device_vector_view<const IdxT, IdxT>>(indices_view),
-            &idx_empty);
-        } else {
-          auto host_database = raft::make_host_matrix<DataT, IdxT>(ps.num_db_vecs, ps.dim);
-          raft::copy(
-            host_database.data_handle(), database.data(), ps.num_db_vecs * ps.dim, stream_);
-          raft::resource::sync_stream(handle_);
-
-          idx = cuvs::neighbors::ivf_sq::build(
-            handle_, index_params, raft::make_const_mdspan(host_database.view()));
-
-          idx_empty = cuvs::neighbors::ivf_sq::build(
-            handle_, index_params_no_add, raft::make_const_mdspan(host_database.view()));
-
-          auto vector_indices = raft::make_host_vector<IdxT>(handle_, ps.num_db_vecs);
-          std::iota(
-            vector_indices.data_handle(), vector_indices.data_handle() + ps.num_db_vecs, IdxT(0));
-
-          auto indices_view = raft::make_host_vector_view<const IdxT, IdxT>(
-            vector_indices.data_handle(), ps.num_db_vecs);
-          auto host_database_view = raft::make_host_matrix_view<const DataT, IdxT>(
-            host_database.data_handle(), ps.num_db_vecs, ps.dim);
-          cuvs::neighbors::ivf_sq::extend(
-            handle_,
-            host_database_view,
-            std::make_optional<raft::host_vector_view<const IdxT, IdxT>>(indices_view),
-            &idx_empty);
-        }
-
-        // Serialize / deserialize round-trip
-        tmp_index_file index_file;
-        cuvs::neighbors::ivf_sq::serialize(handle_, index_file.filename, idx);
-        cuvs::neighbors::ivf_sq::index<uint8_t> index_loaded(handle_);
-        cuvs::neighbors::ivf_sq::deserialize(handle_, index_file.filename, &index_loaded);
-        ASSERT_EQ(idx.size(), index_loaded.size());
-        ASSERT_EQ(idx.dim(), index_loaded.dim());
-        ASSERT_EQ(idx.n_lists(), index_loaded.n_lists());
-
-        auto search_queries_view = raft::make_device_matrix_view<const DataT, IdxT>(
-          search_queries.data(), ps.num_queries, ps.dim);
-        auto indices_out_view =
-          raft::make_device_matrix_view<IdxT, IdxT>(indices_ivfsq_dev.data(), ps.num_queries, ps.k);
-        auto dists_out_view =
-          raft::make_device_matrix_view<T, IdxT>(distances_ivfsq_dev.data(), ps.num_queries, ps.k);
+    auto naive   = compute_naive_knn();
+    auto idx     = build_index(true);
+    auto results = search_index(idx);
+
+    float eps = 0.1;
+    ASSERT_TRUE(eval_neighbours(naive.indices,
+                                results.indices,
+                                naive.distances,
+                                results.distances,
+                                ps.num_queries,
+                                ps.k,
+                                eps,
+                                min_recall_threshold()));
+  }
 
-        cuvs::neighbors::ivf_sq::search(handle_,
-                                        search_params,
-                                        index_loaded,
-                                        search_queries_view,
-                                        indices_out_view,
-                                        dists_out_view);
+  void testSerialize()
+  {
+    auto idx = build_index(true);
+
+    tmp_index_file index_file;
+    cuvs::neighbors::ivf_sq::serialize(handle_, index_file.filename, idx);
+    cuvs::neighbors::ivf_sq::index<uint8_t> index_loaded(handle_);
+    cuvs::neighbors::ivf_sq::deserialize(handle_, index_file.filename, &index_loaded);
+
+    ASSERT_EQ(idx.size(), index_loaded.size());
+    ASSERT_EQ(idx.dim(), index_loaded.dim());
+    ASSERT_EQ(idx.n_lists(), index_loaded.n_lists());
+
+    auto results_orig   = search_index(idx);
+    auto results_loaded = search_index(index_loaded);
+
+    float eps = 0.001;
+    ASSERT_TRUE(eval_neighbours(results_orig.indices,
+                                results_loaded.indices,
+                                results_orig.distances,
+                                results_loaded.distances,
+                                ps.num_queries,
+                                ps.k,
+                                eps,
+                                1.0));
+  }
 
-        raft::update_host(
-          distances_ivfsq.data(), distances_ivfsq_dev.data(), queries_size, stream_);
-        raft::update_host(indices_ivfsq.data(), indices_ivfsq_dev.data(), queries_size, stream_);
-        raft::resource::sync_stream(handle_);
-      }
-      // SQ introduces quantization error, so we relax the distance epsilon
-      float eps = 0.1;
-      ASSERT_TRUE(eval_neighbours(indices_naive,
-                                  indices_ivfsq,
-                                  distances_naive,
-                                  distances_ivfsq,
-                                  ps.num_queries,
-                                  ps.k,
-                                  eps,
-                                  min_recall));
-    }
+  void testExtend()
+  {
+    auto naive     = compute_naive_knn();
+    auto idx_empty = build_index(false);
+    extend_index(&idx_empty);
+
+    auto results = search_index(idx_empty);
+
+    float eps = 0.1;
+    ASSERT_TRUE(eval_neighbours(naive.indices,
+                                results.indices,
+                                naive.distances,
+                                results.distances,
+                                ps.num_queries,
+                                ps.k,
+                                eps,
+                                min_recall_threshold()));
   }
 
   void testFilter()
@@ -318,6 +242,128 @@ class AnnIVFSQTest : public ::testing::TestWithParam<AnnIvfSqInputs<IdxT>> {
   }
 
  private:
+  struct SearchResults {
+    std::vector<IdxT> indices;
+    std::vector<T> distances;
+  };
+
+  double min_recall_threshold()
+  {
+    return std::min(1.0, static_cast<double>(ps.nprobe) / static_cast<double>(ps.nlist));
+  }
+
+  SearchResults compute_naive_knn()
+  {
+    size_t queries_size = ps.num_queries * ps.k;
+    SearchResults results;
+    results.indices.resize(queries_size);
+    results.distances.resize(queries_size);
+
+    rmm::device_uvector<T> distances_dev(queries_size, stream_);
+    rmm::device_uvector<IdxT> indices_dev(queries_size, stream_);
+    cuvs::neighbors::naive_knn<T, DataT, IdxT>(handle_,
+                                               distances_dev.data(),
+                                               indices_dev.data(),
+                                               search_queries.data(),
+                                               database.data(),
+                                               ps.num_queries,
+                                               ps.num_db_vecs,
+                                               ps.dim,
+                                               ps.k,
+                                               ps.metric);
+    raft::update_host(results.distances.data(), distances_dev.data(), queries_size, stream_);
+    raft::update_host(results.indices.data(), indices_dev.data(), queries_size, stream_);
+    raft::resource::sync_stream(handle_);
+    return results;
+  }
+
+  cuvs::neighbors::ivf_sq::index<uint8_t> build_index(bool add_data_on_build)
+  {
+    cuvs::neighbors::ivf_sq::index_params index_params;
+    index_params.n_lists                  = ps.nlist;
+    index_params.metric                   = ps.metric;
+    index_params.add_data_on_build        = add_data_on_build;
+    index_params.kmeans_trainset_fraction = 0.5;
+
+    if (!ps.host_dataset) {
+      auto database_view = raft::make_device_matrix_view<const DataT, IdxT>(
+        (const DataT*)database.data(), ps.num_db_vecs, ps.dim);
+      return cuvs::neighbors::ivf_sq::build(handle_, index_params, database_view);
+    } else {
+      auto host_database = raft::make_host_matrix<DataT, IdxT>(ps.num_db_vecs, ps.dim);
+      raft::copy(host_database.data_handle(), database.data(), ps.num_db_vecs * ps.dim, stream_);
+      raft::resource::sync_stream(handle_);
+      return cuvs::neighbors::ivf_sq::build(
+        handle_, index_params, raft::make_const_mdspan(host_database.view()));
+    }
+  }
+
+  void extend_index(cuvs::neighbors::ivf_sq::index<uint8_t>* idx)
+  {
+    if (!ps.host_dataset) {
+      auto database_view = raft::make_device_matrix_view<const DataT, IdxT>(
+        (const DataT*)database.data(), ps.num_db_vecs, ps.dim);
+      auto vector_indices = raft::make_device_vector<IdxT, IdxT>(handle_, ps.num_db_vecs);
+      raft::linalg::map_offset(handle_, vector_indices.view(), raft::identity_op{});
+      raft::resource::sync_stream(handle_);
+
+      auto indices_view = raft::make_device_vector_view<const IdxT, IdxT>(
+        vector_indices.data_handle(), ps.num_db_vecs);
+      cuvs::neighbors::ivf_sq::extend(
+        handle_,
+        database_view,
+        std::make_optional<raft::device_vector_view<const IdxT, IdxT>>(indices_view),
+        idx);
+    } else {
+      auto host_database = raft::make_host_matrix<DataT, IdxT>(ps.num_db_vecs, ps.dim);
+      raft::copy(host_database.data_handle(), database.data(), ps.num_db_vecs * ps.dim, stream_);
+      raft::resource::sync_stream(handle_);
+
+      auto vector_indices = raft::make_host_vector<IdxT>(handle_, ps.num_db_vecs);
+      std::iota(
+        vector_indices.data_handle(), vector_indices.data_handle() + ps.num_db_vecs, IdxT(0));
+
+      auto indices_view =
+        raft::make_host_vector_view<const IdxT, IdxT>(vector_indices.data_handle(), ps.num_db_vecs);
+      auto host_database_view = raft::make_host_matrix_view<const DataT, IdxT>(
+        host_database.data_handle(), ps.num_db_vecs, ps.dim);
+      cuvs::neighbors::ivf_sq::extend(
+        handle_,
+        host_database_view,
+        std::make_optional<raft::host_vector_view<const IdxT, IdxT>>(indices_view),
+        idx);
+    }
+  }
+
+  SearchResults search_index(const cuvs::neighbors::ivf_sq::index<uint8_t>& idx)
+  {
+    size_t queries_size = ps.num_queries * ps.k;
+    SearchResults results;
+    results.indices.resize(queries_size);
+    results.distances.resize(queries_size);
+
+    cuvs::neighbors::ivf_sq::search_params search_params;
+    search_params.n_probes = ps.nprobe;
+
+    rmm::device_uvector<T> distances_dev(queries_size, stream_);
+    rmm::device_uvector<IdxT> indices_dev(queries_size, stream_);
+
+    auto search_queries_view = raft::make_device_matrix_view<const DataT, IdxT>(
+      search_queries.data(), ps.num_queries, ps.dim);
+    auto indices_out_view =
+      raft::make_device_matrix_view<IdxT, IdxT>(indices_dev.data(), ps.num_queries, ps.k);
+    auto dists_out_view =
+      raft::make_device_matrix_view<T, IdxT>(distances_dev.data(), ps.num_queries, ps.k);
+
+    cuvs::neighbors::ivf_sq::search(
+      handle_, search_params, idx, search_queries_view, indices_out_view, dists_out_view);
+
+    raft::update_host(results.distances.data(), distances_dev.data(), queries_size, stream_);
+    raft::update_host(results.indices.data(), indices_dev.data(), queries_size, stream_);
+    raft::resource::sync_stream(handle_);
+    return results;
+  }
+
   raft::resources handle_;
   rmm::cuda_stream_view stream_;
   AnnIvfSqInputs<IdxT> ps;
@@ -398,7 +444,8 @@ const std::vector<AnnIvfSqInputs<int64_t>> inputs = {
   {1000, 10000, 16, 100, 200, 1024, cuvs::distance::DistanceType::L2Expanded},
   {1000, 10000, 16, 100, 200, 1024, cuvs::distance::DistanceType::InnerProduct},
 
-  // ===== Large k (beyond fused top-k kMaxSqScanCapacity=256, exercises materialized fallback) =====
+  // ===== Large k (beyond fused top-k kMaxSqScanCapacity=256, exercises materialized fallback)
+  // =====
   // k=257: smallest k that forces the materialized path (Capacity clamped to 0)
   {100, 10000, 32, 257, 100, 64, cuvs::distance::DistanceType::L2Expanded},
   {100, 10000, 32, 257, 100, 64, cuvs::distance::DistanceType::InnerProduct},
diff --git a/cpp/tests/neighbors/ann_ivf_sq/test_float_int64_t.cu b/cpp/tests/neighbors/ann_ivf_sq/test_float_int64_t.cu
new file mode 100644
index 0000000000..734f736b09
--- /dev/null
+++ b/cpp/tests/neighbors/ann_ivf_sq/test_float_int64_t.cu
@@ -0,0 +1,20 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <gtest/gtest.h>
+
+#include "../ann_ivf_sq.cuh"
+
+namespace cuvs::neighbors::ivf_sq {
+
+typedef AnnIVFSQTest<float, float, int64_t> AnnIVFSQTestF_float;
+TEST_P(AnnIVFSQTestF_float, AnnIVFSQSearch) { this->testSearch(); }
+TEST_P(AnnIVFSQTestF_float, AnnIVFSQSerialize) { this->testSerialize(); }
+TEST_P(AnnIVFSQTestF_float, AnnIVFSQExtend) { this->testExtend(); }
+TEST_P(AnnIVFSQTestF_float, AnnIVFSQFilter) { this->testFilter(); }
+
+INSTANTIATE_TEST_CASE_P(AnnIVFSQTest, AnnIVFSQTestF_float, ::testing::ValuesIn(inputs));
+
+}  // namespace cuvs::neighbors::ivf_sq
diff --git a/cpp/tests/neighbors/ann_ivf_sq/test_float_uint8_t.cu b/cpp/tests/neighbors/ann_ivf_sq/test_half_int64_t.cu
similarity index 55%
rename from cpp/tests/neighbors/ann_ivf_sq/test_float_uint8_t.cu
rename to cpp/tests/neighbors/ann_ivf_sq/test_half_int64_t.cu
index f136b6f41c..e6f5e44dd3 100644
--- a/cpp/tests/neighbors/ann_ivf_sq/test_float_uint8_t.cu
+++ b/cpp/tests/neighbors/ann_ivf_sq/test_half_int64_t.cu
@@ -9,21 +9,11 @@
 
 namespace cuvs::neighbors::ivf_sq {
 
-typedef AnnIVFSQTest<float, float, int64_t> AnnIVFSQTestF_float;
-TEST_P(AnnIVFSQTestF_float, AnnIVFSQ)
-{
-  this->testIVFSQ();
-  this->testFilter();
-}
-
-INSTANTIATE_TEST_CASE_P(AnnIVFSQTest, AnnIVFSQTestF_float, ::testing::ValuesIn(inputs));
-
 typedef AnnIVFSQTest<float, half, int64_t> AnnIVFSQTestF_half;
-TEST_P(AnnIVFSQTestF_half, AnnIVFSQ)
-{
-  this->testIVFSQ();
-  this->testFilter();
-}
+TEST_P(AnnIVFSQTestF_half, AnnIVFSQSearch) { this->testSearch(); }
+TEST_P(AnnIVFSQTestF_half, AnnIVFSQSerialize) { this->testSerialize(); }
+TEST_P(AnnIVFSQTestF_half, AnnIVFSQExtend) { this->testExtend(); }
+TEST_P(AnnIVFSQTestF_half, AnnIVFSQFilter) { this->testFilter(); }
 
 INSTANTIATE_TEST_CASE_P(AnnIVFSQTest, AnnIVFSQTestF_half, ::testing::ValuesIn(inputs_half));
 

From 1b182d70487eae2b1e0b5609fc775506a90167de Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Mon, 20 Apr 2026 13:34:01 +0200
Subject: [PATCH 25/43] Swap IdxT for CodeT

---
 cpp/include/cuvs/neighbors/ivf_sq.hpp         |  36 ++---
 cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh     |  76 +++++-----
 ...f_sq_build_extend_float_uint8_t_int64_t.cu | 142 +++++++++---------
 ...vf_sq_build_extend_half_uint8_t_int64_t.cu | 142 +++++++++---------
 cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh    |  86 ++++++-----
 .../ivf_sq_search_float_uint8_t_int64_t.cu    |   4 +-
 .../ivf_sq_search_half_uint8_t_int64_t.cu     |   4 +-
 cpp/src/neighbors/ivf_sq/ivf_sq_serialize.cuh |  54 +++----
 cpp/src/neighbors/ivf_sq_index.cpp            | 124 +++++++--------
 9 files changed, 336 insertions(+), 332 deletions(-)

diff --git a/cpp/include/cuvs/neighbors/ivf_sq.hpp b/cpp/include/cuvs/neighbors/ivf_sq.hpp
index bb29b3fef1..ba4bb39437 100644
--- a/cpp/include/cuvs/neighbors/ivf_sq.hpp
+++ b/cpp/include/cuvs/neighbors/ivf_sq.hpp
@@ -78,13 +78,13 @@ static_assert(std::is_aggregate_v<search_params>);
  * @{
  */
 
-template <typename SizeT, typename IdxT, typename ExtT>
+template <typename SizeT, typename CodeT, typename IdxT>
 struct list_spec {
-  static_assert(std::is_same_v<IdxT, uint8_t>, "IVF-SQ code type IdxT must be uint8_t");
+  static_assert(std::is_same_v<CodeT, uint8_t>, "IVF-SQ code type CodeT must be uint8_t");
 
-  using value_type   = IdxT;
+  using value_type   = CodeT;
   using list_extents = raft::matrix_extent<SizeT>;
-  using index_type   = ExtT;
+  using index_type   = IdxT;
 
   SizeT align_max;
   SizeT align_min;
@@ -98,7 +98,7 @@ struct list_spec {
   }
 
   template <typename OtherSizeT>
-  constexpr explicit list_spec(const list_spec<OtherSizeT, IdxT, ExtT>& other_spec)
+  constexpr explicit list_spec(const list_spec<OtherSizeT, CodeT, IdxT>& other_spec)
     : dim{other_spec.dim}, align_min{other_spec.align_min}, align_max{other_spec.align_max}
   {
   }
@@ -112,8 +112,8 @@ struct list_spec {
   }
 };
 
-template <typename IdxT, typename ExtT, typename SizeT = uint32_t>
-using list_data = ivf::list<list_spec, SizeT, IdxT, ExtT>;
+template <typename CodeT, typename IdxT, typename SizeT = uint32_t>
+using list_data = ivf::list<list_spec, SizeT, CodeT, IdxT>;
 
 /**
  * @}
@@ -142,18 +142,18 @@ using list_data = ivf::list<list_spec, SizeT, IdxT, ExtT>;
  * search speed, and recall compared to flat (uncompressed) and product-quantized (PQ)
  * representations.
  *
- * @tparam IdxT  SQ code type. Only uint8_t (8-bit, codes in [0,255]) for now.
+ * @tparam CodeT  SQ code type. Only uint8_t (8-bit, codes in [0,255]) for now.
  *
  */
-template <typename IdxT>
+template <typename CodeT>
 struct index : cuvs::neighbors::index {
-  static_assert(std::is_same_v<IdxT, uint8_t>, "IVF-SQ code type IdxT must be uint8_t for now.");
+  static_assert(std::is_same_v<CodeT, uint8_t>, "IVF-SQ code type CodeT must be uint8_t for now.");
 
   using index_params_type  = ivf_sq::index_params;
   using search_params_type = ivf_sq::search_params;
-  using code_type          = IdxT;
+  using code_type          = CodeT;
 
-  static constexpr uint32_t sq_bits = sizeof(IdxT) * 8;
+  static constexpr uint32_t sq_bits = sizeof(CodeT) * 8;
 
  public:
   index(const index&)            = delete;
@@ -195,14 +195,14 @@ struct index : cuvs::neighbors::index {
   raft::host_vector_view<int64_t, uint32_t> accum_sorted_sizes() noexcept;
   [[nodiscard]] raft::host_vector_view<const int64_t, uint32_t> accum_sorted_sizes() const noexcept;
 
-  raft::device_vector_view<IdxT*, uint32_t> data_ptrs() noexcept;
-  raft::device_vector_view<IdxT* const, uint32_t> data_ptrs() const noexcept;
+  raft::device_vector_view<CodeT*, uint32_t> data_ptrs() noexcept;
+  raft::device_vector_view<CodeT* const, uint32_t> data_ptrs() const noexcept;
 
   raft::device_vector_view<int64_t*, uint32_t> inds_ptrs() noexcept;
   raft::device_vector_view<int64_t* const, uint32_t> inds_ptrs() const noexcept;
 
-  std::vector<std::shared_ptr<list_data<IdxT, int64_t>>>& lists() noexcept;
-  const std::vector<std::shared_ptr<list_data<IdxT, int64_t>>>& lists() const noexcept;
+  std::vector<std::shared_ptr<list_data<CodeT, int64_t>>>& lists() noexcept;
+  const std::vector<std::shared_ptr<list_data<CodeT, int64_t>>>& lists() const noexcept;
 
   void check_consistency();
 
@@ -210,14 +210,14 @@ struct index : cuvs::neighbors::index {
   cuvs::distance::DistanceType metric_;
   bool conservative_memory_allocation_;
 
-  std::vector<std::shared_ptr<list_data<IdxT, int64_t>>> lists_;
+  std::vector<std::shared_ptr<list_data<CodeT, int64_t>>> lists_;
   raft::device_vector<uint32_t, uint32_t> list_sizes_;
   raft::device_matrix<float, uint32_t, raft::row_major> centers_;
   std::optional<raft::device_vector<float, uint32_t>> center_norms_;
   raft::device_vector<float, uint32_t> sq_vmin_;
   raft::device_vector<float, uint32_t> sq_delta_;
 
-  raft::device_vector<IdxT*, uint32_t> data_ptrs_;
+  raft::device_vector<CodeT*, uint32_t> data_ptrs_;
   raft::device_vector<int64_t*, uint32_t> inds_ptrs_;
   raft::host_vector<int64_t, uint32_t> accum_sorted_sizes_;
 };
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
index 7934460fb5..7940d0d7fb 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
@@ -104,12 +104,12 @@ __launch_bounds__(BlockSize) RAFT_KERNEL fused_column_minmax_kernel(const T* __r
   }
 }
 
-template <typename IdxT>
-auto clone(const raft::resources& res, const index<IdxT>& source) -> index<IdxT>
+template <typename CodeT>
+auto clone(const raft::resources& res, const index<CodeT>& source) -> index<CodeT>
 {
   auto stream = raft::resource::get_cuda_stream(res);
 
-  index<IdxT> target(
+  index<CodeT> target(
     res, source.metric(), source.n_lists(), source.dim(), source.conservative_memory_allocation());
 
   raft::copy(target.list_sizes().data_handle(),
@@ -219,9 +219,9 @@ RAFT_KERNEL compute_residuals_inplace_kernel(
   }
 }
 
-template <typename T, typename IdxT>
+template <typename T, typename CodeT>
 void extend(raft::resources const& handle,
-            index<IdxT>* index,
+            index<CodeT>* index,
             const T* new_vectors,
             const int64_t* new_indices,
             int64_t n_rows)
@@ -233,8 +233,8 @@ void extend(raft::resources const& handle,
   auto stream  = raft::resource::get_cuda_stream(handle);
   auto n_lists = index->n_lists();
   auto dim     = index->dim();
-  list_spec<uint32_t, IdxT, int64_t> list_device_spec{index->dim(),
-                                                      index->conservative_memory_allocation()};
+  list_spec<uint32_t, CodeT, int64_t> list_device_spec{index->dim(),
+                                                       index->conservative_memory_allocation()};
   cuvs::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
     "ivf_sq::extend(%zu, %u)", size_t(n_rows), dim);
 
@@ -401,24 +401,24 @@ void extend(raft::resources const& handle,
   }
 }
 
-template <typename T, typename IdxT>
+template <typename T, typename CodeT>
 auto extend(raft::resources const& handle,
-            const index<IdxT>& orig_index,
+            const index<CodeT>& orig_index,
             const T* new_vectors,
             const int64_t* new_indices,
-            int64_t n_rows) -> index<IdxT>
+            int64_t n_rows) -> index<CodeT>
 {
   auto ext_index = clone(handle, orig_index);
   detail::extend(handle, &ext_index, new_vectors, new_indices, n_rows);
   return ext_index;
 }
 
-template <typename T, typename IdxT, typename accessor>
+template <typename T, typename CodeT, typename accessor>
 inline auto build(
   raft::resources const& handle,
   const index_params& params,
   raft::mdspan<const T, raft::matrix_extent<int64_t>, raft::row_major, accessor> dataset)
-  -> index<IdxT>
+  -> index<CodeT>
 {
   int64_t n_rows = dataset.extent(0);
   uint32_t dim   = dataset.extent(1);
@@ -431,7 +431,7 @@ inline auto build(
   RAFT_EXPECTS(params.metric != cuvs::distance::DistanceType::CosineExpanded || dim > 1,
                "Cosine metric requires more than one dim");
 
-  index<IdxT> idx(handle, params, dim);
+  index<CodeT> idx(handle, params, dim);
 
   // Train k-means centroids and SQ parameters on the same training subset.
   // This mirrors IVF-PQ, which also trains its codebook on a subset of the data.
@@ -502,35 +502,35 @@ inline auto build(
   }
 
   if (params.add_data_on_build) {
-    detail::extend<T, IdxT>(handle, &idx, dataset.data_handle(), nullptr, n_rows);
+    detail::extend<T, CodeT>(handle, &idx, dataset.data_handle(), nullptr, n_rows);
   }
 
   return idx;
 }
 
-template <typename T, typename IdxT>
+template <typename T, typename CodeT>
 void build(raft::resources const& handle,
            const index_params& params,
            raft::device_matrix_view<const T, int64_t, raft::row_major> dataset,
-           index<IdxT>& idx)
+           index<CodeT>& idx)
 {
-  idx = build<T, IdxT>(handle, params, dataset);
+  idx = build<T, CodeT>(handle, params, dataset);
 }
 
-template <typename T, typename IdxT>
+template <typename T, typename CodeT>
 void build(raft::resources const& handle,
            const index_params& params,
            raft::host_matrix_view<const T, int64_t, raft::row_major> dataset,
-           index<IdxT>& idx)
+           index<CodeT>& idx)
 {
-  idx = build<T, IdxT>(handle, params, dataset);
+  idx = build<T, CodeT>(handle, params, dataset);
 }
 
-template <typename T, typename IdxT>
+template <typename T, typename CodeT>
 auto extend(raft::resources const& handle,
             raft::device_matrix_view<const T, int64_t, raft::row_major> new_vectors,
             std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,
-            const index<IdxT>& orig_index) -> index<IdxT>
+            const index<CodeT>& orig_index) -> index<CodeT>
 {
   RAFT_EXPECTS(new_vectors.extent(1) == orig_index.dim(),
                "new_vectors should have the same dimension as the index");
@@ -539,18 +539,18 @@ auto extend(raft::resources const& handle,
                  "new_vectors and new_indices have different number of rows");
   }
   int64_t n_rows = new_vectors.extent(0);
-  return extend<T, IdxT>(handle,
-                         orig_index,
-                         new_vectors.data_handle(),
-                         new_indices.has_value() ? new_indices.value().data_handle() : nullptr,
-                         n_rows);
+  return extend<T, CodeT>(handle,
+                          orig_index,
+                          new_vectors.data_handle(),
+                          new_indices.has_value() ? new_indices.value().data_handle() : nullptr,
+                          n_rows);
 }
 
-template <typename T, typename IdxT>
+template <typename T, typename CodeT>
 auto extend(raft::resources const& handle,
             raft::host_matrix_view<const T, int64_t, raft::row_major> new_vectors,
             std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,
-            const index<IdxT>& orig_index) -> index<IdxT>
+            const index<CodeT>& orig_index) -> index<CodeT>
 {
   RAFT_EXPECTS(new_vectors.extent(1) == orig_index.dim(),
                "new_vectors should have the same dimension as the index");
@@ -559,18 +559,18 @@ auto extend(raft::resources const& handle,
                  "new_vectors and new_indices have different number of rows");
   }
   int64_t n_rows = new_vectors.extent(0);
-  return extend<T, IdxT>(handle,
-                         orig_index,
-                         new_vectors.data_handle(),
-                         new_indices.has_value() ? new_indices.value().data_handle() : nullptr,
-                         n_rows);
+  return extend<T, CodeT>(handle,
+                          orig_index,
+                          new_vectors.data_handle(),
+                          new_indices.has_value() ? new_indices.value().data_handle() : nullptr,
+                          n_rows);
 }
 
-template <typename T, typename IdxT>
+template <typename T, typename CodeT>
 void extend(raft::resources const& handle,
             raft::device_matrix_view<const T, int64_t, raft::row_major> new_vectors,
             std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,
-            index<IdxT>* idx)
+            index<CodeT>* idx)
 {
   RAFT_EXPECTS(new_vectors.extent(1) == idx->dim(),
                "new_vectors should have the same dimension as the index");
@@ -585,11 +585,11 @@ void extend(raft::resources const& handle,
                  new_vectors.extent(0));
 }
 
-template <typename T, typename IdxT>
+template <typename T, typename CodeT>
 void extend(raft::resources const& handle,
             raft::host_matrix_view<const T, int64_t, raft::row_major> new_vectors,
             std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,
-            index<IdxT>* idx)
+            index<CodeT>* idx)
 {
   RAFT_EXPECTS(new_vectors.extent(1) == idx->dim(),
                "new_vectors should have the same dimension as the index");
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_float_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_float_uint8_t_int64_t.cu
index a97aebb11c..f22154e50f 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_float_uint8_t_int64_t.cu
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_float_uint8_t_int64_t.cu
@@ -9,77 +9,77 @@
 
 namespace cuvs::neighbors::ivf_sq {
 
-#define CUVS_INST_IVF_SQ_BUILD_EXTEND(T, IdxT)                                               \
-  auto build(raft::resources const& handle,                                                  \
-             const cuvs::neighbors::ivf_sq::index_params& params,                            \
-             raft::device_matrix_view<const T, int64_t, raft::row_major> dataset)            \
-    -> cuvs::neighbors::ivf_sq::index<IdxT>                                                  \
-  {                                                                                          \
-    return cuvs::neighbors::ivf_sq::index<IdxT>(                                             \
-      std::move(cuvs::neighbors::ivf_sq::detail::build<T, IdxT>(handle, params, dataset)));  \
-  }                                                                                          \
-                                                                                             \
-  void build(raft::resources const& handle,                                                  \
-             const cuvs::neighbors::ivf_sq::index_params& params,                            \
-             raft::device_matrix_view<const T, int64_t, raft::row_major> dataset,            \
-             cuvs::neighbors::ivf_sq::index<IdxT>& idx)                                      \
-  {                                                                                          \
-    cuvs::neighbors::ivf_sq::detail::build<T, IdxT>(handle, params, dataset, idx);           \
-  }                                                                                          \
-                                                                                             \
-  auto build(raft::resources const& handle,                                                  \
-             const cuvs::neighbors::ivf_sq::index_params& params,                            \
-             raft::host_matrix_view<const T, int64_t, raft::row_major> dataset)              \
-    -> cuvs::neighbors::ivf_sq::index<IdxT>                                                  \
-  {                                                                                          \
-    return cuvs::neighbors::ivf_sq::index<IdxT>(                                             \
-      std::move(cuvs::neighbors::ivf_sq::detail::build<T, IdxT>(handle, params, dataset)));  \
-  }                                                                                          \
-                                                                                             \
-  void build(raft::resources const& handle,                                                  \
-             const cuvs::neighbors::ivf_sq::index_params& params,                            \
-             raft::host_matrix_view<const T, int64_t, raft::row_major> dataset,              \
-             cuvs::neighbors::ivf_sq::index<IdxT>& idx)                                      \
-  {                                                                                          \
-    cuvs::neighbors::ivf_sq::detail::build<T, IdxT>(handle, params, dataset, idx);           \
-  }                                                                                          \
-                                                                                             \
-  auto extend(raft::resources const& handle,                                                 \
-              raft::device_matrix_view<const T, int64_t, raft::row_major> new_vectors,       \
-              std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,   \
-              const cuvs::neighbors::ivf_sq::index<IdxT>& orig_index)                        \
-    -> cuvs::neighbors::ivf_sq::index<IdxT>                                                  \
-  {                                                                                          \
-    return cuvs::neighbors::ivf_sq::index<IdxT>(                                             \
-      std::move(cuvs::neighbors::ivf_sq::detail::extend<T, IdxT>(                            \
-        handle, new_vectors, new_indices, orig_index)));                                     \
-  }                                                                                          \
-                                                                                             \
-  void extend(raft::resources const& handle,                                                 \
-              raft::device_matrix_view<const T, int64_t, raft::row_major> new_vectors,       \
-              std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,   \
-              cuvs::neighbors::ivf_sq::index<IdxT>* idx)                                     \
-  {                                                                                          \
-    cuvs::neighbors::ivf_sq::detail::extend<T, IdxT>(handle, new_vectors, new_indices, idx); \
-  }                                                                                          \
-                                                                                             \
-  auto extend(raft::resources const& handle,                                                 \
-              raft::host_matrix_view<const T, int64_t, raft::row_major> new_vectors,         \
-              std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,     \
-              const cuvs::neighbors::ivf_sq::index<IdxT>& orig_index)                        \
-    -> cuvs::neighbors::ivf_sq::index<IdxT>                                                  \
-  {                                                                                          \
-    return cuvs::neighbors::ivf_sq::index<IdxT>(                                             \
-      std::move(cuvs::neighbors::ivf_sq::detail::extend<T, IdxT>(                            \
-        handle, new_vectors, new_indices, orig_index)));                                     \
-  }                                                                                          \
-                                                                                             \
-  void extend(raft::resources const& handle,                                                 \
-              raft::host_matrix_view<const T, int64_t, raft::row_major> new_vectors,         \
-              std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,     \
-              cuvs::neighbors::ivf_sq::index<IdxT>* idx)                                     \
-  {                                                                                          \
-    cuvs::neighbors::ivf_sq::detail::extend<T, IdxT>(handle, new_vectors, new_indices, idx); \
+#define CUVS_INST_IVF_SQ_BUILD_EXTEND(T, CodeT)                                               \
+  auto build(raft::resources const& handle,                                                   \
+             const cuvs::neighbors::ivf_sq::index_params& params,                             \
+             raft::device_matrix_view<const T, int64_t, raft::row_major> dataset)             \
+    -> cuvs::neighbors::ivf_sq::index<CodeT>                                                  \
+  {                                                                                           \
+    return cuvs::neighbors::ivf_sq::index<CodeT>(                                             \
+      std::move(cuvs::neighbors::ivf_sq::detail::build<T, CodeT>(handle, params, dataset)));  \
+  }                                                                                           \
+                                                                                              \
+  void build(raft::resources const& handle,                                                   \
+             const cuvs::neighbors::ivf_sq::index_params& params,                             \
+             raft::device_matrix_view<const T, int64_t, raft::row_major> dataset,             \
+             cuvs::neighbors::ivf_sq::index<CodeT>& idx)                                      \
+  {                                                                                           \
+    cuvs::neighbors::ivf_sq::detail::build<T, CodeT>(handle, params, dataset, idx);           \
+  }                                                                                           \
+                                                                                              \
+  auto build(raft::resources const& handle,                                                   \
+             const cuvs::neighbors::ivf_sq::index_params& params,                             \
+             raft::host_matrix_view<const T, int64_t, raft::row_major> dataset)               \
+    -> cuvs::neighbors::ivf_sq::index<CodeT>                                                  \
+  {                                                                                           \
+    return cuvs::neighbors::ivf_sq::index<CodeT>(                                             \
+      std::move(cuvs::neighbors::ivf_sq::detail::build<T, CodeT>(handle, params, dataset)));  \
+  }                                                                                           \
+                                                                                              \
+  void build(raft::resources const& handle,                                                   \
+             const cuvs::neighbors::ivf_sq::index_params& params,                             \
+             raft::host_matrix_view<const T, int64_t, raft::row_major> dataset,               \
+             cuvs::neighbors::ivf_sq::index<CodeT>& idx)                                      \
+  {                                                                                           \
+    cuvs::neighbors::ivf_sq::detail::build<T, CodeT>(handle, params, dataset, idx);           \
+  }                                                                                           \
+                                                                                              \
+  auto extend(raft::resources const& handle,                                                  \
+              raft::device_matrix_view<const T, int64_t, raft::row_major> new_vectors,        \
+              std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,    \
+              const cuvs::neighbors::ivf_sq::index<CodeT>& orig_index)                        \
+    -> cuvs::neighbors::ivf_sq::index<CodeT>                                                  \
+  {                                                                                           \
+    return cuvs::neighbors::ivf_sq::index<CodeT>(                                             \
+      std::move(cuvs::neighbors::ivf_sq::detail::extend<T, CodeT>(                            \
+        handle, new_vectors, new_indices, orig_index)));                                      \
+  }                                                                                           \
+                                                                                              \
+  void extend(raft::resources const& handle,                                                  \
+              raft::device_matrix_view<const T, int64_t, raft::row_major> new_vectors,        \
+              std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,    \
+              cuvs::neighbors::ivf_sq::index<CodeT>* idx)                                     \
+  {                                                                                           \
+    cuvs::neighbors::ivf_sq::detail::extend<T, CodeT>(handle, new_vectors, new_indices, idx); \
+  }                                                                                           \
+                                                                                              \
+  auto extend(raft::resources const& handle,                                                  \
+              raft::host_matrix_view<const T, int64_t, raft::row_major> new_vectors,          \
+              std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,      \
+              const cuvs::neighbors::ivf_sq::index<CodeT>& orig_index)                        \
+    -> cuvs::neighbors::ivf_sq::index<CodeT>                                                  \
+  {                                                                                           \
+    return cuvs::neighbors::ivf_sq::index<CodeT>(                                             \
+      std::move(cuvs::neighbors::ivf_sq::detail::extend<T, CodeT>(                            \
+        handle, new_vectors, new_indices, orig_index)));                                      \
+  }                                                                                           \
+                                                                                              \
+  void extend(raft::resources const& handle,                                                  \
+              raft::host_matrix_view<const T, int64_t, raft::row_major> new_vectors,          \
+              std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,      \
+              cuvs::neighbors::ivf_sq::index<CodeT>* idx)                                     \
+  {                                                                                           \
+    cuvs::neighbors::ivf_sq::detail::extend<T, CodeT>(handle, new_vectors, new_indices, idx); \
   }
 
 CUVS_INST_IVF_SQ_BUILD_EXTEND(float, uint8_t);
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_half_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_half_uint8_t_int64_t.cu
index 9148e5c328..e6900af4a0 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_half_uint8_t_int64_t.cu
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_half_uint8_t_int64_t.cu
@@ -9,77 +9,77 @@
 
 namespace cuvs::neighbors::ivf_sq {
 
-#define CUVS_INST_IVF_SQ_BUILD_EXTEND(T, IdxT)                                               \
-  auto build(raft::resources const& handle,                                                  \
-             const cuvs::neighbors::ivf_sq::index_params& params,                            \
-             raft::device_matrix_view<const T, int64_t, raft::row_major> dataset)            \
-    -> cuvs::neighbors::ivf_sq::index<IdxT>                                                  \
-  {                                                                                          \
-    return cuvs::neighbors::ivf_sq::index<IdxT>(                                             \
-      std::move(cuvs::neighbors::ivf_sq::detail::build<T, IdxT>(handle, params, dataset)));  \
-  }                                                                                          \
-                                                                                             \
-  void build(raft::resources const& handle,                                                  \
-             const cuvs::neighbors::ivf_sq::index_params& params,                            \
-             raft::device_matrix_view<const T, int64_t, raft::row_major> dataset,            \
-             cuvs::neighbors::ivf_sq::index<IdxT>& idx)                                      \
-  {                                                                                          \
-    cuvs::neighbors::ivf_sq::detail::build<T, IdxT>(handle, params, dataset, idx);           \
-  }                                                                                          \
-                                                                                             \
-  auto build(raft::resources const& handle,                                                  \
-             const cuvs::neighbors::ivf_sq::index_params& params,                            \
-             raft::host_matrix_view<const T, int64_t, raft::row_major> dataset)              \
-    -> cuvs::neighbors::ivf_sq::index<IdxT>                                                  \
-  {                                                                                          \
-    return cuvs::neighbors::ivf_sq::index<IdxT>(                                             \
-      std::move(cuvs::neighbors::ivf_sq::detail::build<T, IdxT>(handle, params, dataset)));  \
-  }                                                                                          \
-                                                                                             \
-  void build(raft::resources const& handle,                                                  \
-             const cuvs::neighbors::ivf_sq::index_params& params,                            \
-             raft::host_matrix_view<const T, int64_t, raft::row_major> dataset,              \
-             cuvs::neighbors::ivf_sq::index<IdxT>& idx)                                      \
-  {                                                                                          \
-    cuvs::neighbors::ivf_sq::detail::build<T, IdxT>(handle, params, dataset, idx);           \
-  }                                                                                          \
-                                                                                             \
-  auto extend(raft::resources const& handle,                                                 \
-              raft::device_matrix_view<const T, int64_t, raft::row_major> new_vectors,       \
-              std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,   \
-              const cuvs::neighbors::ivf_sq::index<IdxT>& orig_index)                        \
-    -> cuvs::neighbors::ivf_sq::index<IdxT>                                                  \
-  {                                                                                          \
-    return cuvs::neighbors::ivf_sq::index<IdxT>(                                             \
-      std::move(cuvs::neighbors::ivf_sq::detail::extend<T, IdxT>(                            \
-        handle, new_vectors, new_indices, orig_index)));                                     \
-  }                                                                                          \
-                                                                                             \
-  void extend(raft::resources const& handle,                                                 \
-              raft::device_matrix_view<const T, int64_t, raft::row_major> new_vectors,       \
-              std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,   \
-              cuvs::neighbors::ivf_sq::index<IdxT>* idx)                                     \
-  {                                                                                          \
-    cuvs::neighbors::ivf_sq::detail::extend<T, IdxT>(handle, new_vectors, new_indices, idx); \
-  }                                                                                          \
-                                                                                             \
-  auto extend(raft::resources const& handle,                                                 \
-              raft::host_matrix_view<const T, int64_t, raft::row_major> new_vectors,         \
-              std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,     \
-              const cuvs::neighbors::ivf_sq::index<IdxT>& orig_index)                        \
-    -> cuvs::neighbors::ivf_sq::index<IdxT>                                                  \
-  {                                                                                          \
-    return cuvs::neighbors::ivf_sq::index<IdxT>(                                             \
-      std::move(cuvs::neighbors::ivf_sq::detail::extend<T, IdxT>(                            \
-        handle, new_vectors, new_indices, orig_index)));                                     \
-  }                                                                                          \
-                                                                                             \
-  void extend(raft::resources const& handle,                                                 \
-              raft::host_matrix_view<const T, int64_t, raft::row_major> new_vectors,         \
-              std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,     \
-              cuvs::neighbors::ivf_sq::index<IdxT>* idx)                                     \
-  {                                                                                          \
-    cuvs::neighbors::ivf_sq::detail::extend<T, IdxT>(handle, new_vectors, new_indices, idx); \
+#define CUVS_INST_IVF_SQ_BUILD_EXTEND(T, CodeT)                                               \
+  auto build(raft::resources const& handle,                                                   \
+             const cuvs::neighbors::ivf_sq::index_params& params,                             \
+             raft::device_matrix_view<const T, int64_t, raft::row_major> dataset)             \
+    -> cuvs::neighbors::ivf_sq::index<CodeT>                                                  \
+  {                                                                                           \
+    return cuvs::neighbors::ivf_sq::index<CodeT>(                                             \
+      std::move(cuvs::neighbors::ivf_sq::detail::build<T, CodeT>(handle, params, dataset)));  \
+  }                                                                                           \
+                                                                                              \
+  void build(raft::resources const& handle,                                                   \
+             const cuvs::neighbors::ivf_sq::index_params& params,                             \
+             raft::device_matrix_view<const T, int64_t, raft::row_major> dataset,             \
+             cuvs::neighbors::ivf_sq::index<CodeT>& idx)                                      \
+  {                                                                                           \
+    cuvs::neighbors::ivf_sq::detail::build<T, CodeT>(handle, params, dataset, idx);           \
+  }                                                                                           \
+                                                                                              \
+  auto build(raft::resources const& handle,                                                   \
+             const cuvs::neighbors::ivf_sq::index_params& params,                             \
+             raft::host_matrix_view<const T, int64_t, raft::row_major> dataset)               \
+    -> cuvs::neighbors::ivf_sq::index<CodeT>                                                  \
+  {                                                                                           \
+    return cuvs::neighbors::ivf_sq::index<CodeT>(                                             \
+      std::move(cuvs::neighbors::ivf_sq::detail::build<T, CodeT>(handle, params, dataset)));  \
+  }                                                                                           \
+                                                                                              \
+  void build(raft::resources const& handle,                                                   \
+             const cuvs::neighbors::ivf_sq::index_params& params,                             \
+             raft::host_matrix_view<const T, int64_t, raft::row_major> dataset,               \
+             cuvs::neighbors::ivf_sq::index<CodeT>& idx)                                      \
+  {                                                                                           \
+    cuvs::neighbors::ivf_sq::detail::build<T, CodeT>(handle, params, dataset, idx);           \
+  }                                                                                           \
+                                                                                              \
+  auto extend(raft::resources const& handle,                                                  \
+              raft::device_matrix_view<const T, int64_t, raft::row_major> new_vectors,        \
+              std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,    \
+              const cuvs::neighbors::ivf_sq::index<CodeT>& orig_index)                        \
+    -> cuvs::neighbors::ivf_sq::index<CodeT>                                                  \
+  {                                                                                           \
+    return cuvs::neighbors::ivf_sq::index<CodeT>(                                             \
+      std::move(cuvs::neighbors::ivf_sq::detail::extend<T, CodeT>(                            \
+        handle, new_vectors, new_indices, orig_index)));                                      \
+  }                                                                                           \
+                                                                                              \
+  void extend(raft::resources const& handle,                                                  \
+              raft::device_matrix_view<const T, int64_t, raft::row_major> new_vectors,        \
+              std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,    \
+              cuvs::neighbors::ivf_sq::index<CodeT>* idx)                                     \
+  {                                                                                           \
+    cuvs::neighbors::ivf_sq::detail::extend<T, CodeT>(handle, new_vectors, new_indices, idx); \
+  }                                                                                           \
+                                                                                              \
+  auto extend(raft::resources const& handle,                                                  \
+              raft::host_matrix_view<const T, int64_t, raft::row_major> new_vectors,          \
+              std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,      \
+              const cuvs::neighbors::ivf_sq::index<CodeT>& orig_index)                        \
+    -> cuvs::neighbors::ivf_sq::index<CodeT>                                                  \
+  {                                                                                           \
+    return cuvs::neighbors::ivf_sq::index<CodeT>(                                             \
+      std::move(cuvs::neighbors::ivf_sq::detail::extend<T, CodeT>(                            \
+        handle, new_vectors, new_indices, orig_index)));                                      \
+  }                                                                                           \
+                                                                                              \
+  void extend(raft::resources const& handle,                                                  \
+              raft::host_matrix_view<const T, int64_t, raft::row_major> new_vectors,          \
+              std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,      \
+              cuvs::neighbors::ivf_sq::index<CodeT>* idx)                                     \
+  {                                                                                           \
+    cuvs::neighbors::ivf_sq::detail::extend<T, CodeT>(handle, new_vectors, new_indices, idx); \
   }
 
 CUVS_INST_IVF_SQ_BUILD_EXTEND(half, uint8_t);
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
index 17e6c4f9eb..9728b90796 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
@@ -117,7 +117,11 @@ inline uint32_t configure_grid_dim_x(
 //
 //   After all probes are scanned, the smem is reused for block_sort merge.
 // ---------------------------------------------------------------------------
-template <int BlockDim, int Capacity, SqScanMetric Metric, typename IdxT, typename IvfSampleFilterT>
+template <int BlockDim,
+          int Capacity,
+          SqScanMetric Metric,
+          typename CodeT,
+          typename IvfSampleFilterT>
 __launch_bounds__(BlockDim) RAFT_KERNEL ivf_sq_scan_kernel(const uint8_t* const* data_ptrs,
                                                            const uint32_t* list_sizes,
                                                            const uint32_t* coarse_indices,
@@ -326,8 +330,8 @@ size_t sq_scan_total_smem(uint32_t dim, uint32_t k)
 // ---------------------------------------------------------------------------
 // Launch helper: dispatches on Metric, handles grid_dim_x query vs launch
 // ---------------------------------------------------------------------------
-template <int Capacity, typename IdxT, typename IvfSampleFilterT>
-void ivf_sq_scan_launch(const index<IdxT>& idx,
+template <int Capacity, typename CodeT, typename IvfSampleFilterT>
+void ivf_sq_scan_launch(const index<CodeT>& idx,
                         const float* queries_float,
                         const float* query_norms,
                         uint32_t n_queries,
@@ -425,14 +429,14 @@ void ivf_sq_scan_launch(const index<IdxT>& idx,
   switch (idx.metric()) {
     case cuvs::distance::DistanceType::L2Expanded:
     case cuvs::distance::DistanceType::L2SqrtExpanded:
-      do_launch(ivf_sq_scan_kernel<kThreads, Capacity, SqScanMetric::kL2, IdxT, IvfSampleFilterT>);
+      do_launch(ivf_sq_scan_kernel<kThreads, Capacity, SqScanMetric::kL2, CodeT, IvfSampleFilterT>);
       break;
     case cuvs::distance::DistanceType::InnerProduct:
-      do_launch(ivf_sq_scan_kernel<kThreads, Capacity, SqScanMetric::kIP, IdxT, IvfSampleFilterT>);
+      do_launch(ivf_sq_scan_kernel<kThreads, Capacity, SqScanMetric::kIP, CodeT, IvfSampleFilterT>);
       break;
     case cuvs::distance::DistanceType::CosineExpanded:
       do_launch(
-        ivf_sq_scan_kernel<kThreads, Capacity, SqScanMetric::kCosine, IdxT, IvfSampleFilterT>);
+        ivf_sq_scan_kernel<kThreads, Capacity, SqScanMetric::kCosine, CodeT, IvfSampleFilterT>);
       break;
     default: RAFT_FAIL("Unsupported metric type for IVF-SQ scan.");
   }
@@ -441,9 +445,9 @@ void ivf_sq_scan_launch(const index<IdxT>& idx,
 // ---------------------------------------------------------------------------
 // ivf_sq_scan: top-level scan dispatch with Capacity selection
 // ---------------------------------------------------------------------------
-template <typename IdxT, typename IvfSampleFilterT>
+template <typename CodeT, typename IvfSampleFilterT>
 void ivf_sq_scan(raft::resources const& handle,
-                 const index<IdxT>& idx,
+                 const index<CodeT>& idx,
                  const float* queries_float,
                  const float* query_norms,
                  uint32_t n_queries,
@@ -472,20 +476,20 @@ void ivf_sq_scan(raft::resources const& handle,
   }
 
   auto fwd = [&](auto cap_tag) {
-    ivf_sq_scan_launch<decltype(cap_tag)::value, IdxT, IvfSampleFilterT>(idx,
-                                                                         queries_float,
-                                                                         query_norms,
-                                                                         n_queries,
-                                                                         n_probes,
-                                                                         k,
-                                                                         max_samples,
-                                                                         coarse_indices,
-                                                                         chunk_indices,
-                                                                         out_distances,
-                                                                         out_indices,
-                                                                         sample_filter,
-                                                                         grid_dim_x,
-                                                                         stream);
+    ivf_sq_scan_launch<decltype(cap_tag)::value, CodeT, IvfSampleFilterT>(idx,
+                                                                          queries_float,
+                                                                          query_norms,
+                                                                          n_queries,
+                                                                          n_probes,
+                                                                          k,
+                                                                          max_samples,
+                                                                          coarse_indices,
+                                                                          chunk_indices,
+                                                                          out_distances,
+                                                                          out_indices,
+                                                                          sample_filter,
+                                                                          grid_dim_x,
+                                                                          stream);
   };
 
   switch (capacity) {
@@ -501,9 +505,9 @@ void ivf_sq_scan(raft::resources const& handle,
 // ---------------------------------------------------------------------------
 // search_impl — host-side search logic
 // ---------------------------------------------------------------------------
-template <typename T, typename IdxT, typename IvfSampleFilterT>
+template <typename T, typename CodeT, typename IvfSampleFilterT>
 void search_impl(raft::resources const& handle,
-                 const index<IdxT>& index,
+                 const index<CodeT>& index,
                  const T* queries,
                  uint32_t n_queries,
                  uint32_t k,
@@ -802,11 +806,11 @@ void search_impl(raft::resources const& handle,
 }
 
 template <typename T,
-          typename IdxT,
+          typename CodeT,
           typename IvfSampleFilterT = cuvs::neighbors::filtering::none_sample_filter>
 inline void search_with_filtering(raft::resources const& handle,
                                   const search_params& params,
-                                  const index<IdxT>& index,
+                                  const index<CodeT>& index,
                                   const T* queries,
                                   uint32_t n_queries,
                                   uint32_t k,
@@ -864,24 +868,24 @@ inline void search_with_filtering(raft::resources const& handle,
   for (uint32_t offset_q = 0; offset_q < n_queries; offset_q += max_queries) {
     uint32_t queries_batch = std::min(max_queries, n_queries - offset_q);
 
-    search_impl<T, IdxT, IvfSampleFilterT>(handle,
-                                           index,
-                                           queries + std::size_t(offset_q) * index.dim(),
-                                           queries_batch,
-                                           k,
-                                           n_probes,
-                                           cuvs::distance::is_min_close(index.metric()),
-                                           neighbors + std::size_t(offset_q) * k,
-                                           distances + std::size_t(offset_q) * k,
-                                           raft::resource::get_workspace_resource(handle),
-                                           sample_filter);
+    search_impl<T, CodeT, IvfSampleFilterT>(handle,
+                                            index,
+                                            queries + std::size_t(offset_q) * index.dim(),
+                                            queries_batch,
+                                            k,
+                                            n_probes,
+                                            cuvs::distance::is_min_close(index.metric()),
+                                            neighbors + std::size_t(offset_q) * k,
+                                            distances + std::size_t(offset_q) * k,
+                                            raft::resource::get_workspace_resource(handle),
+                                            sample_filter);
   }
 }
 
-template <typename T, typename IdxT, typename IvfSampleFilterT>
+template <typename T, typename CodeT, typename IvfSampleFilterT>
 void search_with_filtering(raft::resources const& handle,
                            const search_params& params,
-                           const index<IdxT>& index,
+                           const index<CodeT>& index,
                            raft::device_matrix_view<const T, int64_t, raft::row_major> queries,
                            raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors,
                            raft::device_matrix_view<float, int64_t, raft::row_major> distances,
@@ -906,10 +910,10 @@ void search_with_filtering(raft::resources const& handle,
                         sample_filter);
 }
 
-template <typename T, typename IdxT>
+template <typename T, typename CodeT>
 void search(raft::resources const& handle,
             const search_params& params,
-            const index<IdxT>& idx,
+            const index<CodeT>& idx,
             raft::device_matrix_view<const T, int64_t, raft::row_major> queries,
             raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors,
             raft::device_matrix_view<float, int64_t, raft::row_major> distances,
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_search_float_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_sq/ivf_sq_search_float_uint8_t_int64_t.cu
index 60d95a153f..de185de8ec 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_search_float_uint8_t_int64_t.cu
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_search_float_uint8_t_int64_t.cu
@@ -9,10 +9,10 @@
 
 namespace cuvs::neighbors::ivf_sq {
 
-#define CUVS_INST_IVF_SQ_SEARCH(T, IdxT)                                             \
+#define CUVS_INST_IVF_SQ_SEARCH(T, CodeT)                                            \
   void search(raft::resources const& handle,                                         \
               const cuvs::neighbors::ivf_sq::search_params& params,                  \
-              const cuvs::neighbors::ivf_sq::index<IdxT>& index,                     \
+              const cuvs::neighbors::ivf_sq::index<CodeT>& index,                    \
               raft::device_matrix_view<const T, int64_t, raft::row_major> queries,   \
               raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors, \
               raft::device_matrix_view<float, int64_t, raft::row_major> distances,   \
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_search_half_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_sq/ivf_sq_search_half_uint8_t_int64_t.cu
index fbed3fd432..40029119b2 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_search_half_uint8_t_int64_t.cu
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_search_half_uint8_t_int64_t.cu
@@ -9,10 +9,10 @@
 
 namespace cuvs::neighbors::ivf_sq {
 
-#define CUVS_INST_IVF_SQ_SEARCH(T, IdxT)                                             \
+#define CUVS_INST_IVF_SQ_SEARCH(T, CodeT)                                            \
   void search(raft::resources const& handle,                                         \
               const cuvs::neighbors::ivf_sq::search_params& params,                  \
-              const cuvs::neighbors::ivf_sq::index<IdxT>& index,                     \
+              const cuvs::neighbors::ivf_sq::index<CodeT>& index,                    \
               raft::device_matrix_view<const T, int64_t, raft::row_major> queries,   \
               raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors, \
               raft::device_matrix_view<float, int64_t, raft::row_major> distances,   \
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_serialize.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_serialize.cuh
index 8aa1f12e04..b201cceee7 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_serialize.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_serialize.cuh
@@ -22,13 +22,13 @@ namespace cuvs::neighbors::ivf_sq::detail {
 
 constexpr int serialization_version = 1;
 
-template <typename IdxT>
-void serialize(raft::resources const& handle, std::ostream& os, const index<IdxT>& index_)
+template <typename CodeT>
+void serialize(raft::resources const& handle, std::ostream& os, const index<CodeT>& index_)
 {
   RAFT_LOG_DEBUG(
     "Saving IVF-SQ index, size %zu, dim %u", static_cast<size_t>(index_.size()), index_.dim());
 
-  std::string dtype_string = raft::detail::numpy_serializer::get_numpy_dtype<IdxT>().to_string();
+  std::string dtype_string = raft::detail::numpy_serializer::get_numpy_dtype<CodeT>().to_string();
   dtype_string.resize(4);
   os << dtype_string;
 
@@ -60,7 +60,7 @@ void serialize(raft::resources const& handle, std::ostream& os, const index<IdxT
   raft::resource::sync_stream(handle);
   serialize_mdspan(handle, os, sizes_host.view());
 
-  list_spec<uint32_t, IdxT, int64_t> list_store_spec{index_.dim(), true};
+  list_spec<uint32_t, CodeT, int64_t> list_store_spec{index_.dim(), true};
   for (uint32_t label = 0; label < index_.n_lists(); label++) {
     ivf::serialize_list(handle,
                         os,
@@ -71,10 +71,10 @@ void serialize(raft::resources const& handle, std::ostream& os, const index<IdxT
   raft::resource::sync_stream(handle);
 }
 
-template <typename IdxT>
+template <typename CodeT>
 void serialize(raft::resources const& handle,
                const std::string& filename,
-               const index<IdxT>& index_)
+               const index<CodeT>& index_)
 {
   std::ofstream of(filename, std::ios::out | std::ios::binary);
   if (!of) { RAFT_FAIL("Cannot open file %s", filename.c_str()); }
@@ -83,8 +83,8 @@ void serialize(raft::resources const& handle,
   if (!of) { RAFT_FAIL("Error writing output %s", filename.c_str()); }
 }
 
-template <typename IdxT>
-auto deserialize(raft::resources const& handle, std::istream& is) -> index<IdxT>
+template <typename CodeT>
+auto deserialize(raft::resources const& handle, std::istream& is) -> index<CodeT>
 {
   char dtype_string[4];
   is.read(dtype_string, 4);
@@ -99,7 +99,7 @@ auto deserialize(raft::resources const& handle, std::istream& is) -> index<IdxT>
   auto metric  = raft::deserialize_scalar<cuvs::distance::DistanceType>(handle, is);
   bool cma     = raft::deserialize_scalar<bool>(handle, is);
 
-  index<IdxT> index_ = index<IdxT>(handle, metric, n_lists, dim, cma);
+  index<CodeT> index_ = index<CodeT>(handle, metric, n_lists, dim, cma);
 
   deserialize_mdspan(handle, is, index_.centers());
 
@@ -119,8 +119,8 @@ auto deserialize(raft::resources const& handle, std::istream& is) -> index<IdxT>
 
   deserialize_mdspan(handle, is, index_.list_sizes());
 
-  list_spec<uint32_t, IdxT, int64_t> list_device_spec{index_.dim(), cma};
-  list_spec<uint32_t, IdxT, int64_t> list_store_spec{index_.dim(), true};
+  list_spec<uint32_t, CodeT, int64_t> list_device_spec{index_.dim(), cma};
+  list_spec<uint32_t, CodeT, int64_t> list_store_spec{index_.dim(), true};
   for (uint32_t label = 0; label < index_.n_lists(); label++) {
     ivf::deserialize_list(handle, is, index_.lists()[label], list_store_spec, list_device_spec);
   }
@@ -131,29 +131,29 @@ auto deserialize(raft::resources const& handle, std::istream& is) -> index<IdxT>
   return index_;
 }
 
-template <typename IdxT>
-auto deserialize(raft::resources const& handle, const std::string& filename) -> index<IdxT>
+template <typename CodeT>
+auto deserialize(raft::resources const& handle, const std::string& filename) -> index<CodeT>
 {
   std::ifstream is(filename, std::ios::in | std::ios::binary);
   if (!is) { RAFT_FAIL("Cannot open file %s", filename.c_str()); }
-  auto index = detail::deserialize<IdxT>(handle, is);
+  auto index = detail::deserialize<CodeT>(handle, is);
   is.close();
   return index;
 }
 
 }  // namespace cuvs::neighbors::ivf_sq::detail
 
-#define CUVS_INST_IVF_SQ_SERIALIZE(IdxT)                                           \
-  void serialize(raft::resources const& handle,                                    \
-                 const std::string& filename,                                      \
-                 const cuvs::neighbors::ivf_sq::index<IdxT>& index)                \
-  {                                                                                \
-    cuvs::neighbors::ivf_sq::detail::serialize(handle, filename, index);           \
-  }                                                                                \
-                                                                                   \
-  void deserialize(raft::resources const& handle,                                  \
-                   const std::string& filename,                                    \
-                   cuvs::neighbors::ivf_sq::index<IdxT>* index)                    \
-  {                                                                                \
-    *index = cuvs::neighbors::ivf_sq::detail::deserialize<IdxT>(handle, filename); \
+#define CUVS_INST_IVF_SQ_SERIALIZE(CodeT)                                           \
+  void serialize(raft::resources const& handle,                                     \
+                 const std::string& filename,                                       \
+                 const cuvs::neighbors::ivf_sq::index<CodeT>& index)                \
+  {                                                                                 \
+    cuvs::neighbors::ivf_sq::detail::serialize(handle, filename, index);            \
+  }                                                                                 \
+                                                                                    \
+  void deserialize(raft::resources const& handle,                                   \
+                   const std::string& filename,                                     \
+                   cuvs::neighbors::ivf_sq::index<CodeT>* index)                    \
+  {                                                                                 \
+    *index = cuvs::neighbors::ivf_sq::detail::deserialize<CodeT>(handle, filename); \
   }
diff --git a/cpp/src/neighbors/ivf_sq_index.cpp b/cpp/src/neighbors/ivf_sq_index.cpp
index bf5b6df288..5a110a476c 100644
--- a/cpp/src/neighbors/ivf_sq_index.cpp
+++ b/cpp/src/neighbors/ivf_sq_index.cpp
@@ -12,24 +12,24 @@
 
 namespace cuvs::neighbors::ivf_sq {
 
-template <typename IdxT>
-index<IdxT>::index(raft::resources const& res)
+template <typename CodeT>
+index<CodeT>::index(raft::resources const& res)
   : index(res, cuvs::distance::DistanceType::L2Expanded, 0, 0, false)
 {
 }
 
-template <typename IdxT>
-index<IdxT>::index(raft::resources const& res, const index_params& params, uint32_t dim)
+template <typename CodeT>
+index<CodeT>::index(raft::resources const& res, const index_params& params, uint32_t dim)
   : index(res, params.metric, params.n_lists, dim, params.conservative_memory_allocation)
 {
 }
 
-template <typename IdxT>
-index<IdxT>::index(raft::resources const& res,
-                   cuvs::distance::DistanceType metric,
-                   uint32_t n_lists,
-                   uint32_t dim,
-                   bool conservative_memory_allocation)
+template <typename CodeT>
+index<CodeT>::index(raft::resources const& res,
+                    cuvs::distance::DistanceType metric,
+                    uint32_t n_lists,
+                    uint32_t dim,
+                    bool conservative_memory_allocation)
   : cuvs::neighbors::index(),
     metric_(metric),
     conservative_memory_allocation_(conservative_memory_allocation),
@@ -39,7 +39,7 @@ index<IdxT>::index(raft::resources const& res,
     center_norms_(std::nullopt),
     sq_vmin_{raft::make_device_vector<float, uint32_t>(res, dim)},
     sq_delta_{raft::make_device_vector<float, uint32_t>(res, dim)},
-    data_ptrs_{raft::make_device_vector<IdxT*, uint32_t>(res, n_lists)},
+    data_ptrs_{raft::make_device_vector<CodeT*, uint32_t>(res, n_lists)},
     inds_ptrs_{raft::make_device_vector<int64_t*, uint32_t>(res, n_lists)},
     accum_sorted_sizes_{raft::make_host_vector<int64_t, uint32_t>(n_lists + 1)}
 {
@@ -49,68 +49,68 @@ index<IdxT>::index(raft::resources const& res,
   RAFT_CUDA_TRY(
     cudaMemsetAsync(list_sizes_.data_handle(), 0, list_sizes_.size() * sizeof(uint32_t), stream));
   RAFT_CUDA_TRY(
-    cudaMemsetAsync(data_ptrs_.data_handle(), 0, data_ptrs_.size() * sizeof(IdxT*), stream));
+    cudaMemsetAsync(data_ptrs_.data_handle(), 0, data_ptrs_.size() * sizeof(CodeT*), stream));
   RAFT_CUDA_TRY(
     cudaMemsetAsync(inds_ptrs_.data_handle(), 0, inds_ptrs_.size() * sizeof(int64_t*), stream));
 }
 
-template <typename IdxT>
-cuvs::distance::DistanceType index<IdxT>::metric() const noexcept
+template <typename CodeT>
+cuvs::distance::DistanceType index<CodeT>::metric() const noexcept
 {
   return metric_;
 }
 
-template <typename IdxT>
-int64_t index<IdxT>::size() const noexcept
+template <typename CodeT>
+int64_t index<CodeT>::size() const noexcept
 {
   return accum_sorted_sizes()(n_lists());
 }
 
-template <typename IdxT>
-uint32_t index<IdxT>::dim() const noexcept
+template <typename CodeT>
+uint32_t index<CodeT>::dim() const noexcept
 {
   return centers_.extent(1);
 }
 
-template <typename IdxT>
-uint32_t index<IdxT>::n_lists() const noexcept
+template <typename CodeT>
+uint32_t index<CodeT>::n_lists() const noexcept
 {
   return lists_.size();
 }
 
-template <typename IdxT>
-bool index<IdxT>::conservative_memory_allocation() const noexcept
+template <typename CodeT>
+bool index<CodeT>::conservative_memory_allocation() const noexcept
 {
   return conservative_memory_allocation_;
 }
 
-template <typename IdxT>
-raft::device_vector_view<uint32_t, uint32_t> index<IdxT>::list_sizes() noexcept
+template <typename CodeT>
+raft::device_vector_view<uint32_t, uint32_t> index<CodeT>::list_sizes() noexcept
 {
   return list_sizes_.view();
 }
 
-template <typename IdxT>
-raft::device_vector_view<const uint32_t, uint32_t> index<IdxT>::list_sizes() const noexcept
+template <typename CodeT>
+raft::device_vector_view<const uint32_t, uint32_t> index<CodeT>::list_sizes() const noexcept
 {
   return list_sizes_.view();
 }
 
-template <typename IdxT>
-raft::device_matrix_view<float, uint32_t, raft::row_major> index<IdxT>::centers() noexcept
+template <typename CodeT>
+raft::device_matrix_view<float, uint32_t, raft::row_major> index<CodeT>::centers() noexcept
 {
   return centers_.view();
 }
 
-template <typename IdxT>
-raft::device_matrix_view<const float, uint32_t, raft::row_major> index<IdxT>::centers()
+template <typename CodeT>
+raft::device_matrix_view<const float, uint32_t, raft::row_major> index<CodeT>::centers()
   const noexcept
 {
   return centers_.view();
 }
 
-template <typename IdxT>
-std::optional<raft::device_vector_view<float, uint32_t>> index<IdxT>::center_norms() noexcept
+template <typename CodeT>
+std::optional<raft::device_vector_view<float, uint32_t>> index<CodeT>::center_norms() noexcept
 {
   if (center_norms_.has_value()) {
     return std::make_optional<raft::device_vector_view<float, uint32_t>>(center_norms_->view());
@@ -119,8 +119,8 @@ std::optional<raft::device_vector_view<float, uint32_t>> index<IdxT>::center_nor
   }
 }
 
-template <typename IdxT>
-std::optional<raft::device_vector_view<const float, uint32_t>> index<IdxT>::center_norms()
+template <typename CodeT>
+std::optional<raft::device_vector_view<const float, uint32_t>> index<CodeT>::center_norms()
   const noexcept
 {
   if (center_norms_.has_value()) {
@@ -131,8 +131,8 @@ std::optional<raft::device_vector_view<const float, uint32_t>> index<IdxT>::cent
   }
 }
 
-template <typename IdxT>
-void index<IdxT>::allocate_center_norms(raft::resources const& res)
+template <typename CodeT>
+void index<CodeT>::allocate_center_norms(raft::resources const& res)
 {
   switch (metric_) {
     case cuvs::distance::DistanceType::L2Expanded:
@@ -146,80 +146,80 @@ void index<IdxT>::allocate_center_norms(raft::resources const& res)
   }
 }
 
-template <typename IdxT>
-raft::device_vector_view<float, uint32_t> index<IdxT>::sq_vmin() noexcept
+template <typename CodeT>
+raft::device_vector_view<float, uint32_t> index<CodeT>::sq_vmin() noexcept
 {
   return sq_vmin_.view();
 }
 
-template <typename IdxT>
-raft::device_vector_view<const float, uint32_t> index<IdxT>::sq_vmin() const noexcept
+template <typename CodeT>
+raft::device_vector_view<const float, uint32_t> index<CodeT>::sq_vmin() const noexcept
 {
   return sq_vmin_.view();
 }
 
-template <typename IdxT>
-raft::device_vector_view<float, uint32_t> index<IdxT>::sq_delta() noexcept
+template <typename CodeT>
+raft::device_vector_view<float, uint32_t> index<CodeT>::sq_delta() noexcept
 {
   return sq_delta_.view();
 }
 
-template <typename IdxT>
-raft::device_vector_view<const float, uint32_t> index<IdxT>::sq_delta() const noexcept
+template <typename CodeT>
+raft::device_vector_view<const float, uint32_t> index<CodeT>::sq_delta() const noexcept
 {
   return sq_delta_.view();
 }
 
-template <typename IdxT>
-raft::host_vector_view<int64_t, uint32_t> index<IdxT>::accum_sorted_sizes() noexcept
+template <typename CodeT>
+raft::host_vector_view<int64_t, uint32_t> index<CodeT>::accum_sorted_sizes() noexcept
 {
   return accum_sorted_sizes_.view();
 }
 
-template <typename IdxT>
-raft::host_vector_view<const int64_t, uint32_t> index<IdxT>::accum_sorted_sizes() const noexcept
+template <typename CodeT>
+raft::host_vector_view<const int64_t, uint32_t> index<CodeT>::accum_sorted_sizes() const noexcept
 {
   return accum_sorted_sizes_.view();
 }
 
-template <typename IdxT>
-raft::device_vector_view<IdxT*, uint32_t> index<IdxT>::data_ptrs() noexcept
+template <typename CodeT>
+raft::device_vector_view<CodeT*, uint32_t> index<CodeT>::data_ptrs() noexcept
 {
   return data_ptrs_.view();
 }
 
-template <typename IdxT>
-raft::device_vector_view<IdxT* const, uint32_t> index<IdxT>::data_ptrs() const noexcept
+template <typename CodeT>
+raft::device_vector_view<CodeT* const, uint32_t> index<CodeT>::data_ptrs() const noexcept
 {
   return data_ptrs_.view();
 }
 
-template <typename IdxT>
-raft::device_vector_view<int64_t*, uint32_t> index<IdxT>::inds_ptrs() noexcept
+template <typename CodeT>
+raft::device_vector_view<int64_t*, uint32_t> index<CodeT>::inds_ptrs() noexcept
 {
   return inds_ptrs_.view();
 }
 
-template <typename IdxT>
-raft::device_vector_view<int64_t* const, uint32_t> index<IdxT>::inds_ptrs() const noexcept
+template <typename CodeT>
+raft::device_vector_view<int64_t* const, uint32_t> index<CodeT>::inds_ptrs() const noexcept
 {
   return inds_ptrs_.view();
 }
 
-template <typename IdxT>
-std::vector<std::shared_ptr<list_data<IdxT, int64_t>>>& index<IdxT>::lists() noexcept
+template <typename CodeT>
+std::vector<std::shared_ptr<list_data<CodeT, int64_t>>>& index<CodeT>::lists() noexcept
 {
   return lists_;
 }
 
-template <typename IdxT>
-const std::vector<std::shared_ptr<list_data<IdxT, int64_t>>>& index<IdxT>::lists() const noexcept
+template <typename CodeT>
+const std::vector<std::shared_ptr<list_data<CodeT, int64_t>>>& index<CodeT>::lists() const noexcept
 {
   return lists_;
 }
 
-template <typename IdxT>
-void index<IdxT>::check_consistency()
+template <typename CodeT>
+void index<CodeT>::check_consistency()
 {
   auto n_lists = lists_.size();
   RAFT_EXPECTS(list_sizes_.extent(0) == n_lists, "inconsistent list size");

From 8c44557997c77a0d41c865e8c046fc7a01ba155a Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Mon, 20 Apr 2026 16:12:20 +0200
Subject: [PATCH 26/43] addressing review

---
 cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh     | 229 +++++++++++++-----
 cpp/tests/neighbors/ann_ivf_sq.cuh            |  60 +++--
 .../ann_ivf_sq/test_float_int64_t.cu          |   5 +-
 .../neighbors/ann_ivf_sq/test_half_int64_t.cu |   5 +-
 4 files changed, 207 insertions(+), 92 deletions(-)

diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
index 7940d0d7fb..570ccbb872 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
@@ -56,15 +56,99 @@ struct ColMinMaxOp {
   }
 };
 
+/**
+ * Vectorized load helper: reads VecCols contiguous elements of type T as
+ * a single aligned wide load and unpacks them into floats.
+ *
+ * The primary benefit over scalar loads is halving (VecCols=2) or
+ * quartering (VecCols=4) the number of LDG instructions issued per warp,
+ * which is the dominant cost in the column-strided access pattern of
+ * fused_column_minmax_kernel. VecCols=1 is provided as the degenerate
+ * scalar fallback for odd `dim`.
+ *
+ * Requires `p` to be aligned to sizeof(T) * VecCols.
+ */
+template <typename T, int VecCols>
+struct vec_loader;
+
+template <>
+struct vec_loader<float, 1> {
+  __device__ __forceinline__ static void load(const float* p, float (&out)[1]) { out[0] = *p; }
+};
+
+template <>
+struct vec_loader<half, 1> {
+  __device__ __forceinline__ static void load(const half* p, float (&out)[1])
+  {
+    out[0] = float(*p);
+  }
+};
+
+template <>
+struct vec_loader<float, 4> {
+  __device__ __forceinline__ static void load(const float* p, float (&out)[4])
+  {
+    float4 v = *reinterpret_cast<const float4*>(p);
+    out[0]   = v.x;
+    out[1]   = v.y;
+    out[2]   = v.z;
+    out[3]   = v.w;
+  }
+};
+
+template <>
+struct vec_loader<float, 2> {
+  __device__ __forceinline__ static void load(const float* p, float (&out)[2])
+  {
+    float2 v = *reinterpret_cast<const float2*>(p);
+    out[0]   = v.x;
+    out[1]   = v.y;
+  }
+};
+
+template <>
+struct vec_loader<half, 4> {
+  __device__ __forceinline__ static void load(const half* p, float (&out)[4])
+  {
+    // Single 8-byte load covering 4 halves; memcpy avoids aliasing issues
+    // and is compiled to a register move in device code.
+    uint2 raw = *reinterpret_cast<const uint2*>(p);
+    half h[4];
+    static_assert(sizeof(h) == sizeof(raw), "unexpected half packing");
+    memcpy(&h[0], &raw, sizeof(raw));
+#pragma unroll
+    for (int k = 0; k < 4; ++k)
+      out[k] = float(h[k]);
+  }
+};
+
+template <>
+struct vec_loader<half, 2> {
+  __device__ __forceinline__ static void load(const half* p, float (&out)[2])
+  {
+    uint32_t raw = *reinterpret_cast<const uint32_t*>(p);
+    half h[2];
+    static_assert(sizeof(h) == sizeof(raw), "unexpected half packing");
+    memcpy(&h[0], &raw, sizeof(raw));
+    out[0] = float(h[0]);
+    out[1] = float(h[1]);
+  }
+};
+
 /**
  * Fused per-column min+max in a single pass (2x less DRAM traffic than two
- * separate reductions).  One thread block per column; threads stride over
- * rows and feed CUB BlockReduce with a combined min/max pair.
+ * separate reductions). Each block owns VecCols contiguous columns and
+ * threads read them with a single aligned wide load (float4/float2 for
+ * float, uint2/uint32 for half) per row instead of VecCols scalar loads.
+ * Requires dim to be a multiple of VecCols so that
+ * `data + row*dim + col_base` is `sizeof(T)*VecCols` aligned for every
+ * row; the host-side dispatcher picks the widest VecCols that satisfies
+ * this, down to VecCols=1 (pure scalar fallback) for arbitrary dim.
  *
  * Row-loop is manually 4x-unrolled so the compiler can overlap four
  * independent read-only loads in the memory pipeline.
  */
-template <int BlockSize, typename T>
+template <int BlockSize, int VecCols, typename T>
 __launch_bounds__(BlockSize) RAFT_KERNEL fused_column_minmax_kernel(const T* __restrict__ data,
                                                                     float* __restrict__ col_min,
                                                                     float* __restrict__ col_max,
@@ -74,33 +158,76 @@ __launch_bounds__(BlockSize) RAFT_KERNEL fused_column_minmax_kernel(const T* __r
   using BlockReduce = cub::BlockReduce<ColMinMaxPair, BlockSize>;
   __shared__ typename BlockReduce::TempStorage temp_storage;
 
-  const uint32_t col = blockIdx.x;
-  if (col >= dim) return;
+  const uint32_t col_base = blockIdx.x * VecCols;
+  // When launched with gridDim.x = dim / VecCols and dim % VecCols == 0
+  // (enforced by the host-side dispatch), col_base + VecCols <= dim always.
 
-  ColMinMaxPair agg = {std::numeric_limits<float>::max(), std::numeric_limits<float>::lowest()};
+  ColMinMaxPair agg[VecCols];
+#pragma unroll
+  for (int k = 0; k < VecCols; ++k) {
+    agg[k] = {std::numeric_limits<float>::max(), std::numeric_limits<float>::lowest()};
+  }
 
   const int64_t stride = static_cast<int64_t>(BlockSize);
   int64_t row          = static_cast<int64_t>(threadIdx.x);
 
+  // 4x row-unrolled loop with vectorized loads: 4 * VecCols values per iter
+  // are pulled into registers via 4 wide LDGs, exposing ILP across both
+  // the column and row axes.
   for (; row + 3 * stride < n_rows; row += 4 * stride) {
-    float v0    = float(data[row * dim + col]);
-    float v1    = float(data[(row + stride) * dim + col]);
-    float v2    = float(data[(row + 2 * stride) * dim + col]);
-    float v3    = float(data[(row + 3 * stride) * dim + col]);
-    agg.min_val = fminf(agg.min_val, fminf(fminf(v0, v1), fminf(v2, v3)));
-    agg.max_val = fmaxf(agg.max_val, fmaxf(fmaxf(v0, v1), fmaxf(v2, v3)));
+    float r0[VecCols], r1[VecCols], r2[VecCols], r3[VecCols];
+    vec_loader<T, VecCols>::load(data + row * dim + col_base, r0);
+    vec_loader<T, VecCols>::load(data + (row + stride) * dim + col_base, r1);
+    vec_loader<T, VecCols>::load(data + (row + 2 * stride) * dim + col_base, r2);
+    vec_loader<T, VecCols>::load(data + (row + 3 * stride) * dim + col_base, r3);
+#pragma unroll
+    for (int k = 0; k < VecCols; ++k) {
+      float mn       = fminf(fminf(r0[k], r1[k]), fminf(r2[k], r3[k]));
+      float mx       = fmaxf(fmaxf(r0[k], r1[k]), fmaxf(r2[k], r3[k]));
+      agg[k].min_val = fminf(agg[k].min_val, mn);
+      agg[k].max_val = fmaxf(agg[k].max_val, mx);
+    }
   }
   for (; row < n_rows; row += stride) {
-    float val   = float(data[row * dim + col]);
-    agg.min_val = fminf(agg.min_val, val);
-    agg.max_val = fmaxf(agg.max_val, val);
+    float r[VecCols];
+    vec_loader<T, VecCols>::load(data + row * dim + col_base, r);
+#pragma unroll
+    for (int k = 0; k < VecCols; ++k) {
+      agg[k].min_val = fminf(agg[k].min_val, r[k]);
+      agg[k].max_val = fmaxf(agg[k].max_val, r[k]);
+    }
   }
 
-  agg = BlockReduce(temp_storage).Reduce(agg, ColMinMaxOp());
+  // One block-reduce per owned column. CUB requires a __syncthreads()
+  // between reuses of the shared temp_storage.
+  for (int k = 0; k < VecCols; ++k) {
+    if (k > 0) __syncthreads();
+    auto r = BlockReduce(temp_storage).Reduce(agg[k], ColMinMaxOp());
+    if (threadIdx.x == 0) {
+      col_min[col_base + k] = r.min_val;
+      col_max[col_base + k] = r.max_val;
+    }
+  }
+}
 
-  if (threadIdx.x == 0) {
-    col_min[col] = agg.min_val;
-    col_max[col] = agg.max_val;
+/**
+ * Host-side dispatch that selects the widest VecCols compatible with the
+ * given dim alignment (VecCols in {4, 2, 1}), and launches
+ * fused_column_minmax_kernel with the corresponding grid shape.
+ */
+template <int BlockSize, typename T>
+inline void launch_fused_column_minmax(
+  const T* data, float* col_min, float* col_max, int64_t n_rows, uint32_t dim, cudaStream_t stream)
+{
+  if (dim % 4 == 0) {
+    fused_column_minmax_kernel<BlockSize, 4, T>
+      <<<dim / 4, BlockSize, 0, stream>>>(data, col_min, col_max, n_rows, dim);
+  } else if (dim % 2 == 0) {
+    fused_column_minmax_kernel<BlockSize, 2, T>
+      <<<dim / 2, BlockSize, 0, stream>>>(data, col_min, col_max, n_rows, dim);
+  } else {
+    fused_column_minmax_kernel<BlockSize, 1, T>
+      <<<dim, BlockSize, 0, stream>>>(data, col_min, col_max, n_rows, dim);
   }
 }
 
@@ -141,15 +268,21 @@ auto clone(const raft::resources& res, const index<CodeT>& source) -> index<Code
 }
 
 /**
- * Kernel to encode float residuals to uint8_t SQ codes and write them interleaved.
+ * Kernel to compute residuals on the fly and encode them to uint8_t SQ codes
+ * interleaved into the target list.
  *
  * Uses warp-per-vector parallelism: each warp cooperatively encodes one vector
- * so that reads from residuals/vmin/delta are coalesced across the 32 lanes.
- * Lane 0 handles the atomic position assignment and the index write.
+ * so that reads from new_vectors/centers/vmin/delta are coalesced across the
+ * 32 lanes. Lane 0 handles the atomic position assignment and the index write.
+ *
+ * The residual (cast<float>(new_vectors[i,d]) - centers[list_id,d]) is computed
+ * in registers and consumed immediately, avoiding a full HBM round trip through
+ * an intermediate residuals buffer.
  */
-template <int BlockSize>
+template <int BlockSize, typename T>
 __launch_bounds__(BlockSize) RAFT_KERNEL encode_and_fill_kernel(const uint32_t* labels,
-                                                                const float* residuals,
+                                                                const T* new_vectors,
+                                                                const float* centers,
                                                                 const int64_t* source_ixs,
                                                                 uint8_t** list_data_ptrs,
                                                                 int64_t** list_index_ptrs,
@@ -186,12 +319,13 @@ __launch_bounds__(BlockSize) RAFT_KERNEL encode_and_fill_kernel(const uint32_t*
   constexpr uint32_t veclen = list_spec<uint32_t, uint8_t, int64_t>::kVecLen;
   uint32_t padded_dim       = ((dim + veclen - 1) / veclen) * veclen;
   auto* list_dat   = list_data_ptrs[list_id] + static_cast<size_t>(group_offset) * padded_dim;
-  const float* src = residuals + row_id * dim;
+  const T* src     = new_vectors + row_id * dim;
+  const float* ctr = centers + static_cast<size_t>(list_id) * dim;
 
   for (uint32_t d = lane_id; d < padded_dim; d += kWarpSize) {
     uint8_t out;
     if (d < dim) {
-      float val  = src[d];
+      float val  = utils::mapping<float>{}(src[d]) - ctr[d];
       float dv   = delta[d];
       float v    = vmin[d];
       float code = (dv > 0.0f) ? roundf((val - v) / dv) : 0.0f;
@@ -271,6 +405,8 @@ void extend(raft::resources const& handle,
                                             enable_prefetch);
   vec_batches.prefetch_next_batch();
 
+  const bool needs_prefetch_sync = enable_prefetch && vec_batches.does_copy();
+
   for (const auto& batch : vec_batches) {
     auto batch_data_view =
       raft::make_device_matrix_view<const T, int64_t>(batch.data(), batch.size(), index->dim());
@@ -279,7 +415,7 @@ void extend(raft::resources const& handle,
     cuvs::cluster::kmeans::predict(
       handle, kmeans_params, batch_data_view, orig_centroids_view, batch_labels_view);
     vec_batches.prefetch_next_batch();
-    raft::resource::sync_stream(handle);
+    if (needs_prefetch_sync) { raft::resource::sync_stream(handle); }
   }
 
   auto* list_sizes_ptr    = index->list_sizes().data_handle();
@@ -320,41 +456,21 @@ void extend(raft::resources const& handle,
   vec_batches.prefetch_next_batch();
   utils::batch_load_iterator<int64_t> idx_batch = vec_indices.begin();
 
-  auto residuals_buf = raft::make_device_vector<float>(handle, max_batch_size * dim);
-
   size_t next_report_offset = 0;
   size_t d_report_offset    = n_rows * 5 / 100;
 
   for (const auto& batch : vec_batches) {
     int64_t bs = batch.size();
 
-    {
-      auto batch_view = raft::make_device_matrix_view<const T, int64_t>(batch.data(), bs, dim);
-      auto residuals_view =
-        raft::make_device_matrix_view<float, int64_t>(residuals_buf.data_handle(), bs, dim);
-
-      const float* centers_ptr   = index->centers().data_handle();
-      const uint32_t* labels_ptr = new_labels.data_handle() + batch.offset();
-
-      raft::linalg::map_offset(
-        handle,
-        residuals_view,
-        [centers_ptr, labels_ptr, dim] __device__(auto idx, T x) {
-          auto i = idx / dim;
-          auto j = idx % dim;
-          return utils::mapping<float>{}(x)-centers_ptr[labels_ptr[i] * dim + j];
-        },
-        batch_view);
-    }
-
     {
       constexpr int kEncodeBlockSize   = 256;
       constexpr int kEncodeWarpsPerBlk = kEncodeBlockSize / kIndexGroupSize;
       const dim3 block_dim(kEncodeBlockSize);
       const dim3 grid_dim(raft::ceildiv<int64_t>(bs, int64_t(kEncodeWarpsPerBlk)));
-      encode_and_fill_kernel<kEncodeBlockSize>
+      encode_and_fill_kernel<kEncodeBlockSize, T>
         <<<grid_dim, block_dim, 0, stream>>>(new_labels.data_handle() + batch.offset(),
-                                             residuals_buf.data_handle(),
+                                             batch.data(),
+                                             index->centers().data_handle(),
                                              idx_batch->data(),
                                              index->data_ptrs().data_handle(),
                                              index->inds_ptrs().data_handle(),
@@ -368,8 +484,7 @@ void extend(raft::resources const& handle,
     }
 
     vec_batches.prefetch_next_batch();
-    raft::resource::sync_stream(handle);
-    RAFT_CUDA_TRY(cudaPeekAtLastError());
+    if (needs_prefetch_sync) { raft::resource::sync_stream(handle); }
 
     if (batch.offset() > next_report_offset) {
       float progress = batch.offset() * 100.0f / n_rows;
@@ -456,13 +571,11 @@ inline auto build(
     // Train SQ: predict labels for the training subset, compute residuals in-place,
     // and derive per-dimension vmin/delta from them.
     {
-      auto train_labels = raft::make_device_vector<uint32_t, int64_t>(handle, n_rows_train);
-      cuvs::cluster::kmeans::balanced_params pred_params;
-      pred_params.metric      = idx.metric();
+      auto train_labels       = raft::make_device_vector<uint32_t, int64_t>(handle, n_rows_train);
       auto centers_const_view = raft::make_device_matrix_view<const float, int64_t>(
         idx.centers().data_handle(), idx.n_lists(), dim);
       cuvs::cluster::kmeans::predict(
-        handle, pred_params, trainset_const_view, centers_const_view, train_labels.view());
+        handle, kmeans_params, trainset_const_view, centers_const_view, train_labels.view());
 
       constexpr int kResidualBlockSize = 256;
       compute_residuals_inplace_kernel<T>
@@ -483,8 +596,8 @@ inline auto build(
       auto* vmax_ptr = vmax_buf.data_handle();
 
       constexpr int kMinMaxBlockSize = 256;
-      fused_column_minmax_kernel<kMinMaxBlockSize><<<dim, kMinMaxBlockSize, 0, stream>>>(
-        residuals.data_handle(), vmin_ptr, vmax_ptr, n_rows_train, dim);
+      launch_fused_column_minmax<kMinMaxBlockSize, T>(
+        residuals.data_handle(), vmin_ptr, vmax_ptr, n_rows_train, dim, stream);
       RAFT_CUDA_TRY(cudaPeekAtLastError());
 
       // Expand the observed range by a small margin to reduce clipping on unseen data,
diff --git a/cpp/tests/neighbors/ann_ivf_sq.cuh b/cpp/tests/neighbors/ann_ivf_sq.cuh
index f7311b75e4..b62892d9b1 100644
--- a/cpp/tests/neighbors/ann_ivf_sq.cuh
+++ b/cpp/tests/neighbors/ann_ivf_sq.cuh
@@ -57,10 +57,37 @@ class AnnIVFSQTest : public ::testing::TestWithParam<AnnIvfSqInputs<IdxT>> {
   {
   }
 
-  void testSearch()
+  void testAll()
+  {
+    auto naive = compute_naive_knn();
+    auto idx   = build_index(true);
+
+    {
+      SCOPED_TRACE("Search");
+      checkSearch(idx, naive);
+    }
+    {
+      SCOPED_TRACE("Serialize");
+      checkSerialize(idx);
+    }
+    {
+      SCOPED_TRACE("Filter");
+      checkFilter(idx);
+    }
+    {
+      SCOPED_TRACE("Extend");
+      checkExtend(naive);
+    }
+  }
+
+ protected:
+  struct SearchResults {
+    std::vector<IdxT> indices;
+    std::vector<T> distances;
+  };
+
+  void checkSearch(const cuvs::neighbors::ivf_sq::index<uint8_t>& idx, const SearchResults& naive)
   {
-    auto naive   = compute_naive_knn();
-    auto idx     = build_index(true);
     auto results = search_index(idx);
 
     float eps = 0.1;
@@ -74,10 +101,8 @@ class AnnIVFSQTest : public ::testing::TestWithParam<AnnIvfSqInputs<IdxT>> {
                                 min_recall_threshold()));
   }
 
-  void testSerialize()
+  void checkSerialize(const cuvs::neighbors::ivf_sq::index<uint8_t>& idx)
   {
-    auto idx = build_index(true);
-
     tmp_index_file index_file;
     cuvs::neighbors::ivf_sq::serialize(handle_, index_file.filename, idx);
     cuvs::neighbors::ivf_sq::index<uint8_t> index_loaded(handle_);
@@ -101,9 +126,8 @@ class AnnIVFSQTest : public ::testing::TestWithParam<AnnIvfSqInputs<IdxT>> {
                                 1.0));
   }
 
-  void testExtend()
+  void checkExtend(const SearchResults& naive)
   {
-    auto naive     = compute_naive_knn();
     auto idx_empty = build_index(false);
     extend_index(&idx_empty);
 
@@ -120,7 +144,7 @@ class AnnIVFSQTest : public ::testing::TestWithParam<AnnIvfSqInputs<IdxT>> {
                                 min_recall_threshold()));
   }
 
-  void testFilter()
+  void checkFilter(const cuvs::neighbors::ivf_sq::index<uint8_t>& idx)
   {
     if (ps.num_db_vecs <= static_cast<IdxT>(test_ivf_sample_filter::offset)) {
       GTEST_SKIP() << "Skipping filter test: num_db_vecs <= filter offset";
@@ -164,19 +188,9 @@ class AnnIVFSQTest : public ::testing::TestWithParam<AnnIvfSqInputs<IdxT>> {
       rmm::device_uvector<IdxT> indices_ivfsq_dev(queries_size, stream_);
 
       {
-        cuvs::neighbors::ivf_sq::index_params index_params;
         cuvs::neighbors::ivf_sq::search_params search_params;
-        index_params.n_lists   = ps.nlist;
-        index_params.metric    = ps.metric;
         search_params.n_probes = ps.nprobe;
 
-        index_params.add_data_on_build        = true;
-        index_params.kmeans_trainset_fraction = 0.5;
-
-        auto database_view = raft::make_device_matrix_view<const DataT, IdxT>(
-          (const DataT*)database.data(), ps.num_db_vecs, ps.dim);
-        auto index = cuvs::neighbors::ivf_sq::build(handle_, index_params, database_view);
-
         auto removed_indices =
           raft::make_device_vector<IdxT, int64_t>(handle_, test_ivf_sample_filter::offset);
         raft::linalg::map_offset(handle_, removed_indices.view(), raft::identity_op{});
@@ -196,7 +210,7 @@ class AnnIVFSQTest : public ::testing::TestWithParam<AnnIvfSqInputs<IdxT>> {
 
         cuvs::neighbors::ivf_sq::search(handle_,
                                         search_params,
-                                        index,
+                                        idx,
                                         search_queries_view,
                                         indices_out_view,
                                         dists_out_view,
@@ -241,12 +255,6 @@ class AnnIVFSQTest : public ::testing::TestWithParam<AnnIvfSqInputs<IdxT>> {
     search_queries.resize(0, stream_);
   }
 
- private:
-  struct SearchResults {
-    std::vector<IdxT> indices;
-    std::vector<T> distances;
-  };
-
   double min_recall_threshold()
   {
     return std::min(1.0, static_cast<double>(ps.nprobe) / static_cast<double>(ps.nlist));
diff --git a/cpp/tests/neighbors/ann_ivf_sq/test_float_int64_t.cu b/cpp/tests/neighbors/ann_ivf_sq/test_float_int64_t.cu
index 734f736b09..8831ae720a 100644
--- a/cpp/tests/neighbors/ann_ivf_sq/test_float_int64_t.cu
+++ b/cpp/tests/neighbors/ann_ivf_sq/test_float_int64_t.cu
@@ -10,10 +10,7 @@
 namespace cuvs::neighbors::ivf_sq {
 
 typedef AnnIVFSQTest<float, float, int64_t> AnnIVFSQTestF_float;
-TEST_P(AnnIVFSQTestF_float, AnnIVFSQSearch) { this->testSearch(); }
-TEST_P(AnnIVFSQTestF_float, AnnIVFSQSerialize) { this->testSerialize(); }
-TEST_P(AnnIVFSQTestF_float, AnnIVFSQExtend) { this->testExtend(); }
-TEST_P(AnnIVFSQTestF_float, AnnIVFSQFilter) { this->testFilter(); }
+TEST_P(AnnIVFSQTestF_float, AnnIVFSQ) { this->testAll(); }
 
 INSTANTIATE_TEST_CASE_P(AnnIVFSQTest, AnnIVFSQTestF_float, ::testing::ValuesIn(inputs));
 
diff --git a/cpp/tests/neighbors/ann_ivf_sq/test_half_int64_t.cu b/cpp/tests/neighbors/ann_ivf_sq/test_half_int64_t.cu
index e6f5e44dd3..fc17e246dd 100644
--- a/cpp/tests/neighbors/ann_ivf_sq/test_half_int64_t.cu
+++ b/cpp/tests/neighbors/ann_ivf_sq/test_half_int64_t.cu
@@ -10,10 +10,7 @@
 namespace cuvs::neighbors::ivf_sq {
 
 typedef AnnIVFSQTest<float, half, int64_t> AnnIVFSQTestF_half;
-TEST_P(AnnIVFSQTestF_half, AnnIVFSQSearch) { this->testSearch(); }
-TEST_P(AnnIVFSQTestF_half, AnnIVFSQSerialize) { this->testSerialize(); }
-TEST_P(AnnIVFSQTestF_half, AnnIVFSQExtend) { this->testExtend(); }
-TEST_P(AnnIVFSQTestF_half, AnnIVFSQFilter) { this->testFilter(); }
+TEST_P(AnnIVFSQTestF_half, AnnIVFSQ) { this->testAll(); }
 
 INSTANTIATE_TEST_CASE_P(AnnIVFSQTest, AnnIVFSQTestF_half, ::testing::ValuesIn(inputs_half));
 

From 80a55fd0406fb212345b74230145c722dbed2f60 Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Wed, 22 Apr 2026 18:56:44 +0200
Subject: [PATCH 27/43] account for RAFT update

---
 cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh  | 13 +++++++++----
 cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh |  2 +-
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
index 570ccbb872..a4a2063ca5 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
@@ -377,7 +377,7 @@ void extend(raft::resources const& handle,
 
   auto new_labels =
     raft::make_device_mdarray<LabelT>(handle,
-                                      raft::resource::get_large_workspace_resource(handle),
+                                      raft::resource::get_large_workspace_resource_ref(handle),
                                       raft::make_extents<int64_t>(n_rows));
   cuvs::cluster::kmeans::balanced_params kmeans_params;
   kmeans_params.metric     = index->metric();
@@ -401,7 +401,7 @@ void extend(raft::resources const& handle,
                                             index->dim(),
                                             max_batch_size,
                                             copy_stream,
-                                            raft::resource::get_workspace_resource(handle),
+                                            raft::resource::get_workspace_resource_ref(handle),
                                             enable_prefetch);
   vec_batches.prefetch_next_batch();
 
@@ -451,7 +451,12 @@ void extend(raft::resources const& handle,
   raft::copy(list_sizes_ptr, old_list_sizes_dev.data_handle(), n_lists, stream);
 
   utils::batch_load_iterator<int64_t> vec_indices(
-    new_indices, n_rows, 1, max_batch_size, stream, raft::resource::get_workspace_resource(handle));
+    new_indices,
+    n_rows,
+    1,
+    max_batch_size,
+    stream,
+    raft::resource::get_workspace_resource_ref(handle));
   vec_batches.reset();
   vec_batches.prefetch_next_batch();
   utils::batch_load_iterator<int64_t> idx_batch = vec_indices.begin();
@@ -557,7 +562,7 @@ inline auto build(
     auto n_rows_train = n_rows / trainset_ratio;
     auto trainset =
       raft::make_device_mdarray<T>(handle,
-                                   raft::resource::get_large_workspace_resource(handle),
+                                   raft::resource::get_large_workspace_resource_ref(handle),
                                    raft::make_extents<int64_t>(n_rows_train, idx.dim()));
     raft::matrix::sample_rows<T, int64_t>(handle, random_state, dataset, trainset.view());
     auto trainset_const_view = raft::make_const_mdspan(trainset.view());
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
index 9728b90796..6201cdd4f4 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
@@ -877,7 +877,7 @@ inline void search_with_filtering(raft::resources const& handle,
                                             cuvs::distance::is_min_close(index.metric()),
                                             neighbors + std::size_t(offset_q) * k,
                                             distances + std::size_t(offset_q) * k,
-                                            raft::resource::get_workspace_resource(handle),
+                                            raft::resource::get_workspace_resource_ref(handle),
                                             sample_filter);
   }
 }

From ac8ea4e8836dbdda4822d23b4f53b4dbf850814a Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Mon, 27 Apr 2026 16:27:16 +0200
Subject: [PATCH 28/43] IVF-SQ JIT-LTO

---
 cpp/CMakeLists.txt                            |  32 +-
 .../detail/jit_lto/ivf_sq/scan_fragments.hpp  |  20 +
 .../jit_lto_kernels/device_functions.cuh      |  25 +
 .../jit_lto_kernels/filter_kernel.cu.in       |  31 +
 .../detail/jit_lto_kernels/filter_matrix.json |   3 +
 .../detail/jit_lto_kernels/kernel_def.hpp     |  38 ++
 .../detail/jit_lto_kernels/scan_impl.cuh      | 260 ++++++++
 .../detail/jit_lto_kernels/scan_kernel.cu.in  |  62 ++
 .../detail/jit_lto_kernels/scan_matrix.json   |   8 +
 .../detail/jit_lto_kernels/scan_planner.hpp   |  38 ++
 cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh    | 567 ++++++------------
 .../ivf_sq_search_half_uint8_t_int64_t.cu     |  29 -
 ..._t.cu => ivf_sq_search_uint8_t_int64_t.cu} |   1 +
 13 files changed, 711 insertions(+), 403 deletions(-)
 create mode 100644 cpp/include/cuvs/detail/jit_lto/ivf_sq/scan_fragments.hpp
 create mode 100644 cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/device_functions.cuh
 create mode 100644 cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/filter_kernel.cu.in
 create mode 100644 cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/filter_matrix.json
 create mode 100644 cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/kernel_def.hpp
 create mode 100644 cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_impl.cuh
 create mode 100644 cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_kernel.cu.in
 create mode 100644 cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_matrix.json
 create mode 100644 cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_planner.hpp
 delete mode 100644 cpp/src/neighbors/ivf_sq/ivf_sq_search_half_uint8_t_int64_t.cu
 rename cpp/src/neighbors/ivf_sq/{ivf_sq_search_float_uint8_t_int64_t.cu => ivf_sq_search_uint8_t_int64_t.cu} (97%)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index e3f366661e..f957f8b702 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -487,6 +487,35 @@ if(NOT BUILD_CPU_ONLY)
     OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_flat/post_process"
     KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
   )
+  set(ivf_sq_ns "cuvs::neighbors::ivf_sq::detail")
+  generate_jit_lto_kernels(
+    jit_lto_files
+    NAME_FORMAT "ivf_sq_scan_capacity_@capacity@_metric_@metric_name@"
+    MATRIX_JSON_FILE
+      "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_matrix.json"
+    KERNEL_INPUT_FILE
+      "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_kernel.cu.in"
+    FRAGMENT_TAG_FORMAT
+      "${ivf_sq_ns}::fragment_tag_ivf_sq_scan<${ivf_sq_ns}::tag_metric_@metric_name@, @capacity@>"
+    FRAGMENT_TAG_HEADER_FILES "<cuvs/detail/jit_lto/ivf_sq/scan_fragments.hpp>"
+                              "<cuvs/detail/jit_lto/common_fragments.hpp>"
+    OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_sq/scan"
+    KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
+  )
+  generate_jit_lto_kernels(
+    jit_lto_files
+    NAME_FORMAT "ivf_sq_filter_@filter_name@"
+    MATRIX_JSON_FILE
+      "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_sq/detail/jit_lto_kernels/filter_matrix.json"
+    KERNEL_INPUT_FILE
+      "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_sq/detail/jit_lto_kernels/filter_kernel.cu.in"
+    FRAGMENT_TAG_FORMAT
+      "${ivf_sq_ns}::fragment_tag_ivf_sq_filter<${neighbors_ns}::tag_filter_@filter_name@>"
+    FRAGMENT_TAG_HEADER_FILES "<cuvs/detail/jit_lto/ivf_sq/scan_fragments.hpp>"
+                              "<cuvs/detail/jit_lto/common_fragments.hpp>"
+    OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_sq/filter"
+    KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
+  )
   set(ivf_pq_ns "cuvs::neighbors::ivf_pq::detail")
   generate_jit_lto_kernels(
     jit_lto_files
@@ -900,8 +929,7 @@ if(NOT BUILD_CPU_ONLY)
     src/neighbors/ivf_sq_index.cpp
     src/neighbors/ivf_sq/ivf_sq_build_extend_float_uint8_t_int64_t.cu
     src/neighbors/ivf_sq/ivf_sq_build_extend_half_uint8_t_int64_t.cu
-    src/neighbors/ivf_sq/ivf_sq_search_float_uint8_t_int64_t.cu
-    src/neighbors/ivf_sq/ivf_sq_search_half_uint8_t_int64_t.cu
+    src/neighbors/ivf_sq/ivf_sq_search_uint8_t_int64_t.cu
     src/neighbors/ivf_sq/ivf_sq_serialize_uint8_t.cu
     src/neighbors/knn_merge_parts.cu
     src/neighbors/nn_descent.cu
diff --git a/cpp/include/cuvs/detail/jit_lto/ivf_sq/scan_fragments.hpp b/cpp/include/cuvs/detail/jit_lto/ivf_sq/scan_fragments.hpp
new file mode 100644
index 0000000000..b20684b11f
--- /dev/null
+++ b/cpp/include/cuvs/detail/jit_lto/ivf_sq/scan_fragments.hpp
@@ -0,0 +1,20 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+namespace cuvs::neighbors::ivf_sq::detail {
+
+struct tag_metric_l2 {};
+struct tag_metric_ip {};
+struct tag_metric_cosine {};
+
+template <typename MetricTag, int Capacity>
+struct fragment_tag_ivf_sq_scan {};
+
+template <typename FilterTag>
+struct fragment_tag_ivf_sq_filter {};
+
+}  // namespace cuvs::neighbors::ivf_sq::detail
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/device_functions.cuh b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/device_functions.cuh
new file mode 100644
index 0000000000..188933d803
--- /dev/null
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/device_functions.cuh
@@ -0,0 +1,25 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <cstdint>
+
+namespace cuvs::neighbors::ivf_sq::detail {
+
+// Forward declaration of the sample filter device function.
+// The concrete implementation is provided by a JIT-LTO filter-adapter fragment
+// (see filter_kernel.cu.in) that delegates to the shared
+// cuvs::neighbors::detail::sample_filter_<name><uint32_t, int64_t> fragment.
+template <typename IndexT>
+__device__ bool sample_filter(const IndexT* const* const inds_ptrs,
+                              const uint32_t query_ix,
+                              const uint32_t cluster_ix,
+                              const uint32_t sample_ix,
+                              uint32_t* bitset_ptr,
+                              IndexT bitset_len,
+                              IndexT original_nbits);
+
+}  // namespace cuvs::neighbors::ivf_sq::detail
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/filter_kernel.cu.in b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/filter_kernel.cu.in
new file mode 100644
index 0000000000..c93038de84
--- /dev/null
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/filter_kernel.cu.in
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <neighbors/detail/jit_lto_kernels/filter.cuh>
+#include <neighbors/ivf_sq/detail/jit_lto_kernels/device_functions.cuh>
+
+namespace {
+
+constexpr auto sample_filter_impl =
+  cuvs::neighbors::detail::sample_filter_@filter_name@<uint32_t, int64_t>;
+
+}  // namespace
+
+namespace cuvs::neighbors::ivf_sq::detail {
+
+template <>
+__device__ bool sample_filter<int64_t>(const int64_t* const* const inds_ptrs,
+                                       const uint32_t query_ix,
+                                       const uint32_t cluster_ix,
+                                       const uint32_t sample_ix,
+                                       uint32_t* bitset_ptr,
+                                       int64_t bitset_len,
+                                       int64_t original_nbits)
+{
+  return sample_filter_impl(
+    inds_ptrs, query_ix, cluster_ix, sample_ix, bitset_ptr, bitset_len, original_nbits);
+}
+
+}  // namespace cuvs::neighbors::ivf_sq::detail
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/filter_matrix.json b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/filter_matrix.json
new file mode 100644
index 0000000000..2a01bc4583
--- /dev/null
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/filter_matrix.json
@@ -0,0 +1,3 @@
+{
+  "filter_name": ["none", "bitset"]
+}
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/kernel_def.hpp b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/kernel_def.hpp
new file mode 100644
index 0000000000..5df97deabc
--- /dev/null
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/kernel_def.hpp
@@ -0,0 +1,38 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <cstdint>
+
+namespace cuvs::neighbors::ivf_sq::detail {
+
+static constexpr int kSqScanThreads = 128;
+
+// Function-pointer signature for the JIT-LTO scan entrypoint.
+// Must exactly match the extern "C" __global__ ivf_sq_scan(...) signature
+// produced by scan_kernel.cu.in.
+template <typename IdxT>
+using ivf_sq_scan_func_t = void(const uint8_t* const* data_ptrs,
+                                const uint32_t* list_sizes,
+                                const uint32_t* coarse_indices,
+                                const float* queries_float,
+                                const float* centers,
+                                const float* sq_vmin,
+                                const float* sq_delta,
+                                const float* query_norms,
+                                uint32_t n_probes,
+                                uint32_t dim,
+                                uint32_t k,
+                                uint32_t max_samples,
+                                const uint32_t* chunk_indices,
+                                float* out_distances,
+                                uint32_t* out_indices,
+                                IdxT* const* inds_ptrs,
+                                uint32_t* bitset_ptr,
+                                IdxT bitset_len,
+                                IdxT original_nbits);
+
+}  // namespace cuvs::neighbors::ivf_sq::detail
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_impl.cuh b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_impl.cuh
new file mode 100644
index 0000000000..bc8f7587ee
--- /dev/null
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_impl.cuh
@@ -0,0 +1,260 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "device_functions.cuh"
+#include "kernel_def.hpp"
+#include <cuvs/detail/jit_lto/ivf_sq/scan_fragments.hpp>
+#include <cuvs/neighbors/ivf_sq.hpp>
+#include <neighbors/ivf_common.cuh>
+
+#include <raft/matrix/detail/select_warpsort.cuh>
+#include <raft/util/cuda_dev_essentials.cuh>
+#include <raft/util/integer_utils.hpp>
+
+#include <cstdint>
+#include <type_traits>
+
+namespace cuvs::neighbors::ivf_sq::detail {
+
+// block_sort type selection: dispatch the dummy block sort when Capacity == 0
+// so the same impl body works for both the fused top-k path (Capacity > 0) and
+// the materialize-all path (Capacity == 0).
+template <int Capacity, bool Ascending>
+struct sq_block_sort {
+  using type = raft::matrix::detail::select::warpsort::block_sort<
+    raft::matrix::detail::select::warpsort::warp_sort_filtered,
+    Capacity,
+    Ascending,
+    float,
+    uint32_t>;
+};
+
+template <bool Ascending>
+struct sq_block_sort<0, Ascending> {
+  using type = ivf::detail::dummy_block_sort_t<float, uint32_t, Ascending>;
+};
+
+template <int Capacity, bool Ascending>
+using sq_block_sort_t = typename sq_block_sort<Capacity, Ascending>::type;
+
+// IVF-SQ scan kernel body with fused in-kernel top-k.
+//
+// Grid layout:
+//   kManageLocalTopK (Capacity > 0):
+//     grid (grid_dim_x, n_queries) - each block loops over probes
+//   otherwise (Capacity == 0):
+//     grid (n_probes, n_queries) - one block per (query, probe)
+//
+// Shared-memory layout (always 3 x dim floats):
+//   [s_sq_scale(dim) | s_query_term(dim) | s_aux(dim)]
+//
+//   s_sq_scale = delta[d]  - SQ dequantization scale, invariant (Phase 1).
+//
+//   L2 path:
+//     Phase 1: s_aux[d] = query[d] - vmin[d]              (invariant)
+//     Phase 2: s_query_term[d] = s_aux[d] - centroid[d]    (per-probe)
+//     The full SQ reconstruction is centroid + vmin + code*delta, so
+//     query - reconstructed = (query - vmin - centroid) - code*delta
+//                           = s_query_term - code*s_sq_scale.
+//
+//   IP/Cosine path:
+//     Phase 1: s_query_term[d] = query[d]                  (invariant)
+//     Phase 2: s_aux[d] = centroid[d] + vmin[d]            (per-probe)
+//     Reconstructed vector component: s_aux[d] + code*s_sq_scale[d].
+//
+//   After all probes are scanned, the smem is reused for block_sort merge.
+template <int Capacity, typename MetricTag>
+__device__ __forceinline__ void ivf_sq_scan_impl(const uint8_t* const* data_ptrs,
+                                                 const uint32_t* list_sizes,
+                                                 const uint32_t* coarse_indices,
+                                                 const float* queries_float,
+                                                 const float* centers,
+                                                 const float* sq_vmin,
+                                                 const float* sq_delta,
+                                                 const float* query_norms,
+                                                 uint32_t n_probes,
+                                                 uint32_t dim,
+                                                 uint32_t k,
+                                                 uint32_t max_samples,
+                                                 const uint32_t* chunk_indices,
+                                                 float* out_distances,
+                                                 uint32_t* out_indices,
+                                                 const int64_t* const* inds_ptrs,
+                                                 uint32_t* bitset_ptr,
+                                                 int64_t bitset_len,
+                                                 int64_t original_nbits)
+{
+  static_assert(kIndexGroupSize == raft::WarpSize,
+                "Warp-coalesced scan requires kIndexGroupSize == WarpSize");
+
+  constexpr int BlockDim          = kSqScanThreads;
+  constexpr bool kManageLocalTopK = (Capacity > 0);
+  constexpr bool kIsL2            = std::is_same_v<MetricTag, tag_metric_l2>;
+  constexpr bool kIsCosine        = std::is_same_v<MetricTag, tag_metric_cosine>;
+  constexpr bool kIsIP            = std::is_same_v<MetricTag, tag_metric_ip>;
+  constexpr bool kAscending       = !kIsIP;
+
+  extern __shared__ __align__(256) uint8_t smem_buf[];
+  float* smem = reinterpret_cast<float*>(smem_buf);
+
+  float* s_sq_scale   = smem;
+  float* s_query_term = smem + dim;
+  float* s_aux        = smem + 2 * dim;
+
+  const uint32_t query_ix = blockIdx.y;
+  const float* query      = queries_float + query_ix * dim;
+
+  if constexpr (kManageLocalTopK) {
+    out_distances += uint64_t(query_ix) * k * gridDim.x + blockIdx.x * k;
+    out_indices += uint64_t(query_ix) * k * gridDim.x + blockIdx.x * k;
+  }
+
+  // Phase 1: load shared memory that is invariant across probes.
+  for (uint32_t d = threadIdx.x; d < dim; d += BlockDim) {
+    s_sq_scale[d] = sq_delta[d];
+    if constexpr (kIsL2) {
+      s_aux[d] = query[d] - sq_vmin[d];
+    } else {
+      s_query_term[d] = query[d];
+    }
+  }
+  __syncthreads();
+
+  using local_topk_t = sq_block_sort_t<Capacity, kAscending>;
+  local_topk_t queue(k);
+
+  const uint32_t* my_coarse = coarse_indices + query_ix * n_probes;
+  const uint32_t* my_chunk  = chunk_indices + query_ix * n_probes;
+
+  constexpr uint32_t veclen         = 16;
+  constexpr uint32_t kWarpsPerBlock = BlockDim / raft::WarpSize;
+  const uint32_t warp_id            = threadIdx.x / raft::WarpSize;
+  const uint32_t lane_id            = threadIdx.x % raft::WarpSize;
+
+  // Phase 2: loop over probes.
+  // Synchronization protocol:
+  //  (a) __syncthreads after Phase 1 (above) ensures invariant smem arrays
+  //      (s_sq_scale, and L2: s_aux / IP-Cosine: s_query_term) are visible
+  //      before Phase 2 overwrites the per-probe array.
+  //  (b) __syncthreads after per-probe smem writes (L2: s_query_term /
+  //      IP-Cosine: s_aux) ensures probe-specific values are visible before
+  //      the distance computation.
+  //  (c) __syncthreads at the end of each iteration ensures all distance
+  //      computation reads are complete before the next iteration overwrites
+  //      the per-probe smem region.
+  //  When cluster_sz == 0, barrier (c) is skipped because no distance reads
+  //  occurred; all threads converge on the same branch uniformly, and the
+  //  next iteration's barrier (b) provides the needed ordering.
+  for (uint32_t probe_ix = blockIdx.x; probe_ix < n_probes;
+       probe_ix += (kManageLocalTopK ? gridDim.x : uint32_t{1})) {
+    const uint32_t cluster_id = my_coarse[probe_ix];
+    const uint32_t cluster_sz = list_sizes[cluster_id];
+
+    {
+      const float* centroid = centers + cluster_id * dim;
+      for (uint32_t d = threadIdx.x; d < dim; d += BlockDim) {
+        if constexpr (kIsL2) {
+          s_query_term[d] = s_aux[d] - centroid[d];
+        } else {
+          s_aux[d] = centroid[d] + sq_vmin[d];
+        }
+      }
+    }
+    __syncthreads();  // (b)
+
+    if (cluster_sz == 0) {
+      if constexpr (!kManageLocalTopK) break;
+      continue;
+    }
+
+    const uint8_t* codes   = data_ptrs[cluster_id];
+    uint32_t sample_offset = (probe_ix > 0) ? my_chunk[probe_ix - 1] : 0;
+    uint32_t padded_dim    = ((dim + veclen - 1) / veclen) * veclen;
+    uint32_t n_dim_blocks  = padded_dim / veclen;
+
+    for (uint32_t group = warp_id * kIndexGroupSize; group < cluster_sz;
+         group += kWarpsPerBlock * kIndexGroupSize) {
+      const uint32_t row = group + lane_id;
+      const bool valid =
+        (row < cluster_sz) &&
+        sample_filter<int64_t>(
+          inds_ptrs, query_ix, cluster_id, row, bitset_ptr, bitset_len, original_nbits);
+
+      float dist      = 0.0f;
+      float v_norm_sq = 0.0f;
+
+      const uint8_t* group_data = codes + size_t(group) * padded_dim;
+
+      for (uint32_t bl = 0; bl < n_dim_blocks; bl++) {
+        uint8_t codes_local[veclen];
+        *reinterpret_cast<uint4*>(codes_local) = *reinterpret_cast<const uint4*>(
+          group_data + bl * (veclen * kIndexGroupSize) + lane_id * veclen);
+
+        const uint32_t l = bl * veclen;
+#pragma unroll
+        for (uint32_t j = 0; j < veclen; j++) {
+          if (l + j < dim) {
+            float recon = float(codes_local[j]) * s_sq_scale[l + j];
+
+            if constexpr (kIsL2) {
+              float diff = s_query_term[l + j] - recon;
+              dist += diff * diff;
+            } else {
+              float v_d = s_aux[l + j] + recon;
+              dist += s_query_term[l + j] * v_d;
+              if constexpr (kIsCosine) { v_norm_sq += v_d * v_d; }
+            }
+          }
+        }
+      }
+
+      if constexpr (kIsCosine) {
+        float denom = query_norms[query_ix] * sqrtf(v_norm_sq);
+        dist        = (denom > 0.0f) ? 1.0f - dist / denom : 0.0f;
+      }
+
+      if constexpr (kManageLocalTopK) {
+        float val = valid ? dist : local_topk_t::queue_t::kDummy;
+        queue.add(val, sample_offset + row);
+      } else {
+        if (valid) {
+          uint32_t out_idx       = query_ix * max_samples + sample_offset + row;
+          out_distances[out_idx] = dist;
+          out_indices[out_idx]   = sample_offset + row;
+        }
+      }
+    }
+
+    __syncthreads();  // (c)
+    if constexpr (!kManageLocalTopK) break;
+  }
+
+  if constexpr (kManageLocalTopK) {
+    // All probe iterations are done; smem_buf is reused for block_sort merge.
+    // The loop's last (b) or (c) barrier ensures all prior smem accesses have
+    // completed, so this additional barrier is only needed to synchronize any
+    // register-level state across warps before the merge.
+    __syncthreads();
+    queue.done(smem_buf);
+    queue.store(out_distances, out_indices);
+
+    // block_sort initializes unused slots with (kDummy, idx=0). When the
+    // probed clusters have fewer than k total valid vectors, those slots
+    // survive into the output and share idx=0 with the real first vector,
+    // causing duplicates. Mark them with an invalid index so
+    // postprocess_neighbors treats them as out-of-bounds.
+    // store() is a warp-0-only operation, restrict the fixup to the same warp.
+    if (threadIdx.x < raft::WarpSize) {
+      constexpr auto kDummyVal = local_topk_t::queue_t::kDummy;
+      for (uint32_t i = threadIdx.x; i < k; i += raft::WarpSize) {
+        if (out_distances[i] == kDummyVal) { out_indices[i] = uint32_t(0xFFFFFFFF); }
+      }
+    }
+  }
+}
+
+}  // namespace cuvs::neighbors::ivf_sq::detail
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_kernel.cu.in b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_kernel.cu.in
new file mode 100644
index 0000000000..3c59f91764
--- /dev/null
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_kernel.cu.in
@@ -0,0 +1,62 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <neighbors/ivf_sq/detail/jit_lto_kernels/kernel_def.hpp>
+#include <neighbors/ivf_sq/detail/jit_lto_kernels/scan_impl.cuh>
+
+namespace {
+
+using metric_tag       = cuvs::neighbors::ivf_sq::detail::tag_metric_@metric_name@;
+constexpr int capacity = @capacity@;
+
+}  // namespace
+
+namespace cuvs::neighbors::ivf_sq::detail {
+
+extern "C" __global__ __launch_bounds__(kSqScanThreads) void ivf_sq_scan(
+  const uint8_t* const* data_ptrs,
+  const uint32_t* list_sizes,
+  const uint32_t* coarse_indices,
+  const float* queries_float,
+  const float* centers,
+  const float* sq_vmin,
+  const float* sq_delta,
+  const float* query_norms,
+  uint32_t n_probes,
+  uint32_t dim,
+  uint32_t k,
+  uint32_t max_samples,
+  const uint32_t* chunk_indices,
+  float* out_distances,
+  uint32_t* out_indices,
+  int64_t* const* inds_ptrs,
+  uint32_t* bitset_ptr,
+  int64_t bitset_len,
+  int64_t original_nbits)
+{
+  ivf_sq_scan_impl<capacity, metric_tag>(data_ptrs,
+                                         list_sizes,
+                                         coarse_indices,
+                                         queries_float,
+                                         centers,
+                                         sq_vmin,
+                                         sq_delta,
+                                         query_norms,
+                                         n_probes,
+                                         dim,
+                                         k,
+                                         max_samples,
+                                         chunk_indices,
+                                         out_distances,
+                                         out_indices,
+                                         inds_ptrs,
+                                         bitset_ptr,
+                                         bitset_len,
+                                         original_nbits);
+}
+
+static_assert(std::is_same_v<decltype(ivf_sq_scan), ivf_sq_scan_func_t<int64_t>>);
+
+}  // namespace cuvs::neighbors::ivf_sq::detail
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_matrix.json b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_matrix.json
new file mode 100644
index 0000000000..fac4154c18
--- /dev/null
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_matrix.json
@@ -0,0 +1,8 @@
+{
+  "capacity": [
+    "0", "32", "64", "128", "256"
+  ],
+  "metric_name": [
+    "l2", "ip", "cosine"
+  ]
+}
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_planner.hpp b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_planner.hpp
new file mode 100644
index 0000000000..ae157e3994
--- /dev/null
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_planner.hpp
@@ -0,0 +1,38 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <cuvs/detail/jit_lto/AlgorithmPlanner.hpp>
+#include <cuvs/detail/jit_lto/common_fragments.hpp>
+#include <cuvs/detail/jit_lto/ivf_sq/scan_fragments.hpp>
+
+#include <string>
+
+namespace cuvs::neighbors::ivf_sq::detail {
+
+struct IvfSqScanPlanner : AlgorithmPlanner {
+  inline static LauncherJitCache launcher_jit_cache{};
+
+  IvfSqScanPlanner() : AlgorithmPlanner("ivf_sq_scan", launcher_jit_cache) {}
+
+  template <typename MetricTag, int Capacity>
+  void add_entrypoint()
+  {
+    this->add_static_fragment<fragment_tag_ivf_sq_scan<MetricTag, Capacity>>();
+  }
+
+  template <typename FilterTag>
+  void add_filter_device_function()
+  {
+    this->add_static_fragment<fragment_tag_ivf_sq_filter<FilterTag>>();
+    this->add_static_fragment<
+      cuvs::neighbors::detail::fragment_tag_sample_filter<cuvs::neighbors::detail::tag_bitset_u32,
+                                                          cuvs::neighbors::detail::tag_index_i64,
+                                                          FilterTag>>();
+  }
+};
+
+}  // namespace cuvs::neighbors::ivf_sq::detail
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
index 6201cdd4f4..bbe6f05df1 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
@@ -9,6 +9,10 @@
 #include "../detail/ann_utils.cuh"
 #include "../ivf_common.cuh"
 #include "../sample_filter.cuh"
+#include "detail/jit_lto_kernels/kernel_def.hpp"
+#include "detail/jit_lto_kernels/scan_planner.hpp"
+#include <cuvs/detail/jit_lto/common_fragments.hpp>
+#include <cuvs/detail/jit_lto/ivf_sq/scan_fragments.hpp>
 #include <cuvs/neighbors/common.hpp>
 #include <cuvs/neighbors/ivf_sq.hpp>
 
@@ -23,6 +27,7 @@
 #include <raft/linalg/norm.cuh>
 #include <raft/linalg/unary_op.cuh>
 #include <raft/matrix/detail/select_warpsort.cuh>
+#include <raft/util/cuda_rt_essentials.hpp>
 #include <raft/util/integer_utils.hpp>
 
 #include <rmm/resource_ref.hpp>
@@ -33,10 +38,6 @@ namespace cuvs::neighbors::ivf_sq::detail {
 
 using namespace cuvs::spatial::knn::detail;  // NOLINT
 
-enum class SqScanMetric { kL2, kIP, kCosine };
-
-static constexpr int kSqScanThreads = 128;
-
 // Maximum fused top-k capacity we instantiate for the scan kernel.
 // Must match the highest Capacity case in ivf_sq_scan's switch.
 static constexpr int kMaxSqScanCapacity = 256;
@@ -49,32 +50,57 @@ auto RAFT_WEAK_FUNCTION is_local_topk_feasible(uint32_t k) -> bool
   return k <= kMaxSqScanCapacity;
 }
 
-// ---------------------------------------------------------------------------
-// block_sort type selection (fused top-k vs dummy for Capacity == 0)
-// ---------------------------------------------------------------------------
-template <int Capacity, bool Ascending>
-struct sq_block_sort {
-  using type = raft::matrix::detail::select::warpsort::block_sort<
-    raft::matrix::detail::select::warpsort::warp_sort_filtered,
-    Capacity,
-    Ascending,
-    float,
-    uint32_t>;
-};
-
-template <bool Ascending>
-struct sq_block_sort<0, Ascending> {
-  using type = ivf::detail::dummy_block_sort_t<float, uint32_t, Ascending>;
-};
-
-template <int Capacity, bool Ascending>
-using sq_block_sort_t = typename sq_block_sort<Capacity, Ascending>::type;
+// Compute shared-memory size for the scan kernel (3 x dim floats), plus the
+// block_sort merge buffer when fused top-k is enabled. Must match the smem
+// layout used by ivf_sq_scan_impl.
+inline size_t sq_scan_smem_size(uint32_t dim) { return 3 * dim * sizeof(float); }
 
-// ---------------------------------------------------------------------------
-// configure_grid_dim_x: choose grid.x to saturate the GPU
-// ---------------------------------------------------------------------------
+template <int Capacity>
+size_t sq_scan_total_smem(uint32_t dim, uint32_t k)
+{
+  size_t scan_smem = sq_scan_smem_size(dim);
+  if constexpr (Capacity > 0) {
+    constexpr int kSubwarpSize = std::min<int>(Capacity, raft::WarpSize);
+    int num_subwarps           = kSqScanThreads / kSubwarpSize;
+    size_t merge_smem =
+      raft::matrix::detail::select::warpsort::calc_smem_size_for_block_wide<float, uint32_t>(
+        num_subwarps, k);
+    return std::max(scan_smem, merge_smem);
+  }
+  return scan_smem;
+}
+
+// Map the runtime distance metric onto the corresponding compile-time tag and
+// invoke `f` (a generic lambda) with the tag type. Used by the JIT-LTO launch
+// dispatch to forward the metric to the planner without bloating the static
+// library with the kernel body.
+template <typename F>
+void dispatch_metric_tag(cuvs::distance::DistanceType metric, F&& f)
+{
+  switch (metric) {
+    case cuvs::distance::DistanceType::L2Expanded:
+    case cuvs::distance::DistanceType::L2SqrtExpanded: f(tag_metric_l2{}); return;
+    case cuvs::distance::DistanceType::InnerProduct: f(tag_metric_ip{}); return;
+    case cuvs::distance::DistanceType::CosineExpanded: f(tag_metric_cosine{}); return;
+    default: RAFT_FAIL("Unsupported metric type for IVF-SQ scan.");
+  }
+}
+
+template <typename FilterT>
+constexpr auto get_filter_type_tag()
+{
+  using namespace cuvs::neighbors::filtering;
+  if constexpr (std::is_same_v<FilterT, none_sample_filter>) {
+    return cuvs::neighbors::detail::tag_filter_none{};
+  } else if constexpr (std::is_same_v<FilterT, bitset_filter<uint32_t, int64_t>>) {
+    return cuvs::neighbors::detail::tag_filter_bitset{};
+  }
+}
+
+// Choose grid.x to saturate the GPU. Mirrors the previous static
+// configure_grid_dim_x but takes the JIT-linked cudaKernel_t.
 inline uint32_t configure_grid_dim_x(
-  uint32_t n_queries, uint32_t n_probes, int smem_size, int block_size, const void* kernel_ptr)
+  uint32_t n_queries, uint32_t n_probes, int smem_size, int block_size, cudaKernel_t kernel)
 {
   int dev_id;
   RAFT_CUDA_TRY(cudaGetDevice(&dev_id));
@@ -82,7 +108,7 @@ inline uint32_t configure_grid_dim_x(
   RAFT_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
   int num_blocks_per_sm = 0;
   RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-    &num_blocks_per_sm, kernel_ptr, block_size, smem_size));
+    &num_blocks_per_sm, kernel, block_size, smem_size));
 
   size_t min_grid_size = size_t(num_sms) * num_blocks_per_sm;
   size_t min_grid_x    = raft::ceildiv<size_t>(min_grid_size, n_queries);
@@ -90,355 +116,149 @@ inline uint32_t configure_grid_dim_x(
 }
 
 // ---------------------------------------------------------------------------
-// IVF-SQ scan kernel with fused in-kernel top-k
+// JIT-LTO scan launcher
+//
+// The scan kernel `ivf_sq_scan_impl` lives as a fatbin fragment under
+// `detail/jit_lto_kernels/`. At runtime we ask the planner for the linked
+// kernel matching (Capacity, MetricTag, FilterTag), set the dynamic shared
+// memory attribute, compute grid_dim_x via cudaOccupancy*, and dispatch.
 //
-// Grid layout:
+// Grid layout matches the original static kernel:
 //   kManageLocalTopK (Capacity > 0):
-//     grid (grid_dim_x, n_queries) — each block loops over probes
+//     grid (grid_dim_x, n_queries) - each block loops over probes
 //   otherwise (Capacity == 0):
-//     grid (n_probes, n_queries) — one block per (query, probe)
-//
-// Shared-memory layout (always 3 × dim floats):
-//   [s_sq_scale(dim) | s_query_term(dim) | s_aux(dim)]
-//
-//   s_sq_scale = delta[d]  — SQ dequantization scale, invariant (Phase 1).
-//
-//   L2 path:
-//     Phase 1: s_aux[d] = query[d] - vmin[d]               (invariant)
-//     Phase 2: s_query_term[d] = s_aux[d] - centroid[d]     (per-probe)
-//     The full SQ reconstruction is centroid + vmin + code*delta, so
-//     query - reconstructed = (query - vmin - centroid) - code*delta
-//                           = s_query_term - code*s_sq_scale.
-//
-//   IP/Cosine path:
-//     Phase 1: s_query_term[d] = query[d]                   (invariant)
-//     Phase 2: s_aux[d] = centroid[d] + vmin[d]             (per-probe)
-//     Reconstructed vector component: s_aux[d] + code*s_sq_scale[d].
-//
-//   After all probes are scanned, the smem is reused for block_sort merge.
+//     grid (n_probes, n_queries) - one block per (query, probe)
 // ---------------------------------------------------------------------------
-template <int BlockDim,
-          int Capacity,
-          SqScanMetric Metric,
+template <int Capacity,
+          typename MetricTag,
           typename CodeT,
+          typename IvfSampleFilterTag,
           typename IvfSampleFilterT>
-__launch_bounds__(BlockDim) RAFT_KERNEL ivf_sq_scan_kernel(const uint8_t* const* data_ptrs,
-                                                           const uint32_t* list_sizes,
-                                                           const uint32_t* coarse_indices,
-                                                           const float* queries_float,
-                                                           const float* centers,
-                                                           const float* sq_vmin,
-                                                           const float* sq_delta,
-                                                           const float* query_norms,
-                                                           uint32_t n_probes,
-                                                           uint32_t dim,
-                                                           uint32_t k,
-                                                           uint32_t max_samples,
-                                                           const uint32_t* chunk_indices,
-                                                           float* out_distances,
-                                                           uint32_t* out_indices,
-                                                           IvfSampleFilterT sample_filter)
+void launch_kernel(const index<CodeT>& idx,
+                   const float* queries_float,
+                   const float* query_norms,
+                   uint32_t n_queries,
+                   uint32_t n_probes,
+                   uint32_t k,
+                   uint32_t max_samples,
+                   const uint32_t* coarse_indices,
+                   const uint32_t* chunk_indices,
+                   float* out_distances,
+                   uint32_t* out_indices,
+                   uint32_t& grid_dim_x,
+                   rmm::cuda_stream_view stream,
+                   IvfSampleFilterT sample_filter)
 {
-  static_assert(kIndexGroupSize == raft::WarpSize,
-                "Warp-coalesced scan requires kIndexGroupSize == WarpSize");
+  static_assert(std::is_same_v<CodeT, uint8_t>, "IVF-SQ JIT-LTO scan only supports CodeT=uint8_t");
 
   constexpr bool kManageLocalTopK = (Capacity > 0);
-  constexpr bool kIsL2            = (Metric == SqScanMetric::kL2);
-  constexpr bool kIsCosine        = (Metric == SqScanMetric::kCosine);
-  constexpr bool kAscending       = (Metric != SqScanMetric::kIP);
-
-  extern __shared__ __align__(256) uint8_t smem_buf[];
-  float* smem = reinterpret_cast<float*>(smem_buf);
+  constexpr int kThreads          = kSqScanThreads;
+  uint32_t dim                    = idx.dim();
 
-  float* s_sq_scale   = smem;
-  float* s_query_term = smem + dim;
-  float* s_aux        = smem + 2 * dim;
+  IvfSqScanPlanner kernel_planner;
+  kernel_planner.add_entrypoint<MetricTag, Capacity>();
+  kernel_planner.add_filter_device_function<IvfSampleFilterTag>();
+  auto kernel_launcher = kernel_planner.get_launcher();
+
+  // Extract bitset arguments from the user's filter object so they can be
+  // passed to the JIT-linked kernel as plain pointers/scalars; the JIT'd
+  // sample_filter fragment reconstructs the bitset_view inside the kernel.
+  int64_t* const* inds_ptrs_dev = idx.inds_ptrs().data_handle();
+  uint32_t* bitset_ptr          = nullptr;
+  int64_t bitset_len            = 0;
+  int64_t original_nbits        = 0;
+  if constexpr (std::is_same_v<IvfSampleFilterT,
+                               cuvs::neighbors::filtering::bitset_filter<uint32_t, int64_t>>) {
+    bitset_ptr     = sample_filter.view().data();
+    bitset_len     = sample_filter.view().size();
+    original_nbits = sample_filter.view().get_original_nbits();
+  }
 
-  const uint32_t query_ix = blockIdx.y;
-  const float* query      = queries_float + query_ix * dim;
+  size_t smem = sq_scan_total_smem<Capacity>(dim, k);
 
-  // Point output to this block's slice when using fused top-k
-  if constexpr (kManageLocalTopK) {
-    out_distances += uint64_t(query_ix) * k * gridDim.x + blockIdx.x * k;
-    out_indices += uint64_t(query_ix) * k * gridDim.x + blockIdx.x * k;
+  {
+    int dev_id;
+    RAFT_CUDA_TRY(cudaGetDevice(&dev_id));
+    int max_smem;
+    RAFT_CUDA_TRY(
+      cudaDeviceGetAttribute(&max_smem, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev_id));
+    RAFT_EXPECTS(smem <= size_t(max_smem),
+                 "IVF-SQ scan kernel requires %zu bytes of shared memory (dim=%u, k=%u), "
+                 "but the device supports at most %d bytes per block.",
+                 smem,
+                 dim,
+                 k,
+                 max_smem);
   }
 
-  // --- Phase 1: load shared memory that is invariant across probes ---
-  for (uint32_t d = threadIdx.x; d < dim; d += BlockDim) {
-    s_sq_scale[d] = sq_delta[d];
-    if constexpr (kIsL2) {
-      s_aux[d] = query[d] - sq_vmin[d];
-    } else {
-      s_query_term[d] = query[d];
-    }
+  {
+    int dev_id;
+    RAFT_CUDA_TRY(cudaGetDevice(&dev_id));
+    RAFT_CUDA_TRY(cudaKernelSetAttributeForDevice(kernel_launcher->get_kernel(),
+                                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                                  static_cast<int>(smem),
+                                                  dev_id));
   }
-  __syncthreads();
-
-  using local_topk_t = sq_block_sort_t<Capacity, kAscending>;
-  local_topk_t queue(k);
-
-  const uint32_t* my_coarse = coarse_indices + query_ix * n_probes;
-  const uint32_t* my_chunk  = chunk_indices + query_ix * n_probes;
-
-  constexpr uint32_t veclen         = 16;
-  constexpr uint32_t kWarpsPerBlock = BlockDim / raft::WarpSize;
-  const uint32_t warp_id            = threadIdx.x / raft::WarpSize;
-  const uint32_t lane_id            = threadIdx.x % raft::WarpSize;
-
-  // --- Phase 2: loop over probes ---
-  // Synchronization protocol:
-  //  (a) __syncthreads after Phase 1 (above) ensures invariant smem arrays
-  //      (s_sq_scale, and L2: s_aux / IP-Cosine: s_query_term) are visible
-  //      before Phase 2 overwrites the per-probe array.
-  //  (b) __syncthreads after per-probe smem writes (L2: s_query_term /
-  //      IP-Cosine: s_aux) ensures probe-specific values are visible before
-  //      the distance computation.
-  //  (c) __syncthreads at the end of each iteration ensures all distance
-  //      computation reads are complete before the next iteration overwrites
-  //      the per-probe smem region.
-  //  When cluster_sz == 0, barrier (c) is skipped because no distance reads
-  //  occurred; all threads converge on the same branch uniformly, and the
-  //  next iteration's barrier (b) provides the needed ordering.
-  for (uint32_t probe_ix = blockIdx.x; probe_ix < n_probes;
-       probe_ix += (kManageLocalTopK ? gridDim.x : uint32_t{1})) {
-    const uint32_t cluster_id = my_coarse[probe_ix];
-    const uint32_t cluster_sz = list_sizes[cluster_id];
-
-    // Load centroid-dependent shared memory terms
-    {
-      const float* centroid = centers + cluster_id * dim;
-      for (uint32_t d = threadIdx.x; d < dim; d += BlockDim) {
-        if constexpr (kIsL2) {
-          s_query_term[d] = s_aux[d] - centroid[d];
-        } else {
-          s_aux[d] = centroid[d] + sq_vmin[d];
-        }
-      }
-    }
-    __syncthreads();  // (b)
-
-    if (cluster_sz == 0) {
-      // No distance computation reads happened, so no end-of-iteration
-      // barrier is needed; the next iteration's barrier (b) is sufficient.
-      if constexpr (!kManageLocalTopK) break;
-      continue;
-    }
-
-    const uint8_t* codes   = data_ptrs[cluster_id];
-    uint32_t sample_offset = (probe_ix > 0) ? my_chunk[probe_ix - 1] : 0;
-    uint32_t padded_dim    = ((dim + veclen - 1) / veclen) * veclen;
-    uint32_t n_dim_blocks  = padded_dim / veclen;
-
-    for (uint32_t group = warp_id * kIndexGroupSize; group < cluster_sz;
-         group += kWarpsPerBlock * kIndexGroupSize) {
-      const uint32_t row = group + lane_id;
-      const bool valid   = (row < cluster_sz) && sample_filter(query_ix, cluster_id, row);
-
-      float dist      = 0.0f;
-      float v_norm_sq = 0.0f;
-
-      const uint8_t* group_data = codes + size_t(group) * padded_dim;
-
-      for (uint32_t bl = 0; bl < n_dim_blocks; bl++) {
-        uint8_t codes_local[veclen];
-        *reinterpret_cast<uint4*>(codes_local) = *reinterpret_cast<const uint4*>(
-          group_data + bl * (veclen * kIndexGroupSize) + lane_id * veclen);
-
-        const uint32_t l = bl * veclen;
-#pragma unroll
-        for (uint32_t j = 0; j < veclen; j++) {
-          if (l + j < dim) {
-            float recon = float(codes_local[j]) * s_sq_scale[l + j];
-
-            if constexpr (kIsL2) {
-              float diff = s_query_term[l + j] - recon;
-              dist += diff * diff;
-            } else {
-              float v_d = s_aux[l + j] + recon;
-              dist += s_query_term[l + j] * v_d;
-              if constexpr (kIsCosine) { v_norm_sq += v_d * v_d; }
-            }
-          }
-        }
-      }
-
-      if constexpr (kIsCosine) {
-        float denom = query_norms[query_ix] * sqrtf(v_norm_sq);
-        dist        = (denom > 0.0f) ? 1.0f - dist / denom : 0.0f;
-      }
-
-      if constexpr (kManageLocalTopK) {
-        float val = valid ? dist : local_topk_t::queue_t::kDummy;
-        queue.add(val, sample_offset + row);
-      } else {
-        if (valid) {
-          uint32_t out_idx       = query_ix * max_samples + sample_offset + row;
-          out_distances[out_idx] = dist;
-          out_indices[out_idx]   = sample_offset + row;
-        }
-      }
-    }
 
-    __syncthreads();  // (c)
-    if constexpr (!kManageLocalTopK) break;
-  }
+  constexpr uint32_t kMaxGridY = 65535;
 
   if constexpr (kManageLocalTopK) {
-    // All probe iterations are done; smem_buf is reused for block_sort merge.
-    // The loop's last (b) or (c) barrier ensures all prior smem accesses have
-    // completed, so this additional barrier is only needed to synchronize any
-    // register-level state across warps before the merge.
-    __syncthreads();
-    queue.done(smem_buf);
-    queue.store(out_distances, out_indices);
-
-    // block_sort initializes unused slots with (kDummy, idx=0). When the
-    // probed clusters have fewer than k total valid vectors, those slots
-    // survive into the output and share idx=0 with the real first vector,
-    // causing duplicates.  Mark them with an invalid index so
-    // postprocess_neighbors treats them as out-of-bounds.
-    // store() is a warp-0-only operation, restrict the fixup to the same warp.
-    if (threadIdx.x < raft::WarpSize) {
-      constexpr auto kDummyVal = local_topk_t::queue_t::kDummy;
-      for (uint32_t i = threadIdx.x; i < k; i += raft::WarpSize) {
-        if (out_distances[i] == kDummyVal) { out_indices[i] = uint32_t(0xFFFFFFFF); }
-      }
+    if (grid_dim_x == 0) {
+      grid_dim_x = configure_grid_dim_x(
+        std::min(kMaxGridY, n_queries), n_probes, smem, kThreads, kernel_launcher->get_kernel());
+      return;
     }
   }
-}
-
-// ---------------------------------------------------------------------------
-// Compute shared-memory size for a given kernel configuration
-// ---------------------------------------------------------------------------
-inline size_t sq_scan_smem_size(uint32_t dim) { return 3 * dim * sizeof(float); }
-
-template <int Capacity>
-size_t sq_scan_total_smem(uint32_t dim, uint32_t k)
-{
-  size_t scan_smem = sq_scan_smem_size(dim);
-  if constexpr (Capacity > 0) {
-    constexpr int kSubwarpSize = std::min<int>(Capacity, raft::WarpSize);
-    int num_subwarps           = kSqScanThreads / kSubwarpSize;
-    size_t merge_smem =
-      raft::matrix::detail::select::warpsort::calc_smem_size_for_block_wide<float, uint32_t>(
-        num_subwarps, k);
-    return std::max(scan_smem, merge_smem);
-  }
-  return scan_smem;
-}
-
-// ---------------------------------------------------------------------------
-// Launch helper: dispatches on Metric, handles grid_dim_x query vs launch
-// ---------------------------------------------------------------------------
-template <int Capacity, typename CodeT, typename IvfSampleFilterT>
-void ivf_sq_scan_launch(const index<CodeT>& idx,
-                        const float* queries_float,
-                        const float* query_norms,
-                        uint32_t n_queries,
-                        uint32_t n_probes,
-                        uint32_t k,
-                        uint32_t max_samples,
-                        const uint32_t* coarse_indices,
-                        const uint32_t* chunk_indices,
-                        float* out_distances,
-                        uint32_t* out_indices,
-                        IvfSampleFilterT sample_filter,
-                        uint32_t& grid_dim_x,
-                        rmm::cuda_stream_view stream)
-{
-  constexpr bool kManageLocalTopK = (Capacity > 0);
-  constexpr int kThreads          = kSqScanThreads;
-  uint32_t dim                    = idx.dim();
 
-  constexpr uint32_t kMaxGridY = 65535;
-
-  auto do_launch = [&](auto kernel_ptr) {
-    size_t smem = sq_scan_total_smem<Capacity>(dim, k);
-
-    {
-      int dev_id;
-      RAFT_CUDA_TRY(cudaGetDevice(&dev_id));
-      int max_smem;
-      RAFT_CUDA_TRY(
-        cudaDeviceGetAttribute(&max_smem, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev_id));
-      RAFT_EXPECTS(smem <= size_t(max_smem),
-                   "IVF-SQ scan kernel requires %zu bytes of shared memory (dim=%u, k=%u), "
-                   "but the device supports at most %d bytes per block.",
-                   smem,
-                   dim,
-                   k,
-                   max_smem);
-    }
+  dim3 block(kThreads);
 
-    RAFT_CUDA_TRY(
-      cudaFuncSetAttribute(kernel_ptr, cudaFuncAttributeMaxDynamicSharedMemorySize, smem));
+  for (uint32_t query_offset = 0; query_offset < n_queries; query_offset += kMaxGridY) {
+    uint32_t batch = std::min(kMaxGridY, n_queries - query_offset);
+    dim3 grid      = kManageLocalTopK ? dim3(grid_dim_x, batch) : dim3(n_probes, batch);
 
-    // If grid_dim_x == 0, compute the optimal value and return
+    const float* q_ptr  = queries_float + uint64_t(query_offset) * dim;
+    const float* qn_ptr = query_norms ? query_norms + query_offset : query_norms;
+    const uint32_t* ci  = coarse_indices + uint64_t(query_offset) * n_probes;
+    const uint32_t* ch  = chunk_indices + uint64_t(query_offset) * n_probes;
+    float* od           = out_distances;
+    uint32_t* oi        = out_indices;
     if constexpr (kManageLocalTopK) {
-      if (grid_dim_x == 0) {
-        grid_dim_x = configure_grid_dim_x(std::min(kMaxGridY, n_queries),
-                                          n_probes,
-                                          smem,
-                                          kThreads,
-                                          reinterpret_cast<const void*>(kernel_ptr));
-        return;
-      }
-    }
-
-    dim3 block(kThreads);
-
-    // Batch over queries to respect the gridDim.y limit (65535)
-    for (uint32_t query_offset = 0; query_offset < n_queries; query_offset += kMaxGridY) {
-      uint32_t batch = std::min(kMaxGridY, n_queries - query_offset);
-      dim3 grid      = kManageLocalTopK ? dim3(grid_dim_x, batch) : dim3(n_probes, batch);
-
-      auto q_ptr  = queries_float + uint64_t(query_offset) * dim;
-      auto qn_ptr = query_norms ? query_norms + query_offset : query_norms;
-      auto ci     = coarse_indices + uint64_t(query_offset) * n_probes;
-      auto ch     = chunk_indices + uint64_t(query_offset) * n_probes;
-      auto od     = out_distances;
-      auto oi     = out_indices;
-      if constexpr (kManageLocalTopK) {
-        od += uint64_t(query_offset) * grid_dim_x * k;
-        oi += uint64_t(query_offset) * grid_dim_x * k;
-      } else {
-        od += uint64_t(query_offset) * max_samples;
-        oi += uint64_t(query_offset) * max_samples;
-      }
-
-      kernel_ptr<<<grid, block, smem, stream>>>(idx.data_ptrs().data_handle(),
-                                                idx.list_sizes().data_handle(),
-                                                ci,
-                                                q_ptr,
-                                                idx.centers().data_handle(),
-                                                idx.sq_vmin().data_handle(),
-                                                idx.sq_delta().data_handle(),
-                                                qn_ptr,
-                                                n_probes,
-                                                dim,
-                                                k,
-                                                max_samples,
-                                                ch,
-                                                od,
-                                                oi,
-                                                sample_filter);
-      RAFT_CUDA_TRY(cudaPeekAtLastError());
+      od += uint64_t(query_offset) * grid_dim_x * k;
+      oi += uint64_t(query_offset) * grid_dim_x * k;
+    } else {
+      od += uint64_t(query_offset) * max_samples;
+      oi += uint64_t(query_offset) * max_samples;
     }
-  };
 
-  switch (idx.metric()) {
-    case cuvs::distance::DistanceType::L2Expanded:
-    case cuvs::distance::DistanceType::L2SqrtExpanded:
-      do_launch(ivf_sq_scan_kernel<kThreads, Capacity, SqScanMetric::kL2, CodeT, IvfSampleFilterT>);
-      break;
-    case cuvs::distance::DistanceType::InnerProduct:
-      do_launch(ivf_sq_scan_kernel<kThreads, Capacity, SqScanMetric::kIP, CodeT, IvfSampleFilterT>);
-      break;
-    case cuvs::distance::DistanceType::CosineExpanded:
-      do_launch(
-        ivf_sq_scan_kernel<kThreads, Capacity, SqScanMetric::kCosine, CodeT, IvfSampleFilterT>);
-      break;
-    default: RAFT_FAIL("Unsupported metric type for IVF-SQ scan.");
+    const uint8_t* const* data_ptrs = idx.data_ptrs().data_handle();
+    const uint32_t* list_sizes      = idx.list_sizes().data_handle();
+    const float* centers            = idx.centers().data_handle();
+    const float* sq_vmin            = idx.sq_vmin().data_handle();
+    const float* sq_delta           = idx.sq_delta().data_handle();
+
+    kernel_launcher->dispatch<ivf_sq_scan_func_t<int64_t>>(stream,
+                                                           grid,
+                                                           block,
+                                                           smem,
+                                                           data_ptrs,
+                                                           list_sizes,
+                                                           ci,
+                                                           q_ptr,
+                                                           centers,
+                                                           sq_vmin,
+                                                           sq_delta,
+                                                           qn_ptr,
+                                                           n_probes,
+                                                           dim,
+                                                           k,
+                                                           max_samples,
+                                                           ch,
+                                                           od,
+                                                           oi,
+                                                           inds_ptrs_dev,
+                                                           bitset_ptr,
+                                                           bitset_len,
+                                                           original_nbits);
   }
 }
 
@@ -475,21 +295,27 @@ void ivf_sq_scan(raft::resources const& handle,
     capacity = 0;
   }
 
+  using IvfSampleFilterTag = decltype(get_filter_type_tag<IvfSampleFilterT>());
+
   auto fwd = [&](auto cap_tag) {
-    ivf_sq_scan_launch<decltype(cap_tag)::value, CodeT, IvfSampleFilterT>(idx,
-                                                                          queries_float,
-                                                                          query_norms,
-                                                                          n_queries,
-                                                                          n_probes,
-                                                                          k,
-                                                                          max_samples,
-                                                                          coarse_indices,
-                                                                          chunk_indices,
-                                                                          out_distances,
-                                                                          out_indices,
-                                                                          sample_filter,
-                                                                          grid_dim_x,
-                                                                          stream);
+    constexpr int kCap = decltype(cap_tag)::value;
+    dispatch_metric_tag(idx.metric(), [&](auto metric_tag) {
+      using MetricTag = decltype(metric_tag);
+      launch_kernel<kCap, MetricTag, CodeT, IvfSampleFilterTag, IvfSampleFilterT>(idx,
+                                                                                  queries_float,
+                                                                                  query_norms,
+                                                                                  n_queries,
+                                                                                  n_probes,
+                                                                                  k,
+                                                                                  max_samples,
+                                                                                  coarse_indices,
+                                                                                  chunk_indices,
+                                                                                  out_distances,
+                                                                                  out_indices,
+                                                                                  grid_dim_x,
+                                                                                  stream,
+                                                                                  sample_filter);
+    });
   };
 
   switch (capacity) {
@@ -642,9 +468,6 @@ void search_impl(raft::resources const& handle,
                                                                   num_samples.data(),
                                                                   stream);
 
-  auto filter_adapter = cuvs::neighbors::filtering::ivf_to_sample_filter(
-    index.inds_ptrs().data_handle(), sample_filter);
-
   bool manage_local_topk = is_local_topk_feasible(k);
 
   // Determine grid_dim_x for the fused path
@@ -663,7 +486,7 @@ void search_impl(raft::resources const& handle,
                 chunk_index.data(),
                 nullptr,
                 nullptr,
-                filter_adapter,
+                sample_filter,
                 grid_dim_x,
                 stream);
     if (grid_dim_x == 0) {
@@ -717,7 +540,7 @@ void search_impl(raft::resources const& handle,
                 chunk_index.data(),
                 dist_out_ptr,
                 idx_out_ptr,
-                filter_adapter,
+                sample_filter,
                 grid_dim_x,
                 stream);
 
@@ -770,7 +593,7 @@ void search_impl(raft::resources const& handle,
                 chunk_index.data(),
                 all_distances.data(),
                 all_indices.data(),
-                filter_adapter,
+                sample_filter,
                 gdx,
                 stream);
 
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_search_half_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_sq/ivf_sq_search_half_uint8_t_int64_t.cu
deleted file mode 100644
index 40029119b2..0000000000
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_search_half_uint8_t_int64_t.cu
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#include <cuvs/neighbors/ivf_sq.hpp>
-
-#include "ivf_sq_search.cuh"
-
-namespace cuvs::neighbors::ivf_sq {
-
-#define CUVS_INST_IVF_SQ_SEARCH(T, CodeT)                                            \
-  void search(raft::resources const& handle,                                         \
-              const cuvs::neighbors::ivf_sq::search_params& params,                  \
-              const cuvs::neighbors::ivf_sq::index<CodeT>& index,                    \
-              raft::device_matrix_view<const T, int64_t, raft::row_major> queries,   \
-              raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors, \
-              raft::device_matrix_view<float, int64_t, raft::row_major> distances,   \
-              const cuvs::neighbors::filtering::base_filter& sample_filter)          \
-  {                                                                                  \
-    cuvs::neighbors::ivf_sq::detail::search(                                         \
-      handle, params, index, queries, neighbors, distances, sample_filter);          \
-  }
-
-CUVS_INST_IVF_SQ_SEARCH(half, uint8_t);
-
-#undef CUVS_INST_IVF_SQ_SEARCH
-
-}  // namespace cuvs::neighbors::ivf_sq
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_search_float_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_sq/ivf_sq_search_uint8_t_int64_t.cu
similarity index 97%
rename from cpp/src/neighbors/ivf_sq/ivf_sq_search_float_uint8_t_int64_t.cu
rename to cpp/src/neighbors/ivf_sq/ivf_sq_search_uint8_t_int64_t.cu
index de185de8ec..48247e207b 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_search_float_uint8_t_int64_t.cu
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_search_uint8_t_int64_t.cu
@@ -23,6 +23,7 @@ namespace cuvs::neighbors::ivf_sq {
   }
 
 CUVS_INST_IVF_SQ_SEARCH(float, uint8_t);
+CUVS_INST_IVF_SQ_SEARCH(half, uint8_t);
 
 #undef CUVS_INST_IVF_SQ_SEARCH
 

From 55a91bf794dc23f0795b8b2c8d0d7d00008f451c Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Thu, 30 Apr 2026 15:24:20 +0200
Subject: [PATCH 29/43] doc fix + build assert addition

---
 cpp/include/cuvs/neighbors/ivf_sq.hpp     | 14 +++++++++-----
 cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh |  2 ++
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/cpp/include/cuvs/neighbors/ivf_sq.hpp b/cpp/include/cuvs/neighbors/ivf_sq.hpp
index ba4bb39437..b342d0ce50 100644
--- a/cpp/include/cuvs/neighbors/ivf_sq.hpp
+++ b/cpp/include/cuvs/neighbors/ivf_sq.hpp
@@ -130,12 +130,16 @@ using list_data = ivf::list<list_spec, SizeT, CodeT, IdxT>;
  * In the IVF-SQ index, a database vector is first assigned to the nearest cluster center
  * using an inverted file (IVF) structure, and then compressed using scalar quantization (SQ).
  *
- * Scalar quantization independently maps each dimension of the vector to a fixed-width integer
- * code. For 8-bit quantization (uint8_t), each floating-point component is linearly mapped to
- * an integer in [0, 255] using learned per-dimension minimum (`sq_vmin`) and range (`sq_delta`)
- * values:
+ * Scalar quantization independently maps each dimension of the per-cluster residual (the
+ * input vector minus its assigned centroid) to a fixed-width integer code. For 8-bit
+ * quantization (uint8_t), each residual component r_i = x_i - centroid_i is linearly
+ * mapped to an integer in [0, 255] using learned per-dimension minimum (`sq_vmin`) and
+ * step-size (`sq_delta`) values:
  *
- *   code_i = round((x_i - vmin_i) / delta_i * 255)
+ *   code_i = clamp(round((r_i - vmin_i) / delta_i), 0, 255)
+ *
+ * where delta_i is the per-level step size (range divided by 255), so the corresponding
+ * reconstruction is x_i ≈ centroid_i + vmin_i + code_i * delta_i.
  *
  * This provides a compact representation (1 byte per dimension) while preserving the relative
  * distances between vectors with high fidelity, offering a good trade-off between index size,
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
index a4a2063ca5..7574c3d892 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
@@ -548,6 +548,8 @@ inline auto build(
   static_assert(std::is_same_v<T, float> || std::is_same_v<T, half>, "unsupported data type");
   RAFT_EXPECTS(n_rows > 0 && dim > 0, "empty dataset");
   RAFT_EXPECTS(n_rows >= params.n_lists, "number of rows can't be less than n_lists");
+  RAFT_EXPECTS(params.kmeans_trainset_fraction > 0.0 && params.kmeans_trainset_fraction <= 1.0,
+               "kmeans_trainset_fraction must be in (0, 1]");
   RAFT_EXPECTS(params.metric != cuvs::distance::DistanceType::CosineExpanded || dim > 1,
                "Cosine metric requires more than one dim");
 

From 6d5ec7247af304e37d6736052a1883861f54f462 Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Mon, 4 May 2026 10:45:51 +0200
Subject: [PATCH 30/43] Switching to raft::TxN_t

---
 cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh | 91 +++++------------------
 1 file changed, 19 insertions(+), 72 deletions(-)

diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
index 7574c3d892..52be653985 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
@@ -30,6 +30,7 @@
 #include <raft/random/rng.cuh>
 #include <raft/stats/histogram.cuh>
 #include <raft/util/pow2_utils.cuh>
+#include <raft/util/vectorized.cuh>
 
 #include <cub/block/block_reduce.cuh>
 
@@ -58,82 +59,28 @@ struct ColMinMaxOp {
 
 /**
  * Vectorized load helper: reads VecCols contiguous elements of type T as
- * a single aligned wide load and unpacks them into floats.
+ * a single aligned wide load (via raft::TxN_t -> __ldg on the promoted
+ * io_t) and unpacks them into floats.
  *
  * The primary benefit over scalar loads is halving (VecCols=2) or
  * quartering (VecCols=4) the number of LDG instructions issued per warp,
  * which is the dominant cost in the column-strided access pattern of
- * fused_column_minmax_kernel. VecCols=1 is provided as the degenerate
- * scalar fallback for odd `dim`.
+ * fused_column_minmax_kernel. VecCols=1 is the degenerate scalar
+ * fallback for odd `dim`.
  *
- * Requires `p` to be aligned to sizeof(T) * VecCols.
+ * Requires `p` to be aligned to sizeof(raft::IOType<T, VecCols>::Type),
+ * i.e. sizeof(T) * VecCols.
  */
 template <typename T, int VecCols>
-struct vec_loader;
-
-template <>
-struct vec_loader<float, 1> {
-  __device__ __forceinline__ static void load(const float* p, float (&out)[1]) { out[0] = *p; }
-};
-
-template <>
-struct vec_loader<half, 1> {
-  __device__ __forceinline__ static void load(const half* p, float (&out)[1])
-  {
-    out[0] = float(*p);
-  }
-};
-
-template <>
-struct vec_loader<float, 4> {
-  __device__ __forceinline__ static void load(const float* p, float (&out)[4])
-  {
-    float4 v = *reinterpret_cast<const float4*>(p);
-    out[0]   = v.x;
-    out[1]   = v.y;
-    out[2]   = v.z;
-    out[3]   = v.w;
-  }
-};
-
-template <>
-struct vec_loader<float, 2> {
-  __device__ __forceinline__ static void load(const float* p, float (&out)[2])
-  {
-    float2 v = *reinterpret_cast<const float2*>(p);
-    out[0]   = v.x;
-    out[1]   = v.y;
-  }
-};
-
-template <>
-struct vec_loader<half, 4> {
-  __device__ __forceinline__ static void load(const half* p, float (&out)[4])
-  {
-    // Single 8-byte load covering 4 halves; memcpy avoids aliasing issues
-    // and is compiled to a register move in device code.
-    uint2 raw = *reinterpret_cast<const uint2*>(p);
-    half h[4];
-    static_assert(sizeof(h) == sizeof(raw), "unexpected half packing");
-    memcpy(&h[0], &raw, sizeof(raw));
+__device__ __forceinline__ void load_cols_as_float(const T* p, float (&out)[VecCols])
+{
+  raft::TxN_t<T, VecCols> v;
+  v.load(p, 0);
 #pragma unroll
-    for (int k = 0; k < 4; ++k)
-      out[k] = float(h[k]);
-  }
-};
-
-template <>
-struct vec_loader<half, 2> {
-  __device__ __forceinline__ static void load(const half* p, float (&out)[2])
-  {
-    uint32_t raw = *reinterpret_cast<const uint32_t*>(p);
-    half h[2];
-    static_assert(sizeof(h) == sizeof(raw), "unexpected half packing");
-    memcpy(&h[0], &raw, sizeof(raw));
-    out[0] = float(h[0]);
-    out[1] = float(h[1]);
+  for (int k = 0; k < VecCols; ++k) {
+    out[k] = static_cast<float>(v.val.data[k]);
   }
-};
+}
 
 /**
  * Fused per-column min+max in a single pass (2x less DRAM traffic than two
@@ -176,10 +123,10 @@ __launch_bounds__(BlockSize) RAFT_KERNEL fused_column_minmax_kernel(const T* __r
   // the column and row axes.
   for (; row + 3 * stride < n_rows; row += 4 * stride) {
     float r0[VecCols], r1[VecCols], r2[VecCols], r3[VecCols];
-    vec_loader<T, VecCols>::load(data + row * dim + col_base, r0);
-    vec_loader<T, VecCols>::load(data + (row + stride) * dim + col_base, r1);
-    vec_loader<T, VecCols>::load(data + (row + 2 * stride) * dim + col_base, r2);
-    vec_loader<T, VecCols>::load(data + (row + 3 * stride) * dim + col_base, r3);
+    load_cols_as_float<T, VecCols>(data + row * dim + col_base, r0);
+    load_cols_as_float<T, VecCols>(data + (row + stride) * dim + col_base, r1);
+    load_cols_as_float<T, VecCols>(data + (row + 2 * stride) * dim + col_base, r2);
+    load_cols_as_float<T, VecCols>(data + (row + 3 * stride) * dim + col_base, r3);
 #pragma unroll
     for (int k = 0; k < VecCols; ++k) {
       float mn       = fminf(fminf(r0[k], r1[k]), fminf(r2[k], r3[k]));
@@ -190,7 +137,7 @@ __launch_bounds__(BlockSize) RAFT_KERNEL fused_column_minmax_kernel(const T* __r
   }
   for (; row < n_rows; row += stride) {
     float r[VecCols];
-    vec_loader<T, VecCols>::load(data + row * dim + col_base, r);
+    load_cols_as_float<T, VecCols>(data + row * dim + col_base, r);
 #pragma unroll
     for (int k = 0; k < VecCols; ++k) {
       agg[k].min_val = fminf(agg[k].min_val, r[k]);

From 688962424f93ee27587264a04f902a0232af562b Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Tue, 5 May 2026 11:39:27 +0200
Subject: [PATCH 31/43] Dropping the MetricTag template parameter

---
 cpp/CMakeLists.txt                            | 57 ++++++++++++++++-
 .../detail/jit_lto/ivf_sq/scan_fragments.hpp  | 17 ++++-
 .../accumulate_distance_impl.cuh              | 46 ++++++++++++++
 .../accumulate_distance_kernel.cu.in          | 24 +++++++
 .../accumulate_distance_matrix.json           |  3 +
 .../jit_lto_kernels/device_functions.cuh      | 28 +++++++++
 .../finalize_distance_impl.cuh                | 40 ++++++++++++
 .../finalize_distance_kernel.cu.in            | 23 +++++++
 .../finalize_distance_matrix.json             |  3 +
 .../detail/jit_lto_kernels/scan_impl.cuh      | 63 +++++++++----------
 .../detail/jit_lto_kernels/scan_kernel.cu.in  | 42 ++++++-------
 .../detail/jit_lto_kernels/scan_matrix.json   | 11 +++-
 .../detail/jit_lto_kernels/scan_planner.hpp   | 28 ++++++++-
 .../setup_invariant_smem_impl.cuh             | 50 +++++++++++++++
 .../setup_invariant_smem_kernel.cu.in         | 27 ++++++++
 .../setup_invariant_smem_matrix.json          |  3 +
 .../setup_per_probe_smem_impl.cuh             | 55 ++++++++++++++++
 .../setup_per_probe_smem_kernel.cu.in         | 27 ++++++++
 .../setup_per_probe_smem_matrix.json          |  3 +
 cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh    | 12 +++-
 20 files changed, 497 insertions(+), 65 deletions(-)
 create mode 100644 cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/accumulate_distance_impl.cuh
 create mode 100644 cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/accumulate_distance_kernel.cu.in
 create mode 100644 cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/accumulate_distance_matrix.json
 create mode 100644 cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_impl.cuh
 create mode 100644 cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_kernel.cu.in
 create mode 100644 cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_matrix.json
 create mode 100644 cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_invariant_smem_impl.cuh
 create mode 100644 cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_invariant_smem_kernel.cu.in
 create mode 100644 cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_invariant_smem_matrix.json
 create mode 100644 cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_per_probe_smem_impl.cuh
 create mode 100644 cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_per_probe_smem_kernel.cu.in
 create mode 100644 cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_per_probe_smem_matrix.json

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index f957f8b702..65cec8837a 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -490,18 +490,69 @@ if(NOT BUILD_CPU_ONLY)
   set(ivf_sq_ns "cuvs::neighbors::ivf_sq::detail")
   generate_jit_lto_kernels(
     jit_lto_files
-    NAME_FORMAT "ivf_sq_scan_capacity_@capacity@_metric_@metric_name@"
+    NAME_FORMAT "ivf_sq_scan_capacity_@capacity@_@ascending_descending@"
     MATRIX_JSON_FILE
       "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_matrix.json"
     KERNEL_INPUT_FILE
       "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_kernel.cu.in"
-    FRAGMENT_TAG_FORMAT
-      "${ivf_sq_ns}::fragment_tag_ivf_sq_scan<${ivf_sq_ns}::tag_metric_@metric_name@, @capacity@>"
+    FRAGMENT_TAG_FORMAT "${ivf_sq_ns}::fragment_tag_ivf_sq_scan<@capacity@, @ascending_value@>"
     FRAGMENT_TAG_HEADER_FILES "<cuvs/detail/jit_lto/ivf_sq/scan_fragments.hpp>"
                               "<cuvs/detail/jit_lto/common_fragments.hpp>"
     OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_sq/scan"
     KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
   )
+  generate_jit_lto_kernels(
+    jit_lto_files
+    NAME_FORMAT "ivf_sq_setup_invariant_smem_metric_@metric_name@"
+    MATRIX_JSON_FILE
+      "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_invariant_smem_matrix.json"
+    KERNEL_INPUT_FILE
+      "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_invariant_smem_kernel.cu.in"
+    FRAGMENT_TAG_FORMAT
+      "${ivf_sq_ns}::fragment_tag_setup_invariant_smem<${ivf_sq_ns}::tag_metric_@metric_name@>"
+    FRAGMENT_TAG_HEADER_FILES "<cuvs/detail/jit_lto/ivf_sq/scan_fragments.hpp>"
+    OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_sq/setup_invariant_smem"
+    KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
+  )
+  generate_jit_lto_kernels(
+    jit_lto_files
+    NAME_FORMAT "ivf_sq_setup_per_probe_smem_metric_@metric_name@"
+    MATRIX_JSON_FILE
+      "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_per_probe_smem_matrix.json"
+    KERNEL_INPUT_FILE
+      "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_per_probe_smem_kernel.cu.in"
+    FRAGMENT_TAG_FORMAT
+      "${ivf_sq_ns}::fragment_tag_setup_per_probe_smem<${ivf_sq_ns}::tag_metric_@metric_name@>"
+    FRAGMENT_TAG_HEADER_FILES "<cuvs/detail/jit_lto/ivf_sq/scan_fragments.hpp>"
+    OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_sq/setup_per_probe_smem"
+    KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
+  )
+  generate_jit_lto_kernels(
+    jit_lto_files
+    NAME_FORMAT "ivf_sq_accumulate_distance_metric_@metric_name@"
+    MATRIX_JSON_FILE
+      "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_sq/detail/jit_lto_kernels/accumulate_distance_matrix.json"
+    KERNEL_INPUT_FILE
+      "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_sq/detail/jit_lto_kernels/accumulate_distance_kernel.cu.in"
+    FRAGMENT_TAG_FORMAT
+      "${ivf_sq_ns}::fragment_tag_accumulate_distance<${ivf_sq_ns}::tag_metric_@metric_name@>"
+    FRAGMENT_TAG_HEADER_FILES "<cuvs/detail/jit_lto/ivf_sq/scan_fragments.hpp>"
+    OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_sq/accumulate_distance"
+    KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
+  )
+  generate_jit_lto_kernels(
+    jit_lto_files
+    NAME_FORMAT "ivf_sq_finalize_distance_metric_@metric_name@"
+    MATRIX_JSON_FILE
+      "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_matrix.json"
+    KERNEL_INPUT_FILE
+      "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_kernel.cu.in"
+    FRAGMENT_TAG_FORMAT
+      "${ivf_sq_ns}::fragment_tag_finalize_distance<${ivf_sq_ns}::tag_metric_@metric_name@>"
+    FRAGMENT_TAG_HEADER_FILES "<cuvs/detail/jit_lto/ivf_sq/scan_fragments.hpp>"
+    OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_sq/finalize_distance"
+    KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
+  )
   generate_jit_lto_kernels(
     jit_lto_files
     NAME_FORMAT "ivf_sq_filter_@filter_name@"
diff --git a/cpp/include/cuvs/detail/jit_lto/ivf_sq/scan_fragments.hpp b/cpp/include/cuvs/detail/jit_lto/ivf_sq/scan_fragments.hpp
index b20684b11f..c505ed3f8f 100644
--- a/cpp/include/cuvs/detail/jit_lto/ivf_sq/scan_fragments.hpp
+++ b/cpp/include/cuvs/detail/jit_lto/ivf_sq/scan_fragments.hpp
@@ -11,10 +11,25 @@ struct tag_metric_l2 {};
 struct tag_metric_ip {};
 struct tag_metric_cosine {};
 
-template <typename MetricTag, int Capacity>
+// Scan entrypoint fragment. Templated only on (Capacity, Ascending). The
+// metric specialization lives in the four device-function fragments below.
+template <int Capacity, bool Ascending>
 struct fragment_tag_ivf_sq_scan {};
 
 template <typename FilterTag>
 struct fragment_tag_ivf_sq_filter {};
 
+// Metric-specific device-function fragments composed in at JIT-link time.
+template <typename MetricTag>
+struct fragment_tag_setup_invariant_smem {};
+
+template <typename MetricTag>
+struct fragment_tag_setup_per_probe_smem {};
+
+template <typename MetricTag>
+struct fragment_tag_accumulate_distance {};
+
+template <typename MetricTag>
+struct fragment_tag_finalize_distance {};
+
 }  // namespace cuvs::neighbors::ivf_sq::detail
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/accumulate_distance_impl.cuh b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/accumulate_distance_impl.cuh
new file mode 100644
index 0000000000..5368e101cd
--- /dev/null
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/accumulate_distance_impl.cuh
@@ -0,0 +1,46 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <cstdint>
+
+namespace cuvs::neighbors::ivf_sq::detail {
+
+// Per-element distance accumulator. Called inside the unrolled inner loop of
+// ivf_sq_scan_impl. After JIT-LTO inlining, these bodies fold directly into
+// the unrolled loop and the v_norm_sq plumbing is dead-code-eliminated for
+// non-cosine metrics.
+//
+// L2:        diff = qt - code*scale; dist += diff*diff
+// IP:        v    = aux + code*scale; dist += qt*v
+// Cosine:    as IP, plus v_norm_sq += v*v
+
+__device__ void accumulate_distance_l2_impl(
+  float qt, float /* aux */, float scale, uint8_t code, float& dist, float& /* v_norm_sq */)
+{
+  float recon = float(code) * scale;
+  float diff  = qt - recon;
+  dist += diff * diff;
+}
+
+__device__ void accumulate_distance_ip_impl(
+  float qt, float aux, float scale, uint8_t code, float& dist, float& /* v_norm_sq */)
+{
+  float recon = float(code) * scale;
+  float v_d   = aux + recon;
+  dist += qt * v_d;
+}
+
+__device__ void accumulate_distance_cosine_impl(
+  float qt, float aux, float scale, uint8_t code, float& dist, float& v_norm_sq)
+{
+  float recon = float(code) * scale;
+  float v_d   = aux + recon;
+  dist += qt * v_d;
+  v_norm_sq += v_d * v_d;
+}
+
+}  // namespace cuvs::neighbors::ivf_sq::detail
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/accumulate_distance_kernel.cu.in b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/accumulate_distance_kernel.cu.in
new file mode 100644
index 0000000000..7d7e77c85a
--- /dev/null
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/accumulate_distance_kernel.cu.in
@@ -0,0 +1,24 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <neighbors/ivf_sq/detail/jit_lto_kernels/accumulate_distance_impl.cuh>
+#include <neighbors/ivf_sq/detail/jit_lto_kernels/device_functions.cuh>
+
+namespace {
+
+constexpr auto accumulate_distance_impl =
+  cuvs::neighbors::ivf_sq::detail::accumulate_distance_@metric_name@_impl;
+
+}  // namespace
+
+namespace cuvs::neighbors::ivf_sq::detail {
+
+__device__ void accumulate_distance(
+  float qt, float aux, float scale, uint8_t code, float& dist, float& v_norm_sq)
+{
+  accumulate_distance_impl(qt, aux, scale, code, dist, v_norm_sq);
+}
+
+}  // namespace cuvs::neighbors::ivf_sq::detail
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/accumulate_distance_matrix.json b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/accumulate_distance_matrix.json
new file mode 100644
index 0000000000..915ea1284a
--- /dev/null
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/accumulate_distance_matrix.json
@@ -0,0 +1,3 @@
+{
+  "metric_name": ["l2", "ip", "cosine"]
+}
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/device_functions.cuh b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/device_functions.cuh
index 188933d803..ca22c21168 100644
--- a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/device_functions.cuh
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/device_functions.cuh
@@ -22,4 +22,32 @@ __device__ bool sample_filter(const IndexT* const* const inds_ptrs,
                               IndexT bitset_len,
                               IndexT original_nbits);
 
+// Forward declarations of the metric-specific scan device functions. The
+// concrete implementations are provided by JIT-LTO fragments generated from
+// setup_invariant_smem_kernel.cu.in, setup_per_probe_smem_kernel.cu.in,
+// accumulate_distance_kernel.cu.in and finalize_distance_kernel.cu.in. After
+// nvJitLink LTO inlines these into ivf_sq_scan_impl, the codegen matches the
+// pre-refactor `if constexpr (kIsL2 / kIsCosine)` form.
+
+// Phase 1: load the metric-invariant smem array. Called once per query.
+__device__ void setup_invariant_smem(uint32_t dim,
+                                     const float* __restrict__ query,
+                                     const float* __restrict__ sq_vmin,
+                                     float* __restrict__ s_aux,
+                                     float* __restrict__ s_query_term);
+
+// Phase 2: load the per-probe smem array. Called once per (query, probe).
+__device__ void setup_per_probe_smem(uint32_t dim,
+                                     const float* __restrict__ centroid,
+                                     const float* __restrict__ sq_vmin,
+                                     float* __restrict__ s_aux,
+                                     float* __restrict__ s_query_term);
+
+// Per-element distance accumulator. Called inside the unrolled inner loop.
+__device__ void accumulate_distance(
+  float qt, float aux, float scale, uint8_t code, float& dist, float& v_norm_sq);
+
+// Per-row distance finalize. Called once per scanned row.
+__device__ float finalize_distance(float dist, float v_norm_sq, float query_norm);
+
 }  // namespace cuvs::neighbors::ivf_sq::detail
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_impl.cuh b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_impl.cuh
new file mode 100644
index 0000000000..2d31ff863a
--- /dev/null
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_impl.cuh
@@ -0,0 +1,40 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <cstdint>
+
+#include <cuda_runtime.h>
+
+namespace cuvs::neighbors::ivf_sq::detail {
+
+// Per-row distance finalize. Called once per scanned row.
+//
+// L2/IP:  return dist
+// Cosine: denom = query_norm * sqrtf(v_norm_sq);
+//         return denom > 0 ? 1 - dist/denom : 0
+
+__device__ float finalize_distance_l2_impl(float dist,
+                                           float /* v_norm_sq */,
+                                           float /* query_norm */)
+{
+  return dist;
+}
+
+__device__ float finalize_distance_ip_impl(float dist,
+                                           float /* v_norm_sq */,
+                                           float /* query_norm */)
+{
+  return dist;
+}
+
+__device__ float finalize_distance_cosine_impl(float dist, float v_norm_sq, float query_norm)
+{
+  float denom = query_norm * sqrtf(v_norm_sq);
+  return (denom > 0.0f) ? 1.0f - dist / denom : 0.0f;
+}
+
+}  // namespace cuvs::neighbors::ivf_sq::detail
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_kernel.cu.in b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_kernel.cu.in
new file mode 100644
index 0000000000..642e4f1d5d
--- /dev/null
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_kernel.cu.in
@@ -0,0 +1,23 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <neighbors/ivf_sq/detail/jit_lto_kernels/device_functions.cuh>
+#include <neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_impl.cuh>
+
+namespace {
+
+constexpr auto finalize_distance_impl =
+  cuvs::neighbors::ivf_sq::detail::finalize_distance_@metric_name@_impl;
+
+}  // namespace
+
+namespace cuvs::neighbors::ivf_sq::detail {
+
+__device__ float finalize_distance(float dist, float v_norm_sq, float query_norm)
+{
+  return finalize_distance_impl(dist, v_norm_sq, query_norm);
+}
+
+}  // namespace cuvs::neighbors::ivf_sq::detail
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_matrix.json b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_matrix.json
new file mode 100644
index 0000000000..915ea1284a
--- /dev/null
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_matrix.json
@@ -0,0 +1,3 @@
+{
+  "metric_name": ["l2", "ip", "cosine"]
+}
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_impl.cuh b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_impl.cuh
index bc8f7587ee..38bea46eef 100644
--- a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_impl.cuh
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_impl.cuh
@@ -43,6 +43,18 @@ using sq_block_sort_t = typename sq_block_sort<Capacity, Ascending>::type;
 
 // IVF-SQ scan kernel body with fused in-kernel top-k.
 //
+// The kernel is metric-agnostic: the four metric-specific pieces are
+// linked in at runtime via JIT-LTO as fragments declared in
+// device_functions.cuh:
+//   - setup_invariant_smem  (Phase 1, once per query)
+//   - setup_per_probe_smem  (Phase 2, once per probe)
+//   - accumulate_distance   (per-element, inner unrolled loop)
+//   - finalize_distance     (per-row, after the dim accumulation)
+//
+// The host launcher (ivf_sq_search.cuh) derives Ascending from the metric
+// (Ascending = !is_ip) and registers the matching metric variant of each
+// fragment with the planner.
+//
 // Grid layout:
 //   kManageLocalTopK (Capacity > 0):
 //     grid (grid_dim_x, n_queries) - each block loops over probes
@@ -67,7 +79,7 @@ using sq_block_sort_t = typename sq_block_sort<Capacity, Ascending>::type;
 //     Reconstructed vector component: s_aux[d] + code*s_sq_scale[d].
 //
 //   After all probes are scanned, the smem is reused for block_sort merge.
-template <int Capacity, typename MetricTag>
+template <int Capacity, bool Ascending>
 __device__ __forceinline__ void ivf_sq_scan_impl(const uint8_t* const* data_ptrs,
                                                  const uint32_t* list_sizes,
                                                  const uint32_t* coarse_indices,
@@ -93,10 +105,6 @@ __device__ __forceinline__ void ivf_sq_scan_impl(const uint8_t* const* data_ptrs
 
   constexpr int BlockDim          = kSqScanThreads;
   constexpr bool kManageLocalTopK = (Capacity > 0);
-  constexpr bool kIsL2            = std::is_same_v<MetricTag, tag_metric_l2>;
-  constexpr bool kIsCosine        = std::is_same_v<MetricTag, tag_metric_cosine>;
-  constexpr bool kIsIP            = std::is_same_v<MetricTag, tag_metric_ip>;
-  constexpr bool kAscending       = !kIsIP;
 
   extern __shared__ __align__(256) uint8_t smem_buf[];
   float* smem = reinterpret_cast<float*>(smem_buf);
@@ -108,6 +116,11 @@ __device__ __forceinline__ void ivf_sq_scan_impl(const uint8_t* const* data_ptrs
   const uint32_t query_ix = blockIdx.y;
   const float* query      = queries_float + query_ix * dim;
 
+  // Hoist the per-query scalar load to a uniform value. The cosine fragment of
+  // finalize_distance is the only consumer; for L2/IP, the unused argument is
+  // dead-code-eliminated after JIT-LTO inlining and this load disappears.
+  const float q_norm = (query_norms != nullptr) ? query_norms[query_ix] : 0.0f;
+
   if constexpr (kManageLocalTopK) {
     out_distances += uint64_t(query_ix) * k * gridDim.x + blockIdx.x * k;
     out_indices += uint64_t(query_ix) * k * gridDim.x + blockIdx.x * k;
@@ -116,15 +129,11 @@ __device__ __forceinline__ void ivf_sq_scan_impl(const uint8_t* const* data_ptrs
   // Phase 1: load shared memory that is invariant across probes.
   for (uint32_t d = threadIdx.x; d < dim; d += BlockDim) {
     s_sq_scale[d] = sq_delta[d];
-    if constexpr (kIsL2) {
-      s_aux[d] = query[d] - sq_vmin[d];
-    } else {
-      s_query_term[d] = query[d];
-    }
   }
+  setup_invariant_smem(dim, query, sq_vmin, s_aux, s_query_term);
   __syncthreads();
 
-  using local_topk_t = sq_block_sort_t<Capacity, kAscending>;
+  using local_topk_t = sq_block_sort_t<Capacity, Ascending>;
   local_topk_t queue(k);
 
   const uint32_t* my_coarse = coarse_indices + query_ix * n_probes;
@@ -154,16 +163,7 @@ __device__ __forceinline__ void ivf_sq_scan_impl(const uint8_t* const* data_ptrs
     const uint32_t cluster_id = my_coarse[probe_ix];
     const uint32_t cluster_sz = list_sizes[cluster_id];
 
-    {
-      const float* centroid = centers + cluster_id * dim;
-      for (uint32_t d = threadIdx.x; d < dim; d += BlockDim) {
-        if constexpr (kIsL2) {
-          s_query_term[d] = s_aux[d] - centroid[d];
-        } else {
-          s_aux[d] = centroid[d] + sq_vmin[d];
-        }
-      }
-    }
+    setup_per_probe_smem(dim, centers + cluster_id * dim, sq_vmin, s_aux, s_query_term);
     __syncthreads();  // (b)
 
     if (cluster_sz == 0) {
@@ -198,24 +198,17 @@ __device__ __forceinline__ void ivf_sq_scan_impl(const uint8_t* const* data_ptrs
 #pragma unroll
         for (uint32_t j = 0; j < veclen; j++) {
           if (l + j < dim) {
-            float recon = float(codes_local[j]) * s_sq_scale[l + j];
-
-            if constexpr (kIsL2) {
-              float diff = s_query_term[l + j] - recon;
-              dist += diff * diff;
-            } else {
-              float v_d = s_aux[l + j] + recon;
-              dist += s_query_term[l + j] * v_d;
-              if constexpr (kIsCosine) { v_norm_sq += v_d * v_d; }
-            }
+            accumulate_distance(s_query_term[l + j],
+                                s_aux[l + j],
+                                s_sq_scale[l + j],
+                                codes_local[j],
+                                dist,
+                                v_norm_sq);
           }
         }
       }
 
-      if constexpr (kIsCosine) {
-        float denom = query_norms[query_ix] * sqrtf(v_norm_sq);
-        dist        = (denom > 0.0f) ? 1.0f - dist / denom : 0.0f;
-      }
+      dist = finalize_distance(dist, v_norm_sq, q_norm);
 
       if constexpr (kManageLocalTopK) {
         float val = valid ? dist : local_topk_t::queue_t::kDummy;
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_kernel.cu.in b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_kernel.cu.in
index 3c59f91764..252ad98ed3 100644
--- a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_kernel.cu.in
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_kernel.cu.in
@@ -8,8 +8,8 @@
 
 namespace {
 
-using metric_tag       = cuvs::neighbors::ivf_sq::detail::tag_metric_@metric_name@;
-constexpr int capacity = @capacity@;
+constexpr int capacity   = @capacity@;
+constexpr bool ascending = @ascending_value@;
 
 }  // namespace
 
@@ -36,25 +36,25 @@ extern "C" __global__ __launch_bounds__(kSqScanThreads) void ivf_sq_scan(
   int64_t bitset_len,
   int64_t original_nbits)
 {
-  ivf_sq_scan_impl<capacity, metric_tag>(data_ptrs,
-                                         list_sizes,
-                                         coarse_indices,
-                                         queries_float,
-                                         centers,
-                                         sq_vmin,
-                                         sq_delta,
-                                         query_norms,
-                                         n_probes,
-                                         dim,
-                                         k,
-                                         max_samples,
-                                         chunk_indices,
-                                         out_distances,
-                                         out_indices,
-                                         inds_ptrs,
-                                         bitset_ptr,
-                                         bitset_len,
-                                         original_nbits);
+  ivf_sq_scan_impl<capacity, ascending>(data_ptrs,
+                                        list_sizes,
+                                        coarse_indices,
+                                        queries_float,
+                                        centers,
+                                        sq_vmin,
+                                        sq_delta,
+                                        query_norms,
+                                        n_probes,
+                                        dim,
+                                        k,
+                                        max_samples,
+                                        chunk_indices,
+                                        out_distances,
+                                        out_indices,
+                                        inds_ptrs,
+                                        bitset_ptr,
+                                        bitset_len,
+                                        original_nbits);
 }
 
 static_assert(std::is_same_v<decltype(ivf_sq_scan), ivf_sq_scan_func_t<int64_t>>);
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_matrix.json b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_matrix.json
index fac4154c18..79250fea1b 100644
--- a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_matrix.json
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_matrix.json
@@ -2,7 +2,14 @@
   "capacity": [
     "0", "32", "64", "128", "256"
   ],
-  "metric_name": [
-    "l2", "ip", "cosine"
+  "_ascending": [
+    {
+      "ascending_descending": "ascending",
+      "ascending_value": "true"
+    },
+    {
+      "ascending_descending": "descending",
+      "ascending_value": "false"
+    }
   ]
 }
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_planner.hpp b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_planner.hpp
index ae157e3994..da0fc1ce23 100644
--- a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_planner.hpp
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_planner.hpp
@@ -18,10 +18,10 @@ struct IvfSqScanPlanner : AlgorithmPlanner {
 
   IvfSqScanPlanner() : AlgorithmPlanner("ivf_sq_scan", launcher_jit_cache) {}
 
-  template <typename MetricTag, int Capacity>
+  template <int Capacity, bool Ascending>
   void add_entrypoint()
   {
-    this->add_static_fragment<fragment_tag_ivf_sq_scan<MetricTag, Capacity>>();
+    this->add_static_fragment<fragment_tag_ivf_sq_scan<Capacity, Ascending>>();
   }
 
   template <typename FilterTag>
@@ -33,6 +33,30 @@ struct IvfSqScanPlanner : AlgorithmPlanner {
                                                           cuvs::neighbors::detail::tag_index_i64,
                                                           FilterTag>>();
   }
+
+  template <typename MetricTag>
+  void add_setup_invariant_smem_function()
+  {
+    this->add_static_fragment<fragment_tag_setup_invariant_smem<MetricTag>>();
+  }
+
+  template <typename MetricTag>
+  void add_setup_per_probe_smem_function()
+  {
+    this->add_static_fragment<fragment_tag_setup_per_probe_smem<MetricTag>>();
+  }
+
+  template <typename MetricTag>
+  void add_accumulate_distance_function()
+  {
+    this->add_static_fragment<fragment_tag_accumulate_distance<MetricTag>>();
+  }
+
+  template <typename MetricTag>
+  void add_finalize_distance_function()
+  {
+    this->add_static_fragment<fragment_tag_finalize_distance<MetricTag>>();
+  }
 };
 
 }  // namespace cuvs::neighbors::ivf_sq::detail
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_invariant_smem_impl.cuh b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_invariant_smem_impl.cuh
new file mode 100644
index 0000000000..b0e2d5ea9e
--- /dev/null
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_invariant_smem_impl.cuh
@@ -0,0 +1,50 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <cstdint>
+
+namespace cuvs::neighbors::ivf_sq::detail {
+
+// Phase 1 invariant smem setup. Called once per query.
+//
+// Loads the metric-invariant smem array. The metric-variant array is left
+// untouched and will be filled per-probe by setup_per_probe_smem.
+
+__device__ void setup_invariant_smem_l2_impl(uint32_t dim,
+                                             const float* __restrict__ query,
+                                             const float* __restrict__ sq_vmin,
+                                             float* __restrict__ s_aux,
+                                             float* __restrict__ /* s_query_term */)
+{
+  for (uint32_t d = threadIdx.x; d < dim; d += blockDim.x) {
+    s_aux[d] = query[d] - sq_vmin[d];
+  }
+}
+
+__device__ void setup_invariant_smem_ip_impl(uint32_t dim,
+                                             const float* __restrict__ query,
+                                             const float* __restrict__ /* sq_vmin */,
+                                             float* __restrict__ /* s_aux */,
+                                             float* __restrict__ s_query_term)
+{
+  for (uint32_t d = threadIdx.x; d < dim; d += blockDim.x) {
+    s_query_term[d] = query[d];
+  }
+}
+
+__device__ void setup_invariant_smem_cosine_impl(uint32_t dim,
+                                                 const float* __restrict__ query,
+                                                 const float* __restrict__ /* sq_vmin */,
+                                                 float* __restrict__ /* s_aux */,
+                                                 float* __restrict__ s_query_term)
+{
+  for (uint32_t d = threadIdx.x; d < dim; d += blockDim.x) {
+    s_query_term[d] = query[d];
+  }
+}
+
+}  // namespace cuvs::neighbors::ivf_sq::detail
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_invariant_smem_kernel.cu.in b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_invariant_smem_kernel.cu.in
new file mode 100644
index 0000000000..43a31f7fc4
--- /dev/null
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_invariant_smem_kernel.cu.in
@@ -0,0 +1,27 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <neighbors/ivf_sq/detail/jit_lto_kernels/device_functions.cuh>
+#include <neighbors/ivf_sq/detail/jit_lto_kernels/setup_invariant_smem_impl.cuh>
+
+namespace {
+
+constexpr auto setup_invariant_smem_impl =
+  cuvs::neighbors::ivf_sq::detail::setup_invariant_smem_@metric_name@_impl;
+
+}  // namespace
+
+namespace cuvs::neighbors::ivf_sq::detail {
+
+__device__ void setup_invariant_smem(uint32_t dim,
+                                     const float* __restrict__ query,
+                                     const float* __restrict__ sq_vmin,
+                                     float* __restrict__ s_aux,
+                                     float* __restrict__ s_query_term)
+{
+  setup_invariant_smem_impl(dim, query, sq_vmin, s_aux, s_query_term);
+}
+
+}  // namespace cuvs::neighbors::ivf_sq::detail
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_invariant_smem_matrix.json b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_invariant_smem_matrix.json
new file mode 100644
index 0000000000..915ea1284a
--- /dev/null
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_invariant_smem_matrix.json
@@ -0,0 +1,3 @@
+{
+  "metric_name": ["l2", "ip", "cosine"]
+}
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_per_probe_smem_impl.cuh b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_per_probe_smem_impl.cuh
new file mode 100644
index 0000000000..6e0b19a1f9
--- /dev/null
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_per_probe_smem_impl.cuh
@@ -0,0 +1,55 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <cstdint>
+
+namespace cuvs::neighbors::ivf_sq::detail {
+
+// Phase 2 per-probe smem setup. Called once per (query, probe).
+//
+// Loads the per-probe smem array using the invariant array filled in Phase 1.
+//
+// L2:        s_query_term[d] = s_aux[d]   - centroid[d]   (where s_aux holds query - sq_vmin)
+// IP/Cosine: s_aux[d]        = centroid[d] + sq_vmin[d]
+//
+// The "in" and "out" smem arrays do not alias — they are distinct regions of
+// the kernel's smem layout — so __restrict__ is safe.
+
+__device__ void setup_per_probe_smem_l2_impl(uint32_t dim,
+                                             const float* __restrict__ centroid,
+                                             const float* __restrict__ /* sq_vmin */,
+                                             float* __restrict__ s_aux,
+                                             float* __restrict__ s_query_term)
+{
+  for (uint32_t d = threadIdx.x; d < dim; d += blockDim.x) {
+    s_query_term[d] = s_aux[d] - centroid[d];
+  }
+}
+
+__device__ void setup_per_probe_smem_ip_impl(uint32_t dim,
+                                             const float* __restrict__ centroid,
+                                             const float* __restrict__ sq_vmin,
+                                             float* __restrict__ s_aux,
+                                             float* __restrict__ /* s_query_term */)
+{
+  for (uint32_t d = threadIdx.x; d < dim; d += blockDim.x) {
+    s_aux[d] = centroid[d] + sq_vmin[d];
+  }
+}
+
+__device__ void setup_per_probe_smem_cosine_impl(uint32_t dim,
+                                                 const float* __restrict__ centroid,
+                                                 const float* __restrict__ sq_vmin,
+                                                 float* __restrict__ s_aux,
+                                                 float* __restrict__ /* s_query_term */)
+{
+  for (uint32_t d = threadIdx.x; d < dim; d += blockDim.x) {
+    s_aux[d] = centroid[d] + sq_vmin[d];
+  }
+}
+
+}  // namespace cuvs::neighbors::ivf_sq::detail
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_per_probe_smem_kernel.cu.in b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_per_probe_smem_kernel.cu.in
new file mode 100644
index 0000000000..81d206f78d
--- /dev/null
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_per_probe_smem_kernel.cu.in
@@ -0,0 +1,27 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <neighbors/ivf_sq/detail/jit_lto_kernels/device_functions.cuh>
+#include <neighbors/ivf_sq/detail/jit_lto_kernels/setup_per_probe_smem_impl.cuh>
+
+namespace {
+
+constexpr auto setup_per_probe_smem_impl =
+  cuvs::neighbors::ivf_sq::detail::setup_per_probe_smem_@metric_name@_impl;
+
+}  // namespace
+
+namespace cuvs::neighbors::ivf_sq::detail {
+
+__device__ void setup_per_probe_smem(uint32_t dim,
+                                     const float* __restrict__ centroid,
+                                     const float* __restrict__ sq_vmin,
+                                     float* __restrict__ s_aux,
+                                     float* __restrict__ s_query_term)
+{
+  setup_per_probe_smem_impl(dim, centroid, sq_vmin, s_aux, s_query_term);
+}
+
+}  // namespace cuvs::neighbors::ivf_sq::detail
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_per_probe_smem_matrix.json b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_per_probe_smem_matrix.json
new file mode 100644
index 0000000000..915ea1284a
--- /dev/null
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_per_probe_smem_matrix.json
@@ -0,0 +1,3 @@
+{
+  "metric_name": ["l2", "ip", "cosine"]
+}
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
index bbe6f05df1..d4171a0fcc 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
@@ -155,8 +155,18 @@ void launch_kernel(const index<CodeT>& idx,
   constexpr int kThreads          = kSqScanThreads;
   uint32_t dim                    = idx.dim();
 
+  // The scan entrypoint is templated only on (Capacity, Ascending). Metric
+  // specialization is composed in at link time via four device-function
+  // fragments (setup_invariant_smem, setup_per_probe_smem,
+  // accumulate_distance, finalize_distance).
+  constexpr bool kAscending = !std::is_same_v<MetricTag, tag_metric_ip>;
+
   IvfSqScanPlanner kernel_planner;
-  kernel_planner.add_entrypoint<MetricTag, Capacity>();
+  kernel_planner.add_entrypoint<Capacity, kAscending>();
+  kernel_planner.add_setup_invariant_smem_function<MetricTag>();
+  kernel_planner.add_setup_per_probe_smem_function<MetricTag>();
+  kernel_planner.add_accumulate_distance_function<MetricTag>();
+  kernel_planner.add_finalize_distance_function<MetricTag>();
   kernel_planner.add_filter_device_function<IvfSampleFilterTag>();
   auto kernel_launcher = kernel_planner.get_launcher();
 

From df55c51b3cf74cb460d0e0d954c68c00269caebd Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Tue, 5 May 2026 12:01:26 +0200
Subject: [PATCH 32/43] Inner product trick

---
 cpp/CMakeLists.txt                            |  4 +-
 .../detail/jit_lto/ivf_sq/scan_fragments.hpp  |  8 ++--
 .../finalize_distance_impl.cuh                | 13 ++++--
 .../detail/jit_lto_kernels/scan_impl.cuh      | 33 +++++++++------
 .../detail/jit_lto_kernels/scan_kernel.cu.in  | 41 +++++++++----------
 .../detail/jit_lto_kernels/scan_matrix.json   | 10 -----
 .../detail/jit_lto_kernels/scan_planner.hpp   |  4 +-
 cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh    | 38 +++++++++--------
 8 files changed, 80 insertions(+), 71 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 65cec8837a..f501f5a39d 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -490,12 +490,12 @@ if(NOT BUILD_CPU_ONLY)
   set(ivf_sq_ns "cuvs::neighbors::ivf_sq::detail")
   generate_jit_lto_kernels(
     jit_lto_files
-    NAME_FORMAT "ivf_sq_scan_capacity_@capacity@_@ascending_descending@"
+    NAME_FORMAT "ivf_sq_scan_capacity_@capacity@"
     MATRIX_JSON_FILE
       "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_matrix.json"
     KERNEL_INPUT_FILE
       "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_kernel.cu.in"
-    FRAGMENT_TAG_FORMAT "${ivf_sq_ns}::fragment_tag_ivf_sq_scan<@capacity@, @ascending_value@>"
+    FRAGMENT_TAG_FORMAT "${ivf_sq_ns}::fragment_tag_ivf_sq_scan<@capacity@>"
     FRAGMENT_TAG_HEADER_FILES "<cuvs/detail/jit_lto/ivf_sq/scan_fragments.hpp>"
                               "<cuvs/detail/jit_lto/common_fragments.hpp>"
     OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_sq/scan"
diff --git a/cpp/include/cuvs/detail/jit_lto/ivf_sq/scan_fragments.hpp b/cpp/include/cuvs/detail/jit_lto/ivf_sq/scan_fragments.hpp
index c505ed3f8f..d16d47d653 100644
--- a/cpp/include/cuvs/detail/jit_lto/ivf_sq/scan_fragments.hpp
+++ b/cpp/include/cuvs/detail/jit_lto/ivf_sq/scan_fragments.hpp
@@ -11,9 +11,11 @@ struct tag_metric_l2 {};
 struct tag_metric_ip {};
 struct tag_metric_cosine {};
 
-// Scan entrypoint fragment. Templated only on (Capacity, Ascending). The
-// metric specialization lives in the four device-function fragments below.
-template <int Capacity, bool Ascending>
+// Scan entrypoint fragment. Templated only on Capacity. The warpsort runs
+// ascending for all metrics; metric specialization lives in the four
+// device-function fragments below (finalize_distance is responsible for
+// returning a min-close value).
+template <int Capacity>
 struct fragment_tag_ivf_sq_scan {};
 
 template <typename FilterTag>
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_impl.cuh b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_impl.cuh
index 2d31ff863a..4276f80dd7 100644
--- a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_impl.cuh
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_impl.cuh
@@ -13,9 +13,14 @@ namespace cuvs::neighbors::ivf_sq::detail {
 
 // Per-row distance finalize. Called once per scanned row.
 //
-// L2/IP:  return dist
-// Cosine: denom = query_norm * sqrtf(v_norm_sq);
-//         return denom > 0 ? 1 - dist/denom : 0
+// All metrics return a min-close value so the warpsort epilogue and the
+// host-side select_k can be hardcoded to ascending / select-min:
+//   L2:     return dist                              (squared L2 is min-close)
+//   IP:     return -dist                             (negate so min-close;
+//                                                     postprocess_distances
+//                                                     undoes the sign)
+//   Cosine: denom = query_norm * sqrtf(v_norm_sq);
+//           return denom > 0 ? 1 - dist/denom : 0    (cosine distance is min-close)
 
 __device__ float finalize_distance_l2_impl(float dist,
                                            float /* v_norm_sq */,
@@ -28,7 +33,7 @@ __device__ float finalize_distance_ip_impl(float dist,
                                            float /* v_norm_sq */,
                                            float /* query_norm */)
 {
-  return dist;
+  return -dist;
 }
 
 __device__ float finalize_distance_cosine_impl(float dist, float v_norm_sq, float query_norm)
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_impl.cuh b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_impl.cuh
index 38bea46eef..85cb9605be 100644
--- a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_impl.cuh
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_impl.cuh
@@ -23,23 +23,27 @@ namespace cuvs::neighbors::ivf_sq::detail {
 // block_sort type selection: dispatch the dummy block sort when Capacity == 0
 // so the same impl body works for both the fused top-k path (Capacity > 0) and
 // the materialize-all path (Capacity == 0).
-template <int Capacity, bool Ascending>
+//
+// All metrics are min-close after finalize_distance (IP is negated; cosine
+// returns 1 - cos_sim; L2 is squared L2), so the warpsort is hardcoded to
+// ascending.
+template <int Capacity>
 struct sq_block_sort {
   using type = raft::matrix::detail::select::warpsort::block_sort<
     raft::matrix::detail::select::warpsort::warp_sort_filtered,
     Capacity,
-    Ascending,
+    /*Ascending=*/true,
     float,
     uint32_t>;
 };
 
-template <bool Ascending>
-struct sq_block_sort<0, Ascending> {
-  using type = ivf::detail::dummy_block_sort_t<float, uint32_t, Ascending>;
+template <>
+struct sq_block_sort<0> {
+  using type = ivf::detail::dummy_block_sort_t<float, uint32_t, /*Ascending=*/true>;
 };
 
-template <int Capacity, bool Ascending>
-using sq_block_sort_t = typename sq_block_sort<Capacity, Ascending>::type;
+template <int Capacity>
+using sq_block_sort_t = typename sq_block_sort<Capacity>::type;
 
 // IVF-SQ scan kernel body with fused in-kernel top-k.
 //
@@ -49,11 +53,14 @@ using sq_block_sort_t = typename sq_block_sort<Capacity, Ascending>::type;
 //   - setup_invariant_smem  (Phase 1, once per query)
 //   - setup_per_probe_smem  (Phase 2, once per probe)
 //   - accumulate_distance   (per-element, inner unrolled loop)
-//   - finalize_distance     (per-row, after the dim accumulation)
+//   - finalize_distance     (per-row, after the dim accumulation; ensures
+//                            all metrics are min-close so the warpsort
+//                            epilogue and the host select_k can be
+//                            hardcoded to ascending / select-min)
 //
-// The host launcher (ivf_sq_search.cuh) derives Ascending from the metric
-// (Ascending = !is_ip) and registers the matching metric variant of each
-// fragment with the planner.
+// The host launcher (ivf_sq_search.cuh) registers the matching metric
+// variant of each fragment with the planner. IP scores are negated by
+// finalize_distance and undone by postprocess_distances.
 //
 // Grid layout:
 //   kManageLocalTopK (Capacity > 0):
@@ -79,7 +86,7 @@ using sq_block_sort_t = typename sq_block_sort<Capacity, Ascending>::type;
 //     Reconstructed vector component: s_aux[d] + code*s_sq_scale[d].
 //
 //   After all probes are scanned, the smem is reused for block_sort merge.
-template <int Capacity, bool Ascending>
+template <int Capacity>
 __device__ __forceinline__ void ivf_sq_scan_impl(const uint8_t* const* data_ptrs,
                                                  const uint32_t* list_sizes,
                                                  const uint32_t* coarse_indices,
@@ -133,7 +140,7 @@ __device__ __forceinline__ void ivf_sq_scan_impl(const uint8_t* const* data_ptrs
   setup_invariant_smem(dim, query, sq_vmin, s_aux, s_query_term);
   __syncthreads();
 
-  using local_topk_t = sq_block_sort_t<Capacity, Ascending>;
+  using local_topk_t = sq_block_sort_t<Capacity>;
   local_topk_t queue(k);
 
   const uint32_t* my_coarse = coarse_indices + query_ix * n_probes;
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_kernel.cu.in b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_kernel.cu.in
index 252ad98ed3..d48c4d259f 100644
--- a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_kernel.cu.in
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_kernel.cu.in
@@ -8,8 +8,7 @@
 
 namespace {
 
-constexpr int capacity   = @capacity@;
-constexpr bool ascending = @ascending_value@;
+constexpr int capacity = @capacity@;
 
 }  // namespace
 
@@ -36,25 +35,25 @@ extern "C" __global__ __launch_bounds__(kSqScanThreads) void ivf_sq_scan(
   int64_t bitset_len,
   int64_t original_nbits)
 {
-  ivf_sq_scan_impl<capacity, ascending>(data_ptrs,
-                                        list_sizes,
-                                        coarse_indices,
-                                        queries_float,
-                                        centers,
-                                        sq_vmin,
-                                        sq_delta,
-                                        query_norms,
-                                        n_probes,
-                                        dim,
-                                        k,
-                                        max_samples,
-                                        chunk_indices,
-                                        out_distances,
-                                        out_indices,
-                                        inds_ptrs,
-                                        bitset_ptr,
-                                        bitset_len,
-                                        original_nbits);
+  ivf_sq_scan_impl<capacity>(data_ptrs,
+                             list_sizes,
+                             coarse_indices,
+                             queries_float,
+                             centers,
+                             sq_vmin,
+                             sq_delta,
+                             query_norms,
+                             n_probes,
+                             dim,
+                             k,
+                             max_samples,
+                             chunk_indices,
+                             out_distances,
+                             out_indices,
+                             inds_ptrs,
+                             bitset_ptr,
+                             bitset_len,
+                             original_nbits);
 }
 
 static_assert(std::is_same_v<decltype(ivf_sq_scan), ivf_sq_scan_func_t<int64_t>>);
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_matrix.json b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_matrix.json
index 79250fea1b..4af912b122 100644
--- a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_matrix.json
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_matrix.json
@@ -1,15 +1,5 @@
 {
   "capacity": [
     "0", "32", "64", "128", "256"
-  ],
-  "_ascending": [
-    {
-      "ascending_descending": "ascending",
-      "ascending_value": "true"
-    },
-    {
-      "ascending_descending": "descending",
-      "ascending_value": "false"
-    }
   ]
 }
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_planner.hpp b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_planner.hpp
index da0fc1ce23..05ea34532e 100644
--- a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_planner.hpp
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_planner.hpp
@@ -18,10 +18,10 @@ struct IvfSqScanPlanner : AlgorithmPlanner {
 
   IvfSqScanPlanner() : AlgorithmPlanner("ivf_sq_scan", launcher_jit_cache) {}
 
-  template <int Capacity, bool Ascending>
+  template <int Capacity>
   void add_entrypoint()
   {
-    this->add_static_fragment<fragment_tag_ivf_sq_scan<Capacity, Ascending>>();
+    this->add_static_fragment<fragment_tag_ivf_sq_scan<Capacity>>();
   }
 
   template <typename FilterTag>
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
index d4171a0fcc..86a3ee8a1f 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
@@ -155,14 +155,12 @@ void launch_kernel(const index<CodeT>& idx,
   constexpr int kThreads          = kSqScanThreads;
   uint32_t dim                    = idx.dim();
 
-  // The scan entrypoint is templated only on (Capacity, Ascending). Metric
-  // specialization is composed in at link time via four device-function
-  // fragments (setup_invariant_smem, setup_per_probe_smem,
-  // accumulate_distance, finalize_distance).
-  constexpr bool kAscending = !std::is_same_v<MetricTag, tag_metric_ip>;
-
+  // The scan entrypoint is templated only on Capacity. The warpsort runs
+  // ascending for all metrics; metric specialization is composed in at link
+  // time via four device-function fragments (setup_invariant_smem,
+  // setup_per_probe_smem, accumulate_distance, finalize_distance).
   IvfSqScanPlanner kernel_planner;
-  kernel_planner.add_entrypoint<Capacity, kAscending>();
+  kernel_planner.add_entrypoint<Capacity>();
   kernel_planner.add_setup_invariant_smem_function<MetricTag>();
   kernel_planner.add_setup_per_probe_smem_function<MetricTag>();
   kernel_planner.add_accumulate_distance_function<MetricTag>();
@@ -348,7 +346,6 @@ void search_impl(raft::resources const& handle,
                  uint32_t n_queries,
                  uint32_t k,
                  uint32_t n_probes,
-                 bool select_min,
                  int64_t* neighbors,
                  float* distances,
                  rmm::device_async_resource_ref search_mr,
@@ -357,6 +354,13 @@ void search_impl(raft::resources const& handle,
   auto stream = raft::resource::get_cuda_stream(handle);
   auto dim    = index.dim();
 
+  // The scan kernel emits min-close distances for every metric (IP scores are
+  // negated by finalize_distance and undone by postprocess_distances below),
+  // so all scan-related top-k selections use select_min = true. Only the
+  // coarse search, which runs on the raw GEMM output, follows the metric's
+  // native direction.
+  const bool coarse_select_min = cuvs::distance::is_min_close(index.metric());
+
   std::size_t n_queries_probes = std::size_t(n_queries) * std::size_t(n_probes);
 
   bool needs_query_norms = index.metric() != cuvs::distance::DistanceType::InnerProduct;
@@ -467,7 +471,7 @@ void search_impl(raft::resources const& handle,
     raft::make_device_matrix_view<float, int64_t>(coarse_distances_dev.data(), n_queries, n_probes),
     raft::make_device_matrix_view<uint32_t, int64_t>(
       coarse_indices_dev.data(), n_queries, n_probes),
-    select_min);
+    coarse_select_min);
 
   rmm::device_uvector<uint32_t> num_samples(n_queries, stream, search_mr);
   rmm::device_uvector<uint32_t> chunk_index(n_queries_probes, stream, search_mr);
@@ -563,7 +567,7 @@ void search_impl(raft::resources const& handle,
         raft::make_device_matrix_view<const uint32_t, int64_t>(indices_tmp.data(), n_queries, cols),
         raft::make_device_matrix_view<float, int64_t>(distances, n_queries, k),
         raft::make_device_matrix_view<uint32_t, int64_t>(neighbors_uint32_ptr, n_queries, k),
-        select_min);
+        /*select_min=*/true);
     }
   } else {
     // --- Fallback: materialize all distances ---
@@ -577,12 +581,10 @@ void search_impl(raft::resources const& handle,
     rmm::device_uvector<uint32_t> all_indices(
       std::size_t(n_queries) * max_samples, stream, search_mr);
 
-    float init_val =
-      select_min ? std::numeric_limits<float>::max() : std::numeric_limits<float>::lowest();
     thrust::fill_n(raft::resource::get_thrust_policy(handle),
                    all_distances.data(),
                    std::size_t(n_queries) * max_samples,
-                   init_val);
+                   std::numeric_limits<float>::max());
     thrust::fill_n(raft::resource::get_thrust_policy(handle),
                    all_indices.data(),
                    std::size_t(n_queries) * max_samples,
@@ -618,14 +620,19 @@ void search_impl(raft::resources const& handle,
         all_indices.data(), n_queries, max_samples),
       raft::make_device_matrix_view<float, int64_t>(distances, n_queries, k),
       raft::make_device_matrix_view<uint32_t, int64_t>(neighbors_uint32_ptr, n_queries, k),
-      select_min,
+      /*select_min=*/true,
       false,
       cuvs::selection::SelectAlgo::kAuto,
       num_samples_view);
   }
 
+  // For IP, the kernel returned negated scores so the warpsort could run
+  // ascending; postprocess_distances multiplies by -1 to restore the public
+  // contract (positive inner products). Cosine distance is already min-close
+  // so it must not be flipped (account_for_max_close = false for cosine).
+  const bool account_for_max_close = index.metric() != cuvs::distance::DistanceType::CosineExpanded;
   ivf::detail::postprocess_distances(
-    handle, distances, distances, index.metric(), n_queries, k, 1.0, false);
+    handle, distances, distances, index.metric(), n_queries, k, 1.0, account_for_max_close);
 
   ivf::detail::postprocess_neighbors(neighbors,
                                      neighbors_uint32_ptr,
@@ -707,7 +714,6 @@ inline void search_with_filtering(raft::resources const& handle,
                                             queries_batch,
                                             k,
                                             n_probes,
-                                            cuvs::distance::is_min_close(index.metric()),
                                             neighbors + std::size_t(offset_q) * k,
                                             distances + std::size_t(offset_q) * k,
                                             raft::resource::get_workspace_resource_ref(handle),

From c5948a2755d5b0b88c64549ce47547b3bd2db518 Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Tue, 5 May 2026 12:59:36 +0200
Subject: [PATCH 33/43] Fix + minor cleanups

---
 cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh | 64 ++++++++++++----------
 1 file changed, 36 insertions(+), 28 deletions(-)

diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
index 86a3ee8a1f..6bc64c1569 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
@@ -365,6 +365,9 @@ void search_impl(raft::resources const& handle,
 
   bool needs_query_norms = index.metric() != cuvs::distance::DistanceType::InnerProduct;
   rmm::device_uvector<float> query_norm_dev(needs_query_norms ? n_queries : 0, stream, search_mr);
+  // Pass nullptr to the kernel for IP, where the empty uvector's data() pointer
+  // is implementation-defined and the kernel's `query_norms` argument is unused.
+  const float* qn_ptr = needs_query_norms ? query_norm_dev.data() : nullptr;
   rmm::device_uvector<float> distance_buffer_dev(n_queries * index.n_lists(), stream, search_mr);
   rmm::device_uvector<float> coarse_distances_dev(n_queries_probes, stream, search_mr);
   rmm::device_uvector<uint32_t> coarse_indices_dev(n_queries_probes, stream, search_mr);
@@ -491,7 +494,7 @@ void search_impl(raft::resources const& handle,
     ivf_sq_scan(handle,
                 index,
                 converted_queries_ptr,
-                query_norm_dev.data(),
+                qn_ptr,
                 n_queries,
                 n_probes,
                 k,
@@ -513,15 +516,10 @@ void search_impl(raft::resources const& handle,
     }
   }
 
-  // Prepare uint32 neighbors buffer for postprocessing
-  rmm::device_uvector<uint32_t> neighbors_uint32(0, stream, search_mr);
-  uint32_t* neighbors_uint32_ptr = nullptr;
-  if constexpr (sizeof(int64_t) == sizeof(uint32_t)) {
-    neighbors_uint32_ptr = reinterpret_cast<uint32_t*>(neighbors);
-  } else {
-    neighbors_uint32.resize(std::size_t(n_queries) * k, stream);
-    neighbors_uint32_ptr = neighbors_uint32.data();
-  }
+  // The scan kernel writes sample-local uint32 indices; postprocess_neighbors
+  // lifts them to int64_t database indices via inds_ptrs.
+  rmm::device_uvector<uint32_t> neighbors_uint32(std::size_t(n_queries) * k, stream, search_mr);
+  uint32_t* neighbors_uint32_ptr = neighbors_uint32.data();
 
   if (manage_local_topk) {
     // --- Fused top-k path ---
@@ -545,7 +543,7 @@ void search_impl(raft::resources const& handle,
     ivf_sq_scan(handle,
                 index,
                 converted_queries_ptr,
-                query_norm_dev.data(),
+                qn_ptr,
                 n_queries,
                 n_probes,
                 k,
@@ -596,7 +594,7 @@ void search_impl(raft::resources const& handle,
     ivf_sq_scan(handle,
                 index,
                 converted_queries_ptr,
-                query_norm_dev.data(),
+                qn_ptr,
                 n_queries,
                 n_probes,
                 k,
@@ -674,32 +672,42 @@ inline void search_with_filtering(raft::resources const& handle,
 
   bool manage_local_topk = is_local_topk_feasible(k);
 
-  uint32_t max_samples = 0;
-  if (!manage_local_topk) {
-    int64_t ms = std::max<int64_t>(index.accum_sorted_sizes()(n_probes), k);
-    RAFT_EXPECTS(ms <= int64_t(std::numeric_limits<uint32_t>::max()),
-                 "The maximum sample size is too big.");
-    max_samples = static_cast<uint32_t>(ms);
-  }
+  // Always compute max_samples: even on the fused path, search_impl may fall
+  // back to the materialize-all branch when the scan kernel reports zero
+  // occupancy, in which case the workspace must already be sized for the
+  // (much larger) materialize-path requirement.
+  int64_t ms = std::max<int64_t>(index.accum_sorted_sizes()(n_probes), k);
+  RAFT_EXPECTS(ms <= int64_t(std::numeric_limits<uint32_t>::max()),
+               "The maximum sample size is too big.");
+  uint32_t max_samples = static_cast<uint32_t>(ms);
 
   constexpr uint64_t kExpectedWsSize = 1024ull * 1024 * 1024;
   uint64_t max_ws_size =
     std::min<uint64_t>(raft::resource::get_workspace_free_bytes(handle), kExpectedWsSize);
 
   uint64_t converted_query_floats = std::is_same_v<T, float> ? 0 : index.dim();
+
+  // Materialize-path estimate: used directly for the non-fused path, and as
+  // a conservative upper bound for the fused path so the fallback branch in
+  // search_impl never overshoots the workspace.
+  uint64_t ws_materialize = sizeof(float) * (uint64_t(index.n_lists()) + n_probes + 1 +
+                                             max_samples + converted_query_floats) +
+                            sizeof(uint32_t) * (uint64_t(n_probes) * 2 + 1 + max_samples + k);
+
   uint64_t ws_per_query;
   if (manage_local_topk) {
-    // Fused path: only small per-query buffers for coarse search + chunk indices
-    // (The scan output is at most grid_dim_x * k per query, which is small)
-    // Conservatively assume grid_dim_x <= n_probes for the workspace estimate
+    // Fused path: only small per-query buffers for coarse search + chunk indices.
+    // The scan output is at most grid_dim_x * k per query (we conservatively
+    // assume grid_dim_x <= n_probes for the workspace estimate).
     uint64_t fused_out = uint64_t(n_probes) * k;
-    ws_per_query       = sizeof(float) * (uint64_t(index.n_lists()) + n_probes + 1 + fused_out +
-                                    converted_query_floats) +
-                   sizeof(uint32_t) * (uint64_t(n_probes) * 2 + 1 + fused_out + k);
+    uint64_t ws_fused  = sizeof(float) * (uint64_t(index.n_lists()) + n_probes + 1 + fused_out +
+                                         converted_query_floats) +
+                        sizeof(uint32_t) * (uint64_t(n_probes) * 2 + 1 + fused_out + k);
+    // Take the conservative max so search_impl can safely fall back to the
+    // materialize path if the fused-path occupancy probe returns zero.
+    ws_per_query = std::max(ws_fused, ws_materialize);
   } else {
-    ws_per_query = sizeof(float) * (uint64_t(index.n_lists()) + n_probes + 1 + max_samples +
-                                    converted_query_floats) +
-                   sizeof(uint32_t) * (uint64_t(n_probes) * 2 + 1 + max_samples + k);
+    ws_per_query = ws_materialize;
   }
 
   const uint32_t max_queries =

From d2e1c62da489687cd0311e0d6fed84747bbcfe8c Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Mon, 11 May 2026 10:32:37 +0200
Subject: [PATCH 34/43] Fix serialization vulnerabilities

---
 cpp/src/neighbors/ivf_sq/ivf_sq_serialize.cuh | 22 ++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_serialize.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_serialize.cuh
index b201cceee7..5e455302bb 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_serialize.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_serialize.cuh
@@ -5,6 +5,7 @@
 
 #pragma once
 
+#include "../../util/serialize_validation.hpp"
 #include "../ivf_common.cuh"
 #include "../ivf_list.cuh"
 #include <cuvs/neighbors/common.hpp>
@@ -87,7 +88,9 @@ template <typename CodeT>
 auto deserialize(raft::resources const& handle, std::istream& is) -> index<CodeT>
 {
   char dtype_string[4];
-  is.read(dtype_string, 4);
+  RAFT_EXPECTS(is.read(dtype_string, 4), "ivf_sq::deserialize: failed to read dtype prefix");
+  RAFT_EXPECTS(cuvs::util::validate_serialized_dtype<CodeT>(dtype_string, sizeof(dtype_string)),
+               "ivf_sq::deserialize: serialized dtype prefix does not match requested type");
 
   auto ver = raft::deserialize_scalar<int>(handle, is);
   if (ver != serialization_version) {
@@ -99,6 +102,23 @@ auto deserialize(raft::resources const& handle, std::istream& is) -> index<CodeT
   auto metric  = raft::deserialize_scalar<cuvs::distance::DistanceType>(handle, is);
   bool cma     = raft::deserialize_scalar<bool>(handle, is);
 
+  RAFT_EXPECTS(cuvs::util::is_valid_distance_type(metric),
+               "ivf_sq::deserialize: invalid metric value %d",
+               static_cast<int>(metric));
+  RAFT_EXPECTS(n_lists <= cuvs::util::kMaxIvfNLists,
+               "ivf_sq::deserialize: n_lists=%u exceeds maximum %u",
+               n_lists,
+               cuvs::util::kMaxIvfNLists);
+  RAFT_EXPECTS(cuvs::util::is_mul_no_overflow(
+                 static_cast<std::size_t>(n_lists), static_cast<std::size_t>(dim), sizeof(float)),
+               "ivf_sq::deserialize: integer overflow in n_lists*dim*sizeof(float) "
+               "(n_lists=%u, dim=%u)",
+               n_lists,
+               dim);
+  RAFT_EXPECTS(cuvs::util::is_mul_no_overflow(static_cast<std::size_t>(dim), sizeof(float)),
+               "ivf_sq::deserialize: integer overflow in dim*sizeof(float) (dim=%u)",
+               dim);
+
   index<CodeT> index_ = index<CodeT>(handle, metric, n_lists, dim, cma);
 
   deserialize_mdspan(handle, is, index_.centers());

From 063beb8b0755268b6ba80908dc3561d4953b6c88 Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Wed, 13 May 2026 11:54:35 +0200
Subject: [PATCH 35/43] CUVS_EXPORT

---
 cpp/src/neighbors/ivf_sq_index.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/neighbors/ivf_sq_index.cpp b/cpp/src/neighbors/ivf_sq_index.cpp
index 5a110a476c..c4a75b84e3 100644
--- a/cpp/src/neighbors/ivf_sq_index.cpp
+++ b/cpp/src/neighbors/ivf_sq_index.cpp
@@ -238,6 +238,6 @@ void index<CodeT>::check_consistency()
                static_cast<uint32_t>(centers_.extent(1)));
 }
 
-template struct index<uint8_t>;
+template struct CUVS_EXPORT index<uint8_t>;
 
 }  // namespace cuvs::neighbors::ivf_sq

From 1d40d4731c65d34a70cf321604bad94a28caff7e Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Tue, 19 May 2026 10:58:45 +0200
Subject: [PATCH 36/43] review 1/2

---
 cpp/include/cuvs/neighbors/ivf_sq.hpp         |  3 +-
 .../detail/jit_lto_kernels/scan_impl.cuh      | 13 ++++----
 cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh     | 33 ++++++++++---------
 3 files changed, 27 insertions(+), 22 deletions(-)

diff --git a/cpp/include/cuvs/neighbors/ivf_sq.hpp b/cpp/include/cuvs/neighbors/ivf_sq.hpp
index b342d0ce50..0d34cd8ced 100644
--- a/cpp/include/cuvs/neighbors/ivf_sq.hpp
+++ b/cpp/include/cuvs/neighbors/ivf_sq.hpp
@@ -13,6 +13,7 @@
 #include <raft/core/host_mdarray.hpp>
 #include <raft/core/host_mdspan.hpp>
 #include <raft/core/mdspan.hpp>
+#include <raft/util/integer_utils.hpp>
 
 namespace cuvs::neighbors::ivf_sq {
 
@@ -107,7 +108,7 @@ struct list_spec {
 
   constexpr auto make_list_extents(SizeT n_rows) const -> list_extents
   {
-    uint32_t padded = ((dim + kVecLen - 1) / kVecLen) * kVecLen;
+    uint32_t padded = raft::round_up_safe<uint32_t>(dim, kVecLen);
     return raft::make_extents<SizeT>(n_rows, padded);
   }
 };
diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_impl.cuh b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_impl.cuh
index 85cb9605be..92652e96a8 100644
--- a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_impl.cuh
+++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_impl.cuh
@@ -14,6 +14,7 @@
 #include <raft/matrix/detail/select_warpsort.cuh>
 #include <raft/util/cuda_dev_essentials.cuh>
 #include <raft/util/integer_utils.hpp>
+#include <raft/util/vectorized.cuh>
 
 #include <cstdint>
 #include <type_traits>
@@ -146,7 +147,8 @@ __device__ __forceinline__ void ivf_sq_scan_impl(const uint8_t* const* data_ptrs
   const uint32_t* my_coarse = coarse_indices + query_ix * n_probes;
   const uint32_t* my_chunk  = chunk_indices + query_ix * n_probes;
 
-  constexpr uint32_t veclen         = 16;
+  constexpr uint32_t veclen         = list_spec<uint32_t, uint8_t, int64_t>::kVecLen;
+  using code_vec_t                  = raft::TxN_t<uint8_t, veclen>;
   constexpr uint32_t kWarpsPerBlock = BlockDim / raft::WarpSize;
   const uint32_t warp_id            = threadIdx.x / raft::WarpSize;
   const uint32_t lane_id            = threadIdx.x % raft::WarpSize;
@@ -180,7 +182,7 @@ __device__ __forceinline__ void ivf_sq_scan_impl(const uint8_t* const* data_ptrs
 
     const uint8_t* codes   = data_ptrs[cluster_id];
     uint32_t sample_offset = (probe_ix > 0) ? my_chunk[probe_ix - 1] : 0;
-    uint32_t padded_dim    = ((dim + veclen - 1) / veclen) * veclen;
+    uint32_t padded_dim    = raft::Pow2<veclen>::roundUp(dim);
     uint32_t n_dim_blocks  = padded_dim / veclen;
 
     for (uint32_t group = warp_id * kIndexGroupSize; group < cluster_sz;
@@ -197,9 +199,8 @@ __device__ __forceinline__ void ivf_sq_scan_impl(const uint8_t* const* data_ptrs
       const uint8_t* group_data = codes + size_t(group) * padded_dim;
 
       for (uint32_t bl = 0; bl < n_dim_blocks; bl++) {
-        uint8_t codes_local[veclen];
-        *reinterpret_cast<uint4*>(codes_local) = *reinterpret_cast<const uint4*>(
-          group_data + bl * (veclen * kIndexGroupSize) + lane_id * veclen);
+        code_vec_t codes_local;
+        codes_local.load(group_data + bl * (veclen * kIndexGroupSize) + lane_id * veclen, 0);
 
         const uint32_t l = bl * veclen;
 #pragma unroll
@@ -208,7 +209,7 @@ __device__ __forceinline__ void ivf_sq_scan_impl(const uint8_t* const* data_ptrs
             accumulate_distance(s_query_term[l + j],
                                 s_aux[l + j],
                                 s_sq_scale[l + j],
-                                codes_local[j],
+                                codes_local.val.data[j],
                                 dist,
                                 v_norm_sq);
           }
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
index 52be653985..f1cdc9090e 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
@@ -343,13 +343,15 @@ void extend(raft::resources const& handle,
     }
   }
 
-  utils::batch_load_iterator<T> vec_batches(new_vectors,
-                                            n_rows,
-                                            index->dim(),
-                                            max_batch_size,
-                                            copy_stream,
-                                            raft::resource::get_workspace_resource_ref(handle),
-                                            enable_prefetch);
+  auto vec_batches =
+    utils::make_batch_load_iterator<T>(handle,
+                                       new_vectors,
+                                       n_rows,
+                                       int64_t{index->dim()},
+                                       max_batch_size,
+                                       copy_stream,
+                                       raft::resource::get_workspace_resource_ref(handle),
+                                       enable_prefetch);
   vec_batches.prefetch_next_batch();
 
   const bool needs_prefetch_sync = enable_prefetch && vec_batches.does_copy();
@@ -397,16 +399,17 @@ void extend(raft::resources const& handle,
   ivf::detail::recompute_internal_state(handle, *index);
   raft::copy(list_sizes_ptr, old_list_sizes_dev.data_handle(), n_lists, stream);
 
-  utils::batch_load_iterator<int64_t> vec_indices(
-    new_indices,
-    n_rows,
-    1,
-    max_batch_size,
-    stream,
-    raft::resource::get_workspace_resource_ref(handle));
+  auto vec_indices =
+    utils::make_batch_load_iterator<int64_t>(handle,
+                                             new_indices,
+                                             n_rows,
+                                             int64_t{1},
+                                             max_batch_size,
+                                             stream,
+                                             raft::resource::get_workspace_resource_ref(handle));
   vec_batches.reset();
   vec_batches.prefetch_next_batch();
-  utils::batch_load_iterator<int64_t> idx_batch = vec_indices.begin();
+  auto idx_batch = vec_indices.begin();
 
   size_t next_report_offset = 0;
   size_t d_report_offset    = n_rows * 5 / 100;

From 43c4f00a6f0de1f0a667d1b4ff9eb08da56847a6 Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Tue, 19 May 2026 13:48:54 +0200
Subject: [PATCH 37/43] review 2/2

---
 .../src/cuvs/cuvs_ann_bench_param_parser.h    |  6 ++++-
 cpp/include/cuvs/neighbors/ivf_sq.hpp         | 18 ++++++-------
 cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh     | 25 ++++++++++++-------
 cpp/tests/neighbors/ann_ivf_sq.cuh            |  8 +++---
 .../cuvs_bench/config/algos/cuvs_ivf_sq.yaml  |  6 ++---
 5 files changed, 36 insertions(+), 27 deletions(-)

diff --git a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
index 6e7a6e2179..ea26ceed78 100644
--- a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
+++ b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
@@ -99,7 +99,11 @@ void parse_build_param(const nlohmann::json& conf,
   param.n_lists = conf.at("nlist");
   if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); }
   if (conf.contains("ratio")) {
-    param.kmeans_trainset_fraction = 1.0 / static_cast<double>(conf.at("ratio"));
+    throw std::runtime_error(
+      "cuvs_ivf_sq build parameter ratio was replaced by max_train_points_per_cluster");
+  }
+  if (conf.contains("max_train_points_per_cluster")) {
+    param.max_train_points_per_cluster = conf.at("max_train_points_per_cluster");
   }
 }
 
diff --git a/cpp/include/cuvs/neighbors/ivf_sq.hpp b/cpp/include/cuvs/neighbors/ivf_sq.hpp
index 0d34cd8ced..489440443e 100644
--- a/cpp/include/cuvs/neighbors/ivf_sq.hpp
+++ b/cpp/include/cuvs/neighbors/ivf_sq.hpp
@@ -24,13 +24,19 @@ namespace cuvs::neighbors::ivf_sq {
 
 constexpr static uint32_t kIndexGroupSize = 32;
 
+/**
+ * @brief IVF-SQ index build parameters.
+ *
+ * IVF-SQ currently uses 8-bit scalar quantization, storing one `uint8_t` code per vector
+ * dimension.
+ */
 struct index_params : cuvs::neighbors::index_params {
   /** The number of inverted lists (clusters) */
   uint32_t n_lists = 1024;
   /** The number of iterations searching for kmeans centers (index building). */
   uint32_t kmeans_n_iters = 20;
-  /** The fraction of data to use during iterative kmeans building. */
-  double kmeans_trainset_fraction = 0.5;
+  /** The number of data vectors (per cluster) to use during iterative kmeans building. */
+  uint32_t max_train_points_per_cluster = 256;
   /**
    * By default, the algorithm allocates more space than necessary for individual clusters
    * (`list_data`). This allows to amortize the cost of memory allocation and reduce the number of
@@ -471,7 +477,6 @@ void build(raft::resources const& handle,
  *   using namespace cuvs::neighbors;
  *   ivf_sq::index_params index_params;
  *   index_params.add_data_on_build = false;      // don't populate index on build
- *   index_params.kmeans_trainset_fraction = 1.0;  // use whole dataset for kmeans training
  *   // train the index from a [N, D] dataset
  *   auto index_empty = ivf_sq::build(handle, index_params, dataset);
  *   // fill the index with the data
@@ -502,7 +507,6 @@ auto extend(raft::resources const& handle,
  *   using namespace cuvs::neighbors;
  *   ivf_sq::index_params index_params;
  *   index_params.add_data_on_build = false;      // don't populate index on build
- *   index_params.kmeans_trainset_fraction = 1.0;  // use whole dataset for kmeans training
  *   // train the index from a [N, D] dataset
  *   auto index_empty = ivf_sq::build(handle, index_params, dataset);
  *   // fill the index with the data
@@ -530,7 +534,6 @@ void extend(raft::resources const& handle,
  *   using namespace cuvs::neighbors;
  *   ivf_sq::index_params index_params;
  *   index_params.add_data_on_build = false;      // don't populate index on build
- *   index_params.kmeans_trainset_fraction = 1.0;  // use whole dataset for kmeans training
  *   // train the index from a [N, D] dataset
  *   auto index_empty = ivf_sq::build(handle, index_params, dataset);
  *   // fill the index with the data
@@ -561,7 +564,6 @@ auto extend(raft::resources const& handle,
  *   using namespace cuvs::neighbors;
  *   ivf_sq::index_params index_params;
  *   index_params.add_data_on_build = false;      // don't populate index on build
- *   index_params.kmeans_trainset_fraction = 1.0;  // use whole dataset for kmeans training
  *   // train the index from a [N, D] dataset
  *   auto index_empty = ivf_sq::build(handle, index_params, dataset);
  *   // fill the index with the data
@@ -589,7 +591,6 @@ void extend(raft::resources const& handle,
  *   using namespace cuvs::neighbors;
  *   ivf_sq::index_params index_params;
  *   index_params.add_data_on_build = false;      // don't populate index on build
- *   index_params.kmeans_trainset_fraction = 1.0;  // use whole dataset for kmeans training
  *   // train the index from a [N, D] dataset
  *   auto index_empty = ivf_sq::build(handle, index_params, dataset);
  *   // fill the index with the data
@@ -620,7 +621,6 @@ auto extend(raft::resources const& handle,
  *   using namespace cuvs::neighbors;
  *   ivf_sq::index_params index_params;
  *   index_params.add_data_on_build = false;      // don't populate index on build
- *   index_params.kmeans_trainset_fraction = 1.0;  // use whole dataset for kmeans training
  *   // train the index from a [N, D] dataset
  *   auto index_empty = ivf_sq::build(handle, index_params, dataset);
  *   // fill the index with the data
@@ -648,7 +648,6 @@ void extend(raft::resources const& handle,
  *   using namespace cuvs::neighbors;
  *   ivf_sq::index_params index_params;
  *   index_params.add_data_on_build = false;      // don't populate index on build
- *   index_params.kmeans_trainset_fraction = 1.0;  // use whole dataset for kmeans training
  *   // train the index from a [N, D] dataset
  *   auto index_empty = ivf_sq::build(handle, index_params, dataset);
  *   // fill the index with the data
@@ -679,7 +678,6 @@ auto extend(raft::resources const& handle,
  *   using namespace cuvs::neighbors;
  *   ivf_sq::index_params index_params;
  *   index_params.add_data_on_build = false;      // don't populate index on build
- *   index_params.kmeans_trainset_fraction = 1.0;  // use whole dataset for kmeans training
  *   // train the index from a [N, D] dataset
  *   auto index_empty = ivf_sq::build(handle, index_params, dataset);
  *   // fill the index with the data
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
index f1cdc9090e..3261035f49 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
@@ -36,6 +36,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 
+#include <algorithm>
 #include <cstdint>
 #include <limits>
 
@@ -498,8 +499,7 @@ inline auto build(
   static_assert(std::is_same_v<T, float> || std::is_same_v<T, half>, "unsupported data type");
   RAFT_EXPECTS(n_rows > 0 && dim > 0, "empty dataset");
   RAFT_EXPECTS(n_rows >= params.n_lists, "number of rows can't be less than n_lists");
-  RAFT_EXPECTS(params.kmeans_trainset_fraction > 0.0 && params.kmeans_trainset_fraction <= 1.0,
-               "kmeans_trainset_fraction must be in (0, 1]");
+  RAFT_EXPECTS(params.max_train_points_per_cluster > 0, "max_train_points_per_cluster must be > 0");
   RAFT_EXPECTS(params.metric != cuvs::distance::DistanceType::CosineExpanded || dim > 1,
                "Cosine metric requires more than one dim");
 
@@ -509,13 +509,20 @@ inline auto build(
   // This mirrors IVF-PQ, which also trains its codebook on a subset of the data.
   {
     raft::random::RngState random_state{137};
-    auto trainset_ratio = std::max<size_t>(
-      1, n_rows / std::max<size_t>(params.kmeans_trainset_fraction * n_rows, idx.n_lists()));
-    auto n_rows_train = n_rows / trainset_ratio;
-    auto trainset =
-      raft::make_device_mdarray<T>(handle,
-                                   raft::resource::get_large_workspace_resource_ref(handle),
-                                   raft::make_extents<int64_t>(n_rows_train, idx.dim()));
+    auto max_train_rows =
+      static_cast<size_t>(idx.n_lists()) * static_cast<size_t>(params.max_train_points_per_cluster);
+    if (max_train_rows > static_cast<size_t>(n_rows)) {
+      RAFT_LOG_WARN(
+        "ivf_sq::build: requested k-means training set size (%zu) exceeds "
+        "dataset size (%zu); using the full dataset for training.",
+        max_train_rows,
+        static_cast<size_t>(n_rows));
+    }
+    auto n_rows_train = std::min<size_t>(static_cast<size_t>(n_rows), max_train_rows);
+    auto trainset     = raft::make_device_mdarray<T>(
+      handle,
+      raft::resource::get_large_workspace_resource_ref(handle),
+      raft::make_extents<int64_t>(static_cast<int64_t>(n_rows_train), idx.dim()));
     raft::matrix::sample_rows<T, int64_t>(handle, random_state, dataset, trainset.view());
     auto trainset_const_view = raft::make_const_mdspan(trainset.view());
     auto centers_view        = raft::make_device_matrix_view<float, int64_t>(
diff --git a/cpp/tests/neighbors/ann_ivf_sq.cuh b/cpp/tests/neighbors/ann_ivf_sq.cuh
index b62892d9b1..969be22771 100644
--- a/cpp/tests/neighbors/ann_ivf_sq.cuh
+++ b/cpp/tests/neighbors/ann_ivf_sq.cuh
@@ -288,10 +288,10 @@ class AnnIVFSQTest : public ::testing::TestWithParam<AnnIvfSqInputs<IdxT>> {
   cuvs::neighbors::ivf_sq::index<uint8_t> build_index(bool add_data_on_build)
   {
     cuvs::neighbors::ivf_sq::index_params index_params;
-    index_params.n_lists                  = ps.nlist;
-    index_params.metric                   = ps.metric;
-    index_params.add_data_on_build        = add_data_on_build;
-    index_params.kmeans_trainset_fraction = 0.5;
+    index_params.n_lists                      = ps.nlist;
+    index_params.metric                       = ps.metric;
+    index_params.add_data_on_build            = add_data_on_build;
+    index_params.max_train_points_per_cluster = 256;
 
     if (!ps.host_dataset) {
       auto database_view = raft::make_device_matrix_view<const DataT, IdxT>(
diff --git a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_ivf_sq.yaml b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_ivf_sq.yaml
index af59493eef..10729ed144 100644
--- a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_ivf_sq.yaml
+++ b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_ivf_sq.yaml
@@ -5,21 +5,21 @@ groups:
   base:
     build:
       nlist: [1024, 2048, 4096]
-      ratio: [2, 4]
+      max_train_points_per_cluster: [256]
       niter: [25]
     search:
       nprobe: [1, 5, 10, 20, 50, 100, 200]
   large:
     build:
       nlist: [8192, 16384, 32768]
-      ratio: [4]
+      max_train_points_per_cluster: [256]
       niter: [20]
     search:
       nprobe: [10, 20, 50, 100, 200, 500, 1000]
   test:
     build:
       nlist: [1024]
-      ratio: [2]
+      max_train_points_per_cluster: [256]
       niter: [20]
     search:
       nprobe: [1, 5]

From 2e5e6baa8a7c2901e727c9b5515cc20b2a6aac26 Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Tue, 19 May 2026 14:06:07 +0200
Subject: [PATCH 38/43] Drop IVF-SQ void build functions

---
 cpp/include/cuvs/neighbors/ivf_sq.hpp         | 120 ------------------
 cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh     |  18 ---
 ...f_sq_build_extend_float_uint8_t_int64_t.cu |  16 ---
 ...vf_sq_build_extend_half_uint8_t_int64_t.cu |  16 ---
 4 files changed, 170 deletions(-)

diff --git a/cpp/include/cuvs/neighbors/ivf_sq.hpp b/cpp/include/cuvs/neighbors/ivf_sq.hpp
index 489440443e..e84b167e83 100644
--- a/cpp/include/cuvs/neighbors/ivf_sq.hpp
+++ b/cpp/include/cuvs/neighbors/ivf_sq.hpp
@@ -271,36 +271,6 @@ auto build(raft::resources const& handle,
            raft::device_matrix_view<const float, int64_t, raft::row_major> dataset)
   -> cuvs::neighbors::ivf_sq::index<uint8_t>;
 
-/**
- * @brief Build the index from the dataset for efficient search.
- *
- * NB: Currently, the following distance metrics are supported:
- * - L2Expanded
- * - L2SqrtExpanded
- * - InnerProduct
- * - CosineExpanded
- *
- * Usage example:
- * @code{.cpp}
- *   using namespace cuvs::neighbors;
- *   // use default index parameters
- *   ivf_sq::index_params index_params;
- *   // create and fill the index from a [N, D] dataset
- *   ivf_sq::index<uint8_t> index;
- *   ivf_sq::build(handle, index_params, dataset, index);
- * @endcode
- *
- * @param[in] handle
- * @param[in] index_params configure the index building
- * @param[in] dataset raft::device_matrix_view to a row-major matrix [n_rows, dim]
- * @param[out] idx reference to ivf_sq::index
- *
- */
-void build(raft::resources const& handle,
-           const cuvs::neighbors::ivf_sq::index_params& index_params,
-           raft::device_matrix_view<const float, int64_t, raft::row_major> dataset,
-           cuvs::neighbors::ivf_sq::index<uint8_t>& idx);
-
 /**
  * @brief Build the index from the dataset for efficient search.
  *
@@ -324,36 +294,6 @@ auto build(raft::resources const& handle,
            raft::device_matrix_view<const half, int64_t, raft::row_major> dataset)
   -> cuvs::neighbors::ivf_sq::index<uint8_t>;
 
-/**
- * @brief Build the index from the dataset for efficient search.
- *
- * NB: Currently, the following distance metrics are supported:
- * - L2Expanded
- * - L2SqrtExpanded
- * - InnerProduct
- * - CosineExpanded
- *
- * Usage example:
- * @code{.cpp}
- *   using namespace cuvs::neighbors;
- *   // use default index parameters
- *   ivf_sq::index_params index_params;
- *   // create and fill the index from a [N, D] dataset
- *   ivf_sq::index<uint8_t> index;
- *   ivf_sq::build(handle, index_params, dataset, index);
- * @endcode
- *
- * @param[in] handle
- * @param[in] index_params configure the index building
- * @param[in] dataset raft::device_matrix_view to a row-major matrix [n_rows, dim]
- * @param[out] idx reference to ivf_sq::index
- *
- */
-void build(raft::resources const& handle,
-           const cuvs::neighbors::ivf_sq::index_params& index_params,
-           raft::device_matrix_view<const half, int64_t, raft::row_major> dataset,
-           cuvs::neighbors::ivf_sq::index<uint8_t>& idx);
-
 /**
  * @brief Build the index from the dataset for efficient search.
  *
@@ -377,36 +317,6 @@ auto build(raft::resources const& handle,
            raft::host_matrix_view<const float, int64_t, raft::row_major> dataset)
   -> cuvs::neighbors::ivf_sq::index<uint8_t>;
 
-/**
- * @brief Build the index from the dataset for efficient search.
- *
- * NB: Currently, the following distance metrics are supported:
- * - L2Expanded
- * - L2SqrtExpanded
- * - InnerProduct
- * - CosineExpanded
- *
- * Usage example:
- * @code{.cpp}
- *   using namespace cuvs::neighbors;
- *   // use default index parameters
- *   ivf_sq::index_params index_params;
- *   // create and fill the index from a [N, D] dataset
- *   ivf_sq::index<uint8_t> index;
- *   ivf_sq::build(handle, index_params, dataset, index);
- * @endcode
- *
- * @param[in] handle
- * @param[in] index_params configure the index building
- * @param[in] dataset raft::host_matrix_view to a row-major matrix [n_rows, dim]
- * @param[out] idx reference to ivf_sq::index
- *
- */
-void build(raft::resources const& handle,
-           const cuvs::neighbors::ivf_sq::index_params& index_params,
-           raft::host_matrix_view<const float, int64_t, raft::row_major> dataset,
-           cuvs::neighbors::ivf_sq::index<uint8_t>& idx);
-
 /**
  * @brief Build the index from the dataset for efficient search.
  *
@@ -430,36 +340,6 @@ auto build(raft::resources const& handle,
            raft::host_matrix_view<const half, int64_t, raft::row_major> dataset)
   -> cuvs::neighbors::ivf_sq::index<uint8_t>;
 
-/**
- * @brief Build the index from the dataset for efficient search.
- *
- * NB: Currently, the following distance metrics are supported:
- * - L2Expanded
- * - L2SqrtExpanded
- * - InnerProduct
- * - CosineExpanded
- *
- * Usage example:
- * @code{.cpp}
- *   using namespace cuvs::neighbors;
- *   // use default index parameters
- *   ivf_sq::index_params index_params;
- *   // create and fill the index from a [N, D] dataset
- *   ivf_sq::index<uint8_t> index;
- *   ivf_sq::build(handle, index_params, dataset, index);
- * @endcode
- *
- * @param[in] handle
- * @param[in] index_params configure the index building
- * @param[in] dataset raft::host_matrix_view to a row-major matrix [n_rows, dim]
- * @param[out] idx reference to ivf_sq::index
- *
- */
-void build(raft::resources const& handle,
-           const cuvs::neighbors::ivf_sq::index_params& index_params,
-           raft::host_matrix_view<const half, int64_t, raft::row_major> dataset,
-           cuvs::neighbors::ivf_sq::index<uint8_t>& idx);
-
 /**
  * @}
  */
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
index 3261035f49..615b183d6b 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
@@ -585,24 +585,6 @@ inline auto build(
   return idx;
 }
 
-template <typename T, typename CodeT>
-void build(raft::resources const& handle,
-           const index_params& params,
-           raft::device_matrix_view<const T, int64_t, raft::row_major> dataset,
-           index<CodeT>& idx)
-{
-  idx = build<T, CodeT>(handle, params, dataset);
-}
-
-template <typename T, typename CodeT>
-void build(raft::resources const& handle,
-           const index_params& params,
-           raft::host_matrix_view<const T, int64_t, raft::row_major> dataset,
-           index<CodeT>& idx)
-{
-  idx = build<T, CodeT>(handle, params, dataset);
-}
-
 template <typename T, typename CodeT>
 auto extend(raft::resources const& handle,
             raft::device_matrix_view<const T, int64_t, raft::row_major> new_vectors,
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_float_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_float_uint8_t_int64_t.cu
index f22154e50f..4e28b29253 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_float_uint8_t_int64_t.cu
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_float_uint8_t_int64_t.cu
@@ -19,14 +19,6 @@ namespace cuvs::neighbors::ivf_sq {
       std::move(cuvs::neighbors::ivf_sq::detail::build<T, CodeT>(handle, params, dataset)));  \
   }                                                                                           \
                                                                                               \
-  void build(raft::resources const& handle,                                                   \
-             const cuvs::neighbors::ivf_sq::index_params& params,                             \
-             raft::device_matrix_view<const T, int64_t, raft::row_major> dataset,             \
-             cuvs::neighbors::ivf_sq::index<CodeT>& idx)                                      \
-  {                                                                                           \
-    cuvs::neighbors::ivf_sq::detail::build<T, CodeT>(handle, params, dataset, idx);           \
-  }                                                                                           \
-                                                                                              \
   auto build(raft::resources const& handle,                                                   \
              const cuvs::neighbors::ivf_sq::index_params& params,                             \
              raft::host_matrix_view<const T, int64_t, raft::row_major> dataset)               \
@@ -36,14 +28,6 @@ namespace cuvs::neighbors::ivf_sq {
       std::move(cuvs::neighbors::ivf_sq::detail::build<T, CodeT>(handle, params, dataset)));  \
   }                                                                                           \
                                                                                               \
-  void build(raft::resources const& handle,                                                   \
-             const cuvs::neighbors::ivf_sq::index_params& params,                             \
-             raft::host_matrix_view<const T, int64_t, raft::row_major> dataset,               \
-             cuvs::neighbors::ivf_sq::index<CodeT>& idx)                                      \
-  {                                                                                           \
-    cuvs::neighbors::ivf_sq::detail::build<T, CodeT>(handle, params, dataset, idx);           \
-  }                                                                                           \
-                                                                                              \
   auto extend(raft::resources const& handle,                                                  \
               raft::device_matrix_view<const T, int64_t, raft::row_major> new_vectors,        \
               std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,    \
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_half_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_half_uint8_t_int64_t.cu
index e6900af4a0..167cb775a1 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_half_uint8_t_int64_t.cu
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_half_uint8_t_int64_t.cu
@@ -19,14 +19,6 @@ namespace cuvs::neighbors::ivf_sq {
       std::move(cuvs::neighbors::ivf_sq::detail::build<T, CodeT>(handle, params, dataset)));  \
   }                                                                                           \
                                                                                               \
-  void build(raft::resources const& handle,                                                   \
-             const cuvs::neighbors::ivf_sq::index_params& params,                             \
-             raft::device_matrix_view<const T, int64_t, raft::row_major> dataset,             \
-             cuvs::neighbors::ivf_sq::index<CodeT>& idx)                                      \
-  {                                                                                           \
-    cuvs::neighbors::ivf_sq::detail::build<T, CodeT>(handle, params, dataset, idx);           \
-  }                                                                                           \
-                                                                                              \
   auto build(raft::resources const& handle,                                                   \
              const cuvs::neighbors::ivf_sq::index_params& params,                             \
              raft::host_matrix_view<const T, int64_t, raft::row_major> dataset)               \
@@ -36,14 +28,6 @@ namespace cuvs::neighbors::ivf_sq {
       std::move(cuvs::neighbors::ivf_sq::detail::build<T, CodeT>(handle, params, dataset)));  \
   }                                                                                           \
                                                                                               \
-  void build(raft::resources const& handle,                                                   \
-             const cuvs::neighbors::ivf_sq::index_params& params,                             \
-             raft::host_matrix_view<const T, int64_t, raft::row_major> dataset,               \
-             cuvs::neighbors::ivf_sq::index<CodeT>& idx)                                      \
-  {                                                                                           \
-    cuvs::neighbors::ivf_sq::detail::build<T, CodeT>(handle, params, dataset, idx);           \
-  }                                                                                           \
-                                                                                              \
   auto extend(raft::resources const& handle,                                                  \
               raft::device_matrix_view<const T, int64_t, raft::row_major> new_vectors,        \
               std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,    \

From 78a920c1a18398a3cf70523a0c3212123d42021e Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Tue, 19 May 2026 14:24:47 +0200
Subject: [PATCH 39/43] Drop IVF-SQ auto extend functions

---
 cpp/include/cuvs/neighbors/ivf_sq.hpp         | 128 +-----------------
 cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh     |  84 +++---------
 ...f_sq_build_extend_float_uint8_t_int64_t.cu |  22 ---
 ...vf_sq_build_extend_half_uint8_t_int64_t.cu |  22 ---
 4 files changed, 20 insertions(+), 236 deletions(-)

diff --git a/cpp/include/cuvs/neighbors/ivf_sq.hpp b/cpp/include/cuvs/neighbors/ivf_sq.hpp
index e84b167e83..58134fe62b 100644
--- a/cpp/include/cuvs/neighbors/ivf_sq.hpp
+++ b/cpp/include/cuvs/neighbors/ivf_sq.hpp
@@ -350,37 +350,7 @@ auto build(raft::resources const& handle,
  */
 
 /**
- * @brief Extend the index with the new data.
- *
- * Usage example:
- * @code{.cpp}
- *   using namespace cuvs::neighbors;
- *   ivf_sq::index_params index_params;
- *   index_params.add_data_on_build = false;      // don't populate index on build
- *   // train the index from a [N, D] dataset
- *   auto index_empty = ivf_sq::build(handle, index_params, dataset);
- *   // fill the index with the data
- *   std::optional<raft::device_vector_view<const int64_t, int64_t>> no_op = std::nullopt;
- *   auto index = ivf_sq::extend(handle, new_vectors, no_op, index_empty);
- * @endcode
- *
- * @param[in] handle
- * @param[in] new_vectors a device matrix view to a row-major matrix [n_rows, idx.dim()]
- * @param[in] new_indices a device vector view to a vector of indices [n_rows].
- *    If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt`
- *    here to imply a continuous range `[0...n_rows)`.
- * @param[in] orig_index the original index
- *
- * @return the constructed extended ivf-sq index
- */
-auto extend(raft::resources const& handle,
-            raft::device_matrix_view<const float, int64_t, raft::row_major> new_vectors,
-            std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,
-            const cuvs::neighbors::ivf_sq::index<uint8_t>& orig_index)
-  -> cuvs::neighbors::ivf_sq::index<uint8_t>;
-
-/**
- * @brief Extend the index with the new data.
+ * @brief Extend the index with the new data in-place.
  *
  * Usage example:
  * @code{.cpp}
@@ -407,37 +377,7 @@ void extend(raft::resources const& handle,
             cuvs::neighbors::ivf_sq::index<uint8_t>* idx);
 
 /**
- * @brief Extend the index with the new data.
- *
- * Usage example:
- * @code{.cpp}
- *   using namespace cuvs::neighbors;
- *   ivf_sq::index_params index_params;
- *   index_params.add_data_on_build = false;      // don't populate index on build
- *   // train the index from a [N, D] dataset
- *   auto index_empty = ivf_sq::build(handle, index_params, dataset);
- *   // fill the index with the data
- *   std::optional<raft::device_vector_view<const int64_t, int64_t>> no_op = std::nullopt;
- *   auto index = ivf_sq::extend(handle, new_vectors, no_op, index_empty);
- * @endcode
- *
- * @param[in] handle
- * @param[in] new_vectors a device matrix view to a row-major matrix [n_rows, idx.dim()]
- * @param[in] new_indices a device vector view to a vector of indices [n_rows].
- *    If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt`
- *    here to imply a continuous range `[0...n_rows)`.
- * @param[in] orig_index the original index
- *
- * @return the constructed extended ivf-sq index
- */
-auto extend(raft::resources const& handle,
-            raft::device_matrix_view<const half, int64_t, raft::row_major> new_vectors,
-            std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,
-            const cuvs::neighbors::ivf_sq::index<uint8_t>& orig_index)
-  -> cuvs::neighbors::ivf_sq::index<uint8_t>;
-
-/**
- * @brief Extend the index with the new data.
+ * @brief Extend the index with the new data in-place.
  *
  * Usage example:
  * @code{.cpp}
@@ -464,37 +404,7 @@ void extend(raft::resources const& handle,
             cuvs::neighbors::ivf_sq::index<uint8_t>* idx);
 
 /**
- * @brief Extend the index with the new data.
- *
- * Usage example:
- * @code{.cpp}
- *   using namespace cuvs::neighbors;
- *   ivf_sq::index_params index_params;
- *   index_params.add_data_on_build = false;      // don't populate index on build
- *   // train the index from a [N, D] dataset
- *   auto index_empty = ivf_sq::build(handle, index_params, dataset);
- *   // fill the index with the data
- *   std::optional<raft::host_vector_view<const int64_t, int64_t>> no_op = std::nullopt;
- *   auto index = ivf_sq::extend(handle, new_vectors, no_op, index_empty);
- * @endcode
- *
- * @param[in] handle
- * @param[in] new_vectors a host matrix view to a row-major matrix [n_rows, idx.dim()]
- * @param[in] new_indices a host vector view to a vector of indices [n_rows].
- *    If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt`
- *    here to imply a continuous range `[0...n_rows)`.
- * @param[in] orig_index the original index
- *
- * @return the constructed extended ivf-sq index
- */
-auto extend(raft::resources const& handle,
-            raft::host_matrix_view<const float, int64_t, raft::row_major> new_vectors,
-            std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,
-            const cuvs::neighbors::ivf_sq::index<uint8_t>& orig_index)
-  -> cuvs::neighbors::ivf_sq::index<uint8_t>;
-
-/**
- * @brief Extend the index with the new data.
+ * @brief Extend the index with the new data in-place.
  *
  * Usage example:
  * @code{.cpp}
@@ -521,37 +431,7 @@ void extend(raft::resources const& handle,
             cuvs::neighbors::ivf_sq::index<uint8_t>* idx);
 
 /**
- * @brief Extend the index with the new data.
- *
- * Usage example:
- * @code{.cpp}
- *   using namespace cuvs::neighbors;
- *   ivf_sq::index_params index_params;
- *   index_params.add_data_on_build = false;      // don't populate index on build
- *   // train the index from a [N, D] dataset
- *   auto index_empty = ivf_sq::build(handle, index_params, dataset);
- *   // fill the index with the data
- *   std::optional<raft::host_vector_view<const int64_t, int64_t>> no_op = std::nullopt;
- *   auto index = ivf_sq::extend(handle, new_vectors, no_op, index_empty);
- * @endcode
- *
- * @param[in] handle
- * @param[in] new_vectors a host matrix view to a row-major matrix [n_rows, idx.dim()]
- * @param[in] new_indices a host vector view to a vector of indices [n_rows].
- *    If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt`
- *    here to imply a continuous range `[0...n_rows)`.
- * @param[in] orig_index the original index
- *
- * @return the constructed extended ivf-sq index
- */
-auto extend(raft::resources const& handle,
-            raft::host_matrix_view<const half, int64_t, raft::row_major> new_vectors,
-            std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,
-            const cuvs::neighbors::ivf_sq::index<uint8_t>& orig_index)
-  -> cuvs::neighbors::ivf_sq::index<uint8_t>;
-
-/**
- * @brief Extend the index with the new data.
+ * @brief Extend the index with the new data in-place.
  *
  * Usage example:
  * @code{.cpp}
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
index 615b183d6b..378847ce4c 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh
@@ -302,11 +302,11 @@ RAFT_KERNEL compute_residuals_inplace_kernel(
 }
 
 template <typename T, typename CodeT>
-void extend(raft::resources const& handle,
-            index<CodeT>* index,
-            const T* new_vectors,
-            const int64_t* new_indices,
-            int64_t n_rows)
+void extend_inplace(raft::resources const& handle,
+                    index<CodeT>* index,
+                    const T* new_vectors,
+                    const int64_t* new_indices,
+                    int64_t n_rows)
 {
   using LabelT = uint32_t;
   RAFT_EXPECTS(index != nullptr, "index cannot be empty.");
@@ -472,18 +472,6 @@ void extend(raft::resources const& handle,
   }
 }
 
-template <typename T, typename CodeT>
-auto extend(raft::resources const& handle,
-            const index<CodeT>& orig_index,
-            const T* new_vectors,
-            const int64_t* new_indices,
-            int64_t n_rows) -> index<CodeT>
-{
-  auto ext_index = clone(handle, orig_index);
-  detail::extend(handle, &ext_index, new_vectors, new_indices, n_rows);
-  return ext_index;
-}
-
 template <typename T, typename CodeT, typename accessor>
 inline auto build(
   raft::resources const& handle,
@@ -579,52 +567,12 @@ inline auto build(
   }
 
   if (params.add_data_on_build) {
-    detail::extend<T, CodeT>(handle, &idx, dataset.data_handle(), nullptr, n_rows);
+    detail::extend_inplace<T, CodeT>(handle, &idx, dataset.data_handle(), nullptr, n_rows);
   }
 
   return idx;
 }
 
-template <typename T, typename CodeT>
-auto extend(raft::resources const& handle,
-            raft::device_matrix_view<const T, int64_t, raft::row_major> new_vectors,
-            std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,
-            const index<CodeT>& orig_index) -> index<CodeT>
-{
-  RAFT_EXPECTS(new_vectors.extent(1) == orig_index.dim(),
-               "new_vectors should have the same dimension as the index");
-  if (new_indices.has_value()) {
-    RAFT_EXPECTS(new_indices.value().extent(0) == new_vectors.extent(0),
-                 "new_vectors and new_indices have different number of rows");
-  }
-  int64_t n_rows = new_vectors.extent(0);
-  return extend<T, CodeT>(handle,
-                          orig_index,
-                          new_vectors.data_handle(),
-                          new_indices.has_value() ? new_indices.value().data_handle() : nullptr,
-                          n_rows);
-}
-
-template <typename T, typename CodeT>
-auto extend(raft::resources const& handle,
-            raft::host_matrix_view<const T, int64_t, raft::row_major> new_vectors,
-            std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,
-            const index<CodeT>& orig_index) -> index<CodeT>
-{
-  RAFT_EXPECTS(new_vectors.extent(1) == orig_index.dim(),
-               "new_vectors should have the same dimension as the index");
-  if (new_indices.has_value()) {
-    RAFT_EXPECTS(new_indices.value().extent(0) == new_vectors.extent(0),
-                 "new_vectors and new_indices have different number of rows");
-  }
-  int64_t n_rows = new_vectors.extent(0);
-  return extend<T, CodeT>(handle,
-                          orig_index,
-                          new_vectors.data_handle(),
-                          new_indices.has_value() ? new_indices.value().data_handle() : nullptr,
-                          n_rows);
-}
-
 template <typename T, typename CodeT>
 void extend(raft::resources const& handle,
             raft::device_matrix_view<const T, int64_t, raft::row_major> new_vectors,
@@ -637,11 +585,11 @@ void extend(raft::resources const& handle,
     RAFT_EXPECTS(new_indices.value().extent(0) == new_vectors.extent(0),
                  "new_vectors and new_indices have different number of rows");
   }
-  detail::extend(handle,
-                 idx,
-                 new_vectors.data_handle(),
-                 new_indices.has_value() ? new_indices.value().data_handle() : nullptr,
-                 new_vectors.extent(0));
+  detail::extend_inplace(handle,
+                         idx,
+                         new_vectors.data_handle(),
+                         new_indices.has_value() ? new_indices.value().data_handle() : nullptr,
+                         new_vectors.extent(0));
 }
 
 template <typename T, typename CodeT>
@@ -656,11 +604,11 @@ void extend(raft::resources const& handle,
     RAFT_EXPECTS(new_indices.value().extent(0) == new_vectors.extent(0),
                  "new_vectors and new_indices have different number of rows");
   }
-  detail::extend(handle,
-                 idx,
-                 new_vectors.data_handle(),
-                 new_indices.has_value() ? new_indices.value().data_handle() : nullptr,
-                 new_vectors.extent(0));
+  detail::extend_inplace(handle,
+                         idx,
+                         new_vectors.data_handle(),
+                         new_indices.has_value() ? new_indices.value().data_handle() : nullptr,
+                         new_vectors.extent(0));
 }
 
 }  // namespace detail
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_float_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_float_uint8_t_int64_t.cu
index 4e28b29253..3243e75507 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_float_uint8_t_int64_t.cu
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_float_uint8_t_int64_t.cu
@@ -28,17 +28,6 @@ namespace cuvs::neighbors::ivf_sq {
       std::move(cuvs::neighbors::ivf_sq::detail::build<T, CodeT>(handle, params, dataset)));  \
   }                                                                                           \
                                                                                               \
-  auto extend(raft::resources const& handle,                                                  \
-              raft::device_matrix_view<const T, int64_t, raft::row_major> new_vectors,        \
-              std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,    \
-              const cuvs::neighbors::ivf_sq::index<CodeT>& orig_index)                        \
-    -> cuvs::neighbors::ivf_sq::index<CodeT>                                                  \
-  {                                                                                           \
-    return cuvs::neighbors::ivf_sq::index<CodeT>(                                             \
-      std::move(cuvs::neighbors::ivf_sq::detail::extend<T, CodeT>(                            \
-        handle, new_vectors, new_indices, orig_index)));                                      \
-  }                                                                                           \
-                                                                                              \
   void extend(raft::resources const& handle,                                                  \
               raft::device_matrix_view<const T, int64_t, raft::row_major> new_vectors,        \
               std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,    \
@@ -47,17 +36,6 @@ namespace cuvs::neighbors::ivf_sq {
     cuvs::neighbors::ivf_sq::detail::extend<T, CodeT>(handle, new_vectors, new_indices, idx); \
   }                                                                                           \
                                                                                               \
-  auto extend(raft::resources const& handle,                                                  \
-              raft::host_matrix_view<const T, int64_t, raft::row_major> new_vectors,          \
-              std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,      \
-              const cuvs::neighbors::ivf_sq::index<CodeT>& orig_index)                        \
-    -> cuvs::neighbors::ivf_sq::index<CodeT>                                                  \
-  {                                                                                           \
-    return cuvs::neighbors::ivf_sq::index<CodeT>(                                             \
-      std::move(cuvs::neighbors::ivf_sq::detail::extend<T, CodeT>(                            \
-        handle, new_vectors, new_indices, orig_index)));                                      \
-  }                                                                                           \
-                                                                                              \
   void extend(raft::resources const& handle,                                                  \
               raft::host_matrix_view<const T, int64_t, raft::row_major> new_vectors,          \
               std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,      \
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_half_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_half_uint8_t_int64_t.cu
index 167cb775a1..b6e6f3caf5 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_half_uint8_t_int64_t.cu
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_half_uint8_t_int64_t.cu
@@ -28,17 +28,6 @@ namespace cuvs::neighbors::ivf_sq {
       std::move(cuvs::neighbors::ivf_sq::detail::build<T, CodeT>(handle, params, dataset)));  \
   }                                                                                           \
                                                                                               \
-  auto extend(raft::resources const& handle,                                                  \
-              raft::device_matrix_view<const T, int64_t, raft::row_major> new_vectors,        \
-              std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,    \
-              const cuvs::neighbors::ivf_sq::index<CodeT>& orig_index)                        \
-    -> cuvs::neighbors::ivf_sq::index<CodeT>                                                  \
-  {                                                                                           \
-    return cuvs::neighbors::ivf_sq::index<CodeT>(                                             \
-      std::move(cuvs::neighbors::ivf_sq::detail::extend<T, CodeT>(                            \
-        handle, new_vectors, new_indices, orig_index)));                                      \
-  }                                                                                           \
-                                                                                              \
   void extend(raft::resources const& handle,                                                  \
               raft::device_matrix_view<const T, int64_t, raft::row_major> new_vectors,        \
               std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,    \
@@ -47,17 +36,6 @@ namespace cuvs::neighbors::ivf_sq {
     cuvs::neighbors::ivf_sq::detail::extend<T, CodeT>(handle, new_vectors, new_indices, idx); \
   }                                                                                           \
                                                                                               \
-  auto extend(raft::resources const& handle,                                                  \
-              raft::host_matrix_view<const T, int64_t, raft::row_major> new_vectors,          \
-              std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,      \
-              const cuvs::neighbors::ivf_sq::index<CodeT>& orig_index)                        \
-    -> cuvs::neighbors::ivf_sq::index<CodeT>                                                  \
-  {                                                                                           \
-    return cuvs::neighbors::ivf_sq::index<CodeT>(                                             \
-      std::move(cuvs::neighbors::ivf_sq::detail::extend<T, CodeT>(                            \
-        handle, new_vectors, new_indices, orig_index)));                                      \
-  }                                                                                           \
-                                                                                              \
   void extend(raft::resources const& handle,                                                  \
               raft::host_matrix_view<const T, int64_t, raft::row_major> new_vectors,          \
               std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,      \

From 387b6451d492a900c15a24d61a44360c46435714 Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Tue, 19 May 2026 15:38:49 +0200
Subject: [PATCH 40/43] updates

---
 c/include/cuvs/neighbors/ivf_sq.h          | 51 +++++++---------
 c/src/neighbors/ivf_sq.cpp                 | 69 ++++++++++++++--------
 cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh |  2 +-
 3 files changed, 65 insertions(+), 57 deletions(-)

diff --git a/c/include/cuvs/neighbors/ivf_sq.h b/c/include/cuvs/neighbors/ivf_sq.h
index 6b312443b0..472c9cda7a 100644
--- a/c/include/cuvs/neighbors/ivf_sq.h
+++ b/c/include/cuvs/neighbors/ivf_sq.h
@@ -42,21 +42,11 @@ struct cuvsIvfSqIndexParams {
   uint32_t n_lists;
   /** The number of iterations searching for kmeans centers (index building). */
   uint32_t kmeans_n_iters;
-  /** The fraction of data to use during iterative kmeans building. */
-  double kmeans_trainset_fraction;
   /**
-   * By default (adaptive_centers = false), the cluster centers are trained in `ivf_sq::build`,
-   * and never modified in `ivf_sq::extend`. As a result, you may need to retrain the index
-   * from scratch after invoking (`ivf_sq::extend`) a few times with new data, the distribution of
-   * which is no longer representative of the original training set.
-   *
-   * The alternative behavior (adaptive_centers = true) is to update the cluster centers for new
-   * data when it is added. In this case, `index.centers()` are always exactly the centroids of the
-   * data in the corresponding clusters. The drawback of this behavior is that the centroids depend
-   * on the order of adding new data (through the classification of the added data); that is,
-   * `index.centers()` "drift" together with the changing distribution of the newly added data.
+   * The number of data vectors per cluster to use during iterative kmeans building.
+   * The index uses at most `n_lists * max_train_points_per_cluster` rows for training.
    */
-  bool adaptive_centers;
+  uint32_t max_train_points_per_cluster;
   /**
    * By default, the algorithm allocates more space than necessary for individual clusters
    * (`list_data`). This allows to amortize the cost of memory allocation and reduce the number of
@@ -77,7 +67,7 @@ typedef struct cuvsIvfSqIndexParams* cuvsIvfSqIndexParams_t;
  * @param[in] index_params cuvsIvfSqIndexParams_t to allocate
  * @return cuvsError_t
  */
-cuvsError_t cuvsIvfSqIndexParamsCreate(cuvsIvfSqIndexParams_t* index_params);
+CUVS_EXPORT cuvsError_t cuvsIvfSqIndexParamsCreate(cuvsIvfSqIndexParams_t* index_params);
 
 /**
  * @brief De-allocate IVF-SQ Index params
@@ -85,7 +75,7 @@ cuvsError_t cuvsIvfSqIndexParamsCreate(cuvsIvfSqIndexParams_t* index_params);
  * @param[in] index_params
  * @return cuvsError_t
  */
-cuvsError_t cuvsIvfSqIndexParamsDestroy(cuvsIvfSqIndexParams_t index_params);
+CUVS_EXPORT cuvsError_t cuvsIvfSqIndexParamsDestroy(cuvsIvfSqIndexParams_t index_params);
 /**
  * @}
  */
@@ -111,7 +101,7 @@ typedef struct cuvsIvfSqSearchParams* cuvsIvfSqSearchParams_t;
  * @param[in] params cuvsIvfSqSearchParams_t to allocate
  * @return cuvsError_t
  */
-cuvsError_t cuvsIvfSqSearchParamsCreate(cuvsIvfSqSearchParams_t* params);
+CUVS_EXPORT cuvsError_t cuvsIvfSqSearchParamsCreate(cuvsIvfSqSearchParams_t* params);
 
 /**
  * @brief De-allocate IVF-SQ search params
@@ -119,7 +109,7 @@ cuvsError_t cuvsIvfSqSearchParamsCreate(cuvsIvfSqSearchParams_t* params);
  * @param[in] params
  * @return cuvsError_t
  */
-cuvsError_t cuvsIvfSqSearchParamsDestroy(cuvsIvfSqSearchParams_t params);
+CUVS_EXPORT cuvsError_t cuvsIvfSqSearchParamsDestroy(cuvsIvfSqSearchParams_t params);
 /**
  * @}
  */
@@ -145,23 +135,23 @@ typedef cuvsIvfSqIndex* cuvsIvfSqIndex_t;
  * @param[in] index cuvsIvfSqIndex_t to allocate
  * @return cuvsError_t
  */
-cuvsError_t cuvsIvfSqIndexCreate(cuvsIvfSqIndex_t* index);
+CUVS_EXPORT cuvsError_t cuvsIvfSqIndexCreate(cuvsIvfSqIndex_t* index);
 
 /**
  * @brief De-allocate IVF-SQ index
  *
  * @param[in] index cuvsIvfSqIndex_t to de-allocate
  */
-cuvsError_t cuvsIvfSqIndexDestroy(cuvsIvfSqIndex_t index);
+CUVS_EXPORT cuvsError_t cuvsIvfSqIndexDestroy(cuvsIvfSqIndex_t index);
 
 /** Get the number of clusters/inverted lists */
-cuvsError_t cuvsIvfSqIndexGetNLists(cuvsIvfSqIndex_t index, int64_t* n_lists);
+CUVS_EXPORT cuvsError_t cuvsIvfSqIndexGetNLists(cuvsIvfSqIndex_t index, int64_t* n_lists);
 
 /** Get the dimensionality of the data */
-cuvsError_t cuvsIvfSqIndexGetDim(cuvsIvfSqIndex_t index, int64_t* dim);
+CUVS_EXPORT cuvsError_t cuvsIvfSqIndexGetDim(cuvsIvfSqIndex_t index, int64_t* dim);
 
 /** Get the size of the index */
-cuvsError_t cuvsIvfSqIndexGetSize(cuvsIvfSqIndex_t index, int64_t* size);
+CUVS_EXPORT cuvsError_t cuvsIvfSqIndexGetSize(cuvsIvfSqIndex_t index, int64_t* size);
 
 /**
  * @brief Get the cluster centers corresponding to the lists [n_lists, dim]
@@ -170,7 +160,7 @@ cuvsError_t cuvsIvfSqIndexGetSize(cuvsIvfSqIndex_t index, int64_t* size);
  * @param[out] centers Preallocated array on host or device memory to store output, [n_lists, dim]
  * @return cuvsError_t
  */
-cuvsError_t cuvsIvfSqIndexGetCenters(cuvsIvfSqIndex_t index, DLManagedTensor* centers);
+CUVS_EXPORT cuvsError_t cuvsIvfSqIndexGetCenters(cuvsIvfSqIndex_t index, DLManagedTensor* centers);
 
 /**
  * @}
@@ -221,7 +211,7 @@ cuvsError_t cuvsIvfSqIndexGetCenters(cuvsIvfSqIndex_t index, DLManagedTensor* ce
  * @param[out] index cuvsIvfSqIndex_t Newly built IVF-SQ index
  * @return cuvsError_t
  */
-cuvsError_t cuvsIvfSqBuild(cuvsResources_t res,
+CUVS_EXPORT cuvsError_t cuvsIvfSqBuild(cuvsResources_t res,
                            cuvsIvfSqIndexParams_t index_params,
                            DLManagedTensor* dataset,
                            cuvsIvfSqIndex_t index);
@@ -274,7 +264,7 @@ cuvsError_t cuvsIvfSqBuild(cuvsResources_t res,
  * @param[out] neighbors DLManagedTensor* output `k` neighbors for queries
  * @param[out] distances DLManagedTensor* output `k` distances for queries
  */
-cuvsError_t cuvsIvfSqSearch(cuvsResources_t res,
+CUVS_EXPORT cuvsError_t cuvsIvfSqSearch(cuvsResources_t res,
                             cuvsIvfSqSearchParams_t search_params,
                             cuvsIvfSqIndex_t index,
                             DLManagedTensor* queries,
@@ -294,7 +284,7 @@ cuvsError_t cuvsIvfSqSearch(cuvsResources_t res,
  * @param[out] distances DLManagedTensor* output `k` distances for queries
  * @param[in] filter cuvsFilter to filter neighbors based on the given bitset
  */
-cuvsError_t cuvsIvfSqSearchWithFilter(cuvsResources_t res,
+CUVS_EXPORT cuvsError_t cuvsIvfSqSearchWithFilter(cuvsResources_t res,
                                       cuvsIvfSqSearchParams_t search_params,
                                       cuvsIvfSqIndex_t index,
                                       DLManagedTensor* queries,
@@ -330,7 +320,7 @@ cuvsError_t cuvsIvfSqSearchWithFilter(cuvsResources_t res,
  * @param[in] filename the file name for saving the index
  * @param[in] index IVF-SQ index
  */
-cuvsError_t cuvsIvfSqSerialize(cuvsResources_t res, const char* filename, cuvsIvfSqIndex_t index);
+CUVS_EXPORT cuvsError_t cuvsIvfSqSerialize(cuvsResources_t res, const char* filename, cuvsIvfSqIndex_t index);
 
 /**
  * Load index from file.
@@ -341,7 +331,7 @@ cuvsError_t cuvsIvfSqSerialize(cuvsResources_t res, const char* filename, cuvsIv
  * @param[in] filename the name of the file that stores the index
  * @param[out] index IVF-SQ index loaded from disk
  */
-cuvsError_t cuvsIvfSqDeserialize(cuvsResources_t res,
+CUVS_EXPORT cuvsError_t cuvsIvfSqDeserialize(cuvsResources_t res,
                                  const char* filename,
                                  cuvsIvfSqIndex_t index);
 /**
@@ -357,11 +347,12 @@ cuvsError_t cuvsIvfSqDeserialize(cuvsResources_t res,
  *
  * @param[in] res cuvsResources_t opaque C handle
  * @param[in] new_vectors DLManagedTensor* the new vectors to add to the index
- * @param[in] new_indices DLManagedTensor* vector of new indices for the new vectors
+ * @param[in] new_indices DLManagedTensor* vector of new indices for the new vectors. If the index
+ *            is empty, this can be NULL to imply a continuous range `[0...n_rows)`.
  * @param[inout] index IVF-SQ index to be extended
  * @return cuvsError_t
  */
-cuvsError_t cuvsIvfSqExtend(cuvsResources_t res,
+CUVS_EXPORT cuvsError_t cuvsIvfSqExtend(cuvsResources_t res,
                             DLManagedTensor* new_vectors,
                             DLManagedTensor* new_indices,
                             cuvsIvfSqIndex_t index);
diff --git a/c/src/neighbors/ivf_sq.cpp b/c/src/neighbors/ivf_sq.cpp
index eadf84a299..80f9e7e3d5 100644
--- a/c/src/neighbors/ivf_sq.cpp
+++ b/c/src/neighbors/ivf_sq.cpp
@@ -4,6 +4,7 @@
  */
 
 #include <cstdint>
+#include <optional>
 #include <dlpack/dlpack.h>
 
 #include <raft/core/error.hpp>
@@ -26,8 +27,7 @@ void convert_c_index_params(cuvsIvfSqIndexParams params,
   out->add_data_on_build             = params.add_data_on_build;
   out->n_lists                       = params.n_lists;
   out->kmeans_n_iters                = params.kmeans_n_iters;
-  out->kmeans_trainset_fraction      = params.kmeans_trainset_fraction;
-  out->adaptive_centers              = params.adaptive_centers;
+  out->max_train_points_per_cluster  = params.max_train_points_per_cluster;
   out->conservative_memory_allocation = params.conservative_memory_allocation;
 }
 void convert_c_search_params(cuvsIvfSqSearchParams params,
@@ -39,6 +39,17 @@ void convert_c_search_params(cuvsIvfSqSearchParams params,
 
 namespace {
 
+using index_type = cuvs::neighbors::ivf_sq::index<uint8_t>;
+
+void _reset_index(cuvsIvfSqIndex_t index)
+{
+  RAFT_EXPECTS(index != nullptr, "index cannot be null");
+  auto index_ptr = reinterpret_cast<index_type*>(index->addr);
+  index->addr    = 0;
+  index->dtype   = DLDataType{};
+  delete index_ptr;
+}
+
 template <typename T>
 void* _build(cuvsResources_t res, cuvsIvfSqIndexParams params, DLManagedTensor* dataset_tensor)
 {
@@ -48,21 +59,16 @@ void* _build(cuvsResources_t res, cuvsIvfSqIndexParams params, DLManagedTensor*
   cuvs::neighbors::ivf_sq::convert_c_index_params(params, &build_params);
 
   auto dataset = dataset_tensor->dl_tensor;
-  auto dim     = dataset.shape[1];
-
-  auto index = new cuvs::neighbors::ivf_sq::index<uint8_t>(*res_ptr, build_params, dim);
 
   if (cuvs::core::is_dlpack_device_compatible(dataset)) {
     using mdspan_type = raft::device_matrix_view<const T, int64_t, raft::row_major>;
     auto mds          = cuvs::core::from_dlpack<mdspan_type>(dataset_tensor);
-    cuvs::neighbors::ivf_sq::build(*res_ptr, build_params, mds, *index);
+    return new index_type(cuvs::neighbors::ivf_sq::build(*res_ptr, build_params, mds));
   } else {
-    using mdspan_type = raft::host_matrix_view<T const, int64_t, raft::row_major>;
+    using mdspan_type = raft::host_matrix_view<const T, int64_t, raft::row_major>;
     auto mds          = cuvs::core::from_dlpack<mdspan_type>(dataset_tensor);
-    cuvs::neighbors::ivf_sq::build(*res_ptr, build_params, mds, *index);
+    return new index_type(cuvs::neighbors::ivf_sq::build(*res_ptr, build_params, mds));
   }
-
-  return index;
 }
 
 template <typename T>
@@ -134,21 +140,28 @@ void _extend(cuvsResources_t res,
   auto index_ptr = reinterpret_cast<cuvs::neighbors::ivf_sq::index<uint8_t>*>(index.addr);
 
   bool on_device = cuvs::core::is_dlpack_device_compatible(new_vectors->dl_tensor);
-  if (on_device != cuvs::core::is_dlpack_device_compatible(new_indices->dl_tensor)) {
+  if (new_indices != nullptr &&
+      on_device != cuvs::core::is_dlpack_device_compatible(new_indices->dl_tensor)) {
     RAFT_FAIL("extend inputs must both either be on device memory or host memory");
   }
 
   if (on_device) {
     using vectors_mdspan_type = raft::device_matrix_view<const T, int64_t, raft::row_major>;
-    using indices_mdspan_type = raft::device_vector_view<int64_t, int64_t>;
+    using indices_mdspan_type = raft::device_vector_view<const int64_t, int64_t>;
     auto vectors_mds          = cuvs::core::from_dlpack<vectors_mdspan_type>(new_vectors);
-    auto indices_mds          = cuvs::core::from_dlpack<indices_mdspan_type>(new_indices);
+    std::optional<indices_mdspan_type> indices_mds;
+    if (new_indices != nullptr) {
+      indices_mds.emplace(cuvs::core::from_dlpack<indices_mdspan_type>(new_indices));
+    }
     cuvs::neighbors::ivf_sq::extend(*res_ptr, vectors_mds, indices_mds, index_ptr);
   } else {
     using vectors_mdspan_type = raft::host_matrix_view<const T, int64_t, raft::row_major>;
-    using indices_mdspan_type = raft::host_vector_view<int64_t, int64_t>;
+    using indices_mdspan_type = raft::host_vector_view<const int64_t, int64_t>;
     auto vectors_mds          = cuvs::core::from_dlpack<vectors_mdspan_type>(new_vectors);
-    auto indices_mds          = cuvs::core::from_dlpack<indices_mdspan_type>(new_indices);
+    std::optional<indices_mdspan_type> indices_mds;
+    if (new_indices != nullptr) {
+      indices_mds.emplace(cuvs::core::from_dlpack<indices_mdspan_type>(new_indices));
+    }
     cuvs::neighbors::ivf_sq::extend(*res_ptr, vectors_mds, indices_mds, index_ptr);
   }
 }
@@ -168,9 +181,7 @@ extern "C" cuvsError_t cuvsIvfSqIndexCreate(cuvsIvfSqIndex_t* index)
 extern "C" cuvsError_t cuvsIvfSqIndexDestroy(cuvsIvfSqIndex_t index_c_ptr)
 {
   return cuvs::core::translate_exceptions([=] {
-    auto index     = *index_c_ptr;
-    auto index_ptr = reinterpret_cast<cuvs::neighbors::ivf_sq::index<uint8_t>*>(index.addr);
-    delete index_ptr;
+    _reset_index(index_c_ptr);
     delete index_c_ptr;
   });
 }
@@ -183,6 +194,15 @@ extern "C" cuvsError_t cuvsIvfSqBuild(cuvsResources_t res,
   return cuvs::core::translate_exceptions([=] {
     auto dataset = dataset_tensor->dl_tensor;
 
+    if (dataset.dtype.code != kDLFloat ||
+        (dataset.dtype.bits != 32 && dataset.dtype.bits != 16)) {
+      RAFT_FAIL("Unsupported dataset DLtensor dtype: %d and bits: %d",
+                dataset.dtype.code,
+                dataset.dtype.bits);
+    }
+
+    _reset_index(index);
+
     index->dtype.code = dataset.dtype.code;
     index->dtype.bits = dataset.dtype.bits;
 
@@ -190,10 +210,6 @@ extern "C" cuvsError_t cuvsIvfSqBuild(cuvsResources_t res,
       index->addr = reinterpret_cast<uintptr_t>(_build<float>(res, *params, dataset_tensor));
     } else if (dataset.dtype.code == kDLFloat && dataset.dtype.bits == 16) {
       index->addr = reinterpret_cast<uintptr_t>(_build<half>(res, *params, dataset_tensor));
-    } else {
-      RAFT_FAIL("Unsupported dataset DLtensor dtype: %d and bits: %d",
-                dataset.dtype.code,
-                dataset.dtype.bits);
     }
   });
 }
@@ -269,8 +285,7 @@ extern "C" cuvsError_t cuvsIvfSqIndexParamsCreate(cuvsIvfSqIndexParams_t* params
                                        .add_data_on_build              = true,
                                        .n_lists                        = 1024,
                                        .kmeans_n_iters                 = 20,
-                                       .kmeans_trainset_fraction       = 0.5,
-                                       .adaptive_centers               = false,
+                                       .max_train_points_per_cluster    = 256,
                                        .conservative_memory_allocation = false};
   });
 }
@@ -295,8 +310,10 @@ extern "C" cuvsError_t cuvsIvfSqDeserialize(cuvsResources_t res,
                                             const char* filename,
                                             cuvsIvfSqIndex_t index)
 {
-  return cuvs::core::translate_exceptions(
-    [=] { index->addr = reinterpret_cast<uintptr_t>(_deserialize(res, filename)); });
+  return cuvs::core::translate_exceptions([=] {
+    _reset_index(index);
+    index->addr = reinterpret_cast<uintptr_t>(_deserialize(res, filename));
+  });
 }
 
 extern "C" cuvsError_t cuvsIvfSqSerialize(cuvsResources_t res,
diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
index a17992ff19..14150f33c2 100644
--- a/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
+++ b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh
@@ -487,7 +487,7 @@ inline void search_with_filtering(raft::resources const& handle,
                                            cuvs::distance::is_min_close(index.metric()),
                                            neighbors + std::size_t(offset_q) * k,
                                            distances + std::size_t(offset_q) * k,
-                                           raft::resource::get_workspace_resource(handle),
+                                           raft::resource::get_workspace_resource_ref(handle),
                                            sample_filter);
   }
 }

From 44dfa3879def3c41123ed8ab3be1408d8c3d6d5a Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Tue, 26 May 2026 15:23:15 +0200
Subject: [PATCH 41/43] Docs to Fern

---
 cpp/include/cuvs/neighbors/ivf_sq.hpp         |  27 +-
 docs/source/cpp_api/neighbors.rst             |  25 --
 docs/source/cpp_api/neighbors_ivf_sq.rst      |  68 ----
 fern/docs.yml                                 |   2 +
 .../pages/cpp_api/cpp-api-neighbors-ivf-sq.md | 373 ++++++++++++++++++
 fern/pages/cpp_api/index.md                   |   1 +
 fern/scripts/generate_api_reference.py        |   5 +
 7 files changed, 399 insertions(+), 102 deletions(-)
 delete mode 100644 docs/source/cpp_api/neighbors.rst
 delete mode 100644 docs/source/cpp_api/neighbors_ivf_sq.rst
 create mode 100644 fern/pages/cpp_api/cpp-api-neighbors-ivf-sq.md

diff --git a/cpp/include/cuvs/neighbors/ivf_sq.hpp b/cpp/include/cuvs/neighbors/ivf_sq.hpp
index 58134fe62b..93952f9747 100644
--- a/cpp/include/cuvs/neighbors/ivf_sq.hpp
+++ b/cpp/include/cuvs/neighbors/ivf_sq.hpp
@@ -137,23 +137,32 @@ using list_data = ivf::list<list_spec, SizeT, CodeT, IdxT>;
  * In the IVF-SQ index, a database vector is first assigned to the nearest cluster center
  * using an inverted file (IVF) structure, and then compressed using scalar quantization (SQ).
  *
- * Scalar quantization independently maps each dimension of the per-cluster residual (the
- * input vector minus its assigned centroid) to a fixed-width integer code. For 8-bit
- * quantization (uint8_t), each residual component r_i = x_i - centroid_i is linearly
- * mapped to an integer in [0, 255] using learned per-dimension minimum (`sq_vmin`) and
- * step-size (`sq_delta`) values:
+ * Scalar quantization independently maps each dimension of the per-cluster residual (the input
+ * vector minus its assigned centroid) to a fixed-width integer code. For 8-bit quantization
+ * (`uint8_t`), each residual component is linearly mapped to an integer in [0, 255] using learned
+ * per-dimension minimum (`sq_vmin`) and step-size (`sq_delta`) values.
  *
- *   code_i = clamp(round((r_i - vmin_i) / delta_i), 0, 255)
+ * For a vector component `x_i`, centroid component `centroid_i`, residual minimum `vmin_i`, and
+ * quantization step `delta_i`, the stored code is:
  *
- * where delta_i is the per-level step size (range divided by 255), so the corresponding
- * reconstruction is x_i ≈ centroid_i + vmin_i + code_i * delta_i.
+ * $$code_i = clamp(round((x_i - centroid_i - vmin_i) / delta_i), 0, 255)$$
+ *
+ * The corresponding reconstructed component is:
+ *
+ * $$x_i \approx centroid_i + vmin_i + code_i \cdot delta_i$$
  *
  * This provides a compact representation (1 byte per dimension) while preserving the relative
  * distances between vectors with high fidelity, offering a good trade-off between index size,
  * search speed, and recall compared to flat (uncompressed) and product-quantized (PQ)
  * representations.
  *
- * @tparam CodeT  SQ code type. Only uint8_t (8-bit, codes in [0,255]) for now.
+ * Note: `CodeT` is the storage type for scalar-quantized residual codes in the inverted lists, not
+ * the input dataset type. The public build and search APIs accept `float` and `half` input vectors,
+ * then store the quantized residual components as `CodeT` inside the index. Currently, IVF-SQ
+ * supports only `uint8_t` codes, so use `CodeT = uint8_t`. Each code uses the full 8-bit range
+ * [0, 255].
+ *
+ * @tparam CodeT SQ code storage type. Must be `uint8_t` in the current implementation.
  *
  */
 template <typename CodeT>
diff --git a/docs/source/cpp_api/neighbors.rst b/docs/source/cpp_api/neighbors.rst
deleted file mode 100644
index 2aed5ca9e0..0000000000
--- a/docs/source/cpp_api/neighbors.rst
+++ /dev/null
@@ -1,25 +0,0 @@
-Nearest Neighbors
-=================
-
-.. role:: py(code)
-   :language: c++
-   :class: highlight
-
-.. toctree::
-   :maxdepth: 2
-   :caption: Contents:
-
-   neighbors_all_neighbors.rst
-   neighbors_bruteforce.rst
-   neighbors_cagra.rst
-   neighbors_dynamic_batching.rst
-   neighbors_epsilon_neighborhood.rst
-   neighbors_filter.rst
-   neighbors_hnsw.rst
-   neighbors_ivf_flat.rst
-   neighbors_ivf_pq.rst
-   neighbors_ivf_sq.rst
-   neighbors_mg.rst
-   neighbors_nn_descent.rst
-   neighbors_refine.rst
-   neighbors_vamana.rst
diff --git a/docs/source/cpp_api/neighbors_ivf_sq.rst b/docs/source/cpp_api/neighbors_ivf_sq.rst
deleted file mode 100644
index d0554f926a..0000000000
--- a/docs/source/cpp_api/neighbors_ivf_sq.rst
+++ /dev/null
@@ -1,68 +0,0 @@
-IVF-SQ
-======
-
-The IVF-SQ method is an ANN algorithm. Like IVF-Flat, IVF-SQ splits the points into a number of clusters (also specified by a parameter called n_lists) and searches the closest clusters to compute the nearest neighbors (also specified by a parameter called n_probes), but it shrinks the sizes of the vectors using scalar quantization, independently mapping each dimension to a fixed-width integer code.
-
-.. role:: py(code)
-   :language: c++
-   :class: highlight
-
-``#include <cuvs/neighbors/ivf_sq.hpp>``
-
-namespace *cuvs::neighbors::ivf_sq*
-
-Index build parameters
-----------------------
-
-.. doxygengroup:: ivf_sq_cpp_index_params
-    :project: cuvs
-    :members:
-    :content-only:
-
-Index search parameters
------------------------
-
-.. doxygengroup:: ivf_sq_cpp_search_params
-    :project: cuvs
-    :members:
-    :content-only:
-
-Index
------
-
-.. doxygengroup:: ivf_sq_cpp_index
-    :project: cuvs
-    :members:
-    :content-only:
-
-Index build
------------
-
-.. doxygengroup:: ivf_sq_cpp_index_build
-    :project: cuvs
-    :members:
-    :content-only:
-
-Index extend
-------------
-
-.. doxygengroup:: ivf_sq_cpp_index_extend
-    :project: cuvs
-    :members:
-    :content-only:
-
-Index search
-------------
-
-.. doxygengroup:: ivf_sq_cpp_index_search
-    :project: cuvs
-    :members:
-    :content-only:
-
-Index serialize
----------------
-
-.. doxygengroup:: ivf_sq_cpp_index_serialize
-    :project: cuvs
-    :members:
-    :content-only:
diff --git a/fern/docs.yml b/fern/docs.yml
index 810bd8f1eb..f3eb0508ee 100644
--- a/fern/docs.yml
+++ b/fern/docs.yml
@@ -307,6 +307,8 @@ navigation:
             path: "./pages/cpp_api/cpp-api-neighbors-ivf-flat.md"
           - page: "Neighbors IVF PQ"
             path: "./pages/cpp_api/cpp-api-neighbors-ivf-pq.md"
+          - page: "Neighbors IVF SQ"
+            path: "./pages/cpp_api/cpp-api-neighbors-ivf-sq.md"
           - page: "Neighbors NN Descent"
             path: "./pages/cpp_api/cpp-api-neighbors-nn-descent.md"
           - page: "Neighbors Refine"
diff --git a/fern/pages/cpp_api/cpp-api-neighbors-ivf-sq.md b/fern/pages/cpp_api/cpp-api-neighbors-ivf-sq.md
new file mode 100644
index 0000000000..1872199a47
--- /dev/null
+++ b/fern/pages/cpp_api/cpp-api-neighbors-ivf-sq.md
@@ -0,0 +1,373 @@
+---
+slug: api-reference/cpp-api-neighbors-ivf-sq
+---
+
+# IVF SQ
+
+_Source header: `cuvs/neighbors/ivf_sq.hpp`_
+
+## IVF-SQ index build parameters
+
+<a id="kindexgroupsize"></a>
+### kIndexGroupSize
+
+IVF-SQ index build parameters
+
+```cpp
+constexpr static uint32_t kIndexGroupSize = 32;
+```
+
+<a id="cuvs-neighbors-ivf-sq-index-params"></a>
+### cuvs::neighbors::ivf_sq::index_params
+
+IVF-SQ index build parameters.
+
+IVF-SQ currently uses 8-bit scalar quantization, storing one `uint8_t` code per vector dimension.
+
+```cpp
+struct index_params : cuvs::neighbors::index_params { ... };
+```
+
+**Fields**
+
+| Name | Type | Description |
+| --- | --- | --- |
+| `n_lists` | `uint32_t` | The number of inverted lists (clusters) |
+| `kmeans_n_iters` | `uint32_t` | The number of iterations searching for kmeans centers (index building). |
+| `max_train_points_per_cluster` | `uint32_t` | The number of data vectors (per cluster) to use during iterative kmeans building. |
+| `conservative_memory_allocation` | `bool` | By default, the algorithm allocates more space than necessary for individual clusters (`list_data`). This allows to amortize the cost of memory allocation and reduce the number of data copies during repeated calls to `extend` (extending the database). The alternative is the conservative allocation behavior; when enabled, the algorithm always allocates the minimum amount of memory required to store the given number of records. Set this flag to `true` if you prefer to use as little GPU memory for the database as possible. |
+| `add_data_on_build` | `bool` | Whether to add the dataset content to the index, i.e.:<br />- `true` means the index is filled with the dataset vectors and ready to search after calling `build`.<br />- `false` means `build` only trains the underlying model (e.g. quantizer or clustering), but the index is left empty; you'd need to call `extend` on the index afterwards to populate it. |
+
+## IVF-SQ index search parameters
+
+<a id="cuvs-neighbors-ivf-sq-search-params"></a>
+### cuvs::neighbors::ivf_sq::search_params
+
+IVF-SQ index search parameters
+
+```cpp
+struct search_params : cuvs::neighbors::search_params { ... };
+```
+
+**Fields**
+
+| Name | Type | Description |
+| --- | --- | --- |
+| `n_probes` | `uint32_t` | The number of clusters to search. |
+
+## IVF-SQ list storage spec
+
+<a id="cuvs-neighbors-ivf-sq-list-spec"></a>
+### cuvs::neighbors::ivf_sq::list_spec
+
+IVF-SQ list storage spec
+
+```cpp
+template <typename SizeT, typename CodeT, typename IdxT>
+struct list_spec { ... };
+```
+
+**Fields**
+
+| Name | Type | Description |
+| --- | --- | --- |
+| `align_max` | `SizeT` |  |
+| `align_min` | `SizeT` |  |
+| `dim` | `uint32_t` |  |
+
+## IVF-SQ index
+
+<a id="cuvs-neighbors-ivf-sq-index"></a>
+### cuvs::neighbors::ivf_sq::index
+
+IVF-SQ index.
+
+In the IVF-SQ index, a database vector is first assigned to the nearest cluster center using an inverted file (IVF) structure, and then compressed using scalar quantization (SQ).
+
+Scalar quantization independently maps each dimension of the per-cluster residual (the input vector minus its assigned centroid) to a fixed-width integer code. For 8-bit quantization (`uint8_t`), each residual component is linearly mapped to an integer in [0, 255] using learned per-dimension minimum (`sq_vmin`) and step-size (`sq_delta`) values.
+
+For a vector component `x_i`, centroid component `centroid_i`, residual minimum `vmin_i`, and quantization step `delta_i`, the stored code is:
+
+$$code_i = clamp(round((x_i - centroid_i - vmin_i) / delta_i), 0, 255)$$
+
+The corresponding reconstructed component is:
+
+$$x_i \approx centroid_i + vmin_i + code_i \cdot delta_i$$
+
+This provides a compact representation (1 byte per dimension) while preserving the relative distances between vectors with high fidelity, offering a good trade-off between index size, search speed, and recall compared to flat (uncompressed) and product-quantized (PQ) representations.
+
+Note: `CodeT` is the storage type for scalar-quantized residual codes in the inverted lists, not the input dataset type. The public build and search APIs accept `float` and `half` input vectors, then store the quantized residual components as `CodeT` inside the index. Currently, IVF-SQ supports only `uint8_t` codes, so use `CodeT = uint8_t`. Each code uses the full 8-bit range [0, 255].
+
+```cpp
+template <typename CodeT>
+struct index : cuvs::neighbors::index { ... };
+```
+
+## IVF-SQ index build
+
+<a id="cuvs-neighbors-ivf-sq-build"></a>
+### cuvs::neighbors::ivf_sq::build
+
+Build the index from the dataset for efficient search.
+
+```cpp
+auto build(raft::resources const& handle,
+const cuvs::neighbors::ivf_sq::index_params& index_params,
+raft::device_matrix_view<const float, int64_t, raft::row_major> dataset)
+-> cuvs::neighbors::ivf_sq::index<uint8_t>;
+```
+
+NB: Currently, the following distance metrics are supported:
+
+- L2Expanded
+- L2SqrtExpanded
+- InnerProduct
+- CosineExpanded
+
+Usage example:
+
+**Parameters**
+
+| Name | Direction | Type | Description |
+| --- | --- | --- | --- |
+| `handle` | in | `raft::resources const&` |  |
+| `index_params` | in | [`const cuvs::neighbors::ivf_sq::index_params&`](/api-reference/cpp-api-neighbors-ivf-sq#cuvs-neighbors-ivf-sq-index-params) | configure the index building |
+| `dataset` | in | `raft::device_matrix_view<const float, int64_t, raft::row_major>` | a device pointer to a row-major matrix [n_rows, dim] |
+
+**Returns**
+
+[`cuvs::neighbors::ivf_sq::index<uint8_t>`](/api-reference/cpp-api-neighbors-ivf-sq#cuvs-neighbors-ivf-sq-index)
+
+**Additional overload:** `cuvs::neighbors::ivf_sq::build`
+
+Build the index from the dataset for efficient search.
+
+```cpp
+auto build(raft::resources const& handle,
+const cuvs::neighbors::ivf_sq::index_params& index_params,
+raft::device_matrix_view<const half, int64_t, raft::row_major> dataset)
+-> cuvs::neighbors::ivf_sq::index<uint8_t>;
+```
+
+Usage example:
+
+**Parameters**
+
+| Name | Direction | Type | Description |
+| --- | --- | --- | --- |
+| `handle` | in | `raft::resources const&` |  |
+| `index_params` | in | [`const cuvs::neighbors::ivf_sq::index_params&`](/api-reference/cpp-api-neighbors-ivf-sq#cuvs-neighbors-ivf-sq-index-params) | configure the index building |
+| `dataset` | in | `raft::device_matrix_view<const half, int64_t, raft::row_major>` | a device pointer to a row-major matrix [n_rows, dim] |
+
+**Returns**
+
+[`cuvs::neighbors::ivf_sq::index<uint8_t>`](/api-reference/cpp-api-neighbors-ivf-sq#cuvs-neighbors-ivf-sq-index)
+
+**Additional overload:** `cuvs::neighbors::ivf_sq::build`
+
+Build the index from the dataset for efficient search.
+
+```cpp
+auto build(raft::resources const& handle,
+const cuvs::neighbors::ivf_sq::index_params& index_params,
+raft::host_matrix_view<const float, int64_t, raft::row_major> dataset)
+-> cuvs::neighbors::ivf_sq::index<uint8_t>;
+```
+
+Usage example:
+
+**Parameters**
+
+| Name | Direction | Type | Description |
+| --- | --- | --- | --- |
+| `handle` | in | `raft::resources const&` |  |
+| `index_params` | in | [`const cuvs::neighbors::ivf_sq::index_params&`](/api-reference/cpp-api-neighbors-ivf-sq#cuvs-neighbors-ivf-sq-index-params) | configure the index building |
+| `dataset` | in | `raft::host_matrix_view<const float, int64_t, raft::row_major>` | a host pointer to a row-major matrix [n_rows, dim] |
+
+**Returns**
+
+[`cuvs::neighbors::ivf_sq::index<uint8_t>`](/api-reference/cpp-api-neighbors-ivf-sq#cuvs-neighbors-ivf-sq-index)
+
+**Additional overload:** `cuvs::neighbors::ivf_sq::build`
+
+Build the index from the dataset for efficient search.
+
+```cpp
+auto build(raft::resources const& handle,
+const cuvs::neighbors::ivf_sq::index_params& index_params,
+raft::host_matrix_view<const half, int64_t, raft::row_major> dataset)
+-> cuvs::neighbors::ivf_sq::index<uint8_t>;
+```
+
+Usage example:
+
+**Parameters**
+
+| Name | Direction | Type | Description |
+| --- | --- | --- | --- |
+| `handle` | in | `raft::resources const&` |  |
+| `index_params` | in | [`const cuvs::neighbors::ivf_sq::index_params&`](/api-reference/cpp-api-neighbors-ivf-sq#cuvs-neighbors-ivf-sq-index-params) | configure the index building |
+| `dataset` | in | `raft::host_matrix_view<const half, int64_t, raft::row_major>` | a host pointer to a row-major matrix [n_rows, dim] |
+
+**Returns**
+
+[`cuvs::neighbors::ivf_sq::index<uint8_t>`](/api-reference/cpp-api-neighbors-ivf-sq#cuvs-neighbors-ivf-sq-index)
+
+## IVF-SQ index extend
+
+<a id="cuvs-neighbors-ivf-sq-extend"></a>
+### cuvs::neighbors::ivf_sq::extend
+
+Extend the index with the new data in-place.
+
+```cpp
+void extend(raft::resources const& handle,
+raft::device_matrix_view<const float, int64_t, raft::row_major> new_vectors,
+std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,
+cuvs::neighbors::ivf_sq::index<uint8_t>* idx);
+```
+
+Usage example:
+
+**Parameters**
+
+| Name | Direction | Type | Description |
+| --- | --- | --- | --- |
+| `handle` | in | `raft::resources const&` |  |
+| `new_vectors` | in | `raft::device_matrix_view<const float, int64_t, raft::row_major>` | a device matrix view to a row-major matrix [n_rows, idx.dim()] |
+| `new_indices` | in | `std::optional<raft::device_vector_view<const int64_t, int64_t>>` | a device vector view to a vector of indices [n_rows]. If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt` here to imply a continuous range `[0...n_rows)`. |
+| `idx` | inout | [`cuvs::neighbors::ivf_sq::index<uint8_t>*`](/api-reference/cpp-api-neighbors-ivf-sq#cuvs-neighbors-ivf-sq-index) | pointer to ivf_sq::index |
+
+**Returns**
+
+`void`
+
+**Additional overload:** `cuvs::neighbors::ivf_sq::extend`
+
+Extend the index with the new data in-place.
+
+```cpp
+void extend(raft::resources const& handle,
+raft::device_matrix_view<const half, int64_t, raft::row_major> new_vectors,
+std::optional<raft::device_vector_view<const int64_t, int64_t>> new_indices,
+cuvs::neighbors::ivf_sq::index<uint8_t>* idx);
+```
+
+Usage example:
+
+**Parameters**
+
+| Name | Direction | Type | Description |
+| --- | --- | --- | --- |
+| `handle` | in | `raft::resources const&` |  |
+| `new_vectors` | in | `raft::device_matrix_view<const half, int64_t, raft::row_major>` | a device matrix view to a row-major matrix [n_rows, idx.dim()] |
+| `new_indices` | in | `std::optional<raft::device_vector_view<const int64_t, int64_t>>` | a device vector view to a vector of indices [n_rows]. If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt` here to imply a continuous range `[0...n_rows)`. |
+| `idx` | inout | [`cuvs::neighbors::ivf_sq::index<uint8_t>*`](/api-reference/cpp-api-neighbors-ivf-sq#cuvs-neighbors-ivf-sq-index) | pointer to ivf_sq::index |
+
+**Returns**
+
+`void`
+
+**Additional overload:** `cuvs::neighbors::ivf_sq::extend`
+
+Extend the index with the new data in-place.
+
+```cpp
+void extend(raft::resources const& handle,
+raft::host_matrix_view<const float, int64_t, raft::row_major> new_vectors,
+std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,
+cuvs::neighbors::ivf_sq::index<uint8_t>* idx);
+```
+
+Usage example:
+
+**Parameters**
+
+| Name | Direction | Type | Description |
+| --- | --- | --- | --- |
+| `handle` | in | `raft::resources const&` |  |
+| `new_vectors` | in | `raft::host_matrix_view<const float, int64_t, raft::row_major>` | a host matrix view to a row-major matrix [n_rows, idx.dim()] |
+| `new_indices` | in | `std::optional<raft::host_vector_view<const int64_t, int64_t>>` | a host vector view to a vector of indices [n_rows]. If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt` here to imply a continuous range `[0...n_rows)`. |
+| `idx` | inout | [`cuvs::neighbors::ivf_sq::index<uint8_t>*`](/api-reference/cpp-api-neighbors-ivf-sq#cuvs-neighbors-ivf-sq-index) | pointer to ivf_sq::index |
+
+**Returns**
+
+`void`
+
+**Additional overload:** `cuvs::neighbors::ivf_sq::extend`
+
+Extend the index with the new data in-place.
+
+```cpp
+void extend(raft::resources const& handle,
+raft::host_matrix_view<const half, int64_t, raft::row_major> new_vectors,
+std::optional<raft::host_vector_view<const int64_t, int64_t>> new_indices,
+cuvs::neighbors::ivf_sq::index<uint8_t>* idx);
+```
+
+Usage example:
+
+**Parameters**
+
+| Name | Direction | Type | Description |
+| --- | --- | --- | --- |
+| `handle` | in | `raft::resources const&` |  |
+| `new_vectors` | in | `raft::host_matrix_view<const half, int64_t, raft::row_major>` | a host matrix view to a row-major matrix [n_rows, idx.dim()] |
+| `new_indices` | in | `std::optional<raft::host_vector_view<const int64_t, int64_t>>` | a host vector view to a vector of indices [n_rows]. If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt` here to imply a continuous range `[0...n_rows)`. |
+| `idx` | inout | [`cuvs::neighbors::ivf_sq::index<uint8_t>*`](/api-reference/cpp-api-neighbors-ivf-sq#cuvs-neighbors-ivf-sq-index) | pointer to ivf_sq::index |
+
+**Returns**
+
+`void`
+
+## IVF-SQ index serialize
+
+<a id="cuvs-neighbors-ivf-sq-serialize"></a>
+### cuvs::neighbors::ivf_sq::serialize
+
+Save the index to file.
+
+```cpp
+void serialize(raft::resources const& handle,
+const std::string& filename,
+const cuvs::neighbors::ivf_sq::index<uint8_t>& index);
+```
+
+Experimental, both the API and the serialization format are subject to change.
+
+**Parameters**
+
+| Name | Direction | Type | Description |
+| --- | --- | --- | --- |
+| `handle` | in | `raft::resources const&` | the raft handle |
+| `filename` | in | `const std::string&` | the file name for saving the index |
+| `index` | in | [`const cuvs::neighbors::ivf_sq::index<uint8_t>&`](/api-reference/cpp-api-neighbors-ivf-sq#cuvs-neighbors-ivf-sq-index) | IVF-SQ index |
+
+**Returns**
+
+`void`
+
+<a id="cuvs-neighbors-ivf-sq-deserialize"></a>
+### cuvs::neighbors::ivf_sq::deserialize
+
+Load index from file.
+
+```cpp
+void deserialize(raft::resources const& handle,
+const std::string& filename,
+cuvs::neighbors::ivf_sq::index<uint8_t>* index);
+```
+
+Experimental, both the API and the serialization format are subject to change.
+
+**Parameters**
+
+| Name | Direction | Type | Description |
+| --- | --- | --- | --- |
+| `handle` | in | `raft::resources const&` | the raft handle |
+| `filename` | in | `const std::string&` | the name of the file that stores the index |
+| `index` | out | [`cuvs::neighbors::ivf_sq::index<uint8_t>*`](/api-reference/cpp-api-neighbors-ivf-sq#cuvs-neighbors-ivf-sq-index) | IVF-SQ index |
+
+**Returns**
+
+`void`
diff --git a/fern/pages/cpp_api/index.md b/fern/pages/cpp_api/index.md
index e233cb8b16..7449d9b74b 100644
--- a/fern/pages/cpp_api/index.md
+++ b/fern/pages/cpp_api/index.md
@@ -18,6 +18,7 @@ These pages are generated from the documented public headers in the cuVS source
 - [HNSW](/api-reference/cpp-api-neighbors-hnsw)
 - [IVF Flat](/api-reference/cpp-api-neighbors-ivf-flat)
 - [IVF PQ](/api-reference/cpp-api-neighbors-ivf-pq)
+- [IVF SQ](/api-reference/cpp-api-neighbors-ivf-sq)
 - [NN Descent](/api-reference/cpp-api-neighbors-nn-descent)
 - [Refine](/api-reference/cpp-api-neighbors-refine)
 - [Scann](/api-reference/cpp-api-neighbors-scann)
diff --git a/fern/scripts/generate_api_reference.py b/fern/scripts/generate_api_reference.py
index 81f782806c..994da2315a 100755
--- a/fern/scripts/generate_api_reference.py
+++ b/fern/scripts/generate_api_reference.py
@@ -1901,6 +1901,8 @@ def parse_plain_struct_members(
         declaration = strip_member_initializer(declaration)
         if not declaration:
             continue
+        if declaration.startswith((",", ":")):
+            continue
         if not declaration or declaration.startswith(
             (
                 "enum ",
@@ -2511,6 +2513,7 @@ def python_title(module: str) -> str:
         "Mg": "Multi-GPU",
         "Ivf": "IVF",
         "Pq": "PQ",
+        "Sq": "SQ",
         "Pca": "PCA",
         "Hnsw": "HNSW",
         "Nn": "NN",
@@ -3593,6 +3596,7 @@ def api_route_title(slug: str) -> str:
         "nn": "NN",
         "pca": "PCA",
         "pq": "PQ",
+        "sq": "SQ",
         "vq": "VQ",
     }
     words = []
@@ -3703,6 +3707,7 @@ def native_page_title(source: str) -> str:
         "C Api": "C API",
         "Ivf": "IVF",
         "Pq": "PQ",
+        "Sq": "SQ",
         "Pca": "PCA",
         "Hnsw": "HNSW",
         "Nn": "NN",

From 8141440284f5279b913796f4cbe24161f7a96b66 Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Tue, 26 May 2026 15:49:26 +0200
Subject: [PATCH 42/43] Doc to fern

---
 fern/docs.yml                              |   2 +
 fern/pages/c_api/c-api-neighbors-ivf-sq.md | 438 +++++++++++++++++++++
 fern/pages/c_api/index.md                  |   1 +
 3 files changed, 441 insertions(+)
 create mode 100644 fern/pages/c_api/c-api-neighbors-ivf-sq.md

diff --git a/fern/docs.yml b/fern/docs.yml
index 4610b11776..65be6c7536 100644
--- a/fern/docs.yml
+++ b/fern/docs.yml
@@ -250,6 +250,8 @@ navigation:
             path: "./pages/c_api/c-api-neighbors-ivf-flat.md"
           - page: "Neighbors IVF PQ"
             path: "./pages/c_api/c-api-neighbors-ivf-pq.md"
+          - page: "Neighbors IVF SQ"
+            path: "./pages/c_api/c-api-neighbors-ivf-sq.md"
           - page: "Neighbors Multi GPU Cagra"
             path: "./pages/c_api/c-api-neighbors-mg-cagra.md"
           - page: "Neighbors Multi GPU Common"
diff --git a/fern/pages/c_api/c-api-neighbors-ivf-sq.md b/fern/pages/c_api/c-api-neighbors-ivf-sq.md
new file mode 100644
index 0000000000..105bd1c25e
--- /dev/null
+++ b/fern/pages/c_api/c-api-neighbors-ivf-sq.md
@@ -0,0 +1,438 @@
+---
+slug: api-reference/c-api-neighbors-ivf-sq
+---
+
+# IVF SQ
+
+_Source header: `cuvs/neighbors/ivf_sq.h`_
+
+## IVF-SQ index build parameters
+
+<a id="cuvsivfsqindexparams"></a>
+### cuvsIvfSqIndexParams
+
+Supplemental parameters to build IVF-SQ Index
+
+```c
+struct cuvsIvfSqIndexParams { ... };
+```
+
+**Fields**
+
+| Name | Type | Description |
+| --- | --- | --- |
+| `metric` | [`cuvsDistanceType`](/api-reference/c-api-distance-distance#cuvsdistancetype) | Distance type. |
+| `metric_arg` | `float` | The argument used by some distance metrics. |
+| `add_data_on_build` | `bool` | Whether to add the dataset content to the index, i.e.:<br />- `true` means the index is filled with the dataset vectors and ready to search after calling `build`.<br />- `false` means `build` only trains the underlying model (e.g. quantizer or clustering), but the index is left empty; you'd need to call `extend` on the index afterwards to populate it. |
+| `n_lists` | `uint32_t` | The number of inverted lists (clusters) |
+| `kmeans_n_iters` | `uint32_t` | The number of iterations searching for kmeans centers (index building). |
+| `max_train_points_per_cluster` | `uint32_t` | The number of data vectors per cluster to use during iterative kmeans building. The index uses at most `n_lists * max_train_points_per_cluster` rows for training. |
+| `conservative_memory_allocation` | `bool` | By default, the algorithm allocates more space than necessary for individual clusters (`list_data`). This allows to amortize the cost of memory allocation and reduce the number of data copies during repeated calls to `extend` (extending the database). The alternative is the conservative allocation behavior; when enabled, the algorithm always allocates the minimum amount of memory required to store the given number of records. Set this flag to `true` if you prefer to use as little GPU memory for the database as possible. |
+
+<a id="cuvsivfsqindexparamscreate"></a>
+### cuvsIvfSqIndexParamsCreate
+
+Allocate IVF-SQ Index params, and populate with default values
+
+```c
+CUVS_EXPORT cuvsError_t cuvsIvfSqIndexParamsCreate(cuvsIvfSqIndexParams_t* index_params);
+```
+
+**Parameters**
+
+| Name | Direction | Type | Description |
+| --- | --- | --- | --- |
+| `index_params` | in | [`cuvsIvfSqIndexParams_t*`](/api-reference/c-api-neighbors-ivf-sq#cuvsivfsqindexparams) | cuvsIvfSqIndexParams_t to allocate |
+
+**Returns**
+
+[`CUVS_EXPORT cuvsError_t`](/api-reference/c-api-core-c-api#cuvserror-t)
+
+<a id="cuvsivfsqindexparamsdestroy"></a>
+### cuvsIvfSqIndexParamsDestroy
+
+De-allocate IVF-SQ Index params
+
+```c
+CUVS_EXPORT cuvsError_t cuvsIvfSqIndexParamsDestroy(cuvsIvfSqIndexParams_t index_params);
+```
+
+**Parameters**
+
+| Name | Direction | Type | Description |
+| --- | --- | --- | --- |
+| `index_params` | in | [`cuvsIvfSqIndexParams_t`](/api-reference/c-api-neighbors-ivf-sq#cuvsivfsqindexparams) |  |
+
+**Returns**
+
+[`CUVS_EXPORT cuvsError_t`](/api-reference/c-api-core-c-api#cuvserror-t)
+
+## IVF-SQ index search parameters
+
+<a id="cuvsivfsqsearchparams"></a>
+### cuvsIvfSqSearchParams
+
+Supplemental parameters to search IVF-SQ index
+
+```c
+struct cuvsIvfSqSearchParams { ... };
+```
+
+**Fields**
+
+| Name | Type | Description |
+| --- | --- | --- |
+| `n_probes` | `uint32_t` | The number of clusters to search. |
+
+<a id="cuvsivfsqsearchparamscreate"></a>
+### cuvsIvfSqSearchParamsCreate
+
+Allocate IVF-SQ search params, and populate with default values
+
+```c
+CUVS_EXPORT cuvsError_t cuvsIvfSqSearchParamsCreate(cuvsIvfSqSearchParams_t* params);
+```
+
+**Parameters**
+
+| Name | Direction | Type | Description |
+| --- | --- | --- | --- |
+| `params` | in | [`cuvsIvfSqSearchParams_t*`](/api-reference/c-api-neighbors-ivf-sq#cuvsivfsqsearchparams) | cuvsIvfSqSearchParams_t to allocate |
+
+**Returns**
+
+[`CUVS_EXPORT cuvsError_t`](/api-reference/c-api-core-c-api#cuvserror-t)
+
+<a id="cuvsivfsqsearchparamsdestroy"></a>
+### cuvsIvfSqSearchParamsDestroy
+
+De-allocate IVF-SQ search params
+
+```c
+CUVS_EXPORT cuvsError_t cuvsIvfSqSearchParamsDestroy(cuvsIvfSqSearchParams_t params);
+```
+
+**Parameters**
+
+| Name | Direction | Type | Description |
+| --- | --- | --- | --- |
+| `params` | in | [`cuvsIvfSqSearchParams_t`](/api-reference/c-api-neighbors-ivf-sq#cuvsivfsqsearchparams) |  |
+
+**Returns**
+
+[`CUVS_EXPORT cuvsError_t`](/api-reference/c-api-core-c-api#cuvserror-t)
+
+## IVF-SQ index
+
+<a id="cuvsivfsqindex"></a>
+### cuvsIvfSqIndex
+
+Struct to hold address of cuvs::neighbors::ivf_sq::index and its active trained dtype
+
+```c
+typedef struct { ... } cuvsIvfSqIndex;
+```
+
+**Fields**
+
+| Name | Type | Description |
+| --- | --- | --- |
+| `addr` | `uintptr_t` |  |
+| `dtype` | `DLDataType` |  |
+
+<a id="cuvsivfsqindexcreate"></a>
+### cuvsIvfSqIndexCreate
+
+Allocate IVF-SQ index
+
+```c
+CUVS_EXPORT cuvsError_t cuvsIvfSqIndexCreate(cuvsIvfSqIndex_t* index);
+```
+
+**Parameters**
+
+| Name | Direction | Type | Description |
+| --- | --- | --- | --- |
+| `index` | in | [`cuvsIvfSqIndex_t*`](/api-reference/c-api-neighbors-ivf-sq#cuvsivfsqindex) | cuvsIvfSqIndex_t to allocate |
+
+**Returns**
+
+[`CUVS_EXPORT cuvsError_t`](/api-reference/c-api-core-c-api#cuvserror-t)
+
+<a id="cuvsivfsqindexdestroy"></a>
+### cuvsIvfSqIndexDestroy
+
+De-allocate IVF-SQ index
+
+```c
+CUVS_EXPORT cuvsError_t cuvsIvfSqIndexDestroy(cuvsIvfSqIndex_t index);
+```
+
+**Parameters**
+
+| Name | Direction | Type | Description |
+| --- | --- | --- | --- |
+| `index` | in | [`cuvsIvfSqIndex_t`](/api-reference/c-api-neighbors-ivf-sq#cuvsivfsqindex) | cuvsIvfSqIndex_t to de-allocate |
+
+**Returns**
+
+[`CUVS_EXPORT cuvsError_t`](/api-reference/c-api-core-c-api#cuvserror-t)
+
+<a id="cuvsivfsqindexgetnlists"></a>
+### cuvsIvfSqIndexGetNLists
+
+Get the number of clusters/inverted lists
+
+```c
+CUVS_EXPORT cuvsError_t cuvsIvfSqIndexGetNLists(cuvsIvfSqIndex_t index, int64_t* n_lists);
+```
+
+**Parameters**
+
+| Name | Direction | Type | Description |
+| --- | --- | --- | --- |
+| `index` |  | [`cuvsIvfSqIndex_t`](/api-reference/c-api-neighbors-ivf-sq#cuvsivfsqindex) |  |
+| `n_lists` |  | `int64_t*` |  |
+
+**Returns**
+
+[`CUVS_EXPORT cuvsError_t`](/api-reference/c-api-core-c-api#cuvserror-t)
+
+<a id="cuvsivfsqindexgetdim"></a>
+### cuvsIvfSqIndexGetDim
+
+Get the dimensionality of the data
+
+```c
+CUVS_EXPORT cuvsError_t cuvsIvfSqIndexGetDim(cuvsIvfSqIndex_t index, int64_t* dim);
+```
+
+**Parameters**
+
+| Name | Direction | Type | Description |
+| --- | --- | --- | --- |
+| `index` |  | [`cuvsIvfSqIndex_t`](/api-reference/c-api-neighbors-ivf-sq#cuvsivfsqindex) |  |
+| `dim` |  | `int64_t*` |  |
+
+**Returns**
+
+[`CUVS_EXPORT cuvsError_t`](/api-reference/c-api-core-c-api#cuvserror-t)
+
+<a id="cuvsivfsqindexgetsize"></a>
+### cuvsIvfSqIndexGetSize
+
+Get the size of the index
+
+```c
+CUVS_EXPORT cuvsError_t cuvsIvfSqIndexGetSize(cuvsIvfSqIndex_t index, int64_t* size);
+```
+
+**Parameters**
+
+| Name | Direction | Type | Description |
+| --- | --- | --- | --- |
+| `index` |  | [`cuvsIvfSqIndex_t`](/api-reference/c-api-neighbors-ivf-sq#cuvsivfsqindex) |  |
+| `size` |  | `int64_t*` |  |
+
+**Returns**
+
+[`CUVS_EXPORT cuvsError_t`](/api-reference/c-api-core-c-api#cuvserror-t)
+
+<a id="cuvsivfsqindexgetcenters"></a>
+### cuvsIvfSqIndexGetCenters
+
+Get the cluster centers corresponding to the lists [n_lists, dim]
+
+```c
+CUVS_EXPORT cuvsError_t cuvsIvfSqIndexGetCenters(cuvsIvfSqIndex_t index, DLManagedTensor* centers);
+```
+
+**Parameters**
+
+| Name | Direction | Type | Description |
+| --- | --- | --- | --- |
+| `index` | in | [`cuvsIvfSqIndex_t`](/api-reference/c-api-neighbors-ivf-sq#cuvsivfsqindex) | cuvsIvfSqIndex_t Built Ivf-SQ Index |
+| `centers` | out | `DLManagedTensor*` | Preallocated array on host or device memory to store output, [n_lists, dim] |
+
+**Returns**
+
+[`CUVS_EXPORT cuvsError_t`](/api-reference/c-api-core-c-api#cuvserror-t)
+
+## IVF-SQ index build
+
+<a id="cuvsivfsqbuild"></a>
+### cuvsIvfSqBuild
+
+Build an IVF-SQ index with a `DLManagedTensor` which has underlying
+
+```c
+CUVS_EXPORT cuvsError_t cuvsIvfSqBuild(cuvsResources_t res,
+cuvsIvfSqIndexParams_t index_params,
+DLManagedTensor* dataset,
+cuvsIvfSqIndex_t index);
+```
+
+`DLDeviceType` equal to `kDLCUDA`, `kDLCUDAHost`, `kDLCUDAManaged`, or `kDLCPU`. Also, acceptable underlying types are:
+
+1. `kDLDataType.code == kDLFloat` and `kDLDataType.bits = 32`
+2. `kDLDataType.code == kDLFloat` and `kDLDataType.bits = 16`
+
+**Parameters**
+
+| Name | Direction | Type | Description |
+| --- | --- | --- | --- |
+| `res` | in | [`cuvsResources_t`](/api-reference/c-api-core-c-api#cuvsresources-t) | cuvsResources_t opaque C handle |
+| `index_params` | in | [`cuvsIvfSqIndexParams_t`](/api-reference/c-api-neighbors-ivf-sq#cuvsivfsqindexparams) | cuvsIvfSqIndexParams_t used to build IVF-SQ index |
+| `dataset` | in | `DLManagedTensor*` | DLManagedTensor* training dataset |
+| `index` | out | [`cuvsIvfSqIndex_t`](/api-reference/c-api-neighbors-ivf-sq#cuvsivfsqindex) | cuvsIvfSqIndex_t Newly built IVF-SQ index |
+
+**Returns**
+
+[`CUVS_EXPORT cuvsError_t`](/api-reference/c-api-core-c-api#cuvserror-t)
+
+## IVF-SQ index search
+
+<a id="cuvsivfsqsearch"></a>
+### cuvsIvfSqSearch
+
+Search an IVF-SQ index with a `DLManagedTensor` which has underlying
+
+```c
+CUVS_EXPORT cuvsError_t cuvsIvfSqSearch(cuvsResources_t res,
+cuvsIvfSqSearchParams_t search_params,
+cuvsIvfSqIndex_t index,
+DLManagedTensor* queries,
+DLManagedTensor* neighbors,
+DLManagedTensor* distances);
+```
+
+`DLDeviceType` equal to `kDLCUDA`, `kDLCUDAHost`, `kDLCUDAManaged`. Types for input are:
+
+1. `queries`: `kDLDataType.code == kDLFloat` and `kDLDataType.bits = 32` or 16
+2. `neighbors`: `kDLDataType.code == kDLInt` and `kDLDataType.bits = 64`
+3. `distances`: `kDLDataType.code == kDLFloat` and `kDLDataType.bits = 32`
+
+**Parameters**
+
+| Name | Direction | Type | Description |
+| --- | --- | --- | --- |
+| `res` | in | [`cuvsResources_t`](/api-reference/c-api-core-c-api#cuvsresources-t) | cuvsResources_t opaque C handle |
+| `search_params` | in | [`cuvsIvfSqSearchParams_t`](/api-reference/c-api-neighbors-ivf-sq#cuvsivfsqsearchparams) | cuvsIvfSqSearchParams_t used to search IVF-SQ index |
+| `index` | in | [`cuvsIvfSqIndex_t`](/api-reference/c-api-neighbors-ivf-sq#cuvsivfsqindex) | ivfSqIndex which has been returned by `cuvsIvfSqBuild` |
+| `queries` | in | `DLManagedTensor*` | DLManagedTensor* queries dataset to search |
+| `neighbors` | out | `DLManagedTensor*` | DLManagedTensor* output `k` neighbors for queries |
+| `distances` | out | `DLManagedTensor*` | DLManagedTensor* output `k` distances for queries |
+
+**Returns**
+
+[`CUVS_EXPORT cuvsError_t`](/api-reference/c-api-core-c-api#cuvserror-t)
+
+<a id="cuvsivfsqsearchwithfilter"></a>
+### cuvsIvfSqSearchWithFilter
+
+Search an IVF-SQ index with filtering.
+
+```c
+CUVS_EXPORT cuvsError_t cuvsIvfSqSearchWithFilter(cuvsResources_t res,
+cuvsIvfSqSearchParams_t search_params,
+cuvsIvfSqIndex_t index,
+DLManagedTensor* queries,
+DLManagedTensor* neighbors,
+DLManagedTensor* distances,
+cuvsFilter filter);
+```
+
+Same as cuvsIvfSqSearch, but applies a pre-filter to exclude vectors during search.
+
+**Parameters**
+
+| Name | Direction | Type | Description |
+| --- | --- | --- | --- |
+| `res` | in | [`cuvsResources_t`](/api-reference/c-api-core-c-api#cuvsresources-t) | cuvsResources_t opaque C handle |
+| `search_params` | in | [`cuvsIvfSqSearchParams_t`](/api-reference/c-api-neighbors-ivf-sq#cuvsivfsqsearchparams) | cuvsIvfSqSearchParams_t used to search IVF-SQ index |
+| `index` | in | [`cuvsIvfSqIndex_t`](/api-reference/c-api-neighbors-ivf-sq#cuvsivfsqindex) | ivfSqIndex which has been returned by `cuvsIvfSqBuild` |
+| `queries` | in | `DLManagedTensor*` | DLManagedTensor* queries dataset to search |
+| `neighbors` | out | `DLManagedTensor*` | DLManagedTensor* output `k` neighbors for queries |
+| `distances` | out | `DLManagedTensor*` | DLManagedTensor* output `k` distances for queries |
+| `filter` | in | [`cuvsFilter`](/api-reference/c-api-neighbors-common#cuvsfilter) | cuvsFilter to filter neighbors based on the given bitset |
+
+**Returns**
+
+[`CUVS_EXPORT cuvsError_t`](/api-reference/c-api-core-c-api#cuvserror-t)
+
+## IVF-SQ C-API serialize functions
+
+<a id="cuvsivfsqserialize"></a>
+### cuvsIvfSqSerialize
+
+Save the index to file.
+
+```c
+CUVS_EXPORT cuvsError_t cuvsIvfSqSerialize(cuvsResources_t res, const char* filename, cuvsIvfSqIndex_t index);
+```
+
+Experimental, both the API and the serialization format are subject to change.
+
+**Parameters**
+
+| Name | Direction | Type | Description |
+| --- | --- | --- | --- |
+| `res` | in | [`cuvsResources_t`](/api-reference/c-api-core-c-api#cuvsresources-t) | cuvsResources_t opaque C handle |
+| `filename` | in | `const char*` | the file name for saving the index |
+| `index` | in | [`cuvsIvfSqIndex_t`](/api-reference/c-api-neighbors-ivf-sq#cuvsivfsqindex) | IVF-SQ index |
+
+**Returns**
+
+[`CUVS_EXPORT cuvsError_t`](/api-reference/c-api-core-c-api#cuvserror-t)
+
+<a id="cuvsivfsqdeserialize"></a>
+### cuvsIvfSqDeserialize
+
+Load index from file.
+
+```c
+CUVS_EXPORT cuvsError_t cuvsIvfSqDeserialize(cuvsResources_t res,
+const char* filename,
+cuvsIvfSqIndex_t index);
+```
+
+Experimental, both the API and the serialization format are subject to change.
+
+**Parameters**
+
+| Name | Direction | Type | Description |
+| --- | --- | --- | --- |
+| `res` | in | [`cuvsResources_t`](/api-reference/c-api-core-c-api#cuvsresources-t) | cuvsResources_t opaque C handle |
+| `filename` | in | `const char*` | the name of the file that stores the index |
+| `index` | out | [`cuvsIvfSqIndex_t`](/api-reference/c-api-neighbors-ivf-sq#cuvsivfsqindex) | IVF-SQ index loaded from disk |
+
+**Returns**
+
+[`CUVS_EXPORT cuvsError_t`](/api-reference/c-api-core-c-api#cuvserror-t)
+
+## IVF-SQ index extend
+
+<a id="cuvsivfsqextend"></a>
+### cuvsIvfSqExtend
+
+Extend the index with the new data.
+
+```c
+CUVS_EXPORT cuvsError_t cuvsIvfSqExtend(cuvsResources_t res,
+DLManagedTensor* new_vectors,
+DLManagedTensor* new_indices,
+cuvsIvfSqIndex_t index);
+```
+
+**Parameters**
+
+| Name | Direction | Type | Description |
+| --- | --- | --- | --- |
+| `res` | in | [`cuvsResources_t`](/api-reference/c-api-core-c-api#cuvsresources-t) | cuvsResources_t opaque C handle |
+| `new_vectors` | in | `DLManagedTensor*` | DLManagedTensor* the new vectors to add to the index |
+| `new_indices` | in | `DLManagedTensor*` | DLManagedTensor* vector of new indices for the new vectors. If the index is empty, this can be NULL to imply a continuous range `[0...n_rows)`. |
+| `index` | inout | [`cuvsIvfSqIndex_t`](/api-reference/c-api-neighbors-ivf-sq#cuvsivfsqindex) | IVF-SQ index to be extended |
+
+**Returns**
+
+[`CUVS_EXPORT cuvsError_t`](/api-reference/c-api-core-c-api#cuvserror-t)
diff --git a/fern/pages/c_api/index.md b/fern/pages/c_api/index.md
index 61d672ba91..dfde95bb32 100644
--- a/fern/pages/c_api/index.md
+++ b/fern/pages/c_api/index.md
@@ -13,6 +13,7 @@ These pages are generated from the documented public headers in the cuVS source
 - [HNSW](/api-reference/c-api-neighbors-hnsw)
 - [IVF Flat](/api-reference/c-api-neighbors-ivf-flat)
 - [IVF PQ](/api-reference/c-api-neighbors-ivf-pq)
+- [IVF SQ](/api-reference/c-api-neighbors-ivf-sq)
 - [Multi-GPU Cagra](/api-reference/c-api-neighbors-mg-cagra)
 - [Multi-GPU Common](/api-reference/c-api-neighbors-mg-common)
 - [Multi-GPU IVF Flat](/api-reference/c-api-neighbors-mg-ivf-flat)

From b0cd8d8c802ac6eb4b47bbeb632c24fe0aa4c77c Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Wed, 27 May 2026 20:36:59 +0200
Subject: [PATCH 43/43] Address review

---
 c/include/cuvs/neighbors/ivf_sq.h          | 30 ++------
 c/src/neighbors/ivf_sq.cpp                 | 15 +---
 c/tests/neighbors/ann_ivf_sq_c.cu          | 28 +++++---
 c/tests/neighbors/run_ivf_sq_c.c           | 81 +++++++++++-----------
 fern/pages/c_api/c-api-neighbors-ivf-sq.md | 37 +---------
 5 files changed, 72 insertions(+), 119 deletions(-)

diff --git a/c/include/cuvs/neighbors/ivf_sq.h b/c/include/cuvs/neighbors/ivf_sq.h
index 472c9cda7a..a771bc0d4d 100644
--- a/c/include/cuvs/neighbors/ivf_sq.h
+++ b/c/include/cuvs/neighbors/ivf_sq.h
@@ -249,8 +249,8 @@ CUVS_EXPORT cuvsError_t cuvsIvfSqBuild(cuvsResources_t res,
  * cuvsError_t params_create_status = cuvsIvfSqSearchParamsCreate(&search_params);
  *
  * // Search the `index` built using `cuvsIvfSqBuild`
- * cuvsError_t search_status = cuvsIvfSqSearch(res, search_params, index, &queries, &neighbors,
- * &distances);
+ * cuvsError_t search_status = cuvsIvfSqSearch(
+ *   res, search_params, index, &queries, &neighbors, &distances, (cuvsFilter){});
  *
  * // de-allocate `search_params` and `res`
  * cuvsError_t params_destroy_status = cuvsIvfSqSearchParamsDestroy(search_params);
@@ -263,34 +263,16 @@ CUVS_EXPORT cuvsError_t cuvsIvfSqBuild(cuvsResources_t res,
  * @param[in] queries DLManagedTensor* queries dataset to search
  * @param[out] neighbors DLManagedTensor* output `k` neighbors for queries
  * @param[out] distances DLManagedTensor* output `k` distances for queries
+ * @param[in] filter cuvsFilter input filter that can be used
+ *            to filter queries and neighbors based on the given bitset.
  */
 CUVS_EXPORT cuvsError_t cuvsIvfSqSearch(cuvsResources_t res,
                             cuvsIvfSqSearchParams_t search_params,
                             cuvsIvfSqIndex_t index,
                             DLManagedTensor* queries,
                             DLManagedTensor* neighbors,
-                            DLManagedTensor* distances);
-
-/**
- * @brief Search an IVF-SQ index with filtering.
- *
- * Same as cuvsIvfSqSearch, but applies a pre-filter to exclude vectors during search.
- *
- * @param[in] res cuvsResources_t opaque C handle
- * @param[in] search_params cuvsIvfSqSearchParams_t used to search IVF-SQ index
- * @param[in] index ivfSqIndex which has been returned by `cuvsIvfSqBuild`
- * @param[in] queries DLManagedTensor* queries dataset to search
- * @param[out] neighbors DLManagedTensor* output `k` neighbors for queries
- * @param[out] distances DLManagedTensor* output `k` distances for queries
- * @param[in] filter cuvsFilter to filter neighbors based on the given bitset
- */
-CUVS_EXPORT cuvsError_t cuvsIvfSqSearchWithFilter(cuvsResources_t res,
-                                      cuvsIvfSqSearchParams_t search_params,
-                                      cuvsIvfSqIndex_t index,
-                                      DLManagedTensor* queries,
-                                      DLManagedTensor* neighbors,
-                                      DLManagedTensor* distances,
-                                      cuvsFilter filter);
+                            DLManagedTensor* distances,
+                            cuvsFilter filter);
 
 /**
  * @}
diff --git a/c/src/neighbors/ivf_sq.cpp b/c/src/neighbors/ivf_sq.cpp
index 80f9e7e3d5..2656338d31 100644
--- a/c/src/neighbors/ivf_sq.cpp
+++ b/c/src/neighbors/ivf_sq.cpp
@@ -259,19 +259,8 @@ extern "C" cuvsError_t cuvsIvfSqSearch(cuvsResources_t res,
                                        cuvsIvfSqIndex_t index_c_ptr,
                                        DLManagedTensor* queries_tensor,
                                        DLManagedTensor* neighbors_tensor,
-                                       DLManagedTensor* distances_tensor)
-{
-  return _cuvsIvfSqSearchImpl(
-    res, params, index_c_ptr, queries_tensor, neighbors_tensor, distances_tensor, nullptr);
-}
-
-extern "C" cuvsError_t cuvsIvfSqSearchWithFilter(cuvsResources_t res,
-                                                  cuvsIvfSqSearchParams_t params,
-                                                  cuvsIvfSqIndex_t index_c_ptr,
-                                                  DLManagedTensor* queries_tensor,
-                                                  DLManagedTensor* neighbors_tensor,
-                                                  DLManagedTensor* distances_tensor,
-                                                  cuvsFilter filter)
+                                       DLManagedTensor* distances_tensor,
+                                       cuvsFilter filter)
 {
   return _cuvsIvfSqSearchImpl(
     res, params, index_c_ptr, queries_tensor, neighbors_tensor, distances_tensor, &filter);
diff --git a/c/tests/neighbors/ann_ivf_sq_c.cu b/c/tests/neighbors/ann_ivf_sq_c.cu
index c36786e45a..42c1b29999 100644
--- a/c/tests/neighbors/ann_ivf_sq_c.cu
+++ b/c/tests/neighbors/ann_ivf_sq_c.cu
@@ -6,12 +6,14 @@
 #include <cuda.h>
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/handle.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
 #include <raft/random/rng.cuh>
 
 #include "neighbors/ann_utils.cuh"
 #include <cuvs/neighbors/ivf_sq.h>
 
-extern "C" void run_ivf_sq(int64_t n_rows,
+extern "C" void run_ivf_sq(cuvsResources_t res,
+                           int64_t n_rows,
                            int64_t n_queries,
                            int64_t n_dim,
                            uint32_t n_neighbors,
@@ -24,15 +26,15 @@ extern "C" void run_ivf_sq(int64_t n_rows,
                            size_t n_lists);
 
 template <typename T>
-void generate_random_data(T* devPtr, size_t size)
+void generate_random_data(raft::handle_t const& handle, T* devPtr, size_t size)
 {
-  raft::handle_t handle;
   raft::random::RngState r(1234ULL);
   raft::random::uniform(handle, r, devPtr, size, T(0.1), T(2.0));
 };
 
 template <typename T, typename IdxT>
-void recall_eval(T* query_data,
+void recall_eval(raft::handle_t const& handle,
+                 T* query_data,
                  T* index_data,
                  IdxT* neighbors,
                  T* distances,
@@ -44,7 +46,6 @@ void recall_eval(T* query_data,
                  size_t n_probes,
                  size_t n_lists)
 {
-  raft::handle_t handle;
   auto distances_ref = raft::make_device_matrix<T, IdxT>(handle, n_queries, n_neighbors);
   auto neighbors_ref = raft::make_device_matrix<IdxT, IdxT>(handle, n_queries, n_neighbors);
   cuvs::neighbors::naive_knn<T, T, IdxT>(
@@ -70,6 +71,7 @@ void recall_eval(T* query_data,
   raft::copy(distances_h.data(), distances, size, stream);
   raft::copy(neighbors_ref_h.data(), neighbors_ref.data_handle(), size, stream);
   raft::copy(distances_ref_h.data(), distances_ref.data_handle(), size, stream);
+  raft::resource::sync_stream(handle);
 
   double min_recall = static_cast<double>(n_probes) / static_cast<double>(n_lists);
   ASSERT_TRUE(cuvs::neighbors::eval_neighbours(neighbors_ref_h,
@@ -101,10 +103,15 @@ TEST(IvfSqC, BuildSearch)
   rmm::device_uvector<int64_t> neighbors_data(n_queries * n_neighbors, stream);
   rmm::device_uvector<float> distances_data(n_queries * n_neighbors, stream);
 
-  generate_random_data(index_data.data(), n_rows * n_dim);
-  generate_random_data(query_data.data(), n_queries * n_dim);
+  generate_random_data(handle, index_data.data(), n_rows * n_dim);
+  generate_random_data(handle, query_data.data(), n_queries * n_dim);
 
-  run_ivf_sq(n_rows,
+  cuvsResources_t res;
+  cuvsResourcesCreate(&res);
+  cuvsStreamSet(res, stream);
+
+  run_ivf_sq(res,
+             n_rows,
              n_queries,
              n_dim,
              n_neighbors,
@@ -116,7 +123,8 @@ TEST(IvfSqC, BuildSearch)
              n_probes,
              n_lists);
 
-  recall_eval(query_data.data(),
+  recall_eval(handle,
+              query_data.data(),
               index_data.data(),
               neighbors_data.data(),
               distances_data.data(),
@@ -127,4 +135,6 @@ TEST(IvfSqC, BuildSearch)
               metric,
               n_probes,
               n_lists);
+
+  cuvsResourcesDestroy(res);
 }
diff --git a/c/tests/neighbors/run_ivf_sq_c.c b/c/tests/neighbors/run_ivf_sq_c.c
index d07502abd6..7f8f08b2d9 100644
--- a/c/tests/neighbors/run_ivf_sq_c.c
+++ b/c/tests/neighbors/run_ivf_sq_c.c
@@ -5,7 +5,8 @@
 
 #include <cuvs/neighbors/ivf_sq.h>
 
-void run_ivf_sq(int64_t n_rows,
+void run_ivf_sq(cuvsResources_t res,
+                int64_t n_rows,
                 int64_t n_queries,
                 int64_t n_dim,
                 uint32_t n_neighbors,
@@ -17,19 +18,16 @@ void run_ivf_sq(int64_t n_rows,
                 size_t n_probes,
                 size_t n_lists)
 {
-  cuvsResources_t res;
-  cuvsResourcesCreate(&res);
-
   DLManagedTensor dataset_tensor;
-  dataset_tensor.dl_tensor.data              = index_data;
+  dataset_tensor.dl_tensor.data               = index_data;
   dataset_tensor.dl_tensor.device.device_type = kDLCUDA;
-  dataset_tensor.dl_tensor.ndim              = 2;
-  dataset_tensor.dl_tensor.dtype.code        = kDLFloat;
-  dataset_tensor.dl_tensor.dtype.bits        = 32;
-  dataset_tensor.dl_tensor.dtype.lanes       = 1;
-  int64_t dataset_shape[2]                   = {n_rows, n_dim};
-  dataset_tensor.dl_tensor.shape             = dataset_shape;
-  dataset_tensor.dl_tensor.strides           = NULL;
+  dataset_tensor.dl_tensor.ndim               = 2;
+  dataset_tensor.dl_tensor.dtype.code         = kDLFloat;
+  dataset_tensor.dl_tensor.dtype.bits         = 32;
+  dataset_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t dataset_shape[2]                    = {n_rows, n_dim};
+  dataset_tensor.dl_tensor.shape              = dataset_shape;
+  dataset_tensor.dl_tensor.strides            = NULL;
 
   cuvsIvfSqIndex_t index;
   cuvsIvfSqIndexCreate(&index);
@@ -41,46 +39,51 @@ void run_ivf_sq(int64_t n_rows,
   cuvsIvfSqBuild(res, build_params, &dataset_tensor, index);
 
   DLManagedTensor queries_tensor;
-  queries_tensor.dl_tensor.data              = (void*)query_data;
+  queries_tensor.dl_tensor.data               = (void*)query_data;
   queries_tensor.dl_tensor.device.device_type = kDLCUDA;
-  queries_tensor.dl_tensor.ndim              = 2;
-  queries_tensor.dl_tensor.dtype.code        = kDLFloat;
-  queries_tensor.dl_tensor.dtype.bits        = 32;
-  queries_tensor.dl_tensor.dtype.lanes       = 1;
-  int64_t queries_shape[2]                   = {n_queries, n_dim};
-  queries_tensor.dl_tensor.shape             = queries_shape;
-  queries_tensor.dl_tensor.strides           = NULL;
+  queries_tensor.dl_tensor.ndim               = 2;
+  queries_tensor.dl_tensor.dtype.code         = kDLFloat;
+  queries_tensor.dl_tensor.dtype.bits         = 32;
+  queries_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t queries_shape[2]                    = {n_queries, n_dim};
+  queries_tensor.dl_tensor.shape              = queries_shape;
+  queries_tensor.dl_tensor.strides            = NULL;
 
   DLManagedTensor neighbors_tensor;
-  neighbors_tensor.dl_tensor.data              = (void*)neighbors_data;
+  neighbors_tensor.dl_tensor.data               = (void*)neighbors_data;
   neighbors_tensor.dl_tensor.device.device_type = kDLCUDA;
-  neighbors_tensor.dl_tensor.ndim              = 2;
-  neighbors_tensor.dl_tensor.dtype.code        = kDLInt;
-  neighbors_tensor.dl_tensor.dtype.bits        = 64;
-  neighbors_tensor.dl_tensor.dtype.lanes       = 1;
-  int64_t neighbors_shape[2]                   = {n_queries, n_neighbors};
-  neighbors_tensor.dl_tensor.shape             = neighbors_shape;
-  neighbors_tensor.dl_tensor.strides           = NULL;
+  neighbors_tensor.dl_tensor.ndim               = 2;
+  neighbors_tensor.dl_tensor.dtype.code         = kDLInt;
+  neighbors_tensor.dl_tensor.dtype.bits         = 64;
+  neighbors_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t neighbors_shape[2]                    = {n_queries, n_neighbors};
+  neighbors_tensor.dl_tensor.shape              = neighbors_shape;
+  neighbors_tensor.dl_tensor.strides            = NULL;
 
   DLManagedTensor distances_tensor;
-  distances_tensor.dl_tensor.data              = (void*)distances_data;
+  distances_tensor.dl_tensor.data               = (void*)distances_data;
   distances_tensor.dl_tensor.device.device_type = kDLCUDA;
-  distances_tensor.dl_tensor.ndim              = 2;
-  distances_tensor.dl_tensor.dtype.code        = kDLFloat;
-  distances_tensor.dl_tensor.dtype.bits        = 32;
-  distances_tensor.dl_tensor.dtype.lanes       = 1;
-  int64_t distances_shape[2]                   = {n_queries, n_neighbors};
-  distances_tensor.dl_tensor.shape             = distances_shape;
-  distances_tensor.dl_tensor.strides           = NULL;
+  distances_tensor.dl_tensor.ndim               = 2;
+  distances_tensor.dl_tensor.dtype.code         = kDLFloat;
+  distances_tensor.dl_tensor.dtype.bits         = 32;
+  distances_tensor.dl_tensor.dtype.lanes        = 1;
+  int64_t distances_shape[2]                    = {n_queries, n_neighbors};
+  distances_tensor.dl_tensor.shape              = distances_shape;
+  distances_tensor.dl_tensor.strides            = NULL;
 
   cuvsIvfSqSearchParams_t search_params;
   cuvsIvfSqSearchParamsCreate(&search_params);
   search_params->n_probes = n_probes;
-  cuvsIvfSqSearch(
-    res, search_params, index, &queries_tensor, &neighbors_tensor, &distances_tensor);
+  cuvsIvfSqSearch(res,
+                  search_params,
+                  index,
+                  &queries_tensor,
+                  &neighbors_tensor,
+                  &distances_tensor,
+                  (cuvsFilter){.type = NO_FILTER});
+  cuvsStreamSync(res);
 
   cuvsIvfSqSearchParamsDestroy(search_params);
   cuvsIvfSqIndexParamsDestroy(build_params);
   cuvsIvfSqIndexDestroy(index);
-  cuvsResourcesDestroy(res);
 }
diff --git a/fern/pages/c_api/c-api-neighbors-ivf-sq.md b/fern/pages/c_api/c-api-neighbors-ivf-sq.md
index 105bd1c25e..3d7966b563 100644
--- a/fern/pages/c_api/c-api-neighbors-ivf-sq.md
+++ b/fern/pages/c_api/c-api-neighbors-ivf-sq.md
@@ -303,7 +303,8 @@ cuvsIvfSqSearchParams_t search_params,
 cuvsIvfSqIndex_t index,
 DLManagedTensor* queries,
 DLManagedTensor* neighbors,
-DLManagedTensor* distances);
+DLManagedTensor* distances,
+cuvsFilter filter);
 ```
 
 `DLDeviceType` equal to `kDLCUDA`, `kDLCUDAHost`, `kDLCUDAManaged`. Types for input are:
@@ -322,39 +323,7 @@ DLManagedTensor* distances);
 | `queries` | in | `DLManagedTensor*` | DLManagedTensor* queries dataset to search |
 | `neighbors` | out | `DLManagedTensor*` | DLManagedTensor* output `k` neighbors for queries |
 | `distances` | out | `DLManagedTensor*` | DLManagedTensor* output `k` distances for queries |
-
-**Returns**
-
-[`CUVS_EXPORT cuvsError_t`](/api-reference/c-api-core-c-api#cuvserror-t)
-
-<a id="cuvsivfsqsearchwithfilter"></a>
-### cuvsIvfSqSearchWithFilter
-
-Search an IVF-SQ index with filtering.
-
-```c
-CUVS_EXPORT cuvsError_t cuvsIvfSqSearchWithFilter(cuvsResources_t res,
-cuvsIvfSqSearchParams_t search_params,
-cuvsIvfSqIndex_t index,
-DLManagedTensor* queries,
-DLManagedTensor* neighbors,
-DLManagedTensor* distances,
-cuvsFilter filter);
-```
-
-Same as cuvsIvfSqSearch, but applies a pre-filter to exclude vectors during search.
-
-**Parameters**
-
-| Name | Direction | Type | Description |
-| --- | --- | --- | --- |
-| `res` | in | [`cuvsResources_t`](/api-reference/c-api-core-c-api#cuvsresources-t) | cuvsResources_t opaque C handle |
-| `search_params` | in | [`cuvsIvfSqSearchParams_t`](/api-reference/c-api-neighbors-ivf-sq#cuvsivfsqsearchparams) | cuvsIvfSqSearchParams_t used to search IVF-SQ index |
-| `index` | in | [`cuvsIvfSqIndex_t`](/api-reference/c-api-neighbors-ivf-sq#cuvsivfsqindex) | ivfSqIndex which has been returned by `cuvsIvfSqBuild` |
-| `queries` | in | `DLManagedTensor*` | DLManagedTensor* queries dataset to search |
-| `neighbors` | out | `DLManagedTensor*` | DLManagedTensor* output `k` neighbors for queries |
-| `distances` | out | `DLManagedTensor*` | DLManagedTensor* output `k` distances for queries |
-| `filter` | in | [`cuvsFilter`](/api-reference/c-api-neighbors-common#cuvsfilter) | cuvsFilter to filter neighbors based on the given bitset |
+| `filter` | in | [`cuvsFilter`](/api-reference/c-api-neighbors-common#cuvsfilter) | cuvsFilter input filter that can be used to filter queries and neighbors based on the given bitset. |
 
 **Returns**