From 618a22afe0d8dfed082acd829460770a95aea5c0 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Wed, 24 Jun 2026 08:41:31 +0000 Subject: [PATCH 1/5] Fix gather/scatter fast div bug - gather/scatter crashes at int32 boundary even when it advertises int64 type. This change fixes this issue. - int64 is still not supported for performance reason with int128. --- .../raft/matrix/detail/gather_inplace.cuh | 4 +-- .../raft/matrix/detail/scatter_inplace.cuh | 4 +-- cpp/include/raft/util/fast_int_div.cuh | 30 ++++++++++++------- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/cpp/include/raft/matrix/detail/gather_inplace.cuh b/cpp/include/raft/matrix/detail/gather_inplace.cuh index 1cfd7664ec..a30a3430c9 100644 --- a/cpp/include/raft/matrix/detail/gather_inplace.cuh +++ b/cpp/include/raft/matrix/detail/gather_inplace.cuh @@ -54,7 +54,7 @@ void gatherInplaceImpl(raft::resources const& handle, transform_op, batch_offset, map_length, - cols_per_batch = raft::util::FastIntDiv(cols_per_batch), + cols_per_batch = raft::util::FastIntDiv(cols_per_batch), n, ld] __device__(auto idx) { IndexT row = idx / cols_per_batch; @@ -74,7 +74,7 @@ void gatherInplaceImpl(raft::resources const& handle, scratch_space = scratch_space.data_handle(), batch_offset, map_length, - cols_per_batch = raft::util::FastIntDiv(cols_per_batch), + cols_per_batch = raft::util::FastIntDiv(cols_per_batch), n, ld] __device__(auto idx) { IndexT row = idx / cols_per_batch; diff --git a/cpp/include/raft/matrix/detail/scatter_inplace.cuh b/cpp/include/raft/matrix/detail/scatter_inplace.cuh index 2c735e3fda..392928143a 100644 --- a/cpp/include/raft/matrix/detail/scatter_inplace.cuh +++ b/cpp/include/raft/matrix/detail/scatter_inplace.cuh @@ -77,7 +77,7 @@ void scatterInplaceImpl( auto copy_op = [inout = inout.data_handle(), map = map.data_handle(), batch_offset, - cols_per_batch = raft::util::FastIntDiv(cols_per_batch), + cols_per_batch = raft::util::FastIntDiv(cols_per_batch), n] __device__(auto idx) { IndexT row = idx / cols_per_batch; IndexT col = idx % cols_per_batch; @@ -92,7 +92,7 @@ void scatterInplaceImpl( map = map.data_handle(), scratch_space = scratch_space.data_handle(), batch_offset, - cols_per_batch = raft::util::FastIntDiv(cols_per_batch), + cols_per_batch = raft::util::FastIntDiv(cols_per_batch), n] __device__(auto idx) { IndexT row = idx / cols_per_batch; IndexT col = idx % cols_per_batch; diff --git a/cpp/include/raft/util/fast_int_div.cuh b/cpp/include/raft/util/fast_int_div.cuh index 527fd46e14..eea7e56376 100644 --- a/cpp/include/raft/util/fast_int_div.cuh +++ b/cpp/include/raft/util/fast_int_div.cuh @@ -9,6 +9,7 @@ #include #include +#include namespace raft { namespace util { @@ -17,18 +18,25 @@ namespace util { * @brief Perform fast integer division and modulo using a known divisor * From Hacker's Delight, Second Edition, Chapter 10 * - * @note This currently only supports 32b signed integers + * @note 32b signed integer is supported. + * @note 64b signed integers is supported for an input data up to 2^31 + * because gpu-non-native int128 is avoided for performance. * @todo Extend support for signed divisors */ +template struct FastIntDiv { + static_assert(std::is_same_v || std::is_same_v, + "FastIntDiv: IntT must be int32_t or int64_t"); + using UIntT = std::make_unsigned_t; + /** * @defgroup HostMethods Ctor's that are accessible only from host * @{ * @brief Host-only ctor's * @param _d the divisor */ - FastIntDiv(int _d) : d(_d) { computeScalars(); } - FastIntDiv& operator=(int _d) + FastIntDiv(IntT _d) : d(_d) { computeScalars(); } + FastIntDiv& operator=(IntT _d) { d = _d; computeScalars(); @@ -53,9 +61,9 @@ struct FastIntDiv { /** @} */ /** divisor */ - int d; + IntT d; /** the term 'm' as found in the reference chapter */ - unsigned m; + UIntT m; /** the term 'p' as found in the reference chapter */ int p; @@ -90,10 +98,11 @@ struct FastIntDiv { * @param divisor the denominator * @return the quotient */ -HDI int operator/(int n, const FastIntDiv& divisor) +template +HDI IntT operator/(IntT n, const FastIntDiv& divisor) { if (divisor.d == 1) return n; - int ret = (int64_t(divisor.m) * int64_t(n)) >> divisor.p; + IntT ret = (int64_t(divisor.m) * int64_t(n)) >> divisor.p; if (n < 0) ++ret; return ret; } @@ -105,10 +114,11 @@ HDI int operator/(int n, const FastIntDiv& divisor) * @param divisor the denominator * @return the remainder */ -HDI int operator%(int n, const FastIntDiv& divisor) +template +HDI IntT operator%(IntT n, const FastIntDiv& divisor) { - int quotient = n / divisor; - int remainder = n - quotient * divisor.d; + IntT quotient = n / divisor; + IntT remainder = n - quotient * divisor.d; return remainder; } From c6f82a2061dcae19e25a1b103523ab5828b19d9d Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Fri, 26 Jun 2026 01:23:27 -0700 Subject: [PATCH 2/5] Add unit tests for FastIntDiv --- cpp/tests/CMakeLists.txt | 1 + cpp/tests/matrix/gather.cu | 6 ++++ cpp/tests/util/fast_int_div.cu | 54 ++++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+) create mode 100644 cpp/tests/util/fast_int_div.cu diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 00bfe1f32a..7127312b7b 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -317,6 +317,7 @@ if(BUILD_TESTS) util/bitonic_sort.cu util/cudart_utils.cpp util/device_atomics.cu + util/fast_int_div.cu util/integer_utils.cpp util/integer_utils.cu util/memory_type_dispatcher.cu diff --git a/cpp/tests/matrix/gather.cu b/cpp/tests/matrix/gather.cu index 603676ac5d..a956624d67 100644 --- a/cpp/tests/matrix/gather.cu +++ b/cpp/tests/matrix/gather.cu @@ -236,6 +236,9 @@ const std::vector> inplace_inputs_i32 = const std::vector> inplace_inputs_i64 = raft::util::itertools::product>( {25, 2000}, {6, 31, 129}, {0, 1, 11}, {11, 999}, {0, 1, 2, 3, 6, 100}, {1234ULL}); +const std::vector> inplace_inputs_i64_i32_max = + raft::util::itertools::product>( + {1100000}, {2000}, {0}, {1100000}, {0}, {1234ULL}); GATHER_TEST((GatherTest), GatherTestFU32I32, inputs_i32); GATHER_TEST((GatherTest), @@ -265,4 +268,7 @@ GATHER_TEST((GatherTest), GATHER_TEST((GatherTest), GatherInplaceTestFI64I64, inplace_inputs_i64); +GATHER_TEST((GatherTest), + GatherInplaceTestCI64I64, + inplace_inputs_i64_i32_max); // slow test, 8GB allocation, reproduces https://github.com/rapidsai/raft/issues/3055 } // end namespace raft diff --git a/cpp/tests/util/fast_int_div.cu b/cpp/tests/util/fast_int_div.cu new file mode 100644 index 0000000000..ed81335da3 --- /dev/null +++ b/cpp/tests/util/fast_int_div.cu @@ -0,0 +1,54 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include + +#include +#include +#include + +namespace raft::util { + +constexpr int64_t kInt32Max = std::numeric_limits::max(); + +template +class FastIntDivTest : public ::testing::Test { + protected: + void CompareWithNativeDivision() + { + std::vector magnitudes{0, 1, 2, 3, 7, 13, 255, 12345, (1 << 20), kInt32Max}; + std::vector divisors{1, 2, 4, 7, 16, 31, 63, 128, 1000, (1 << 15), kInt32Max}; + + for (IntT d : divisors) { + FastIntDiv fid(d); + for (IntT mag : magnitudes) { + for (IntT n : {mag, -mag}) { + ASSERT_EQ(n / fid, n / d) << "operator/ mismatch for numerator=" << n << " divisor=" << d; + ASSERT_EQ(n % fid, n % d) << "operator% mismatch for numerator=" << n << " divisor=" << d; + } + } + } + } +}; + +using FastIntDivTypes = ::testing::Types; +TYPED_TEST_CASE(FastIntDivTest, FastIntDivTypes); + +TYPED_TEST(FastIntDivTest, CompareWithNativeDivision) { this->CompareWithNativeDivision(); } + +TEST(FastIntDiv, Int64NumeratorPastInt32Boundary) +{ + for (int64_t d : {129, 772, 1000}) { + FastIntDiv fid(d); + for (int64_t n : {1LL << 31, 2147704000LL, 3LL << 30}) { // in [2^31, 2^32) + ASSERT_EQ(n / fid, n / d) << "operator/ mismatch for numerator=" << n << " divisor=" << d; + ASSERT_EQ(n % fid, n % d) << "operator% mismatch for numerator=" << n << " divisor=" << d; + } + } +} + +} // namespace raft::util From 180ccefb58b37b1f3ca8266526d9b84031efa57c Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 29 Jun 2026 08:38:59 +0000 Subject: [PATCH 3/5] Remote heavy UT due to OOM --- cpp/tests/matrix/gather.cu | 3 --- 1 file changed, 3 deletions(-) diff --git a/cpp/tests/matrix/gather.cu b/cpp/tests/matrix/gather.cu index a956624d67..0a50f9c7d9 100644 --- a/cpp/tests/matrix/gather.cu +++ b/cpp/tests/matrix/gather.cu @@ -268,7 +268,4 @@ GATHER_TEST((GatherTest), GATHER_TEST((GatherTest), GatherInplaceTestFI64I64, inplace_inputs_i64); -GATHER_TEST((GatherTest), - GatherInplaceTestCI64I64, - inplace_inputs_i64_i32_max); // slow test, 8GB allocation, reproduces https://github.com/rapidsai/raft/issues/3055 } // end namespace raft From 9f3d2cb0b09065d371ff8fff31ad478466bb182c Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 29 Jun 2026 08:46:50 +0000 Subject: [PATCH 4/5] precommit run --- cpp/include/raft/util/fast_int_div.cuh | 3 ++- cpp/tests/util/fast_int_div.cu | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/util/fast_int_div.cuh b/cpp/include/raft/util/fast_int_div.cuh index eea7e56376..d7f385a5d4 100644 --- a/cpp/include/raft/util/fast_int_div.cuh +++ b/cpp/include/raft/util/fast_int_div.cuh @@ -9,6 +9,7 @@ #include #include + #include namespace raft { @@ -18,7 +19,7 @@ namespace util { * @brief Perform fast integer division and modulo using a known divisor * From Hacker's Delight, Second Edition, Chapter 10 * - * @note 32b signed integer is supported. + * @note 32b signed integer is supported. * @note 64b signed integers is supported for an input data up to 2^31 * because gpu-non-native int128 is avoided for performance. * @todo Extend support for signed divisors diff --git a/cpp/tests/util/fast_int_div.cu b/cpp/tests/util/fast_int_div.cu index ed81335da3..eb391c1dbb 100644 --- a/cpp/tests/util/fast_int_div.cu +++ b/cpp/tests/util/fast_int_div.cu @@ -22,7 +22,7 @@ class FastIntDivTest : public ::testing::Test { { std::vector magnitudes{0, 1, 2, 3, 7, 13, 255, 12345, (1 << 20), kInt32Max}; std::vector divisors{1, 2, 4, 7, 16, 31, 63, 128, 1000, (1 << 15), kInt32Max}; - + for (IntT d : divisors) { FastIntDiv fid(d); for (IntT mag : magnitudes) { From d87dcdd84f83329093e4ad8f34d54c7534618e90 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 29 Jun 2026 09:11:01 +0000 Subject: [PATCH 5/5] Remove gather test leftover --- cpp/tests/matrix/gather.cu | 3 --- 1 file changed, 3 deletions(-) diff --git a/cpp/tests/matrix/gather.cu b/cpp/tests/matrix/gather.cu index 0a50f9c7d9..603676ac5d 100644 --- a/cpp/tests/matrix/gather.cu +++ b/cpp/tests/matrix/gather.cu @@ -236,9 +236,6 @@ const std::vector> inplace_inputs_i32 = const std::vector> inplace_inputs_i64 = raft::util::itertools::product>( {25, 2000}, {6, 31, 129}, {0, 1, 11}, {11, 999}, {0, 1, 2, 3, 6, 100}, {1234ULL}); -const std::vector> inplace_inputs_i64_i32_max = - raft::util::itertools::product>( - {1100000}, {2000}, {0}, {1100000}, {0}, {1234ULL}); GATHER_TEST((GatherTest), GatherTestFU32I32, inputs_i32); GATHER_TEST((GatherTest),