From e8662a124804f0a4c623576473c30ec3316f67a9 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra@google.com>
Date: Tue, 14 Apr 2026 15:21:16 -0700
Subject: [PATCH 1/2] Parameterize unroll factor in stridingMemcpyKernel and
 add uint4 specialization

---
 library/kernels.cu | 51 ++++++++++++++++++----------------------------
 1 file changed, 20 insertions(+), 31 deletions(-)
diff --git a/library/kernels.cu b/library/kernels.cu
index 56aa274..305de0f 100644
--- a/library/kernels.cu
+++ b/library/kernels.cu
@@ -38,6 +38,11 @@ __device__ void write_to_regular_memory(T *dst, T val) {
     *dst = val;
 }
 
+template<>
+__device__ void write_to_regular_memory<uint4>(uint4 *dst, uint4 val) {
+    *dst = {val.x, val.y, val.z, val.w};
+}
+
 template<typename T>
 __device__ void write_to_multicast_memory(T *dst, T val) {
 #if __CUDA_ARCH__ >= 900
@@ -59,9 +64,8 @@ __device__ void reduce_from_multicast_red(T *dst, T *val) {
 #endif
 }
 
-template<typename T, write_to_memory<T> write>
+template<typename T, write_to_memory<T> write, int UNROLL_FACTOR = 12>
 __global__ void stridingMemcpyKernel(unsigned int totalThreadCount, unsigned long long loopCount, T* dst, T* src, size_t sizeInElement) {
-    T *dstEnd = dst + sizeInElement;
     size_t chunkSizeInElement = sizeInElement / totalThreadCount;
 
     size_t globalThreadId = blockDim.x * blockIdx.x + threadIdx.x;
@@ -69,48 +73,33 @@ __global__ void stridingMemcpyKernel(unsigned int totalThreadCount, unsigned lon
     src += globalThreadId;
 
     // Calculate where to end the big pipelined copy
-    size_t bigChunkSizeInElement = chunkSizeInElement / 12;
-    T *dstBigEnd = dst + (bigChunkSizeInElement * 12) * totalThreadCount;
+    size_t bigChunkSizeInElement = chunkSizeInElement / UNROLL_FACTOR;
+    T *dstBigEnd = dst + (bigChunkSizeInElement * UNROLL_FACTOR) * totalThreadCount;
 
     for (unsigned int i = 0; i < loopCount; i++) {
         T* cdst = dst;
         T* csrc = src;
 
         while (cdst < dstBigEnd) {
-            T pipe_0 = *csrc; csrc += totalThreadCount;
-            T pipe_1 = *csrc; csrc += totalThreadCount;
-            T pipe_2 = *csrc; csrc += totalThreadCount;
-            T pipe_3 = *csrc; csrc += totalThreadCount;
-            T pipe_4 = *csrc; csrc += totalThreadCount;
-            T pipe_5 = *csrc; csrc += totalThreadCount;
-            T pipe_6 = *csrc; csrc += totalThreadCount;
-            T pipe_7 = *csrc; csrc += totalThreadCount;
-            T pipe_8 = *csrc; csrc += totalThreadCount;
-            T pipe_9 = *csrc; csrc += totalThreadCount;
-            T pipe_10 = *csrc; csrc += totalThreadCount;
-            T pipe_11 = *csrc; csrc += totalThreadCount;
-
-            write(cdst, pipe_0); cdst += totalThreadCount;
-            write(cdst, pipe_1); cdst += totalThreadCount;
-            write(cdst, pipe_2); cdst += totalThreadCount;
-            write(cdst, pipe_3); cdst += totalThreadCount;
-            write(cdst, pipe_4); cdst += totalThreadCount;
-            write(cdst, pipe_5); cdst += totalThreadCount;
-            write(cdst, pipe_6); cdst += totalThreadCount;
-            write(cdst, pipe_7); cdst += totalThreadCount;
-            write(cdst, pipe_8); cdst += totalThreadCount;
-            write(cdst, pipe_9); cdst += totalThreadCount;
-            write(cdst, pipe_10); cdst += totalThreadCount;
-            write(cdst, pipe_11); cdst += totalThreadCount;
+            T pipe[UNROLL_FACTOR];
+            #pragma unroll
+            for (int k = 0; k < UNROLL_FACTOR; ++k) {
+                pipe[k] = *csrc; csrc += totalThreadCount;
+            }
+            #pragma unroll
+            for (int k = 0; k < UNROLL_FACTOR; ++k) {
+                write(cdst, pipe[k]); cdst += totalThreadCount;
+            }
         }
 
         // Take care of copies that didn't get aligned properly
-        while (cdst < dstEnd) {
-            write(cdst, *csrc); cdst += totalThreadCount; csrc += totalThreadCount;
+        for (size_t j = bigChunkSizeInElement * UNROLL_FACTOR; j < chunkSizeInElement; ++j) {
+            write(dst + j * totalThreadCount, src[j * totalThreadCount]);
         }
     }
 }
 
+
 template<typename T, reduce_from_memory<T> write>
 __global__ void simpleMemcpyKernel(unsigned int totalThreadCount, unsigned long long loopCount, T* dst, T* src, size_t sizeInElement) {
     T *dstEnd = dst + sizeInElement;

From 110247fc33ff730b168c8f33bb0c5f1e7f81fbbf Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra@google.com>
Date: Wed, 15 Apr 2026 12:03:49 -0700
Subject: [PATCH 2/2] Convert main loop to use indices. Fix tail processing.

---
 library/kernels.cu | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/library/kernels.cu b/library/kernels.cu
index 305de0f..8d44055 100644
--- a/library/kernels.cu
+++ b/library/kernels.cu
@@ -71,35 +71,33 @@ __global__ void stridingMemcpyKernel(unsigned int totalThreadCount, unsigned lon
     size_t globalThreadId = blockDim.x * blockIdx.x + threadIdx.x;
     dst += globalThreadId;
     src += globalThreadId;
+    size_t maxElementIndex = sizeInElement - globalThreadId;
 
     // Calculate where to end the big pipelined copy
     size_t bigChunkSizeInElement = chunkSizeInElement / UNROLL_FACTOR;
-    T *dstBigEnd = dst + (bigChunkSizeInElement * UNROLL_FACTOR) * totalThreadCount;
-
     for (unsigned int i = 0; i < loopCount; i++) {
-        T* cdst = dst;
-        T* csrc = src;
+        for (unsigned long long j = 0; j < bigChunkSizeInElement; j++) {
+            unsigned long long offset = j * UNROLL_FACTOR * totalThreadCount;
 
-        while (cdst < dstBigEnd) {
             T pipe[UNROLL_FACTOR];
             #pragma unroll
             for (int k = 0; k < UNROLL_FACTOR; ++k) {
-                pipe[k] = *csrc; csrc += totalThreadCount;
+                pipe[k] = src[offset + k * totalThreadCount];
             }
             #pragma unroll
             for (int k = 0; k < UNROLL_FACTOR; ++k) {
-                write(cdst, pipe[k]); cdst += totalThreadCount;
+                write(dst + offset + k * totalThreadCount, pipe[k]);
             }
         }
 
-        // Take care of copies that didn't get aligned properly
-        for (size_t j = bigChunkSizeInElement * UNROLL_FACTOR; j < chunkSizeInElement; ++j) {
-            write(dst + j * totalThreadCount, src[j * totalThreadCount]);
+        // Take care of copies that didn't get aligned properly and remainder elements
+        size_t start_offset = bigChunkSizeInElement * UNROLL_FACTOR * totalThreadCount;
+        for (size_t offset = start_offset; offset < maxElementIndex; offset += totalThreadCount) {
+            write(dst + offset, src[offset]);
         }
     }
 }
 
-
 template<typename T, reduce_from_memory<T> write>
 __global__ void simpleMemcpyKernel(unsigned int totalThreadCount, unsigned long long loopCount, T* dst, T* src, size_t sizeInElement) {
     T *dstEnd = dst + sizeInElement;