From e8662a124804f0a4c623576473c30ec3316f67a9 Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Tue, 14 Apr 2026 15:21:16 -0700 Subject: [PATCH 1/2] Parameterize unroll factor in stridingMemcpyKernel and add uint4 specialization --- library/kernels.cu | 51 ++++++++++++++++++---------------------------- 1 file changed, 20 insertions(+), 31 deletions(-) diff --git a/library/kernels.cu b/library/kernels.cu index 56aa274..305de0f 100644 --- a/library/kernels.cu +++ b/library/kernels.cu @@ -38,6 +38,11 @@ __device__ void write_to_regular_memory(T *dst, T val) { *dst = val; } +template<> +__device__ void write_to_regular_memory(uint4 *dst, uint4 val) { + *dst = {val.x, val.y, val.z, val.w}; +} + template __device__ void write_to_multicast_memory(T *dst, T val) { #if __CUDA_ARCH__ >= 900 @@ -59,9 +64,8 @@ __device__ void reduce_from_multicast_red(T *dst, T *val) { #endif } -template write> +template write, int UNROLL_FACTOR = 12> __global__ void stridingMemcpyKernel(unsigned int totalThreadCount, unsigned long long loopCount, T* dst, T* src, size_t sizeInElement) { - T *dstEnd = dst + sizeInElement; size_t chunkSizeInElement = sizeInElement / totalThreadCount; size_t globalThreadId = blockDim.x * blockIdx.x + threadIdx.x; @@ -69,48 +73,33 @@ __global__ void stridingMemcpyKernel(unsigned int totalThreadCount, unsigned lon src += globalThreadId; // Calculate where to end the big pipelined copy - size_t bigChunkSizeInElement = chunkSizeInElement / 12; - T *dstBigEnd = dst + (bigChunkSizeInElement * 12) * totalThreadCount; + size_t bigChunkSizeInElement = chunkSizeInElement / UNROLL_FACTOR; + T *dstBigEnd = dst + (bigChunkSizeInElement * UNROLL_FACTOR) * totalThreadCount; for (unsigned int i = 0; i < loopCount; i++) { T* cdst = dst; T* csrc = src; while (cdst < dstBigEnd) { - T pipe_0 = *csrc; csrc += totalThreadCount; - T pipe_1 = *csrc; csrc += totalThreadCount; - T pipe_2 = *csrc; csrc += totalThreadCount; - T pipe_3 = *csrc; csrc += totalThreadCount; - T pipe_4 = *csrc; csrc += totalThreadCount; - T pipe_5 = *csrc; csrc += totalThreadCount; - T pipe_6 = *csrc; csrc += totalThreadCount; - T pipe_7 = *csrc; csrc += totalThreadCount; - T pipe_8 = *csrc; csrc += totalThreadCount; - T pipe_9 = *csrc; csrc += totalThreadCount; - T pipe_10 = *csrc; csrc += totalThreadCount; - T pipe_11 = *csrc; csrc += totalThreadCount; - - write(cdst, pipe_0); cdst += totalThreadCount; - write(cdst, pipe_1); cdst += totalThreadCount; - write(cdst, pipe_2); cdst += totalThreadCount; - write(cdst, pipe_3); cdst += totalThreadCount; - write(cdst, pipe_4); cdst += totalThreadCount; - write(cdst, pipe_5); cdst += totalThreadCount; - write(cdst, pipe_6); cdst += totalThreadCount; - write(cdst, pipe_7); cdst += totalThreadCount; - write(cdst, pipe_8); cdst += totalThreadCount; - write(cdst, pipe_9); cdst += totalThreadCount; - write(cdst, pipe_10); cdst += totalThreadCount; - write(cdst, pipe_11); cdst += totalThreadCount; + T pipe[UNROLL_FACTOR]; + #pragma unroll + for (int k = 0; k < UNROLL_FACTOR; ++k) { + pipe[k] = *csrc; csrc += totalThreadCount; + } + #pragma unroll + for (int k = 0; k < UNROLL_FACTOR; ++k) { + write(cdst, pipe[k]); cdst += totalThreadCount; + } } // Take care of copies that didn't get aligned properly - while (cdst < dstEnd) { - write(cdst, *csrc); cdst += totalThreadCount; csrc += totalThreadCount; + for (size_t j = bigChunkSizeInElement * UNROLL_FACTOR; j < chunkSizeInElement; ++j) { + write(dst + j * totalThreadCount, src[j * totalThreadCount]); } } } + template write> __global__ void simpleMemcpyKernel(unsigned int totalThreadCount, unsigned long long loopCount, T* dst, T* src, size_t sizeInElement) { T *dstEnd = dst + sizeInElement; From 110247fc33ff730b168c8f33bb0c5f1e7f81fbbf Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Wed, 15 Apr 2026 12:03:49 -0700 Subject: [PATCH 2/2] Convert main loop to use indices. Fix tail processing. --- library/kernels.cu | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/library/kernels.cu b/library/kernels.cu index 305de0f..8d44055 100644 --- a/library/kernels.cu +++ b/library/kernels.cu @@ -71,35 +71,33 @@ __global__ void stridingMemcpyKernel(unsigned int totalThreadCount, unsigned lon size_t globalThreadId = blockDim.x * blockIdx.x + threadIdx.x; dst += globalThreadId; src += globalThreadId; + size_t maxElementIndex = sizeInElement - globalThreadId; // Calculate where to end the big pipelined copy size_t bigChunkSizeInElement = chunkSizeInElement / UNROLL_FACTOR; - T *dstBigEnd = dst + (bigChunkSizeInElement * UNROLL_FACTOR) * totalThreadCount; - for (unsigned int i = 0; i < loopCount; i++) { - T* cdst = dst; - T* csrc = src; + for (unsigned long long j = 0; j < bigChunkSizeInElement; j++) { + unsigned long long offset = j * UNROLL_FACTOR * totalThreadCount; - while (cdst < dstBigEnd) { T pipe[UNROLL_FACTOR]; #pragma unroll for (int k = 0; k < UNROLL_FACTOR; ++k) { - pipe[k] = *csrc; csrc += totalThreadCount; + pipe[k] = src[offset + k * totalThreadCount]; } #pragma unroll for (int k = 0; k < UNROLL_FACTOR; ++k) { - write(cdst, pipe[k]); cdst += totalThreadCount; + write(dst + offset + k * totalThreadCount, pipe[k]); } } - // Take care of copies that didn't get aligned properly - for (size_t j = bigChunkSizeInElement * UNROLL_FACTOR; j < chunkSizeInElement; ++j) { - write(dst + j * totalThreadCount, src[j * totalThreadCount]); + // Take care of copies that didn't get aligned properly and remainder elements + size_t start_offset = bigChunkSizeInElement * UNROLL_FACTOR * totalThreadCount; + for (size_t offset = start_offset; offset < maxElementIndex; offset += totalThreadCount) { + write(dst + offset, src[offset]); } } } - template write> __global__ void simpleMemcpyKernel(unsigned int totalThreadCount, unsigned long long loopCount, T* dst, T* src, size_t sizeInElement) { T *dstEnd = dst + sizeInElement;