diff --git a/README.md b/README.md index 0e38ddb1..9c407f28 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,78 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Daniel Chen + * https://www.linkedin.com/in/daniel-c-a02ba2229/ +* Tested on: Windows 11, AMD Ryzen 7 8845HS w/ Radeon 780M Graphics (3.80 GHz), RTX 4070 notebook -### (TODO: Your README) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +# Writeup +This project contains several CPU/GPU prefix sum and stream compaction algorithms to compare their performance. +## Algorithm comparison +![algorithm comparison graphs](/img/graphs.png) + +The above log-log plots show the running times of each of the implemented scan algorithms, at power-of-2 array sizes (left) and 3 less than each of those power-of-2 array sizes (right). Initial memory setup using `cudaMalloc`, `cudaMemset`, and `thrust::{host_vector, copy}` is excluded from the timings as per Part 7 of [INSTRUCTION.md](INSTRUCTION.md). + +For small arrays, CPU scan was the fastest, followed by naive on average, then work-efficient, then thrust. However, throughout array sizes in the low millions, the order reverses: thrust is the fastest, followed by work-efficient, then naive, and then CPU. As the array sizes grow large, they all seem to increase by the same power (about linear). The reversal of the order would likely be due to overhead of the kernel launches at the smaller array sizes and/or possible block size misalignment with the number of threads. As the arrays grow large, however, the benefits of parallelization and GPU memory access optimization become more pronounced the more times the kernels are launched and the more blocks that are instantiated, even if the long-term complexity of all the algorithms remains $O(n)$ (not $O(\log(n))$ for the GPU implementations as the number of concurrent threads will be capped to be some fixed number based on the GPU hardware). + +## Extra credit +As requested in Part 5 of [INSTRUCTION.md](INSTRUCTION.md), the work-efficient GPU implementation is over 4 times faster than the CPU implementation for array sizes greater than ~1 million even without further optimization. + +## Test output +``` + +**************** +** SCAN TESTS ** +**************** + [ 29 22 44 14 19 35 38 23 43 43 35 30 6 ... 20 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.0012ms (std::chrono Measured) + [ 0 29 51 95 109 128 163 201 224 267 310 345 375 ... 6278 6298 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.0006ms (std::chrono Measured) + [ 0 29 51 95 109 128 163 201 224 267 310 345 375 ... 6216 6219 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 1.81264ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.605056ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.677728ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.334272ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 2.58243ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.525216ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 2 1 2 0 2 3 0 3 1 1 2 1 ... 2 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.0014ms (std::chrono Measured) + [ 2 2 1 2 2 3 3 1 1 2 1 1 3 ... 2 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.0011ms (std::chrono Measured) + [ 2 2 1 2 2 3 3 1 1 2 1 1 3 ... 2 3 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.0057ms (std::chrono Measured) + [ 2 2 1 2 2 3 3 1 1 2 1 1 3 ... 2 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.727648ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 1.61846ms (CUDA Measured) + passed +Press any key to continue . . . +``` \ No newline at end of file diff --git a/img/graphs.png b/img/graphs.png new file mode 100644 index 00000000..87de47e4 Binary files /dev/null and b/img/graphs.png differ diff --git a/src/main.cpp b/src/main.cpp index 3d5c8820..3f8e321b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 29; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d630..9bb4cdb5 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,10 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) return; + + bools[index] = idata[index] == 0 ? 0 : 1; } /** @@ -32,7 +35,12 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) return; + + if (idata[index] == 0) return; + + odata[indices[index]] = idata[index]; } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed9..1080750b 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -114,6 +114,10 @@ namespace StreamCompaction { PerformanceTimer& operator=(const PerformanceTimer&) = delete; PerformanceTimer& operator=(PerformanceTimer&&) = delete; + bool cpuTimerStarted() { + return cpu_timer_started; + } + private: cudaEvent_t event_start = nullptr; cudaEvent_t event_end = nullptr; diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa115..f996c788 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -18,9 +18,22 @@ namespace StreamCompaction { * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. */ void scan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); + bool timerStarted = false; + if (!timer().cpuTimerStarted()) { + timer().startCpuTimer(); + timerStarted = true; + } + + if (n > 0) { + odata[0] = 0; + } + for (int i = 1; i < n; i++) { + odata[i] = odata[i - 1] + idata[i - 1]; + } + + if (timerStarted) { + timer().endCpuTimer(); + } } /** @@ -30,9 +43,17 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int offset = 0; + for (int i = 0; i < n; i++) { + if (idata[i] == 0) { + offset++; + } + else { + odata[i - offset] = idata[i]; + } + } timer().endCpuTimer(); - return -1; + return n - offset; } /** @@ -42,9 +63,26 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + + for (int i = 0; i < n; i++) { + odata[i] = idata[i] == 0 ? 0 : 1; + } + + int* offsets = new int[n]; + + scan(n, offsets, odata); + for (int i = 0; i < n; i++) { + if (idata[i] == 0) continue; + odata[offsets[i]] = idata[i]; + } + + int out = offsets[n - 1]; + + delete[] offsets; + timer().endCpuTimer(); - return -1; + return out; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346ee..b6d61cbf 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -2,6 +2,7 @@ #include #include "common.h" #include "efficient.h" +#include namespace StreamCompaction { namespace Efficient { @@ -12,13 +13,75 @@ namespace StreamCompaction { return timer; } + + int* dev_arr; + int *dev_arr2; + int* dev_bools; + + __global__ void upsweepStep(int nc, int step, int* arr) { + unsigned long long int index = blockIdx.x * blockDim.x + threadIdx.x; + + unsigned long long int indexRight = (index + 1) * step - 1; + if (indexRight >= nc) return; + + unsigned long long int indexLeft = indexRight - step / 2; + + arr[indexRight] = arr[indexLeft] + arr[indexRight]; + } + + __global__ void downsweepStep(int nc, int step, int* arr) { + unsigned long long int index = blockIdx.x * blockDim.x + threadIdx.x; + + unsigned long long int indexRight = (index + 1) * step - 1; + if (indexRight >= nc) return; + + unsigned long long int indexLeft = indexRight - step / 2; + + int right = arr[indexRight]; + arr[indexRight] += arr[indexLeft]; + arr[indexLeft] = right; + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int nc = 1 << ilog2ceil(n); + + cudaMalloc((void**)&dev_arr, nc * sizeof(int)); + + cudaMemcpy(dev_arr, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemset(dev_arr + n, 0, (nc - n) * sizeof(int)); + + timer().startGpuTimer(); // TODO + + + + int blockSize = 256; + + + for (int step = 2; step <= nc; step <<= 1) { + int nBlocks = (nc / step + blockSize - 1) / blockSize; + upsweepStep<<>>(nc, step, dev_arr); + } + + cudaMemset(dev_arr + nc - 1, 0, sizeof(int)); + + for (int step = nc; step >= 2; step >>= 1) { + int nBlocks = (nc / step + blockSize - 1) / blockSize; + downsweepStep<<>>(nc, step, dev_arr); + } + + timer().endGpuTimer(); + + + + cudaMemcpy(odata, dev_arr, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_arr); } /** @@ -31,10 +94,58 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + if (n == 0) return 0; + + + int nc = 1 << ilog2ceil(n); + + cudaMalloc((void**)&dev_arr, nc * sizeof(int)); + cudaMalloc((void**)&dev_arr2, n * sizeof(int)); + cudaMalloc((void**)&dev_bools, nc * sizeof(int)); + + + int blockSize = 256; + int nBlocksOuter = (nc + blockSize - 1) / blockSize; + + + cudaMemcpy(dev_arr, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemset(dev_arr + n, 0, (nc - n) * sizeof(int)); + + timer().startGpuTimer(); - // TODO + + Common::kernMapToBoolean<<>>(nc, dev_bools, dev_arr); + + for (int step = 2; step <= nc; step <<= 1) { + int nBlocks = (nc / step + blockSize - 1) / blockSize; + upsweepStep<<>>(nc, step, dev_bools); + } + + cudaMemset(dev_bools + nc - 1, 0, sizeof(int)); + + for (int step = nc; step >= 2; step >>= 1) { + int nBlocks = (nc / step + blockSize - 1) / blockSize; + downsweepStep<<>>(nc, step, dev_bools); + } + + Common::kernScatter<<>>(n, dev_arr2, dev_arr, NULL, dev_bools); + + timer().endGpuTimer(); - return -1; + + + + int nFinal; + cudaMemcpy(&nFinal, dev_bools + nc - 1, sizeof(int), cudaMemcpyDeviceToHost); + + cudaMemcpy(odata, dev_arr2, nFinal * sizeof(int), cudaMemcpyDeviceToHost); + + + cudaFree(dev_arr); + cudaFree(dev_arr2); + cudaFree(dev_bools); + + return nFinal; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 43088769..77c47588 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -12,14 +12,66 @@ namespace StreamCompaction { return timer; } // TODO: __global__ + int *dev_arr1; + int *dev_arr2; + + __global__ void add(int n, int skip, int* out, int* in) { + unsigned long long int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) return; + + out[index] = in[index]; + + if (index >= skip) { + out[index] += in[index - skip]; + } + } + + __global__ void insert0(int n, int* out, int* in) { + unsigned long long int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) return; + + out[index] = index == 0 ? 0 : in[index - 1]; + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + cudaMalloc((void**)&dev_arr1, n * sizeof(int)); + cudaMalloc((void**)&dev_arr2, n * sizeof(int)); + + cudaMemcpy(dev_arr1, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + timer().startGpuTimer(); - // TODO + + + int blockSize = 256; + int nBlocks = (n + blockSize - 1) / blockSize; + + + int *curIn = dev_arr1; + int *curOut = dev_arr2; + + insert0<<>>(n, curOut, curIn); + + for (int skip = 1; skip < n; skip <<= 1) { + int* tmp = curIn; + curIn = curOut; + curOut = tmp; + + add<<>>(n, skip, curOut, curIn); + } + + + timer().endGpuTimer(); + + + cudaMemcpy(odata, curOut, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_arr1); + cudaFree(dev_arr2); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e7..72e8a8e7 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -1,11 +1,12 @@ #include #include -#include -#include -#include #include "common.h" #include "thrust.h" +#include +#include +#include + namespace StreamCompaction { namespace Thrust { using StreamCompaction::Common::PerformanceTimer; @@ -18,11 +19,21 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + thrust::device_vector dv_in = (thrust::device_vector)thrust::host_vector(idata, idata + n); + thrust::device_vector dv_out = (thrust::device_vector)thrust::host_vector(odata, odata + n); + + timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + timer().endGpuTimer(); + + + thrust::copy(dv_out.begin(), dv_out.end(), odata); } } }