diff --git a/README.md b/README.md
index 0e38ddb1..823ec874 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,122 @@ CUDA Stream Compaction
**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
-* (TODO) YOUR NAME HERE
- * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Rachel Lin
-### (TODO: Your README)
+ * [LinkedIn](https://www.linkedin.com/in/rachel-lin-452834213/)
+ * [personal website](https://www.artstation.com/rachellin4)
+ * [Instagram](https://www.instagram.com/lotus_crescent/)
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+* Tested on: (TODO) Windows 11, 12th Gen Intel(R) Core(TM) i7-12700H @ 2.30GHz, NVIDIA GeForce RTX 3080 Laptop GPU (16 GB)
+
+## Description
+
+This project offers parallel scan and stream compaction algorithms in CUDA. Features include:
+ * stream compaction to remove unwanted elements (zeros) from an input data array and scatter the valid elements into a compacted output buffer
+ * exclusive (prefix sum) scanning
+ * on the CPU using a simple for-loop
+ * on the GPU using a naive algorithm
+ * on the GPU using a work-efficient algorithm that avoids race conditions
+ * on the GPU using thrust library
+
+## Performance Analysis
+
+### Comparison of GPU Scan Implementations
+
+
+
+
+#### Average Scan Time vs. Array Size (Power of Two)
+
+| Array Size | CPU | Naive | Work-Efficient | Thrust |
+| --------- | --------- | --------- | --------- | --------- |
+| 256 | 0.0006 | 0.1969493333 | 0.2730666667 | 0.1140053333 |
+| 1024 | 0.0018 | 0.2085546667 | 0.41984 | 0.1235733333 |
+| 16384 | 0.0284 | 0.254976 | 0.9103373333 | 0.1314133333 |
+| 131072 | 0.2284333333 | 0.6356693333 | 0.7360333333 | 0.130048 |
+| 1048576 | 1.737733333 | 1.173386667 | 1.053409333 | 0.759808 |
+| 4194304 | 7.669166667 | 4.442293333 | 2.438283333 | 0.8376226667 |
+| 16777216 | 27.9746 | 13.6956 | 6.76686 | 1.431213333 |
+
+#### Average Scan Time vs. Array Size (Non-Power of Two)
+
+| Array Size | CPU | Naive | Work-Efficient | Thrust |
+| --------- | --------- | --------- | --------- | --------- |
+| 253 | 0.0005 | 0.06144 | 0.2095786667 | 0.05768533333 |
+| 1021 | 0.0019 | 0.1505493333 | 0.2740906667 | 0.05290666667 |
+| 16383 | 0.03276666667 | 0.1723733333 | 0.372736 | 0.05563733333 |
+| 131069 | 0.2283666667 | 0.6361066667 | 0.5046293333 | 0.045056 |
+| 1048573 | 2.531466667 | 1.062026667 | 0.9203946667 | 0.75264 |
+| 4194301 | 7.606633333 | 5.455966667 | 3.874946667 | 1.58338 |
+| 16777213 | 28.6349 | 13.5973 | 6.175863333 | 1.657173333 |
+
+#### CPU
+This implementation does not involve the GPU at all and is purely single-threaded. This makes it faster for small arrays because no kernel launch is required. However, this algorithm scales poorly with array size compared to the other implementations because it does not take advantage of the multi-threaded approach that the other algorithms do. This approach faces bottlenecks in both memory and computation (it becomes slower as the array size gets large).
+
+#### Naive
+This algorithm requires two arrays that are swapped every iteration to avoid race conditions. Since it performs computations in parallel, it scales relatively well compared to the CPU approach. This algorithm is not as optimized as it could be; every iteration, all threads with index less than the stride value are idle. However, the most significant bottleneck comes from the kernel-launch overhead (there are log_2(n) kernels) the redundant computations where some threads re-add elements from the input array.
+
+#### Work-Efficient
+The work-efficient algorithm uses up-sweep to build a sum tree and down-sweep to distribute prefix sums. Sine it opertes in-place, this saves memory. This approach also takes advantage of parallelism on the GPU and uses log_2(n) kernel launches, but each kernel does less extra work because there are fewer redundant computations. This approach still faces a bottleneck through the kernel-launch overhead (still log_2(n) kernels for upsweep and downsweep).
+
+#### Thrust
+The thrust approach is very fast on large arrays. It may be using shared memory or minimizing the number of idle threads to further optimize the algorithm.
+
+
+
+### Example Output for Array Size 256
+```
+****************
+** SCAN TESTS **
+****************
+ [ 24 2 28 23 36 34 30 21 22 40 10 0 17 ... 20 0 ]
+==== cpu scan, power-of-two ====
+ elapsed time: 0.0005ms (std::chrono Measured)
+ [ 0 24 26 54 77 113 147 177 198 220 260 270 270 ... 6029 6049 ]
+==== cpu scan, non-power-of-two ====
+ elapsed time: 0.0005ms (std::chrono Measured)
+ [ 0 24 26 54 77 113 147 177 198 220 260 270 270 ... 5891 5934 ]
+ passed
+==== naive scan, power-of-two ====
+ elapsed time: 0.823296ms (CUDA Measured)
+ passed
+==== naive scan, non-power-of-two ====
+ elapsed time: 0.13824ms (CUDA Measured)
+ passed
+==== work-efficient scan, power-of-two ====
+ elapsed time: 0.635904ms (CUDA Measured)
+ passed
+==== work-efficient scan, non-power-of-two ====
+ elapsed time: 0.311296ms (CUDA Measured)
+ passed
+==== thrust scan, power-of-two ====
+ elapsed time: 0.203776ms (CUDA Measured)
+ passed
+==== thrust scan, non-power-of-two ====
+ elapsed time: 0.094208ms (CUDA Measured)
+ passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+ [ 0 2 0 3 0 2 2 1 0 0 2 2 1 ... 2 0 ]
+==== cpu compact without scan, power-of-two ====
+ elapsed time: 0.0008ms (std::chrono Measured)
+ [ 2 3 2 2 1 2 2 1 3 1 2 3 3 ... 2 2 ]
+ passed
+==== cpu compact without scan, non-power-of-two ====
+ elapsed time: 0.0004ms (std::chrono Measured)
+ [ 2 3 2 2 1 2 2 1 3 1 2 3 3 ... 1 3 ]
+ passed
+==== cpu compact with scan ====
+ elapsed time: 0.0048ms (std::chrono Measured)
+ [ 2 3 2 2 1 2 2 1 3 1 2 3 3 ... 2 2 ]
+ passed
+==== work-efficient compact, power-of-two ====
+ elapsed time: 0.538624ms (CUDA Measured)
+ passed
+==== work-efficient compact, non-power-of-two ====
+ elapsed time: 0.223232ms (CUDA Measured)
+ passed
+```
diff --git a/img/Scan Time vs. Array Size (Non-Power of Two).png b/img/Scan Time vs. Array Size (Non-Power of Two).png
new file mode 100644
index 00000000..fe58597e
Binary files /dev/null and b/img/Scan Time vs. Array Size (Non-Power of Two).png differ
diff --git a/img/Scan Time vs. Array Size (Power of Two).png b/img/Scan Time vs. Array Size (Power of Two).png
new file mode 100644
index 00000000..65817a19
Binary files /dev/null and b/img/Scan Time vs. Array Size (Power of Two).png differ
diff --git a/src/main.cpp b/src/main.cpp
index 3d5c8820..b5a69dc9 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -20,6 +20,16 @@ int *b = new int[SIZE];
int *c = new int[SIZE];
int main(int argc, char* argv[]) {
+ int deviceCount;
+ cudaGetDeviceCount(&deviceCount);
+
+ for (int i = 0; i < deviceCount; i++) {
+ cudaDeviceProp prop;
+ cudaGetDeviceProperties(&prop, i);
+ std::cout << "Device " << i << ": " << prop.name << "\n";
+ std::cout << "Compute capability: " << prop.major << "." << prop.minor << "\n";
+ }
+
// Scan tests
printf("\n");
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d630..0a4b8914 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -23,7 +23,15 @@ namespace StreamCompaction {
* which map to 0 will be removed, and elements which map to 1 will be kept.
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
- // TODO
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+ if (index < n) {
+ if (idata[index] == 0) {
+ bools[index] = 0;
+ } else {
+ bools[index] = 1;
+ }
+ }
}
/**
@@ -32,7 +40,14 @@ namespace StreamCompaction {
*/
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices) {
- // TODO
+
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+ if (index < n) {
+ if (bools[index] == 1) {
+ odata[indices[index]] = idata[index];
+ }
+ }
}
}
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa115..3d12e7eb 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -19,7 +19,19 @@ namespace StreamCompaction {
*/
void scan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
- // TODO
+
+ // check size of idata
+ if (n == 0) {
+ return;
+ }
+
+ // compute exclusive prefix sum (ignore last element)
+ odata[0] = 0;
+
+ for (int i = 1; i < n; i++) {
+ odata[i] = odata[i - 1] + idata[i - 1];
+ }
+
timer().endCpuTimer();
}
@@ -30,9 +42,20 @@ namespace StreamCompaction {
*/
int compactWithoutScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
- // TODO
+
+ // number of elements remaining
+ int numElems = 0;
+
+ // compaction
+ for (int i = 0; i < n; i++) {
+ if (idata[i] != 0) {
+ odata[numElems] = idata[i];
+ numElems++;
+ }
+ }
+
timer().endCpuTimer();
- return -1;
+ return numElems;
}
/**
@@ -42,9 +65,46 @@ namespace StreamCompaction {
*/
int compactWithScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
- // TODO
+
+ // temporary array to indicate the element should be kept/discarded
+ int* isValid = new int[n];
+
+ for (int i = 0; i < n; i++) {
+ isValid[i] = 0;
+ if (idata[i] != 0) {
+ isValid[i] = 1;
+ }
+ }
+
+ // exclusive prefix sum scan on temp array
+ // represents the index in odata that i in idata should be mapped to
+ int* indices = new int[n];
+
+ // compute exclusive prefix sum (ignore last element)
+ indices[0] = 0;
+
+ for (int i = 1; i < n; i++) {
+ indices[i] = indices[i - 1] + isValid[i - 1];
+ }
+
+ // number of elements remaining
+ int numElems = 0;
+
+ // scatter
+ for (int i = 0; i < n; i++) {
+ if (isValid[i] == 1) {
+ int idx = indices[i];
+ odata[idx] = idata[i];
+ numElems++;
+ }
+ }
+
timer().endCpuTimer();
- return -1;
+
+ delete[] isValid;
+ delete[] indices;
+
+ return numElems;
}
}
}
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346ee..945d9ce8 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -3,6 +3,14 @@
#include "common.h"
#include "efficient.h"
+/*****************
+* Configuration *
+*****************/
+
+/*! Block size used for CUDA kernel launch. */
+#define blockSize 256
+
+
namespace StreamCompaction {
namespace Efficient {
using StreamCompaction::Common::PerformanceTimer;
@@ -12,13 +20,149 @@ namespace StreamCompaction {
return timer;
}
+ // set last elem to zero
+ __global__ void kernZeroLast(int n, int* idata) {
+ if (threadIdx.x == 0 && blockIdx.x == 0) {
+ idata[n - 1] = 0;
+ }
+ }
+
+ // up-sweep
+ __global__ void kernUpsweep(int n, int d, int* idata) {
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ int stride = 1 << (d + 1); // 2^(d+1)
+ int numNodes = n >> (d + 1);
+
+ if (index < numNodes) {
+ int idx1 = index * stride + stride - 1;
+ int idx2 = idx1 - (stride >> 1);
+ idata[idx1] += idata[idx2];
+ }
+ }
+
+ // down-sweep
+ __global__ void kernDownsweep(int n, int d, int* idata) {
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ int stride = 1 << (d + 1); // 2^(d+1)
+ int numNodes = n >> (d + 1);
+
+ if (index < numNodes) {
+ int idx1 = index * stride + stride - 1;
+ int idx2 = idx1 - (stride >> 1);
+
+ int t = idata[idx2];
+ idata[idx2] = idata[idx1];
+ idata[idx1] += t;
+ }
+ }
+
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
+
+ // set new n
+ int newN = 1 << ilog2ceil(n);
+
+ // device array buffer
+ int* dev_arr;
+
+ // allocate buffers
+ cudaMalloc((void**)&dev_arr, newN * sizeof(int));
+ checkCUDAError("cudaMalloc dev_arr failed!");
+
+ cudaMemset(dev_arr, 0, newN * sizeof(int));
+ cudaMemcpy(dev_arr, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+
+
timer().startGpuTimer();
- // TODO
+
+ // upsweep
+ for (int d = 0; d < ilog2ceil(n); d++) {
+
+ int numNodes = newN >> (d + 1);
+
+ dim3 fullBlocksPerGrid((numNodes + blockSize - 1) / blockSize);
+
+ kernUpsweep << > > (newN, d, dev_arr);
+
+ checkCUDAError("kernUpsweep failed!");
+ }
+
+ // downsweep
+ cudaMemset(dev_arr + (newN - 1), 0, sizeof(int));
+ for (int d = ilog2ceil(n) - 1; d >= 0; d--) {
+
+ int numNodes = newN >> (d + 1);
+
+ dim3 fullBlocksPerGrid((numNodes + blockSize - 1) / blockSize);
+
+ kernDownsweep << > > (newN, d, dev_arr);
+
+ checkCUDAError("kernDownsweep failed!");
+ }
+ cudaDeviceSynchronize();
+
timer().endGpuTimer();
+
+ // copy data back to the CPU side from the GPU
+ cudaMemcpy(odata, dev_arr, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+ // cleanup
+ cudaFree(dev_arr);
+ }
+
+
+ /**
+ * Performs prefix-sum (aka scan) on idata, storing the result into odata (device only).
+ */
+ void compactScan(int n, int* dev_odata, const int* dev_idata) {
+
+ // set new n
+ int newN = 1 << ilog2ceil(n);
+
+ // device array buffer
+ int* dev_arr;
+
+ // allocate buffers
+ cudaMalloc((void**)&dev_arr, newN * sizeof(int));
+ checkCUDAError("cudaMalloc dev_arr failed!");
+
+ cudaMemset(dev_arr, 0, newN * sizeof(int));
+ cudaMemcpy(dev_arr, dev_idata, n * sizeof(int), cudaMemcpyDeviceToDevice);
+
+
+ // upsweep
+ for (int d = 0; d < ilog2ceil(n); d++) {
+ int numNodes = newN / (1 << (d + 1));
+
+ dim3 fullBlocksPerGrid((numNodes + blockSize - 1) / blockSize);
+
+ kernUpsweep << > > (newN, d, dev_arr);
+
+ checkCUDAError("kernUpsweep failed!");
+ }
+
+ kernZeroLast << <1, 1 >> > (newN, dev_arr);
+
+ // downsweep
+ for (int d = ilog2ceil(n) - 1; d >= 0; d--) {
+ int numNodes = newN / (1 << (d + 1));
+
+ dim3 fullBlocksPerGrid((numNodes + blockSize - 1) / blockSize);
+
+ kernDownsweep << > > (newN, d, dev_arr);
+
+ checkCUDAError("kernDownsweep failed!");
+ }
+ cudaDeviceSynchronize();
+
+ // copy data back to the CPU side from the GPU
+ cudaMemcpy(dev_odata, dev_arr, n * sizeof(int), cudaMemcpyDeviceToDevice);
+
+ // cleanup
+ cudaFree(dev_arr);
}
/**
@@ -31,10 +175,58 @@ namespace StreamCompaction {
* @returns The number of elements remaining after compaction.
*/
int compact(int n, int *odata, const int *idata) {
+ int newN = 1 << ilog2ceil(n);
+
+ // allocate buffers
+ int* dev_idata;
+ int* dev_odata;
+ int* dev_isValid;
+ int* dev_indices;
+
+ cudaMalloc((void**)&dev_idata, n * sizeof(int));
+ checkCUDAError("cudaMalloc dev_idata failed!");
+ cudaMalloc((void**)&dev_odata, n * sizeof(int));
+ checkCUDAError("cudaMalloc dev_odata failed!");
+ cudaMalloc((void**)&dev_isValid, n * sizeof(int));
+ checkCUDAError("cudaMalloc dev_isValid failed!");
+ cudaMalloc((void**)&dev_indices, n * sizeof(int));
+ checkCUDAError("cudaMalloc dev_indices failed!");
+
+ cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+ dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+
timer().startGpuTimer();
- // TODO
+
+ // temporary array to indicate the element should be kept/discarded
+ StreamCompaction::Common::kernMapToBoolean << > > (n, dev_isValid, dev_idata);
+
+ // exclusive scan on temporary array
+ compactScan(n, dev_indices, dev_isValid);
+
+ // scatter
+ StreamCompaction::Common::kernScatter << > > (n, dev_odata, dev_idata, dev_isValid, dev_indices);
+ cudaDeviceSynchronize();
+
timer().endGpuTimer();
- return -1;
+
+
+ // copy data from dev_odata to odata
+ int lastValid = 0;
+ cudaMemcpy(&lastValid, dev_isValid + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+ int lastIdx = 0;
+ cudaMemcpy(&lastIdx, dev_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+ int numElems = lastValid + lastIdx;
+ cudaMemcpy(odata, dev_odata, numElems * sizeof(int), cudaMemcpyDeviceToHost);
+
+ // cleanup
+ cudaFree(dev_idata);
+ cudaFree(dev_odata);
+ cudaFree(dev_indices);
+ cudaFree(dev_isValid);
+
+ return numElems;
}
}
}
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 43088769..17488bca 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -3,6 +3,13 @@
#include "common.h"
#include "naive.h"
+/*****************
+* Configuration *
+*****************/
+
+/*! Block size used for CUDA kernel launch. */
+#define blockSize 256
+
namespace StreamCompaction {
namespace Naive {
using StreamCompaction::Common::PerformanceTimer;
@@ -11,15 +18,67 @@ namespace StreamCompaction {
static PerformanceTimer timer;
return timer;
}
- // TODO: __global__
+
+ __global__ void kernNaiveParallelScan(int n, int* odata, const int* idata, int k) {
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+ if (index < n) {
+ // 2^(d-1)
+ int kernelStart = 1 << (k - 1);
+
+ if (index >= kernelStart) {
+ odata[index] = idata[index] + idata[index - kernelStart];
+ } else {
+ odata[index] = idata[index];
+ }
+ }
+ }
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
+ // use global memory
+ // ilog2ceil(n) kernel invocations
+ // since GPU threads are not guaranteed to run simultaneously, we can't operate on an array in-place on the GPU (it will cause race conditions)
+ // instead, create two device arrays and swap them every iteration
+
+ // device array buffers
+ int* dev_arrA;
+ int* dev_arrB;
+
+ // allocate buffers
+ cudaMalloc((void**)&dev_arrA, n * sizeof(int));
+ checkCUDAError("cudaMalloc dev_arrA failed!");
+ cudaMalloc((void**)&dev_arrB, n * sizeof(int));
+ checkCUDAError("cudaMalloc dev_arrB failed!");
+
+ cudaMemcpy(dev_arrA, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+ dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
timer().startGpuTimer();
- // TODO
+
+ // for each kernel
+ for (int k = 1; k <= ilog2ceil(n); k++) {
+ // call naive scan kernel
+ kernNaiveParallelScan << > > (n, dev_arrB, dev_arrA, k);
+ checkCUDAError("naiveParallelScanKernel failed!");
+
+ // swap array buffers
+ std::swap(dev_arrA, dev_arrB);
+ }
+ cudaDeviceSynchronize();
+
timer().endGpuTimer();
+
+ // copy data back to the CPU side from the GPU
+ odata[0] = 0;
+ cudaMemcpy(odata + 1, dev_arrA, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost);
+
+ // cleanup
+ cudaFree(dev_arrA);
+ cudaFree(dev_arrB);
}
}
}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e7..96c16163 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -18,11 +18,31 @@ namespace StreamCompaction {
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
- timer().startGpuTimer();
- // TODO use `thrust::exclusive_scan`
+
+ // use `thrust::exclusive_scan`
// example: for device_vectors dv_in and dv_out:
// thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
- timer().endGpuTimer();
+
+ if (n > 0) {
+ // copy input from host pointer to host vector
+ //thrust::host_vector host_iData(idata, idata + n);
+
+ // cast as device vector
+ thrust::device_vector dev_iData(idata, idata + n);
+
+ // device output vector
+ thrust::device_vector dev_oData(n);
+
+ timer().startGpuTimer();
+
+ // perform exclusive scan on GPU
+ thrust::exclusive_scan(dev_iData.begin(), dev_iData.end(), dev_oData.begin());
+
+ timer().endGpuTimer();
+
+ // copy result to host output vector
+ thrust::copy(dev_oData.begin(), dev_oData.end(), odata);
+ }
}
}
}