diff --git a/.clang-format b/.clang-format new file mode 100644 index 00000000..dab6e3b2 --- /dev/null +++ b/.clang-format @@ -0,0 +1,7 @@ +BasedOnStyle: LLVM + +BreakTemplateDeclarations: Yes +ColumnLimit: 100 +FixNamespaceComments: false +IndentWidth: 4 +NamespaceIndentation: All diff --git a/CMakeLists.txt b/CMakeLists.txt index 610c27d4..c3b1b602 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,3 +44,6 @@ source_group(Sources FILES ${sources}) add_executable(${CMAKE_PROJECT_NAME} ${sources} ${headers}) target_link_libraries(${CMAKE_PROJECT_NAME} stream_compaction) set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT ${CMAKE_PROJECT_NAME}) + +add_executable(measure_time src/measure_time.cpp ${headers}) +target_link_libraries(measure_time stream_compaction) diff --git a/README.md b/README.md index 0e38ddb1..213e42a2 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,471 @@ -CUDA Stream Compaction -====================== +# CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +- Yunhao Qian + - [LinkedIn](https://www.linkedin.com/in/yunhao-qian-026980170/) + - [GitHub](https://github.com/yunhao-qian) +- Tested on: + - OS: Windows 11, 24H2 + - CPU: 13th Gen Intel(R) Core(TM) i7-13700 (2.10 GHz) + - GPU: NVIDIA GeForce RTX 4090 + - RAM: 32.0 GB -### (TODO: Your README) +## Overview -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +### Features +This project implements stream compaction and its building blocks (map, scan, scatter) using multiple approaches. Key features include: + +- A CPU implementation of scan and stream compaction +- GPU implementations of scan, using both naive and work-efficient methods +- A GPU implementation of stream compaction based on the work-efficient scan +- A carefully optimized work-efficient scan that leverages shared memory along with several additional optimizations +- C++ and Python scripts used to automate performance measurement accurately and programmatically +- Performance analysis comparing the different methods +- More in-depth performance analysis conducted with Nsight + +### Changes to `CMakeLists.txt` + +- An additional executable,`measure_time.exe`, has been added to the project to support block size tuning, performance benchmarking, and profiling. +- The files `cpu_sort.h`, `cpu_sort.cu`, `radix_sort.h`, `radix_sort.h` were introduced to implement Extra Credit 1. +- The files `efficient_plus.h` and `efficient_plus.cu` were introduced to implement Extra Credit 2. + +### Function Overloads + +To make block size tuning easier, I added overloads of the following functions that accept an additional `blockSize` parameter. The original overloads remain unchanged and simply forward to the new versions with a tuned default value: + +- `Naive::scan(..., const int blockSize)` +- `Efficient::scan(..., const int blockSize)` +- `Efficient::compact(..., const int blockSize)` + +These changes are only used by `measure_time.exe` and do not affect existing code paths. + +## Part 1: CPU Scan & Stream Compaction + +In [`cpu.h`](stream_compaction/cpu.h) and [`cpu.cu`](stream_compaction/cpu.cu): + +- `scan()`: Computes an exclusive prefix sum using a simple `for` loop. +- `compactWithoutScan()`: Performs stream compaction directly with a `for` loop, without calling `scan()`. +- `compactWithScan()`: Implements stream compaction using map → scan → scatter. While it follows the structure of a parallel implementation, it is built entirely with `for` loops. + +## Part 2: Naive GPU Scan Algorithm + +In [`naive.h`](stream_compaction/naive.h) and [`naive.cu`](stream_compaction/naive.cu): + +- `scan()`: Implements the naive algorithm from GPU Gems 3, Section 39.2.1, with the following differences: + - Uses only global memory (does not leverage shared memory). + - Launches one kernel per level, plus an additional kernel at the end to shift the results, rather than fusing the entire algorithm into a single kernel. + +## Part 3: Work-Efficient GPU Scan & Stream Compaction + +### 3.1. Scan + +In [`efficient.h`](stream_compaction/efficient.h) and [`efficient.cu`](stream_compaction/efficient.cu): + +- `scan()`: Implements the work-efficient algorithm from GPU Gems 3, Section 39.2.2, with the following differences: + - Uses only global memory (does not leverage shared memory). + - Launches one kernel per up-sweep/down-sweep level, rather than a single fused kernel. + - Saves results in place instead of out-of-place. +- Added `scanImpl()`, which operates directly on device arrays. This avoids the CPU buffer interface exposed by `scan()`, making it easier to integrate with CUDA code. + +### 3.2. Stream Compaction + +In [`common.h`](stream_compaction/common.h) and [`common.cu`](stream_compaction/common.cu): + +- `kernMapToBoolean()` A CUDA kernel that maps each integer to 0 or 1, depending on whether the value is zero. +- `kernScatter()`: A CUDA kernel that performs the scatter operation with vector addressing. Conditioned on a boolean array, it optionally stores elements at locations specified by an index array. + +In [`efficient.h`](stream_compaction/efficient.h) and [`efficient.cu`](stream_compaction/efficient.cu): + +- `compact()`: Implements stream compaction on GPU using map (via `kernMapToBoolean()`) → scan (via `scanImpl()`) → scatter (via `kernScatter()`). + +## Part 4: Using Thrust's Implementation + +In [`thrust.h`](stream_compaction/thrust.h) and [`thrust.cu`](stream_compaction/thrust.cu): + +- `scan()`: Wraps `thrust::exclusive_scan()`, adding timing instrumentation and exposing the same API as the other implementations. +- Profiling revealed that beyond a certain input size, `thrust::exclusive_scan()` switches to an algorithm that allocates GPU memory internally. These allocations introduced significant variability in benchmarking, at times making Thrust appear slower than even the naive implementation. To address this, I created a reusable memory pool and invoked the overload of `exclusive_scan` that accepts an execution policy configured with that pool. This allows the benchmark to perform a warm-up run (which handles any necessary allocations) and then time the second run, which reuses the existing memory without additional allocations. + +## Part 5: Why is My GPU Approach So Slow? + +I believe Part 3.1 already incorporates the optimizations described in the instructions: + +- For an up-sweep or down-sweep level with a given `offset`, only elements at indices `j = (i + 1) * offset * 2 - 1`, where `i` is any integer and `0 ≤ j < n`, require processing. All other elements can be skipped. +- Instead of mapping each kernel thread to a `j`, we map each thread directly to an `i`. This ensures that nearly all launched threads perform useful work. +- As a result, fewer threads need to be launched. For a given `block_size`, the number of required blocks becomes +`ceil(n / (2 * offset * block_size))`. + +## Part 6: Extra Credit + +### Extra Credit 1: Radix Sort + +Radix sort is implemented in `radix_sort.cu`. It uses the `efficient_plus.cu` scan developed for extra credit 2, along with several other utility kernels. The algorithm processes the input data one bit at a time, from the least significant to the most significant. + +For each bit, the process is as follows: + +1. Extract Bits: The specific bit is extracted from each integer. +2. Scan for Indices: A parallel scan is performed on these extracted bits to determine the correct destination index for each element. +3. Count Deduction: The last element of the scanned output and the last element of the input are used to quickly deduce the counts of both zeros and ones. +4. Stable Scatter: A stable scatter operation then moves the elements to their new, sorted positions based on the calculated indices, preserving the relative order of elements with the same bit value. + +After iterating through all the bits, the data is fully sorted. The correctness of the GPU implementation is verified in `main.cpp` by comparing its output against a baseline CPU sorting algorithm implemented in `cpu_sort.cu`. The results confirm that the implementation is accurate. + +### Extra Credit 2: GPU Scan Using Shared Memory && Hardware Optimization + +To improve the work-efficient implementation, I developed the work-efficient plus variant in [`efficient_plus.h`](stream_compaction/efficient_plus.h) and [`efficient_plus.cu`](stream_compaction/efficient_plus.cu). Key aspects of this design include: + +- Kernel fusing: The up-sweep and down-sweep phases are fused into a single kernel invocation. Although compacting indices no longer reduces the number of blocks, this approach minimizes divergence and is therefore retained. +- Shared memory usage: Instead of operating directly on global memory, data is first copied into shared memory, scanned per block, and then copied back to global memory. +- Recursive tiling: Since a block can only process a limited number of elements, the algorithm is made recursive. + - The array is partitioned into tiles, one per block. + - In addition to performing the scan, each block writes its tile sum into a new array. + - An exclusive scan is recursively applied to this array of tile sums, and the results are then added back to the input data. +- Avoiding shared memory bank conflicts: Shared memory indexing is padded following the method described in GPU Gems. This introduces only a small increase in memory usage. +- Preallocating the recursion buffer: To avoid repeated allocation of temporary GPU buffers for tile sums, the recursion depth and total required storage are precomputed. A single contiguous GPU buffer is allocated and reused across all recursion levels. +- Multiple elements per thread: As an experiment, I added support for processing multiple elements per thread, controlled by the compile-time template parameter `ElementsPerThread`. Different variants are dispatched at runtime, and loops over per-thread elements are unrolled with `#pragma unroll`. In practice, this optimization was not beneficial; the tuned configuration still uses one element per thread. +- Shrinking block size in the final recursion: In the last recursion step, the input is often much smaller than the predefined block size. As a niche optimization, the block size can be reduced to the smallest power of two that accommodates the input, with shared memory usage adjusted accordingly. However, testing showed no measurable performance improvement from this change. + +## Part 7: Write-up + +Project description: see the [Features](#features) section at the top. + +### Performance Analysis + +#### `measure_time.exe` + +To simplify performance analysis, I added a C++ executable, `measure_time.exe`. The implementation is in [`measure_time.cpp`](src/measure_time.cpp), which: + +- Accepts the operation (scan or compact), implementation (CPU, GPU naive, GPU work-efficient, or GPU Thrust), input size, block size, and number of elements per thread as command-line arguments. +- Generates random input data and prints the measured execution time (in milliseconds) to the console. + +I created this tool because measuring a configuration only once is often imprecise. In my earlier attempts, running repeated measurements within a C++ loop caused the results to drift significantly. In particular, Thrust measurements became unexpectedly slower, sometimes even slower than the GPU naive implementation. I suspect this was due to frequent GPU memory allocations and deallocations (since the exposed API uses CPU inputs and outputs), which created an atypical workload and put the driver in a degraded performance state. + +To avoid this issue, I designed `measure_time.exe` to test only a single configuration with one iteration per program launch. Repeated measurements are instead automated by accompanying Python scripts. + +#### Optimizing Parameters + +To optimize parameters such as block sizes and the number of elements per thread, I created a Python script, [`tune_parameters.py`](scripts/tune_parameters.py). + +- Because optimal block sizes vary with input size, tests are conducted on a fixed scale of $2^{22}$, using both a power-of-two input ($2^{22}$) and a non-power-of-two input ($2^{22} - 3$). The results indicate that the distinction between power-of-two and non-power-of-two input sizes has minimal impact on performance. Therefore, this factor will not be considered further in the discussion. +- Block sizes are sampled over a log-spaced range: 8, 16, …, 512, 1024. +- The number of processed elements per thread is one of 1, 2, 4, 8, and 16. +- Each configuration is executed 10 times, with the mean runtime recorded. +- The final selection balances performance across both power-of-two and non-power-of-two cases. + +From these experiments, the chosen defaults are: + +- Block size 256 for the naive scan +- Block size 64 for the work-efficient scan and compaction +- Block size 256 for the work-efficient plus (extra credit 2) scan, with 1 element per thread + +#### Performance Comparison + +To systematically collect execution time data across many configurations, I created a Python script, [`measure_performance.py`](scripts/measure_performance.py). This script relies on the previously described `measure_time.exe` to benchmark runtime across a range of input sizes and implementations. The procedure is as follows: + +- Input sizes are tested from $2^4$ up to $2^{27}$, using both exact powers of two and non-powers of two ($2^i - 3$ for each $i$) to capture different performance behaviors. +- The number of elements processed per thread is selected from the set `{1, 2, 4, 8, 16}`. +- Each configuration is executed 10 times, and the mean runtime is recorded. +- Results are stored in a JSON file ([`performance.json`](scripts/performance.json)). + +The JSON data is then processed by another script, [`plot_performance.py`](scripts/plot_performance.py), which generates the figures shown in this report. + +The first figure presents the full dataset: + +![performance comparison](img/performance.png) + +Because non-power-of-two inputs produce jagged trends, and CPU performance scales on a very different range than GPU performance, a second figure was generated using only GPU data and power-of-two inputs: + +![performance comparison GPU-only](img/performance-gpu-only.png) + +### Observation and Analysis + +#### CPU Implementation + +The minimal for-loop CPU implementation consistently demonstrates $O(N)$ complexity across all tested input sizes, as confirmed by the linear trend in the log-log plot. This complexity holds regardless of whether the limiting factor is compute or memory. Regarding the bottleneck: + +- The compute workload is very light, since addition operations are fast. +- The memory access pattern is highly favorable, as all reads and writes are sequential. + +Although memory is typically slower than arithmetic operations—suggesting the implementation may be slightly memory-bound—the distinction is not critical here. + +Importantly, this simple, low-overhead implementation outperforms the GPU variants (naive, work-efficient, and Thrust) for inputs up to about $2^{17}$. The likely reason is the absence of GPU kernel launch overhead, which allows the CPU to handle small and mid-sized inputs more efficiently. + +#### Naive & Work-Efficient GPU Implementations + +Common characteristics: + +- For $N < 2^{20}$, both implementations show limited sensitivity to input size. This is likely because the $O(\log N)$ kernel launches dominate execution time, while the work per kernel remains relatively small. Additionally, the input sizes may be too small to fully utilize GPU resources. In this regime, the bottleneck is neither compute nor memory, but the overhead of repeated kernel launches. +- Beyond $2^{20}$, execution time increases rapidly with $N$, indicating saturation of a GPU resource. Given that both implementations rely heavily on global memory, the performance bottleneck is most likely memory I/O rather than computation. + +Comparison of the two: + +- For $N < 2^{23}$, the so-called work-efficient implementation is actually slower than the naive version. This can be attributed to kernel launch overhead: the up-sweep and down-sweep phases double the number of kernel invocations. Although the total work is reduced, the benefit is negligible at these smaller sizes. +- For larger $N$, the work-efficient method begins to outperform the naive implementation, and the gap widens quickly. This is because the work-efficient algorithm performs only $O(N)$ total operations, whereas the naive approach requires $O(N \log N)$. + +#### Work-Efficient Plus GPU Implementation (Extra Credit 2) + +- Outperforms both the naive and work-efficient versions even for very small $N$, likely due to requiring fewer kernel launches. +- For larger inputs, achieves nearly an order of magnitude speedup over the previous two approaches, primarily from reduced global memory traffic and greater reliance on shared memory. +- Despite these gains, Nsight analysis later shows that the implementation remains memory-bound. + +![Work-efficient plus Nsight Systems](img/efficient-plus-nsys.png) + +From the Nsight Systems timeline, execution begins with three `scanPerBlock` calls followed by three `addSums`. In the second and third recursion levels, the input size is so small that their runtimes are negligible. Nearly all of the execution time is spent in the first recursion level. + +![Work-efficient plus scanPerBlock Nsight Compute](img/efficient-plus-scan-per-block-ncu.png) + +![Work-efficient plus addSums Nsight Compute](img/efficient-plus-add-sums-ncu.png) + +In Nsight Compute, examining the first recursion’s `scanPerBlock` shows a moderate SM throughput of about 53%. The kernel is clearly memory-bound, with memory and DRAM throughput reaching 92%, indicating that global memory reads and writes still dominate execution time. + +In contrast, the first recursion's `addSums` achieves only 8% SM throughput, demonstrating that its performance is heavily constrained by global memory access. Under the current implementation, this behavior is expected and difficult to improve further without a major overhaul. + +#### Thrust GPU Implementation + +Thrust’s `exclusive_scan()` delivers strong performance across all input sizes: + +- For small $N$, the runtime is comparable to the optimized work-efficient plus implementation. +- For large $N$, Thrust is roughly 2x faster. However, it is essential to warm up once before timing, so that internal GPU buffer allocations are excluded; otherwise, the runtime exhibits a sudden step increase after roughly $N = 2^{19}$. + +![Thrust Nsight Systems](img/thrust-nsys.png) + +From the timeline in Nsight Systems, execution begins with a lightweight `DeviceScanInitKernel`, followed by a single `DeviceScanKernel` that dominates the runtime. It is notable that the main computation is handled almost entirely within a single kernel launch. + +![Thrust Nsight Compute](img/thrust-ncu.png) + +In Nsight Compute, the `DeviceScanKernel` shows an SM throughput of 19%, lower than that of my work-efficient plus implementation, while memory and DRAM throughput remain similarly high at 93%. The high memory throughput reflects the inherent nature of the task. The lower SM throughput does not imply that Thrust’s implementation is less efficient. On the contrary, it likely reduces the total amount of computation through careful optimizations, with the lower SM utilization being a byproduct of that efficiency. + +#### Outputs of `cis5650_stream_compaction_test.exe` + +```text +**************** +** SCAN TESTS ** +**************** + [ 13 35 9 0 27 0 9 16 22 8 47 34 47 ... 12 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 1.5831ms (std::chrono Measured) + [ 0 13 48 57 57 84 84 93 109 131 139 186 220 ... 102754952 102754964 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 1.4894ms (std::chrono Measured) + [ 0 13 48 57 57 84 84 93 109 131 139 186 220 ... 102754881 102754896 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.499936ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.336768ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.865184ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.569664ms (CUDA Measured) + passed +==== work-efficient plus scan, power-of-two ==== + elapsed time: 0.472736ms (CUDA Measured) + passed +==== work-efficient plus scan, non-power-of-two ==== + elapsed time: 0.07552ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.277952ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.04608ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 1 3 1 0 1 0 1 0 0 0 1 0 1 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 8.2601ms (std::chrono Measured) + [ 1 3 1 1 1 1 1 3 2 2 3 2 2 ... 1 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 8.5801ms (std::chrono Measured) + [ 1 3 1 1 1 1 1 3 2 2 3 2 2 ... 1 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 14.8527ms (std::chrono Measured) + [ 1 3 1 1 1 1 1 3 2 2 3 2 2 ... 1 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.495744ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.53136ms (CUDA Measured) + passed + +******************* +** SORTING TESTS ** +******************* + [ 4113 24235 21309 2900 15577 2400 30009 8016 29272 30408 23697 20184 22097 ... 4812 0 ] +==== cpu sort, power-of-two ==== + elapsed time: 189.038ms (std::chrono Measured) +==== gpu radix sort, power-of-two ==== + elapsed time: 3.96045ms (CUDA Measured) + passed +==== cpu sort, non-power-of-two ==== + elapsed time: 191.916ms (std::chrono Measured) +==== gpu radix sort, non-power-of-two ==== + elapsed time: 4.00803ms (CUDA Measured) + passed +``` + +#### Outputs of `tune_parameters.py` + +```text +Naive scan, power-of-two +Block size: 8, time: 6.1931840430000005 ms +Block size: 16, time: 3.160015989 ms +Block size: 32, time: 1.6289984100000001 ms +Block size: 64, time: 0.8754815995000002 ms +Block size: 128, time: 0.5630367994000001 ms +Block size: 256, time: 0.5268575905 ms +Block size: 512, time: 0.5707488000000001 ms +Block size: 1024, time: 0.4913184017000001 ms +Optimal block size: 1024, time: 0.4913184017000001 ms +======================================== +Naive scan, non-power-of-two +Block size: 8, time: 6.197379209 ms +Block size: 16, time: 3.1488416440000004 ms +Block size: 32, time: 1.6155263910000002 ms +Block size: 64, time: 0.8565600038000001 ms +Block size: 128, time: 0.5647936076 ms +Block size: 256, time: 0.45924159600000003 ms +Block size: 512, time: 0.5508479952 ms +Block size: 1024, time: 0.4998720078999999 ms +Optimal block size: 256, time: 0.45924159600000003 ms +======================================== +Work-efficient scan, power-of-two +Block size: 8, time: 1.2334048075 ms +Block size: 16, time: 0.9746495964999999 ms +Block size: 32, time: 1.0865504112999997 ms +Block size: 64, time: 0.7621375807999999 ms +Block size: 128, time: 0.8848479986000001 ms +Block size: 256, time: 0.7751007915 ms +Block size: 512, time: 1.2780031998999999 ms +Block size: 1024, time: 0.9169344123000001 ms +Optimal block size: 64, time: 0.7621375807999999 ms +======================================== +Work-efficient scan, non-power-of-two +Block size: 8, time: 1.6846079939999998 ms +Block size: 16, time: 1.1247648052 ms +Block size: 32, time: 0.9946559961999999 ms +Block size: 64, time: 1.1106271928 ms +Block size: 128, time: 0.7899712115 ms +Block size: 256, time: 0.8548672105999999 ms +Block size: 512, time: 0.8211200056000001 ms +Block size: 1024, time: 0.3766912013 ms +Optimal block size: 1024, time: 0.3766912013 ms +======================================== +Work-efficient compact, power-of-two +Block size: 8, time: 1.320752002 ms +Block size: 16, time: 0.9775711956 ms +Block size: 32, time: 0.9180895992 ms +Block size: 64, time: 1.2212512094 ms +Block size: 128, time: 0.8028352011 ms +Block size: 256, time: 1.337366404 ms +Block size: 512, time: 1.0536224006 ms +Block size: 1024, time: 0.9021439969000001 ms +Optimal block size: 128, time: 0.8028352011 ms +======================================== +Work-efficient compact, non-power-of-two +Block size: 8, time: 1.819353578 ms +Block size: 16, time: 1.3313984046000003 ms +Block size: 32, time: 1.2955423931000003 ms +Block size: 64, time: 1.0542688068000001 ms +Block size: 128, time: 1.2601408064000004 ms +Block size: 256, time: 1.2192768095 ms +Block size: 512, time: 1.1920671929000002 ms +Block size: 1024, time: 1.3155871931 ms +Optimal block size: 64, time: 1.0542688068000001 ms +======================================== +Work-efficient plus scan, power-of-two +Block size: 8, elements per thread: 1, time: 0.35677120390000006 ms +Block size: 8, elements per thread: 2, time: 0.2461855993 ms +Block size: 8, elements per thread: 4, time: 0.20586879839999997 ms +Block size: 8, elements per thread: 8, time: 0.2564895988 ms +Block size: 8, elements per thread: 16, time: 0.29693120419999997 ms +Block size: 16, elements per thread: 1, time: 0.2679104015 ms +Block size: 16, elements per thread: 2, time: 0.152214399 ms +Block size: 16, elements per thread: 4, time: 0.199580802 ms +Block size: 16, elements per thread: 8, time: 0.2125504002 ms +Block size: 16, elements per thread: 16, time: 0.2828671993 ms +Block size: 32, elements per thread: 1, time: 0.16852159649999998 ms +Block size: 32, elements per thread: 2, time: 0.1681056015 ms +Block size: 32, elements per thread: 4, time: 0.1758399994 ms +Block size: 32, elements per thread: 8, time: 0.21814079869999997 ms +Block size: 32, elements per thread: 16, time: 0.3729472012 ms +Block size: 64, elements per thread: 1, time: 0.16515839977999996 ms +Block size: 64, elements per thread: 2, time: 0.15373440083 ms +Block size: 64, elements per thread: 4, time: 0.1847008006 ms +Block size: 64, elements per thread: 8, time: 0.22609280039999996 ms +Block size: 64, elements per thread: 16, time: 0.3613759995 ms +Block size: 128, elements per thread: 1, time: 0.13827199933000003 ms +Block size: 128, elements per thread: 2, time: 0.12009920111999998 ms +Block size: 128, elements per thread: 4, time: 0.23142399940000002 ms +Block size: 128, elements per thread: 8, time: 0.2426623969 ms +Block size: 128, elements per thread: 16, time: 0.3689568042 ms +Block size: 256, elements per thread: 1, time: 0.1315040014 ms +Block size: 256, elements per thread: 2, time: 0.14512320169999998 ms +Block size: 256, elements per thread: 4, time: 0.1842624008 ms +Block size: 256, elements per thread: 8, time: 0.24464000160000002 ms +Block size: 256, elements per thread: 16, time: 0.36462400240000004 ms +Block size: 512, elements per thread: 1, time: 0.14869759980000002 ms +Block size: 512, elements per thread: 2, time: 0.1295744002 ms +Block size: 512, elements per thread: 4, time: 0.19439359899999997 ms +Block size: 512, elements per thread: 8, time: 0.2625504031 ms +Block size: 512, elements per thread: 16 - crashed +Block size: 1024, elements per thread: 1, time: 0.12309759919999999 ms +Block size: 1024, elements per thread: 2, time: 0.1486623989 ms +Block size: 1024, elements per thread: 4, time: 0.2266975983 ms +Block size: 1024, elements per thread: 8 - crashed +Block size: 1024, elements per thread: 16 - crashed +Optimal config: block size 128, elements per thread 2, time: 0.12009920111999998 ms +======================================== +Work-efficient plus scan, non-power-of-two +Block size: 8, elements per thread: 1, time: 0.3583456037 ms +Block size: 8, elements per thread: 2, time: 0.25230080190000004 ms +Block size: 8, elements per thread: 4, time: 0.21565439989999996 ms +Block size: 8, elements per thread: 8, time: 0.24966079579999997 ms +Block size: 8, elements per thread: 16, time: 0.29572479430000004 ms +Block size: 16, elements per thread: 1, time: 0.2169760003 ms +Block size: 16, elements per thread: 2, time: 0.1610751987 ms +Block size: 16, elements per thread: 4, time: 0.1980544014 ms +Block size: 16, elements per thread: 8, time: 0.21126080009999998 ms +Block size: 16, elements per thread: 16, time: 0.2852992027 ms +Block size: 32, elements per thread: 1, time: 0.1461216008 ms +Block size: 32, elements per thread: 2, time: 0.15881919949999998 ms +Block size: 32, elements per thread: 4, time: 0.1823200002 ms +Block size: 32, elements per thread: 8, time: 0.2195231989 ms +Block size: 32, elements per thread: 16, time: 0.376639995 ms +Block size: 64, elements per thread: 1, time: 0.15102400022 ms +Block size: 64, elements per thread: 2, time: 0.13811200266 ms +Block size: 64, elements per thread: 4, time: 0.1730176017 ms +Block size: 64, elements per thread: 8, time: 0.217846398 ms +Block size: 64, elements per thread: 16, time: 0.3569279999 ms +Block size: 128, elements per thread: 1, time: 0.14728640024 ms +Block size: 128, elements per thread: 2, time: 0.13456960240000002 ms +Block size: 128, elements per thread: 4, time: 0.1838400023 ms +Block size: 128, elements per thread: 8, time: 0.2354975983 ms +Block size: 128, elements per thread: 16, time: 0.35950399940000005 ms +Block size: 256, elements per thread: 1, time: 0.11884800039000001 ms +Block size: 256, elements per thread: 2, time: 0.12683519940000001 ms +Block size: 256, elements per thread: 4, time: 0.1796159998 ms +Block size: 256, elements per thread: 8, time: 0.2481792004 ms +Block size: 256, elements per thread: 16, time: 0.3609760016 ms +Block size: 512, elements per thread: 1, time: 0.11134720002000001 ms +Block size: 512, elements per thread: 2, time: 0.11812160169 ms +Block size: 512, elements per thread: 4, time: 0.2151135998 ms +Block size: 512, elements per thread: 8, time: 0.258998403 ms +Block size: 512, elements per thread: 16 - crashed +Block size: 1024, elements per thread: 1, time: 0.11879039996999999 ms +Block size: 1024, elements per thread: 2, time: 0.15765759950000002 ms +Block size: 1024, elements per thread: 4, time: 0.22839040019999998 ms +Block size: 1024, elements per thread: 8 - crashed +Block size: 1024, elements per thread: 16 - crashed +Optimal config: block size 512, elements per thread 1, time: 0.11134720002000001 ms +``` diff --git a/img/efficient-plus-add-sums-ncu.png b/img/efficient-plus-add-sums-ncu.png new file mode 100644 index 00000000..fce97261 Binary files /dev/null and b/img/efficient-plus-add-sums-ncu.png differ diff --git a/img/efficient-plus-nsys.png b/img/efficient-plus-nsys.png new file mode 100644 index 00000000..cfffa5d9 Binary files /dev/null and b/img/efficient-plus-nsys.png differ diff --git a/img/efficient-plus-scan-per-block-ncu.png b/img/efficient-plus-scan-per-block-ncu.png new file mode 100644 index 00000000..da2c429d Binary files /dev/null and b/img/efficient-plus-scan-per-block-ncu.png differ diff --git a/img/performance-gpu-only.png b/img/performance-gpu-only.png new file mode 100644 index 00000000..0e7ad6df Binary files /dev/null and b/img/performance-gpu-only.png differ diff --git a/img/performance.png b/img/performance.png new file mode 100644 index 00000000..29da4290 Binary files /dev/null and b/img/performance.png differ diff --git a/img/thrust-ncu.png b/img/thrust-ncu.png new file mode 100644 index 00000000..eba9c7fd Binary files /dev/null and b/img/thrust-ncu.png differ diff --git a/img/thrust-nsys.png b/img/thrust-nsys.png new file mode 100644 index 00000000..9b4db104 Binary files /dev/null and b/img/thrust-nsys.png differ diff --git a/scripts/.gitignore b/scripts/.gitignore new file mode 100644 index 00000000..bee8a64b --- /dev/null +++ b/scripts/.gitignore @@ -0,0 +1 @@ +__pycache__ diff --git a/scripts/helper.py b/scripts/helper.py new file mode 100644 index 00000000..858a9120 --- /dev/null +++ b/scripts/helper.py @@ -0,0 +1,32 @@ +import subprocess +from pathlib import Path +from typing import Literal + +ROOT_DIR = Path(__file__).parent.parent.absolute() +MEASURE_TIME_EXE = ROOT_DIR / "build" / "bin" / "Release" / "measure_time.exe" +assert MEASURE_TIME_EXE.is_file() + + +def measure_time( + operation: Literal["scan", "compact"], + implementation: Literal["cpu", "naive", "efficient", "thrust"], + input_size: int, + block_size: int = -1, + elements_per_thread: int = -1, +) -> float: + result = subprocess.run( + [ + str(MEASURE_TIME_EXE), + operation, + implementation, + str(input_size), + str(block_size), + str(elements_per_thread), + ], + capture_output=True, + cwd=ROOT_DIR, + check=True, + encoding="utf-8", + text=True, + ) + return float(result.stdout) diff --git a/scripts/measure_performance.py b/scripts/measure_performance.py new file mode 100644 index 00000000..15320179 --- /dev/null +++ b/scripts/measure_performance.py @@ -0,0 +1,53 @@ +import json +from pathlib import Path + +from helper import measure_time + + +def main() -> None: + performance_json_path = Path(__file__).parent / "performance.json" + if performance_json_path.exists(): + with performance_json_path.open(encoding="utf-8") as f: + performance_data = json.load(f) + else: + performance_data = [] + + def find_existing_entry( + operation: str, implementation: str, input_size: int + ) -> dict | None: + for entry in performance_data: + if ( + entry["operation"] == operation + and entry["implementation"] == implementation + and entry["input_size"] == input_size + ): + return entry + return None + + for implementation in ["cpu", "naive", "efficient", "efficient_plus", "thrust"]: + for exponent in range(4, 28): + base_input_size = 1 << exponent + for input_size in [base_input_size, base_input_size - 3]: + config = { + "operation": "scan", + "implementation": implementation, + "input_size": input_size, + } + if find_existing_entry(**config) is not None: + print(f"Skipping existing entry: {config}") + continue + + time_sum = 0.0 + time_count = 10 + for _ in range(time_count): + time_sum += measure_time("scan", implementation, input_size) + time = time_sum / time_count + print(f"Measured: {config}, time: {time} ms") + performance_data.append({**config, "time": time}) + + with performance_json_path.open("w", encoding="utf-8") as f: + json.dump(performance_data, f, indent=4) + + +if __name__ == "__main__": + main() diff --git a/scripts/performance.json b/scripts/performance.json new file mode 100644 index 00000000..58da40b5 --- /dev/null +++ b/scripts/performance.json @@ -0,0 +1,1442 @@ +[ + { + "operation": "scan", + "implementation": "cpu", + "input_size": 16, + "time": 0.00025000000105000004 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 13, + "time": 0.00013000000105000003 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 32, + "time": 0.00030000000560000004 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 29, + "time": 0.00024000000130000002 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 64, + "time": 0.00014999999840000001 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 61, + "time": 0.00030000000775000005 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 128, + "time": 0.0003300000039 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 125, + "time": 0.00029000000715 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 256, + "time": 0.0004299999958 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 253, + "time": 0.0005599999852500001 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 512, + "time": 0.0007300000072000001 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 509, + "time": 0.0005600000028000001 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 1024, + "time": 0.0024200000357 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 1021, + "time": 0.0011600000066 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 2048, + "time": 0.0015800000105999999 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 2045, + "time": 0.0017299999776 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 4096, + "time": 0.0027700000429999997 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 4093, + "time": 0.003739999969 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 8192, + "time": 0.012350000162 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 8189, + "time": 0.005109999953999999 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 16384, + "time": 0.017919999749 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 16381, + "time": 0.009520000170000001 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 32768, + "time": 0.019830000074 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 32765, + "time": 0.029069999748999997 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 65536, + "time": 0.03700999972 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 65533, + "time": 0.04673999977 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 131072, + "time": 0.05842999929 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 131069, + "time": 0.06685000047999999 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 262144, + "time": 0.11926000113999999 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 262141, + "time": 0.11873999900000001 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 524288, + "time": 0.22090999779999998 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 524285, + "time": 0.2270400003 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 1048576, + "time": 0.46264000250000004 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 1048573, + "time": 0.44220999469999994 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 2097152, + "time": 0.8150799991 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 2097149, + "time": 0.8720100045000001 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 4194304, + "time": 1.7742799980000001 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 4194301, + "time": 1.8275800110000002 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 8388608, + "time": 3.4640300269999997 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 8388605, + "time": 3.362529993 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 16777216, + "time": 6.993960047000002 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 16777213, + "time": 7.115759993 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 33554432, + "time": 13.783430099999999 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 33554429, + "time": 13.915969850000002 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 67108864, + "time": 27.28339006 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 67108861, + "time": 27.26784 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 134217728, + "time": 53.63389968 + }, + { + "operation": "scan", + "implementation": "cpu", + "input_size": 134217725, + "time": 55.663139720000004 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 16, + "time": 0.043801599940000005 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 13, + "time": 0.05164160099 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 32, + "time": 0.047331199780000006 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 29, + "time": 0.04742400105999999 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 64, + "time": 0.048412799810000004 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 61, + "time": 0.059200000190000004 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 128, + "time": 0.03954880014000001 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 125, + "time": 0.06078080121 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 256, + "time": 0.06831680054 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 253, + "time": 0.06890880058 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 512, + "time": 0.10325119943000001 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 509, + "time": 0.08137280046999999 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 1024, + "time": 0.07716160044 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 1021, + "time": 0.06329279951000001 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 2048, + "time": 0.07493120021 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 2045, + "time": 0.09300480001 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 4096, + "time": 0.08738880011000001 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 4093, + "time": 0.08106879847000001 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 8192, + "time": 0.08349440022 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 8189, + "time": 0.09429760018000001 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 16384, + "time": 0.10271359978000001 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 16381, + "time": 0.09546559974999998 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 32768, + "time": 0.08381439968999999 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 32765, + "time": 0.0977536009 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 65536, + "time": 0.10394560105999999 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 65533, + "time": 0.09715839993 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 131072, + "time": 0.11108160102999998 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 131069, + "time": 0.1361344003 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 262144, + "time": 0.12880639953 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 262141, + "time": 0.11512959958999998 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 524288, + "time": 0.17975679940000003 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 524285, + "time": 0.1614495971 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 1048576, + "time": 0.2007328017 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 1048573, + "time": 0.21642560079999998 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 2097152, + "time": 0.1942719981 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 2097149, + "time": 0.20537920299999995 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 4194304, + "time": 0.32504640230000004 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 4194301, + "time": 0.32316159599999994 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 8388608, + "time": 0.559580797 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 8388605, + "time": 0.5589184045000001 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 16777216, + "time": 3.685180783 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 16777213, + "time": 3.6830688 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 33554432, + "time": 7.8179743770000005 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 33554429, + "time": 7.818601607 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 67108864, + "time": 16.506505759999996 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 67108861, + "time": 16.508195090000005 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 134217728, + "time": 34.762250130000005 + }, + { + "operation": "scan", + "implementation": "naive", + "input_size": 134217725, + "time": 34.76191711 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 16, + "time": 0.09160000128 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 13, + "time": 0.09594559843 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 32, + "time": 0.10794239906 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 29, + "time": 0.08448640066999999 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 64, + "time": 0.10183679833 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 61, + "time": 0.12498239879999999 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 128, + "time": 0.08773759894000002 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 125, + "time": 0.17651199781 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 256, + "time": 0.11273279790999999 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 253, + "time": 0.15140480018 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 512, + "time": 0.15328000020000002 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 509, + "time": 0.15540160099 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 1024, + "time": 0.1757888012 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 1021, + "time": 0.1287103996 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 2048, + "time": 0.19697279997 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 2045, + "time": 0.16180160187 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 4096, + "time": 0.3186911985 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 4093, + "time": 0.186617604 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 8192, + "time": 0.17923519920000003 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 8189, + "time": 0.2133696005 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 16384, + "time": 0.19736960299999998 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 16381, + "time": 0.30214719619999997 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 32768, + "time": 0.330947201 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 32765, + "time": 0.2578656003 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 65536, + "time": 0.2784255951 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 65533, + "time": 0.2382143975 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 131072, + "time": 0.25891199849999996 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 131069, + "time": 0.23974400020000003 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 262144, + "time": 0.47313919689999995 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 262141, + "time": 0.5214911980999999 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 524288, + "time": 0.2494303988 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 524285, + "time": 0.2424511984 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 1048576, + "time": 0.30882879620000003 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 1048573, + "time": 0.25613119759999997 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 2097152, + "time": 0.33248960369999997 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 2097149, + "time": 0.3335808009 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 4194304, + "time": 0.405337599 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 4194301, + "time": 0.4363199979 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 8388608, + "time": 0.6022176025999999 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 8388605, + "time": 0.5778335990000001 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 16777216, + "time": 0.9064192056999998 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 16777213, + "time": 0.9025983930000001 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 33554432, + "time": 3.1401983509999996 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 33554429, + "time": 3.1271168240000002 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 67108864, + "time": 6.42243519 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 67108861, + "time": 6.411436843 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 134217728, + "time": 13.032940759999999 + }, + { + "operation": "scan", + "implementation": "efficient", + "input_size": 134217725, + "time": 13.015532870000001 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 16, + "time": 0.016774400140000003 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 13, + "time": 0.01680960015 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 32, + "time": 0.022912000020000003 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 29, + "time": 0.02389120051 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 64, + "time": 0.02816960039 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 61, + "time": 0.018041599910000004 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 128, + "time": 0.02915520016 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 125, + "time": 0.020166399980000003 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 256, + "time": 0.0197312 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 253, + "time": 0.026326399660000006 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 512, + "time": 0.023401600150000004 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 509, + "time": 0.02170880007 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 1024, + "time": 0.03136959971 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 1021, + "time": 0.032143999830000006 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 2048, + "time": 0.04942400044 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 2045, + "time": 0.03144320007 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 4096, + "time": 0.05921599921 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 4093, + "time": 0.04074880038 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 8192, + "time": 0.028803200270000006 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 8189, + "time": 0.0353248002 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 16384, + "time": 0.0542432001 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 16381, + "time": 0.039516800090000005 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 32768, + "time": 0.03502400052 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 32765, + "time": 0.05661119895000001 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 65536, + "time": 0.04257600007000001 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 65533, + "time": 0.03187520002 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 131072, + "time": 0.03140159939 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 131069, + "time": 0.04689599945999999 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 262144, + "time": 0.03357119970999999 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 262141, + "time": 0.0458432002 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 524288, + "time": 0.05233599991 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 524285, + "time": 0.045827199519999993 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 1048576, + "time": 0.0480736006 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 1048573, + "time": 0.04503359979 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 2097152, + "time": 0.05534399972 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 2097149, + "time": 0.05462400019999999 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 4194304, + "time": 0.07412160112999999 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 4194301, + "time": 0.07647680043999999 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 8388608, + "time": 0.114435201 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 8388605, + "time": 0.1118656003 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 16777216, + "time": 0.18965120000000002 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 16777213, + "time": 0.1896639975 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 33554432, + "time": 0.5722016038000001 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 33554429, + "time": 0.5712223946 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 67108864, + "time": 1.1539296169999997 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 67108861, + "time": 1.154291202 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 134217728, + "time": 2.316012813 + }, + { + "operation": "scan", + "implementation": "efficient_plus", + "input_size": 134217725, + "time": 2.3172320120000003 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 16, + "time": 0.0278495999 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 13, + "time": 0.031961599550000004 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 32, + "time": 0.031990399940000004 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 29, + "time": 0.04058240001 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 64, + "time": 0.044662399990000004 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 61, + "time": 0.027497600019999995 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 128, + "time": 0.035180799670000006 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 125, + "time": 0.03549760022 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 256, + "time": 0.021094400069999998 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 253, + "time": 0.047772799600000006 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 512, + "time": 0.0302335998 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 509, + "time": 0.03101119957 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 1024, + "time": 0.039052799709999995 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 1021, + "time": 0.038742399610000006 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 2048, + "time": 0.027734400139999997 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 2045, + "time": 0.02224000002 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 4096, + "time": 0.02301119956 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 4093, + "time": 0.024169599829999992 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 8192, + "time": 0.02102719992 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 8189, + "time": 0.021628799839999997 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 16384, + "time": 0.02471360044 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 16381, + "time": 0.02351680009 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 32768, + "time": 0.025039999949999996 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 32765, + "time": 0.022559999680000005 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 65536, + "time": 0.02583999989 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 65533, + "time": 0.021139200219999998 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 131072, + "time": 0.02293440011 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 131069, + "time": 0.02241599989 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 262144, + "time": 0.028640000340000006 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 262141, + "time": 0.029644800180000003 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 524288, + "time": 0.027737599989999993 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 524285, + "time": 0.030038399440000002 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 1048576, + "time": 0.03103039991 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 1048573, + "time": 0.03092800007 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 2097152, + "time": 0.036678399890000005 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 2097149, + "time": 0.03083839993 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 4194304, + "time": 0.041769599909999997 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 4194301, + "time": 0.043894399699999996 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 8388608, + "time": 0.050057599689999996 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 8388605, + "time": 0.050815999859999995 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 16777216, + "time": 0.0800863996 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 16777213, + "time": 0.09180160011999998 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 33554432, + "time": 0.2736607999 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 33554429, + "time": 0.27832959879999997 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 67108864, + "time": 0.5728832066 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 67108861, + "time": 0.5628224016 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 134217728, + "time": 1.16590079 + }, + { + "operation": "scan", + "implementation": "thrust", + "input_size": 134217725, + "time": 1.150540817 + } +] \ No newline at end of file diff --git a/scripts/plot_performance.py b/scripts/plot_performance.py new file mode 100644 index 00000000..3ded1b2e --- /dev/null +++ b/scripts/plot_performance.py @@ -0,0 +1,70 @@ +import json +from pathlib import Path + +import numpy as np +from matplotlib import pyplot as plt + + +def is_power_of_two(n: int) -> bool: + return (n & (n - 1)) == 0 and n != 0 + + +def main() -> None: + json_path = Path(__file__).parent / "performance.json" + with json_path.open(encoding="utf-8") as f: + data = json.load(f) + + def plot_and_save( + implementations: list[str], + labels: list[str], + power_of_two_only: bool, + filename: str, + ): + fig, ax = plt.subplots() + for implementation, label in zip(implementations, labels): + n_values = [] + time_values = [] + for entry in data: + if entry["implementation"] == implementation: + if not power_of_two_only or is_power_of_two(entry["input_size"]): + n_values.append(entry["input_size"]) + time_values.append(entry["time"]) + + n_values = np.array(n_values) + time_values = np.array(time_values) + sort_indices = np.argsort(n_values) + n_values = n_values[sort_indices] + time_values = time_values[sort_indices] + ax.plot(n_values, time_values, label=label, marker="o", markersize=4) + + ax.set_xscale("log", base=2) + ax.set_yscale("log", base=10) + ax.grid(True, which="both", ls="--", lw=0.5) + ax.set_xlabel("Input size") + ax.set_ylabel("Time (ms)") + ax.legend() + fig.tight_layout() + fig.savefig(Path(__file__).parent.parent / "img" / filename) + + plot_and_save( + ["naive", "efficient", "efficient_plus", "thrust", "cpu"], + [ + "GPU (naive)", + "GPU (efficient)", + "GPU (work-efficient plus)", + "GPU (Thrust)", + "CPU", + ], + False, + "performance.png", + ) + plot_and_save( + ["naive", "efficient", "efficient_plus", "thrust"], + ["Naive", "Work-efficient", "Work-efficient plus", "Thrust"], + True, + "performance-gpu-only.png", + ) + + +if __name__ == "__main__": + main() diff --git a/scripts/tune_parameters.py b/scripts/tune_parameters.py new file mode 100644 index 00000000..350b308b --- /dev/null +++ b/scripts/tune_parameters.py @@ -0,0 +1,97 @@ +import subprocess + +from helper import measure_time + + +def tune_block_size(operation: str, implementation: str, input_size: int) -> None: + block_sizes = [8, 16, 32, 64, 128, 256, 512, 1024] + best_time = float("inf") + best_block_size = -1 + for block_size in block_sizes: + time_sum = 0.0 + time_count = 10 + for _ in range(time_count): + time_sum += measure_time(operation, implementation, input_size, block_size) + time = time_sum / time_count + print(f"Block size: {block_size}, time: {time} ms") + if time < best_time: + best_time = time + best_block_size = block_size + print(f"Optimal block size: {best_block_size}, time: {best_time} ms") + + +def tune_efficient_plus(input_size: int) -> None: + block_sizes = [8, 16, 32, 64, 128, 256, 512, 1024] + elements_per_thread_values = [1, 2, 4, 8, 16] + best_time = float("inf") + best_config = (-1, -1) + for block_size in block_sizes: + for elements_per_thread in elements_per_thread_values: + time_sum = 0.0 + time_count = 10 + try: + for _ in range(time_count): + time_sum += measure_time( + "scan", + "efficient_plus", + input_size, + block_size, + elements_per_thread, + ) + except subprocess.CalledProcessError: + print( + f"Block size: {block_size}, " + f"elements per thread: {elements_per_thread} " + "- crashed" + ) + continue + time = time_sum / time_count + print( + f"Block size: {block_size}, " + f"elements per thread: {elements_per_thread}, " + f"time: {time} ms" + ) + if time < best_time: + best_time = time + best_config = (block_size, elements_per_thread) + print( + f"Optimal config: block size {best_config[0]}, " + f"elements per thread {best_config[1]}, " + f"time: {best_time} ms" + ) + + +def print_divider() -> None: + print("=" * 40) + + +def main(): + input_size = 1 << 22 + + print("Naive scan, power-of-two") + tune_block_size("scan", "naive", input_size) + print_divider() + print("Naive scan, non-power-of-two") + tune_block_size("scan", "naive", input_size - 3) + print_divider() + print("Work-efficient scan, power-of-two") + tune_block_size("scan", "efficient", input_size) + print_divider() + print("Work-efficient scan, non-power-of-two") + tune_block_size("scan", "efficient", input_size - 3) + print_divider() + print("Work-efficient compact, power-of-two") + tune_block_size("compact", "efficient", input_size) + print_divider() + print("Work-efficient compact, non-power-of-two") + tune_block_size("compact", "efficient", input_size - 3) + print_divider() + print("Work-efficient plus scan, power-of-two") + tune_efficient_plus(input_size) + print_divider() + print("Work-efficient plus scan, non-power-of-two") + tune_efficient_plus(input_size - 3) + + +if __name__ == "__main__": + main() diff --git a/src/main.cpp b/src/main.cpp index 3d5c8820..c2df960b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -8,12 +8,15 @@ #include #include +#include #include #include +#include +#include #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 22; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -81,6 +84,20 @@ int main(int argc, char* argv[]) { //printArray(NPOT, c, true); printCmpResult(NPOT, b, c); + zeroArray(SIZE, c); + printDesc("work-efficient plus scan, power-of-two"); + StreamCompaction::EfficientPlus::scan(SIZE, c, a); + printElapsedTime(StreamCompaction::EfficientPlus::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient plus scan, non-power-of-two"); + StreamCompaction::EfficientPlus::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::EfficientPlus::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); @@ -147,6 +164,43 @@ int main(int argc, char* argv[]) { //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + printf("\n"); + printf("*******************\n"); + printf("** SORTING TESTS **\n"); + printf("*******************\n"); + + genArray(SIZE, a, SIZE); + a[SIZE - 1] = 0; + printArray(SIZE, a, true); + + zeroArray(SIZE, b); + printDesc("cpu sort, power-of-two"); + StreamCompaction::CPUSort::sort(SIZE, b, a); + printElapsedTime(StreamCompaction::CPUSort::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + // printArray(SIZE, b, true); + // printCmpResult(SIZE, b, b); + + zeroArray(SIZE, c); + printDesc("gpu radix sort, power-of-two"); + StreamCompaction::RadixSort::sort(SIZE, c, a); + printElapsedTime(StreamCompaction::RadixSort::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + // printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, b); + printDesc("cpu sort, non-power-of-two"); + StreamCompaction::CPUSort::sort(NPOT, b, a); + printElapsedTime(StreamCompaction::CPUSort::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + // printArray(NPOT, c, true); + // printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("gpu radix sort, non-power-of-two"); + StreamCompaction::RadixSort::sort(NPOT, c, a); + printElapsedTime(StreamCompaction::RadixSort::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + // printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + system("pause"); // stop Win32 console from closing on exit delete[] a; delete[] b; diff --git a/src/measure_time.cpp b/src/measure_time.cpp new file mode 100644 index 00000000..a2ab5e68 --- /dev/null +++ b/src/measure_time.cpp @@ -0,0 +1,106 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +int main(int argc, char *argv[]) { + // argv[1]: scan/compact + // argv[2]: cpu/naive/efficient/efficient_plus/thrust + // argv[3]: input size + // argv[4]: block size + // argv[5]: elements per thread + + if (argc != 6) { + return -1; + } + const std::string operation(argv[1]); + const std::string implementation(argv[2]); + const int inputSize = std::stoi(argv[3]); + const int blockSize = std::stoi(argv[4]); + const int elementsPerThread = std::stoi(argv[5]); + + std::vector inputs(inputSize); + std::vector outputs(inputSize); + { + std::random_device device; + std::mt19937 generator(device()); + int maxValue; + if (operation == "scan") { + maxValue = 50 - 1; + } else if (operation == "compact") { + maxValue = 4 - 1; + } else { + return -1; + } + std::uniform_int_distribution distribution(0, maxValue); + for (auto &i : inputs) { + i = distribution(generator); + } + } + + const auto isGpu{implementation != "cpu"}; + StreamCompaction::Common::PerformanceTimer *timer = nullptr; + if (implementation == "cpu") { + timer = &StreamCompaction::CPU::timer(); + } else if (implementation == "naive") { + timer = &StreamCompaction::Naive::timer(); + } else if (implementation == "efficient") { + timer = &StreamCompaction::Efficient::timer(); + } else if (implementation == "efficient_plus") { + timer = &StreamCompaction::EfficientPlus::timer(); + } else if (implementation == "thrust") { + timer = &StreamCompaction::Thrust::timer(); + } else { + return -1; + } + + // Some implementations (e.g., Thrust) allocates memory internally on the first run, so we run + // the operation twice and only measure the later one. +#define REPEAT_TWICE(function, ...) \ + do { \ + function(inputSize, outputs.data(), inputs.data(), __VA_ARGS__); \ + std::this_thread::sleep_for(std::chrono::milliseconds(10)); \ + function(inputSize, outputs.data(), inputs.data(), __VA_ARGS__); \ + } while (false) + + if (operation == "scan") { + if (implementation == "cpu") { + REPEAT_TWICE(StreamCompaction::CPU::scan); + } else if (implementation == "naive") { + REPEAT_TWICE(StreamCompaction::Naive::scan, blockSize); + } else if (implementation == "efficient") { + REPEAT_TWICE(StreamCompaction::Efficient::scan, blockSize); + } else if (implementation == "efficient_plus") { + REPEAT_TWICE(StreamCompaction::EfficientPlus::scan, blockSize, elementsPerThread); + } else if (implementation == "thrust") { + REPEAT_TWICE(StreamCompaction::Thrust::scan); + } else { + return -1; + } + } else if (operation == "compact") { + REPEAT_TWICE(StreamCompaction::Efficient::compact, blockSize); + } else { + return -1; + } + +#undef REPEAT_TWICE + + std::cout << std::setprecision(std::numeric_limits::max_digits10); + if (isGpu) { + std::cout << timer->getGpuElapsedTimeForPreviousOperation(); + } else { + std::cout << timer->getCpuElapsedTimeForPreviousOperation(); + } + std::cout << std::endl; + return 0; +} diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index 19511caa..16f92922 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -1,16 +1,22 @@ set(headers "common.h" "cpu.h" + "cpu_sort.h" "naive.h" "efficient.h" + "efficient_plus.h" + "radix_sort.h" "thrust.h" ) set(sources "common.cu" "cpu.cu" + "cpu_sort.cu" "naive.cu" "efficient.cu" + "efficient_plus.cu" + "radix_sort.cu" "thrust.cu" ) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d630..5cf663dc 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -14,7 +14,6 @@ void checkCUDAErrorFn(const char *msg, const char *file, int line) { exit(EXIT_FAILURE); } - namespace StreamCompaction { namespace Common { @@ -23,17 +22,26 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) { + return; + } + bools[index] = idata[index] != 0 ? 1 : 0; } /** * Performs scatter on an array. That is, for each element in idata, * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. */ - __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices) { - // TODO + __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, + const int *indices) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) { + return; + } + if (bools[index]) { + odata[indices[index]] = idata[index]; + } } - } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa115..dc1d1f90 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,25 +1,33 @@ #include -#include "cpu.h" #include "common.h" +#include "cpu.h" namespace StreamCompaction { namespace CPU { using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { + PerformanceTimer &timer() { static PerformanceTimer timer; return timer; } + void scanImpl(int n, int *odata, const int *idata) { + int sum = 0; + for (int i = 0; i < n; ++i) { + odata[i] = sum; + sum += idata[i]; + } + } + /** * CPU scan (prefix sum). * For performance analysis, this is supposed to be a simple for loop. - * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. + * (Optional) For better understanding before starting moving to GPU, you can simulate your + * GPU scan in this function first. */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + scanImpl(n, odata, idata); timer().endCpuTimer(); } @@ -30,9 +38,15 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int oCount = 0; + for (int i = 0; i < n; ++i) { + if (idata[i] != 0) { + odata[oCount] = idata[i]; + ++oCount; + } + } timer().endCpuTimer(); - return -1; + return oCount; } /** @@ -41,10 +55,27 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { + int *bools = new int[n]; + int *indices = new int[n]; timer().startCpuTimer(); - // TODO + // Step 1: map + for (int i = 0; i < n; ++i) { + bools[i] = (idata[i] != 0) ? 1 : 0; + } + // Step 2: scan + scanImpl(n, indices, bools); + // Step 3: scatter + int oCount = 0; + for (int i = 0; i < n; ++i) { + if (bools[i] == 1) { + odata[indices[i]] = idata[i]; + ++oCount; + } + } timer().endCpuTimer(); - return -1; + delete[] bools; + delete[] indices; + return oCount; } } } diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h index 873c0476..954b8517 100644 --- a/stream_compaction/cpu.h +++ b/stream_compaction/cpu.h @@ -4,7 +4,7 @@ namespace StreamCompaction { namespace CPU { - StreamCompaction::Common::PerformanceTimer& timer(); + StreamCompaction::Common::PerformanceTimer &timer(); void scan(int n, int *odata, const int *idata); diff --git a/stream_compaction/cpu_sort.cu b/stream_compaction/cpu_sort.cu new file mode 100644 index 00000000..62ec08d4 --- /dev/null +++ b/stream_compaction/cpu_sort.cu @@ -0,0 +1,24 @@ +#include + +#include +#include + +#include "common.h" +#include "cpu_sort.h" + +namespace StreamCompaction { + namespace CPUSort { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer &timer() { + static PerformanceTimer timer; + return timer; + } + + void sort(int n, int *odata, const int *idata) { + timer().startCpuTimer(); + std::copy_n(idata, n, odata); + std::sort(odata, odata + n); + timer().endCpuTimer(); + } + } +} diff --git a/stream_compaction/cpu_sort.h b/stream_compaction/cpu_sort.h new file mode 100644 index 00000000..8879864a --- /dev/null +++ b/stream_compaction/cpu_sort.h @@ -0,0 +1,11 @@ +#pragma once + +#include "common.h" + +namespace StreamCompaction { + namespace CPUSort { + StreamCompaction::Common::PerformanceTimer &timer(); + + void sort(int n, int *odata, const int *idata); + } +} diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346ee..2d425f8f 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -1,24 +1,95 @@ #include #include + #include "common.h" #include "efficient.h" namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { + PerformanceTimer &timer() { static PerformanceTimer timer; return timer; } + __global__ void kernUpSweep(int n, int offset, int *data) { + // Avoid integer overflows. + auto indexLL = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + indexLL = (indexLL + 1) * offset * 2 - 1; + if (indexLL >= n) { + return; + } + int index = static_cast(indexLL); + int beforeIndex = index - offset; + if (beforeIndex >= 0) { + data[index] += data[beforeIndex]; + } + } + + __global__ void kernDownSweep(int n, int offset, int *data) { + // Avoid integer overflows. + auto indexLL = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + indexLL = (indexLL + 1) * offset * 2 - 1; + if (indexLL >= n) { + return; + } + int index = static_cast(indexLL); + int beforeIndex = index - offset; + if (beforeIndex >= 0) { + int beforeValue = data[beforeIndex]; + data[beforeIndex] = data[index]; + data[index] += beforeValue; + } + } + + void scanImpl(int n, int *data, const int blockSize) { + const auto computeGridSize = [=](int offset) { + // Avoid integer overflows when n and blockSize are large. + const auto divisor = 2LL * offset * blockSize; + return static_cast((n + divisor - 1) / divisor); + }; + + // Up-sweep + for (int offset = 1; offset < n; offset *= 2) { + const int gridSize = computeGridSize(offset); + kernUpSweep<<>>(n, offset, data); + } + // Set the last element to 0. + cudaMemset(data + (n - 1), 0, sizeof(int)); + // Down-sweep + for (int offset = n / 2; offset >= 1; offset /= 2) { + const int gridSize = computeGridSize(offset); + kernDownSweep<<>>(n, offset, data); + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int *odata, const int *idata, int blockSize) { + // Set default parameters. + if (blockSize <= 0) { + blockSize = 64; + } + + const auto dataSize = n * sizeof(int); + const int ceiledN = 1 << ilog2ceil(n); + const auto ceiledDataSize = ceiledN * sizeof(int); + + int *dev_data; + cudaMalloc((void **)&dev_data, ceiledDataSize); + cudaMemcpy(dev_data, idata, dataSize, cudaMemcpyHostToDevice); + if (ceiledDataSize > dataSize) { + cudaMemset(dev_data + n, 0, ceiledDataSize - dataSize); + } + cudaDeviceSynchronize(); + timer().startGpuTimer(); - // TODO + scanImpl(ceiledN, dev_data, blockSize); timer().endGpuTimer(); + + cudaMemcpy(odata, dev_data, dataSize, cudaMemcpyDeviceToHost); + cudaFree(dev_data); } /** @@ -30,11 +101,56 @@ namespace StreamCompaction { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. */ - int compact(int n, int *odata, const int *idata) { + int compact(int n, int *odata, const int *idata, int blockSize) { + // Set default parameters. + if (blockSize <= 0) { + blockSize = 64; + } + + const auto dataSize = n * sizeof(int); + const int ceiledN = 1 << ilog2ceil(n); + const auto ceiledDataSize = ceiledN * sizeof(int); + + const int gridSize = (n + blockSize - 1) / blockSize; + + int *dev_iData; + cudaMalloc((void **)&dev_iData, dataSize); + cudaMemcpy(dev_iData, idata, dataSize, cudaMemcpyHostToDevice); + int *dev_bools; + cudaMalloc((void **)&dev_bools, dataSize); + int *dev_indices; + cudaMalloc((void **)&dev_indices, ceiledDataSize); + int *dev_oData; + cudaMalloc((void **)&dev_oData, dataSize); + cudaDeviceSynchronize(); + timer().startGpuTimer(); - // TODO + Common::kernMapToBoolean<<>>(n, dev_bools, dev_iData); + cudaMemcpy(dev_indices, dev_bools, dataSize, cudaMemcpyDeviceToDevice); + if (ceiledDataSize > dataSize) { + cudaMemset(dev_indices + n, 0, ceiledDataSize - dataSize); + } + scanImpl(ceiledN, dev_indices, blockSize); + Common::kernScatter<<>>(n, dev_oData, dev_iData, dev_bools, + dev_indices); timer().endGpuTimer(); - return -1; + + int oCount; + { + int lastBool; + cudaMemcpy(&lastBool, dev_bools + (n - 1), sizeof(int), cudaMemcpyDeviceToHost); + int lastIndex; + cudaMemcpy(&lastIndex, dev_indices + (n - 1), sizeof(int), cudaMemcpyDeviceToHost); + oCount = lastIndex + lastBool; + } + if (oCount > 0) { + cudaMemcpy(odata, dev_oData, oCount * sizeof(int), cudaMemcpyDeviceToHost); + } + cudaFree(dev_iData); + cudaFree(dev_bools); + cudaFree(dev_indices); + cudaFree(dev_oData); + return oCount; } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4fe..eea99ff2 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -4,10 +4,16 @@ namespace StreamCompaction { namespace Efficient { - StreamCompaction::Common::PerformanceTimer& timer(); + StreamCompaction::Common::PerformanceTimer &timer(); - void scan(int n, int *odata, const int *idata); + void scan(int n, int *odata, const int *idata, int blockSize); - int compact(int n, int *odata, const int *idata); + inline void scan(int n, int *odata, const int *idata) { scan(n, odata, idata, -1); } + + int compact(int n, int *odata, const int *idata, int blockSize); + + inline int compact(int n, int *odata, const int *idata) { + return compact(n, odata, idata, -1); + } } } diff --git a/stream_compaction/efficient_plus.cu b/stream_compaction/efficient_plus.cu new file mode 100644 index 00000000..dd0322e6 --- /dev/null +++ b/stream_compaction/efficient_plus.cu @@ -0,0 +1,243 @@ +#include +#include + +#include "common.h" +#include "efficient_plus.h" + +namespace StreamCompaction { + namespace EfficientPlus { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer &timer() { + static PerformanceTimer timer; + return timer; + } + + __host__ __device__ unsigned conflictFreeIndex(unsigned index) { + // Number of banks: 32 + unsigned offset = index / 32U; + offset += offset / 32U; + return index + offset; + } + + __device__ int &conflictFreeGet(int *data, unsigned index) { + return data[conflictFreeIndex(index)]; + } + + template + __global__ void scanPerBlock(unsigned n, int *__restrict__ data, int *__restrict__ sums) { + unsigned tileSize = blockDim.x * ElementsPerThread * 2U; + + extern __shared__ int sharedData[]; + unsigned startSharedIndex = threadIdx.x * ElementsPerThread * 2U; + unsigned endSharedIndex = startSharedIndex + ElementsPerThread * 2U; + unsigned startGlobalIndex = blockIdx.x * tileSize + startSharedIndex; + unsigned endGlobalIndex = min(startGlobalIndex + ElementsPerThread * 2U, n); + { + // Copy data to shared memory. + unsigned sharedIndex = startSharedIndex; + unsigned globalIndex = startGlobalIndex; + for (; globalIndex < endGlobalIndex; ++sharedIndex, ++globalIndex) { + conflictFreeGet(sharedData, sharedIndex) = data[globalIndex]; + } + for (; sharedIndex < endSharedIndex; ++sharedIndex) { + conflictFreeGet(sharedData, sharedIndex) = 0; + } + } + __syncthreads(); + + // Up-sweep + for (unsigned offset = 1U; offset < tileSize; offset *= 2U) { +#pragma unroll + for (unsigned i = 0U; i < ElementsPerThread; ++i) { + unsigned index = (threadIdx.x * ElementsPerThread + i + 1U) * offset * 2U - 1U; + if (index < tileSize) { + conflictFreeGet(sharedData, index) += + conflictFreeGet(sharedData, index - offset); + } + } + __syncthreads(); + } + + if (threadIdx.x == 0U) { + int &lastElement = conflictFreeGet(sharedData, tileSize - 1U); + // Save the total sum of this block to the sums array. + if (sums != nullptr) { + // sums may be nullptr for the last recursion. + sums[blockIdx.x] = lastElement; + } + // Clear the last element before down-sweep. + lastElement = 0; + } + __syncthreads(); + + // Down-sweep + for (unsigned offset = tileSize / 2U; offset > 0U; offset /= 2U) { +#pragma unroll + for (unsigned i = 0U; i < ElementsPerThread; ++i) { + unsigned index = (threadIdx.x * ElementsPerThread + i + 1U) * offset * 2U - 1U; + if (index < tileSize) { + int &leftChild = conflictFreeGet(sharedData, index - offset); + int &rightChild = conflictFreeGet(sharedData, index); + int oldLeftChild = leftChild; + leftChild = rightChild; + rightChild += oldLeftChild; + } + } + __syncthreads(); + } + + // Write results back to global memory. + { + unsigned sharedIndex = startSharedIndex; + unsigned globalIndex = startGlobalIndex; + for (; globalIndex < endGlobalIndex; ++sharedIndex, ++globalIndex) { + data[globalIndex] = conflictFreeGet(sharedData, sharedIndex); + } + } + } + + template + __global__ void addSums(unsigned n, int *__restrict__ data, const int *__restrict__ sums) { + unsigned startIndex = (blockIdx.x * blockDim.x + threadIdx.x) * ElementsPerThread * 2U; + unsigned endIndex = min(startIndex + ElementsPerThread * 2U, n); + int sum = sums[blockIdx.x]; + for (unsigned index = startIndex; index < endIndex; ++index) { + data[index] += sum; + } + } + + template + void scanImpl(unsigned n, int *data, unsigned blockSize, unsigned tileSize, + unsigned gridSize, int *sums) { + unsigned actualBlockSize = blockSize; + if (gridSize == 1U) { + while (true) { + unsigned nextBlockSize = actualBlockSize / 2U; + if (nextBlockSize * ElementsPerThread * 2U < n) { + break; + } + actualBlockSize = nextBlockSize; + } + } + unsigned sharedMemorySize = + (conflictFreeIndex(actualBlockSize * ElementsPerThread * 2U - 1U) + 1U) * + sizeof(int); + scanPerBlock + <<>>(n, data, sums); + checkCUDAErrorFn("scanPerBlock kernel failed!"); + if (gridSize > 1U) { + unsigned nextGridSize = (gridSize + tileSize - 1U) / tileSize; + int *nextSums = nullptr; + if (nextGridSize > 1U) { + nextSums = sums + gridSize; + } + scanImpl(gridSize, sums, blockSize, tileSize, nextGridSize, + nextSums); + addSums<<>>(n, data, sums); + checkCUDAErrorFn("addSums kernel failed!"); + } + } + + void scan(int n, int *odata, const int *idata, int blockSize, int elementsPerThread) { + // Set default parameters. + if (blockSize <= 0) { + blockSize = 256; + } + if (elementsPerThread <= 0) { + elementsPerThread = 1; + } + + const auto dataSize = n * sizeof(int); + + int *dev_data; + cudaMalloc((void **)&dev_data, dataSize); + checkCUDAErrorFn("cudaMalloc dev_data failed!"); + cudaMemcpy(dev_data, idata, dataSize, cudaMemcpyHostToDevice); + checkCUDAErrorFn("cudaMemcpy to device failed!"); + + // Allocate GPU memory for sums beforehand. + unsigned tileSize = blockSize * elementsPerThread * 2U; + unsigned gridSize = (n + tileSize - 1U) / tileSize; + + unsigned totalSumCount = 0U; + { + unsigned gridSize = n; + while (true) { + gridSize = (gridSize + tileSize - 1U) / tileSize; + if (gridSize <= 1U) { + break; + } + totalSumCount += gridSize; + } + } + int *dev_sums = nullptr; + if (totalSumCount > 0U) { + cudaMalloc((void **)&dev_sums, totalSumCount * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_sums failed!"); + } + cudaDeviceSynchronize(); + +#define DISPATCH(N) \ + case N: \ + scanImpl(n, dev_data, blockSize, tileSize, gridSize, dev_sums); \ + break + + timer().startGpuTimer(); + + switch (elementsPerThread) { + DISPATCH(1); + DISPATCH(2); + DISPATCH(4); + DISPATCH(8); + DISPATCH(16); + default: + printf("Unsupported elementsPerThread: %d\n", elementsPerThread); + exit(1); + } + timer().endGpuTimer(); + +#undef DISPATCH + + cudaMemcpy(odata, dev_data, dataSize, cudaMemcpyDeviceToHost); + checkCUDAErrorFn("cudaMemcpy to host failed!"); + cudaFree(dev_data); + checkCUDAErrorFn("cudaFree failed!"); + if (dev_sums != nullptr) { + cudaFree(dev_sums); + checkCUDAErrorFn("cudaFree failed!"); + } + } + + void scanDeviceInPlace(int n, int *data) { + const unsigned blockSize = 256; + constexpr unsigned ElementsPerThread = 1; + + unsigned tileSize = blockSize * ElementsPerThread * 2U; + unsigned gridSize = (n + tileSize - 1U) / tileSize; + + unsigned totalSumCount = 0U; + { + unsigned gridSize = n; + while (true) { + gridSize = (gridSize + tileSize - 1U) / tileSize; + if (gridSize <= 1U) { + break; + } + totalSumCount += gridSize; + } + } + int *dev_sums = nullptr; + if (totalSumCount > 0U) { + cudaMalloc((void **)&dev_sums, totalSumCount * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_sums failed!"); + } + + scanImpl(n, data, blockSize, tileSize, gridSize, dev_sums); + + if (dev_sums != nullptr) { + cudaFree(dev_sums); + checkCUDAErrorFn("cudaFree failed!"); + } + } + } +} diff --git a/stream_compaction/efficient_plus.h b/stream_compaction/efficient_plus.h new file mode 100644 index 00000000..8d1486d8 --- /dev/null +++ b/stream_compaction/efficient_plus.h @@ -0,0 +1,15 @@ +#pragma once + +#include "common.h" + +namespace StreamCompaction { + namespace EfficientPlus { + StreamCompaction::Common::PerformanceTimer &timer(); + + void scan(int n, int *odata, const int *idata, int blockSize, int elementsPerThread); + + inline void scan(int n, int *odata, const int *idata) { scan(n, odata, idata, -1, -1); } + + void scanDeviceInPlace(int n, int *data); + } +} diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 43088769..dd6c17f6 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -1,25 +1,64 @@ #include #include + #include "common.h" #include "naive.h" namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { + PerformanceTimer &timer() { static PerformanceTimer timer; return timer; } - // TODO: __global__ + + __global__ void kernScan(int n, int offset, int *odata, const int *idata) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) { + return; + } + int beforeIndex = index - offset; + odata[index] = idata[index] + (beforeIndex < 0 ? 0 : idata[beforeIndex]); + } + + __global__ void kernShiftRight(int n, int *odata, const int *idata) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) { + return; + } + odata[index] = index == 0 ? 0 : idata[index - 1]; + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int *odata, const int *idata, int blockSize) { + // Set default parameters. + if (blockSize <= 0) { + blockSize = 256; + } + + const auto dataSize = n * sizeof(int); + + int *dev_iData; + cudaMalloc((void **)&dev_iData, dataSize); + cudaMemcpy(dev_iData, idata, dataSize, cudaMemcpyHostToDevice); + int *dev_oData; + cudaMalloc((void **)&dev_oData, dataSize); + cudaDeviceSynchronize(); + timer().startGpuTimer(); - // TODO + const int gridSize = (n + blockSize - 1) / blockSize; + for (int offset = 1; offset < n; offset *= 2) { + kernScan<<>>(n, offset, dev_oData, dev_iData); + std::swap(dev_oData, dev_iData); + } + kernShiftRight<<>>(n, dev_oData, dev_iData); timer().endGpuTimer(); + + cudaMemcpy(odata, dev_oData, dataSize, cudaMemcpyDeviceToHost); + cudaFree(dev_iData); + cudaFree(dev_oData); } } } diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 37dcb064..960103ae 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -4,8 +4,10 @@ namespace StreamCompaction { namespace Naive { - StreamCompaction::Common::PerformanceTimer& timer(); + StreamCompaction::Common::PerformanceTimer &timer(); - void scan(int n, int *odata, const int *idata); + void scan(int n, int *odata, const int *idata, int blockSize); + + inline void scan(int n, int *odata, const int *idata) { scan(n, odata, idata, -1); } } } diff --git a/stream_compaction/radix_sort.cu b/stream_compaction/radix_sort.cu new file mode 100644 index 00000000..9894372b --- /dev/null +++ b/stream_compaction/radix_sort.cu @@ -0,0 +1,77 @@ +#include +#include + +#include "common.h" +#include "efficient_plus.h" +#include "radix_sort.h" + +namespace StreamCompaction { + namespace RadixSort { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer &timer() { + static PerformanceTimer timer; + return timer; + } + + __global__ void kernExtractBit(int n, int mask, int *odata, const int *idata) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) { + return; + } + odata[index] = (idata[index] & mask) == 0 ? 0 : 1; + } + + __global__ void kernScatter(int n, unsigned mask, int zeroCount, const int *positions, + const int *idata, int *odata) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) { + return; + } + bool isOne = (static_cast(idata[index]) & mask) != 0; + if (isOne) { + odata[zeroCount + positions[index]] = idata[index]; + } else { + int zeros_before = index - positions[index]; + odata[zeros_before] = idata[index]; + } + } + + void sort(int n, int *odata, const int *idata) { + const auto dataSize = n * sizeof(int); + + int *dev_iData; + cudaMalloc((void **)&dev_iData, dataSize); + cudaMemcpy(dev_iData, idata, dataSize, cudaMemcpyHostToDevice); + + int *dev_oData; + cudaMalloc((void **)&dev_oData, dataSize); + + int *dev_positions; + cudaMalloc((void **)&dev_positions, dataSize); + + timer().startGpuTimer(); + const int blockSize = 256; + const int gridSize = (n + blockSize - 1) / blockSize; + for (int bit = 0; bit < 32; ++bit) { + const unsigned mask = 1u << bit; + kernExtractBit<<>>(n, mask, dev_positions, dev_iData); + int lastBit; + cudaMemcpy(&lastBit, &dev_positions[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + EfficientPlus::scanDeviceInPlace(n, dev_positions); + int oneCount; + cudaMemcpy(&oneCount, &dev_positions[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + oneCount += lastBit; + int zeroCount = n - oneCount; + kernScatter<<>>(n, mask, zeroCount, dev_positions, dev_iData, + dev_oData); + std::swap(dev_iData, dev_oData); + } + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_oData, dataSize, cudaMemcpyDeviceToHost); + cudaFree(dev_iData); + cudaFree(dev_oData); + cudaFree(dev_positions); + } + } +} diff --git a/stream_compaction/radix_sort.h b/stream_compaction/radix_sort.h new file mode 100644 index 00000000..82c1d731 --- /dev/null +++ b/stream_compaction/radix_sort.h @@ -0,0 +1,11 @@ +#pragma once + +#include "common.h" + +namespace StreamCompaction { + namespace RadixSort { + StreamCompaction::Common::PerformanceTimer &timer(); + + void sort(int n, int *odata, const int *idata); + } +} diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e7..b66f1c31 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -2,27 +2,43 @@ #include #include #include +#include +#include #include +#include +#include + #include "common.h" #include "thrust.h" namespace StreamCompaction { namespace Thrust { using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { + PerformanceTimer &timer() { static PerformanceTimer timer; return timer; } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + // Use a memory pool to reduce memory allocation overhead of Thrust operations. + static thrust::cuda::memory_resource upstream; + static thrust::mr::new_delete_resource bookkeeper; + static thrust::mr::disjoint_unsynchronized_pool_resource pool(&upstream, &bookkeeper); + thrust::mr::allocator allocator(&pool); + const auto policy = thrust::cuda::par(allocator); + + thrust::device_vector dev_data(n); + thrust::copy_n(idata, n, dev_data.begin()); + cudaDeviceSynchronize(); + timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(policy, dev_data.begin(), dev_data.end(), dev_data.begin()); timer().endGpuTimer(); + + thrust::copy(dev_data.begin(), dev_data.end(), odata); } } } diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h index fe98206b..c20b769c 100644 --- a/stream_compaction/thrust.h +++ b/stream_compaction/thrust.h @@ -4,7 +4,7 @@ namespace StreamCompaction { namespace Thrust { - StreamCompaction::Common::PerformanceTimer& timer(); + StreamCompaction::Common::PerformanceTimer &timer(); void scan(int n, int *odata, const int *idata); }