diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..09da644
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,8 @@
+{
+ "files.associations": {
+ "charconv": "cpp",
+ "xstring": "cpp",
+ "xtree": "cpp",
+ "chrono": "cpp"
+ }
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index 0e38ddb..1f5e25e 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,132 @@ CUDA Stream Compaction
**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
-* (TODO) YOUR NAME HERE
- * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Richard Chen
+ * [LinkedIn](https://www.linkedin.com/in/richardrlchen/)
+* Tested on: Windows 11, i7-10875H @ 2.3GHz 16GB, RTX 2060 MAXQ 6GB (PC)
+
+## Overview
+The purpose of this project was to explore parallel algorithms, something GPUs
+excel at. To that end, we used the GPU to perform stream compaction via scan.
+Stream compaction filters elements out of a list, in this case discarding the
+zero elements of the input. Scan (prefix sum) is like a fold, except that every
+intermediate partial result is kept as well.
+
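+As a minimal sketch, an exclusive scan over addition is a single accumulating
+loop (this mirrors the CPU implementation in `stream_compaction/cpu.cu`):
+
+```
+// exclusive scan: odata[i] = sum of idata[0..i-1]
+void exclusiveScan(int n, int *odata, const int *idata) {
+    int acc = 0;
+    for (int i = 0; i < n; i++) {
+        odata[i] = acc;   // total of everything before element i
+        acc += idata[i];
+    }
+}
+// e.g. [3 1 7 0 4 1 6 3] scans to [0 3 4 11 11 15 16 22]
+```
+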
+## Features
+* CPU implementation of scan
+* CPU implementation of stream compaction
+* Naive parallel version of scan for GPU implemented in CUDA
+* Work efficient parallel version of scan for GPU implemented in CUDA
+* Stream Compaction that leverages the GPU scan
+
+## Example Output
+Tested on a 256-element array:
+```
+****************
+** SCAN TESTS **
+****************
+ [ 28 7 23 2 29 27 22 19 2 38 26 47 45 ... 43 0 ]
+==== cpu scan, power-of-two ====
+ elapsed time: 0.0005ms (std::chrono Measured)
+ [ 0 28 35 58 60 89 116 138 157 159 197 223 270 ... 6050 6093 ]
+==== cpu scan, non-power-of-two ====
+ elapsed time: 0.0003ms (std::chrono Measured)
+ [ 0 28 35 58 60 89 116 138 157 159 197 223 270 ... 6016 6031 ]
+ passed
+==== naive scan, power-of-two ====
+ elapsed time: 0.243616ms (CUDA Measured)
+ a[1] = 28, b[1] = 0
+ FAIL VALUE
+==== naive scan, non-power-of-two ====
+ elapsed time: 0.17504ms (CUDA Measured)
+ a[1] = 28, b[1] = 0
+ FAIL VALUE
+==== work-efficient scan, power-of-two ====
+ elapsed time: 0.325728ms (CUDA Measured)
+ passed
+==== work-efficient scan, non-power-of-two ====
+ elapsed time: 0.335584ms (CUDA Measured)
+ passed
+==== thrust scan, power-of-two ====
+ elapsed time: 0.1088ms (CUDA Measured)
+ passed
+==== thrust scan, non-power-of-two ====
+ elapsed time: 0.048928ms (CUDA Measured)
+ passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+ [ 0 3 1 3 1 1 0 2 2 0 0 1 2 ... 3 0 ]
+==== cpu compact without scan, power-of-two ====
+ elapsed time: 0.0013ms (std::chrono Measured)
+ [ 3 1 3 1 1 2 2 1 2 2 2 1 2 ... 3 3 ]
+ passed
+==== cpu compact without scan, non-power-of-two ====
+ elapsed time: 0.0012ms (std::chrono Measured)
+ [ 3 1 3 1 1 2 2 1 2 2 2 1 2 ... 2 2 ]
+ passed
+==== cpu compact with scan ====
+ elapsed time: 0.0015ms (std::chrono Measured)
+ [ 3 1 3 1 1 2 2 1 2 2 2 1 2 ... 3 3 ]
+ passed
+==== work-efficient compact, power-of-two ====
+ elapsed time: 0.36784ms (CUDA Measured)
+ passed
+==== work-efficient compact, non-power-of-two ====
+ elapsed time: 0.3792ms (CUDA Measured)
+ passed
+Press any key to continue . . .
+```
+
+## Performance Analysis
+### Optimizing Block Size
+
+![Scan runtime vs. block size](img/time_blocksize.png)
+
+There does not seem to be a significant difference across block sizes when tested on arrays of size 2^16.
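+
+The block size is a compile-time constant in `common.h`, and every launch
+derives its grid size from it by ceiling division, so varying it only regroups
+the same threads (a condensed sketch of the launch pattern used throughout):
+
+```
+#define blockSize 1024  // threads per block (common.h)
+dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize);  // ceiling division
+kernScanNaive<<<fullBlocksPerGrid, blockSize>>>(size, layer, offset, bufB, bufA);
+```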
+
+### Performance as a Function of Array Length
+
+![Scan runtime vs. array length](img/scan_time.png)
+
+Notice that the time axis is log-scaled. The crossover between the naive and
+work-efficient implementations happens around length 2^19.
+
+### Stream Compaction as a Function of Array Length
+
+![Stream compaction runtime vs. array length](img/stream_time.png)
+
+Notice that the time axis is log-scaled. Even with arrays of up to length
+2^28, the single-threaded CPU is still faster at stream compaction. This goes
+to show that parallelism and concurrency should be applied situationally: in
+many scenarios, the drawbacks outweigh the benefits.
+
+### Nsight
+Profiling the execution on arrays of length 2^16:
+
+![Nsight timeline of the naive, work-efficient, and thrust scans](img/profile_timeline.png)
+
+In the CUDA HW row, green represents host-to-device memory transfers and red
+represents device-to-host. These transfer pairs neatly bracket the naive
+implementation, the work-efficient implementation, and the thrust library call.
+
+The region up to the 936ms mark is the naive GPU version, which takes about
+half as long as the work-efficient implementation. The work-efficient
+implementation iterates through the layers of the "tree" twice, once on the
+upsweep and once on the downsweep. Until the arrays are large enough, the
+savings in total computation do not overcome the overhead of taking twice the
+number of steps. When tested on length 2^20 arrays, the work-efficient
+implementation was faster.
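+
+For reference, here is a worked trace of the two phases on the classic
+8-element example (the array doubles as an in-place binary tree):
+
+```
+input:              [3 1 7 0 4 1 6 3]
+after upsweep:      [3 4 7 11 4 5 6 25]    // last element = total sum
+zero last element:  [3 4 7 11 4 5 6 0]
+after downsweep:    [0 3 4 11 11 15 16 22] // exclusive scan of the input
+```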
+
+The thrust implementation is significantly faster and also makes far fewer
+kernel calls. This seems reasonable: each kernel launch incurs overhead, so
+fitting the operation into fewer kernel calls cuts that cost drastically.
+Additionally, thrust likely has shared-memory optimizations, better memory
+access patterns, and more efficient computation.
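+
+For comparison, the whole operation in `stream_compaction/thrust.cu` boils
+down to one library call (condensed here; the actual code stages the input
+through a host_vector first):
+
+```
+thrust::device_vector<int> dv_in(idata, idata + n);   // copy input to device
+thrust::device_vector<int> dv_out(n);
+thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+thrust::copy(dv_out.begin(), dv_out.end(), odata);    // copy result back
+```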
+
+For all three, the kernel executions have long stretches of idle time between
+them, which suggests some form of I/O bottleneck.
-### (TODO: Your README)
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
diff --git a/img/profile_timeline.png b/img/profile_timeline.png
new file mode 100644
index 0000000..7d85b71
Binary files /dev/null and b/img/profile_timeline.png differ
diff --git a/img/scan_time.png b/img/scan_time.png
new file mode 100644
index 0000000..53641b9
Binary files /dev/null and b/img/scan_time.png differ
diff --git a/img/stream_time.png b/img/stream_time.png
new file mode 100644
index 0000000..84271ce
Binary files /dev/null and b/img/stream_time.png differ
diff --git a/img/time_blocksize.png b/img/time_blocksize.png
new file mode 100644
index 0000000..4363946
Binary files /dev/null and b/img/time_blocksize.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..5562fb8 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,13 +13,14 @@
#include <stream_compaction/thrust.h>
#include "testing_helpers.hpp"
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 12; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int *a = new int[SIZE];
int *b = new int[SIZE];
int *c = new int[SIZE];
-int main(int argc, char* argv[]) {
+int main(int argc, char *argv[])
+{
// Scan tests
printf("\n");
@@ -27,7 +28,7 @@ int main(int argc, char* argv[]) {
printf("** SCAN TESTS **\n");
printf("****************\n");
- genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case
+ genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case
a[SIZE - 1] = 0;
printArray(SIZE, a, true);
@@ -81,7 +82,16 @@ int main(int argc, char* argv[]) {
//printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);
- zeroArray(SIZE, c);
+ // int tmpTest[] = {0, 1, 2, 3, 4, 5, 6, 7};
+ // int *tmpTestOut = new int[8];
+ // zeroArray(8, tmpTestOut);
+ // printDesc("Small array slides example scan");
+ // StreamCompaction::Efficient::scan(8, tmpTestOut, tmpTest);
+ // printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ // //printArray(NPOT, c, true);
+ // printCmpResult(NPOT, b, c);
+ // delete[] tmpTestOut;
+
printDesc("thrust scan, power-of-two");
StreamCompaction::Thrust::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
@@ -102,7 +112,7 @@ int main(int argc, char* argv[]) {
// Compaction tests
- genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case
+ genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case
a[SIZE - 1] = 0;
printArray(SIZE, a, true);
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..d3e45a3 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -1,29 +1,39 @@
#include "common.h"
-void checkCUDAErrorFn(const char *msg, const char *file, int line) {
+void checkCUDAErrorFn(const char *msg, const char *file, int line)
+{
cudaError_t err = cudaGetLastError();
- if (cudaSuccess == err) {
+ if (cudaSuccess == err)
+ {
return;
}
fprintf(stderr, "CUDA error");
- if (file) {
+ if (file)
+ {
fprintf(stderr, " (%s:%d)", file, line);
}
fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
-
-namespace StreamCompaction {
- namespace Common {
+namespace StreamCompaction
+{
+ namespace Common
+ {
/**
* Maps an array to an array of 0s and 1s for stream compaction. Elements
* which map to 0 will be removed, and elements which map to 1 will be kept.
*/
- __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
- // TODO
+ __global__ void kernMapToBoolean(int n, int *bools, const int *idata)
+ {
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= n)
+ {
+ return;
+ }
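+            // 1 = keep (element is non-zero), 0 = discard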
+ bools[index] = !(idata[index] == 0);
}
/**
@@ -31,8 +41,17 @@ namespace StreamCompaction {
* if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
*/
__global__ void kernScatter(int n, int *odata,
- const int *idata, const int *bools, const int *indices) {
- // TODO
+ const int *idata, const int *bools, const int *indices)
+ {
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= n)
+ {
+ return;
+ }
+ if (bools[index])
+ {
+ odata[indices[index]] = idata[index];
+ }
}
}
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index d2c1fed..530c406 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -10,32 +10,41 @@
#include <chrono>
#include <stdexcept>
+#define blockSize 1024
+
#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
+#define checkCUDAErrorWithLine(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
+
/**
* Check for CUDA errors; print and exit if there was a problem.
*/
void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1);
-inline int ilog2(int x) {
+inline int ilog2(int x)
+{
int lg = 0;
- while (x >>= 1) {
+ while (x >>= 1)
+ {
++lg;
}
return lg;
}
-inline int ilog2ceil(int x) {
+inline int ilog2ceil(int x)
+{
return x == 1 ? 0 : ilog2(x - 1) + 1;
}
-namespace StreamCompaction {
- namespace Common {
+namespace StreamCompaction
+{
+ namespace Common
+ {
__global__ void kernMapToBoolean(int n, int *bools, const int *idata);
__global__ void kernScatter(int n, int *odata,
- const int *idata, const int *bools, const int *indices);
+ const int *idata, const int *bools, const int *indices);
/**
* This class is used for timing the performance
@@ -60,7 +69,10 @@ namespace StreamCompaction {
void startCpuTimer()
{
- if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); }
+ if (cpu_timer_started)
+ {
+ throw std::runtime_error("CPU timer already started");
+ }
cpu_timer_started = true;
time_start_cpu = std::chrono::high_resolution_clock::now();
@@ -70,7 +82,10 @@ namespace StreamCompaction {
{
time_end_cpu = std::chrono::high_resolution_clock::now();
- if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); }
+ if (!cpu_timer_started)
+ {
+ throw std::runtime_error("CPU timer not started");
+ }
std::chrono::duration<double, std::milli> duro = time_end_cpu - time_start_cpu;
prev_elapsed_time_cpu_milliseconds =
@@ -81,7 +96,10 @@ namespace StreamCompaction {
void startGpuTimer()
{
- if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); }
+ if (gpu_timer_started)
+ {
+ throw std::runtime_error("GPU timer already started");
+ }
gpu_timer_started = true;
cudaEventRecord(event_start);
@@ -92,7 +110,10 @@ namespace StreamCompaction {
cudaEventRecord(event_end);
cudaEventSynchronize(event_end);
- if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); }
+ if (!gpu_timer_started)
+ {
+ throw std::runtime_error("GPU timer not started");
+ }
cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end);
gpu_timer_started = false;
@@ -109,10 +130,10 @@ namespace StreamCompaction {
}
// remove copy and move functions
- PerformanceTimer(const PerformanceTimer&) = delete;
- PerformanceTimer(PerformanceTimer&&) = delete;
- PerformanceTimer& operator=(const PerformanceTimer&) = delete;
- PerformanceTimer& operator=(PerformanceTimer&&) = delete;
+ PerformanceTimer(const PerformanceTimer &) = delete;
+ PerformanceTimer(PerformanceTimer &&) = delete;
+ PerformanceTimer &operator=(const PerformanceTimer &) = delete;
+ PerformanceTimer &operator=(PerformanceTimer &&) = delete;
private:
cudaEvent_t event_start = nullptr;
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..47fc827 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -3,10 +3,12 @@
#include "common.h"
-namespace StreamCompaction {
- namespace CPU {
+namespace StreamCompaction
+{
+ namespace CPU
+ {
using StreamCompaction::Common::PerformanceTimer;
- PerformanceTimer& timer()
+ PerformanceTimer &timer()
{
static PerformanceTimer timer;
return timer;
@@ -17,9 +19,16 @@ namespace StreamCompaction {
* For performance analysis, this is supposed to be a simple for loop.
* (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
*/
- void scan(int n, int *odata, const int *idata) {
+ void scan(int n, int *odata, const int *idata)
+ {
timer().startCpuTimer();
// TODO
+ int tmpAcc = 0;
+ for (int i = 0; i < n; i++)
+ {
+ odata[i] = tmpAcc;
+ tmpAcc += idata[i];
+ }
timer().endCpuTimer();
}
@@ -28,11 +37,21 @@ namespace StreamCompaction {
*
* @returns the number of elements remaining after compaction.
*/
- int compactWithoutScan(int n, int *odata, const int *idata) {
+ int compactWithoutScan(int n, int *odata, const int *idata)
+ {
timer().startCpuTimer();
// TODO
+ int curIdx = 0;
+ for (int i = 0; i < n; i++)
+ {
+ if (idata[i] != 0)
+ {
+ odata[curIdx] = idata[i];
+ curIdx++;
+ }
+ }
timer().endCpuTimer();
- return -1;
+ return curIdx;
}
/**
@@ -40,11 +59,39 @@ namespace StreamCompaction {
*
* @returns the number of elements remaining after compaction.
*/
- int compactWithScan(int n, int *odata, const int *idata) {
+ int compactWithScan(int n, int *odata, const int *idata)
+ {
+ int *tmpData = new int[n];
+ int *tmpData2 = new int[n];
timer().startCpuTimer();
- // TODO
+ // map
+ for (int i = 0; i < n; i++)
+ {
+ tmpData[i] = idata[i] != 0;
+ }
+ // scan
+ int tmpAcc = 0;
+ for (int i = 0; i < n; i++)
+ {
+ tmpData2[i] = tmpAcc;
+ tmpAcc += tmpData[i];
+ }
+ int const *arrPtr = idata;
+ // if last elem of mapped boolarr is 0, tmpData[n-1] is 0
+ int retVal = tmpData2[n - 1] + tmpData[n - 1];
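+            // scatter: walk the input, skipping zeros, until retVal elements are written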
+ for (int i = 0; i < retVal; i++)
+ {
+ while (*arrPtr == 0)
+ {
+ arrPtr++;
+ }
+ odata[i] = *arrPtr;
+ arrPtr++;
+ }
timer().endCpuTimer();
- return -1;
+ delete[] tmpData2;
+ delete[] tmpData;
+ return retVal;
}
}
}
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..ffe6a5d 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -3,22 +3,86 @@
#include "common.h"
#include "efficient.h"
-namespace StreamCompaction {
- namespace Efficient {
+namespace StreamCompaction
+{
+ namespace Efficient
+ {
using StreamCompaction::Common::PerformanceTimer;
- PerformanceTimer& timer()
+ PerformanceTimer &timer()
{
static PerformanceTimer timer;
return timer;
}
+ __global__ void kernScanEfficientUpSweep(int n, int layer, int shift, int *data)
+ {
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= n)
+ {
+ return;
+ }
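+            // only one thread per 2^(layer+1)-element stride does the add; the rest idle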
+ if (index % (1 << (layer + 1)) == 0)
+ {
+ data[index + (1 << (layer + 1)) - 1] += data[index + (1 << layer) - 1];
+ }
+ }
+
+ __global__ void kernScanEfficientDownSweep(int n, int layer, int max, int shift, int *data)
+ {
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= n)
+ {
+ return;
+ }
+ if (index % (1 << (layer + 1)) == 0)
+ {
+ int t = data[index + (1 << layer) - 1];
+ data[index + (1 << layer) - 1] = data[index + (1 << (layer + 1)) - 1];
+ data[index + (1 << (layer + 1)) - 1] += t;
+ }
+ }
+
+ __global__ void kernSetLastToZero(int n, int *data)
+ {
+ data[n - 1] = 0;
+ }
+
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
- void scan(int n, int *odata, const int *idata) {
+ void scan(int n, int *odata, const int *idata)
+ {
+ //next power of 2
+ int power = ilog2ceil(n);
+ int size = 1 << power;
+ int offset = size - n;
+ dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize);
+ int *buf;
+ cudaMalloc((void **)&buf, size * sizeof(int));
+ cudaMemset(buf, 0, size * sizeof(int));
+ cudaMemcpy(buf + offset, idata, n * sizeof(int), cudaMemcpyHostToDevice);
timer().startGpuTimer();
// TODO
+ for (int layer = 0; layer < power; layer++)
+ {
+ // invoke kernel
+ int shift = 1 << layer;
+ kernScanEfficientUpSweep<<<fullBlocksPerGrid, blockSize>>>(size, layer, shift, buf);
+ cudaDeviceSynchronize();
+ }
+ kernSetLastToZero<<<1, 1>>>(size, buf);
+ cudaDeviceSynchronize();
+ for (int layer = power - 1; layer >= 0; layer--)
+ {
+ // invoke kernel
+ int shift = 1 << layer;
+ kernScanEfficientDownSweep<<<fullBlocksPerGrid, blockSize>>>(size, layer, power - 1, shift, buf);
+ cudaDeviceSynchronize();
+ // checkCUDAErrorWithLine("cudaDeviceSynchronize buf failed!");
+ }
timer().endGpuTimer();
+ cudaMemcpy(odata, buf + offset, n * sizeof(int), cudaMemcpyDeviceToHost);
+ cudaFree(buf);
}
/**
@@ -30,11 +94,63 @@ namespace StreamCompaction {
* @param idata The array of elements to compact.
* @returns The number of elements remaining after compaction.
*/
- int compact(int n, int *odata, const int *idata) {
+ int compact(int n, int *odata, const int *idata)
+ {
+ //next power of 2
+ int power = ilog2ceil(n);
+ int size = 1 << power;
+ int offset = size - n;
+ dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize);
+
+ int *buf; // power of 2 0 padded copy of idata
+ cudaMalloc((void **)&buf, size * sizeof(int));
+ cudaMemset(buf, 0, size * sizeof(int));
+ cudaMemcpy(buf + offset, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+ int *bools, *indices, *tmpOut; //
+ cudaMalloc((void **)&bools, size * sizeof(int));
+ cudaMalloc((void **)&indices, size * sizeof(int));
+ cudaMalloc((void **)&tmpOut, size * sizeof(int));
timer().startGpuTimer();
// TODO
+ // Map
+ Common::kernMapToBoolean<<<fullBlocksPerGrid, blockSize>>>(size, bools, buf);
+ Common::kernMapToBoolean<<<fullBlocksPerGrid, blockSize>>>(size, indices, buf);
+ cudaDeviceSynchronize();
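+            // indices starts as a copy of the boolean flags and is scanned in place below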
+ // Scan
+ for (int layer = 0; layer < power; layer++)
+ {
+ // invoke kernel
+ int shift = 1 << layer;
+ kernScanEfficientUpSweep<<<fullBlocksPerGrid, blockSize>>>(size, layer, shift, indices);
+ cudaDeviceSynchronize();
+ }
+ kernSetLastToZero<<<1, 1>>>(size, indices);
+ cudaDeviceSynchronize();
+ for (int layer = power - 1; layer >= 0; layer--)
+ {
+ // invoke kernel
+ int shift = 1 << layer;
+ kernScanEfficientDownSweep<<<fullBlocksPerGrid, blockSize>>>(size, layer, power - 1, shift, indices);
+ cudaDeviceSynchronize();
+ }
+ // Scatter
+ Common::kernScatter<<<fullBlocksPerGrid, blockSize>>>(size, tmpOut, buf, bools, indices);
+ cudaDeviceSynchronize();
timer().endGpuTimer();
- return -1;
+
+ cudaMemcpy(odata, tmpOut, n * sizeof(int), cudaMemcpyDeviceToHost);
+ int retSize;
+ cudaMemcpy(&retSize, indices + size - 1, sizeof(int), cudaMemcpyDeviceToHost);
+ int tmpLast;
+ cudaMemcpy(&tmpLast, buf + size - 1, sizeof(int), cudaMemcpyDeviceToHost);
+ retSize += (tmpLast != 0);
+
+ cudaFree(buf);
+ cudaFree(bools);
+ cudaFree(indices);
+ cudaFree(tmpOut);
+ return retSize;
}
}
}
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..b2a8ba5 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -3,23 +3,77 @@
#include "common.h"
#include "naive.h"
-namespace StreamCompaction {
- namespace Naive {
+namespace StreamCompaction
+{
+ namespace Naive
+ {
using StreamCompaction::Common::PerformanceTimer;
- PerformanceTimer& timer()
+ PerformanceTimer &timer()
{
static PerformanceTimer timer;
return timer;
}
- // TODO: __global__
+ __global__ void kernScanNaive(int n, int layer, int offset, int *odata, const int *idata)
+ {
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= n)
+ {
+ return;
+ }
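+            // one Hillis-Steele step: each element adds the value offset positions to its left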
+ int tmp = idata[index];
+ odata[index] = tmp + ((index >= offset) ? idata[index - offset] : 0);
+ }
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
- void scan(int n, int *odata, const int *idata) {
+ void scan(int n, int *odata, const int *idata)
+ {
+ int power = ilog2ceil(n);
+ int size = 1 << power;
+ int offset = size - n;
+ dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize);
+ int *bufA;
+ int *bufB;
+ cudaMalloc((void **)&bufA, size * sizeof(int));
+ checkCUDAErrorWithLine("cudaMalloc bufA failed!");
+ cudaMalloc((void **)&bufB, size * sizeof(int));
+ checkCUDAErrorWithLine("cudaMalloc bufB failed!");
+
+ cudaMemset(bufA, 0, size * sizeof(int));
+ cudaMemset(bufB, 0, size * sizeof(int));
+ cudaMemcpy(bufA + offset, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+ checkCUDAErrorWithLine("cudaMemcpy failed!");
+ int *tmp;
timer().startGpuTimer();
// TODO
+ for (int layer = 0; layer < power; layer++)
+ {
+ // invoke kernel
+ int offset = 1 << layer;
+ checkCUDAErrorWithLine("loop start failed!");
+ kernScanNaive<<<fullBlocksPerGrid, blockSize>>>(size, layer, offset, bufB, bufA);
+ checkCUDAErrorWithLine("kernscan failed!");
+ cudaDeviceSynchronize();
+ checkCUDAErrorWithLine("cudaDeviceSync failed!");
+ // swap bufA and bufB
+ tmp = bufA;
+ bufA = bufB;
+ bufB = tmp;
+ }
+ checkCUDAErrorWithLine("before memcpy failed!");
+
+ // cudaDeviceSynchronize();
timer().endGpuTimer();
+ cudaMemcpy(odata + 1, bufA + offset, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAErrorWithLine("cudaMemcpy failed!");
+
+ odata[0] = 0;
+ // cudaDeviceSynchronize();
+ cudaFree(bufB);
+ checkCUDAErrorWithLine("cudaFree bufB failed!");
+ cudaFree(bufA);
+ checkCUDAErrorWithLine("cudaFree bufA failed!");
}
}
}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..afe7719 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -6,10 +6,12 @@
#include "common.h"
#include "thrust.h"
-namespace StreamCompaction {
- namespace Thrust {
+namespace StreamCompaction
+{
+ namespace Thrust
+ {
using StreamCompaction::Common::PerformanceTimer;
- PerformanceTimer& timer()
+ PerformanceTimer &timer()
{
static PerformanceTimer timer;
return timer;
@@ -17,12 +19,18 @@ namespace StreamCompaction {
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
- void scan(int n, int *odata, const int *idata) {
+ void scan(int n, int *odata, const int *idata)
+ {
+ thrust::host_vector<int> hv_idata(idata, idata + n);
+ thrust::device_vector<int> dv_idata(hv_idata);
+ thrust::device_vector<int> dv_odata(n);
timer().startGpuTimer();
// TODO use `thrust::exclusive_scan`
// example: for device_vectors dv_in and dv_out:
// thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+ thrust::exclusive_scan(dv_idata.begin(), dv_idata.end(), dv_odata.begin());
timer().endGpuTimer();
+ thrust::copy(dv_odata.begin(), dv_odata.end(), odata);
}
}
}