diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..09da644 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,8 @@ +{ + "files.associations": { + "charconv": "cpp", + "xstring": "cpp", + "xtree": "cpp", + "chrono": "cpp" + } +} \ No newline at end of file diff --git a/README.md b/README.md index 0e38ddb..1f5e25e 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,132 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Richard Chen + * [LinkedIn](https://www.linkedin.com/in/richardrlchen/) +* Tested on: Windows 11, i7-10875H @ 2.3GHz 16GB, RTX 2060 MAXQ 6GB (PC) + +## Overview +The purpose of this project was to explore parallel algorithms, something a GPU +excels at. To that end, we used the GPU to perform stream compaction via scan. +Stream compaction filters out elements from a list, in this case discarding the 0 +elements from the input. Scan is a fold but all the intermediate steps are also +available. + +## Features +* CPU implementation of scan +* CPU implementation of stream compaction +* Naive parallel version of scan for GPU implemented in CUDA +* Work efficient parallel version of scan for GPU implemented in CUDA +* Stream Compaction that leverages the GPU scan + +## Example Output +Tested on 256 element array +``` +**************** +** SCAN TESTS ** +**************** + [ 28 7 23 2 29 27 22 19 2 38 26 47 45 ... 43 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.0005ms (std::chrono Measured) + [ 0 28 35 58 60 89 116 138 157 159 197 223 270 ... 6050 6093 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.0003ms (std::chrono Measured) + [ 0 28 35 58 60 89 116 138 157 159 197 223 270 ... 6016 6031 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.243616ms (CUDA Measured) + a[1] = 28, b[1] = 0 + FAIL VALUE +==== naive scan, non-power-of-two ==== + elapsed time: 0.17504ms (CUDA Measured) + a[1] = 28, b[1] = 0 + FAIL VALUE +==== work-efficient scan, power-of-two ==== + elapsed time: 0.325728ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.335584ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.1088ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.048928ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 0 3 1 3 1 1 0 2 2 0 0 1 2 ... 3 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.0013ms (std::chrono Measured) + [ 3 1 3 1 1 2 2 1 2 2 2 1 2 ... 3 3 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.0012ms (std::chrono Measured) + [ 3 1 3 1 1 2 2 1 2 2 2 1 2 ... 2 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.0015ms (std::chrono Measured) + [ 3 1 3 1 1 2 2 1 2 2 2 1 2 ... 3 3 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.36784ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.3792ms (CUDA Measured) + passed +Press any key to continue . . . +``` + +## Performance Analysis +Optimizing Block Size +
+<img src="img/time_blocksize.png">
+
+Across the block sizes tested, runtime shows no significant difference for
+arrays of size 2^16.
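+
+For context, the block size is the `blockSize` launch parameter defined in
+`common.h`; every kernel here derives its grid dimensions from it. Below is a
+minimal sketch of that launch math (`launchUpSweepLevel` is a hypothetical
+wrapper; the kernel and `blockSize` are the ones from this repo):
+```
+#define blockSize 1024
+
+// One thread per element: the grid is the ceiling of size / blockSize,
+// and threads with index >= size return immediately inside the kernel.
+void launchUpSweepLevel(int size, int layer, int *dev_data) {
+    dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize);
+    kernScanEfficientUpSweep<<<fullBlocksPerGrid, blockSize>>>(
+        size, layer, 1 << layer, dev_data);
+}
+```
+Changing `blockSize` only regroups the same threads into different blocks,
+which is consistent with the flat curve above.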
+
+Performance as a function of array length
+<br>
+<img src="img/scan_time.png">
+
+Notice that the time axis is log scaled. The crossover, beyond which the
+work-efficient implementation beats the naive one, happens around length 2^19.
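+
+To make the two sweeps concrete, here is the work-efficient (Blelloch) scan
+traced by hand on a 4-element array; the upsweep and the downsweep each walk
+ilog2(n) layers, which is where the extra kernel launches come from:
+```
+input:            [1, 2, 3, 4]
+upsweep, d=0:     [1, 3, 3, 7]    stride-2 pairs sum into their right element
+upsweep, d=1:     [1, 3, 3, 10]   stride-4 pair sums; last slot holds the total
+zero the last:    [1, 3, 3, 0]
+downsweep, d=1:   [1, 0, 3, 3]    swap-and-add at stride 4
+downsweep, d=0:   [0, 1, 3, 6]    swap-and-add at stride 2: the exclusive scan
+```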
+
+Stream Compaction as a function of array length
+<br>
+<img src="img/stream_time.png">
+
+Notice that the time axis is log scaled. Even with arrays of length up to
+2^28, the single-threaded CPU is still faster for stream compaction.
+This shows that parallelism and concurrency should be applied situationally:
+in many scenarios, the overheads outweigh the benefits.
+
+### Nsight
+Profiling the execution on arrays of length 2^16
+<br>
+<img src="img/profile_timeline.png">
+ +In the CUDA HW row, green represents host->device and +red represents device->host. Thus we can see that these pairs neatly bracket +the naive implementation, the work-efficient implementation, and using +the thrust library. + +Up to 936ms is the naive GPU version. It looks about half as long as the +work efficient implementation. The work efficient implementation iterates through +the layers of the "tree" twice, once on the upsweep and once on the downsweep. +Until the arrays are large enough, the larger number of computations performed +will not overcome the overhead incurred from taking twice the number of steps. +When tested on length 2^20 arrays, the work efficient implementation was faster. + +The thrust implementation is significantly faster and also has far fewer kernel +calls. This seems reasonable as there probably is overhead that is incurred when +spawning multiple kernels so being able to fit the operation into fewer kernel +calls would drastically cut down on this. Additionally, thrust probably has +shared memory optimizations, memory access optimizations, and more efficient +computation. + +For all 3, the kernel executions themselves have lots of empty time in between +computations which suggests there is some form of IO bottleneck. -### (TODO: Your README) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) diff --git a/img/profile_timeline.png b/img/profile_timeline.png new file mode 100644 index 0000000..7d85b71 Binary files /dev/null and b/img/profile_timeline.png differ diff --git a/img/scan_time.png b/img/scan_time.png new file mode 100644 index 0000000..53641b9 Binary files /dev/null and b/img/scan_time.png differ diff --git a/img/stream_time.png b/img/stream_time.png new file mode 100644 index 0000000..84271ce Binary files /dev/null and b/img/stream_time.png differ diff --git a/img/time_blocksize.png b/img/time_blocksize.png new file mode 100644 index 0000000..4363946 Binary files /dev/null and b/img/time_blocksize.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..5562fb8 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,13 +13,14 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 12; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; int *c = new int[SIZE]; -int main(int argc, char* argv[]) { +int main(int argc, char *argv[]) +{ // Scan tests printf("\n"); @@ -27,7 +28,7 @@ int main(int argc, char* argv[]) { printf("** SCAN TESTS **\n"); printf("****************\n"); - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; printArray(SIZE, a, true); @@ -81,7 +82,16 @@ int main(int argc, char* argv[]) { //printArray(NPOT, c, true); printCmpResult(NPOT, b, c); - zeroArray(SIZE, c); + // int tmpTest[] = {0, 1, 2, 3, 4, 5, 6, 7}; + // int *tmpTestOut = new int[8]; + // zeroArray(8, tmpTestOut); + // printDesc("Small array slides example scan"); + // StreamCompaction::Efficient::scan(8, tmpTestOut, tmpTest); + // printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + // //printArray(NPOT, c, true); + // printCmpResult(NPOT, b, c); + // delete[] tmpTestOut; + printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); 
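+    // The printed Thrust time excludes host<->device transfers: thrust.cu
+    // copies the input into a device_vector before starting its GPU timer.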
printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); @@ -102,7 +112,7 @@ int main(int argc, char* argv[]) { // Compaction tests - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case + genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; printArray(SIZE, a, true); diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..d3e45a3 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,29 +1,39 @@ #include "common.h" -void checkCUDAErrorFn(const char *msg, const char *file, int line) { +void checkCUDAErrorFn(const char *msg, const char *file, int line) +{ cudaError_t err = cudaGetLastError(); - if (cudaSuccess == err) { + if (cudaSuccess == err) + { return; } fprintf(stderr, "CUDA error"); - if (file) { + if (file) + { fprintf(stderr, " (%s:%d)", file, line); } fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err)); exit(EXIT_FAILURE); } - -namespace StreamCompaction { - namespace Common { +namespace StreamCompaction +{ + namespace Common + { /** * Maps an array to an array of 0s and 1s for stream compaction. Elements * which map to 0 will be removed, and elements which map to 1 will be kept. */ - __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + __global__ void kernMapToBoolean(int n, int *bools, const int *idata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) + { + return; + } + bools[index] = !(idata[index] == 0); } /** @@ -31,8 +41,17 @@ namespace StreamCompaction { * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. */ __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices) { - // TODO + const int *idata, const int *bools, const int *indices) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) + { + return; + } + if (bools[index]) + { + odata[indices[index]] = idata[index]; + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..530c406 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -10,32 +10,41 @@ #include #include +#define blockSize 1024 + #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__) + /** * Check for CUDA errors; print and exit if there was a problem. */ void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1); -inline int ilog2(int x) { +inline int ilog2(int x) +{ int lg = 0; - while (x >>= 1) { + while (x >>= 1) + { ++lg; } return lg; } -inline int ilog2ceil(int x) { +inline int ilog2ceil(int x) +{ return x == 1 ? 
0 : ilog2(x - 1) + 1; } -namespace StreamCompaction { - namespace Common { +namespace StreamCompaction +{ + namespace Common + { __global__ void kernMapToBoolean(int n, int *bools, const int *idata); __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices); + const int *idata, const int *bools, const int *indices); /** * This class is used for timing the performance @@ -60,7 +69,10 @@ namespace StreamCompaction { void startCpuTimer() { - if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); } + if (cpu_timer_started) + { + throw std::runtime_error("CPU timer already started"); + } cpu_timer_started = true; time_start_cpu = std::chrono::high_resolution_clock::now(); @@ -70,7 +82,10 @@ namespace StreamCompaction { { time_end_cpu = std::chrono::high_resolution_clock::now(); - if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); } + if (!cpu_timer_started) + { + throw std::runtime_error("CPU timer not started"); + } std::chrono::duration duro = time_end_cpu - time_start_cpu; prev_elapsed_time_cpu_milliseconds = @@ -81,7 +96,10 @@ namespace StreamCompaction { void startGpuTimer() { - if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); } + if (gpu_timer_started) + { + throw std::runtime_error("GPU timer already started"); + } gpu_timer_started = true; cudaEventRecord(event_start); @@ -92,7 +110,10 @@ namespace StreamCompaction { cudaEventRecord(event_end); cudaEventSynchronize(event_end); - if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); } + if (!gpu_timer_started) + { + throw std::runtime_error("GPU timer not started"); + } cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end); gpu_timer_started = false; @@ -109,10 +130,10 @@ namespace StreamCompaction { } // remove copy and move functions - PerformanceTimer(const PerformanceTimer&) = delete; - PerformanceTimer(PerformanceTimer&&) = delete; - PerformanceTimer& operator=(const PerformanceTimer&) = delete; - PerformanceTimer& operator=(PerformanceTimer&&) = delete; + PerformanceTimer(const PerformanceTimer &) = delete; + PerformanceTimer(PerformanceTimer &&) = delete; + PerformanceTimer &operator=(const PerformanceTimer &) = delete; + PerformanceTimer &operator=(PerformanceTimer &&) = delete; private: cudaEvent_t event_start = nullptr; diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..47fc827 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -3,10 +3,12 @@ #include "common.h" -namespace StreamCompaction { - namespace CPU { +namespace StreamCompaction +{ + namespace CPU + { using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() + PerformanceTimer &timer() { static PerformanceTimer timer; return timer; @@ -17,9 +19,16 @@ namespace StreamCompaction { * For performance analysis, this is supposed to be a simple for loop. * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. */ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int *odata, const int *idata) + { timer().startCpuTimer(); // TODO + int tmpAcc = 0; + for (int i = 0; i < n; i++) + { + odata[i] = tmpAcc; + tmpAcc += idata[i]; + } timer().endCpuTimer(); } @@ -28,11 +37,21 @@ namespace StreamCompaction { * * @returns the number of elements remaining after compaction. 
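+         * Example: idata [0, 3, 0, 1, 2] compacts to odata [3, 1, 2] and returns 3.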
*/ - int compactWithoutScan(int n, int *odata, const int *idata) { + int compactWithoutScan(int n, int *odata, const int *idata) + { timer().startCpuTimer(); // TODO + int curIdx = 0; + for (int i = 0; i < n; i++) + { + if (idata[i] != 0) + { + odata[curIdx] = idata[i]; + curIdx++; + } + } timer().endCpuTimer(); - return -1; + return curIdx; } /** @@ -40,11 +59,39 @@ namespace StreamCompaction { * * @returns the number of elements remaining after compaction. */ - int compactWithScan(int n, int *odata, const int *idata) { + int compactWithScan(int n, int *odata, const int *idata) + { + int *tmpData = new int[n]; + int *tmpData2 = new int[n]; timer().startCpuTimer(); - // TODO + // map + for (int i = 0; i < n; i++) + { + tmpData[i] = idata[i] != 0; + } + // scan + int tmpAcc = 0; + for (int i = 0; i < n; i++) + { + tmpData2[i] = tmpAcc; + tmpAcc += tmpData[i]; + } + int const *arrPtr = idata; + // if last elem of mapped boolarr is 0, tmpData[n-1] is 0 + int retVal = tmpData2[n - 1] + tmpData[n - 1]; + for (int i = 0; i < retVal; i++) + { + while (*arrPtr == 0) + { + arrPtr++; + } + odata[i] = *arrPtr; + arrPtr++; + } timer().endCpuTimer(); - return -1; + delete[] tmpData2; + delete[] tmpData; + return retVal; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..ffe6a5d 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,22 +3,86 @@ #include "common.h" #include "efficient.h" -namespace StreamCompaction { - namespace Efficient { +namespace StreamCompaction +{ + namespace Efficient + { using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() + PerformanceTimer &timer() { static PerformanceTimer timer; return timer; } + __global__ void kernScanEfficientUpSweep(int n, int layer, int shift, int *data) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) + { + return; + } + if (index % (1 << (layer + 1)) == 0) + { + data[index + (1 << (layer + 1)) - 1] += data[index + (1 << layer) - 1]; + } + } + + __global__ void kernScanEfficientDownSweep(int n, int layer, int max, int shift, int *data) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) + { + return; + } + if (index % (1 << (layer + 1)) == 0) + { + int t = data[index + (1 << layer) - 1]; + data[index + (1 << layer) - 1] = data[index + (1 << (layer + 1)) - 1]; + data[index + (1 << (layer + 1)) - 1] += t; + } + } + + __global__ void kernSetLastToZero(int n, int *data) + { + data[n - 1] = 0; + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
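+         * Implemented as a Blelloch scan: the input is zero-padded (at the
+         * front) to the next power of two, then an upsweep (parallel reduce),
+         * a zero of the last element, and a downsweep yield the exclusive scan.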
*/ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int *odata, const int *idata) + { + //next power of 2 + int power = ilog2ceil(n); + int size = 1 << power; + int offset = size - n; + dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize); + int *buf; + cudaMalloc((void **)&buf, size * sizeof(int)); + cudaMemset(buf, 0, size * sizeof(int)); + cudaMemcpy(buf + offset, idata, n * sizeof(int), cudaMemcpyHostToDevice); timer().startGpuTimer(); // TODO + for (int layer = 0; layer < power; layer++) + { + // invoke kernel + int shift = 1 << layer; + kernScanEfficientUpSweep<<>>(size, layer, shift, buf); + cudaDeviceSynchronize(); + } + kernSetLastToZero<<<1, 1>>>(size, buf); + cudaDeviceSynchronize(); + for (int layer = power - 1; layer >= 0; layer--) + { + // invoke kernel + int shift = 1 << layer; + kernScanEfficientDownSweep<<>>(size, layer, power - 1, shift, buf); + cudaDeviceSynchronize(); + // checkCUDAErrorWithLine("cudaDeviceSynchronize buf failed!"); + } timer().endGpuTimer(); + cudaMemcpy(odata, buf + offset, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(buf); } /** @@ -30,11 +94,63 @@ namespace StreamCompaction { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. */ - int compact(int n, int *odata, const int *idata) { + int compact(int n, int *odata, const int *idata) + { + //next power of 2 + int power = ilog2ceil(n); + int size = 1 << power; + int offset = size - n; + dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize); + + int *buf; // power of 2 0 padded copy of idata + cudaMalloc((void **)&buf, size * sizeof(int)); + cudaMemset(buf, 0, size * sizeof(int)); + cudaMemcpy(buf + offset, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + int *bools, *indices, *tmpOut; // + cudaMalloc((void **)&bools, size * sizeof(int)); + cudaMalloc((void **)&indices, size * sizeof(int)); + cudaMalloc((void **)&tmpOut, size * sizeof(int)); timer().startGpuTimer(); // TODO + // Map + Common::kernMapToBoolean<<>>(size, bools, buf); + Common::kernMapToBoolean<<>>(size, indices, buf); + cudaDeviceSynchronize(); + // Scan + for (int layer = 0; layer < power; layer++) + { + // invoke kernel + int shift = 1 << layer; + kernScanEfficientUpSweep<<>>(size, layer, shift, indices); + cudaDeviceSynchronize(); + } + kernSetLastToZero<<<1, 1>>>(size, indices); + cudaDeviceSynchronize(); + for (int layer = power - 1; layer >= 0; layer--) + { + // invoke kernel + int shift = 1 << layer; + kernScanEfficientDownSweep<<>>(size, layer, power - 1, shift, indices); + cudaDeviceSynchronize(); + } + // Scatter + Common::kernScatter<<>>(size, tmpOut, buf, bools, indices); + cudaDeviceSynchronize(); timer().endGpuTimer(); - return -1; + + cudaMemcpy(odata, tmpOut, n * sizeof(int), cudaMemcpyDeviceToHost); + int retSize; + cudaMemcpy(&retSize, indices + size - 1, sizeof(int), cudaMemcpyDeviceToHost); + int tmpLast; + cudaMemcpy(&tmpLast, buf + size - 1, sizeof(int), cudaMemcpyDeviceToHost); + retSize += (tmpLast != 0); + + cudaFree(buf); + cudaFree(bools); + cudaFree(indices); + cudaFree(tmpOut); + return retSize; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..b2a8ba5 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,23 +3,77 @@ #include "common.h" #include "naive.h" -namespace StreamCompaction { - namespace Naive { +namespace StreamCompaction +{ + namespace Naive + { using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() + 
PerformanceTimer &timer() { static PerformanceTimer timer; return timer; } - // TODO: __global__ + __global__ void kernScanNaive(int n, int layer, int offset, int *odata, const int *idata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) + { + return; + } + int tmp = idata[index]; + odata[index] = tmp + ((index >= offset) ? idata[index - offset] : 0); + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int *odata, const int *idata) + { + int power = ilog2ceil(n); + int size = 1 << power; + int offset = size - n; + dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize); + int *bufA; + int *bufB; + cudaMalloc((void **)&bufA, size * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc bufA failed!"); + cudaMalloc((void **)&bufB, size * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc bufB failed!"); + + cudaMemset(bufA, 0, size * sizeof(int)); + cudaMemset(bufB, 0, size * sizeof(int)); + cudaMemcpy(bufA + offset, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAErrorWithLine("cudaMemcpy failed!"); + int *tmp; timer().startGpuTimer(); // TODO + for (int layer = 0; layer < power; layer++) + { + // invoke kernel + int offset = 1 << layer; + checkCUDAErrorWithLine("loop start failed!"); + kernScanNaive<<>>(size, layer, offset, bufB, bufA); + checkCUDAErrorWithLine("kernscan failed!"); + cudaDeviceSynchronize(); + checkCUDAErrorWithLine("cudaDeviceSync failed!"); + // swap bufA and bufB + tmp = bufA; + bufA = bufB; + bufB = tmp; + } + checkCUDAErrorWithLine("before memcpy failed!"); + + // cudaDeviceSynchronize(); timer().endGpuTimer(); + cudaMemcpy(odata + 1, bufA + offset, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAErrorWithLine("cudaMemcpy failed!"); + + odata[0] = 0; + // cudaDeviceSynchronize(); + cudaFree(bufB); + checkCUDAErrorWithLine("cudaFree bufB failed!"); + cudaFree(bufA); + checkCUDAErrorWithLine("cudaFree bufA failed!"); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..afe7719 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -6,10 +6,12 @@ #include "common.h" #include "thrust.h" -namespace StreamCompaction { - namespace Thrust { +namespace StreamCompaction +{ + namespace Thrust + { using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() + PerformanceTimer &timer() { static PerformanceTimer timer; return timer; @@ -17,12 +19,18 @@ namespace StreamCompaction { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int *odata, const int *idata) + { + thrust::host_vector hv_idata(idata, idata + n); + thrust::device_vector dv_idata(hv_idata); + thrust::device_vector dv_odata(n); timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dv_idata.begin(), dv_idata.end(), dv_odata.begin()); timer().endGpuTimer(); + thrust::copy(dv_odata.begin(), dv_odata.end(), odata); } } }