diff --git a/Performance.xlsx b/Performance.xlsx new file mode 100644 index 0000000..d28cb65 Binary files /dev/null and b/Performance.xlsx differ diff --git a/README.md b/README.md index 0e38ddb..f52b2db 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,126 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Yuxuan Zhu + * [LinkedIn](https://www.linkedin.com/in/andrewyxzhu/) +* Tested on: Windows 10, i7-7700HQ @ 2.80GHz 16GB, GTX 1050 4096MB (Personal Laptop) -### (TODO: Your README) +**Introduction** -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +This project implemented different versions of the exclusive scanning algorithm and the stream compaction algorithm. +I implemented one CPU version of exclusive scanning and four GPU versions of exclusive scanning. +The CPU version of exclusive scanning supports in-place scanning operation by using local variables. It is surprisingly fast, even for +millions of elements. +The first GPU version of exclusive scanning is naive scanning, which iterativesly sums the array elements and does O(nlog n) computations. +The second GPU version of exclusive scanning is the work efficient version, which consists of an up-sweep portion and a down-sweep portion. I +launched different number of threads for each level of computation to reduce the number of "wasted" threads and improve performance. +The third GPU version is optimized based on the second version. I used shared memory to decrease the freqeuncy of global memory access. It is +roughly twice as fast as the second version. It is also faster than the CPU version for larger arrays. This is considered extra credit. 
+The fourth GPU version is done by calling the thrust library. It is highly optimized and extremely fast for large arrays. + +I also implemented two CPU versions of stream compaction and two GPU versions of stream compaction. +The first CPU version does stream compaction iteratively. It is quite fast. +The second CPU version simulates the stream compaction algorithm on a GPU by using scanning. It is a lot slower. +The first GPU version uses work efficient scanning to implement the stream compaction. It is quite fast. +The second GPU version uses the optimized work efficient scanning to implement stream compaction. It is also faster than the first version and often faster than +the CPU version. + +**Performance Analysis** + +I empirically found that the best block size is 256 for all versions of scan. + +![Scan](img/Scan_Performance.png) + +The graph above shows the performance comparsion among the different versions of the algorithm. The CPU version is almost always faster, unless the array +size is extremely large. The thrust library performance is always strictly better than my implementations. This is reasonable since I did not optimize everything due to +my current limited understanding of GPU performance. For example, I did not try to reduce bank conflicts. I don't know what's happening under the hood for the thrust version of scanning yet. I tried to analyze performance bottle necks of my algorithms by commenting out certain kernels and checking how much improvement in performance I get. For example, I realized the kernel that adds block increment to each element in a block is very slow due to frequent read/write of global memory. I optimized the code to use shared memory to improve the bottleneck. + +![Compaction](img/Stream_Compaction_Performance.png) + +The graph above shows the performance comparision among different versions of stream compaction algorithm. 
The CPU version was outperformed by my optimized GPU version when the array size was larger than 100,000.
244933830 244933839 ] + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 1 1 0 3 3 1 2 3 1 3 0 2 1 ... 3 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 31.6402ms (std::chrono Measured) + [ 1 1 3 3 1 2 3 1 3 2 1 2 1 ... 2 3 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 28.173ms (std::chrono Measured) + [ 1 1 3 3 1 2 3 1 3 2 1 2 1 ... 3 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 52.5519ms (std::chrono Measured) + [ 1 1 3 3 1 2 3 1 3 2 1 2 1 ... 2 3 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 21.0639ms (CUDA Measured) + [ 1 1 3 3 1 2 3 1 3 2 1 2 1 ... 2 3 ] + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 20.3782ms (CUDA Measured) + [ 1 1 3 3 1 2 3 1 3 2 1 2 1 ... 3 2 ] + passed +==== optimized work-efficient compact, power-of-two ==== + elapsed time: 11.0875ms (CUDA Measured) + [ 1 1 3 3 1 2 3 1 3 2 1 2 1 ... 2 3 ] + passed +==== optimized work-efficient compact, non-power-of-two ==== + elapsed time: 11.0332ms (CUDA Measured) + [ 1 1 3 3 1 2 3 1 3 2 1 2 1 ... 
3 2 ] + passed + + +``` diff --git a/img/Scan_Performance.png b/img/Scan_Performance.png new file mode 100644 index 0000000..3ee53af Binary files /dev/null and b/img/Scan_Performance.png differ diff --git a/img/Stream_Compaction_Performance.png b/img/Stream_Compaction_Performance.png new file mode 100644 index 0000000..34a3f8f Binary files /dev/null and b/img/Stream_Compaction_Performance.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..6c83f11 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 10000000; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -51,11 +51,11 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); + //* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan + /**onesArray(SIZE, c); printDesc("1s array for finding bugs"); StreamCompaction::Naive::scan(SIZE, c, a); printArray(SIZE, c, true); */ @@ -64,35 +64,50 @@ int main(int argc, char* argv[]) { printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); 
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); + zeroArray(SIZE, c); + printDesc("optimized work-efficient scan, power-of-two"); + StreamCompaction::Efficient::optimizedScan(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("optimized work-efficient scan, non-power-of-two"); + StreamCompaction::Efficient::optimizedScan(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -137,14 +152,28 @@ int main(int argc, char* argv[]) { printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); 
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + + zeroArray(SIZE, c); + printDesc("optimized work-efficient compact, power-of-two"); + count = StreamCompaction::Efficient::optimizedCompact(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + printDesc("optimized work-efficient compact, non-power-of-two"); + count = StreamCompaction::Efficient::optimizedCompact(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..e403b14 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,15 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. 
*/ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < n) { + if (idata[index] == 0) { + bools[index] = 0; + } + else { + bools[index] = 1; + } + } } /** @@ -32,7 +40,10 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < n && bools[index] == 1) { + odata[indices[index]] = idata[index]; + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..f15181e 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -12,6 +12,8 @@ #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +#define blockSize 256 +#define efficientBlockSize 256 /** * Check for CUDA errors; print and exit if there was a problem. diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..54bd316 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -19,7 +19,14 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int curr; + int sum = idata[0]; + odata[0] = 0; + for (int i = 1; i < n; i++) { + curr = idata[i]; + odata[i] = sum; + sum += curr; + } timer().endCpuTimer(); } @@ -30,9 +37,14 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int count = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[count++] = idata[i]; + } + } timer().endCpuTimer(); - return -1; + return count; } /** @@ -41,10 +53,28 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. 
*/ int compactWithScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO + int* indicator = new int[n]; + timer().startCpuTimer(); + for (int i = 0; i < n; i++) { + if (idata[i] == 0) { + indicator[i] = 0; + } + else { + indicator[i] = 1; + } + } + odata[0] = 0; // odata is currently the array storing the exclusive scan result + for (int i = 1; i < n; i++) { + odata[i] = odata[i - 1] + indicator[i - 1]; + } + int count = odata[n - 1] + indicator[n - 1]; + for (int i = 0; i < n; i++) { + if (indicator[i] != 0) { + odata[odata[i]] = idata[i]; //odata is now the compacted array + } + } timer().endCpuTimer(); - return -1; + return count; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..6cabcf3 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -16,9 +16,145 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int round = 1 << ilog2ceil(n); + int* dev_data; + cudaMalloc((void**)&dev_data, round * sizeof(int)); + checkCUDAError("cudaMalloc dev_data failed!"); + cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice); timer().startGpuTimer(); - // TODO + for (int i = 0; i < ilog2(round); i++) { + int numThreads = (round >> (i + 1)); + dim3 fullBlocksPerGrid(numThreads / efficientBlockSize); + if (fullBlocksPerGrid.x == 0) { + kernUpSweep << <1, numThreads >> > (round, i, dev_data); + } + else { + kernUpSweep << > > (round, i, dev_data); + } + checkCUDAError("kernUpSweep failed!"); + cudaDeviceSynchronize(); + } + kernSetRootZero << <1, 1 >> > (round, dev_data); + for (int i = ilog2(round) - 1; i >= 0; i--) { + int numThreads = (round >> (i + 1)); + dim3 fullBlocksPerGrid(numThreads / efficientBlockSize); + if (fullBlocksPerGrid.x == 0) { + kernDownSweep << <1, numThreads >> > (round, i, dev_data); + } + else { + kernDownSweep << > > (round, i, 
dev_data); + } + checkCUDAError("kernDownSweep failed!"); + cudaDeviceSynchronize(); + } timer().endGpuTimer(); + cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaFree(dev_data); + } + + __global__ void kernUpSweep(int n, int level, int* arr) { + int index = (blockIdx.x * blockDim.x + threadIdx.x) << (level + 1); + if (index < n) { + arr[index + (1 << (level + 1)) - 1] += arr[index + (1 << level) - 1]; + } + } + + + __global__ void kernSetRootZero(int n, int* arr) { + arr[n - 1] = 0; + } + + __global__ void kernDownSweep(int n, int level, int* arr) { + int index = (blockIdx.x * blockDim.x + threadIdx.x) << (level + 1); + if (index < n) { + int left = arr[index + (1 << level) - 1]; + arr[index + (1 << level) - 1] = arr[index + (1 << (level + 1)) - 1]; + arr[index + (1 << (level + 1)) - 1] += left; + } + } + + __global__ void kernScanShared(int n, int logn, int* arr, int* sums) { + __shared__ int sArr[2 * efficientBlockSize]; + int index = 2 * (blockIdx.x * blockDim.x + threadIdx.x); + if (index < n) { + sArr[2 * threadIdx.x] = arr[index]; + sArr[2 * threadIdx.x + 1] = arr[index + 1]; + for (int i = 0; i < logn; i++) { + __syncthreads(); + if (threadIdx.x < (blockDim.x >> i)) { + sArr[(threadIdx.x << (i + 1)) + (1 << (i + 1)) - 1] += sArr[(threadIdx.x << (i + 1)) + (1 << i) - 1]; + } + } + __syncthreads(); + if (threadIdx.x == 0) { + sums[blockIdx.x] = sArr[2 * blockDim.x - 1]; + sArr[2 * blockDim.x - 1] = 0; + } + for (int i = logn - 1; i >= 0; i--) { + __syncthreads(); + if (threadIdx.x < (blockDim.x >> i)) { + int left = sArr[(threadIdx.x << (i + 1)) + (1 << i) - 1]; + sArr[(threadIdx.x << (i + 1)) + (1 << i) - 1] = sArr[(threadIdx.x << (i + 1)) + (1 << (i + 1)) - 1]; + sArr[(threadIdx.x << (i + 1)) + (1 << (i + 1)) - 1] += left; + } + } + __syncthreads(); + arr[index] = sArr[2 * threadIdx.x]; + arr[index + 1] = sArr[2 * threadIdx.x + 1]; + __syncthreads(); + } + } + + + void optimizedScan(int n, int* odata, const int* idata) { + int 
round = 1 << ilog2ceil(n); + int* dev_data; + cudaMalloc((void**)&dev_data, round * sizeof(int)); + checkCUDAError("cudaMalloc dev_data failed!"); + cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + timer().startGpuTimer(); + optimizedScanRecursive(round, dev_data); + timer().endGpuTimer(); + cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaFree(dev_data); + } + + + void optimizedScanRecursive(int n, int* dev_data) { + int round = 1 << ilog2ceil(n); + dim3 fullBlocksPerGrid(round / (2 * efficientBlockSize)); + int* dev_sum; + if (fullBlocksPerGrid.x == 0) { + int logn = ilog2(round); + cudaMalloc((void**)&dev_sum, 1 * sizeof(int)); + kernScanShared << <1, round / 2>> > (round, logn, dev_data, dev_sum); + } + else { + cudaMalloc((void**)&dev_sum, fullBlocksPerGrid.x * sizeof(int)); + int logn = ilog2(efficientBlockSize) + 1; + kernScanShared << > > (round, logn, dev_data, dev_sum); + } + cudaDeviceSynchronize(); + if (fullBlocksPerGrid.x > 1) { + optimizedScanRecursive(fullBlocksPerGrid.x, dev_sum); + cudaDeviceSynchronize(); + fullBlocksPerGrid.x = (round / efficientBlockSize); + kernBlockIncrement << > > (round, dev_data, dev_sum); + cudaDeviceSynchronize(); + } + cudaFree(dev_sum); + + } + + __global__ void kernBlockIncrement(int n, int* data, int* increment) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ int offset; + if (threadIdx.x == 0) { + offset = increment[index / (2 * efficientBlockSize)]; + } + __syncthreads(); + data[index] += offset; + } /** @@ -31,10 +167,91 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. 
*/ int compact(int n, int *odata, const int *idata) { + int round = 1 << ilog2ceil(n); + int* dev_scan, *dev_bool, *dev_idata, *dev_odata; + cudaMalloc((void**)&dev_scan, round * sizeof(int)); + checkCUDAError("cudaMalloc dev_scan failed!"); + cudaMalloc((void**)&dev_bool, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_bool failed!"); + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_idata failed!"); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_odata failed!"); + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + timer().startGpuTimer(); + dim3 fullBlocksPerGrid((n + efficientBlockSize - 1) / efficientBlockSize); + Common::kernMapToBoolean << > > (n, dev_bool, dev_idata); + checkCUDAError("kernMapToBoolean failed!"); + cudaMemcpy(dev_scan, dev_bool, sizeof(int) * n, cudaMemcpyDeviceToDevice); + for (int i = 0; i < ilog2(round); i++) { + int numThreads = (round >> (i + 1)); + dim3 fullBlocksPerGrid(numThreads / efficientBlockSize); + if (fullBlocksPerGrid.x == 0) { + kernUpSweep << <1, numThreads >> > (round, i, dev_scan); + } + else { + kernUpSweep << > > (round, i, dev_scan); + } + checkCUDAError("kernUpSweep failed!"); + cudaDeviceSynchronize(); + } + kernSetRootZero << <1, 1 >> > (round, dev_scan); + for (int i = ilog2(round) - 1; i >= 0; i--) { + int numThreads = (round >> (i + 1)); + dim3 fullBlocksPerGrid(numThreads / efficientBlockSize); + if (fullBlocksPerGrid.x == 0) { + kernDownSweep << <1, numThreads >> > (round, i, dev_scan); + } + else { + kernDownSweep << > > (round, i, dev_scan); + } + checkCUDAError("kernDownSweep failed!"); + cudaDeviceSynchronize(); + } + Common::kernScatter << > > (n, dev_odata, dev_idata, dev_bool, dev_scan); + checkCUDAError("kernScatter failed!"); + timer().endGpuTimer(); + cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); + int count, lastBool; + cudaMemcpy(&count, &dev_scan[n-1], sizeof(int), 
cudaMemcpyDeviceToHost); + cudaMemcpy(&lastBool, &dev_bool[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_scan); + cudaFree(dev_bool); + cudaFree(dev_idata); + cudaFree(dev_odata); + return count + lastBool; + } + + int optimizedCompact(int n, int* odata, const int* idata) { + int round = 1 << ilog2ceil(n); + int* dev_scan, * dev_bool, * dev_idata, * dev_odata; + cudaMalloc((void**)&dev_scan, round * sizeof(int)); + checkCUDAError("cudaMalloc dev_scan failed!"); + cudaMalloc((void**)&dev_bool, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_bool failed!"); + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_idata failed!"); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_odata failed!"); + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); timer().startGpuTimer(); - // TODO + dim3 fullBlocksPerGrid((n + efficientBlockSize - 1) / efficientBlockSize); + Common::kernMapToBoolean << > > (n, dev_bool, dev_idata); + checkCUDAError("kernMapToBoolean failed!"); + cudaMemcpy(dev_scan, dev_bool, sizeof(int) * n, cudaMemcpyDeviceToDevice); + optimizedScanRecursive(n, dev_scan); + Common::kernScatter << > > (n, dev_odata, dev_idata, dev_bool, dev_scan); + checkCUDAError("kernScatter failed!"); timer().endGpuTimer(); - return -1; + cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); + int count, lastBool; + cudaMemcpy(&count, &dev_scan[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&lastBool, &dev_bool[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_scan); + cudaFree(dev_bool); + cudaFree(dev_idata); + cudaFree(dev_odata); + return count + lastBool; } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..5921571 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -7,7 +7,16 @@ namespace StreamCompaction { StreamCompaction::Common::PerformanceTimer& timer(); void 
scan(int n, int *odata, const int *idata); + void optimizedScan(int n, int* odata, const int* idata); + void optimizedScanRecursive(int n, int* dev_data); int compact(int n, int *odata, const int *idata); + int optimizedCompact(int n, int* odata, const int* idata); + + __global__ void kernUpSweep(int n, int level, int* arr); + __global__ void kernDownSweep(int n, int level, int* arr); + __global__ void kernSetRootZero(int n, int* arr); + __global__ void kernScanShared(int n, int logn, int* arr); + __global__ void kernBlockIncrement(int n, int* data, int* increment); } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..19bafa0 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,6 +3,8 @@ #include "common.h" #include "naive.h" + + namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; @@ -17,9 +19,51 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + int* dev_data1, * dev_data2; + cudaMalloc((void**)&dev_data1, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_data1 failed!"); + cudaMalloc((void**)&dev_data2, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_data2 failed!"); + cudaMemcpy(dev_data1, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + cudaMemcpy(dev_data2, idata, sizeof(int), cudaMemcpyHostToDevice); + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); timer().startGpuTimer(); - // TODO + for (int i = 0; i < ilog2ceil(n); i++) { + kernParallelScan << > > (n, i, dev_data1, dev_data2); + checkCUDAError("kernParallelScan failed!"); + cudaDeviceSynchronize(); + int* temp = dev_data1; + dev_data1 = dev_data2; + dev_data2 = temp; + } + kernInclusiveToExclusive << > > (n, dev_data1, dev_data2); + checkCUDAError("kernInclusiveToExclusive failed!"); timer().endGpuTimer(); + cudaMemcpy(odata, dev_data2, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaFree(dev_data1); + cudaFree(dev_data2); + } + + __global__ void kernParallelScan(int n, int level, int *src, int *dest) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < n) { + if (index >= (1 << level)) { + dest[index] = src[index - (1 << level)] + src[index]; + } + else { + dest[index] = src[index]; + } + } + } + + __global__ void kernInclusiveToExclusive(int n, int *src, int *dest) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index == 0) { + dest[index] = 0; + } else if (index < n){ + dest[index] = src[index - 1]; + } } } } + diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 37dcb06..f28c981 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -7,5 +7,7 @@ namespace StreamCompaction { StreamCompaction::Common::PerformanceTimer& timer(); void scan(int n, int *odata, const int *idata); + __global__ void kernParallelScan(int n, int level, int* src, int* dest); + __global__ void kernInclusiveToExclusive(int n, int* src, 
int* dest); } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..47343b2 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,19 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int* dev_idata; + cudaMalloc((void**)&dev_idata, sizeof(int) * n); + checkCUDAError("cudaMalloc dev_idata failed!"); + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + thrust::device_ptr dev_thrust_idata(dev_idata); timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dev_thrust_idata, dev_thrust_idata + n, dev_thrust_idata); timer().endGpuTimer(); + cudaMemcpy(odata, dev_idata, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaFree(dev_idata); } } }