37 changes: 28 additions & 9 deletions README.md
@@ -1,14 +1,33 @@
CUDA Stream Compaction
======================

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
* Nithin Pranesh
* Here are some links to connect with me: [LinkedIn](https://www.linkedin.com/in/nithin-pranesh), [YouTube](https://www.youtube.com/channel/UCAQwYrQblfN8qeDW28KkH7g/featured), [Twitter](https://twitter.com/NithinPranesh1).
* Tested on: XPS 15 7590, Windows 20H2, i7-9750H @ 2.60GHz, 22GB RAM, GTX 1650.

### Project 2: CUDA Stream Compaction

### Overview

This is a series of implementations of the scan (prefix-sum) and stream compaction algorithms (a worked example follows the definitions below).
- Scan is an array operation that produces an identically sized array in which each index holds the partial sum of the input up to (and, for an inclusive scan, including) that index.
- Compaction is an array operation that produces the same list with all identity elements (here, zeros) removed. This can compress a large, sparse array down to just its meaningful data.

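As a concrete illustration of both operations, here is a minimal CPU-only sketch (standalone example code, not taken from this repository):

```cpp
#include <cstdio>

int main() {
    const int n = 6;
    int in[n] = {1, 0, 3, 0, 5, 2};

    // exclusive scan: scanned[i] = sum of in[0..i-1]
    int scanned[n];
    int sum = 0;
    for (int i = 0; i < n; ++i) {
        scanned[i] = sum;
        sum += in[i];
    }
    // scanned is now {0, 1, 1, 4, 4, 9}

    // compaction: keep only the nonzero elements
    int out[n];
    int count = 0;
    for (int i = 0; i < n; ++i) {
        if (in[i] != 0) {
            out[count++] = in[i];
        }
    }
    // out is now {1, 3, 5, 2} with count == 4
    for (int i = 0; i < count; ++i) {
        printf("%d ", out[i]);
    }
    return 0;
}
```
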
This project includes:
- A baseline CPU implementation of scan and compact.
- A naive CUDA GPU-based approach for implementing scan.
- A work-efficient CUDA GPU-based approach for scan and an implementation of compact using this implementation of scan.
- A wrapper for the scan implementation from the "thrust" library.

### Analysis

![Scan runtime comparison across implementations and array sizes](img/scan-chart.jpg)

Without a more thorough optimization pass, the results look counterintuitive. For instance, the CPU implementation is the fastest until the array size becomes very large. Similarly, the work-efficient implementation is paradoxically slower than the naive approach until very large array sizes.

The CPU and naive GPU approaches are likely close to, if not exactly, optimal due to their simplicity. The work-efficient implementation, on the other hand, has an obvious optimization available: while the algorithm itself conserves work, the current code launches a full grid of threads on every sweep level, even when most of those threads will immediately exit without doing any work. Launching only as many threads as each level needs should yield an immediate performance improvement; a sketch follows.
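
A rough sketch of that change, assuming the kernels and `blockSize` constant already defined in this project (this is not the current code):

```cpp
// With the kernel's indexing (index = 2 * stride * tid - 1), thread ids
// 1 .. n / (2 * stride) do real work at each level, so a grid of
// n / (2 * stride) + 1 threads suffices instead of one sized for n.
for (int stride = 1; stride < n; stride <<= 1) {
    int threads = n / (2 * stride) + 1;
    dim3 blocks((threads + blockSize - 1) / blockSize);
    kernWorkEfficientUpSweepStep<<<blocks, blockSize>>>(n, stride, dev_buf);
    checkCUDAError("kernWorkEfficientUpSweepStep failed!");
}
// the down-sweep loop can shrink its launches the same way
```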

Lastly, varying the block size for each implementation while holding the array size fixed yields no significant improvement or deterioration. The typical block size of 128 seems to work fine.

![Screenshot of the output](img/proj2-results.jpg)

Binary file added img/proj2-results.jpg
Binary file added img/scan-chart.jpg
12 changes: 10 additions & 2 deletions stream_compaction/common.cu
@@ -23,7 +23,10 @@ namespace StreamCompaction {
* which map to 0 will be removed, and elements which map to 1 will be kept.
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
int index = threadIdx.x + (blockIdx.x * blockDim.x);
// index is never negative, so only the upper bound needs checking
if (index < n) {
bools[index] = idata[index] != 0;
}
}

/**
@@ -32,7 +35,12 @@
*/
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices) {
int index = threadIdx.x + (blockIdx.x * blockDim.x);
if (index < n && bools[index]) {
odata[indices[index]] = idata[index];
}
}

}
1 change: 1 addition & 0 deletions stream_compaction/common.h
@@ -12,6 +12,7 @@

#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
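// number of threads per block used for every kernel launch in this project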
#define blockSize 128

/**
* Check for CUDA errors; print and exit if there was a problem.
47 changes: 42 additions & 5 deletions stream_compaction/cpu.cu
@@ -12,14 +12,24 @@ namespace StreamCompaction {
return timer;
}

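// exclusive scan on the CPU; returns the total sum of the input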
int _scan(int n, int* odata, const int* idata) {
int sum = 0;
for (int i = 0; i < n; ++i) {
odata[i] = sum;
sum += idata[i];
}

return sum;
}

/**
* CPU scan (prefix sum).
* For performance analysis, this is supposed to be a simple for loop.
* (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
*/
void scan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
_scan(n, odata, idata);
timer().endCpuTimer();
}

@@ -30,9 +40,17 @@
*/
int compactWithoutScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();

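// copy each nonzero element forward; j tracks the next write index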
int j = 0;
for (int i = 0; i < n; ++i) {
if (idata[i] != 0) {
odata[j] = idata[i];
++j;
}
}

timer().endCpuTimer();
return j;
}

/**
@@ -41,10 +59,29 @@
* @returns the number of elements remaining after compaction.
*/
int compactWithScan(int n, int *odata, const int *idata) {
int* bitmap = (int*)std::malloc(n * sizeof(int));
int* scannedBitmap = (int*)std::malloc(n * sizeof(int));

timer().startCpuTimer();

// map array to 0s and 1s
for (int i = 0; i < n; ++i) {
bitmap[i] = idata[i] != 0;
}

int count = _scan(n, scannedBitmap, bitmap);
// scatter: each kept element's exclusive-scan value is its output index
// (checking bitmap[i] directly also handles the final element, which a
// scannedBitmap[i] != scannedBitmap[i + 1] comparison would miss)
for (int i = 0; i < n; ++i) {
if (bitmap[i]) {
odata[scannedBitmap[i]] = idata[i];
}
}

timer().endCpuTimer();

std::free(bitmap);
std::free(scannedBitmap);

return count;
}
}
}
118 changes: 113 additions & 5 deletions stream_compaction/efficient.cu
@@ -11,14 +11,67 @@ namespace StreamCompaction {
static PerformanceTimer timer;
return timer;
}

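// one level of the up-sweep (parallel reduction): each active thread adds
// the partial sum `stride` elements to its left into its own slot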
__global__ void kernWorkEfficientUpSweepStep(int n, int stride, int* data) {
int index = 2 * stride * (threadIdx.x + (blockIdx.x * blockDim.x)) - 1;
if (index >= stride && index < n) {
data[index] += data[index - stride];
}
}

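// one level of the down-sweep: each active thread hands its old value down
// to the left child and accumulates the left child's value into itself,
// which turns the reduction tree into an exclusive scan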
__global__ void kernWorkEfficientDownSweepStep(int n, int stride, int* data) {
int index = 2 * stride * (threadIdx.x + (blockIdx.x * blockDim.x)) - 1;
if (index >= stride && index < n) {
int oldValue = data[index];
data[index] += data[index - stride];
data[index - stride] = oldValue;
}
}

/**
* Performs prefix-sum (aka scan) on the buffer in place. Expects n to be a power of two; the caller pads the buffer to that length.
*/
void _scan(int n, int *dev_buf) {
dim3 fullBlocksPerGrid = ((n + blockSize - 1) / blockSize);

// up-sweep phase (note: each level launches a full grid sized for n, though
// only n / (2 * stride) threads do any work; see the README analysis)
for (int stride = 1; stride < n; stride <<= 1) {
kernWorkEfficientUpSweepStep << <fullBlocksPerGrid, blockSize >> > (n, stride, dev_buf);
checkCUDAError("kernWorkEfficientUpSweepStep failed!");
}

// down-sweep phase: zero the root, then sweep the partial sums back down
cudaMemset(&dev_buf[n - 1], 0, sizeof(int));
for (int stride = n >> 1; stride > 0; stride >>= 1) {
kernWorkEfficientDownSweepStep << <fullBlocksPerGrid, blockSize >> > (n, stride, dev_buf);
checkCUDAError("kernWorkEfficientDownSweepStep failed!");
}
}

/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
int nPow2 = 1 << ilog2ceil(n);

int* dev_buf;
cudaMalloc((void**)&dev_buf, sizeof(int) * nPow2);
checkCUDAError("cudaMalloc dev_buf failed!");

cudaMemcpy(dev_buf, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
checkCUDAError("cudaMemcpy to device failed!");

// zero the power-of-two padding so no uninitialized values enter the sweeps
cudaMemset(dev_buf + n, 0, sizeof(int) * (nPow2 - n));
checkCUDAError("cudaMemset padding failed!");

timer().startGpuTimer();

_scan(nPow2, dev_buf);

timer().endGpuTimer();

cudaMemcpy(odata, dev_buf, sizeof(int) * n, cudaMemcpyDeviceToHost);
checkCUDAError("cudaMemcpy from device failed!");

cudaFree(dev_buf);
checkCUDAError("cudaFree dev_buf failed!");
}

/**
@@ -31,10 +84,65 @@
* @returns The number of elements remaining after compaction.
*/
int compact(int n, int *odata, const int *idata) {
dim3 fullBlocksPerGrid = ((n + blockSize - 1) / blockSize);

int nPow2 = 1 << ilog2ceil(n);

int* dev_input;
cudaMalloc((void**)&dev_input, sizeof(int) * n);
checkCUDAError("cudaMalloc dev_input failed!");

cudaMemcpy(dev_input, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
checkCUDAError("cudaMemcpy input to device failed!");

int* dev_bools;
cudaMalloc((void**)&dev_bools, sizeof(int) * n);
checkCUDAError("cudaMalloc dev_bools failed!");

int* dev_indices;
cudaMalloc((void**)&dev_indices, sizeof(int) * nPow2);
checkCUDAError("cudaMalloc dev_indices failed!");

timer().startGpuTimer();
// compact = map to booleans, exclusive-scan the booleans to get each kept
// element's output index, then scatter the kept elements to those indices

Common::kernMapToBoolean << <fullBlocksPerGrid, blockSize >> > (n, dev_bools, dev_input);
checkCUDAError("kernMapToBoolean failed!");

cudaMemcpy(dev_indices, dev_bools, sizeof(int) * n, cudaMemcpyDeviceToDevice);
checkCUDAError("cudaMemcpy from device to device failed!");

// zero the power-of-two padding before scanning
cudaMemset(dev_indices + n, 0, sizeof(int) * (nPow2 - n));
checkCUDAError("cudaMemset padding failed!");

_scan(nPow2, dev_indices);

// the exclusive scan excludes the final element, so read the scan value at
// n - 1 and add the final element's boolean back in to get the total count
int count = 0;
cudaMemcpy(&count, &dev_indices[n - 1], sizeof(int), cudaMemcpyDeviceToHost);
checkCUDAError("cudaMemcpy from device failed!");
count += idata[n - 1] != 0;

int* dev_output;
cudaMalloc((void**)&dev_output, sizeof(int) * count);
checkCUDAError("cudaMalloc dev_output failed!");

Common::kernScatter << <fullBlocksPerGrid, blockSize >> > (n, dev_output, dev_input, dev_bools, dev_indices);
checkCUDAError("kernScatter failed!");

timer().endGpuTimer();

cudaMemcpy(odata, dev_output, sizeof(int) * count, cudaMemcpyDeviceToHost);
checkCUDAError("cudaMemcpy output to host failed!");

cudaFree(dev_input);
checkCUDAError("cudaFree dev_input failed!");

cudaFree(dev_output);
checkCUDAError("cudaFree dev_output failed!");

cudaFree(dev_bools);
checkCUDAError("cudaFree dev_bools failed!");

cudaFree(dev_indices);
checkCUDAError("cudaFree dev_indices failed!");

return count;
}
}
}
44 changes: 42 additions & 2 deletions stream_compaction/naive.cu
@@ -11,15 +11,55 @@ namespace StreamCompaction {
static PerformanceTimer timer;
return timer;
}

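// one step of the naive (Hillis-Steele) scan: each element at index >= offset
// adds in the element `offset` positions to its left; separate input and
// output buffers avoid read-after-write hazards within a step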
__global__ void kernNaiveScanStep(int n, int offset, int* odata, const int* idata) {
int index = threadIdx.x + (blockIdx.x * blockDim.x);
if (index < n) {
if (index >= offset) {
odata[index] = idata[index - offset] + idata[index];
}
else {
odata[index] = idata[index];
}
}
}

/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
dim3 fullBlocksPerGrid = ((n + blockSize - 1) / blockSize);

int* dev_buf0;
cudaMalloc((void**)&dev_buf0, n * sizeof(int));
checkCUDAError("cudaMalloc dev_buf0 failed!");

int* dev_buf1;
cudaMalloc((void**)&dev_buf1, n * sizeof(int));
checkCUDAError("cudaMalloc dev_buf0 failed!");

cudaMemcpy(dev_buf0, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
checkCUDAError("cudaMemcpy to device failed!");

timer().startGpuTimer();

for (int offset = 1; offset < n; offset <<= 1) {
kernNaiveScanStep << <fullBlocksPerGrid, blockSize >> > (n, offset, dev_buf1, dev_buf0);
checkCUDAError("kernNaiveScanStep failed!");

std::swap(dev_buf0, dev_buf1);
}

timer().endGpuTimer();

// dev_buf0 holds an inclusive scan; shift right by one and set
// odata[0] = 0 to convert it to an exclusive scan
odata[0] = 0;
cudaMemcpy(&odata[1], dev_buf0, sizeof(int) * (n - 1), cudaMemcpyDeviceToHost);
checkCUDAError("cudaMemcpy from device failed!");

cudaFree(dev_buf0);
checkCUDAError("cudaFree dev_buf0 failed!");

cudaFree(dev_buf1);
checkCUDAError("cudaFree dev_buf1 failed!");
}
}
}
29 changes: 26 additions & 3 deletions stream_compaction/thrust.cu
@@ -18,11 +18,34 @@ namespace StreamCompaction {
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
int* dev_in;
cudaMalloc((void**)&dev_in, sizeof(int) * n);
checkCUDAError("cudaMalloc dev_in failed!");

int* dev_out;
cudaMalloc((void**)&dev_out, sizeof(int) * n);
checkCUDAError("cudaMalloc dev_out failed!");

cudaMemcpy(dev_in, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
checkCUDAError("cudaMemcpy to device failed!");

// wrap the raw device pointers so thrust treats them as device iterators
thrust::device_ptr<int> dev_thrust_in = thrust::device_pointer_cast(dev_in);
thrust::device_ptr<int> dev_thrust_out = thrust::device_pointer_cast(dev_out);

timer().startGpuTimer();

thrust::exclusive_scan(dev_thrust_in, dev_thrust_in + n, dev_thrust_out);

timer().endGpuTimer();

cudaMemcpy(odata, dev_out, sizeof(int) * n, cudaMemcpyDeviceToHost);
checkCUDAError("cudaMemcpy from device failed!");

cudaFree(dev_in);
checkCUDAError("cudaFree dev_in failed!");

cudaFree(dev_out);
checkCUDAError("cudaFree dev_out failed!");
}
}
}