diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..09da644
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,8 @@
+{
+ "files.associations": {
+ "charconv": "cpp",
+ "xstring": "cpp",
+ "xtree": "cpp",
+ "chrono": "cpp"
+ }
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index 0e38ddb..1f5e25e 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,132 @@ CUDA Stream Compaction
**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
-* (TODO) YOUR NAME HERE
- * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Richard Chen
+ * [LinkedIn](https://www.linkedin.com/in/richardrlchen/)
+* Tested on: Windows 11, i7-10875H @ 2.3GHz 16GB, RTX 2060 MAXQ 6GB (PC)
+
+## Overview
+The purpose of this project was to explore parallel algorithms, something GPUs
+excel at. To that end, we used the GPU to perform stream compaction via scan.
+Stream compaction filters elements out of a list, in this case discarding the
+zero elements of the input. Scan (prefix sum) is like a fold, except that every
+intermediate partial result is kept as well.
+
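+As a minimal sketch, an exclusive scan over addition is a single accumulating
+loop (this mirrors the CPU implementation in `stream_compaction/cpu.cu`):
+
+```
+// exclusive scan: odata[i] = sum of idata[0..i-1]
+void exclusiveScan(int n, int *odata, const int *idata) {
+    int acc = 0;
+    for (int i = 0; i < n; i++) {
+        odata[i] = acc;   // total of everything before element i
+        acc += idata[i];
+    }
+}
+// e.g. [3 1 7 0 4 1 6 3] scans to [0 3 4 11 11 15 16 22]
+```
+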
+## Features
+* CPU implementation of scan
+* CPU implementation of stream compaction
+* Naive parallel version of scan for GPU implemented in CUDA
+* Work efficient parallel version of scan for GPU implemented in CUDA
+* Stream Compaction that leverages the GPU scan
+
+## Example Output
+Tested on a 256-element array:
+```
+****************
+** SCAN TESTS **
+****************
+ [ 28 7 23 2 29 27 22 19 2 38 26 47 45 ... 43 0 ]
+==== cpu scan, power-of-two ====
+ elapsed time: 0.0005ms (std::chrono Measured)
+ [ 0 28 35 58 60 89 116 138 157 159 197 223 270 ... 6050 6093 ]
+==== cpu scan, non-power-of-two ====
+ elapsed time: 0.0003ms (std::chrono Measured)
+ [ 0 28 35 58 60 89 116 138 157 159 197 223 270 ... 6016 6031 ]
+ passed
+==== naive scan, power-of-two ====
+ elapsed time: 0.243616ms (CUDA Measured)
+ a[1] = 28, b[1] = 0
+ FAIL VALUE
+==== naive scan, non-power-of-two ====
+ elapsed time: 0.17504ms (CUDA Measured)
+ a[1] = 28, b[1] = 0
+ FAIL VALUE
+==== work-efficient scan, power-of-two ====
+ elapsed time: 0.325728ms (CUDA Measured)
+ passed
+==== work-efficient scan, non-power-of-two ====
+ elapsed time: 0.335584ms (CUDA Measured)
+ passed
+==== thrust scan, power-of-two ====
+ elapsed time: 0.1088ms (CUDA Measured)
+ passed
+==== thrust scan, non-power-of-two ====
+ elapsed time: 0.048928ms (CUDA Measured)
+ passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+ [ 0 3 1 3 1 1 0 2 2 0 0 1 2 ... 3 0 ]
+==== cpu compact without scan, power-of-two ====
+ elapsed time: 0.0013ms (std::chrono Measured)
+ [ 3 1 3 1 1 2 2 1 2 2 2 1 2 ... 3 3 ]
+ passed
+==== cpu compact without scan, non-power-of-two ====
+ elapsed time: 0.0012ms (std::chrono Measured)
+ [ 3 1 3 1 1 2 2 1 2 2 2 1 2 ... 2 2 ]
+ passed
+==== cpu compact with scan ====
+ elapsed time: 0.0015ms (std::chrono Measured)
+ [ 3 1 3 1 1 2 2 1 2 2 2 1 2 ... 3 3 ]
+ passed
+==== work-efficient compact, power-of-two ====
+ elapsed time: 0.36784ms (CUDA Measured)
+ passed
+==== work-efficient compact, non-power-of-two ====
+ elapsed time: 0.3792ms (CUDA Measured)
+ passed
+Press any key to continue . . .
+```
+
+## Performance Analysis
+### Optimizing Block Size
+
+![Scan runtime vs. block size](img/time_blocksize.png)
+
+There does not seem to be a significant difference across block sizes when tested on arrays of size 2^16.
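+
+The block size is a compile-time constant in `common.h`, and every launch
+derives its grid size from it by ceiling division, so varying it only regroups
+the same threads (a condensed sketch of the launch pattern used throughout):
+
+```
+#define blockSize 1024  // threads per block (common.h)
+dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize);  // ceiling division
+kernScanNaive<<<fullBlocksPerGrid, blockSize>>>(size, layer, offset, bufB, bufA);
+```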
+
+### Performance as a Function of Array Length
+
+![Scan runtime vs. array length](img/scan_time.png)
+
+Notice that the time axis is log-scaled. The crossover between the naive and
+work-efficient implementations happens around length 2^19.
+
+### Stream Compaction as a Function of Array Length
+
+![Stream compaction runtime vs. array length](img/stream_time.png)
+
+Notice that the time axis is log-scaled. Even with arrays of up to length
+2^28, the single-threaded CPU is still faster at stream compaction. This goes
+to show that parallelism and concurrency should be applied situationally: in
+many scenarios, the drawbacks outweigh the benefits.
+
+### Nsight
+Profiling the execution on arrays of length 2^16:
+
+![Nsight timeline of the naive, work-efficient, and thrust scans](img/profile_timeline.png)
+
+In the CUDA HW row, green represents host-to-device memory transfers and red
+represents device-to-host. These transfer pairs neatly bracket the naive
+implementation, the work-efficient implementation, and the thrust library call.
+
+The region up to the 936ms mark is the naive GPU version, which takes about
+half as long as the work-efficient implementation. The work-efficient
+implementation iterates through the layers of the "tree" twice, once on the
+upsweep and once on the downsweep. Until the arrays are large enough, the
+savings in total computation do not overcome the overhead of taking twice the
+number of steps. When tested on length 2^20 arrays, the work-efficient
+implementation was faster.
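+
+For reference, here is a worked trace of the two phases on the classic
+8-element example (the array doubles as an in-place binary tree):
+
+```
+input:              [3 1 7 0 4 1 6 3]
+after upsweep:      [3 4 7 11 4 5 6 25]    // last element = total sum
+zero last element:  [3 4 7 11 4 5 6 0]
+after downsweep:    [0 3 4 11 11 15 16 22] // exclusive scan of the input
+```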
+
+The thrust implementation is significantly faster and also makes far fewer
+kernel calls. This seems reasonable: each kernel launch incurs overhead, so
+fitting the operation into fewer kernel calls cuts that cost drastically.
+Additionally, thrust likely has shared-memory optimizations, better memory
+access patterns, and more efficient computation.
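+
+For comparison, the whole operation in `stream_compaction/thrust.cu` boils
+down to one library call (condensed here; the actual code stages the input
+through a host_vector first):
+
+```
+thrust::device_vector<int> dv_in(idata, idata + n);   // copy input to device
+thrust::device_vector<int> dv_out(n);
+thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+thrust::copy(dv_out.begin(), dv_out.end(), odata);    // copy result back
+```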
+
+For all three, the kernel executions have long stretches of idle time between
+them, which suggests some form of I/O bottleneck.
-### (TODO: Your README)
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
diff --git a/img/profile_timeline.png b/img/profile_timeline.png
new file mode 100644
index 0000000..7d85b71
Binary files /dev/null and b/img/profile_timeline.png differ
diff --git a/img/scan_time.png b/img/scan_time.png
new file mode 100644
index 0000000..53641b9
Binary files /dev/null and b/img/scan_time.png differ
diff --git a/img/stream_time.png b/img/stream_time.png
new file mode 100644
index 0000000..84271ce
Binary files /dev/null and b/img/stream_time.png differ
diff --git a/img/time_blocksize.png b/img/time_blocksize.png
new file mode 100644
index 0000000..4363946
Binary files /dev/null and b/img/time_blocksize.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..5562fb8 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,13 +13,14 @@
#include <stream_compaction/thrust.h>
#include "testing_helpers.hpp"
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 12; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int *a = new int[SIZE];
int *b = new int[SIZE];
int *c = new int[SIZE];
-int main(int argc, char* argv[]) {
+int main(int argc, char *argv[])
+{
// Scan tests
printf("\n");
@@ -27,7 +28,7 @@ int main(int argc, char* argv[]) {
printf("** SCAN TESTS **\n");
printf("****************\n");
- genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case
+ genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case
a[SIZE - 1] = 0;
printArray(SIZE, a, true);
@@ -81,7 +82,16 @@ int main(int argc, char* argv[]) {
//printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);
- zeroArray(SIZE, c);
+ // int tmpTest[] = {0, 1, 2, 3, 4, 5, 6, 7};
+ // int *tmpTestOut = new int[8];
+ // zeroArray(8, tmpTestOut);
+ // printDesc("Small array slides example scan");
+ // StreamCompaction::Efficient::scan(8, tmpTestOut, tmpTest);
+ // printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ // //printArray(NPOT, c, true);
+ // printCmpResult(NPOT, b, c);
+ // delete[] tmpTestOut;
+
printDesc("thrust scan, power-of-two");
StreamCompaction::Thrust::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
@@ -102,7 +112,7 @@ int main(int argc, char* argv[]) {
// Compaction tests
- genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case
+ genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case
a[SIZE - 1] = 0;
printArray(SIZE, a, true);
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..d3e45a3 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -1,29 +1,39 @@
#include "common.h"
-void checkCUDAErrorFn(const char *msg, const char *file, int line) {
+void checkCUDAErrorFn(const char *msg, const char *file, int line)
+{
cudaError_t err = cudaGetLastError();
- if (cudaSuccess == err) {
+ if (cudaSuccess == err)
+ {
return;
}
fprintf(stderr, "CUDA error");
- if (file) {
+ if (file)
+ {
fprintf(stderr, " (%s:%d)", file, line);
}
fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
-
-namespace StreamCompaction {
- namespace Common {
+namespace StreamCompaction
+{
+ namespace Common
+ {
/**
* Maps an array to an array of 0s and 1s for stream compaction. Elements
* which map to 0 will be removed, and elements which map to 1 will be kept.
*/
- __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
- // TODO
+ __global__ void kernMapToBoolean(int n, int *bools, const int *idata)
+ {
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= n)
+ {
+ return;
+ }
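+            // 1 = keep (element is non-zero), 0 = discard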
+ bools[index] = !(idata[index] == 0);
}
/**
@@ -31,8 +41,17 @@ namespace StreamCompaction {
* if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
*/
__global__ void kernScatter(int n, int *odata,
- const int *idata, const int *bools, const int *indices) {
- // TODO
+ const int *idata, const int *bools, const int *indices)
+ {
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= n)
+ {
+ return;
+ }
+ if (bools[index])
+ {
+ odata[indices[index]] = idata[index];
+ }
}
}
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index d2c1fed..530c406 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -10,32 +10,41 @@
#include <chrono>
#include <stdexcept>
+#define blockSize 1024
+
#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
+#define checkCUDAErrorWithLine(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
+
/**
* Check for CUDA errors; print and exit if there was a problem.
*/
void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1);
-inline int ilog2(int x) {
+inline int ilog2(int x)
+{
int lg = 0;
- while (x >>= 1) {
+ while (x >>= 1)
+ {
++lg;
}
return lg;
}
-inline int ilog2ceil(int x) {
+inline int ilog2ceil(int x)
+{
return x == 1 ? 0 : ilog2(x - 1) + 1;
}
-namespace StreamCompaction {
- namespace Common {
+namespace StreamCompaction
+{
+ namespace Common
+ {
__global__ void kernMapToBoolean(int n, int *bools, const int *idata);
__global__ void kernScatter(int n, int *odata,
- const int *idata, const int *bools, const int *indices);
+ const int *idata, const int *bools, const int *indices);
/**
* This class is used for timing the performance
@@ -60,7 +69,10 @@ namespace StreamCompaction {
void startCpuTimer()
{
- if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); }
+ if (cpu_timer_started)
+ {
+ throw std::runtime_error("CPU timer already started");
+ }
cpu_timer_started = true;
time_start_cpu = std::chrono::high_resolution_clock::now();
@@ -70,7 +82,10 @@ namespace StreamCompaction {
{
time_end_cpu = std::chrono::high_resolution_clock::now();
- if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); }
+ if (!cpu_timer_started)
+ {
+ throw std::runtime_error("CPU timer not started");
+ }
std::chrono::duration<double, std::milli> duro = time_end_cpu - time_start_cpu;
prev_elapsed_time_cpu_milliseconds =
@@ -81,7 +96,10 @@ namespace StreamCompaction {
void startGpuTimer()
{
- if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); }
+ if (gpu_timer_started)
+ {
+ throw std::runtime_error("GPU timer already started");
+ }
gpu_timer_started = true;
cudaEventRecord(event_start);
@@ -92,7 +110,10 @@ namespace StreamCompaction {
cudaEventRecord(event_end);
cudaEventSynchronize(event_end);
- if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); }
+ if (!gpu_timer_started)
+ {
+ throw std::runtime_error("GPU timer not started");
+ }
cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end);
gpu_timer_started = false;
@@ -109,10 +130,10 @@ namespace StreamCompaction {
}
// remove copy and move functions
- PerformanceTimer(const PerformanceTimer&) = delete;
- PerformanceTimer(PerformanceTimer&&) = delete;
- PerformanceTimer& operator=(const PerformanceTimer&) = delete;
- PerformanceTimer& operator=(PerformanceTimer&&) = delete;
+ PerformanceTimer(const PerformanceTimer &) = delete;
+ PerformanceTimer(PerformanceTimer &&) = delete;
+ PerformanceTimer &operator=(const PerformanceTimer &) = delete;
+ PerformanceTimer &operator=(PerformanceTimer &&) = delete;
private:
cudaEvent_t event_start = nullptr;
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..47fc827 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -3,10 +3,12 @@
#include "common.h"
-namespace StreamCompaction {
- namespace CPU {
+namespace StreamCompaction
+{
+ namespace CPU
+ {
using StreamCompaction::Common::PerformanceTimer;
- PerformanceTimer& timer()
+ PerformanceTimer &timer()
{
static PerformanceTimer timer;
return timer;
@@ -17,9 +19,16 @@ namespace StreamCompaction {
* For performance analysis, this is supposed to be a simple for loop.
* (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
*/
- void scan(int n, int *odata, const int *idata) {
+ void scan(int n, int *odata, const int *idata)
+ {
timer().startCpuTimer();
// TODO
+ int tmpAcc = 0;
+ for (int i = 0; i < n; i++)
+ {
+ odata[i] = tmpAcc;
+ tmpAcc += idata[i];
+ }
timer().endCpuTimer();
}
@@ -28,11 +37,21 @@ namespace StreamCompaction {
*
* @returns the number of elements remaining after compaction.
*/
- int compactWithoutScan(int n, int *odata, const int *idata) {
+ int compactWithoutScan(int n, int *odata, const int *idata)
+ {
timer().startCpuTimer();
// TODO
+ int curIdx = 0;
+ for (int i = 0; i < n; i++)
+ {
+ if (idata[i] != 0)
+ {
+ odata[curIdx] = idata[i];
+ curIdx++;
+ }
+ }
timer().endCpuTimer();
- return -1;
+ return curIdx;
}
/**
@@ -40,11 +59,39 @@ namespace StreamCompaction {
*
* @returns the number of elements remaining after compaction.
*/
- int compactWithScan(int n, int *odata, const int *idata) {
+ int compactWithScan(int n, int *odata, const int *idata)
+ {
+ int *tmpData = new int[n];
+ int *tmpData2 = new int[n];
timer().startCpuTimer();
- // TODO
+ // map
+ for (int i = 0; i < n; i++)
+ {
+ tmpData[i] = idata[i] != 0;
+ }
+ // scan
+ int tmpAcc = 0;
+ for (int i = 0; i < n; i++)
+ {
+ tmpData2[i] = tmpAcc;
+ tmpAcc += tmpData[i];
+ }
+ int const *arrPtr = idata;
+ // if last elem of mapped boolarr is 0, tmpData[n-1] is 0
+ int retVal = tmpData2[n - 1] + tmpData[n - 1];
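+            // scatter: walk the input, skipping zeros, until retVal elements are written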
+ for (int i = 0; i < retVal; i++)
+ {
+ while (*arrPtr == 0)
+ {
+ arrPtr++;
+ }
+ odata[i] = *arrPtr;
+ arrPtr++;
+ }
timer().endCpuTimer();
- return -1;
+ delete[] tmpData2;
+ delete[] tmpData;
+ return retVal;
}
}
}
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..ffe6a5d 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -3,22 +3,86 @@
#include "common.h"
#include "efficient.h"
-namespace StreamCompaction {
- namespace Efficient {
+namespace StreamCompaction
+{
+ namespace Efficient
+ {
using StreamCompaction::Common::PerformanceTimer;
- PerformanceTimer& timer()
+ PerformanceTimer &timer()
{
static PerformanceTimer timer;
return timer;
}
+ __global__ void kernScanEfficientUpSweep(int n, int layer, int shift, int *data)
+ {
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= n)
+ {
+ return;
+ }
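+            // only one thread per 2^(layer+1)-element stride does the add; the rest idle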
+ if (index % (1 << (layer + 1)) == 0)
+ {
+ data[index + (1 << (layer + 1)) - 1] += data[index + (1 << layer) - 1];
+ }
+ }
+
+ __global__ void kernScanEfficientDownSweep(int n, int layer, int max, int shift, int *data)
+ {
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= n)
+ {
+ return;
+ }
+ if (index % (1 << (layer + 1)) == 0)
+ {
+ int t = data[index + (1 << layer) - 1];
+ data[index + (1 << layer) - 1] = data[index + (1 << (layer + 1)) - 1];
+ data[index + (1 << (layer + 1)) - 1] += t;
+ }
+ }
+
+ __global__ void kernSetLastToZero(int n, int *data)
+ {
+ data[n - 1] = 0;
+ }
+
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
- void scan(int n, int *odata, const int *idata) {
+ void scan(int n, int *odata, const int *idata)
+ {
+ //next power of 2
+ int power = ilog2ceil(n);
+ int size = 1 << power;
+ int offset = size - n;
+ dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize);
+ int *buf;
+ cudaMalloc((void **)&buf, size * sizeof(int));
+ cudaMemset(buf, 0, size * sizeof(int));
+ cudaMemcpy(buf + offset, idata, n * sizeof(int), cudaMemcpyHostToDevice);
timer().startGpuTimer();
// TODO
+ for (int layer = 0; layer < power; layer++)
+ {
+ // invoke kernel
+ int shift = 1 << layer;
+ kernScanEfficientUpSweep<<<fullBlocksPerGrid, blockSize>>>(size, layer, shift, buf);
+ cudaDeviceSynchronize();
+ }
+ kernSetLastToZero<<<1, 1>>>(size, buf);
+ cudaDeviceSynchronize();
+ for (int layer = power - 1; layer >= 0; layer--)
+ {
+ // invoke kernel
+ int shift = 1 << layer;
+ kernScanEfficientDownSweep<<<fullBlocksPerGrid, blockSize>>>(size, layer, power - 1, shift, buf);
+ cudaDeviceSynchronize();
+ // checkCUDAErrorWithLine("cudaDeviceSynchronize buf failed!");
+ }
timer().endGpuTimer();
+ cudaMemcpy(odata, buf + offset, n * sizeof(int), cudaMemcpyDeviceToHost);
+ cudaFree(buf);
}
/**
@@ -30,11 +94,63 @@ namespace StreamCompaction {
* @param idata The array of elements to compact.
* @returns The number of elements remaining after compaction.
*/
- int compact(int n, int *odata, const int *idata) {
+ int compact(int n, int *odata, const int *idata)
+ {
+ //next power of 2
+ int power = ilog2ceil(n);
+ int size = 1 << power;
+ int offset = size - n;
+ dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize);
+
+ int *buf; // power of 2 0 padded copy of idata
+ cudaMalloc((void **)&buf, size * sizeof(int));
+ cudaMemset(buf, 0, size * sizeof(int));
+ cudaMemcpy(buf + offset, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+ int *bools, *indices, *tmpOut; //
+ cudaMalloc((void **)&bools, size * sizeof(int));
+ cudaMalloc((void **)&indices, size * sizeof(int));
+ cudaMalloc((void **)&tmpOut, size * sizeof(int));
timer().startGpuTimer();
// TODO
+ // Map
+ Common::kernMapToBoolean<<<fullBlocksPerGrid, blockSize>>>(size, bools, buf);
+ Common::kernMapToBoolean<<<fullBlocksPerGrid, blockSize>>>(size, indices, buf);
+ cudaDeviceSynchronize();
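+            // indices starts as a copy of the boolean flags and is scanned in place below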
+ // Scan
+ for (int layer = 0; layer < power; layer++)
+ {
+ // invoke kernel
+ int shift = 1 << layer;
+ kernScanEfficientUpSweep<<<fullBlocksPerGrid, blockSize>>>(size, layer, shift, indices);
+ cudaDeviceSynchronize();
+ }
+ kernSetLastToZero<<<1, 1>>>(size, indices);
+ cudaDeviceSynchronize();
+ for (int layer = power - 1; layer >= 0; layer--)
+ {
+ // invoke kernel
+ int shift = 1 << layer;
+ kernScanEfficientDownSweep<<<fullBlocksPerGrid, blockSize>>>(size, layer, power - 1, shift, indices);
+ cudaDeviceSynchronize();
+ }
+ // Scatter
+ Common::kernScatter<<<fullBlocksPerGrid, blockSize>>>(size, tmpOut, buf, bools, indices);
+ cudaDeviceSynchronize();
timer().endGpuTimer();
- return -1;
+
+ cudaMemcpy(odata, tmpOut, n * sizeof(int), cudaMemcpyDeviceToHost);
+ int retSize;
+ cudaMemcpy(&retSize, indices + size - 1, sizeof(int), cudaMemcpyDeviceToHost);
+ int tmpLast;
+ cudaMemcpy(&tmpLast, buf + size - 1, sizeof(int), cudaMemcpyDeviceToHost);
+ retSize += (tmpLast != 0);
+
+ cudaFree(buf);
+ cudaFree(bools);
+ cudaFree(indices);
+ cudaFree(tmpOut);
+ return retSize;
}
}
}
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..b2a8ba5 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -3,23 +3,77 @@
#include "common.h"
#include "naive.h"
-namespace StreamCompaction {
- namespace Naive {
+namespace StreamCompaction
+{
+ namespace Naive
+ {
using StreamCompaction::Common::PerformanceTimer;
- PerformanceTimer& timer()
+ PerformanceTimer &timer()
{
static PerformanceTimer timer;
return timer;
}
- // TODO: __global__
+ __global__ void kernScanNaive(int n, int layer, int offset, int *odata, const int *idata)
+ {
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= n)
+ {
+ return;
+ }
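+            // one Hillis-Steele step: each element adds the value offset positions to its left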
+ int tmp = idata[index];
+ odata[index] = tmp + ((index >= offset) ? idata[index - offset] : 0);
+ }
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
- void scan(int n, int *odata, const int *idata) {
+ void scan(int n, int *odata, const int *idata)
+ {
+ int power = ilog2ceil(n);
+ int size = 1 << power;
+ int offset = size - n;
+ dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize);
+ int *bufA;
+ int *bufB;
+ cudaMalloc((void **)&bufA, size * sizeof(int));
+ checkCUDAErrorWithLine("cudaMalloc bufA failed!");
+ cudaMalloc((void **)&bufB, size * sizeof(int));
+ checkCUDAErrorWithLine("cudaMalloc bufB failed!");
+
+ cudaMemset(bufA, 0, size * sizeof(int));
+ cudaMemset(bufB, 0, size * sizeof(int));
+ cudaMemcpy(bufA + offset, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+ checkCUDAErrorWithLine("cudaMemcpy failed!");
+ int *tmp;
timer().startGpuTimer();
// TODO
+ for (int layer = 0; layer < power; layer++)
+ {
+ // invoke kernel
+ int offset = 1 << layer;
+ checkCUDAErrorWithLine("loop start failed!");
+ kernScanNaive<<<fullBlocksPerGrid, blockSize>>>(size, layer, offset, bufB, bufA);
+ checkCUDAErrorWithLine("kernscan failed!");
+ cudaDeviceSynchronize();
+ checkCUDAErrorWithLine("cudaDeviceSync failed!");
+ // swap bufA and bufB
+ tmp = bufA;
+ bufA = bufB;
+ bufB = tmp;
+ }
+ checkCUDAErrorWithLine("before memcpy failed!");
+
+ // cudaDeviceSynchronize();
timer().endGpuTimer();
+ cudaMemcpy(odata + 1, bufA + offset, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAErrorWithLine("cudaMemcpy failed!");
+
+ odata[0] = 0;
+ // cudaDeviceSynchronize();
+ cudaFree(bufB);
+ checkCUDAErrorWithLine("cudaFree bufB failed!");
+ cudaFree(bufA);
+ checkCUDAErrorWithLine("cudaFree bufA failed!");
}
}
}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..afe7719 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -6,10 +6,12 @@
#include "common.h"
#include "thrust.h"
-namespace StreamCompaction {
- namespace Thrust {
+namespace StreamCompaction
+{
+ namespace Thrust
+ {
using StreamCompaction::Common::PerformanceTimer;
- PerformanceTimer& timer()
+ PerformanceTimer &timer()
{
static PerformanceTimer timer;
return timer;
@@ -17,12 +19,18 @@ namespace StreamCompaction {
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
- void scan(int n, int *odata, const int *idata) {
+ void scan(int n, int *odata, const int *idata)
+ {
+ thrust::host_vector<int> hv_idata(idata, idata + n);
+ thrust::device_vector<int> dv_idata(hv_idata);
+ thrust::device_vector<int> dv_odata(n);
timer().startGpuTimer();
// TODO use `thrust::exclusive_scan`
// example: for device_vectors dv_in and dv_out:
// thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+ thrust::exclusive_scan(dv_idata.begin(), dv_idata.end(), dv_odata.begin());
timer().endGpuTimer();
+ thrust::copy(dv_odata.begin(), dv_odata.end(), odata);
}
}
}