diff --git a/Performance.xlsx b/Performance.xlsx new file mode 100644 index 0000000..d28cb65 Binary files /dev/null and b/Performance.xlsx differ diff --git a/README.md b/README.md index 0e38ddb..f52b2db 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,126 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Yuxuan Zhu + * [LinkedIn](https://www.linkedin.com/in/andrewyxzhu/) +* Tested on: Windows 10, i7-7700HQ @ 2.80GHz 16GB, GTX 1050 4096MB (Personal Laptop) -### (TODO: Your README) +**Introduction** -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +This project implemented different versions of the exclusive scanning algorithm and the stream compaction algorithm. +I implemented one CPU version of exclusive scanning and four GPU versions of exclusive scanning. +The CPU version of exclusive scanning supports in-place scanning operation by using local variables. It is surprisingly fast, even for +millions of elements. +The first GPU version of exclusive scanning is naive scanning, which iterativesly sums the array elements and does O(nlog n) computations. +The second GPU version of exclusive scanning is the work efficient version, which consists of an up-sweep portion and a down-sweep portion. I +launched different number of threads for each level of computation to reduce the number of "wasted" threads and improve performance. +The third GPU version is optimized based on the second version. I used shared memory to decrease the freqeuncy of global memory access. It is +roughly twice as fast as the second version. It is also faster than the CPU version for larger arrays. This is considered extra credit. 
+The fourth GPU version is done by calling the thrust library. It is highly optimized and extremely fast for large arrays. + +I also implemented two CPU versions of stream compaction and two GPU versions of stream compaction. +The first CPU version does stream compaction iteratively. It is quite fast. +The second CPU version simulates the stream compaction algorithm on a GPU by using scanning. It is a lot slower. +The first GPU version uses work efficient scanning to implement the stream compaction. It is quite fast. +The second GPU version uses the optimized work efficient scanning to implement stream compaction. It is also faster than the first version and often faster than +the CPU version. + +**Performance Analysis** + +I empirically found that the best block size is 256 for all versions of scan. + +![Scan](img/Scan_Performance.png) + +The graph above shows the performance comparsion among the different versions of the algorithm. The CPU version is almost always faster, unless the array +size is extremely large. The thrust library performance is always strictly better than my implementations. This is reasonable since I did not optimize everything due to +my current limited understanding of GPU performance. For example, I did not try to reduce bank conflicts. I don't know what's happening under the hood for the thrust version of scanning yet. I tried to analyze performance bottle necks of my algorithms by commenting out certain kernels and checking how much improvement in performance I get. For example, I realized the kernel that adds block increment to each element in a block is very slow due to frequent read/write of global memory. I optimized the code to use shared memory to improve the bottleneck. + +![Compaction](img/Stream_Compaction_Performance.png) + +The graph above shows the performance comparision among different versions of stream compaction algorithm. 
The CPU version was outperformed by my optimized GPU version when the array size was larger than 100,000.
244933830 244933839 ] + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 1 1 0 3 3 1 2 3 1 3 0 2 1 ... 3 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 31.6402ms (std::chrono Measured) + [ 1 1 3 3 1 2 3 1 3 2 1 2 1 ... 2 3 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 28.173ms (std::chrono Measured) + [ 1 1 3 3 1 2 3 1 3 2 1 2 1 ... 3 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 52.5519ms (std::chrono Measured) + [ 1 1 3 3 1 2 3 1 3 2 1 2 1 ... 2 3 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 21.0639ms (CUDA Measured) + [ 1 1 3 3 1 2 3 1 3 2 1 2 1 ... 2 3 ] + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 20.3782ms (CUDA Measured) + [ 1 1 3 3 1 2 3 1 3 2 1 2 1 ... 3 2 ] + passed +==== optimized work-efficient compact, power-of-two ==== + elapsed time: 11.0875ms (CUDA Measured) + [ 1 1 3 3 1 2 3 1 3 2 1 2 1 ... 2 3 ] + passed +==== optimized work-efficient compact, non-power-of-two ==== + elapsed time: 11.0332ms (CUDA Measured) + [ 1 1 3 3 1 2 3 1 3 2 1 2 1 ... 
3 2 ] + passed + + +``` diff --git a/img/Scan_Performance.png b/img/Scan_Performance.png new file mode 100644 index 0000000..3ee53af Binary files /dev/null and b/img/Scan_Performance.png differ diff --git a/img/Stream_Compaction_Performance.png b/img/Stream_Compaction_Performance.png new file mode 100644 index 0000000..34a3f8f Binary files /dev/null and b/img/Stream_Compaction_Performance.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..6c83f11 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 10000000; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -51,11 +51,11 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); + //* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan + /**onesArray(SIZE, c); printDesc("1s array for finding bugs"); StreamCompaction::Naive::scan(SIZE, c, a); printArray(SIZE, c, true); */ @@ -64,35 +64,50 @@ int main(int argc, char* argv[]) { printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); 
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); + zeroArray(SIZE, c); + printDesc("optimized work-efficient scan, power-of-two"); + StreamCompaction::Efficient::optimizedScan(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("optimized work-efficient scan, non-power-of-two"); + StreamCompaction::Efficient::optimizedScan(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -137,14 +152,28 @@ int main(int argc, char* argv[]) { printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); 
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + + zeroArray(SIZE, c); + printDesc("optimized work-efficient compact, power-of-two"); + count = StreamCompaction::Efficient::optimizedCompact(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + printDesc("optimized work-efficient compact, non-power-of-two"); + count = StreamCompaction::Efficient::optimizedCompact(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..e403b14 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,15 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. 
*/ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < n) { + if (idata[index] == 0) { + bools[index] = 0; + } + else { + bools[index] = 1; + } + } } /** @@ -32,7 +40,10 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < n && bools[index] == 1) { + odata[indices[index]] = idata[index]; + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..f15181e 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -12,6 +12,8 @@ #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +#define blockSize 256 +#define efficientBlockSize 256 /** * Check for CUDA errors; print and exit if there was a problem. diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..54bd316 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -19,7 +19,14 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int curr; + int sum = idata[0]; + odata[0] = 0; + for (int i = 1; i < n; i++) { + curr = idata[i]; + odata[i] = sum; + sum += curr; + } timer().endCpuTimer(); } @@ -30,9 +37,14 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int count = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[count++] = idata[i]; + } + } timer().endCpuTimer(); - return -1; + return count; } /** @@ -41,10 +53,28 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. 
*/ int compactWithScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO + int* indicator = new int[n]; + timer().startCpuTimer(); + for (int i = 0; i < n; i++) { + if (idata[i] == 0) { + indicator[i] = 0; + } + else { + indicator[i] = 1; + } + } + odata[0] = 0; // odata is currently the array storing the exclusive scan result + for (int i = 1; i < n; i++) { + odata[i] = odata[i - 1] + indicator[i - 1]; + } + int count = odata[n - 1] + indicator[n - 1]; + for (int i = 0; i < n; i++) { + if (indicator[i] != 0) { + odata[odata[i]] = idata[i]; //odata is now the compacted array + } + } timer().endCpuTimer(); - return -1; + return count; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..6cabcf3 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -16,9 +16,145 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int round = 1 << ilog2ceil(n); + int* dev_data; + cudaMalloc((void**)&dev_data, round * sizeof(int)); + checkCUDAError("cudaMalloc dev_data failed!"); + cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice); timer().startGpuTimer(); - // TODO + for (int i = 0; i < ilog2(round); i++) { + int numThreads = (round >> (i + 1)); + dim3 fullBlocksPerGrid(numThreads / efficientBlockSize); + if (fullBlocksPerGrid.x == 0) { + kernUpSweep << <1, numThreads >> > (round, i, dev_data); + } + else { + kernUpSweep << > > (round, i, dev_data); + } + checkCUDAError("kernUpSweep failed!"); + cudaDeviceSynchronize(); + } + kernSetRootZero << <1, 1 >> > (round, dev_data); + for (int i = ilog2(round) - 1; i >= 0; i--) { + int numThreads = (round >> (i + 1)); + dim3 fullBlocksPerGrid(numThreads / efficientBlockSize); + if (fullBlocksPerGrid.x == 0) { + kernDownSweep << <1, numThreads >> > (round, i, dev_data); + } + else { + kernDownSweep << > > (round, i, 
dev_data); + } + checkCUDAError("kernDownSweep failed!"); + cudaDeviceSynchronize(); + } timer().endGpuTimer(); + cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaFree(dev_data); + } + + __global__ void kernUpSweep(int n, int level, int* arr) { + int index = (blockIdx.x * blockDim.x + threadIdx.x) << (level + 1); + if (index < n) { + arr[index + (1 << (level + 1)) - 1] += arr[index + (1 << level) - 1]; + } + } + + + __global__ void kernSetRootZero(int n, int* arr) { + arr[n - 1] = 0; + } + + __global__ void kernDownSweep(int n, int level, int* arr) { + int index = (blockIdx.x * blockDim.x + threadIdx.x) << (level + 1); + if (index < n) { + int left = arr[index + (1 << level) - 1]; + arr[index + (1 << level) - 1] = arr[index + (1 << (level + 1)) - 1]; + arr[index + (1 << (level + 1)) - 1] += left; + } + } + + __global__ void kernScanShared(int n, int logn, int* arr, int* sums) { + __shared__ int sArr[2 * efficientBlockSize]; + int index = 2 * (blockIdx.x * blockDim.x + threadIdx.x); + if (index < n) { + sArr[2 * threadIdx.x] = arr[index]; + sArr[2 * threadIdx.x + 1] = arr[index + 1]; + for (int i = 0; i < logn; i++) { + __syncthreads(); + if (threadIdx.x < (blockDim.x >> i)) { + sArr[(threadIdx.x << (i + 1)) + (1 << (i + 1)) - 1] += sArr[(threadIdx.x << (i + 1)) + (1 << i) - 1]; + } + } + __syncthreads(); + if (threadIdx.x == 0) { + sums[blockIdx.x] = sArr[2 * blockDim.x - 1]; + sArr[2 * blockDim.x - 1] = 0; + } + for (int i = logn - 1; i >= 0; i--) { + __syncthreads(); + if (threadIdx.x < (blockDim.x >> i)) { + int left = sArr[(threadIdx.x << (i + 1)) + (1 << i) - 1]; + sArr[(threadIdx.x << (i + 1)) + (1 << i) - 1] = sArr[(threadIdx.x << (i + 1)) + (1 << (i + 1)) - 1]; + sArr[(threadIdx.x << (i + 1)) + (1 << (i + 1)) - 1] += left; + } + } + __syncthreads(); + arr[index] = sArr[2 * threadIdx.x]; + arr[index + 1] = sArr[2 * threadIdx.x + 1]; + __syncthreads(); + } + } + + + void optimizedScan(int n, int* odata, const int* idata) { + int 
round = 1 << ilog2ceil(n); + int* dev_data; + cudaMalloc((void**)&dev_data, round * sizeof(int)); + checkCUDAError("cudaMalloc dev_data failed!"); + cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + timer().startGpuTimer(); + optimizedScanRecursive(round, dev_data); + timer().endGpuTimer(); + cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaFree(dev_data); + } + + + void optimizedScanRecursive(int n, int* dev_data) { + int round = 1 << ilog2ceil(n); + dim3 fullBlocksPerGrid(round / (2 * efficientBlockSize)); + int* dev_sum; + if (fullBlocksPerGrid.x == 0) { + int logn = ilog2(round); + cudaMalloc((void**)&dev_sum, 1 * sizeof(int)); + kernScanShared << <1, round / 2>> > (round, logn, dev_data, dev_sum); + } + else { + cudaMalloc((void**)&dev_sum, fullBlocksPerGrid.x * sizeof(int)); + int logn = ilog2(efficientBlockSize) + 1; + kernScanShared << > > (round, logn, dev_data, dev_sum); + } + cudaDeviceSynchronize(); + if (fullBlocksPerGrid.x > 1) { + optimizedScanRecursive(fullBlocksPerGrid.x, dev_sum); + cudaDeviceSynchronize(); + fullBlocksPerGrid.x = (round / efficientBlockSize); + kernBlockIncrement << > > (round, dev_data, dev_sum); + cudaDeviceSynchronize(); + } + cudaFree(dev_sum); + + } + + __global__ void kernBlockIncrement(int n, int* data, int* increment) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ int offset; + if (threadIdx.x == 0) { + offset = increment[index / (2 * efficientBlockSize)]; + } + __syncthreads(); + data[index] += offset; + } /** @@ -31,10 +167,91 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. 
*/ int compact(int n, int *odata, const int *idata) { + int round = 1 << ilog2ceil(n); + int* dev_scan, *dev_bool, *dev_idata, *dev_odata; + cudaMalloc((void**)&dev_scan, round * sizeof(int)); + checkCUDAError("cudaMalloc dev_scan failed!"); + cudaMalloc((void**)&dev_bool, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_bool failed!"); + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_idata failed!"); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_odata failed!"); + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + timer().startGpuTimer(); + dim3 fullBlocksPerGrid((n + efficientBlockSize - 1) / efficientBlockSize); + Common::kernMapToBoolean << > > (n, dev_bool, dev_idata); + checkCUDAError("kernMapToBoolean failed!"); + cudaMemcpy(dev_scan, dev_bool, sizeof(int) * n, cudaMemcpyDeviceToDevice); + for (int i = 0; i < ilog2(round); i++) { + int numThreads = (round >> (i + 1)); + dim3 fullBlocksPerGrid(numThreads / efficientBlockSize); + if (fullBlocksPerGrid.x == 0) { + kernUpSweep << <1, numThreads >> > (round, i, dev_scan); + } + else { + kernUpSweep << > > (round, i, dev_scan); + } + checkCUDAError("kernUpSweep failed!"); + cudaDeviceSynchronize(); + } + kernSetRootZero << <1, 1 >> > (round, dev_scan); + for (int i = ilog2(round) - 1; i >= 0; i--) { + int numThreads = (round >> (i + 1)); + dim3 fullBlocksPerGrid(numThreads / efficientBlockSize); + if (fullBlocksPerGrid.x == 0) { + kernDownSweep << <1, numThreads >> > (round, i, dev_scan); + } + else { + kernDownSweep << > > (round, i, dev_scan); + } + checkCUDAError("kernDownSweep failed!"); + cudaDeviceSynchronize(); + } + Common::kernScatter << > > (n, dev_odata, dev_idata, dev_bool, dev_scan); + checkCUDAError("kernScatter failed!"); + timer().endGpuTimer(); + cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); + int count, lastBool; + cudaMemcpy(&count, &dev_scan[n-1], sizeof(int), 
cudaMemcpyDeviceToHost); + cudaMemcpy(&lastBool, &dev_bool[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_scan); + cudaFree(dev_bool); + cudaFree(dev_idata); + cudaFree(dev_odata); + return count + lastBool; + } + + int optimizedCompact(int n, int* odata, const int* idata) { + int round = 1 << ilog2ceil(n); + int* dev_scan, * dev_bool, * dev_idata, * dev_odata; + cudaMalloc((void**)&dev_scan, round * sizeof(int)); + checkCUDAError("cudaMalloc dev_scan failed!"); + cudaMalloc((void**)&dev_bool, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_bool failed!"); + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_idata failed!"); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_odata failed!"); + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); timer().startGpuTimer(); - // TODO + dim3 fullBlocksPerGrid((n + efficientBlockSize - 1) / efficientBlockSize); + Common::kernMapToBoolean << > > (n, dev_bool, dev_idata); + checkCUDAError("kernMapToBoolean failed!"); + cudaMemcpy(dev_scan, dev_bool, sizeof(int) * n, cudaMemcpyDeviceToDevice); + optimizedScanRecursive(n, dev_scan); + Common::kernScatter << > > (n, dev_odata, dev_idata, dev_bool, dev_scan); + checkCUDAError("kernScatter failed!"); timer().endGpuTimer(); - return -1; + cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); + int count, lastBool; + cudaMemcpy(&count, &dev_scan[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&lastBool, &dev_bool[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_scan); + cudaFree(dev_bool); + cudaFree(dev_idata); + cudaFree(dev_odata); + return count + lastBool; } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..5921571 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -7,7 +7,16 @@ namespace StreamCompaction { StreamCompaction::Common::PerformanceTimer& timer(); void 
scan(int n, int *odata, const int *idata); + void optimizedScan(int n, int* odata, const int* idata); + void optimizedScanRecursive(int n, int* dev_data); int compact(int n, int *odata, const int *idata); + int optimizedCompact(int n, int* odata, const int* idata); + + __global__ void kernUpSweep(int n, int level, int* arr); + __global__ void kernDownSweep(int n, int level, int* arr); + __global__ void kernSetRootZero(int n, int* arr); + __global__ void kernScanShared(int n, int logn, int* arr); + __global__ void kernBlockIncrement(int n, int* data, int* increment); } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..19bafa0 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,6 +3,8 @@ #include "common.h" #include "naive.h" + + namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; @@ -17,9 +19,51 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + int* dev_data1, * dev_data2; + cudaMalloc((void**)&dev_data1, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_data1 failed!"); + cudaMalloc((void**)&dev_data2, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_data2 failed!"); + cudaMemcpy(dev_data1, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + cudaMemcpy(dev_data2, idata, sizeof(int), cudaMemcpyHostToDevice); + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); timer().startGpuTimer(); - // TODO + for (int i = 0; i < ilog2ceil(n); i++) { + kernParallelScan << > > (n, i, dev_data1, dev_data2); + checkCUDAError("kernParallelScan failed!"); + cudaDeviceSynchronize(); + int* temp = dev_data1; + dev_data1 = dev_data2; + dev_data2 = temp; + } + kernInclusiveToExclusive << > > (n, dev_data1, dev_data2); + checkCUDAError("kernInclusiveToExclusive failed!"); timer().endGpuTimer(); + cudaMemcpy(odata, dev_data2, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaFree(dev_data1); + cudaFree(dev_data2); + } + + __global__ void kernParallelScan(int n, int level, int *src, int *dest) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < n) { + if (index >= (1 << level)) { + dest[index] = src[index - (1 << level)] + src[index]; + } + else { + dest[index] = src[index]; + } + } + } + + __global__ void kernInclusiveToExclusive(int n, int *src, int *dest) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index == 0) { + dest[index] = 0; + } else if (index < n){ + dest[index] = src[index - 1]; + } } } } + diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 37dcb06..f28c981 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -7,5 +7,7 @@ namespace StreamCompaction { StreamCompaction::Common::PerformanceTimer& timer(); void scan(int n, int *odata, const int *idata); + __global__ void kernParallelScan(int n, int level, int* src, int* dest); + __global__ void kernInclusiveToExclusive(int n, int* src, 
int* dest); } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..47343b2 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,19 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int* dev_idata; + cudaMalloc((void**)&dev_idata, sizeof(int) * n); + checkCUDAError("cudaMalloc dev_idata failed!"); + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + thrust::device_ptr dev_thrust_idata(dev_idata); timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dev_thrust_idata, dev_thrust_idata + n, dev_thrust_idata); timer().endGpuTimer(); + cudaMemcpy(odata, dev_idata, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaFree(dev_idata); } } }