diff --git a/CMakeLists.txt b/CMakeLists.txt
index 62c0e59..88a0e0c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -73,6 +73,8 @@ set(headers
src/sceneStructs.h
src/preview.h
src/utilities.h
+ src/common.h
+ src/efficient.h
)
set(sources
@@ -84,6 +86,8 @@ set(sources
src/scene.cpp
src/preview.cpp
src/utilities.cpp
+ src/common.cu
+ src/efficient.cu
)
list(SORT headers)
diff --git a/README.md b/README.md
index 110697c..d356db7 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,152 @@
+
+
+
+
CUDA Path Tracer
================
**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 3**
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
-### (TODO: Your README)
+* Author: Chhavi Sharma ([LinkedIn](https://www.linkedin.com/in/chhavi275/))
+* Tested on: Windows 10, Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz, 16GB,
+ NVIDIA Quadro P1000 4GB (MOORE100B-06)
+
+### Index
+
+- [Introduction](https://github.com/chhavisharma/Project3-CUDA-Path-Tracer/blob/master/README.md#introduction)
+- [Implementation Details](https://github.com/chhavisharma/Project3-CUDA-Path-Tracer/blob/master/README.md#implementation-details)
+- [Features](https://github.com/chhavisharma/Project3-CUDA-Path-Tracer/blob/master/README.md#features)
+- [Description](https://github.com/chhavisharma/Project3-CUDA-Path-Tracer/blob/master/README.md#description)
+- [Analysis](https://github.com/chhavisharma/Project3-CUDA-Path-Tracer/blob/master/README.md#analysis)
+- [Some More Results and Bloopers](https://github.com/chhavisharma/Project3-CUDA-Path-Tracer/blob/master/README.md#more-results-and-bloopers)
+
+### Introduction
+
+Path tracing is a computer graphics technique for rendering digital 3D images whose global illumination is as close as possible to reality. Like ray tracing, it casts rays from a virtual camera and traces them through a simulated scene; unlike ray tracing, it follows randomly sampled paths and accumulates their contributions over many iterations to build the final image. This random sampling makes it possible to render complex phenomena that regular ray tracing does not handle, such as multiple inter-reflections between surfaces.
+
+### Implementation Details
+We implement an estimate of the Bidirectional Scattering Distribution Function (BSDF) to compute the illumination of each pixel in the image over several iterations. In reality, rays leave light sources, bounce around a scene while changing color and intensity according to the materials they strike, and some eventually hit the pixels of a camera or our eyes. Our implementation simulates this phenomenon in reverse: a ray is launched from the camera through each pixel of the image, and its subsequent intersections and bounces in the scene are traced up to a certain depth to compute the final color of that pixel.
+This is implemented by computing a single bounce per time-step for all rays in the image in parallel, to get maximum throughput.
+
+Each ray starts with an identity (white) color that is attenuated multiplicatively as it hits different materials in the scene.
+The bounce direction and color intensity depend on the material properties and the angle of incidence. We simulate four types of materials, i.e. Emissive, Diffuse, Reflective, and Refractive, and their combinations.
+
+### Features
+
+#### Basic Features
+ - [x] BSDF Shading (Diffuse, Reflective, Emissive)
+ - [x] Basic Stream Compaction to remove dead rays
+ - [x] Material type sort
+ - [x] First bounce intersection caching
+#### Advanced Features
+ - [x] BSDF Shading (Refractive, Percentage Combinations)
+ - [x] Stochastic Sampled Anti-Aliasing
+ - [x] Work-Efficient Stream Compaction using Shared Memory
+ - [x] Motion Blur
+ - [ ] Depth of Field
+ - [ ] Loading OBJ files
+
+Most features have been implemented with toggleable options.
+
+### Description
+
+
+- Shading using [BSDF](https://en.wikipedia.org/wiki/Bidirectional_scattering_distribution_function)
+ - Diffuse Reflection: Scatters incoming rays randomly within the hemisphere about the surface normal.
+ - Specular Reflection: Reflects the incoming ray about the normal so that the angle of incidence equals the angle of reflection (mirror-like behavior).
+ - Refraction: Lets the ray pass through the medium, bending it according to the ratio of the refractive indices of the two media ([Snell's law](https://en.wikipedia.org/wiki/Snell%27s_law)).
+ - Emissive Media: Rays in our computation terminate at these materials, since they emit light.
+ - Percentage combinations/probability splits between these properties.
+
+
+
+ Refraction | Reflection | Diffuse
+
+
+
+
+
+
+
+
+
+ Refract-Reflect-50-50 | Emissive | Refract-Reflect-30-70
+
+
+
+
+
+
+
+
+- Stream Compaction: We compact away rays that have terminated, either by hitting a light source or by reaching the maximum depth, so that more CUDA warps can exit early once all of their threads have terminated. This removes unnecessary computation from each bounce and in turn speeds up rendering.
+- We also sort the rays by material type to make memory accesses more contiguous.
+- First-bounce intersection caching speeds up each iteration tremendously. Since there is no randomness in the first ray intersection, we can cache the intersections at the beginning of the first iteration and reuse them in all successive iterations. The first bounce also has the maximum number of rays, so caching it at iteration 0 shows a drastic speedup for the rest of the iterations.
+
+
+
+
+
+
+
+- Anti-Aliasing: We use anti-aliasing, a technique that adds realism to a digital image by smoothing the jagged edges of curved lines and diagonals. We do this by randomly jittering the ray's sample position within each pixel instead of always shooting through the pixel center.
+
+
+
+
+
+
+- Motion Blur: We implemented motion blur by adding a velocity to an object of choice. As a result, at each iteration the object is rendered at a slightly translated position, and averaging the iterations gives the motion-blur effect.
+
+
+Motion Blur Screen Captures
+
+
+
+
+
+
+
+
+
+
+
+
+### Analysis
+
+- Stream compaction helps most after a few bounces. We show the contrast in runtime and number of rays processed with and without stream compaction in the following figures. These clearly show that stream compaction speeds up rendering.
+
+
+
+
+
+- Opening up the scene by removing two walls allows more light (and rays) to escape. Stream compaction in a closed-scene scenario would not help much, since few rays terminate prematurely. In more open scenes, stream compaction lets more warps of threads exit instead of idling.
+
+
+
+
+
+
+
+
+
+
+### More Results and Bloopers
+
+1. Bloopers (Mostly motion blur)
+
+
+
+
+
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
+2. Additional Results
+
+
+
+
+
+
+
diff --git a/Stats.xlsx b/Stats.xlsx
new file mode 100644
index 0000000..7ba8804
Binary files /dev/null and b/Stats.xlsx differ
diff --git a/build/cornell.2019-09-28_00-10-41z.4965samp.png b/build/cornell.2019-09-28_00-10-41z.4965samp.png
new file mode 100644
index 0000000..407ab91
Binary files /dev/null and b/build/cornell.2019-09-28_00-10-41z.4965samp.png differ
diff --git a/build/cornell.2019-09-28_00-10-41z.5000samp.png b/build/cornell.2019-09-28_00-10-41z.5000samp.png
new file mode 100644
index 0000000..7113409
Binary files /dev/null and b/build/cornell.2019-09-28_00-10-41z.5000samp.png differ
diff --git a/build/cornell.2019-09-28_00-34-07z.5000samp.png b/build/cornell.2019-09-28_00-34-07z.5000samp.png
new file mode 100644
index 0000000..b6d6ef2
Binary files /dev/null and b/build/cornell.2019-09-28_00-34-07z.5000samp.png differ
diff --git a/build/cornell.2019-09-28_00-46-49z.5000samp.png b/build/cornell.2019-09-28_00-46-49z.5000samp.png
new file mode 100644
index 0000000..b6d6ef2
Binary files /dev/null and b/build/cornell.2019-09-28_00-46-49z.5000samp.png differ
diff --git a/build/cornell.2019-09-28_02-04-11z.5000samp.png b/build/cornell.2019-09-28_02-04-11z.5000samp.png
new file mode 100644
index 0000000..c3dcf81
Binary files /dev/null and b/build/cornell.2019-09-28_02-04-11z.5000samp.png differ
diff --git a/build/cornell.2019-09-28_03-16-00z.5000samp.png b/build/cornell.2019-09-28_03-16-00z.5000samp.png
new file mode 100644
index 0000000..234010d
Binary files /dev/null and b/build/cornell.2019-09-28_03-16-00z.5000samp.png differ
diff --git a/build/cornell.2019-09-28_03-45-01z.232samp.png b/build/cornell.2019-09-28_03-45-01z.232samp.png
new file mode 100644
index 0000000..8905a8b
Binary files /dev/null and b/build/cornell.2019-09-28_03-45-01z.232samp.png differ
diff --git a/build/cornell.2019-09-28_05-00-15z.2959samp.png b/build/cornell.2019-09-28_05-00-15z.2959samp.png
new file mode 100644
index 0000000..548ac26
Binary files /dev/null and b/build/cornell.2019-09-28_05-00-15z.2959samp.png differ
diff --git a/build/cornell.2019-09-28_05-00-15z.3530samp.png b/build/cornell.2019-09-28_05-00-15z.3530samp.png
new file mode 100644
index 0000000..7462101
Binary files /dev/null and b/build/cornell.2019-09-28_05-00-15z.3530samp.png differ
diff --git a/build/cornell.2019-09-28_05-00-15z.3568samp.png b/build/cornell.2019-09-28_05-00-15z.3568samp.png
new file mode 100644
index 0000000..e4abc98
Binary files /dev/null and b/build/cornell.2019-09-28_05-00-15z.3568samp.png differ
diff --git a/build/cornell.2019-09-28_05-15-10z.434samp.png b/build/cornell.2019-09-28_05-15-10z.434samp.png
new file mode 100644
index 0000000..7c065d7
Binary files /dev/null and b/build/cornell.2019-09-28_05-15-10z.434samp.png differ
diff --git a/build/cornell.2019-09-28_05-15-10z.5000samp.png b/build/cornell.2019-09-28_05-15-10z.5000samp.png
new file mode 100644
index 0000000..6ce5392
Binary files /dev/null and b/build/cornell.2019-09-28_05-15-10z.5000samp.png differ
diff --git a/build/cornell.2019-09-28_05-15-10z.702samp.png b/build/cornell.2019-09-28_05-15-10z.702samp.png
new file mode 100644
index 0000000..f71dd75
Binary files /dev/null and b/build/cornell.2019-09-28_05-15-10z.702samp.png differ
diff --git a/build/cornell.2019-09-28_05-34-43z.3007samp.png b/build/cornell.2019-09-28_05-34-43z.3007samp.png
new file mode 100644
index 0000000..74b601e
Binary files /dev/null and b/build/cornell.2019-09-28_05-34-43z.3007samp.png differ
diff --git a/build/cornell.2019-09-28_05-34-43z.3007samp_withoutANTIALIASING.png b/build/cornell.2019-09-28_05-34-43z.3007samp_withoutANTIALIASING.png
new file mode 100644
index 0000000..7f114ec
Binary files /dev/null and b/build/cornell.2019-09-28_05-34-43z.3007samp_withoutANTIALIASING.png differ
diff --git a/build/cornell.2019-09-28_05-38-08z.3011samp.png b/build/cornell.2019-09-28_05-38-08z.3011samp.png
new file mode 100644
index 0000000..b46c2db
Binary files /dev/null and b/build/cornell.2019-09-28_05-38-08z.3011samp.png differ
diff --git a/build/cornell.2019-09-28_05-48-01z.3006samp.png b/build/cornell.2019-09-28_05-48-01z.3006samp.png
new file mode 100644
index 0000000..2494720
Binary files /dev/null and b/build/cornell.2019-09-28_05-48-01z.3006samp.png differ
diff --git a/build/cornell.2019-09-28_05-48-01z.3006samp_withANTIALIASING.png b/build/cornell.2019-09-28_05-48-01z.3006samp_withANTIALIASING.png
new file mode 100644
index 0000000..fe743f9
Binary files /dev/null and b/build/cornell.2019-09-28_05-48-01z.3006samp_withANTIALIASING.png differ
diff --git a/build/cornell.2019-09-29_10-15-46z.2846samp.png b/build/cornell.2019-09-29_10-15-46z.2846samp.png
new file mode 100644
index 0000000..ed977e1
Binary files /dev/null and b/build/cornell.2019-09-29_10-15-46z.2846samp.png differ
diff --git a/build/cornell.2019-09-29_23-13-09z.5000samp.png b/build/cornell.2019-09-29_23-13-09z.5000samp.png
new file mode 100644
index 0000000..678b684
Binary files /dev/null and b/build/cornell.2019-09-29_23-13-09z.5000samp.png differ
diff --git a/build/cornell.2019-09-29_23-59-06z.2318samp.png b/build/cornell.2019-09-29_23-59-06z.2318samp.png
new file mode 100644
index 0000000..255cdb9
Binary files /dev/null and b/build/cornell.2019-09-29_23-59-06z.2318samp.png differ
diff --git a/build/cornell.2019-09-29_23-59-06z.2393samp.png b/build/cornell.2019-09-29_23-59-06z.2393samp.png
new file mode 100644
index 0000000..32975a5
Binary files /dev/null and b/build/cornell.2019-09-29_23-59-06z.2393samp.png differ
diff --git a/build/cornell.2019-09-30_03-16-41z.5000samp.png b/build/cornell.2019-09-30_03-16-41z.5000samp.png
new file mode 100644
index 0000000..38fe62f
Binary files /dev/null and b/build/cornell.2019-09-30_03-16-41z.5000samp.png differ
diff --git a/build/cornell.2019-09-30_04-21-28z.5000samp.png b/build/cornell.2019-09-30_04-21-28z.5000samp.png
new file mode 100644
index 0000000..02f76d7
Binary files /dev/null and b/build/cornell.2019-09-30_04-21-28z.5000samp.png differ
diff --git a/build/cornell.2019-09-30_04-28-55z.5000samp.png b/build/cornell.2019-09-30_04-28-55z.5000samp.png
new file mode 100644
index 0000000..636c82d
Binary files /dev/null and b/build/cornell.2019-09-30_04-28-55z.5000samp.png differ
diff --git a/build/cornell.2019-09-30_04-31-03z.3441samp.png b/build/cornell.2019-09-30_04-31-03z.3441samp.png
new file mode 100644
index 0000000..052019a
Binary files /dev/null and b/build/cornell.2019-09-30_04-31-03z.3441samp.png differ
diff --git a/build/cornell.2019-09-30_04-31-03z.3767samp.png b/build/cornell.2019-09-30_04-31-03z.3767samp.png
new file mode 100644
index 0000000..aea6ec1
Binary files /dev/null and b/build/cornell.2019-09-30_04-31-03z.3767samp.png differ
diff --git a/build/cornell.2019-09-30_05-12-34z.4512samp.png b/build/cornell.2019-09-30_05-12-34z.4512samp.png
new file mode 100644
index 0000000..b7a31d8
Binary files /dev/null and b/build/cornell.2019-09-30_05-12-34z.4512samp.png differ
diff --git a/build/cornell.2019-09-30_05-12-34z.4540samp.png b/build/cornell.2019-09-30_05-12-34z.4540samp.png
new file mode 100644
index 0000000..8aed363
Binary files /dev/null and b/build/cornell.2019-09-30_05-12-34z.4540samp.png differ
diff --git a/build/cornell.2019-09-30_05-12-34z.4555samp.png b/build/cornell.2019-09-30_05-12-34z.4555samp.png
new file mode 100644
index 0000000..42fe8a1
Binary files /dev/null and b/build/cornell.2019-09-30_05-12-34z.4555samp.png differ
diff --git a/build/cornell.2019-09-30_05-25-56z.3141samp.png b/build/cornell.2019-09-30_05-25-56z.3141samp.png
new file mode 100644
index 0000000..b5dff1e
Binary files /dev/null and b/build/cornell.2019-09-30_05-25-56z.3141samp.png differ
diff --git a/build/cornell.2019-09-30_05-25-56z.3160samp.png b/build/cornell.2019-09-30_05-25-56z.3160samp.png
new file mode 100644
index 0000000..6d7f807
Binary files /dev/null and b/build/cornell.2019-09-30_05-25-56z.3160samp.png differ
diff --git a/build/cornell_70-30_ref_rel.png b/build/cornell_70-30_ref_rel.png
new file mode 100644
index 0000000..54cc565
Binary files /dev/null and b/build/cornell_70-30_ref_rel.png differ
diff --git a/build/cornell_Refract-Reflect-50-50.png b/build/cornell_Refract-Reflect-50-50.png
new file mode 100644
index 0000000..234010d
Binary files /dev/null and b/build/cornell_Refract-Reflect-50-50.png differ
diff --git a/build/cornell_coverpic.png b/build/cornell_coverpic.png
new file mode 100644
index 0000000..632c9a8
Binary files /dev/null and b/build/cornell_coverpic.png differ
diff --git a/build/cornell_coverpic2.png b/build/cornell_coverpic2.png
new file mode 100644
index 0000000..9356c99
Binary files /dev/null and b/build/cornell_coverpic2.png differ
diff --git a/build/cornell_diffuese.png b/build/cornell_diffuese.png
new file mode 100644
index 0000000..0ffa2d1
Binary files /dev/null and b/build/cornell_diffuese.png differ
diff --git a/build/cornell_emmisive.png b/build/cornell_emmisive.png
new file mode 100644
index 0000000..7f2cf85
Binary files /dev/null and b/build/cornell_emmisive.png differ
diff --git a/build/cornell_reflection.png b/build/cornell_reflection.png
new file mode 100644
index 0000000..02f76d7
Binary files /dev/null and b/build/cornell_reflection.png differ
diff --git a/build/cornell_refraction.png b/build/cornell_refraction.png
new file mode 100644
index 0000000..864deee
Binary files /dev/null and b/build/cornell_refraction.png differ
diff --git a/build/mb11.png b/build/mb11.png
new file mode 100644
index 0000000..efaa862
Binary files /dev/null and b/build/mb11.png differ
diff --git a/build/mb22.png b/build/mb22.png
new file mode 100644
index 0000000..2d75020
Binary files /dev/null and b/build/mb22.png differ
diff --git a/img/CLosedOpen.png b/img/CLosedOpen.png
new file mode 100644
index 0000000..2ee7af9
Binary files /dev/null and b/img/CLosedOpen.png differ
diff --git a/img/FBC.png b/img/FBC.png
new file mode 100644
index 0000000..47c35d8
Binary files /dev/null and b/img/FBC.png differ
diff --git a/img/aa.PNG b/img/aa.PNG
new file mode 100644
index 0000000..7b657df
Binary files /dev/null and b/img/aa.PNG differ
diff --git a/img/cornell.2019-09-28_00-10-41z.4965samp.png b/img/cornell.2019-09-28_00-10-41z.4965samp.png
new file mode 100644
index 0000000..407ab91
Binary files /dev/null and b/img/cornell.2019-09-28_00-10-41z.4965samp.png differ
diff --git a/img/cornell.2019-09-28_00-10-41z.5000samp.png b/img/cornell.2019-09-28_00-10-41z.5000samp.png
new file mode 100644
index 0000000..7113409
Binary files /dev/null and b/img/cornell.2019-09-28_00-10-41z.5000samp.png differ
diff --git a/img/cornell.2019-09-28_00-34-07z.5000samp.png b/img/cornell.2019-09-28_00-34-07z.5000samp.png
new file mode 100644
index 0000000..b6d6ef2
Binary files /dev/null and b/img/cornell.2019-09-28_00-34-07z.5000samp.png differ
diff --git a/img/cornell.2019-09-28_00-46-49z.5000samp.png b/img/cornell.2019-09-28_00-46-49z.5000samp.png
new file mode 100644
index 0000000..b6d6ef2
Binary files /dev/null and b/img/cornell.2019-09-28_00-46-49z.5000samp.png differ
diff --git a/img/cornell.2019-09-28_02-04-11z.5000samp.png b/img/cornell.2019-09-28_02-04-11z.5000samp.png
new file mode 100644
index 0000000..c3dcf81
Binary files /dev/null and b/img/cornell.2019-09-28_02-04-11z.5000samp.png differ
diff --git a/img/cornell.2019-09-28_03-16-00z.5000samp.png b/img/cornell.2019-09-28_03-16-00z.5000samp.png
new file mode 100644
index 0000000..234010d
Binary files /dev/null and b/img/cornell.2019-09-28_03-16-00z.5000samp.png differ
diff --git a/img/cornell.2019-09-28_03-45-01z.232samp.png b/img/cornell.2019-09-28_03-45-01z.232samp.png
new file mode 100644
index 0000000..8905a8b
Binary files /dev/null and b/img/cornell.2019-09-28_03-45-01z.232samp.png differ
diff --git a/img/cornell.2019-09-28_05-00-15z.2959samp.png b/img/cornell.2019-09-28_05-00-15z.2959samp.png
new file mode 100644
index 0000000..548ac26
Binary files /dev/null and b/img/cornell.2019-09-28_05-00-15z.2959samp.png differ
diff --git a/img/cornell.2019-09-28_05-00-15z.3530samp.png b/img/cornell.2019-09-28_05-00-15z.3530samp.png
new file mode 100644
index 0000000..7462101
Binary files /dev/null and b/img/cornell.2019-09-28_05-00-15z.3530samp.png differ
diff --git a/img/cornell.2019-09-28_05-00-15z.3568samp.png b/img/cornell.2019-09-28_05-00-15z.3568samp.png
new file mode 100644
index 0000000..e4abc98
Binary files /dev/null and b/img/cornell.2019-09-28_05-00-15z.3568samp.png differ
diff --git a/img/cornell.2019-09-28_05-15-10z.434samp.png b/img/cornell.2019-09-28_05-15-10z.434samp.png
new file mode 100644
index 0000000..7c065d7
Binary files /dev/null and b/img/cornell.2019-09-28_05-15-10z.434samp.png differ
diff --git a/img/cornell.2019-09-28_05-15-10z.5000samp.png b/img/cornell.2019-09-28_05-15-10z.5000samp.png
new file mode 100644
index 0000000..6ce5392
Binary files /dev/null and b/img/cornell.2019-09-28_05-15-10z.5000samp.png differ
diff --git a/img/cornell.2019-09-28_05-15-10z.702samp.png b/img/cornell.2019-09-28_05-15-10z.702samp.png
new file mode 100644
index 0000000..f71dd75
Binary files /dev/null and b/img/cornell.2019-09-28_05-15-10z.702samp.png differ
diff --git a/img/cornell.2019-09-28_05-34-43z.3007samp.png b/img/cornell.2019-09-28_05-34-43z.3007samp.png
new file mode 100644
index 0000000..74b601e
Binary files /dev/null and b/img/cornell.2019-09-28_05-34-43z.3007samp.png differ
diff --git a/img/cornell.2019-09-28_05-34-43z.3007samp_withoutANTIALIASING.png b/img/cornell.2019-09-28_05-34-43z.3007samp_withoutANTIALIASING.png
new file mode 100644
index 0000000..7f114ec
Binary files /dev/null and b/img/cornell.2019-09-28_05-34-43z.3007samp_withoutANTIALIASING.png differ
diff --git a/img/cornell.2019-09-28_05-38-08z.3011samp.png b/img/cornell.2019-09-28_05-38-08z.3011samp.png
new file mode 100644
index 0000000..b46c2db
Binary files /dev/null and b/img/cornell.2019-09-28_05-38-08z.3011samp.png differ
diff --git a/img/cornell.2019-09-28_05-48-01z.3006samp.png b/img/cornell.2019-09-28_05-48-01z.3006samp.png
new file mode 100644
index 0000000..2494720
Binary files /dev/null and b/img/cornell.2019-09-28_05-48-01z.3006samp.png differ
diff --git a/img/cornell.2019-09-28_05-48-01z.3006samp_withANTIALIASING.png b/img/cornell.2019-09-28_05-48-01z.3006samp_withANTIALIASING.png
new file mode 100644
index 0000000..fe743f9
Binary files /dev/null and b/img/cornell.2019-09-28_05-48-01z.3006samp_withANTIALIASING.png differ
diff --git a/img/cornell.2019-09-29_10-15-46z.2846samp.png b/img/cornell.2019-09-29_10-15-46z.2846samp.png
new file mode 100644
index 0000000..ed977e1
Binary files /dev/null and b/img/cornell.2019-09-29_10-15-46z.2846samp.png differ
diff --git a/img/cornell.2019-09-29_23-13-09z.5000samp.png b/img/cornell.2019-09-29_23-13-09z.5000samp.png
new file mode 100644
index 0000000..678b684
Binary files /dev/null and b/img/cornell.2019-09-29_23-13-09z.5000samp.png differ
diff --git a/img/cornell.2019-09-29_23-59-06z.2318samp.png b/img/cornell.2019-09-29_23-59-06z.2318samp.png
new file mode 100644
index 0000000..255cdb9
Binary files /dev/null and b/img/cornell.2019-09-29_23-59-06z.2318samp.png differ
diff --git a/img/cornell.2019-09-29_23-59-06z.2393samp.png b/img/cornell.2019-09-29_23-59-06z.2393samp.png
new file mode 100644
index 0000000..32975a5
Binary files /dev/null and b/img/cornell.2019-09-29_23-59-06z.2393samp.png differ
diff --git a/img/cornell.2019-09-30_03-16-41z.5000samp.png b/img/cornell.2019-09-30_03-16-41z.5000samp.png
new file mode 100644
index 0000000..38fe62f
Binary files /dev/null and b/img/cornell.2019-09-30_03-16-41z.5000samp.png differ
diff --git a/img/cornell.2019-09-30_04-21-28z.5000samp.png b/img/cornell.2019-09-30_04-21-28z.5000samp.png
new file mode 100644
index 0000000..02f76d7
Binary files /dev/null and b/img/cornell.2019-09-30_04-21-28z.5000samp.png differ
diff --git a/img/cornell.2019-09-30_04-28-55z.5000samp.png b/img/cornell.2019-09-30_04-28-55z.5000samp.png
new file mode 100644
index 0000000..636c82d
Binary files /dev/null and b/img/cornell.2019-09-30_04-28-55z.5000samp.png differ
diff --git a/img/cornell.2019-09-30_04-31-03z.3441samp.png b/img/cornell.2019-09-30_04-31-03z.3441samp.png
new file mode 100644
index 0000000..052019a
Binary files /dev/null and b/img/cornell.2019-09-30_04-31-03z.3441samp.png differ
diff --git a/img/cornell.2019-09-30_04-31-03z.3767samp.png b/img/cornell.2019-09-30_04-31-03z.3767samp.png
new file mode 100644
index 0000000..aea6ec1
Binary files /dev/null and b/img/cornell.2019-09-30_04-31-03z.3767samp.png differ
diff --git a/img/cornell_cover.png b/img/cornell_cover.png
new file mode 100644
index 0000000..38fe62f
Binary files /dev/null and b/img/cornell_cover.png differ
diff --git a/img/graph1_withoutSC_time.pmg b/img/graph1_withoutSC_time.pmg
new file mode 100644
index 0000000..d90e6bd
Binary files /dev/null and b/img/graph1_withoutSC_time.pmg differ
diff --git a/img/graph1_withoutSC_time.png b/img/graph1_withoutSC_time.png
new file mode 100644
index 0000000..ad2a762
Binary files /dev/null and b/img/graph1_withoutSC_time.png differ
diff --git a/img/graph2_thrustSC_rays.png b/img/graph2_thrustSC_rays.png
new file mode 100644
index 0000000..6e39663
Binary files /dev/null and b/img/graph2_thrustSC_rays.png differ
diff --git a/img/graph2_thrustSC_time.png b/img/graph2_thrustSC_time.png
new file mode 100644
index 0000000..f00c69d
Binary files /dev/null and b/img/graph2_thrustSC_time.png differ
diff --git a/img/graph3_WESC_rays.png b/img/graph3_WESC_rays.png
new file mode 100644
index 0000000..f85809b
Binary files /dev/null and b/img/graph3_WESC_rays.png differ
diff --git a/img/graph3_WESC_time.png b/img/graph3_WESC_time.png
new file mode 100644
index 0000000..427fe20
Binary files /dev/null and b/img/graph3_WESC_time.png differ
diff --git a/img/mb1.PNG b/img/mb1.PNG
new file mode 100644
index 0000000..3595553
Binary files /dev/null and b/img/mb1.PNG differ
diff --git a/img/mb2.PNG b/img/mb2.PNG
new file mode 100644
index 0000000..d1d7b36
Binary files /dev/null and b/img/mb2.PNG differ
diff --git a/img/mb3.PNG b/img/mb3.PNG
new file mode 100644
index 0000000..1c191bf
Binary files /dev/null and b/img/mb3.PNG differ
diff --git a/img/mb4.PNG b/img/mb4.PNG
new file mode 100644
index 0000000..5a2f4b0
Binary files /dev/null and b/img/mb4.PNG differ
diff --git a/img/plots.PNG b/img/plots.PNG
new file mode 100644
index 0000000..af4f6dc
Binary files /dev/null and b/img/plots.PNG differ
diff --git a/scenes/cornell.txt b/scenes/cornell.txt
index 83ff820..3f34e32 100644
--- a/scenes/cornell.txt
+++ b/scenes/cornell.txt
@@ -43,11 +43,21 @@ MATERIAL 4
RGB .98 .98 .98
SPECEX 0
SPECRGB .98 .98 .98
-REFL 1
+REFL 0
REFR 0
REFRIOR 0
EMITTANCE 0
+// Specular white
+MATERIAL 5
+RGB .98 .98 .98
+SPECEX 0
+SPECRGB .524 .80 .92
+REFL 0.8
+REFR 0.2
+REFRIOR 1.66
+EMITTANCE 0
+
// Camera
CAMERA
RES 800 800
@@ -111,7 +121,16 @@ SCALE .01 10 10
// Sphere
OBJECT 6
sphere
-material 4
+material 5
TRANS -1 4 -1
ROTAT 0 0 0
SCALE 3 3 3
+
+
+// Ceiling light 2
+OBJECT 7
+cube
+material 0
+TRANS 0 10 4
+ROTAT 0 0 0
+SCALE 5 .2 1
\ No newline at end of file
diff --git a/scenes/cornell_new.txt b/scenes/cornell_new.txt
new file mode 100644
index 0000000..2d4b872
--- /dev/null
+++ b/scenes/cornell_new.txt
@@ -0,0 +1,210 @@
+// Emissive material (light)
+MATERIAL 0
+RGB 1 1 1
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 5
+
+// Diffuse white
+MATERIAL 1
+RGB .98 .98 .98
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse red
+MATERIAL 2
+RGB .85 .35 .35
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse green
+MATERIAL 3
+RGB .35 .85 .35
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Specular white
+MATERIAL 4
+RGB .98 .98 .98
+SPECEX 0
+SPECRGB .98 .98 .98
+REFL 1
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Specular white // Glassy material Half and Half
+MATERIAL 5
+RGB .98 .98 .98
+SPECEX 0
+SPECRGB .98 .98 .98
+REFL 0.5
+REFR 0.5
+REFRIOR 1.33
+EMITTANCE 0
+
+// Specular white // Pure REFL
+MATERIAL 6
+RGB .98 .98 .98
+SPECEX 0
+SPECRGB .98 .98 .98
+REFL 1
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Pure Diffuse Yellow
+MATERIAL 7
+RGB .95 .95 .95
+SPECEX 0
+SPECRGB .98 .98 .98
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Specular white // Glassy material Not Half and Half
+MATERIAL 8
+RGB .98 .98 .98
+SPECEX 0
+SPECRGB .98 .98 .98
+REFL 0.7
+REFR 0.3
+REFRIOR 1.33
+EMITTANCE 0
+
+// Camera
+CAMERA
+RES 1024 1024
+FOVY 45
+ITERATIONS 5000
+DEPTH 8
+FILE cornell
+EYE 0.0 5 10.5
+LOOKAT 0 5 0
+UP 0 1 0
+
+//========================
+//===objects==============
+//========================
+
+// Ceiling light
+OBJECT 0
+cube
+material 0
+TRANS 0 10 0
+ROTAT 0 0 0
+SCALE 3 .3 3
+
+// Floor
+OBJECT 1
+cube
+material 1
+TRANS 0 0 0
+ROTAT 0 0 0
+SCALE 10 .01 10
+
+// Ceiling
+OBJECT 2
+cube
+material 1
+TRANS 0 10 0
+ROTAT 0 0 90
+SCALE .01 10 10
+
+// Back wall
+OBJECT 3
+cube
+material 1
+TRANS 0 5 -5
+ROTAT 0 90 0
+SCALE .01 10 10
+
+// Left wall
+OBJECT 4
+cube
+material 2
+TRANS -5 5 0
+ROTAT 0 0 0
+SCALE .01 10 10
+
+// Right wall
+OBJECT 5
+cube
+material 3
+TRANS 5 5 0
+ROTAT 0 0 0
+SCALE .01 10 10
+
+//=========================
+
+// Sphere
+OBJECT 6
+sphere
+material 6
+TRANS -1 4 -1
+ROTAT 0 0 0
+SCALE 3 3 3
+
+// Box on the floor
+OBJECT 7
+cube
+material 5
+TRANS -3 0.85 3.5
+ROTAT 45 45 65
+SCALE 1.2 1.2 1.2
+
+// Ceiling light 2
+OBJECT 8
+cube
+material 0
+TRANS 0 10 4
+ROTAT 0 0 0
+SCALE 5 .2 0.5
+
+// The other box
+OBJECT 9
+cube
+material 5
+TRANS 3 1 3.5
+ROTAT 45 -45 45
+SCALE 1.2 1.2 1.2
+
+// Sphere
+OBJECT 10
+sphere
+material 8
+TRANS 3 6 -6
+ROTAT 0 0 0
+SCALE 1 1 1
+
+// cube 3
+OBJECT 11
+cube
+material 5
+TRANS 2.5 2 2.5
+ROTAT 45 -35 45
+SCALE 1.2 1.2 1.2
+
+// cube 4
+OBJECT 12
+cube
+material 5
+TRANS 2 3 1.5
+ROTAT 45 -25 45
+SCALE 1.2 1.2 1.2
\ No newline at end of file
diff --git a/scenes/cornell_new2.txt b/scenes/cornell_new2.txt
new file mode 100644
index 0000000..35995d4
--- /dev/null
+++ b/scenes/cornell_new2.txt
@@ -0,0 +1,333 @@
+// Emissive material (light)
+MATERIAL 0
+RGB 1 1 1
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 5
+
+// Diffuse white
+MATERIAL 1
+RGB .98 .98 .98
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse red
+MATERIAL 2
+RGB 0.9 .41 .38
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse green
+MATERIAL 3
+RGB .45 .87 .45
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Specular white Half and Half
+MATERIAL 4
+RGB .98 .98 .98
+SPECEX 0
+SPECRGB .98 .98 .98
+REFL 0.5
+REFR 0.5
+REFRIOR 1.33
+EMITTANCE 0
+
+// white Full Reflection
+MATERIAL 5
+RGB .98 .98 .98
+SPECEX 0
+SPECRGB .98 .98 .98
+REFL 1
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Red Refraction
+MATERIAL 6
+RGB .78 .48 .48
+SPECEX 0
+SPECRGB .78 .48 .48
+REFL 0.2
+REFR 0.7
+REFRIOR 1.33
+EMITTANCE 0
+
+// Purple Refraction
+MATERIAL 7
+RGB .48 .48 .98
+SPECEX 0
+SPECRGB .87 .73 .93
+REFL 0.3
+REFR 0.7
+REFRIOR 1.33
+EMITTANCE 0
+
+// Blue Reflection
+MATERIAL 8
+RGB .98 .98 .98
+SPECEX 0
+SPECRGB .524 .80 .92
+REFL 0.8
+REFR 0.2
+REFRIOR 1.66
+EMITTANCE 0
+
+// white Full Reflection
+MATERIAL 9
+RGB 1 0.99 0.8
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 2
+
+//====================================
+// Camera
+CAMERA
+RES 800 800
+FOVY 45
+ITERATIONS 5000
+DEPTH 8
+FILE cornell
+EYE 0.0 5 10.5
+LOOKAT 0 5 0
+UP 0 1 0
+
+//=====================================
+// Ceiling light
+OBJECT 0
+cube
+material 0
+TRANS 0 10 0
+ROTAT 0 0 0
+SCALE 3 .3 3
+
+// Floor
+OBJECT 1
+cube
+material 1
+TRANS 0 0 0
+ROTAT 0 0 0
+SCALE 11 .01 11
+
+// Ceiling
+OBJECT 2
+cube
+material 1
+TRANS 0 10 0
+ROTAT 0 0 90
+SCALE .01 11 11
+
+// Back wall
+OBJECT 3
+cube
+material 1
+TRANS 0 5 -5
+ROTAT 0 90 0
+SCALE .01 11 11
+
+// Left wall
+OBJECT 4
+cube
+material 2
+TRANS -5 5 0
+ROTAT 0 0 0
+SCALE .01 11 11
+
+// Right wall
+OBJECT 5
+cube
+material 3
+TRANS 5 5 0
+ROTAT 0 0 0
+SCALE .01 11 11
+
+// Sphere
+OBJECT 6
+sphere
+material 6
+TRANS -2 4 -1
+ROTAT 0 0 0
+SCALE 3 3 3
+
+// Left wall mirror
+OBJECT 7
+cube
+material 5
+TRANS -4.9 5 1
+ROTAT 0 0 0
+SCALE .01 3 8
+
+// Right wall mirror
+OBJECT 8
+cube
+material 5
+TRANS 4.9 5 1
+ROTAT 0 0 0
+SCALE .01 3 8
+
+
+// Sphere
+OBJECT 9
+sphere
+material 7
+TRANS 2 5 -2
+ROTAT 0 0 0
+SCALE 3 3 3
+
+// Sphere
+OBJECT 10
+sphere
+material 4
+TRANS 3 2 3
+ROTAT 0 0 0
+SCALE 2 2 2
+
+// Ceiling light 2
+//====================
+OBJECT 11
+cube
+material 0
+TRANS 0 10 4
+ROTAT 0 0 0
+SCALE 6 .2 1
+
+// sphere
+OBJECT 12
+sphere
+material 8
+TRANS -2.8 0.6 2.5
+ROTAT 0 0 0
+SCALE 0.5 0.5 0.5
+
+// sphere
+OBJECT 13
+sphere
+material 8
+TRANS -3 0.5 3
+ROTAT 0 0 0
+SCALE 0.5 0.5 0.5
+
+// sphere
+OBJECT 14
+sphere
+material 8
+TRANS -3.5 0.75 3.2
+ROTAT 0 0 0
+SCALE 0.5 0.5 0.5
+
+// sphere
+OBJECT 15
+sphere
+material 8
+TRANS -3.5 0.5 2.5
+ROTAT 0 0 0
+SCALE 0.5 0.5 0.5
+
+
+// sphere
+OBJECT 16
+sphere
+material 8
+TRANS -0.8 0.35 2.5
+ROTAT 0 0 0
+SCALE 0.5 0.5 0.5
+
+// sphere
+OBJECT 17
+sphere
+material 8
+TRANS -1 0.5 3
+ROTAT 0 0 0
+SCALE 0.5 0.5 0.5
+
+// sphere
+OBJECT 18
+sphere
+material 8
+TRANS -1.5 0.45 3.2
+ROTAT 0 0 0
+SCALE 0.5 0.5 0.5
+
+// sphere
+OBJECT 19
+sphere
+material 8
+TRANS -1.5 0.5 2.5
+ROTAT 0 0 0
+SCALE 0.5 0.5 0.5
+
+
+// sphere
+OBJECT 20
+sphere
+material 8
+TRANS -1.95 0.45 3.6
+ROTAT 0 0 0
+SCALE 0.5 0.5 0.5
+
+// sphere
+OBJECT 21
+sphere
+material 8
+TRANS -1.89 0.45 3.9
+ROTAT 0 0 0
+SCALE 0.5 0.5 0.5
+
+
+// sphere
+OBJECT 22
+sphere
+material 8
+TRANS -1.95 0.45 3.2
+ROTAT 0 0 0
+SCALE 0.5 0.5 0.5
+
+// sphere
+OBJECT 23
+sphere
+material 8
+TRANS -1.89 0.45 2.8
+ROTAT 0 0 0
+SCALE 0.5 0.5 0.5
+
+// sphere
+OBJECT 24
+sphere
+material 8
+TRANS 0.95 0.45 3.6
+ROTAT 0 0 0
+SCALE 0.5 0.5 0.5
+
+// sphere
+OBJECT 25
+sphere
+material 8
+TRANS 1.89 0.40 3.9
+ROTAT 0 0 0
+SCALE 0.5 0.5 0.5
+
+// sphere
+OBJECT 26
+sphere
+material 8
+TRANS 1.3 0.40 3.9
+ROTAT 0 0 0
+SCALE 0.5 0.5 0.5
\ No newline at end of file
diff --git a/src/common.cu b/src/common.cu
new file mode 100644
index 0000000..f7a70a6
--- /dev/null
+++ b/src/common.cu
@@ -0,0 +1,54 @@
+#include "common.h"
+
+//void checkCUDAErrorFn(const char *msg, const char *file, int line) {
+// cudaError_t err = cudaGetLastError();
+// if (cudaSuccess == err) {
+// return;
+// }
+//
+// fprintf(stderr, "CUDA error");
+// if (file) {
+// fprintf(stderr, " (%s:%d)", file, line);
+// }
+// fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
+// exit(EXIT_FAILURE);
+//}
+
+
+namespace StreamCompaction {
+ namespace Common {
+
+ /**
+ * Maps an array to an array of 0s and 1s for stream compaction. Elements
+ * which map to 0 will be removed, and elements which map to 1 will be kept.
+ */
+ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
+ // TODO
+ int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (tid >= n) return;
+
+ if(idata[tid]==-1){
+ bools[tid] = 0;
+ }
+ else {
+ bools[tid] = 1;
+ }
+ }
+
+ /**
+ * Performs scatter on an array. That is, for each element in idata,
+ * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
+ */
+ __global__ void kernScatter(int n, int *odata,
+ const int *idata, const int *bools, const int *indices) {
+ // TODO
+ int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (tid >= n) return;
+
+ if (bools[tid] == 1) {
+ odata[indices[tid]] = idata[tid];
+ }
+ }
+
+ }
+}
diff --git a/src/common.h b/src/common.h
new file mode 100644
index 0000000..52eccf1
--- /dev/null
+++ b/src/common.h
@@ -0,0 +1,133 @@
+#pragma once
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <cstdio>
+#include <cstring>
+#include <cmath>
+#include <algorithm>
+#include <chrono>
+#include <stdexcept>
+
+#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
+#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
+#define blockSize 1024
+
+/**
+ * Check for CUDA errors; print and exit if there was a problem.
+ */
+void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1);
+
+inline int ilog2(int x) {
+ int lg = 0;
+ while (x >>= 1) {
+ ++lg;
+ }
+ return lg;
+}
+
+inline int ilog2ceil(int x) {
+ return x == 1 ? 0 : ilog2(x - 1) + 1;
+}
+
+namespace StreamCompaction {
+ namespace Common {
+ __global__ void kernMapToBoolean(int n, int *bools, const int *idata);
+
+ __global__ void kernScatter(int n, int *odata,
+ const int *idata, const int *bools, const int *indices);
+
+ /**
+ * This class is used for timing the performance
+ * Uncopyable and unmovable
+ *
+ * Adapted from WindyDarian(https://github.com/WindyDarian)
+ */
+ class PerformanceTimer
+ {
+ public:
+ PerformanceTimer()
+ {
+ cudaEventCreate(&event_start);
+ cudaEventCreate(&event_end);
+ }
+
+ ~PerformanceTimer()
+ {
+ cudaEventDestroy(event_start);
+ cudaEventDestroy(event_end);
+ }
+
+ void startCpuTimer()
+ {
+ if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); }
+ cpu_timer_started = true;
+
+ time_start_cpu = std::chrono::high_resolution_clock::now();
+ }
+
+ void endCpuTimer()
+ {
+ time_end_cpu = std::chrono::high_resolution_clock::now();
+
+ if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); }
+
+ std::chrono::duration<double, std::milli> duro = time_end_cpu - time_start_cpu;
+ prev_elapsed_time_cpu_milliseconds =
+ static_cast<decltype(prev_elapsed_time_cpu_milliseconds)>(duro.count());
+
+ cpu_timer_started = false;
+ }
+
+ void startGpuTimer()
+ {
+ if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); }
+ gpu_timer_started = true;
+
+ cudaEventRecord(event_start);
+ }
+
+ void endGpuTimer()
+ {
+ cudaEventRecord(event_end);
+ cudaEventSynchronize(event_end);
+
+ if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); }
+
+ cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end);
+ gpu_timer_started = false;
+ }
+
+ float getCpuElapsedTimeForPreviousOperation() //noexcept //(damn I need VS 2015
+ {
+ return prev_elapsed_time_cpu_milliseconds;
+ }
+
+ float getGpuElapsedTimeForPreviousOperation() //noexcept
+ {
+ return prev_elapsed_time_gpu_milliseconds;
+ }
+
+ // remove copy and move functions
+ PerformanceTimer(const PerformanceTimer&) = delete;
+ PerformanceTimer(PerformanceTimer&&) = delete;
+ PerformanceTimer& operator=(const PerformanceTimer&) = delete;
+ PerformanceTimer& operator=(PerformanceTimer&&) = delete;
+
+ private:
+ cudaEvent_t event_start = nullptr;
+ cudaEvent_t event_end = nullptr;
+
+ using time_point_t = std::chrono::high_resolution_clock::time_point;
+ time_point_t time_start_cpu;
+ time_point_t time_end_cpu;
+
+ bool cpu_timer_started = false;
+ bool gpu_timer_started = false;
+
+ float prev_elapsed_time_cpu_milliseconds = 0.f;
+ float prev_elapsed_time_gpu_milliseconds = 0.f;
+ };
+ }
+}
diff --git a/src/efficient.cu b/src/efficient.cu
new file mode 100644
index 0000000..a180665
--- /dev/null
+++ b/src/efficient.cu
@@ -0,0 +1,507 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "common.h"
+#include "efficient.h"
+
+namespace StreamCompaction {
+ namespace Efficient {
+ using StreamCompaction::Common::PerformanceTimer;
+ PerformanceTimer& timer()
+ {
+ static PerformanceTimer timer;
+ return timer;
+ }
+
+ int *dev_arrayA;
+ int *dev_arrayB;
+
+ int *dev_bools;
+ int *dev_boolScans;
+
+ int *dev_idata;
+ int *dev_odata;
+
+ int * dev_indices;
+
+ int *dev_lastElements;
+ int *dev_lastElements2;
+
+ void printArray(int n, const int *a, bool abridged = false) {
+ printf(" [ ");
+ for (int i = 0; i < n; i++) {
+ if (abridged && i + 2 == 15 && n > 16) {
+ i = n - 2;
+ printf("... ");
+ }
+ printf("%3d ", a[i]);
+ }
+ printf("]\n");
+ }
+
+ __global__ void kernEffScanUpSweep(int N, int pow2d, int pow2d1, int* arrA) {
+ int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (k >= N) return;
+
+ if ((k % pow2d1) == 0 && (k + pow2d1 - 1) < N && (k + pow2d - 1) < N) {
+ arrA[k + pow2d1 - 1] += arrA[k + pow2d - 1];
+ }
+ }
+
+ __global__ void kernEffScanDownSweep(int N, int pow2d, int pow2d1, int* arrA) {
+ int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (k >= N) return;
+
+ int tmp = 0;
+
+ if ((k % pow2d1) == 0 && (k + pow2d1 - 1) < N && (k + pow2d - 1) < N) {
+ tmp = arrA[k + pow2d - 1];
+ arrA[k + pow2d - 1] = arrA[k + pow2d1 - 1];
+ arrA[k + pow2d1 - 1] += tmp;
+ }
+ }
+
+ __global__ void kernInitZero(int N, int* array) {
+
+ int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+ if (tid < N) {
+ array[tid] = 0;
+ }
+ }
+
+ __global__ void kernScanShared(int n, int * g_odata, int * g_idata) {
+
+ extern __shared__ int temp[]; // allocated on invocation
+
+ int thid = threadIdx.x;
+ int tid_read = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+ if (tid_read >= n) return;
+
+ int offset = 1;
+
+ temp[2 * thid] = g_idata[2 * tid_read]; // load input into shared memory
+ temp[2 * thid + 1] = g_idata[2 * tid_read + 1];
+
+ // build sum in place up the tree
+ for (int d = 2 * blockDim.x >> 1; d > 0; d >>= 1)
+ {
+ __syncthreads();
+
+ if (thid < d)
+ {
+ int ai = offset * (2 * thid + 1) - 1;
+ int bi = offset * (2 * thid + 2) - 1;
+
+ temp[bi] += temp[ai];
+ }
+ offset *= 2;
+ }
+
+ if (thid == 0) { temp[2 * blockDim.x - 1] = 0; } // clear the last element
+
+ for (int d = 1; d < 2 * blockDim.x; d *= 2) // traverse down tree & build scan
+ {
+ offset >>= 1;
+ __syncthreads();
+
+ if (thid < d)
+ {
+ int ai = offset * (2 * thid + 1) - 1;
+ int bi = offset * (2 * thid + 2) - 1;
+
+ int t = temp[ai];
+ temp[ai] = temp[bi];
+ temp[bi] += t;
+
+ }
+ }
+
+ __syncthreads();
+
+ g_odata[2 * tid_read] = temp[2 * thid]; // write results to device memory
+ g_odata[2 * tid_read + 1] = temp[2 * thid + 1];
+ }
+
+ __global__ void kernGetLastElement(int n, int* s_data, int * g_odata, int * g_idata) {
+ int thid = threadIdx.x;
+
+ int tid_global = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+ if (tid_global >= n) return;
+
+ if (thid == blockDim.x - 1) {
+ s_data[blockIdx.x] = g_odata[tid_global] + g_idata[tid_global];
+ }
+ }
+
+ __global__ void kernUpdateScan(int n, int* s_data, int * g_odata, int * g_idata) {
+ int thid = threadIdx.x;
+ int tid_global = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+ if (tid_global >= n) return;
+
+ g_odata[tid_global] += s_data[blockIdx.x];
+
+ }
+
+ /*
+ * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+ */
+ /*
+ void scan(int n, int *odata, const int *idata) {
+
+ // TODO
+ int n_new = n;
+
+ //check for non-2powerN
+ if (1 << ilog2ceil(n) != n)
+ n_new = (1 << ilog2ceil(n));
+
+ int fullBlocksPerGrid((n_new + blockSize - 1) / blockSize);
+
+ cudaMalloc((void**)&dev_arrayA, n_new * sizeof(int));
+ checkCUDAErrorFn("cudaMalloc dev_arrayA failed!");
+
+ //Initialize to Zero
+ kernInitZero <<<fullBlocksPerGrid, blockSize>>> (n_new, dev_arrayA);
+ checkCUDAErrorFn("kernInitZero failed!");
+
+ // Fill dev_arrayA with idata
+ cudaMemcpy(dev_arrayA, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+ checkCUDAErrorFn("cudaMemcpyToSymbol from idata to dev_arrayA failed!");
+
+ bool tmp = true;
+ try {
+ timer().startGpuTimer();
+ //printf("IN WEScan timer started!\n");
+ }
+ catch (const std::runtime_error& e) {
+ tmp = false;
+ }
+
+ // Upstream
+ int pow2d1 = 0;
+ int pow2d = 0;
+ for (int d = 0; d <= ilog2ceil(n_new)-1; d++) {
+ pow2d = 1 << (d);
+ pow2d1 = 1 << (d+1);
+ kernEffScanUpSweep <<<fullBlocksPerGrid, blockSize>>> (n_new, pow2d, pow2d1, dev_arrayA);
+ checkCUDAErrorFn("kernEffScanUpSweep failed!");
+ }
+
+ // Downstream
+ int *zero = new int[1];
+ zero[0] = 0;
+ cudaMemcpy(dev_arrayA + n_new-1, zero, 1*sizeof(int), cudaMemcpyHostToDevice);
+
+ for (int d = ilog2ceil(n_new)-1; d >= 0; d--) {
+ pow2d = 1 << (d);
+ pow2d1 = 1 << (d + 1);
+ kernEffScanDownSweep <<<fullBlocksPerGrid, blockSize>>> (n_new, pow2d, pow2d1, dev_arrayA);
+ checkCUDAErrorFn("kernEffScanDownSweep failed!");
+ }
+
+ if (tmp == true) {
+ timer().endGpuTimer();
+ //printf("IN WEScan timer ended!\n");
+ }
+
+ // Copy back to cpu
+ cudaMemcpy(odata, dev_arrayA, n*sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAErrorFn("cudaMemcpyFromSymbol from dev_arrayA to odata failed!");
+
+ //printf("BBT Scan Final Computed : \n");
+ //printArray(n, odata, true);
+
+ cudaFree(dev_arrayA);
+ return;
+ }
+ */
+
+
+ void oldScan(int n_new, int *odata, int *idata) {
+
+ int fullBlocksPerGrid((n_new + blockSize - 1) / blockSize);
+
+ // Upstream
+ int pow2d1 = 0;
+ int pow2d = 0;
+ for (int d = 0; d <= ilog2ceil(n_new) - 1; d++) {
+ pow2d = 1 << (d);
+ pow2d1 = 1 << (d + 1);
+ kernEffScanUpSweep <<<fullBlocksPerGrid, blockSize>>> (n_new, pow2d, pow2d1, idata);
+ checkCUDAErrorFn("kernEffScanUpSweep failed!");
+ }
+
+ // Downstream
+ // Set the last element to zero before the downsweep
+ int zero = 0;
+ cudaMemcpy(idata + n_new - 1, &zero, sizeof(int), cudaMemcpyHostToDevice);
+
+ for (int d = ilog2ceil(n_new) - 1; d >= 0; d--) {
+ pow2d = 1 << (d);
+ pow2d1 = 1 << (d + 1);
+ kernEffScanDownSweep <<<fullBlocksPerGrid, blockSize>>> (n_new, pow2d, pow2d1, idata);
+ checkCUDAErrorFn("kernEffScanDownSweep failed!");
+ }
+
+ // Copy back to out
+ cudaMemcpy(odata, idata, n_new * sizeof(int), cudaMemcpyDeviceToDevice);
+ checkCUDAErrorFn("cudaMemcpyFromSymbol from dev_arrayB to odata failed!");
+ return;
+ }
+
+
+ void scan(int n, int *odata, const int *idata) {
+
+ // TODO
+ int n_new = n;
+ //int *tmp_print = new int[n];
+
+ //check for non-2powerN
+ if (1 << ilog2ceil(n) != n)
+ n_new = (1 << ilog2ceil(n));
+
+ int fullBlocksPerGrid((n_new + blockSize - 1) / blockSize);
+
+ cudaMalloc((void**)&dev_arrayA, n_new * sizeof(int));
+ checkCUDAErrorFn("cudaMalloc dev_arrayA failed!");
+
+ //Initialize to Zero
+ kernInitZero <<<fullBlocksPerGrid, blockSize>>> (n_new, dev_arrayA);
+ checkCUDAErrorFn("kernInitZero failed!");
+
+ // Fill dev_arrayA with idata
+ cudaMemcpy(dev_arrayA, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+ checkCUDAErrorFn("cudaMemcpyToSymbol from idata to dev_arrayA failed!");
+
+ // More arrays
+ cudaMalloc((void**)&dev_odata, n_new * sizeof(int));
+ checkCUDAErrorFn("cudaMalloc dev_odata failed!");
+
+ cudaMalloc((void**)&dev_lastElements, n_new * sizeof(int));
+ checkCUDAErrorFn("cudaMalloc dev_lastElements failed!");
+
+ cudaMalloc((void**)&dev_lastElements2, n_new * sizeof(int));
+ checkCUDAErrorFn("cudaMalloc dev_lastElements2 failed!");
+
+ bool tmp = true;
+ try {
+ timer().startGpuTimer();
+ //printf("IN WEScan timer started!\n");
+ }
+ catch (const std::runtime_error& e) {
+ tmp = false;
+ }
+
+ //printf("\n==========================STARTED WES================================\n");
+ //printf("Pre Scan Array \n");
+ //printArray(n, idata, true);
+
+ //fullBlocksPerGrid = 4;
+
+ kernScanShared <<<fullBlocksPerGrid, blockSize / 2, blockSize * sizeof(int)>>> (n_new, dev_odata, dev_arrayA);
+ //cudaMemcpy(tmp_print, dev_odata, n_new * sizeof(int), cudaMemcpyDeviceToHost);
+ //checkCUDAErrorFn("cudaMemcpyFromSymbol from dev_odata to tmp_print failed!");
+ //printf("kernScanShared results per %d blocks\n", fullBlocksPerGrid);
+ //printArray(n_new, tmp_print, true);
+
+ kernGetLastElement <<<fullBlocksPerGrid, blockSize, blockSize * sizeof(int)>>> (n_new, dev_lastElements, dev_odata, dev_arrayA);
+ //cudaMemcpy(tmp_print, dev_lastElements, fullBlocksPerGrid * sizeof(int), cudaMemcpyDeviceToHost);
+ //checkCUDAErrorFn("cudaMemcpyFromSymbol from dev_odata to odata failed!");
+ //printf("kernGetLastElement results\n");
+ //printArray(fullBlocksPerGrid, tmp_print, true);
+
+ oldScan(fullBlocksPerGrid, dev_lastElements2, dev_lastElements);
+
+ //kernScanShared << < 1, blockSize / 2, blockSize * sizeof(int) >> > (n_new, dev_lastElements2, dev_lastElements);
+ //cudaMemcpy(tmp_print, dev_lastElements2, fullBlocksPerGrid * sizeof(int), cudaMemcpyDeviceToHost);
+ //checkCUDAErrorFn("cudaMemcpyFromSymbol from dev_odata to odata failed!");
+ //printf("scan on kernGetLastElement\n");
+ //printArray(fullBlocksPerGrid, tmp_print, true);
+
+ kernUpdateScan <<<fullBlocksPerGrid, blockSize>>> (n_new, dev_lastElements2, dev_odata, dev_arrayA);
+ //cudaMemcpy(tmp_print, dev_odata, n_new * sizeof(int), cudaMemcpyDeviceToHost);
+ //checkCUDAErrorFn("cudaMemcpyFromSymbol from dev_odata to odata failed!");
+ //printf("FINAL Scan results\n");
+ //printArray(n_new, tmp_print, true);
+ //printf("\n==========================FINISHED WES================================\n");
+
+
+ if (tmp == true) {
+ timer().endGpuTimer();
+ //printf("IN WEScan timer ended!\n");
+ }
+
+ // Copy back
+ cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAErrorFn("cudaMemcpyFromSymbol from dev_arrayA to odata failed!");
+
+ //printf("BBT Scan Final Computed : \n");
+ //printArray(n, odata, true);
+ cudaFree(dev_arrayA);
+ cudaFree(dev_odata);
+ cudaFree(dev_lastElements);
+ cudaFree(dev_lastElements2);
+
+ return;
+ }
+
+
+
+ void compact_scan(int n, int *dev_odata, int *dev_idata) {
+
+ // TODO
+ int n_new = n;
+ //int *tmp_print = new int[n];
+
+ //check for non-2powerN
+ if (1 << ilog2ceil(n) != n) {
+ n_new = (1 << ilog2ceil(n));
+ }
+
+ int fullBlocksPerGrid((n_new + blockSize - 1) / blockSize);
+
+ cudaMalloc((void**)&dev_arrayA, n_new * sizeof(int));
+ checkCUDAErrorFn("cudaMalloc dev_arrayA failed!");
+
+ //Initialize to Zero
+ kernInitZero <<<fullBlocksPerGrid, blockSize>>> (n_new, dev_arrayA);
+ checkCUDAErrorFn("kernInitZero failed!");
+
+ // Fill dev_arrayA with idata
+ cudaMemcpy(dev_arrayA, dev_idata, n * sizeof(int), cudaMemcpyDeviceToDevice);
+ checkCUDAErrorFn("cudaMemcpyToSymbol from idata to dev_arrayA failed!");
+
+ // More arrays
+ cudaMalloc((void**)&dev_lastElements, n_new * sizeof(int));
+ checkCUDAErrorFn("cudaMalloc dev_lastElements failed!");
+
+ cudaMalloc((void**)&dev_lastElements2, n_new * sizeof(int));
+ checkCUDAErrorFn("cudaMalloc dev_lastElements2 failed!");
+
+ bool tmp = true;
+ try {
+ timer().startGpuTimer();
+ //printf("IN WEScan timer started!\n");
+ }
+ catch (const std::runtime_error& e) {
+ tmp = false;
+ }
+
+ kernScanShared <<<fullBlocksPerGrid, blockSize / 2, blockSize * sizeof(int)>>> (n_new, dev_odata, dev_arrayA);
+ //cudaMemcpy(tmp_print, dev_odata, n_new * sizeof(int), cudaMemcpyDeviceToHost);
+
+ kernGetLastElement <<<fullBlocksPerGrid, blockSize, blockSize * sizeof(int)>>> (n_new, dev_lastElements, dev_odata, dev_arrayA);
+ //cudaMemcpy(tmp_print, dev_lastElements, fullBlocksPerGrid * sizeof(int), cudaMemcpyDeviceToHost);
+
+ oldScan(fullBlocksPerGrid, dev_lastElements2, dev_lastElements);
+ //kernScanShared << < 1, blockSize / 2, blockSize * sizeof(int) >> > (n_new, dev_lastElements2, dev_lastElements);
+
+ kernUpdateScan <<<fullBlocksPerGrid, blockSize>>> (n_new, dev_lastElements2, dev_odata, dev_arrayA);
+ //cudaMemcpy(tmp_print, dev_odata, n_new * sizeof(int), cudaMemcpyDeviceToHost);
+
+ if (tmp == true) {
+ timer().endGpuTimer();
+ //printf("IN WEScan timer ended!\n");
+ }
+
+ // Copy back
+ //cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+ //checkCUDAErrorFn("cudaMemcpyFromSymbol from dev_arrayA to odata failed!");
+
+ //printf("BBT Scan Final Computed : \n");
+ //printArray(n, odata, true);
+ cudaFree(dev_arrayA);
+ cudaFree(dev_lastElements);
+ cudaFree(dev_lastElements2);
+
+ return;
+ }
+
+
+ /**
+ * Performs stream compaction on idata, storing the result into odata.
+ * All zeroes are discarded.
+ *
+ * @param n The number of elements in idata.
+ * @param odata The array into which to store elements.
+ * @param idata The array of elements to compact.
+ * @returns The number of elements remaining after compaction.
+ */
+
+ int compact(int n, int *dev_idata) {
+
+ // TODO
+ int fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+ cudaMalloc((void**)&dev_bools, n * sizeof(int));
+ checkCUDAErrorFn("cudaMalloc dev_bools failed!");
+
+ //cudaMalloc((void**)&dev_idata, n * sizeof(int));
+ //checkCUDAErrorFn("cudaMalloc dev_arrayA failed!");
+
+ //cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+ //checkCUDAErrorFn("cudaMemcpyToSymbol from idata to dev_arrayA failed!");
+
+ int n_new = n;
+ if (1 << ilog2ceil(n) != n) {
+ n_new = (1 << ilog2ceil(n));
+ } // allocate enough memory to handle a non-power-of-two size
+ cudaMalloc((void**)&dev_indices, n_new * sizeof(int));
+ checkCUDAErrorFn("cudaMalloc dev_indices failed!");
+
+ cudaMalloc((void**)&dev_odata, n * sizeof(int));
+ checkCUDAErrorFn("cudaMalloc dev_indices failed!");
+
+
+ timer().startGpuTimer();
+
+ //Compute bools
+ Common::kernMapToBoolean <<<fullBlocksPerGrid, blockSize>>> (n, dev_bools, dev_idata);
+ checkCUDAErrorFn("kernMapToBoolean failed!");
+
+ //compute scans
+ compact_scan(n, dev_indices, dev_bools);
+
+ //scatter
+ Common::kernScatter <<<fullBlocksPerGrid, blockSize>>> (n, dev_odata, dev_idata, dev_bools, dev_indices);
+ checkCUDAErrorFn("kernScatter failed!");
+
+ timer().endGpuTimer();
+
+ int *lastiEl = new int[1];
+ cudaMemcpy(lastiEl, dev_idata + n - 1, 1 * sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAErrorFn("cudaMemcpyFromSymbol from dev_odata to odata failed!");
+
+ // Copy back to cpu
+ cudaMemcpy(dev_idata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToDevice);
+ checkCUDAErrorFn("cudaMemcpyFromSymbol from dev_odata to odata failed!");
+
+
+ int *lastEl = new int[1];
+ cudaMemcpy(lastEl, dev_indices + n - 1, 1 * sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAErrorFn("cudaMemcpyFromSymbol from dev_odata to odata failed!");
+
+
+
+ //printf("GPU Compaction : \n");
+ //printArray(n, odata, true);
+
+ cudaFree(dev_bools);
+ //cudaFree(dev_idata);
+ cudaFree(dev_indices);
+ cudaFree(dev_odata);
+
+ // kernMapToBoolean keeps elements != -1, so the surviving count is the
+ // exclusive scan's last value, plus one if the last input element survived.
+ int count = (lastiEl[0] != -1) ? lastEl[0] + 1 : lastEl[0];
+ delete[] lastiEl;
+ delete[] lastEl;
+ return count;
+ }
+ }
+}
diff --git a/src/efficient.h b/src/efficient.h
new file mode 100644
index 0000000..a50dcfa
--- /dev/null
+++ b/src/efficient.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "common.h"
+
+namespace StreamCompaction {
+ namespace Efficient {
+ StreamCompaction::Common::PerformanceTimer& timer();
+
+ void scan(int n, int *odata, const int *idata);
+
+ int compact(int n, int *idata);
+ }
+}
diff --git a/src/interactions.h b/src/interactions.h
index 5ce3628..4c7afe6 100644
--- a/src/interactions.h
+++ b/src/interactions.h
@@ -76,4 +76,4 @@ void scatterRay(
// TODO: implement this.
// A basic implementation of pure-diffuse shading will just call the
// calculateRandomDirectionInHemisphere defined above.
-}
+}
\ No newline at end of file
diff --git a/src/pathtrace.cu b/src/pathtrace.cu
index c1ec122..1b5a483 100644
--- a/src/pathtrace.cu
+++ b/src/pathtrace.cu
@@ -9,12 +9,38 @@
#include "scene.h"
#include "glm/glm.hpp"
#include "glm/gtx/norm.hpp"
+#include "glm/gtx/transform.hpp"
+#include "glm/gtc/matrix_inverse.hpp"
#include "utilities.h"
#include "pathtrace.h"
#include "intersections.h"
#include "interactions.h"
-
-#define ERRORCHECK 1
+#include <thrust/remove.h>
+#include <thrust/partition.h>
+
+#include "efficient.h"
+
+#define ERRORCHECK 1
+
+//=======================
+// FEATURE SWITCH
+//=======================
+
+// Core Feature Switch
+//========================
+// Kernel and BSDF
+// Basic Stream Compaction
+#define THRUSTSTCOMP 1 //slow
+#define SORTBYMATERIAL 0 //slow
+#define FIRSTCACHE 1
+#define TIMEDEPTH 0
+// Advance Features
+//========================
+#define ANTIALIASING 0
+#define WORKEFFCOMP 0
+#define MOTIONBLUR 0
+#define DEPTHOFFIELD 0 // not implemented
#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
@@ -73,8 +99,15 @@ static Geom * dev_geoms = NULL;
static Material * dev_materials = NULL;
static PathSegment * dev_paths = NULL;
static ShadeableIntersection * dev_intersections = NULL;
+
// TODO: static variables for device memory, any extra info you need, etc
-// ...
+static ShadeableIntersection * dev_intersections_cache = NULL;
+static int * materials_to_sort = NULL;
+int * dev_paths_idx = NULL;
+
+// time iteration
+cudaEvent_t start, stop;
+
void pathtraceInit(Scene *scene) {
hst_scene = scene;
@@ -96,8 +129,19 @@ void pathtraceInit(Scene *scene) {
cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
// TODO: initialize any extra device memeory you need
+ cudaMalloc(&dev_intersections_cache, pixelcount * sizeof(ShadeableIntersection));
+ cudaMemset(dev_intersections_cache, 0, pixelcount * sizeof(ShadeableIntersection));
+
+ cudaMalloc(&materials_to_sort, pixelcount * sizeof(int));
+ cudaMemset(materials_to_sort, 0, pixelcount * sizeof(int));
+
+ cudaMalloc(&dev_paths_idx, pixelcount * sizeof(int));
+ cudaMemset(dev_paths_idx, 0, pixelcount * sizeof(int));
checkCUDAError("pathtraceInit");
+
+ cudaEventCreate(&start);
+ cudaEventCreate(&stop);
}
void pathtraceFree() {
@@ -106,8 +150,12 @@ void pathtraceFree() {
cudaFree(dev_geoms);
cudaFree(dev_materials);
cudaFree(dev_intersections);
- // TODO: clean up any extra device memory you created
+ // TODO: clean up any extra device memory you created
+ cudaFree(dev_intersections_cache);
+ cudaFree(materials_to_sort);
+ cudaFree(dev_paths_idx);
+
checkCUDAError("pathtraceFree");
}
@@ -119,7 +167,7 @@ void pathtraceFree() {
* motion blur - jitter rays "in time"
* lens effect - jitter ray origin positions based on a lens
*/
-__global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, PathSegment* pathSegments)
+__global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, PathSegment* pathSegments, int * dev_paths_idx)
{
int x = (blockIdx.x * blockDim.x) + threadIdx.x;
int y = (blockIdx.y * blockDim.y) + threadIdx.y;
@@ -129,9 +177,15 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
PathSegment & segment = pathSegments[index];
segment.ray.origin = cam.position;
- segment.color = glm::vec3(1.0f, 1.0f, 1.0f);
+ segment.color = glm::vec3(1.0f, 1.0f, 1.0f);
// TODO: implement antialiasing by jittering the ray
+#if ANTIALIASING
+ thrust::default_random_engine rng = makeSeededRandomEngine(iter, segment.remainingBounces, index);
+ thrust::uniform_real_distribution<float> u01(-0.49, 0.49);
+ x += u01(rng);
+ y += u01(rng);
+#endif
segment.ray.direction = glm::normalize(cam.view
- cam.right * cam.pixelLength.x * ((float)x - (float)cam.resolution.x * 0.5f)
- cam.up * cam.pixelLength.y * ((float)y - (float)cam.resolution.y * 0.5f)
@@ -139,6 +193,39 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
segment.pixelIndex = index;
segment.remainingBounces = traceDepth;
+ dev_paths_idx[index] = index;
+ //printf("%d generateRayFromCamera indices %d %d\n", iter, index, dev_paths_idx[index]);
+ }
+}
+
+__global__ void KernMotionBlur(int depth
+ , Geom * geoms
+ , int geoms_size
+ , int iter
+ , glm::vec3 speed) {
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ if (idx == 0)
+ {
+ for (int i = 0; i < geoms_size; i++)
+ {
+ Geom & geom = geoms[i];
+ if (geom.type == SPHERE && (geom.materialid != 0)) {
+
+ glm::vec3 translation = geom.translation + speed;
+
+ glm::mat4 translationMat = glm::translate(glm::mat4(), (1.0f*iter)*translation);
+ glm::mat4 rotationMat = glm::rotate(glm::mat4(), geom.rotation.x * (float)PI / 180, glm::vec3(1, 0, 0));
+ rotationMat = rotationMat * glm::rotate(glm::mat4(), geom.rotation.y * (float)PI / 180, glm::vec3(0, 1, 0));
+ rotationMat = rotationMat * glm::rotate(glm::mat4(), geom.rotation.z * (float)PI / 180, glm::vec3(0, 0, 1));
+ glm::mat4 scaleMat = glm::scale(glm::mat4(), geom.scale);
+
+
+ geom.transform = translationMat * rotationMat * scaleMat;
+ geom.inverseTransform = glm::inverse(geom.transform);
+ geom.invTranspose = glm::inverseTranspose(geom.transform);
+
+ }
+ }
}
}
@@ -153,14 +240,17 @@ __global__ void computeIntersections(
, Geom * geoms
, int geoms_size
, ShadeableIntersection * intersections
+ ,int iter
+ , int *dev_paths_idx
)
{
int path_index = blockIdx.x * blockDim.x + threadIdx.x;
- if (path_index < num_paths)
- {
- PathSegment pathSegment = pathSegments[path_index];
-
+ if (path_index < num_paths && dev_paths_idx[path_index] != -1)
+ {
+ //printf("%d inComputeInteractions numpaths %d %d %d \n", iter, num_paths, path_index, dev_paths_idx[path_index]);
+ PathSegment pathSegment = pathSegments[dev_paths_idx[path_index]];
+
float t;
glm::vec3 intersect_point;
glm::vec3 normal;
@@ -171,18 +261,32 @@ __global__ void computeIntersections(
glm::vec3 tmp_intersect;
glm::vec3 tmp_normal;
- // naive parse through global geoms
+ float alpha = 0.8;
+ glm::mat4 motion = glm::mat4(1.0f, 0.0f, 0.0f, iter*0.0f,
+ 0.0f, 1.0f, 0.0f, iter*0.0005f,
+ 0.0f, 0.0f, 1.0f, iter*0.0f,
+ 0.0f, 0.0f, 0.0f, 1.0f);
+ // naive parse through global geoms
for (int i = 0; i < geoms_size; i++)
{
Geom & geom = geoms[i];
- if (geom.type == CUBE)
+ if (geom.type == CUBE )
{
t = boxIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside);
}
else if (geom.type == SPHERE)
- {
+ {
+
+#if MOTIONBLUR
+ if (geom.materialid != 0) {
+ geom.transform = alpha * geom.transformInitial + (1 - alpha)*motion*geom.transformInitial;
+ geom.inverseTransform = glm::inverse(geom.transform);
+ geom.invTranspose = glm::inverseTranspose(geom.transform);
+ }
+#endif
+
t = sphereIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside);
}
// TODO: add more intersection tests here... triangle? metaball? CSG?
@@ -200,14 +304,15 @@ __global__ void computeIntersections(
if (hit_geom_index == -1)
{
- intersections[path_index].t = -1.0f;
+ intersections[dev_paths_idx[path_index]].t = -1.0f;
}
else
{
//The ray hits something
- intersections[path_index].t = t_min;
- intersections[path_index].materialId = geoms[hit_geom_index].materialid;
- intersections[path_index].surfaceNormal = normal;
+ intersections[dev_paths_idx[path_index]].t = t_min;
+ intersections[dev_paths_idx[path_index]].materialId = geoms[hit_geom_index].materialid;
+ intersections[dev_paths_idx[path_index]].surfaceNormal = normal;
+ intersections[dev_paths_idx[path_index]].intersectionPoint = intersect_point;
}
}
}
@@ -227,6 +332,7 @@ __global__ void shadeFakeMaterial (
, ShadeableIntersection * shadeableIntersections
, PathSegment * pathSegments
, Material * materials
+ , int depth
)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -237,7 +343,7 @@ __global__ void shadeFakeMaterial (
// Set up the RNG
// LOOK: this is how you use thrust's RNG! Please look at
// makeSeededRandomEngine as well.
- thrust::default_random_engine rng = makeSeededRandomEngine(iter, idx, 0);
+ thrust::default_random_engine rng = makeSeededRandomEngine(iter, idx, depth);
thrust::uniform_real_distribution<float> u01(0, 1);
Material material = materials[intersection.materialId];
@@ -265,23 +371,186 @@ __global__ void shadeFakeMaterial (
}
}
+
+
+// NEW SHADER!
+
+
+__global__ void shadeMaterial (
+ int iter
+ , int num_paths
+ , ShadeableIntersection * shadeableIntersections
+ , PathSegment * pathSegments
+ , Material * materials
+ , int depth
+ , int *dev_paths_idx
+ ) {
+
+ int idxd = blockIdx.x * blockDim.x + threadIdx.x;
+ int idx = dev_paths_idx[idxd];
+
+ if (dev_paths_idx[idxd] == -1) return;
+
+ if (idxd < num_paths && (pathSegments[idx].remainingBounces > 0))
+
+ {
+ ShadeableIntersection intersection = shadeableIntersections[idx];
+
+ if (intersection.t > 0.0f) { // if the intersection exists...
+
+ thrust::default_random_engine rng = makeSeededRandomEngine(iter, idx, depth);
+ thrust::uniform_real_distribution<float> u01(0, 1);
+
+ Material material = materials[intersection.materialId];
+ glm::vec3 materialColor = material.color;
+
+ // If the material indicates that the object was a light, "light" the ray
+ if (material.emittance > 0.0f) {
+ pathSegments[idx].color *= (materialColor * material.emittance);
+ pathSegments[idx].remainingBounces = 0;
+ dev_paths_idx[idxd] = -1;
+ }
+ else {
+ // Randomly sample among the three material types:
+ // material.hasReflective + material.hasRefractive; 1 - their sum is the diffuse probability.
+ // assert(material.hasReflective + material.hasRefractive <= 0.1f);
+ char scase = 'D';
+ if (material.hasReflective == 0.0f && material.hasRefractive == 0.0f) {
+ scase = 'D'; // Diffuse
+ }
+ else if (material.hasRefractive == 1.0f && material.hasReflective == 0.0f) {
+ scase = 'F'; // Refractive
+ }
+ else if (material.hasReflective == 1.0f && material.hasRefractive == 0.0f) {
+ scase = 'R'; // Reflective
+ }
+ else if (material.hasReflective >= 0.0f && material.hasRefractive >= 0.0f) {
+ // randomly pick between the three cases
+ float rand = u01(rng);
+ float reflect = material.hasReflective;
+ float refract = material.hasRefractive + reflect;
+
+ if (rand <= reflect) { scase = 'R'; }
+ else if (rand > reflect && rand <= refract) { scase = 'F'; }
+ else { scase = 'D'; } // rand > refract -> diffuse
+ }
+
+ switch (scase) {
+
+ case 'D': // DIFFUSE
+ pathSegments[idx].color *= materialColor;
+ pathSegments[idx].ray.direction = calculateRandomDirectionInHemisphere(intersection.surfaceNormal, rng);
+ pathSegments[idx].ray.origin = intersection.intersectionPoint;
+ pathSegments[idx].remainingBounces -= 1;
+
+ break;
+
+ case 'F': // REFRACTION
+ //check change of media
+ float cosTheta = glm::dot(glm::normalize(pathSegments[idx].ray.direction), (intersection.surfaceNormal));
+
+ if (cosTheta > 0.0f) { // Object to Air
+ glm::vec3 tmp = glm::refract(pathSegments[idx].ray.direction, (glm::vec3(-1.0f)*intersection.surfaceNormal), material.indexOfRefraction);
+ if (glm::length(tmp) > 0.0000001f) {
+ pathSegments[idx].color *= material.specular.color;
+ pathSegments[idx].ray.direction = tmp;
+ pathSegments[idx].ray.origin = intersection.intersectionPoint;
+ pathSegments[idx].remainingBounces -= 1;
+ }
+ else {
+ // Reflection
+ pathSegments[idx].color *= material.specular.color;
+ pathSegments[idx].ray.direction = glm::reflect(pathSegments[idx].ray.direction, (glm::vec3(-1.0f)*intersection.surfaceNormal));
+ pathSegments[idx].ray.origin = intersection.intersectionPoint;
+ pathSegments[idx].remainingBounces -= 1;
+ }
+ }
+ else { // Air to Object
+ glm::vec3 tmp = glm::refract(pathSegments[idx].ray.direction, glm::vec3(1.0f)*intersection.surfaceNormal, (1.0f/material.indexOfRefraction));
+ if (glm::length(tmp) > 0.0000001f) {
+ pathSegments[idx].color *= material.specular.color;
+ pathSegments[idx].ray.direction = tmp;
+ pathSegments[idx].ray.origin = intersection.intersectionPoint;
+ pathSegments[idx].remainingBounces -= 1;
+ }
+ else {
+ // Reflection
+ pathSegments[idx].color *= material.specular.color;
+ pathSegments[idx].ray.direction = glm::reflect(pathSegments[idx].ray.direction, (glm::vec3(1.0f)*intersection.surfaceNormal));
+ pathSegments[idx].ray.origin = intersection.intersectionPoint;
+ pathSegments[idx].remainingBounces -= 1;
+ }
+ }
+ break;
+
+ case 'R': // REFLECTION
+ pathSegments[idx].color *= material.specular.color;
+ pathSegments[idx].ray.direction = glm::reflect(pathSegments[idx].ray.direction, intersection.surfaceNormal);
+ pathSegments[idx].ray.origin = intersection.intersectionPoint;
+ pathSegments[idx].remainingBounces -= 1;
+ break;
+ }
+
+ // offset ray
+ pathSegments[idx].ray.origin = pathSegments[idx].ray.origin + (pathSegments[idx].ray.direction)*glm::vec3(0.015f); // offset along the new direction to avoid self-intersection
+ // clamp color
+ pathSegments[idx].color = glm::clamp(pathSegments[idx].color, glm::vec3(0.0f), glm::vec3(1.0));
+
+ if(pathSegments[idx].remainingBounces == 0)
+ dev_paths_idx[idxd] = -1;
+ }
+ }
+ else {// If there was no intersection, color the ray black.
+ pathSegments[idx].color = glm::vec3(0.0f);
+ pathSegments[idx].remainingBounces = 0;
+ dev_paths_idx[idxd] = -1;
+ }
+ }
+}
+
+
// Add the current iteration's output to the overall image
__global__ void finalGather(int nPaths, glm::vec3 * image, PathSegment * iterationPaths)
{
int index = (blockIdx.x * blockDim.x) + threadIdx.x;
if (index < nPaths)
- {
+ {
PathSegment iterationPath = iterationPaths[index];
image[iterationPath.pixelIndex] += iterationPath.color;
}
}
+
+// Partition predicate: returns true for indices of paths that are still
+// active (idx >= 0), so live paths are kept at the front of the array.
+struct hasExited
+{
+ __host__ __device__
+ bool operator()(const int &dev_path_idx)
+ {
+ return (dev_path_idx >= 0);
+ }
+};
+
+
+struct materialCmp{
+ __host__ __device__
+ bool operator()(const ShadeableIntersection& m1, const ShadeableIntersection& m2) {
+ return m1.materialId < m2.materialId;
+ }
+};
+
/**
* Wrapper for the __global__ call that sets up the kernel calls and does a ton
* of memory management
*/
void pathtrace(uchar4 *pbo, int frame, int iter) {
+
const int traceDepth = hst_scene->state.traceDepth;
const Camera &cam = hst_scene->state.camera;
const int pixelcount = cam.resolution.x * cam.resolution.y;
@@ -326,58 +595,157 @@ void pathtrace(uchar4 *pbo, int frame, int iter) {
// TODO: perform one iteration of path tracing
- generateRayFromCamera <<<blocksPerGrid2d, blockSize2d>>>(cam, iter, traceDepth, dev_paths);
- checkCUDAError("generate camera ray");
+ generateRayFromCamera <<<blocksPerGrid2d, blockSize2d>>>(cam, iter, traceDepth, dev_paths, dev_paths_idx);
+ checkCUDAError("Error in generate camera ray");
int depth = 0;
PathSegment* dev_path_end = dev_paths + pixelcount;
int num_paths = dev_path_end - dev_paths;
+
+
// --- PathSegment Tracing Stage ---
// Shoot ray into scene, bounce between objects, push shading chunks
+ dim3 numblocksPathSegmentTracing;
- bool iterationComplete = false;
+ bool iterationComplete = false;
+
+
+
while (!iterationComplete) {
+#if TIMEDEPTH
+ cudaEventRecord(start);
+#endif
+#if FIRSTCACHE
+ if (depth == 0) {
+ if (iter == 1) {//cache first bounce
+ // clean shading chunks
+ cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
+ // tracing
+ numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d;
+ computeIntersections <<<numblocksPathSegmentTracing, blockSize1d>>> (
+ depth
+ , num_paths
+ , dev_paths
+ , dev_geoms
+ , hst_scene->geoms.size()
+ , dev_intersections
+ , iter
+ , dev_paths_idx
+ );
+ cudaMemcpy(dev_intersections_cache, dev_intersections,
+ pixelcount * sizeof(ShadeableIntersection), cudaMemcpyDeviceToDevice);
+ checkCUDAError("error in trace-one-bounce");
+ cudaDeviceSynchronize();
+ }
+ else {// use cached bounce!
+ // clean shading chunks
+ cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
+
+ cudaMemcpy(dev_intersections, dev_intersections_cache,
+ pixelcount * sizeof(ShadeableIntersection), cudaMemcpyDeviceToDevice);
+ }
+ }
+ else { // non-zero depth
+ // clean shading chunks
+ cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
+
+ // tracing
+ numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d;
+ computeIntersections <<<numblocksPathSegmentTracing, blockSize1d>>> (
+ depth
+ , num_paths
+ , dev_paths
+ , dev_geoms
+ , hst_scene->geoms.size()
+ , dev_intersections
+ , iter
+ , dev_paths_idx
+ );
+ checkCUDAError("trace one bounce");
+ cudaDeviceSynchronize();
+ }
+#else
+ // clean shading chunks
+ cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
+ // tracing
+ numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d;
+ computeIntersections <<<numblocksPathSegmentTracing, blockSize1d>>> (
+ depth
+ , num_paths
+ , dev_paths
+ , dev_geoms
+ , hst_scene->geoms.size()
+ , dev_intersections
+ , iter
+ , dev_paths_idx
+ );
+ checkCUDAError("error in trace-one-bounce");
+ cudaDeviceSynchronize();
+#endif
- // clean shading chunks
- cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
-
- // tracing
- dim3 numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d;
- computeIntersections <<<numblocksPathSegmentTracing, blockSize1d>>> (
- depth
- , num_paths
- , dev_paths
- , dev_geoms
- , hst_scene->geoms.size()
- , dev_intersections
+ depth++;
+
+ // TODO:
+ // --- Shading Stage ---
+ // Shade path segments based on intersections and generate new rays by
+ // evaluating the BSDF.
+ // Start off with just a big kernel that handles all the different
+ // materials you have in the scenefile.
+ // TODO: compare between directly shading the path segments and shading
+ // path segments that have been reshuffled to be contiguous in memory.
+
+ shadeMaterial<<<numblocksPathSegmentTracing, blockSize1d>>> (
+ iter,
+ num_paths,
+ dev_intersections,
+ dev_paths,
+ dev_materials,
+ depth,
+ dev_paths_idx
);
- checkCUDAError("trace one bounce");
- cudaDeviceSynchronize();
- depth++;
-
-
- // TODO:
- // --- Shading Stage ---
- // Shade path segments based on intersections and generate new rays by
- // evaluating the BSDF.
- // Start off with just a big kernel that handles all the different
- // materials you have in the scenefile.
- // TODO: compare between directly shading the path segments and shading
- // path segments that have been reshuffled to be contiguous in memory.
-
- shadeFakeMaterial<<<numblocksPathSegmentTracing, blockSize1d>>> (
- iter,
- num_paths,
- dev_intersections,
- dev_paths,
- dev_materials
- );
- iterationComplete = true; // TODO: should be based off stream compaction results.
+
+#if WORKEFFCOMP
+ //Compute stream compaction here
+ num_paths = StreamCompaction::Efficient::compact(num_paths, dev_paths_idx);
+ //cout << "num_paths " << num_paths << endl;
+#elif THRUSTSTCOMP
+ //Compute stream compaction here
+ int *end = thrust::partition(thrust::device, dev_paths_idx, dev_paths_idx + num_paths, hasExited());
+ num_paths = end - dev_paths_idx;
+#else
+ ; // No compaction at all
+#endif
+
+ iterationComplete = (num_paths <= 0) || (depth > traceDepth);
+
+#if TIMEDEPTH
+ cudaEventRecord(stop);
+ cudaEventSynchronize(stop);
+ float milliseconds = 0;
+ cudaEventElapsedTime(&milliseconds, start, stop);
+ printf("Iter %d \t depth %d \t LiveRays %d ElapsedTimems %0.04f\n", iter, depth, num_paths, milliseconds);
+#endif
+
+
+#if SORTBYMATERIAL
+ // sort rays by material id so shading reads are coherent
+ if (iterationComplete == false) {
+ thrust::stable_sort_by_key(thrust::device, dev_intersections, dev_intersections+num_paths, dev_paths, materialCmp());
+ }
+#endif
+
}
- // Assemble this iteration and apply it to the image
- dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d;
+
+ num_paths = pixelcount;
+
+ // Assemble this iteration and apply it to the image
+ dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d;
finalGather<<<numBlocksPixels, blockSize1d>>>(num_paths, dev_image, dev_paths);
///////////////////////////////////////////////////////////////////////////
diff --git a/src/pathtrace.h b/src/pathtrace.h
index 1241227..d67ac27 100644
--- a/src/pathtrace.h
+++ b/src/pathtrace.h
@@ -3,6 +3,7 @@
#include
#include "scene.h"
+
void pathtraceInit(Scene *scene);
void pathtraceFree();
void pathtrace(uchar4 *pbo, int frame, int iteration);
diff --git a/src/preview.cpp b/src/preview.cpp
index 4eb0bc1..9540fcb 100644
--- a/src/preview.cpp
+++ b/src/preview.cpp
@@ -171,7 +171,8 @@ bool init() {
void mainLoop() {
while (!glfwWindowShouldClose(window)) {
glfwPollEvents();
- runCuda();
+
+ runCuda();
string title = "CIS565 Path Tracer | " + utilityCore::convertIntToString(iteration) + " Iterations";
glfwSetWindowTitle(window, title.c_str());
diff --git a/src/scene.cpp b/src/scene.cpp
index cbae043..ac76663 100644
--- a/src/scene.cpp
+++ b/src/scene.cpp
@@ -81,6 +81,8 @@ int Scene::loadGeom(string objectid) {
newGeom.transform = utilityCore::buildTransformationMatrix(
newGeom.translation, newGeom.rotation, newGeom.scale);
+ newGeom.transformInitial = utilityCore::buildTransformationMatrix(
+ newGeom.translation, newGeom.rotation, newGeom.scale);
newGeom.inverseTransform = glm::inverse(newGeom.transform);
newGeom.invTranspose = glm::inverseTranspose(newGeom.transform);
diff --git a/src/sceneStructs.h b/src/sceneStructs.h
index b38b820..42eda26 100644
--- a/src/sceneStructs.h
+++ b/src/sceneStructs.h
@@ -24,6 +24,7 @@ struct Geom {
glm::vec3 rotation;
glm::vec3 scale;
glm::mat4 transform;
+ glm::mat4 transformInitial;
glm::mat4 inverseTransform;
glm::mat4 invTranspose;
};
@@ -73,4 +74,5 @@ struct ShadeableIntersection {
float t;
glm::vec3 surfaceNormal;
int materialId;
+ glm::vec3 intersectionPoint;
};
diff --git a/stream_compaction/Project2-Stream-Compaction/CMakeLists.txt b/stream_compaction/Project2-Stream-Compaction/CMakeLists.txt
new file mode 100644
index 0000000..aad00c7
--- /dev/null
+++ b/stream_compaction/Project2-Stream-Compaction/CMakeLists.txt
@@ -0,0 +1,34 @@
+cmake_minimum_required(VERSION 3.1)
+
+project(cis565_stream_compaction_test)
+
+set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
+
+# Enable C++11 for host code
+set(CMAKE_CXX_STANDARD 11)
+
+list(APPEND CUDA_NVCC_FLAGS_DEBUG -G -g)
+list(APPEND CUDA_NVCC_FLAGS_RELWITHDEBUGINFO -lineinfo)
+
+# Crucial magic for CUDA linking
+find_package(Threads REQUIRED)
+find_package(CUDA 10 REQUIRED)
+
+set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE ON)
+set(CUDA_SEPARABLE_COMPILATION ON)
+
+if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+endif()
+
+include_directories(.)
+add_subdirectory(stream_compaction)
+
+cuda_add_executable(${CMAKE_PROJECT_NAME}
+ "src/main.cpp"
+ )
+
+target_link_libraries(${CMAKE_PROJECT_NAME}
+ stream_compaction
+ ${CORELIBS}
+ )
diff --git a/stream_compaction/Project2-Stream-Compaction/GNUmakefile b/stream_compaction/Project2-Stream-Compaction/GNUmakefile
new file mode 100644
index 0000000..2b43311
--- /dev/null
+++ b/stream_compaction/Project2-Stream-Compaction/GNUmakefile
@@ -0,0 +1,31 @@
+CMAKE_ALT1 := /usr/local/bin/cmake
+CMAKE_ALT2 := /Applications/CMake.app/Contents/bin/cmake
+CMAKE := $(shell \
+ which cmake 2>/dev/null || \
+ ([ -e ${CMAKE_ALT1} ] && echo "${CMAKE_ALT1}") || \
+ ([ -e ${CMAKE_ALT2} ] && echo "${CMAKE_ALT2}") \
+ )
+
+all: Release
+
+
+Debug: build
+ (cd build && ${CMAKE} -DCMAKE_BUILD_TYPE=$@ .. && make)
+
+MinSizeRel: build
+ (cd build && ${CMAKE} -DCMAKE_BUILD_TYPE=$@ .. && make)
+
+Release: build
+ (cd build && ${CMAKE} -DCMAKE_BUILD_TYPE=$@ .. && make)
+
+RelWithDebugInfo: build
+ (cd build && ${CMAKE} -DCMAKE_BUILD_TYPE=$@ .. && make)
+
+
+build:
+ mkdir -p build
+
+clean:
+ ((cd build && make clean) 2>&- || true)
+
+.PHONY: all Debug MinSizeRel Release RelWithDebugInfo clean
diff --git a/stream_compaction/Project2-Stream-Compaction/INSTRUCTIONS_STREAM_COMPACTION.md b/stream_compaction/Project2-Stream-Compaction/INSTRUCTIONS_STREAM_COMPACTION.md
new file mode 100644
index 0000000..58b7474
--- /dev/null
+++ b/stream_compaction/Project2-Stream-Compaction/INSTRUCTIONS_STREAM_COMPACTION.md
@@ -0,0 +1,293 @@
+Project 2 Stream Compaction Instructions
+========================
+
+**Summary:** In this project, you'll implement GPU stream compaction in CUDA,
+from scratch. This algorithm is widely used, and will be important for
+accelerating your path tracer project.
+
+Your stream compaction implementations in this project will simply remove `0`s
+from an array of `int`s. In the path tracer, you will remove terminated paths
+from an array of rays.
+
+In addition to being useful for your path tracer, this project is meant to
+reorient your algorithmic thinking to the way of the GPU. On GPUs, many
+algorithms can benefit from massive parallelism and, in particular, data
+parallelism: executing the same code many times simultaneously with different
+data.
+
+You'll implement a few different versions of the *Scan* (*Prefix Sum*)
+algorithm. First, you'll implement a CPU version of the algorithm to reinforce
+your understanding. Then, you'll write a few GPU implementations: "naive" and
+"work-efficient." Finally, you'll use some of these to implement GPU stream
+compaction.
+
+**Algorithm overview & details:** There are two primary references for details
+on the implementation of scan and stream compaction.
+
+* The [slides on Parallel Algorithms](https://docs.google.com/presentation/d/1ETVONA7QDM-WqsEj4qVOGD6Kura5I6E9yqH-7krnwZ0/edit#slide=id.p126)
+ for Scan, Stream Compaction, and Work-Efficient Parallel Scan.
+* GPU Gems 3, Chapter 39 - [Parallel Prefix Sum (Scan) with CUDA](https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_ch39.html).
+ - This online version contains a few small errors (in superscripting, missing braces, bad indentation, etc.)
+ - We maintain a fix for this at [GPU Gem 3 Ch 39 Patch](https://github.com/CIS565-Fall-2017/Project2-Stream-Compaction/blob/master/INSTRUCTION.md#gpu-gem-3-ch-39-patch). If you find more errors in the chapter, welcome to open new pull requests to contribute.
+* If you are still unclear after reading the steps, take a look at the last chapter - [Algorithm Examples](https://github.com/CIS565-Fall-2017/Project2-Stream-Compaction/blob/master/INSTRUCTION.md#algorithm-examples).
+* [Recitation slides](https://docs.google.com/presentation/d/1daOnWHOjMp1sIqMdVsNnvEU1UYynKcEMARc_W6bGnqE/edit?usp=sharing)
+
+Your GPU stream compaction implementation will live inside of the
+`stream_compaction` subproject. In this way, you will be able to easily copy it
+over for use in your GPU path tracer.
+
+
+## Part 0: The Usual
+
+This project (and all other CUDA projects in this course) requires an NVIDIA
+graphics card with CUDA capability. Any card with Compute Capability 2.0
+(`sm_20`) or greater will work. Check your GPU on this
+[compatibility table](https://developer.nvidia.com/cuda-gpus).
+If you do not have a personal machine with these specs, you may use those
+computers in SIG Lab which have supported GPUs.
+
+### Useful existing code
+
+* `stream_compaction/common.h`
+ * `checkCUDAError` macro: checks for CUDA errors and exits if there were any.
+ * `ilog2ceil(x)`: computes the ceiling of log2(x), as an integer.
+* `main.cpp`
+ * Some testing code for your implementations.
+
+**Note 1:** The tests will simply compare against your CPU implementation.
+Do it first!
+
+**Note 2:** The tests default to an array of size 256.
+Test with something larger (10,000? 1,000,000?), too!
+
+
+## Part 1: CPU Scan & Stream Compaction
+
+This stream compaction method will remove `0`s from an array of `int`s.
+
+Do this first, and double check the output! It will be used as the expected
+value for the other tests.
+
+In `stream_compaction/cpu.cu`, implement:
+
+* `StreamCompaction::CPU::scan`: compute an exclusive prefix sum. For performance comparison, this is supposed to be a simple `for` loop. But for better understanding before starting moving to GPU, you can simulate the GPU scan in this function first.
+* `StreamCompaction::CPU::compactWithoutScan`: stream compaction without using
+ the `scan` function.
+* `StreamCompaction::CPU::compactWithScan`: stream compaction using the `scan`
+ function. Map the input array to an array of 0s and 1s, scan it, and use
+ scatter to produce the output. You will need a **CPU** scatter implementation
+ for this (see slides or GPU Gems chapter for an explanation).
+
+These implementations should only be a few lines long.
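
These three functions can be sketched in plain Python before writing the required C++ (illustrative only; the function names here are hypothetical stand-ins for the `StreamCompaction::CPU` routines):

```python
def scan_exclusive(idata):
    # Exclusive prefix sum: out[i] = sum of idata[0..i-1].
    out, running = [], 0
    for x in idata:
        out.append(running)
        running += x
    return out

def compact_without_scan(idata):
    # Copy non-zero elements directly, in order.
    return [x for x in idata if x != 0]

def compact_with_scan(idata):
    # map -> scan -> scatter, the same pipeline the GPU version uses.
    if not idata:
        return []
    bools = [1 if x != 0 else 0 for x in idata]
    indices = scan_exclusive(bools)
    odata = [0] * (indices[-1] + bools[-1])
    for i, x in enumerate(idata):
        if bools[i]:
            odata[indices[i]] = x
    return odata
```

Both compaction paths must agree on their output; the scan-based one is the version that maps directly onto the GPU.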
+
+
+## Part 2: Naive GPU Scan Algorithm
+
+In `stream_compaction/naive.cu`, implement `StreamCompaction::Naive::scan`
+
+This uses the "Naive" algorithm from GPU Gems 3, Section 39.2.1. Example 39-1 uses shared memory. This is not required in this project. You can simply use global memory. As a result of this, you will have to do `ilog2ceil(n)` separate kernel invocations.
+
+Since your individual GPU threads are not guaranteed to run simultaneously, you
+can't generally operate on an array in-place on the GPU; it will cause race
+conditions. Instead, create two device arrays. Swap them at each iteration:
+read from A and write to B, read from B and write to A, and so on.
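
The ping-pong scheme can be sketched as a CPU simulation (Python, illustrative only; each iteration of the inner loop stands in for one GPU thread, and the buffer swap replaces the in-place update that would race on real hardware):

```python
import math

def naive_scan(idata):
    # Naive GPU scan: ilog2ceil(n) passes over the whole array.
    n = len(idata)
    if n == 0:
        return []
    a = list(idata)            # read buffer
    b = [0] * n                # write buffer
    for d in range(int(math.ceil(math.log2(n))) if n > 1 else 0):
        offset = 1 << d
        for i in range(n):     # one GPU thread per element
            b[i] = a[i] + a[i - offset] if i >= offset else a[i]
        a, b = b, a            # swap buffers between passes
    # This produces an inclusive scan; shift right to make it exclusive.
    return [0] + a[:-1]
```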
+
+Beware of errors in Example 39-1 in the chapter; both the pseudocode and the CUDA
+code in the online version of Chapter 39 are known to have a few small errors
+(in superscripting, missing braces, bad indentation, etc.)
+
+Be sure to test non-power-of-two-sized arrays.
+
+## Part 3: Work-Efficient GPU Scan & Stream Compaction
+
+### 3.1. Scan
+
+In `stream_compaction/efficient.cu`, implement
+`StreamCompaction::Efficient::scan`
+
+Most of the text in Part 2 applies.
+
+* This uses the "Work-Efficient" algorithm from GPU Gems 3, Section 39.2.2.
+* This can be done in place - it doesn't suffer from the race conditions of
+ the naive method, since there won't be a case where one thread writes to
+ and another thread reads from the same location in the array.
+* Beware of errors in Example 39-2.
+* Test non-power-of-two-sized arrays.
+
+Since the work-efficient scan operates on a binary tree structure, it works
+best with arrays with power-of-two length. Make sure your implementation works
+on non-power-of-two sized arrays (see `ilog2ceil`). This requires extra memory
+- your intermediate array sizes will need to be rounded to the next power of
+two.
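
A sequential sketch of the padded up-sweep/down-sweep (Python, illustrative only; each `for i` loop stands in for one kernel launch at that tree level):

```python
def work_efficient_scan(idata):
    # Pad to the next power of two, then scan in place.
    n = len(idata)
    m = 1
    while m < max(n, 1):
        m <<= 1
    data = list(idata) + [0] * (m - n)

    # Up-sweep (reduce): build partial sums from the leaves to the root.
    d = 1
    while d < m:
        for i in range(0, m, 2 * d):
            data[i + 2 * d - 1] += data[i + d - 1]
        d *= 2

    # Down-sweep: zero the root, then propagate prefix sums back down.
    data[m - 1] = 0
    d = m // 2
    while d >= 1:
        for i in range(0, m, 2 * d):
            t = data[i + d - 1]
            data[i + d - 1] = data[i + 2 * d - 1]
            data[i + 2 * d - 1] += t
        d //= 2
    return data[:n]
```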
+
+### 3.2. Stream Compaction
+
+This stream compaction method will remove `0`s from an array of `int`s.
+
+In `stream_compaction/efficient.cu`, implement
+`StreamCompaction::Efficient::compact`
+
+For compaction, you will also need to implement the scatter algorithm presented
+in the slides and the GPU Gems chapter.
+
+In `stream_compaction/common.cu`, implement these for use in `compact`:
+
+* `StreamCompaction::Common::kernMapToBoolean`
+* `StreamCompaction::Common::kernScatter`
+
+
+## Part 4: Using Thrust's Implementation
+
+In `stream_compaction/thrust.cu`, implement:
+
+* `StreamCompaction::Thrust::scan`
+
+This should be a very short function which wraps a call to the Thrust library
+function `thrust::exclusive_scan(first, last, result)`.
+
+To measure timing, be sure to exclude memory operations by passing
+`exclusive_scan` a `thrust::device_vector` (which is already allocated on the
+GPU). You can create a `thrust::device_vector` by creating a
+`thrust::host_vector` from the given pointer, then casting it.
+
+For thrust stream compaction, take a look at [thrust::remove_if](https://thrust.github.io/doc/group__stream__compaction.html). It's not required to analyze `thrust::remove_if` but you're encouraged to do so.
+
+## Part 5: Why is My GPU Approach So Slow? (Extra Credit) (+5)
+
+If you implement your efficient scan version following the slides closely, there's a good chance
+that you are getting an "efficient" GPU scan that is actually not that efficient -- it may even be slower than the CPU approach.
+
+Though that is totally acceptable for this assignment,
+in addition to explaining the reason for this phenomenon, you are encouraged to try to upgrade your work-efficient GPU scan.
+
+Thinking about these may lead you to an aha moment:
+- What is the occupancy at a deeper level in the upper/down sweep? Are most threads actually working?
+- Are you always launching the same number of blocks throughout each level of the upper/down sweep?
+- If some threads are being lazy, can we do an early termination on them?
+- How can I compact the threads? What should I modify to keep the remaining threads still working correctly?
+
+Keep in mind that this optimization won't require you to change much of your code structure.
+It's all about some index calculation hacks.
+
+If you don't run into the slower GPU approach,
+congratulations! You are way ahead and earn this extra credit automatically.
+
+
+## Part 6: Radix Sort (Extra Credit) (+10)
+
+Add an additional module to the `stream_compaction` subproject. Implement radix
+sort using one of your scan implementations. Add tests to check its correctness.
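
One way to structure it is a least-significant-digit sort built on the scan-based `split` primitive: partition by each bit in turn, computing destinations with an exclusive scan of the "bit is 0" flags. A sketch (Python, illustrative only; the GPU version would replace the loops with kernels and the scan with your `Efficient::scan`):

```python
def split(data, bit):
    # Stable partition on one bit: zeros first, ones after, order preserved.
    flags = [1 - ((x >> bit) & 1) for x in data]   # 1 where the bit is 0
    indices, running = [], 0
    for f in flags:                                # exclusive scan of flags
        indices.append(running)
        running += f
    total_zeros = running
    out = [0] * len(data)
    for i, x in enumerate(data):
        if flags[i]:
            out[indices[i]] = x                    # zeros: scan gives the slot
        else:
            out[total_zeros + i - indices[i]] = x  # ones: after all the zeros
    return out

def radix_sort(data, num_bits):
    for bit in range(num_bits):
        data = split(data, bit)
    return data
```

Because each `split` is stable, sorting bit-by-bit from least to most significant yields a fully sorted array.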
+
+## Part 7: GPU Scan Using Shared Memory && Hardware Optimization(Extra Credit) (+10)
+
+Implement [GPU Gem Ch 39](https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_ch39.html) Example 39.1, 39.2.
+
+Notice that the size of the shared memory is dynamic and related to the block size. Since each SM has limited shared memory, the block size you set will affect the occupancy of the blocks in each SM. For example, let's say your graphics card has N Kb of shared memory per SM, if you use the maximum of N Kb shared memory per block, then you would have a max occupancy of 1 block per SM. This might not be the best performance.
+
+Besides, we can optimize efficiency by changing our memory access pattern to avoid bank conflicts. See [GPU Gem Ch 39](https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_ch39.html) Section 39.2.3. This hasn't been covered in the course, but we encourage you to challenge yourself.
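
One common bank-conflict fix from Section 39.2.3 is to pad shared-memory indices so that the strided accesses of the tree traversal land in different banks. A sketch of the index transform (assuming 32 banks, which is typical of current NVIDIA GPUs):

```python
NUM_BANKS = 32
LOG_NUM_BANKS = 5

def conflict_free_index(i):
    # Add one element of padding for every NUM_BANKS elements, so indices
    # that would otherwise map to the same bank (i % NUM_BANKS) spread out.
    return i + (i >> LOG_NUM_BANKS)
```

In the kernel, every shared-memory access `temp[ai]` becomes `temp[conflict_free_index(ai)]`, at the cost of a slightly larger shared-memory allocation.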
+
+## Write-up
+
+1. Update all of the TODOs at the top of your `README.md`.
+2. Add a description of this project including a list of its features.
+3. Add your performance analysis (see below).
+
+All extra credit features must be documented in your `README.md`, explaining their
+value (with performance comparisons, if applicable!) and showing an example of how
+they work. For radix sort, show how it is called and an example of its output.
+
+Always profile with Release mode builds and run without debugging.
+
+### Questions
+
+* Roughly optimize the block sizes of each of your implementations for minimal
+ run time on your GPU.
+ * (You shouldn't compare unoptimized implementations to each other!)
+
+* Compare all of these GPU Scan implementations (Naive, Work-Efficient, and
+ Thrust) to the serial CPU version of Scan. Plot a graph of the comparison
+ (with array size on the independent axis).
+ * We wrapped up both CPU and GPU timing functions as a performance timer class for you to conveniently measure the time cost.
+ * We use `std::chrono` to provide CPU high-precision timing and CUDA event to measure the CUDA performance.
+ * For CPU, put your CPU code between `timer().startCpuTimer()` and `timer().endCpuTimer()`.
+ * For GPU, put your CUDA code between `timer().startGpuTimer()` and `timer().endGpuTimer()`. Be sure **not** to include any *initial/final* memory operations (`cudaMalloc`, `cudaMemcpy`) in your performance measurements, for comparability.
+ * Don't mix up `CpuTimer` and `GpuTimer`.
+ * To guess at what might be happening inside the Thrust implementation (e.g.
+ allocation, memory copy), take a look at the Nsight timeline for its
+ execution. Your analysis here doesn't have to be detailed, since you aren't
+ even looking at the code for the implementation.
+
+* Write a brief explanation of the phenomena you see here.
+ * Can you find the performance bottlenecks? Is it memory I/O? Computation? Is
+ it different for each implementation?
+
+* Paste the output of the test program into a triple-backtick block in your
+ README.
+ * If you add your own tests (e.g. for radix sort or to test additional corner
+ cases), be sure to mention it explicitly.
+
+These questions should help guide you in performance analysis on future
+assignments, as well.
+
+## GPU Gem 3 Ch 39 Patch
+
+* Example 1
+
+
+* Example 2
+
+
+* Figure-39-4
+
+
+* Figure-39-2. This image shows a naive inclusive scan. We should convert this to an exclusive one for compaction.
+
+
+## Algorithm Examples
+
+* scan:
+ - goal: produce a prefix sum array of a given array (we only care about exclusive scan here)
+ - input
+ - [1 5 0 1 2 0 3]
+ - output
+ - [0 1 6 6 7 9 9]
+* compact:
+ - goal: closely and neatly pack the elements != 0
+ - input
+ - [1 5 0 1 2 0 3]
+ - output
+ - [1 5 1 2 3]
+* compactWithoutScan (CPU)
+ - an implementation of compact. So the goal, input and output should all be the same as compact
+ - Simply loop through the input array while maintaining a pointer that indicates the address at which to put the next non-zero element
+* compactWithScan (CPU/GPU)
+ - an implementation of compact. So the goal, input and output should all be the same as compact
+ - 3 steps
+ - map
+ + goal: map our original data array (integer, Light Ray, etc) to a bool array
+ + input
+ - [1 5 0 1 2 0 3]
+ + output
+ - [1 1 0 1 1 0 1]
+ - scan
+ + take the output of last step as input
+ + input
+ - [1 1 0 1 1 0 1]
+ + output
+ - [0 1 2 2 3 4 4]
+ - scatter
+ + preserve non-zero elements and compact them into a new array
+ + input:
+ + original array
+ - [1 5 0 1 2 0 3]
+ + mapped array
+ - [1 1 0 1 1 0 1]
+ + scanned array
+ - [0 1 2 2 3 4 4]
+ + output:
+ - [1 5 1 2 3]
+ + This can be done in parallel on GPU
+ + You can try multi-threading on CPU if you want (not required and not our focus)
+ + for each element input[i] in original array
+ - if it's non-zero (given by mapped array)
+ - then put it at output[index], where index = scanned[i]
\ No newline at end of file
diff --git a/stream_compaction/Project2-Stream-Compaction/README.md b/stream_compaction/Project2-Stream-Compaction/README.md
new file mode 100644
index 0000000..b5fe356
--- /dev/null
+++ b/stream_compaction/Project2-Stream-Compaction/README.md
@@ -0,0 +1,149 @@
+## Project 2 Part 1 - CUDA Stream Compaction
+**University of Pennsylvania
+CIS 565: GPU Programming and Architecture**
+
+* Author: Chhavi Sharma ([LinkedIn](https://www.linkedin.com/in/chhavi275/))
+* Tested on: Windows 10, Intel Core(R) Core(TM) i7-6700 CPU @ 3.40GHz 16GB,
+ NVIDIA Quadro P1000 4GB (MOORE100B-06)
+
+### Index
+
+- [Introduction](https://github.com/chhavisharma/Project2-Number-Algorithms/blob/master/Project2-Stream-Compaction/README.md#introduction)
+- [Algorithms](https://github.com/chhavisharma/Project2-Number-Algorithms/blob/master/Project2-Stream-Compaction/README.md#algorithms)
+- [Questions and Performance Analysis](https://github.com/chhavisharma/Project2-Number-Algorithms/blob/master/Project2-Stream-Compaction/README.md#questions-and-performance-analysis)
+
+
+### Introduction
+
+In this project, we implement GPU-based stream compaction in CUDA. To aid stream compaction, we also implement several versions of the *Scan* (*Prefix Sum*) algorithm: a CPU version, a naive GPU version, and a work-efficient GPU version.
+
+Stream compaction, also known as stream filtering or selection, produces a smaller output array containing only the wanted elements from the input array, based on some criterion, while preserving order. For our implementation, we remove `0`s from an array of `int`s.
+
+
+### Algorithms
+
+#### 1: CPU Scan & Stream Compaction
+
+ We implement stream compaction in two ways:
+
+ - CPU based stream compaction:
+ - Loop over the input data array
+ - Copy non-zero elements to output array
+ - count copies to track size
+ ```
+ compactWithoutScan(N, Odata, Idata)
+   size = 0
+   for i in 0..N-1
+     if (Idata[i] != 0)
+       Odata[size] = Idata[i]
+       size += 1
+   return size
+ ```
+
+ - CPU based stream compaction with CPU based scan:
+ - Compute an *Indicator Array* of the input data size that is 1 for non-zero elements, and 0 otherwise.
+ - Compute a *Scan* over the indicator array. This gives us the write indices for the valid elements in the output array, as well as the total number of valid elements.
+ - *Scatter* the data: read from the input array where the indicator array is 1, and write to the output array at the index given by the scan array. That is:
+ ```
+ - map
+ goal: map our original data array (integer, Light Ray, etc) to a bool array
+ input [1 5 0 1 2 0 3]
+ output [1 1 0 1 1 0 1]
+ - scan
+ take the output of last step as input
+ input [1 1 0 1 1 0 1]
+ output [0 1 2 2 3 4 4]
+ - scatter
+ preserve non-zero elements and compact them into a new array
+ original array [1 5 0 1 2 0 3]
+ mapped array [1 1 0 1 1 0 1]
+ scanned array [0 1 2 2 3 4 4]
+ output [1 5 1 2 3]
+ ```
+
+- Scan: The goal is to produce a prefix sum of the input array.
+ ```
+ input [1 1 0 1 1 0 1]
+ output [0 1 2 2 3 4 4]
+ ```
+
+
+#### 2: Naive GPU Scan
+We can naively parallelize the scan algorithm on the GPU to reduce the loop to ```log2(n)``` iterations. At the first iteration, n-1 threads each add a pair of values and store the result in a second array; as iterations progress, the number of additions per pass shrinks toward n/2. Thus this scan has a step complexity of ```O(log2(n))```, whereas the sequential CPU scan runs in ```O(n)``` steps. The total number of additions, however, increases to ```O(n*log2(n))```.
+
+
+
+
+#### 3: Work-Efficient GPU Scan
+
+We can further parallelize the scan algorithm to bring the number of addition operations down to ```O(n)``` and make it *work-efficient*. This is done by treating the array as a balanced binary tree and performing an up-sweep followed by a down-sweep. During the up-sweep, we start from the tree's leaf nodes and compute partial sums up to the root. These operations are in place.
+
+
+
+Finally, in the down-sweep, starting from the root node, we zero the root and push the partial sums back down the tree to obtain the exclusive prefix sum.
+
+
+
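
The work argument above can be checked with a small counting sketch (Python, illustrative): the naive scan performs `n - 2^d` additions on pass `d`, while the up-sweep and down-sweep together perform `2(n - 1)` additions on a power-of-two array.

```python
import math

def count_naive_adds(n):
    # One addition per thread with index >= 2^d, on each of ilog2ceil(n) passes.
    passes = int(math.ceil(math.log2(n)))
    return sum(n - (1 << d) for d in range(passes))

def count_efficient_adds(n):
    # Up-sweep: n/2 + n/4 + ... + 1 = n - 1 adds; down-sweep: the same again.
    return 2 * (n - 1)
```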
+#### 4: Work-Efficient Stream Compaction
+Work-efficient stream compaction is simply the stream compaction algorithm explained above, using the work-efficient scan.
+ ```
+ compactWithScan(n, odata, idata)
+ Compute indicator array (in parallel)
+ Compute scan (work-efficiently)
+ Scatter
+ ```
+#### 5: Using Thrust's Implementation
+We also experimented with Thrust's `thrust::exclusive_scan(first, last, result)` function to compute the scan and compare performance.
+
+
+### Questions and Performance Analysis
+
+ * **BlockSize Optimization for each Implementation**
+ We compare the runtime of the naive GPU scan and the work-efficient GPU scan across different numbers of threads per block to pick the optimal configuration for further tests.
+
+ *Block Size v/s Runtime*
+ 
+
+ From the above plot, we find that a block size of 128 was optimal for both the naive and work-efficient scans.
+
+ * **Compare all of these GPU Scan implementations (Naive, Work-Efficient, and Thrust) to the serial CPU version of Scan. Plot a graph of the comparison (with array size on the independent axis).**
+
+ The following plots show the runtime comparison of all the above scan implementations.
+
+ SCAN with increasing data size: Lower Runtime -> Faster Algorithm
+
+ 
+ 
+
+
+ SCAN with increasing data size and nonPowersOf2: Lower Runtime -> Faster Algorithm
+
+ 
+ 
+
+ * **Write a brief explanation of the phenomena you see here. Can you find the performance bottlenecks? Is it memory I/O? Computation? Is it different for each implementation?**
+
+ **Scan Comparison**
+ Varying the data size from `2^10` to `2^28`, we observe the following:
+ - The CPU implementation of scan is relatively faster than all the GPU implementations when the data size is small. This is because the compute time is too small to hide kernel overheads such as thread allocation and multiple kernel launches.
+ - As the data size grows, the serial CPU scan grows in time, and the naive and work-efficient GPU scans become faster.
+ - For very large data sizes, the CPU scan again runs faster than the naive GPU scan, which could be because the `n*log2(n)` additions performed by the naive GPU scan overpower the gains from parallelization.
+ - The work-efficient scan performs best on large data sizes, compared to the naive and CPU scans, because it performs only `O(n)` additions across `O(log n)` sweep levels.
+ - Thrust's scan performs far better than any other implementation for large data sizes. Thrust may be further optimizing memory I/O and kernel-launch overheads compared to our work-efficient scan, which launches a kernel per level in a loop for the up-sweep and down-sweep.
+
+
+ **Compaction Comparison**
+
+ 
+
+ 
+
+ Varying the data size from `2^10` to `2^28`, we observe the following:
+ - Interestingly, stream compaction without scan on the CPU outperforms stream compaction with scan in computation time. The drawback of compaction without scan is that it is not in place and uses double the memory of the input size.
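For reference, the without-scan variant is a single serial pass (a host-side sketch with a hypothetical name; the extra output buffer is the doubled memory mentioned above):

```cpp
#include <cassert>
#include <vector>

// CPU compaction without scan: one serial pass that copies the kept
// (non-zero) elements into a second buffer.
std::vector<int> compactWithoutScan(const std::vector<int>& idata) {
    std::vector<int> odata;
    odata.reserve(idata.size());  // worst case: every element is kept
    for (int x : idata)
        if (x != 0) odata.push_back(x);
    return odata;
}
```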
+
+
+ * **Output of the test program**
+
+ 
diff --git a/stream_compaction/Project2-Stream-Compaction/cmake/CMakeParseArguments.cmake b/stream_compaction/Project2-Stream-Compaction/cmake/CMakeParseArguments.cmake
new file mode 100644
index 0000000..8553f38
--- /dev/null
+++ b/stream_compaction/Project2-Stream-Compaction/cmake/CMakeParseArguments.cmake
@@ -0,0 +1,161 @@
+#.rst:
+# CMakeParseArguments
+# -------------------
+#
+#
+#
+# CMAKE_PARSE_ARGUMENTS(
+# args...)
+#
+# CMAKE_PARSE_ARGUMENTS() is intended to be used in macros or functions
+# for parsing the arguments given to that macro or function. It
+# processes the arguments and defines a set of variables which hold the
+# values of the respective options.
+#
+# The argument contains all options for the respective macro,
+# i.e. keywords which can be used when calling the macro without any
+# value following, like e.g. the OPTIONAL keyword of the install()
+# command.
+#
+# The argument contains all keywords for this macro
+# which are followed by one value, like e.g. DESTINATION keyword of the
+# install() command.
+#
+# The argument contains all keywords for this
+# macro which can be followed by more than one value, like e.g. the
+# TARGETS or FILES keywords of the install() command.
+#
+# When done, CMAKE_PARSE_ARGUMENTS() will have defined for each of the
+# keywords listed in , and
+# a variable composed of the given
+# followed by "_" and the name of the respective keyword. These
+# variables will then hold the respective value from the argument list.
+# For the keywords this will be TRUE or FALSE.
+#
+# All remaining arguments are collected in a variable
+# _UNPARSED_ARGUMENTS, this can be checked afterwards to see
+# whether your macro was called with unrecognized parameters.
+#
+# As an example here a my_install() macro, which takes similar arguments
+# as the real install() command:
+#
+# ::
+#
+# function(MY_INSTALL)
+# set(options OPTIONAL FAST)
+# set(oneValueArgs DESTINATION RENAME)
+# set(multiValueArgs TARGETS CONFIGURATIONS)
+# cmake_parse_arguments(MY_INSTALL "${options}" "${oneValueArgs}"
+# "${multiValueArgs}" ${ARGN} )
+# ...
+#
+#
+#
+# Assume my_install() has been called like this:
+#
+# ::
+#
+# my_install(TARGETS foo bar DESTINATION bin OPTIONAL blub)
+#
+#
+#
+# After the cmake_parse_arguments() call the macro will have set the
+# following variables:
+#
+# ::
+#
+# MY_INSTALL_OPTIONAL = TRUE
+# MY_INSTALL_FAST = FALSE (this option was not used when calling my_install()
+# MY_INSTALL_DESTINATION = "bin"
+# MY_INSTALL_RENAME = "" (was not used)
+# MY_INSTALL_TARGETS = "foo;bar"
+# MY_INSTALL_CONFIGURATIONS = "" (was not used)
+# MY_INSTALL_UNPARSED_ARGUMENTS = "blub" (no value expected after "OPTIONAL"
+#
+#
+#
+# You can then continue and process these variables.
+#
+# Keywords terminate lists of values, e.g. if directly after a
+# one_value_keyword another recognized keyword follows, this is
+# interpreted as the beginning of the new option. E.g.
+# my_install(TARGETS foo DESTINATION OPTIONAL) would result in
+# MY_INSTALL_DESTINATION set to "OPTIONAL", but MY_INSTALL_DESTINATION
+# would be empty and MY_INSTALL_OPTIONAL would be set to TRUE therefor.
+
+#=============================================================================
+# Copyright 2010 Alexander Neundorf
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+# License text for the above reference.)
+
+
+if(__CMAKE_PARSE_ARGUMENTS_INCLUDED)
+ return()
+endif()
+set(__CMAKE_PARSE_ARGUMENTS_INCLUDED TRUE)
+
+
+function(CMAKE_PARSE_ARGUMENTS prefix _optionNames _singleArgNames _multiArgNames)
+ # first set all result variables to empty/FALSE
+ foreach(arg_name ${_singleArgNames} ${_multiArgNames})
+ set(${prefix}_${arg_name})
+ endforeach()
+
+ foreach(option ${_optionNames})
+ set(${prefix}_${option} FALSE)
+ endforeach()
+
+ set(${prefix}_UNPARSED_ARGUMENTS)
+
+ set(insideValues FALSE)
+ set(currentArgName)
+
+ # now iterate over all arguments and fill the result variables
+ foreach(currentArg ${ARGN})
+ list(FIND _optionNames "${currentArg}" optionIndex) # ... then this marks the end of the arguments belonging to this keyword
+ list(FIND _singleArgNames "${currentArg}" singleArgIndex) # ... then this marks the end of the arguments belonging to this keyword
+ list(FIND _multiArgNames "${currentArg}" multiArgIndex) # ... then this marks the end of the arguments belonging to this keyword
+
+ if(${optionIndex} EQUAL -1 AND ${singleArgIndex} EQUAL -1 AND ${multiArgIndex} EQUAL -1)
+ if(insideValues)
+ if("${insideValues}" STREQUAL "SINGLE")
+ set(${prefix}_${currentArgName} ${currentArg})
+ set(insideValues FALSE)
+ elseif("${insideValues}" STREQUAL "MULTI")
+ list(APPEND ${prefix}_${currentArgName} ${currentArg})
+ endif()
+ else()
+ list(APPEND ${prefix}_UNPARSED_ARGUMENTS ${currentArg})
+ endif()
+ else()
+ if(NOT ${optionIndex} EQUAL -1)
+ set(${prefix}_${currentArg} TRUE)
+ set(insideValues FALSE)
+ elseif(NOT ${singleArgIndex} EQUAL -1)
+ set(currentArgName ${currentArg})
+ set(${prefix}_${currentArgName})
+ set(insideValues "SINGLE")
+ elseif(NOT ${multiArgIndex} EQUAL -1)
+ set(currentArgName ${currentArg})
+ set(${prefix}_${currentArgName})
+ set(insideValues "MULTI")
+ endif()
+ endif()
+
+ endforeach()
+
+ # propagate the result variables to the caller:
+ foreach(arg_name ${_singleArgNames} ${_multiArgNames} ${_optionNames})
+ set(${prefix}_${arg_name} ${${prefix}_${arg_name}} PARENT_SCOPE)
+ endforeach()
+ set(${prefix}_UNPARSED_ARGUMENTS ${${prefix}_UNPARSED_ARGUMENTS} PARENT_SCOPE)
+
+endfunction()
diff --git a/stream_compaction/Project2-Stream-Compaction/cmake/FindCUDA.cmake b/stream_compaction/Project2-Stream-Compaction/cmake/FindCUDA.cmake
new file mode 100644
index 0000000..f4b0783
--- /dev/null
+++ b/stream_compaction/Project2-Stream-Compaction/cmake/FindCUDA.cmake
@@ -0,0 +1,1806 @@
+#.rst:
+# FindCUDA
+# --------
+#
+# Tools for building CUDA C files: libraries and build dependencies.
+#
+# This script locates the NVIDIA CUDA C tools. It should work on linux,
+# windows, and mac and should be reasonably up to date with CUDA C
+# releases.
+#
+# This script makes use of the standard find_package arguments of
+# , REQUIRED and QUIET. CUDA_FOUND will report if an
+# acceptable version of CUDA was found.
+#
+# The script will prompt the user to specify CUDA_TOOLKIT_ROOT_DIR if
+# the prefix cannot be determined by the location of nvcc in the system
+# path and REQUIRED is specified to find_package(). To use a different
+# installed version of the toolkit set the environment variable
+# CUDA_BIN_PATH before running cmake (e.g.
+# CUDA_BIN_PATH=/usr/local/cuda1.0 instead of the default
+# /usr/local/cuda) or set CUDA_TOOLKIT_ROOT_DIR after configuring. If
+# you change the value of CUDA_TOOLKIT_ROOT_DIR, various components that
+# depend on the path will be relocated.
+#
+# It might be necessary to set CUDA_TOOLKIT_ROOT_DIR manually on certain
+# platforms, or to use a cuda runtime not installed in the default
+# location. In newer versions of the toolkit the cuda library is
+# included with the graphics driver- be sure that the driver version
+# matches what is needed by the cuda runtime version.
+#
+# The following variables affect the behavior of the macros in the
+# script (in alphebetical order). Note that any of these flags can be
+# changed multiple times in the same directory before calling
+# CUDA_ADD_EXECUTABLE, CUDA_ADD_LIBRARY, CUDA_COMPILE, CUDA_COMPILE_PTX,
+# CUDA_COMPILE_FATBIN, CUDA_COMPILE_CUBIN or CUDA_WRAP_SRCS::
+#
+# CUDA_64_BIT_DEVICE_CODE (Default matches host bit size)
+# -- Set to ON to compile for 64 bit device code, OFF for 32 bit device code.
+# Note that making this different from the host code when generating object
+# or C files from CUDA code just won't work, because size_t gets defined by
+# nvcc in the generated source. If you compile to PTX and then load the
+# file yourself, you can mix bit sizes between device and host.
+#
+# CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE (Default ON)
+# -- Set to ON if you want the custom build rule to be attached to the source
+# file in Visual Studio. Turn OFF if you add the same cuda file to multiple
+# targets.
+#
+# This allows the user to build the target from the CUDA file; however, bad
+# things can happen if the CUDA source file is added to multiple targets.
+# When performing parallel builds it is possible for the custom build
+# command to be run more than once and in parallel causing cryptic build
+# errors. VS runs the rules for every source file in the target, and a
+# source can have only one rule no matter how many projects it is added to.
+# When the rule is run from multiple targets race conditions can occur on
+# the generated file. Eventually everything will get built, but if the user
+# is unaware of this behavior, there may be confusion. It would be nice if
+# this script could detect the reuse of source files across multiple targets
+# and turn the option off for the user, but no good solution could be found.
+#
+# CUDA_BUILD_CUBIN (Default OFF)
+# -- Set to ON to enable and extra compilation pass with the -cubin option in
+# Device mode. The output is parsed and register, shared memory usage is
+# printed during build.
+#
+# CUDA_BUILD_EMULATION (Default OFF for device mode)
+# -- Set to ON for Emulation mode. -D_DEVICEEMU is defined for CUDA C files
+# when CUDA_BUILD_EMULATION is TRUE.
+#
+# CUDA_GENERATED_OUTPUT_DIR (Default CMAKE_CURRENT_BINARY_DIR)
+# -- Set to the path you wish to have the generated files placed. If it is
+# blank output files will be placed in CMAKE_CURRENT_BINARY_DIR.
+# Intermediate files will always be placed in
+# CMAKE_CURRENT_BINARY_DIR/CMakeFiles.
+#
+# CUDA_HOST_COMPILATION_CPP (Default ON)
+# -- Set to OFF for C compilation of host code.
+#
+# CUDA_HOST_COMPILER (Default CMAKE_C_COMPILER, $(VCInstallDir)/bin for VS)
+# -- Set the host compiler to be used by nvcc. Ignored if -ccbin or
+# --compiler-bindir is already present in the CUDA_NVCC_FLAGS or
+# CUDA_NVCC_FLAGS_ variables. For Visual Studio targets
+# $(VCInstallDir)/bin is a special value that expands out to the path when
+# the command is run from withing VS.
+#
+# CUDA_NVCC_FLAGS
+# CUDA_NVCC_FLAGS_
+# -- Additional NVCC command line arguments. NOTE: multiple arguments must be
+# semi-colon delimited (e.g. --compiler-options;-Wall)
+#
+# CUDA_PROPAGATE_HOST_FLAGS (Default ON)
+# -- Set to ON to propagate CMAKE_{C,CXX}_FLAGS and their configuration
+# dependent counterparts (e.g. CMAKE_C_FLAGS_DEBUG) automatically to the
+# host compiler through nvcc's -Xcompiler flag. This helps make the
+# generated host code match the rest of the system better. Sometimes
+# certain flags give nvcc problems, and this will help you turn the flag
+# propagation off. This does not affect the flags supplied directly to nvcc
+# via CUDA_NVCC_FLAGS or through the OPTION flags specified through
+# CUDA_ADD_LIBRARY, CUDA_ADD_EXECUTABLE, or CUDA_WRAP_SRCS. Flags used for
+# shared library compilation are not affected by this flag.
+#
+# CUDA_SEPARABLE_COMPILATION (Default OFF)
+# -- If set this will enable separable compilation for all CUDA runtime object
+# files. If used outside of CUDA_ADD_EXECUTABLE and CUDA_ADD_LIBRARY
+# (e.g. calling CUDA_WRAP_SRCS directly),
+# CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME and
+# CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS should be called.
+#
+# CUDA_SOURCE_PROPERTY_FORMAT
+# -- If this source file property is set, it can override the format specified
+# to CUDA_WRAP_SRCS (OBJ, PTX, CUBIN, or FATBIN). If an input source file
+# is not a .cu file, setting this file will cause it to be treated as a .cu
+# file. See documentation for set_source_files_properties on how to set
+# this property.
+#
+# CUDA_USE_STATIC_CUDA_RUNTIME (Default ON)
+# -- When enabled the static version of the CUDA runtime library will be used
+# in CUDA_LIBRARIES. If the version of CUDA configured doesn't support
+# this option, then it will be silently disabled.
+#
+# CUDA_VERBOSE_BUILD (Default OFF)
+# -- Set to ON to see all the commands used when building the CUDA file. When
+# using a Makefile generator the value defaults to VERBOSE (run make
+# VERBOSE=1 to see output), although setting CUDA_VERBOSE_BUILD to ON will
+# always print the output.
+#
+# The script creates the following macros (in alphebetical order)::
+#
+# CUDA_ADD_CUFFT_TO_TARGET( cuda_target )
+# -- Adds the cufft library to the target (can be any target). Handles whether
+# you are in emulation mode or not.
+#
+# CUDA_ADD_CUBLAS_TO_TARGET( cuda_target )
+# -- Adds the cublas library to the target (can be any target). Handles
+# whether you are in emulation mode or not.
+#
+# CUDA_ADD_EXECUTABLE( cuda_target file0 file1 ...
+# [WIN32] [MACOSX_BUNDLE] [EXCLUDE_FROM_ALL] [OPTIONS ...] )
+# -- Creates an executable "cuda_target" which is made up of the files
+# specified. All of the non CUDA C files are compiled using the standard
+# build rules specified by CMAKE and the cuda files are compiled to object
+# files using nvcc and the host compiler. In addition CUDA_INCLUDE_DIRS is
+# added automatically to include_directories(). Some standard CMake target
+# calls can be used on the target after calling this macro
+# (e.g. set_target_properties and target_link_libraries), but setting
+# properties that adjust compilation flags will not affect code compiled by
+# nvcc. Such flags should be modified before calling CUDA_ADD_EXECUTABLE,
+# CUDA_ADD_LIBRARY or CUDA_WRAP_SRCS.
+#
+# CUDA_ADD_LIBRARY( cuda_target file0 file1 ...
+# [STATIC | SHARED | MODULE] [EXCLUDE_FROM_ALL] [OPTIONS ...] )
+# -- Same as CUDA_ADD_EXECUTABLE except that a library is created.
+#
+# CUDA_BUILD_CLEAN_TARGET()
+# -- Creates a convience target that deletes all the dependency files
+# generated. You should make clean after running this target to ensure the
+# dependency files get regenerated.
+#
+# CUDA_COMPILE( generated_files file0 file1 ... [STATIC | SHARED | MODULE]
+# [OPTIONS ...] )
+# -- Returns a list of generated files from the input source files to be used
+# with ADD_LIBRARY or ADD_EXECUTABLE.
+#
+# CUDA_COMPILE_PTX( generated_files file0 file1 ... [OPTIONS ...] )
+# -- Returns a list of PTX files generated from the input source files.
+#
+# CUDA_COMPILE_FATBIN( generated_files file0 file1 ... [OPTIONS ...] )
+# -- Returns a list of FATBIN files generated from the input source files.
+#
+# CUDA_COMPILE_CUBIN( generated_files file0 file1 ... [OPTIONS ...] )
+# -- Returns a list of CUBIN files generated from the input source files.
+#
+# CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME( output_file_var
+# cuda_target
+# object_files )
+# -- Compute the name of the intermediate link file used for separable
+# compilation. This file name is typically passed into
+# CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS. output_file_var is produced
+# based on cuda_target the list of objects files that need separable
+# compilation as specified by object_files. If the object_files list is
+# empty, then output_file_var will be empty. This function is called
+# automatically for CUDA_ADD_LIBRARY and CUDA_ADD_EXECUTABLE. Note that
+# this is a function and not a macro.
+#
+# CUDA_INCLUDE_DIRECTORIES( path0 path1 ... )
+# -- Sets the directories that should be passed to nvcc
+# (e.g. nvcc -Ipath0 -Ipath1 ... ). These paths usually contain other .cu
+# files.
+#
+#
+#
+# CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS( output_file_var cuda_target
+# nvcc_flags object_files)
+#
+# -- Generates the link object required by separable compilation from the given
+# object files. This is called automatically for CUDA_ADD_EXECUTABLE and
+# CUDA_ADD_LIBRARY, but can be called manually when using CUDA_WRAP_SRCS
+# directly. When called from CUDA_ADD_LIBRARY or CUDA_ADD_EXECUTABLE the
+# nvcc_flags passed in are the same as the flags passed in via the OPTIONS
+# argument. The only nvcc flag added automatically is the bitness flag as
+# specified by CUDA_64_BIT_DEVICE_CODE. Note that this is a function
+# instead of a macro.
+#
+# CUDA_WRAP_SRCS ( cuda_target format generated_files file0 file1 ...
+# [STATIC | SHARED | MODULE] [OPTIONS ...] )
+# -- This is where all the magic happens. CUDA_ADD_EXECUTABLE,
+# CUDA_ADD_LIBRARY, CUDA_COMPILE, and CUDA_COMPILE_PTX all call this
+# function under the hood.
+#
+# Given the list of files (file0 file1 ... fileN) this macro generates
+# custom commands that generate either PTX or linkable objects (use "PTX" or
+# "OBJ" for the format argument to switch). Files that don't end with .cu
+# or have the HEADER_FILE_ONLY property are ignored.
+#
+# The arguments passed in after OPTIONS are extra command line options to
+# give to nvcc. You can also specify per configuration options by
+# specifying the name of the configuration followed by the options. General
+# options must preceed configuration specific options. Not all
+# configurations need to be specified, only the ones provided will be used.
+#
+# OPTIONS -DFLAG=2 "-DFLAG_OTHER=space in flag"
+# DEBUG -g
+# RELEASE --use_fast_math
+# RELWITHDEBINFO --use_fast_math;-g
+# MINSIZEREL --use_fast_math
+#
+# For certain configurations (namely VS generating object files with
+# CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE set to ON), no generated file will
+# be produced for the given cuda file. This is because when you add the
+# cuda file to Visual Studio it knows that this file produces an object file
+# and will link in the resulting object file automatically.
+#
+# This script will also generate a separate cmake script that is used at
+# build time to invoke nvcc. This is for several reasons.
+#
+# 1. nvcc can return negative numbers as return values which confuses
+# Visual Studio into thinking that the command succeeded. The script now
+# checks the error codes and produces errors when there was a problem.
+#
+# 2. nvcc has been known to not delete incomplete results when it
+# encounters problems. This confuses build systems into thinking the
+# target was generated when in fact an unusable file exists. The script
+# now deletes the output files if there was an error.
+#
+# 3. By putting all the options that affect the build into a file and then
+# make the build rule dependent on the file, the output files will be
+# regenerated when the options change.
+#
+# This script also looks at optional arguments STATIC, SHARED, or MODULE to
+# determine when to target the object compilation for a shared library.
+# BUILD_SHARED_LIBS is ignored in CUDA_WRAP_SRCS, but it is respected in
+# CUDA_ADD_LIBRARY. On some systems special flags are added for building
+# objects intended for shared libraries. A preprocessor macro,
+# _EXPORTS is defined when a shared library compilation is
+# detected.
+#
+# Flags passed into add_definitions with -D or /D are passed along to nvcc.
+#
+#
+#
+# The script defines the following variables::
+#
+# CUDA_VERSION_MAJOR -- The major version of cuda as reported by nvcc.
+# CUDA_VERSION_MINOR -- The minor version.
+# CUDA_VERSION
+# CUDA_VERSION_STRING -- CUDA_VERSION_MAJOR.CUDA_VERSION_MINOR
+#
+# CUDA_TOOLKIT_ROOT_DIR -- Path to the CUDA Toolkit (defined if not set).
+# CUDA_SDK_ROOT_DIR -- Path to the CUDA SDK. Use this to find files in the
+# SDK. This script will not directly support finding
+# specific libraries or headers, as that isn't
+# supported by NVIDIA. If you want to change
+# libraries when the path changes see the
+# FindCUDA.cmake script for an example of how to clear
+# these variables. There are also examples of how to
+# use the CUDA_SDK_ROOT_DIR to locate headers or
+# libraries, if you so choose (at your own risk).
+# CUDA_INCLUDE_DIRS -- Include directory for cuda headers. Added automatically
+# for CUDA_ADD_EXECUTABLE and CUDA_ADD_LIBRARY.
+# CUDA_LIBRARIES -- Cuda RT library.
+# CUDA_CUFFT_LIBRARIES -- Device or emulation library for the Cuda FFT
+# implementation (alternative to:
+# CUDA_ADD_CUFFT_TO_TARGET macro)
+# CUDA_CUBLAS_LIBRARIES -- Device or emulation library for the Cuda BLAS
+# implementation (alterative to:
+# CUDA_ADD_CUBLAS_TO_TARGET macro).
+# CUDA_cudart_static_LIBRARY -- Statically linkable cuda runtime library.
+# Only available for CUDA version 5.5+
+# CUDA_cupti_LIBRARY -- CUDA Profiling Tools Interface library.
+# Only available for CUDA version 4.0+.
+# CUDA_curand_LIBRARY -- CUDA Random Number Generation library.
+# Only available for CUDA version 3.2+.
+# CUDA_cusolver_LIBRARY -- CUDA Direct Solver library.
+# Only available for CUDA version 7.0+.
+# CUDA_cusparse_LIBRARY -- CUDA Sparse Matrix library.
+# Only available for CUDA version 3.2+.
+# CUDA_npp_LIBRARY -- NVIDIA Performance Primitives lib.
+# Only available for CUDA version 4.0+.
+# CUDA_nppc_LIBRARY -- NVIDIA Performance Primitives lib (core).
+# Only available for CUDA version 5.5+.
+# CUDA_nppi_LIBRARY -- NVIDIA Performance Primitives lib (image processing).
+# Only available for CUDA version 5.5+.
+# CUDA_npps_LIBRARY -- NVIDIA Performance Primitives lib (signal processing).
+# Only available for CUDA version 5.5+.
+# CUDA_nvcuvenc_LIBRARY -- CUDA Video Encoder library.
+# Only available for CUDA version 3.2+.
+# Windows only.
+# CUDA_nvcuvid_LIBRARY -- CUDA Video Decoder library.
+# Only available for CUDA version 3.2+.
+# Windows only.
+#
+
+# James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+# Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html
+#
+# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved.
+#
+# Copyright (c) 2007-2009
+# Scientific Computing and Imaging Institute, University of Utah
+#
+# This code is licensed under the MIT License. See the FindCUDA.cmake script
+# for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+###############################################################################
+
+# FindCUDA.cmake
+
+# This macro helps us find the location of helper files we will need the full path to
+macro(CUDA_FIND_HELPER_FILE _name _extension)
+ set(_full_name "${_name}.${_extension}")
+ # CMAKE_CURRENT_LIST_FILE contains the full path to the file currently being
+ # processed. Using this variable, we can pull out the current path, and
+ # provide a way to get access to the other files we need local to here.
+ get_filename_component(CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
+ set(CUDA_${_name} "${CMAKE_CURRENT_LIST_DIR}/FindCUDA/${_full_name}")
+ if(NOT EXISTS "${CUDA_${_name}}")
+ set(error_message "${_full_name} not found in ${CMAKE_CURRENT_LIST_DIR}/FindCUDA")
+ if(CUDA_FIND_REQUIRED)
+ message(FATAL_ERROR "${error_message}")
+ else()
+ if(NOT CUDA_FIND_QUIETLY)
+ message(STATUS "${error_message}")
+ endif()
+ endif()
+ endif()
+ # Set this variable as internal, so the user isn't bugged with it.
+ set(CUDA_${_name} ${CUDA_${_name}} CACHE INTERNAL "Location of ${_full_name}" FORCE)
+endmacro()
+
+#####################################################################
+## CUDA_INCLUDE_NVCC_DEPENDENCIES
+##
+
+# So we want to try and include the dependency file if it exists. If
+# it doesn't exist then we need to create an empty one, so we can
+# include it.
+
+# If it does exist, then we need to check to see if all the files it
+# depends on exist. If they don't then we should clear the dependency
+# file and regenerate it later. This covers the case where a header
+# file has disappeared or moved.
+
+macro(CUDA_INCLUDE_NVCC_DEPENDENCIES dependency_file)
+ set(CUDA_NVCC_DEPEND)
+ set(CUDA_NVCC_DEPEND_REGENERATE FALSE)
+
+
+ # Include the dependency file. Create it first if it doesn't exist . The
+ # INCLUDE puts a dependency that will force CMake to rerun and bring in the
+ # new info when it changes. DO NOT REMOVE THIS (as I did and spent a few
+ # hours figuring out why it didn't work.
+ if(NOT EXISTS ${dependency_file})
+ file(WRITE ${dependency_file} "#FindCUDA.cmake generated file. Do not edit.\n")
+ endif()
+ # Always include this file to force CMake to run again next
+ # invocation and rebuild the dependencies.
+ #message("including dependency_file = ${dependency_file}")
+ include(${dependency_file})
+
+ # Now we need to verify the existence of all the included files
+ # here. If they aren't there we need to just blank this variable and
+ # make the file regenerate again.
+# if(DEFINED CUDA_NVCC_DEPEND)
+# message("CUDA_NVCC_DEPEND set")
+# else()
+# message("CUDA_NVCC_DEPEND NOT set")
+# endif()
+ if(CUDA_NVCC_DEPEND)
+ #message("CUDA_NVCC_DEPEND found")
+ foreach(f ${CUDA_NVCC_DEPEND})
+ # message("searching for ${f}")
+ if(NOT EXISTS ${f})
+ #message("file ${f} not found")
+ set(CUDA_NVCC_DEPEND_REGENERATE TRUE)
+ endif()
+ endforeach()
+ else()
+ #message("CUDA_NVCC_DEPEND false")
+ # No dependencies, so regenerate the file.
+ set(CUDA_NVCC_DEPEND_REGENERATE TRUE)
+ endif()
+
+ #message("CUDA_NVCC_DEPEND_REGENERATE = ${CUDA_NVCC_DEPEND_REGENERATE}")
+ # No incoming dependencies, so we need to generate them. Make the
+ # output depend on the dependency file itself, which should cause the
+ # rule to re-run.
+ if(CUDA_NVCC_DEPEND_REGENERATE)
+ set(CUDA_NVCC_DEPEND ${dependency_file})
+ #message("Generating an empty dependency_file: ${dependency_file}")
+ file(WRITE ${dependency_file} "#FindCUDA.cmake generated file. Do not edit.\n")
+ endif()
+
+endmacro()
+
+###############################################################################
+###############################################################################
+# Setup variables' defaults
+###############################################################################
+###############################################################################
+
+# Allow the user to specify if the device code is supposed to be 32 or 64 bit.
+if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+ set(CUDA_64_BIT_DEVICE_CODE_DEFAULT ON)
+else()
+ set(CUDA_64_BIT_DEVICE_CODE_DEFAULT OFF)
+endif()
+option(CUDA_64_BIT_DEVICE_CODE "Compile device code in 64 bit mode" ${CUDA_64_BIT_DEVICE_CODE_DEFAULT})
+
+# Attach the build rule to the source file in VS. This option
+option(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE "Attach the build rule to the CUDA source file. Enable only when the CUDA source file is added to at most one target." ON)
+
+# Prints out extra information about the cuda file during compilation
+option(CUDA_BUILD_CUBIN "Generate and parse .cubin files in Device mode." OFF)
+
+# Set whether we are using emulation or device mode.
+option(CUDA_BUILD_EMULATION "Build in Emulation mode" OFF)
+
+# Where to put the generated output.
+set(CUDA_GENERATED_OUTPUT_DIR "" CACHE PATH "Directory to put all the output files. If blank it will default to the CMAKE_CURRENT_BINARY_DIR")
+
+# Parse HOST_COMPILATION mode.
+option(CUDA_HOST_COMPILATION_CPP "Generated file extension" ON)
+
+# Extra user settable flags
+set(CUDA_NVCC_FLAGS "" CACHE STRING "Semi-colon delimit multiple arguments.")
+
+if(CMAKE_GENERATOR MATCHES "Visual Studio")
+ set(CUDA_HOST_COMPILER "$(VCInstallDir)bin" CACHE FILEPATH "Host side compiler used by NVCC")
+else()
+ if(APPLE
+ AND "${CMAKE_C_COMPILER_ID}" MATCHES "Clang"
+ AND "${CMAKE_C_COMPILER}" MATCHES "/cc$")
+ # Using cc which is symlink to clang may let NVCC think it is GCC and issue
+ # unhandled -dumpspecs option to clang. Also in case neither
+ # CMAKE_C_COMPILER is defined (project does not use C language) nor
+ # CUDA_HOST_COMPILER is specified manually we should skip -ccbin and let
+ # nvcc use its own default C compiler.
+ # Only care about this on APPLE with clang to avoid
+ # following symlinks to things like ccache
+ if(DEFINED CMAKE_C_COMPILER AND NOT DEFINED CUDA_HOST_COMPILER)
+ get_filename_component(c_compiler_realpath "${CMAKE_C_COMPILER}" REALPATH)
+ # if the real path does not end up being clang then
+ # go back to using CMAKE_C_COMPILER
+ if(NOT "${c_compiler_realpath}" MATCHES "/clang$")
+ set(c_compiler_realpath "${CMAKE_C_COMPILER}")
+ endif()
+ else()
+ set(c_compiler_realpath "")
+ endif()
+ set(CUDA_HOST_COMPILER "${c_compiler_realpath}" CACHE FILEPATH "Host side compiler used by NVCC")
+ else()
+ set(CUDA_HOST_COMPILER "${CMAKE_C_COMPILER}"
+ CACHE FILEPATH "Host side compiler used by NVCC")
+ endif()
+endif()
+
+# Propagate the host flags to the host compiler via -Xcompiler
+option(CUDA_PROPAGATE_HOST_FLAGS "Propage C/CXX_FLAGS and friends to the host compiler via -Xcompile" ON)
+
+# Enable CUDA_SEPARABLE_COMPILATION
+option(CUDA_SEPARABLE_COMPILATION "Compile CUDA objects with separable compilation enabled. Requires CUDA 5.0+" OFF)
+
+# Specifies whether the commands used when compiling the .cu file will be printed out.
+option(CUDA_VERBOSE_BUILD "Print out the commands run while compiling the CUDA source file. With the Makefile generator this defaults to VERBOSE variable specified on the command line, but can be forced on with this option." OFF)
+
+mark_as_advanced(
+ CUDA_64_BIT_DEVICE_CODE
+ CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE
+ CUDA_GENERATED_OUTPUT_DIR
+ CUDA_HOST_COMPILATION_CPP
+ CUDA_NVCC_FLAGS
+ CUDA_PROPAGATE_HOST_FLAGS
+ CUDA_BUILD_CUBIN
+ CUDA_BUILD_EMULATION
+ CUDA_VERBOSE_BUILD
+ CUDA_SEPARABLE_COMPILATION
+ )
+
+# Makefile and similar generators don't define CMAKE_CONFIGURATION_TYPES, so we
+# need to add another entry for the CMAKE_BUILD_TYPE. We also need to add the
+# standard set of 4 build types (Debug, MinSizeRel, Release, and RelWithDebInfo)
+# for completeness. We need to run this loop in order to accommodate the
+# addition of extra configuration types. Duplicate entries will be removed by
+# REMOVE_DUPLICATES.
+set(CUDA_configuration_types ${CMAKE_CONFIGURATION_TYPES} ${CMAKE_BUILD_TYPE} Debug MinSizeRel Release RelWithDebInfo)
+list(REMOVE_DUPLICATES CUDA_configuration_types)
+foreach(config ${CUDA_configuration_types})
+ string(TOUPPER ${config} config_upper)
+ set(CUDA_NVCC_FLAGS_${config_upper} "" CACHE STRING "Semi-colon delimit multiple arguments.")
+ mark_as_advanced(CUDA_NVCC_FLAGS_${config_upper})
+endforeach()
+
+###############################################################################
+###############################################################################
+# Locate CUDA, Set Build Type, etc.
+###############################################################################
+###############################################################################
+
+macro(cuda_unset_include_and_libraries)
+ unset(CUDA_TOOLKIT_INCLUDE CACHE)
+ unset(CUDA_CUDART_LIBRARY CACHE)
+ unset(CUDA_CUDA_LIBRARY CACHE)
+ # Make sure you run this before you unset CUDA_VERSION.
+ if(CUDA_VERSION VERSION_EQUAL "3.0")
+ # This only existed in the 3.0 version of the CUDA toolkit
+ unset(CUDA_CUDARTEMU_LIBRARY CACHE)
+ endif()
+ unset(CUDA_cudart_static_LIBRARY CACHE)
+ unset(CUDA_cublas_LIBRARY CACHE)
+ unset(CUDA_cublasemu_LIBRARY CACHE)
+ unset(CUDA_cufft_LIBRARY CACHE)
+ unset(CUDA_cufftemu_LIBRARY CACHE)
+ unset(CUDA_cupti_LIBRARY CACHE)
+ unset(CUDA_curand_LIBRARY CACHE)
+ unset(CUDA_cusolver_LIBRARY CACHE)
+ unset(CUDA_cusparse_LIBRARY CACHE)
+ unset(CUDA_npp_LIBRARY CACHE)
+ unset(CUDA_nppc_LIBRARY CACHE)
+ unset(CUDA_nppi_LIBRARY CACHE)
+ unset(CUDA_npps_LIBRARY CACHE)
+ unset(CUDA_nvcuvenc_LIBRARY CACHE)
+ unset(CUDA_nvcuvid_LIBRARY CACHE)
+
+ unset(CUDA_USE_STATIC_CUDA_RUNTIME CACHE)
+endmacro()
+
+# Check to see if the CUDA_TOOLKIT_ROOT_DIR and CUDA_SDK_ROOT_DIR have changed;
+# if they have, then clear the cache variables so that they will be detected again.
+if(NOT "${CUDA_TOOLKIT_ROOT_DIR}" STREQUAL "${CUDA_TOOLKIT_ROOT_DIR_INTERNAL}")
+ unset(CUDA_TOOLKIT_TARGET_DIR CACHE)
+ unset(CUDA_NVCC_EXECUTABLE CACHE)
+ cuda_unset_include_and_libraries()
+ unset(CUDA_VERSION CACHE)
+endif()
+
+if(NOT "${CUDA_TOOLKIT_TARGET_DIR}" STREQUAL "${CUDA_TOOLKIT_TARGET_DIR_INTERNAL}")
+ cuda_unset_include_and_libraries()
+endif()
+
+if(NOT "${CUDA_SDK_ROOT_DIR}" STREQUAL "${CUDA_SDK_ROOT_DIR_INTERNAL}")
+ # No specific variables to catch. Use this kind of code before calling
+ # find_package(CUDA) to clean up any variables that may depend on this path.
+
+ # unset(MY_SPECIAL_CUDA_SDK_INCLUDE_DIR CACHE)
+ # unset(MY_SPECIAL_CUDA_SDK_LIBRARY CACHE)
+endif()
+
+# Search for the cuda distribution.
+if(NOT CUDA_TOOLKIT_ROOT_DIR)
+
+ # Search in the CUDA_BIN_PATH first.
+ find_path(CUDA_TOOLKIT_ROOT_DIR
+ NAMES nvcc nvcc.exe
+ PATHS
+ ENV CUDA_PATH
+ ENV CUDA_BIN_PATH
+ PATH_SUFFIXES bin bin64
+ DOC "Toolkit location."
+ NO_DEFAULT_PATH
+ )
+ # Now search default paths
+ find_path(CUDA_TOOLKIT_ROOT_DIR
+ NAMES nvcc nvcc.exe
+ PATHS /usr/local/bin
+ /usr/local/cuda/bin
+ DOC "Toolkit location."
+ )
+
+ if (CUDA_TOOLKIT_ROOT_DIR)
+ string(REGEX REPLACE "[/\\\\]?bin[64]*[/\\\\]?$" "" CUDA_TOOLKIT_ROOT_DIR ${CUDA_TOOLKIT_ROOT_DIR})
+ # We need to force this back into the cache.
+ set(CUDA_TOOLKIT_ROOT_DIR ${CUDA_TOOLKIT_ROOT_DIR} CACHE PATH "Toolkit location." FORCE)
+ endif()
+ if (NOT EXISTS ${CUDA_TOOLKIT_ROOT_DIR})
+ if(CUDA_FIND_REQUIRED)
+ message(FATAL_ERROR "Specify CUDA_TOOLKIT_ROOT_DIR")
+ elseif(NOT CUDA_FIND_QUIETLY)
+ message("CUDA_TOOLKIT_ROOT_DIR not found or specified")
+ endif()
+ endif ()
+endif ()
+
+# CUDA_NVCC_EXECUTABLE
+find_program(CUDA_NVCC_EXECUTABLE
+ NAMES nvcc
+ PATHS "${CUDA_TOOLKIT_ROOT_DIR}"
+ ENV CUDA_PATH
+ ENV CUDA_BIN_PATH
+ PATH_SUFFIXES bin bin64
+ NO_DEFAULT_PATH
+ )
+# Search default search paths, after we search our own set of paths.
+find_program(CUDA_NVCC_EXECUTABLE nvcc)
+mark_as_advanced(CUDA_NVCC_EXECUTABLE)
+
+if(CUDA_NVCC_EXECUTABLE AND NOT CUDA_VERSION)
+ # Compute the version.
+ execute_process (COMMAND ${CUDA_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT)
+ string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\1" CUDA_VERSION_MAJOR ${NVCC_OUT})
+ string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\2" CUDA_VERSION_MINOR ${NVCC_OUT})
+ set(CUDA_VERSION "${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}" CACHE STRING "Version of CUDA as computed from nvcc.")
+ mark_as_advanced(CUDA_VERSION)
+else()
+ # Need to set these based off of the cached value
+ string(REGEX REPLACE "([0-9]+)\\.([0-9]+).*" "\\1" CUDA_VERSION_MAJOR "${CUDA_VERSION}")
+ string(REGEX REPLACE "([0-9]+)\\.([0-9]+).*" "\\2" CUDA_VERSION_MINOR "${CUDA_VERSION}")
+endif()
+
+# Always set this convenience variable
+set(CUDA_VERSION_STRING "${CUDA_VERSION}")
+
+# Support for arm cross compilation with CUDA 5.5
+if(CUDA_VERSION VERSION_GREATER "5.0" AND CMAKE_CROSSCOMPILING AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm" AND EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabihf")
+ set(CUDA_TOOLKIT_TARGET_DIR "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabihf" CACHE PATH "Toolkit target location.")
+else()
+ set(CUDA_TOOLKIT_TARGET_DIR "${CUDA_TOOLKIT_ROOT_DIR}" CACHE PATH "Toolkit target location.")
+endif()
+mark_as_advanced(CUDA_TOOLKIT_TARGET_DIR)
+
+# Target CPU architecture
+if(CUDA_VERSION VERSION_GREATER "5.0" AND CMAKE_CROSSCOMPILING AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
+ set(_cuda_target_cpu_arch_initial "ARM")
+else()
+ set(_cuda_target_cpu_arch_initial "")
+endif()
+set(CUDA_TARGET_CPU_ARCH ${_cuda_target_cpu_arch_initial} CACHE STRING "Specify the name of the class of CPU architecture for which the input files must be compiled.")
+mark_as_advanced(CUDA_TARGET_CPU_ARCH)
+
+# CUDA_TOOLKIT_INCLUDE
+find_path(CUDA_TOOLKIT_INCLUDE
+ device_functions.h # Header included in toolkit
+ PATHS "${CUDA_TOOLKIT_TARGET_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}"
+ ENV CUDA_PATH
+ ENV CUDA_INC_PATH
+ PATH_SUFFIXES include
+ NO_DEFAULT_PATH
+ )
+# Search default search paths, after we search our own set of paths.
+find_path(CUDA_TOOLKIT_INCLUDE device_functions.h)
+mark_as_advanced(CUDA_TOOLKIT_INCLUDE)
+
+# Set the user list of include dir to nothing to initialize it.
+set (CUDA_NVCC_INCLUDE_ARGS_USER "")
+set (CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE})
+
+macro(cuda_find_library_local_first_with_path_ext _var _names _doc _path_ext )
+ if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+ # CUDA 3.2+ on Windows moved the library directories, so we need the new
+ # and old paths.
+ set(_cuda_64bit_lib_dir "${_path_ext}lib/x64" "${_path_ext}lib64" "${_path_ext}libx64" )
+ endif()
+ # CUDA 3.2+ on Windows moved the library directories, so we need the new
+ # (lib/Win32) and the old (lib) paths.
+ find_library(${_var}
+ NAMES ${_names}
+ PATHS "${CUDA_TOOLKIT_TARGET_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}"
+ ENV CUDA_PATH
+ ENV CUDA_LIB_PATH
+ PATH_SUFFIXES ${_cuda_64bit_lib_dir} "${_path_ext}lib/Win32" "${_path_ext}lib" "${_path_ext}libWin32"
+ DOC ${_doc}
+ NO_DEFAULT_PATH
+ )
+ # Search default search paths, after we search our own set of paths.
+ find_library(${_var}
+ NAMES ${_names}
+ PATHS "/usr/lib/nvidia-current"
+ DOC ${_doc}
+ )
+endmacro()
+
+macro(cuda_find_library_local_first _var _names _doc)
+ cuda_find_library_local_first_with_path_ext( "${_var}" "${_names}" "${_doc}" "" )
+endmacro()
+
+macro(find_library_local_first _var _names _doc )
+ cuda_find_library_local_first( "${_var}" "${_names}" "${_doc}" "" )
+endmacro()
+
+
+# CUDA_LIBRARIES
+cuda_find_library_local_first(CUDA_CUDART_LIBRARY cudart "\"cudart\" library")
+if(CUDA_VERSION VERSION_EQUAL "3.0")
+ # The cudartemu library only existed for the 3.0 version of CUDA.
+ cuda_find_library_local_first(CUDA_CUDARTEMU_LIBRARY cudartemu "\"cudartemu\" library")
+ mark_as_advanced(
+ CUDA_CUDARTEMU_LIBRARY
+ )
+endif()
+if(NOT CUDA_VERSION VERSION_LESS "5.5")
+ cuda_find_library_local_first(CUDA_cudart_static_LIBRARY cudart_static "static CUDA runtime library")
+ mark_as_advanced(CUDA_cudart_static_LIBRARY)
+endif()
+if(CUDA_cudart_static_LIBRARY)
+ # Set whether to use the static cuda runtime.
+ option(CUDA_USE_STATIC_CUDA_RUNTIME "Use the static version of the CUDA runtime library if available" ON)
+else()
+ option(CUDA_USE_STATIC_CUDA_RUNTIME "Use the static version of the CUDA runtime library if available" OFF)
+endif()
+
+if(CUDA_USE_STATIC_CUDA_RUNTIME)
+ if(UNIX)
+ # Check for the dependent libraries. Here we look for pthreads.
+ if (DEFINED CMAKE_THREAD_PREFER_PTHREAD)
+ set(_cuda_cmake_thread_prefer_pthread ${CMAKE_THREAD_PREFER_PTHREAD})
+ endif()
+ set(CMAKE_THREAD_PREFER_PTHREAD 1)
+
+ # Many of the FindXYZ modules that CMake ships with use try_compile with
+ # int main(){return 0;} as the source file. Unfortunately this causes a
+ # warning with -Wstrict-prototypes, and -Werror causes the try_compile to
+ # fail. We will just temporarily disable other flags when doing the
+ # find_package command here.
+ set(_cuda_cmake_c_flags ${CMAKE_C_FLAGS})
+ set(CMAKE_C_FLAGS "-fPIC")
+ find_package(Threads REQUIRED)
+ set(CMAKE_C_FLAGS ${_cuda_cmake_c_flags})
+
+ if (DEFINED _cuda_cmake_thread_prefer_pthread)
+ set(CMAKE_THREAD_PREFER_PTHREAD ${_cuda_cmake_thread_prefer_pthread})
+ unset(_cuda_cmake_thread_prefer_pthread)
+ else()
+ unset(CMAKE_THREAD_PREFER_PTHREAD)
+ endif()
+ if (NOT APPLE)
+ # librt has things such as clock_gettime, shm_open, and shm_unlink.
+ find_library(CUDA_rt_LIBRARY rt)
+ find_library(CUDA_dl_LIBRARY dl)
+ if (NOT CUDA_rt_LIBRARY)
+ message(WARNING "Expecting to find librt for libcudart_static, but didn't find it.")
+ endif()
+ if (NOT CUDA_dl_LIBRARY)
+ message(WARNING "Expecting to find libdl for libcudart_static, but didn't find it.")
+ endif()
+ endif()
+ endif()
+endif()
+
+# CUPTI library showed up in cuda toolkit 4.0
+if(NOT CUDA_VERSION VERSION_LESS "4.0")
+ cuda_find_library_local_first_with_path_ext(CUDA_cupti_LIBRARY cupti "\"cupti\" library" "extras/CUPTI/")
+ mark_as_advanced(CUDA_cupti_LIBRARY)
+endif()
+
+# Set the CUDA_LIBRARIES variable. This is the set of stuff to link against if you are
+# using the CUDA runtime. For the dynamic version of the runtime, most of the
+# dependencies are brought in, but for the static version there are additional libraries
+# and linker commands needed.
+# Initialize to empty
+set(CUDA_LIBRARIES)
+
+# If we are using emulation mode and we found the cudartemu library then use
+# that one instead of cudart.
+if(CUDA_BUILD_EMULATION AND CUDA_CUDARTEMU_LIBRARY)
+ list(APPEND CUDA_LIBRARIES ${CUDA_CUDARTEMU_LIBRARY})
+elseif(CUDA_USE_STATIC_CUDA_RUNTIME AND CUDA_cudart_static_LIBRARY)
+ list(APPEND CUDA_LIBRARIES ${CUDA_cudart_static_LIBRARY} ${CMAKE_THREAD_LIBS_INIT})
+ if (CUDA_rt_LIBRARY)
+ list(APPEND CUDA_LIBRARIES ${CUDA_rt_LIBRARY})
+ endif()
+ if (CUDA_dl_LIBRARY)
+ list(APPEND CUDA_LIBRARIES ${CUDA_dl_LIBRARY})
+ endif()
+ if(APPLE)
+ # We need to add the default path to the driver (libcuda.dylib) as an rpath, so that
+ # the static cuda runtime can find it at runtime.
+ list(APPEND CUDA_LIBRARIES -Wl,-rpath,/usr/local/cuda/lib)
+ endif()
+else()
+ list(APPEND CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY})
+endif()
+
+# 1.1 toolkit on linux doesn't appear to have a separate library on
+# some platforms.
+cuda_find_library_local_first(CUDA_CUDA_LIBRARY cuda "\"cuda\" library (older versions only).")
+
+mark_as_advanced(
+ CUDA_CUDA_LIBRARY
+ CUDA_CUDART_LIBRARY
+ )
+
+#######################
+# Look for some of the toolkit helper libraries
+macro(FIND_CUDA_HELPER_LIBS _name)
+ cuda_find_library_local_first(CUDA_${_name}_LIBRARY ${_name} "\"${_name}\" library")
+ mark_as_advanced(CUDA_${_name}_LIBRARY)
+endmacro()
+
+#######################
+# Disable emulation for v3.1 onward
+if(CUDA_VERSION VERSION_GREATER "3.0")
+ if(CUDA_BUILD_EMULATION)
+ message(FATAL_ERROR "CUDA_BUILD_EMULATION is not supported in version 3.1 and onwards. You must disable it to proceed. You have version ${CUDA_VERSION}.")
+ endif()
+endif()
+
+# Search for additional CUDA toolkit libraries.
+if(CUDA_VERSION VERSION_LESS "3.1")
+ # Emulation libraries aren't available in version 3.1 onward.
+ find_cuda_helper_libs(cufftemu)
+ find_cuda_helper_libs(cublasemu)
+endif()
+find_cuda_helper_libs(cufft)
+find_cuda_helper_libs(cublas)
+if(NOT CUDA_VERSION VERSION_LESS "3.2")
+ # cusparse showed up in version 3.2
+ find_cuda_helper_libs(cusparse)
+ find_cuda_helper_libs(curand)
+ if (WIN32)
+ find_cuda_helper_libs(nvcuvenc)
+ find_cuda_helper_libs(nvcuvid)
+ endif()
+endif()
+if(CUDA_VERSION VERSION_GREATER "5.0")
+ # In CUDA 5.5, NPP was split into 3 separate libraries.
+ find_cuda_helper_libs(nppc)
+ find_cuda_helper_libs(nppi)
+ find_cuda_helper_libs(npps)
+ set(CUDA_npp_LIBRARY "${CUDA_nppc_LIBRARY};${CUDA_nppi_LIBRARY};${CUDA_npps_LIBRARY}")
+elseif(NOT CUDA_VERSION VERSION_LESS "4.0")
+ find_cuda_helper_libs(npp)
+endif()
+if(NOT CUDA_VERSION VERSION_LESS "7.0")
+ # cusolver showed up in version 7.0
+ find_cuda_helper_libs(cusolver)
+endif()
+
+if (CUDA_BUILD_EMULATION)
+ set(CUDA_CUFFT_LIBRARIES ${CUDA_cufftemu_LIBRARY})
+ set(CUDA_CUBLAS_LIBRARIES ${CUDA_cublasemu_LIBRARY})
+else()
+ set(CUDA_CUFFT_LIBRARIES ${CUDA_cufft_LIBRARY})
+ set(CUDA_CUBLAS_LIBRARIES ${CUDA_cublas_LIBRARY})
+endif()
+
+########################
+# Look for the SDK stuff. As of CUDA 3.0 NVSDKCUDA_ROOT has been replaced with
+# NVSDKCOMPUTE_ROOT with the old CUDA C contents moved into the C subdirectory
+find_path(CUDA_SDK_ROOT_DIR common/inc/cutil.h
+ HINTS
+ "$ENV{NVSDKCOMPUTE_ROOT}/C"
+ ENV NVSDKCUDA_ROOT
+ "[HKEY_LOCAL_MACHINE\\SOFTWARE\\NVIDIA Corporation\\Installed Products\\NVIDIA SDK 10\\Compute;InstallDir]"
+ PATHS
+ "/Developer/GPU\ Computing/C"
+ )
+
+# Keep the CUDA_SDK_ROOT_DIR first in order to be able to override the
+# environment variables.
+set(CUDA_SDK_SEARCH_PATH
+ "${CUDA_SDK_ROOT_DIR}"
+ "${CUDA_TOOLKIT_ROOT_DIR}/local/NVSDK0.2"
+ "${CUDA_TOOLKIT_ROOT_DIR}/NVSDK0.2"
+ "${CUDA_TOOLKIT_ROOT_DIR}/NV_CUDA_SDK"
+ "$ENV{HOME}/NVIDIA_CUDA_SDK"
+ "$ENV{HOME}/NVIDIA_CUDA_SDK_MACOSX"
+ "/Developer/CUDA"
+ )
+
+# Example of how to find an include file from the CUDA_SDK_ROOT_DIR
+
+# find_path(CUDA_CUT_INCLUDE_DIR
+# cutil.h
+# PATHS ${CUDA_SDK_SEARCH_PATH}
+# PATH_SUFFIXES "common/inc"
+# DOC "Location of cutil.h"
+# NO_DEFAULT_PATH
+# )
+# # Now search system paths
+# find_path(CUDA_CUT_INCLUDE_DIR cutil.h DOC "Location of cutil.h")
+
+# mark_as_advanced(CUDA_CUT_INCLUDE_DIR)
+
+
+# Example of how to find a library in the CUDA_SDK_ROOT_DIR
+
+# # cutil library is called cutil64 for 64 bit builds on windows. We don't want
+# # to get these confused, so we are setting the name based on the word size of
+# # the build.
+
+# if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+# set(cuda_cutil_name cutil64)
+# else()
+# set(cuda_cutil_name cutil32)
+# endif()
+
+# find_library(CUDA_CUT_LIBRARY
+# NAMES cutil ${cuda_cutil_name}
+# PATHS ${CUDA_SDK_SEARCH_PATH}
+# # The new version of the sdk shows up in common/lib, but the old one is in lib
+# PATH_SUFFIXES "common/lib" "lib"
+# DOC "Location of cutil library"
+# NO_DEFAULT_PATH
+# )
+# # Now search system paths
+# find_library(CUDA_CUT_LIBRARY NAMES cutil ${cuda_cutil_name} DOC "Location of cutil library")
+# mark_as_advanced(CUDA_CUT_LIBRARY)
+# set(CUDA_CUT_LIBRARIES ${CUDA_CUT_LIBRARY})
+
+
+
+#############################
+# Check for required components
+set(CUDA_FOUND TRUE)
+
+set(CUDA_TOOLKIT_ROOT_DIR_INTERNAL "${CUDA_TOOLKIT_ROOT_DIR}" CACHE INTERNAL
+ "This is the value of the last time CUDA_TOOLKIT_ROOT_DIR was set successfully." FORCE)
+set(CUDA_TOOLKIT_TARGET_DIR_INTERNAL "${CUDA_TOOLKIT_TARGET_DIR}" CACHE INTERNAL
+ "This is the value of the last time CUDA_TOOLKIT_TARGET_DIR was set successfully." FORCE)
+set(CUDA_SDK_ROOT_DIR_INTERNAL "${CUDA_SDK_ROOT_DIR}" CACHE INTERNAL
+ "This is the value of the last time CUDA_SDK_ROOT_DIR was set successfully." FORCE)
+
+include(${CMAKE_CURRENT_LIST_DIR}/FindPackageHandleStandardArgs.cmake)
+find_package_handle_standard_args(CUDA
+ REQUIRED_VARS
+ CUDA_TOOLKIT_ROOT_DIR
+ CUDA_NVCC_EXECUTABLE
+ CUDA_INCLUDE_DIRS
+ CUDA_CUDART_LIBRARY
+ VERSION_VAR
+ CUDA_VERSION
+ )
+
+
+
+###############################################################################
+###############################################################################
+# Macros
+###############################################################################
+###############################################################################
+
+###############################################################################
+# Add include directories to pass to the nvcc command.
+macro(CUDA_INCLUDE_DIRECTORIES)
+ foreach(dir ${ARGN})
+ list(APPEND CUDA_NVCC_INCLUDE_ARGS_USER -I${dir})
+ endforeach()
+endmacro()
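+
+# Illustrative usage sketch (not part of the original module); call this
+# before cuda_add_executable/cuda_add_library so nvcc sees the extra
+# include directories:
+#   cuda_include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)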
+
+
+##############################################################################
+cuda_find_helper_file(parse_cubin cmake)
+cuda_find_helper_file(make2cmake cmake)
+cuda_find_helper_file(run_nvcc cmake)
+
+##############################################################################
+# Separate the OPTIONS out from the sources
+#
+macro(CUDA_GET_SOURCES_AND_OPTIONS _sources _cmake_options _options)
+ set( ${_sources} )
+ set( ${_cmake_options} )
+ set( ${_options} )
+ set( _found_options FALSE )
+ foreach(arg ${ARGN})
+ if("x${arg}" STREQUAL "xOPTIONS")
+ set( _found_options TRUE )
+ elseif(
+ "x${arg}" STREQUAL "xWIN32" OR
+ "x${arg}" STREQUAL "xMACOSX_BUNDLE" OR
+ "x${arg}" STREQUAL "xEXCLUDE_FROM_ALL" OR
+ "x${arg}" STREQUAL "xSTATIC" OR
+ "x${arg}" STREQUAL "xSHARED" OR
+ "x${arg}" STREQUAL "xMODULE"
+ )
+ list(APPEND ${_cmake_options} ${arg})
+ else()
+ if ( _found_options )
+ list(APPEND ${_options} ${arg})
+ else()
+ # Assume this is a file
+ list(APPEND ${_sources} ${arg})
+ endif()
+ endif()
+ endforeach()
+endmacro()
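+
+# Illustrative sketch (not part of the original module) of how the macro
+# splits its arguments. Given:
+#   cuda_get_sources_and_options(srcs cmake_opts nvcc_opts
+#                                kernel.cu SHARED OPTIONS -lineinfo)
+# srcs would hold "kernel.cu", cmake_opts "SHARED", and nvcc_opts "-lineinfo".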
+
+##############################################################################
+# Parse the OPTIONS from ARGN and set the variables prefixed by _option_prefix
+#
+macro(CUDA_PARSE_NVCC_OPTIONS _option_prefix)
+ set( _found_config )
+ foreach(arg ${ARGN})
+ # Determine if we are dealing with a per-configuration flag
+ foreach(config ${CUDA_configuration_types})
+ string(TOUPPER ${config} config_upper)
+ if (arg STREQUAL "${config_upper}")
+ set( _found_config _${arg})
+ # Set arg to nothing to keep it from being processed further
+ set( arg )
+ endif()
+ endforeach()
+
+ if ( arg )
+ list(APPEND ${_option_prefix}${_found_config} "${arg}")
+ endif()
+ endforeach()
+endmacro()
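+
+# Illustrative sketch (not part of the original module): given
+#   cuda_parse_nvcc_options(MY_FLAGS -O3 DEBUG -g RELEASE -use_fast_math)
+# MY_FLAGS would hold "-O3", MY_FLAGS_DEBUG "-g", and
+# MY_FLAGS_RELEASE "-use_fast_math".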
+
+##############################################################################
+# Helper to add the include directory for CUDA only once
+function(CUDA_ADD_CUDA_INCLUDE_ONCE)
+ get_directory_property(_include_directories INCLUDE_DIRECTORIES)
+ set(_add TRUE)
+ if(_include_directories)
+ foreach(dir ${_include_directories})
+ if("${dir}" STREQUAL "${CUDA_INCLUDE_DIRS}")
+ set(_add FALSE)
+ endif()
+ endforeach()
+ endif()
+ if(_add)
+ include_directories(${CUDA_INCLUDE_DIRS})
+ endif()
+endfunction()
+
+function(CUDA_BUILD_SHARED_LIBRARY shared_flag)
+ set(cmake_args ${ARGN})
+ # If SHARED, MODULE, or STATIC aren't already in the list of arguments, then
+ # add SHARED or STATIC based on the value of BUILD_SHARED_LIBS.
+ list(FIND cmake_args SHARED _cuda_found_SHARED)
+ list(FIND cmake_args MODULE _cuda_found_MODULE)
+ list(FIND cmake_args STATIC _cuda_found_STATIC)
+ if( _cuda_found_SHARED GREATER -1 OR
+ _cuda_found_MODULE GREATER -1 OR
+ _cuda_found_STATIC GREATER -1)
+ set(_cuda_build_shared_libs)
+ else()
+ if (BUILD_SHARED_LIBS)
+ set(_cuda_build_shared_libs SHARED)
+ else()
+ set(_cuda_build_shared_libs STATIC)
+ endif()
+ endif()
+ set(${shared_flag} ${_cuda_build_shared_libs} PARENT_SCOPE)
+endfunction()
+
+##############################################################################
+# Helper to avoid clashes of files with the same basename but different paths.
+# This doesn't attempt to do exactly what CMake internals do, which is to only
+# add this path when there is a conflict, since by the time a second collision
+# in names is detected it's already too late to fix the first one. For
+# consistency's sake the relative path will be added to all files.
+function(CUDA_COMPUTE_BUILD_PATH path build_path)
+ #message("CUDA_COMPUTE_BUILD_PATH([${path}] ${build_path})")
+ # Only deal with CMake style paths from here on out
+ file(TO_CMAKE_PATH "${path}" bpath)
+ if (IS_ABSOLUTE "${bpath}")
+ # Absolute paths are generally unnecessary, especially if something like
+ # file(GLOB_RECURSE) is used to pick up the files.
+
+ string(FIND "${bpath}" "${CMAKE_CURRENT_BINARY_DIR}" _binary_dir_pos)
+ if (_binary_dir_pos EQUAL 0)
+ file(RELATIVE_PATH bpath "${CMAKE_CURRENT_BINARY_DIR}" "${bpath}")
+ else()
+ file(RELATIVE_PATH bpath "${CMAKE_CURRENT_SOURCE_DIR}" "${bpath}")
+ endif()
+ endif()
+
+ # This recipe is from cmLocalGenerator::CreateSafeUniqueObjectFileName in the
+ # CMake source.
+
+ # Remove leading /
+ string(REGEX REPLACE "^[/]+" "" bpath "${bpath}")
+ # Avoid absolute paths by removing ':'
+ string(REPLACE ":" "_" bpath "${bpath}")
+ # Avoid relative paths that go up the tree
+ string(REPLACE "../" "__/" bpath "${bpath}")
+ # Avoid spaces
+ string(REPLACE " " "_" bpath "${bpath}")
+
+ # Strip off the filename. I wait until here to do it, since removing the
+ # basename can make a path that looked like path/../basename turn into
+ # path/.. (notice the trailing slash).
+ get_filename_component(bpath "${bpath}" PATH)
+
+ set(${build_path} "${bpath}" PARENT_SCOPE)
+ #message("${build_path} = ${bpath}")
+endfunction()
+
+##############################################################################
+# This helper macro populates the following variables and sets up custom
+# commands and targets to invoke the nvcc compiler to generate C or PTX source
+# dependent upon the format parameter. The compiler is invoked once with -M
+# to generate a dependency file and a second time with -cuda or -ptx to generate
+# a .cpp or .ptx file.
+# INPUT:
+# cuda_target - Target name
+# format - PTX, CUBIN, FATBIN or OBJ
+# FILE1 .. FILEN - The remaining arguments are the sources to be wrapped.
+# OPTIONS - Extra options to NVCC
+# OUTPUT:
+# generated_files - List of generated files
+##############################################################################
+##############################################################################
+
+macro(CUDA_WRAP_SRCS cuda_target format generated_files)
+
+ # If CMake doesn't support separable compilation, complain
+ if(CUDA_SEPARABLE_COMPILATION AND CMAKE_VERSION VERSION_LESS "2.8.10.1")
+ message(SEND_ERROR "CUDA_SEPARABLE_COMPILATION isn't supported for CMake versions less than 2.8.10.1")
+ endif()
+
+ # Set up all the command line flags here, so that they can be overridden on a per target basis.
+
+ set(nvcc_flags "")
+
+ # Emulation if the card isn't present.
+ if (CUDA_BUILD_EMULATION)
+ # Emulation.
+ set(nvcc_flags ${nvcc_flags} --device-emulation -D_DEVICEEMU -g)
+ else()
+ # Device mode. No flags necessary.
+ endif()
+
+ if(CUDA_HOST_COMPILATION_CPP)
+ set(CUDA_C_OR_CXX CXX)
+ else()
+ if(CUDA_VERSION VERSION_LESS "3.0")
+ set(nvcc_flags ${nvcc_flags} --host-compilation C)
+ else()
+ message(WARNING "--host-compilation flag is deprecated in CUDA version >= 3.0. Removing --host-compilation C flag" )
+ endif()
+ set(CUDA_C_OR_CXX C)
+ endif()
+
+ set(generated_extension ${CMAKE_${CUDA_C_OR_CXX}_OUTPUT_EXTENSION})
+
+ if(CUDA_64_BIT_DEVICE_CODE)
+ set(nvcc_flags ${nvcc_flags} -m64)
+ else()
+ set(nvcc_flags ${nvcc_flags} -m32)
+ endif()
+
+ if(CUDA_TARGET_CPU_ARCH)
+ set(nvcc_flags ${nvcc_flags} "--target-cpu-architecture=${CUDA_TARGET_CPU_ARCH}")
+ endif()
+
+ # This needs to be passed in at this stage, because VS needs to fill out the
+ # value of VCInstallDir from within VS. Note that CCBIN is only used if
+ # -ccbin or --compiler-bindir isn't used and CUDA_HOST_COMPILER matches
+ # $(VCInstallDir)/bin.
+ if(CMAKE_GENERATOR MATCHES "Visual Studio")
+ set(ccbin_flags -D "\"CCBIN:PATH=$(VCInstallDir)bin\"" )
+ else()
+ set(ccbin_flags)
+ endif()
+
+ # Figure out which configuration we will use and pass that in as an argument
+ # to the script. We need to defer the decision until compilation time, because
+ # for VS projects we won't know if we are making a debug or release build
+ # until build time.
+ if(CMAKE_GENERATOR MATCHES "Visual Studio")
+ set( CUDA_build_configuration "$(ConfigurationName)" )
+ else()
+ set( CUDA_build_configuration "${CMAKE_BUILD_TYPE}")
+ endif()
+
+ # Initialize our list of includes with the user ones followed by the CUDA system ones.
+ set(CUDA_NVCC_INCLUDE_ARGS ${CUDA_NVCC_INCLUDE_ARGS_USER} "-I${CUDA_INCLUDE_DIRS}")
+ # Get the include directories for this directory and use them for our nvcc command.
+ # Remove duplicate entries which may be present since include_directories
+ # in CMake >= 2.8.8 does not remove them.
+ get_directory_property(CUDA_NVCC_INCLUDE_DIRECTORIES INCLUDE_DIRECTORIES)
+ list(REMOVE_DUPLICATES CUDA_NVCC_INCLUDE_DIRECTORIES)
+ if(CUDA_NVCC_INCLUDE_DIRECTORIES)
+ foreach(dir ${CUDA_NVCC_INCLUDE_DIRECTORIES})
+ list(APPEND CUDA_NVCC_INCLUDE_ARGS -I${dir})
+ endforeach()
+ endif()
+
+ # Reset these variables
+ set(CUDA_WRAP_OPTION_NVCC_FLAGS)
+ foreach(config ${CUDA_configuration_types})
+ string(TOUPPER ${config} config_upper)
+ set(CUDA_WRAP_OPTION_NVCC_FLAGS_${config_upper})
+ endforeach()
+
+ CUDA_GET_SOURCES_AND_OPTIONS(_cuda_wrap_sources _cuda_wrap_cmake_options _cuda_wrap_options ${ARGN})
+ CUDA_PARSE_NVCC_OPTIONS(CUDA_WRAP_OPTION_NVCC_FLAGS ${_cuda_wrap_options})
+
+ # Figure out if we are building a shared library. BUILD_SHARED_LIBS is
+ # respected in CUDA_ADD_LIBRARY.
+ set(_cuda_build_shared_libs FALSE)
+ # SHARED, MODULE
+ list(FIND _cuda_wrap_cmake_options SHARED _cuda_found_SHARED)
+ list(FIND _cuda_wrap_cmake_options MODULE _cuda_found_MODULE)
+ if(_cuda_found_SHARED GREATER -1 OR _cuda_found_MODULE GREATER -1)
+ set(_cuda_build_shared_libs TRUE)
+ endif()
+ # STATIC
+ list(FIND _cuda_wrap_cmake_options STATIC _cuda_found_STATIC)
+ if(_cuda_found_STATIC GREATER -1)
+ set(_cuda_build_shared_libs FALSE)
+ endif()
+
+ # CUDA_HOST_FLAGS
+ if(_cuda_build_shared_libs)
+ # If we are setting up code for a shared library, then we need to add extra flags for
+ # compiling objects for shared libraries.
+ set(CUDA_HOST_SHARED_FLAGS ${CMAKE_SHARED_LIBRARY_${CUDA_C_OR_CXX}_FLAGS})
+ else()
+ set(CUDA_HOST_SHARED_FLAGS)
+ endif()
+ # Only add the CMAKE_{C,CXX}_FLAGS if we are propagating host flags. We
+ # always need to set the SHARED_FLAGS, though.
+ if(CUDA_PROPAGATE_HOST_FLAGS)
+ set(_cuda_host_flags "set(CMAKE_HOST_FLAGS ${CMAKE_${CUDA_C_OR_CXX}_FLAGS} ${CUDA_HOST_SHARED_FLAGS})")
+ else()
+ set(_cuda_host_flags "set(CMAKE_HOST_FLAGS ${CUDA_HOST_SHARED_FLAGS})")
+ endif()
+
+ set(_cuda_nvcc_flags_config "# Build specific configuration flags")
+ # Loop over all the configuration types to generate appropriate flags for run_nvcc.cmake
+ foreach(config ${CUDA_configuration_types})
+ string(TOUPPER ${config} config_upper)
+ # CMAKE_FLAGS are strings and not lists. By not putting quotes around CMAKE_FLAGS
+ # we convert the strings to lists (like we want).
+
+ if(CUDA_PROPAGATE_HOST_FLAGS)
+ # nvcc chokes on -g3 in versions prior to 3.0, so replace it with -g
+ set(_cuda_fix_g3 FALSE)
+
+ if(CMAKE_COMPILER_IS_GNUCC)
+ if (CUDA_VERSION VERSION_LESS "3.0" OR
+ CUDA_VERSION VERSION_EQUAL "4.1" OR
+ CUDA_VERSION VERSION_EQUAL "4.2"
+ )
+ set(_cuda_fix_g3 TRUE)
+ endif()
+ endif()
+ if(_cuda_fix_g3)
+ string(REPLACE "-g3" "-g" _cuda_C_FLAGS "${CMAKE_${CUDA_C_OR_CXX}_FLAGS_${config_upper}}")
+ else()
+ set(_cuda_C_FLAGS "${CMAKE_${CUDA_C_OR_CXX}_FLAGS_${config_upper}}")
+ endif()
+
+ set(_cuda_host_flags "${_cuda_host_flags}\nset(CMAKE_HOST_FLAGS_${config_upper} ${_cuda_C_FLAGS})")
+ endif()
+
+ # Note that if we ever want CUDA_NVCC_FLAGS_ to be a string (instead of a list
+ # like it is currently), we can remove the quotes around the
+ # ${CUDA_NVCC_FLAGS_${config_upper}} variable like the CMAKE_HOST_FLAGS_ variable.
+ set(_cuda_nvcc_flags_config "${_cuda_nvcc_flags_config}\nset(CUDA_NVCC_FLAGS_${config_upper} ${CUDA_NVCC_FLAGS_${config_upper}} ;; ${CUDA_WRAP_OPTION_NVCC_FLAGS_${config_upper}})")
+ endforeach()
+
+ # Process the C++11 flag. If the host sets the flag, we need to add it to nvcc and
+ # remove it from the host. This is because -Xcompiler -std=c++11 will choke nvcc (it uses
+ # the C preprocessor). In order to get this to work correctly, we need to use nvcc's
+ # specific c++11 flag.
+ if( "${_cuda_host_flags}" MATCHES "-std=c\\+\\+11")
+ # Add the c++11 flag to nvcc if it isn't already present. Note that we only look at
+ # the main flag instead of the configuration specific flags.
+ if( NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std;c\\+\\+11" )
+ list(APPEND nvcc_flags --std c++11)
+ endif()
+ string(REGEX REPLACE "[-]+std=c\\+\\+11" "" _cuda_host_flags "${_cuda_host_flags}")
+ endif()
+
+ # Get the list of definitions from the directory property
+ get_directory_property(CUDA_NVCC_DEFINITIONS COMPILE_DEFINITIONS)
+ if(CUDA_NVCC_DEFINITIONS)
+ foreach(_definition ${CUDA_NVCC_DEFINITIONS})
+ list(APPEND nvcc_flags "-D${_definition}")
+ endforeach()
+ endif()
+
+ if(_cuda_build_shared_libs)
+ list(APPEND nvcc_flags "-D${cuda_target}_EXPORTS")
+ endif()
+
+ # Reset the output variable
+ set(_cuda_wrap_generated_files "")
+
+ # Iterate over the macro arguments and create custom
+ # commands for all the .cu files.
+ foreach(file ${ARGN})
+ # Ignore any file marked as a HEADER_FILE_ONLY
+ get_source_file_property(_is_header ${file} HEADER_FILE_ONLY)
+ # Allow per source file overrides of the format. Also allows compiling non-.cu files.
+ get_source_file_property(_cuda_source_format ${file} CUDA_SOURCE_PROPERTY_FORMAT)
+ if((${file} MATCHES "\\.cu$" OR _cuda_source_format) AND NOT _is_header)
+
+ if(NOT _cuda_source_format)
+ set(_cuda_source_format ${format})
+ endif()
+ # If file isn't a .cu file, we need to tell nvcc to treat it as such.
+ if(NOT ${file} MATCHES "\\.cu$")
+ set(cuda_language_flag -x=cu)
+ else()
+ set(cuda_language_flag)
+ endif()
+
+ if( ${_cuda_source_format} MATCHES "OBJ")
+ set( cuda_compile_to_external_module OFF )
+ else()
+ set( cuda_compile_to_external_module ON )
+ if( ${_cuda_source_format} MATCHES "PTX" )
+ set( cuda_compile_to_external_module_type "ptx" )
+ elseif( ${_cuda_source_format} MATCHES "CUBIN")
+ set( cuda_compile_to_external_module_type "cubin" )
+ elseif( ${_cuda_source_format} MATCHES "FATBIN")
+ set( cuda_compile_to_external_module_type "fatbin" )
+ else()
+ message( FATAL_ERROR "Invalid format flag passed to CUDA_WRAP_SRCS or set with CUDA_SOURCE_PROPERTY_FORMAT file property for file '${file}': '${_cuda_source_format}'. Use OBJ, PTX, CUBIN or FATBIN.")
+ endif()
+ endif()
+
+ if(cuda_compile_to_external_module)
+ # Don't use any of the host compilation flags for PTX targets.
+ set(CUDA_HOST_FLAGS)
+ set(CUDA_NVCC_FLAGS_CONFIG)
+ else()
+ set(CUDA_HOST_FLAGS ${_cuda_host_flags})
+ set(CUDA_NVCC_FLAGS_CONFIG ${_cuda_nvcc_flags_config})
+ endif()
+
+ # Determine output directory
+ cuda_compute_build_path("${file}" cuda_build_path)
+ set(cuda_compile_intermediate_directory "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${cuda_target}.dir/${cuda_build_path}")
+ if(CUDA_GENERATED_OUTPUT_DIR)
+ set(cuda_compile_output_dir "${CUDA_GENERATED_OUTPUT_DIR}")
+ else()
+ if ( cuda_compile_to_external_module )
+ set(cuda_compile_output_dir "${CMAKE_CURRENT_BINARY_DIR}")
+ else()
+ set(cuda_compile_output_dir "${cuda_compile_intermediate_directory}")
+ endif()
+ endif()
+
+ # Add a custom target to generate a c or ptx file. ######################
+
+ get_filename_component( basename ${file} NAME )
+ if( cuda_compile_to_external_module )
+ set(generated_file_path "${cuda_compile_output_dir}")
+ set(generated_file_basename "${cuda_target}_generated_${basename}.${cuda_compile_to_external_module_type}")
+ set(format_flag "-${cuda_compile_to_external_module_type}")
+ file(MAKE_DIRECTORY "${cuda_compile_output_dir}")
+ else()
+ set(generated_file_path "${cuda_compile_output_dir}/${CMAKE_CFG_INTDIR}")
+ set(generated_file_basename "${cuda_target}_generated_${basename}${generated_extension}")
+ if(CUDA_SEPARABLE_COMPILATION)
+ set(format_flag "-dc")
+ else()
+ set(format_flag "-c")
+ endif()
+ endif()
+
+ # Set all of our file names. Make sure that whatever filenames that have
+ # generated_file_path in them get passed in through as a command line
+ # argument, so that the ${CMAKE_CFG_INTDIR} gets expanded at run time
+ # instead of configure time.
+ set(generated_file "${generated_file_path}/${generated_file_basename}")
+ set(cmake_dependency_file "${cuda_compile_intermediate_directory}/${generated_file_basename}.depend")
+ set(NVCC_generated_dependency_file "${cuda_compile_intermediate_directory}/${generated_file_basename}.NVCC-depend")
+ set(generated_cubin_file "${generated_file_path}/${generated_file_basename}.cubin.txt")
+ set(custom_target_script "${cuda_compile_intermediate_directory}/${generated_file_basename}.cmake")
+
+ # Setup properties for obj files:
+ if( NOT cuda_compile_to_external_module )
+ set_source_files_properties("${generated_file}"
+ PROPERTIES
+ EXTERNAL_OBJECT true # This is an object file not to be compiled, but only be linked.
+ )
+ endif()
+
+ # Don't add CMAKE_CURRENT_SOURCE_DIR if the path is already an absolute path.
+ get_filename_component(file_path "${file}" PATH)
+ if(IS_ABSOLUTE "${file_path}")
+ set(source_file "${file}")
+ else()
+ set(source_file "${CMAKE_CURRENT_SOURCE_DIR}/${file}")
+ endif()
+
+ if( NOT cuda_compile_to_external_module AND CUDA_SEPARABLE_COMPILATION)
+ list(APPEND ${cuda_target}_SEPARABLE_COMPILATION_OBJECTS "${generated_file}")
+ endif()
+
+ # Bring in the dependencies. Creates a variable CUDA_NVCC_DEPEND #######
+ cuda_include_nvcc_dependencies(${cmake_dependency_file})
+
+ # Convenience string for output ########################################
+ if(CUDA_BUILD_EMULATION)
+ set(cuda_build_type "Emulation")
+ else()
+ set(cuda_build_type "Device")
+ endif()
+
+ # Build the NVCC made dependency file ###################################
+ set(build_cubin OFF)
+ if ( NOT CUDA_BUILD_EMULATION AND CUDA_BUILD_CUBIN )
+ if ( NOT cuda_compile_to_external_module )
+ set ( build_cubin ON )
+ endif()
+ endif()
+
+ # Configure the build script
+ configure_file("${CUDA_run_nvcc}" "${custom_target_script}" @ONLY)
+
+ # If a user specifies the same cuda file as input more than once, bad
+ # things can happen with dependencies. Here we check an option to see if
+ # this is the behavior they want.
+ if(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE)
+ set(main_dep MAIN_DEPENDENCY ${source_file})
+ else()
+ set(main_dep DEPENDS ${source_file})
+ endif()
+
+ if(CUDA_VERBOSE_BUILD)
+ set(verbose_output ON)
+ elseif(CMAKE_GENERATOR MATCHES "Makefiles")
+ set(verbose_output "$(VERBOSE)")
+ else()
+ set(verbose_output OFF)
+ endif()
+
+ # Create the comment string
+ file(RELATIVE_PATH generated_file_relative_path "${CMAKE_BINARY_DIR}" "${generated_file}")
+ if(cuda_compile_to_external_module)
+ set(cuda_build_comment_string "Building NVCC ${cuda_compile_to_external_module_type} file ${generated_file_relative_path}")
+ else()
+ set(cuda_build_comment_string "Building NVCC (${cuda_build_type}) object ${generated_file_relative_path}")
+ endif()
+
+ # Build the generated file and dependency file ##########################
+ add_custom_command(
+ OUTPUT ${generated_file}
+ # These output files depend on the source_file and the contents of cmake_dependency_file
+ ${main_dep}
+ DEPENDS ${CUDA_NVCC_DEPEND}
+ DEPENDS ${custom_target_script}
+ # Make sure the output directory exists before trying to write to it.
+ COMMAND ${CMAKE_COMMAND} -E make_directory "${generated_file_path}"
+ COMMAND ${CMAKE_COMMAND} ARGS
+ -D verbose:BOOL=${verbose_output}
+ ${ccbin_flags}
+ -D build_configuration:STRING=${CUDA_build_configuration}
+ -D "generated_file:STRING=${generated_file}"
+ -D "generated_cubin_file:STRING=${generated_cubin_file}"
+ -P "${custom_target_script}"
+ WORKING_DIRECTORY "${cuda_compile_intermediate_directory}"
+ COMMENT "${cuda_build_comment_string}"
+ )
+
+ # Make sure the build system knows the file is generated.
+ set_source_files_properties(${generated_file} PROPERTIES GENERATED TRUE)
+
+ list(APPEND _cuda_wrap_generated_files ${generated_file})
+
+ # Add the other files that we want cmake to clean on a cleanup ##########
+ list(APPEND CUDA_ADDITIONAL_CLEAN_FILES "${cmake_dependency_file}")
+ list(REMOVE_DUPLICATES CUDA_ADDITIONAL_CLEAN_FILES)
+ set(CUDA_ADDITIONAL_CLEAN_FILES ${CUDA_ADDITIONAL_CLEAN_FILES} CACHE INTERNAL "List of intermediate files that are part of the cuda dependency scanning.")
+
+ endif()
+ endforeach()
+
+ # Set the return parameter
+ set(${generated_files} ${_cuda_wrap_generated_files})
+endmacro()
+
+function(_cuda_get_important_host_flags important_flags flag_string)
+ if(CMAKE_GENERATOR MATCHES "Visual Studio")
+ string(REGEX MATCHALL "/M[DT][d]?" flags "${flag_string}")
+ list(APPEND ${important_flags} ${flags})
+ else()
+ string(REGEX MATCHALL "-fPIC" flags "${flag_string}")
+ list(APPEND ${important_flags} ${flags})
+ endif()
+ set(${important_flags} ${${important_flags}} PARENT_SCOPE)
+endfunction()
+
+###############################################################################
+###############################################################################
+# Separable Compilation Link
+###############################################################################
+###############################################################################
+
+# Compute the filename to be used by CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS
+function(CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME output_file_var cuda_target object_files)
+ if (object_files)
+ set(generated_extension ${CMAKE_${CUDA_C_OR_CXX}_OUTPUT_EXTENSION})
+ set(output_file "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${cuda_target}.dir/${CMAKE_CFG_INTDIR}/${cuda_target}_intermediate_link${generated_extension}")
+ else()
+ set(output_file)
+ endif()
+
+ set(${output_file_var} "${output_file}" PARENT_SCOPE)
+endfunction()
+
+# Setup the build rule for the separable compilation intermediate link file.
+function(CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS output_file cuda_target options object_files)
+ if (object_files)
+
+ set_source_files_properties("${output_file}"
+ PROPERTIES
+ EXTERNAL_OBJECT TRUE # This is an object file not to be compiled, but only
+ # be linked.
+ GENERATED TRUE # This file is generated during the build
+ )
+
+ # For now we are ignoring all the configuration specific flags.
+ set(nvcc_flags)
+ CUDA_PARSE_NVCC_OPTIONS(nvcc_flags ${options})
+ if(CUDA_64_BIT_DEVICE_CODE)
+ list(APPEND nvcc_flags -m64)
+ else()
+ list(APPEND nvcc_flags -m32)
+ endif()
+ # If -ccbin, --compiler-bindir has been specified, don't do anything. Otherwise add it here.
+ list( FIND nvcc_flags "-ccbin" ccbin_found0 )
+ list( FIND nvcc_flags "--compiler-bindir" ccbin_found1 )
+ if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 AND CUDA_HOST_COMPILER )
+ list(APPEND nvcc_flags -ccbin "\"${CUDA_HOST_COMPILER}\"")
+ endif()
+
+ # Create a list of flags specified by CUDA_NVCC_FLAGS_${CONFIG} and CMAKE_${CUDA_C_OR_CXX}_FLAGS*
+ set(config_specific_flags)
+ set(flags)
+ foreach(config ${CUDA_configuration_types})
+ string(TOUPPER ${config} config_upper)
+ # Add config specific flags
+ foreach(f ${CUDA_NVCC_FLAGS_${config_upper}})
+ list(APPEND config_specific_flags $<$<CONFIG:${config}>:${f}>)
+ endforeach()
+ set(important_host_flags)
+ _cuda_get_important_host_flags(important_host_flags "${CMAKE_${CUDA_C_OR_CXX}_FLAGS_${config_upper}}")
+ foreach(f ${important_host_flags})
+ list(APPEND flags $<$<CONFIG:${config}>:-Xcompiler> $<$<CONFIG:${config}>:${f}>)
+ endforeach()
+ endforeach()
+ # Add CMAKE_${CUDA_C_OR_CXX}_FLAGS
+ set(important_host_flags)
+ _cuda_get_important_host_flags(important_host_flags "${CMAKE_${CUDA_C_OR_CXX}_FLAGS}")
+ foreach(f ${important_host_flags})
+ list(APPEND flags -Xcompiler ${f})
+ endforeach()
+
+ # Add our general CUDA_NVCC_FLAGS with the configuration specific flags
+ set(nvcc_flags ${CUDA_NVCC_FLAGS} ${config_specific_flags} ${nvcc_flags})
+
+ file(RELATIVE_PATH output_file_relative_path "${CMAKE_BINARY_DIR}" "${output_file}")
+
+ # Some generators don't handle the multiple levels of custom command
+ # dependencies correctly (obj1 depends on file1, obj2 depends on obj1), so
+ # we work around that issue by compiling the intermediate link object as a
+ # pre-link custom command in that situation.
+ set(do_obj_build_rule TRUE)
+ if (MSVC_VERSION GREATER 1599)
+ # VS 2010 and 2012 have this problem. If future versions fix this issue,
+ # it should still work, it just won't be as nice as the other method.
+ set(do_obj_build_rule FALSE)
+ endif()
+
+ if (do_obj_build_rule)
+ add_custom_command(
+ OUTPUT ${output_file}
+ DEPENDS ${object_files}
+ COMMAND ${CUDA_NVCC_EXECUTABLE} ${nvcc_flags} -dlink ${object_files} -o ${output_file}
+ ${flags}
+ COMMENT "Building NVCC intermediate link file ${output_file_relative_path}"
+ )
+ else()
+ get_filename_component(output_file_dir "${output_file}" DIRECTORY)
+ add_custom_command(
+ TARGET ${cuda_target}
+ PRE_LINK
+ COMMAND ${CMAKE_COMMAND} -E echo "Building NVCC intermediate link file ${output_file_relative_path}"
+ COMMAND ${CMAKE_COMMAND} -E make_directory "${output_file_dir}"
+ COMMAND ${CUDA_NVCC_EXECUTABLE} ${nvcc_flags} ${flags} -dlink ${object_files} -o "${output_file}"
+ )
+ endif()
+ endif()
+endfunction()
+
+###############################################################################
+###############################################################################
+# ADD LIBRARY
+###############################################################################
+###############################################################################
+macro(CUDA_ADD_LIBRARY cuda_target)
+
+ CUDA_ADD_CUDA_INCLUDE_ONCE()
+
+ # Separate the sources from the options
+ CUDA_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _options ${ARGN})
+ CUDA_BUILD_SHARED_LIBRARY(_cuda_shared_flag ${ARGN})
+ # Create custom commands and targets for each file.
+ CUDA_WRAP_SRCS( ${cuda_target} OBJ _generated_files ${_sources}
+ ${_cmake_options} ${_cuda_shared_flag}
+ OPTIONS ${_options} )
+
+ # Compute the file name of the intermediate link file used for separable
+ # compilation.
+ CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME(link_file ${cuda_target} "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}")
+
+ # Add the library.
+ add_library(${cuda_target} ${_cmake_options}
+ ${_generated_files}
+ ${_sources}
+ ${link_file}
+ )
+
+ # Add a link phase for the separable compilation if it has been enabled. If
+ # it has been enabled then the ${cuda_target}_SEPARABLE_COMPILATION_OBJECTS
+ # variable will have been defined.
+ CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS("${link_file}" ${cuda_target} "${_options}" "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}")
+
+ target_link_libraries(${cuda_target}
+ ${CUDA_LIBRARIES}
+ )
+
+ # We need to set the linker language based on what the expected generated file
+ # would be. CUDA_C_OR_CXX is computed based on CUDA_HOST_COMPILATION_CPP.
+ set_target_properties(${cuda_target}
+ PROPERTIES
+ LINKER_LANGUAGE ${CUDA_C_OR_CXX}
+ )
+
+endmacro()
+
+
+###############################################################################
+###############################################################################
+# ADD EXECUTABLE
+###############################################################################
+###############################################################################
+macro(CUDA_ADD_EXECUTABLE cuda_target)
+
+ CUDA_ADD_CUDA_INCLUDE_ONCE()
+
+ # Separate the sources from the options
+ CUDA_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _options ${ARGN})
+ # Create custom commands and targets for each file.
+ CUDA_WRAP_SRCS( ${cuda_target} OBJ _generated_files ${_sources} OPTIONS ${_options} )
+
+ # Compute the file name of the intermediate link file used for separable
+ # compilation.
+ CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME(link_file ${cuda_target} "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}")
+
+ # Add the executable.
+ add_executable(${cuda_target} ${_cmake_options}
+ ${_generated_files}
+ ${_sources}
+ ${link_file}
+ )
+
+ # Add a link phase for the separable compilation if it has been enabled. If
+ # it has been enabled then the ${cuda_target}_SEPARABLE_COMPILATION_OBJECTS
+ # variable will have been defined.
+ CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS("${link_file}" ${cuda_target} "${_options}" "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}")
+
+ target_link_libraries(${cuda_target}
+ ${CUDA_LIBRARIES}
+ )
+
+ # We need to set the linker language based on what the expected generated file
+ # would be. CUDA_C_OR_CXX is computed based on CUDA_HOST_COMPILATION_CPP.
+ set_target_properties(${cuda_target}
+ PROPERTIES
+ LINKER_LANGUAGE ${CUDA_C_OR_CXX}
+ )
+
+endmacro()
+
+
+###############################################################################
+###############################################################################
+# (Internal) helper for manually added cuda source files with specific targets
+###############################################################################
+###############################################################################
+macro(cuda_compile_base cuda_target format generated_files)
+
+ # Separate the sources from the options
+ CUDA_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _options ${ARGN})
+ # Create custom commands and targets for each file.
+ CUDA_WRAP_SRCS( ${cuda_target} ${format} _generated_files ${_sources} ${_cmake_options}
+ OPTIONS ${_options} )
+
+ set( ${generated_files} ${_generated_files})
+
+endmacro()
+
+###############################################################################
+###############################################################################
+# CUDA COMPILE
+###############################################################################
+###############################################################################
+macro(CUDA_COMPILE generated_files)
+ cuda_compile_base(cuda_compile OBJ ${generated_files} ${ARGN})
+endmacro()
+
+###############################################################################
+###############################################################################
+# CUDA COMPILE PTX
+###############################################################################
+###############################################################################
+macro(CUDA_COMPILE_PTX generated_files)
+ cuda_compile_base(cuda_compile_ptx PTX ${generated_files} ${ARGN})
+endmacro()
+
+###############################################################################
+###############################################################################
+# CUDA COMPILE FATBIN
+###############################################################################
+###############################################################################
+macro(CUDA_COMPILE_FATBIN generated_files)
+ cuda_compile_base(cuda_compile_fatbin FATBIN ${generated_files} ${ARGN})
+endmacro()
+
+###############################################################################
+###############################################################################
+# CUDA COMPILE CUBIN
+###############################################################################
+###############################################################################
+macro(CUDA_COMPILE_CUBIN generated_files)
+ cuda_compile_base(cuda_compile_cubin CUBIN ${generated_files} ${ARGN})
+endmacro()
+
+
+###############################################################################
+###############################################################################
+# CUDA ADD CUFFT TO TARGET
+###############################################################################
+###############################################################################
+macro(CUDA_ADD_CUFFT_TO_TARGET target)
+ if (CUDA_BUILD_EMULATION)
+ target_link_libraries(${target} ${CUDA_cufftemu_LIBRARY})
+ else()
+ target_link_libraries(${target} ${CUDA_cufft_LIBRARY})
+ endif()
+endmacro()
+
+###############################################################################
+###############################################################################
+# CUDA ADD CUBLAS TO TARGET
+###############################################################################
+###############################################################################
+macro(CUDA_ADD_CUBLAS_TO_TARGET target)
+ if (CUDA_BUILD_EMULATION)
+ target_link_libraries(${target} ${CUDA_cublasemu_LIBRARY})
+ else()
+ target_link_libraries(${target} ${CUDA_cublas_LIBRARY})
+ endif()
+endmacro()
+
+###############################################################################
+###############################################################################
+# CUDA BUILD CLEAN TARGET
+###############################################################################
+###############################################################################
+macro(CUDA_BUILD_CLEAN_TARGET)
+ # Call this after you add all your CUDA targets, and you will get a
+ # convenience target. You should also run make clean after building this
+ # target to get the build system to generate all the code again.
+
+ set(cuda_clean_target_name clean_cuda_depends)
+ if (CMAKE_GENERATOR MATCHES "Visual Studio")
+ string(TOUPPER ${cuda_clean_target_name} cuda_clean_target_name)
+ endif()
+ add_custom_target(${cuda_clean_target_name}
+ COMMAND ${CMAKE_COMMAND} -E remove ${CUDA_ADDITIONAL_CLEAN_FILES})
+
+ # Clear out the variable, so the next time we configure it will be empty.
+ # This is useful so that the files won't persist in the list after targets
+ # have been removed.
+ set(CUDA_ADDITIONAL_CLEAN_FILES "" CACHE INTERNAL "List of intermediate files that are part of the cuda dependency scanning.")
+endmacro()
diff --git a/stream_compaction/Project2-Stream-Compaction/cmake/FindCUDA/make2cmake.cmake b/stream_compaction/Project2-Stream-Compaction/cmake/FindCUDA/make2cmake.cmake
new file mode 100644
index 0000000..c433fa8
--- /dev/null
+++ b/stream_compaction/Project2-Stream-Compaction/cmake/FindCUDA/make2cmake.cmake
@@ -0,0 +1,92 @@
+# James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+# Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html
+#
+# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved.
+#
+# Copyright (c) 2007-2009
+# Scientific Computing and Imaging Institute, University of Utah
+#
+# This code is licensed under the MIT License. See the FindCUDA.cmake script
+# for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+
+#######################################################################
+# This converts a file written in makefile syntax into one that can be included
+# by CMake.
+
+file(READ ${input_file} depend_text)
+
+if (NOT "${depend_text}" STREQUAL "")
+
+ # message("FOUND DEPENDS")
+
+ string(REPLACE "\\ " " " depend_text ${depend_text})
+
+ # This works for the nvcc -M generated dependency files.
+ string(REGEX REPLACE "^.* : " "" depend_text ${depend_text})
+ string(REGEX REPLACE "[ \\\\]*\n" ";" depend_text ${depend_text})
+
+ set(dependency_list "")
+
+ foreach(file ${depend_text})
+
+ string(REGEX REPLACE "^ +" "" file ${file})
+
+ # OK, now if we had a UNC path, nvcc has a tendency to only output the first '/'
+ # instead of '//'. Here we will test to see if the file exists, if it doesn't then
+ # try to prepend another '/' to the path and test again. If it still fails remove the
+ # path.
+
+ if(NOT EXISTS "${file}")
+ if (EXISTS "/${file}")
+ set(file "/${file}")
+ else()
+ message(WARNING " Removing non-existent dependency file: ${file}")
+ set(file "")
+ endif()
+ endif()
+
+ if(NOT IS_DIRECTORY "${file}")
+ # If softlinks start to matter, we should change this to REALPATH. For now we need
+ # to flatten paths, because nvcc can generate stuff like /bin/../include instead of
+ # just /include.
+ get_filename_component(file_absolute "${file}" ABSOLUTE)
+ list(APPEND dependency_list "${file_absolute}")
+ endif()
+
+ endforeach()
+
+else()
+ # message("FOUND NO DEPENDS")
+endif()
+
+# Remove the duplicate entries and sort them.
+list(REMOVE_DUPLICATES dependency_list)
+list(SORT dependency_list)
+
+foreach(file ${dependency_list})
+ set(cuda_nvcc_depend "${cuda_nvcc_depend} \"${file}\"\n")
+endforeach()
+
+file(WRITE ${output_file} "# Generated by: make2cmake.cmake\nSET(CUDA_NVCC_DEPEND\n ${cuda_nvcc_depend})\n\n")
diff --git a/stream_compaction/Project2-Stream-Compaction/cmake/FindCUDA/parse_cubin.cmake b/stream_compaction/Project2-Stream-Compaction/cmake/FindCUDA/parse_cubin.cmake
new file mode 100644
index 0000000..626c8a2
--- /dev/null
+++ b/stream_compaction/Project2-Stream-Compaction/cmake/FindCUDA/parse_cubin.cmake
@@ -0,0 +1,111 @@
+# James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+# Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html
+#
+# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved.
+#
+# Copyright (c) 2007-2009
+# Scientific Computing and Imaging Institute, University of Utah
+#
+# This code is licensed under the MIT License. See the FindCUDA.cmake script
+# for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+
+#######################################################################
+# Parses a .cubin file produced by nvcc and reports statistics about the file.
+
+
+file(READ ${input_file} file_text)
+
+if (NOT "${file_text}" STREQUAL "")
+
+ string(REPLACE ";" "\\;" file_text ${file_text})
+ string(REPLACE "\ncode" ";code" file_text ${file_text})
+
+ list(LENGTH file_text len)
+
+ foreach(line ${file_text})
+
+ # Only look at "code { }" blocks.
+ if(line MATCHES "^code")
+
+ # Break into individual lines.
+ string(REGEX REPLACE "\n" ";" line ${line})
+
+ foreach(entry ${line})
+
+ # Extract kernel names.
+ if (${entry} MATCHES "[^g]name = ([^ ]+)")
+ set(entry "${CMAKE_MATCH_1}")
+
+ # Check to see if the kernel name starts with "_"
+ set(skip FALSE)
+ # if (${entry} MATCHES "^_")
+ # Skip the rest of this block.
+ # message("Skipping ${entry}")
+ # set(skip TRUE)
+ # else ()
+ message("Kernel: ${entry}")
+ # endif ()
+
+ endif()
+
+ # Skip the rest of the block if necessary
+ if(NOT skip)
+
+ # Registers
+ if (${entry} MATCHES "reg([ ]+)=([ ]+)([^ ]+)")
+ set(entry "${CMAKE_MATCH_3}")
+ message("Registers: ${entry}")
+ endif()
+
+ # Local memory
+ if (${entry} MATCHES "lmem([ ]+)=([ ]+)([^ ]+)")
+ set(entry "${CMAKE_MATCH_3}")
+ message("Local: ${entry}")
+ endif()
+
+ # Shared memory
+ if (${entry} MATCHES "smem([ ]+)=([ ]+)([^ ]+)")
+ set(entry "${CMAKE_MATCH_3}")
+ message("Shared: ${entry}")
+ endif()
+
+ if (${entry} MATCHES "^}")
+ message("")
+ endif()
+
+ endif()
+
+
+ endforeach()
+
+ endif()
+
+ endforeach()
+
+else()
+ # message("FOUND NO DEPENDS")
+endif()
+
+
diff --git a/stream_compaction/Project2-Stream-Compaction/cmake/FindCUDA/run_nvcc.cmake b/stream_compaction/Project2-Stream-Compaction/cmake/FindCUDA/run_nvcc.cmake
new file mode 100644
index 0000000..abdd307
--- /dev/null
+++ b/stream_compaction/Project2-Stream-Compaction/cmake/FindCUDA/run_nvcc.cmake
@@ -0,0 +1,288 @@
+# James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved.
+#
+# This code is licensed under the MIT License. See the FindCUDA.cmake script
+# for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies. In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<> OFF: Be as quiet as possible (default)
+# ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+# RelWithDebInfo, but it should match one of the
+# entries in CUDA_HOST_FLAGS. This is the build
+# configuration used when compiling the code. If
+# blank or unspecified Debug is assumed as this is
+# what CMake does.
+#
+# generated_file:STRING=<> File to generate. This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate. This argument must be passed
+# in if build_cubin is true.
+
+if(NOT generated_file)
+ message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "@CMAKE_COMMAND@") # path
+set(source_file "@source_file@") # path
+set(NVCC_generated_dependency_file "@NVCC_generated_dependency_file@") # path
+set(cmake_dependency_file "@cmake_dependency_file@") # path
+set(CUDA_make2cmake "@CUDA_make2cmake@") # path
+set(CUDA_parse_cubin "@CUDA_parse_cubin@") # path
+set(build_cubin @build_cubin@) # bool
+set(CUDA_HOST_COMPILER "@CUDA_HOST_COMPILER@") # path
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "@generated_file_path@") # path
+set(generated_file_internal "@generated_file@") # path
+set(generated_cubin_file_internal "@generated_cubin_file@") # path
+
+set(CUDA_NVCC_EXECUTABLE "@CUDA_NVCC_EXECUTABLE@") # path
+set(CUDA_NVCC_FLAGS @CUDA_NVCC_FLAGS@ ;; @CUDA_WRAP_OPTION_NVCC_FLAGS@) # list
+@CUDA_NVCC_FLAGS_CONFIG@
+set(nvcc_flags @nvcc_flags@) # list
+set(CUDA_NVCC_INCLUDE_ARGS "@CUDA_NVCC_INCLUDE_ARGS@") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "@format_flag@") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+ message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags. The choice of C or CXX should
+# already have been made by FindCUDA.cmake.
+@CUDA_HOST_FLAGS@
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+ set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+ # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+ set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+ set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 AND CUDA_HOST_COMPILER )
+ if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+ set(CCBIN -ccbin "${CCBIN}")
+ else()
+ set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+ endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+# status - Status message to print if verbose is true
+# command - COMMAND argument from the usual execute_process argument structure
+# ARGN - Remaining arguments are the command with arguments
+#
+# CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+ set(_command ${command})
+ if(NOT "x${_command}" STREQUAL "xCOMMAND")
+ message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})")
+ endif()
+ if(verbose)
+ execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+ # Now we need to build up our command string. We are accounting for quotes
+ # and spaces, anything else is left up to the user to fix if they want to
+ # copy and paste a runnable command line.
+ set(cuda_execute_process_string)
+ foreach(arg ${ARGN})
+ # If there are quotes, escape them, so they come through.
+ string(REPLACE "\"" "\\\"" arg ${arg})
+ # Args with spaces need quotes around them to get them to be parsed as a single argument.
+ if(arg MATCHES " ")
+ list(APPEND cuda_execute_process_string "\"${arg}\"")
+ else()
+ list(APPEND cuda_execute_process_string ${arg})
+ endif()
+ endforeach()
+ # Echo the command
+ execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+ endif()
+ # Run the command
+ execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+ "Removing ${generated_file}"
+ COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+ )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION @CUDA_VERSION@)
+if(CUDA_VERSION VERSION_LESS "3.0")
+ cmake_policy(PUSH)
+ # CMake policy 0007 NEW states that empty list elements are not
+ # ignored. I'm just setting it to avoid the warning that's printed.
+ cmake_policy(SET CMP0007 NEW)
+ # Note that this will remove all occurrences of -G.
+ list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+ cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invocation. We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+ "Generating dependency file: ${NVCC_generated_dependency_file}"
+ COMMAND "${CUDA_NVCC_EXECUTABLE}"
+ -M
+ ${CUDACC_DEFINE}
+ "${source_file}"
+ -o "${NVCC_generated_dependency_file}"
+ ${CCBIN}
+ ${nvcc_flags}
+ ${nvcc_host_compiler_flags}
+ ${depends_CUDA_NVCC_FLAGS}
+ -DNVCC
+ ${CUDA_NVCC_INCLUDE_ARGS}
+ )
+
+if(CUDA_result)
+ message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file. Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+ "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+ COMMAND "${CMAKE_COMMAND}"
+ -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+ -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+ -P "${CUDA_make2cmake}"
+ )
+
+if(CUDA_result)
+ message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+ "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+ COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+ )
+
+if(CUDA_result)
+ message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+ "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+ COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+ )
+
+if(CUDA_result)
+ message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+ "Generating ${generated_file}"
+ COMMAND "${CUDA_NVCC_EXECUTABLE}"
+ "${source_file}"
+ ${format_flag} -o "${generated_file}"
+ ${CCBIN}
+ ${nvcc_flags}
+ ${nvcc_host_compiler_flags}
+ ${CUDA_NVCC_FLAGS}
+ -DNVCC
+ ${CUDA_NVCC_INCLUDE_ARGS}
+ )
+
+if(CUDA_result)
+ # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+ cuda_execute_process(
+ "Removing ${generated_file}"
+ COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+ )
+ message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+ if(verbose)
+ message("Generated ${generated_file} successfully.")
+ endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+ # Run with -cubin to produce resource usage report.
+ cuda_execute_process(
+ "Generating ${generated_cubin_file}"
+ COMMAND "${CUDA_NVCC_EXECUTABLE}"
+ "${source_file}"
+ ${CUDA_NVCC_FLAGS}
+ ${nvcc_flags}
+ ${CCBIN}
+ ${nvcc_host_compiler_flags}
+ -DNVCC
+ -cubin
+ -o "${generated_cubin_file}"
+ ${CUDA_NVCC_INCLUDE_ARGS}
+ )
+
+ # Execute the parser script.
+ cuda_execute_process(
+ "Executing the parser script"
+ COMMAND "${CMAKE_COMMAND}"
+ -D "input_file:STRING=${generated_cubin_file}"
+ -P "${CUDA_parse_cubin}"
+ )
+
+endif()
diff --git a/stream_compaction/Project2-Stream-Compaction/cmake/FindPackageHandleStandardArgs.cmake b/stream_compaction/Project2-Stream-Compaction/cmake/FindPackageHandleStandardArgs.cmake
new file mode 100644
index 0000000..2de1fb3
--- /dev/null
+++ b/stream_compaction/Project2-Stream-Compaction/cmake/FindPackageHandleStandardArgs.cmake
@@ -0,0 +1,382 @@
+#.rst:
+# FindPackageHandleStandardArgs
+# -----------------------------
+#
+#
+#
+# FIND_PACKAGE_HANDLE_STANDARD_ARGS(<name> ... )
+#
+# This function is intended to be used in FindXXX.cmake modules files.
+# It handles the REQUIRED, QUIET and version-related arguments to
+# find_package(). It also sets the <packagename>_FOUND variable. The
+# package is considered found if all variables <var1> ... <varN> listed contain
+# valid results, e.g. valid filepaths.
+#
+# There are two modes of this function. The first argument in both
+# modes is the name of the Find-module where it is called (in original
+# casing).
+#
+# The first simple mode looks like this:
+#
+# ::
+#
+# FIND_PACKAGE_HANDLE_STANDARD_ARGS(<name>
+# (DEFAULT_MSG|"Custom failure message") <var1> ... <varN> )
+#
+# If the variables <var1> to <varN> are all valid, then
+# <UPPERCASED_NAME>_FOUND will be set to TRUE. If DEFAULT_MSG is given
+# as second argument, then the function will generate itself useful
+# success and error messages. You can also supply a custom error
+# message for the failure case. This is not recommended.
+#
+# The second mode is more powerful and also supports version checking:
+#
+# ::
+#
+# FIND_PACKAGE_HANDLE_STANDARD_ARGS(NAME
+# [FOUND_VAR <resultVar>]
+# [REQUIRED_VARS <var1> ... <varN>]
+# [VERSION_VAR <versionvar>]
+# [HANDLE_COMPONENTS]
+# [CONFIG_MODE]
+# [FAIL_MESSAGE "Custom failure message"] )
+#
+# In this mode, the name of the result-variable can be set to either
+# <name>_FOUND or <UPPERCASED_NAME>_FOUND using the
+# FOUND_VAR option. Other names for the result-variable are not
+# allowed. So for a Find-module named FindFooBar.cmake, the two
+# possible names are FooBar_FOUND and FOOBAR_FOUND. It is recommended
+# to use the original case version. If the FOUND_VAR option is not
+# used, the default is <UPPERCASED_NAME>_FOUND.
+#
+# As in the simple mode, if <var1> through <varN> are all valid,
+# <packagename>_FOUND will be set to TRUE. After REQUIRED_VARS the
+# variables which are required for this package are listed. Following
+# VERSION_VAR the name of the variable can be specified which holds the
+# version of the package which has been found. If this is done, this
+# version will be checked against the (potentially) specified required
+# version used in the find_package() call. The EXACT keyword is also
+# handled. The default messages include information about the required
+# version and the version which has been actually found, both if the
+# version is ok or not. If the package supports components, use the
+# HANDLE_COMPONENTS option to enable handling them. In this case,
+# find_package_handle_standard_args() will report which components have
+# been found and which are missing, and the <packagename>_FOUND variable
+# will be set to FALSE if any of the required components (i.e. not the
+# ones listed after OPTIONAL_COMPONENTS) are missing. Use the option
+# CONFIG_MODE if your FindXXX.cmake module is a wrapper for a
+# find_package(... NO_MODULE) call. In this case VERSION_VAR will be
+# set to <NAME>_VERSION and the macro will automatically check whether
+# the Config module was found. Via FAIL_MESSAGE a custom failure
+# message can be specified, if this is not used, the default message
+# will be displayed.
+#
+# Example for mode 1:
+#
+# ::
+#
+# find_package_handle_standard_args(LibXml2 DEFAULT_MSG
+# LIBXML2_LIBRARY LIBXML2_INCLUDE_DIR)
+#
+#
+#
+# LibXml2 is considered to be found, if both LIBXML2_LIBRARY and
+# LIBXML2_INCLUDE_DIR are valid. Then also LIBXML2_FOUND is set to
+# TRUE. If it is not found and REQUIRED was used, it fails with
+# FATAL_ERROR, independent whether QUIET was used or not. If it is
+# found, success will be reported, including the content of <var1>. On
+# repeated CMake runs, the same message won't be printed again.
+#
+# Example for mode 2:
+#
+# ::
+#
+# find_package_handle_standard_args(LibXslt
+# FOUND_VAR LibXslt_FOUND
+# REQUIRED_VARS LibXslt_LIBRARIES LibXslt_INCLUDE_DIRS
+# VERSION_VAR LibXslt_VERSION_STRING)
+#
+# In this case, LibXslt is considered to be found if the variable(s)
+# listed after REQUIRED_VARS are all valid, i.e. LibXslt_LIBRARIES and
+# LibXslt_INCLUDE_DIRS in this case. The result will then be stored in
+# LibXslt_FOUND . Also the version of LibXslt will be checked by using
+# the version contained in LibXslt_VERSION_STRING. Since no
+# FAIL_MESSAGE is given, the default messages will be printed.
+#
+# Another example for mode 2:
+#
+# ::
+#
+# find_package(Automoc4 QUIET NO_MODULE HINTS /opt/automoc4)
+# find_package_handle_standard_args(Automoc4 CONFIG_MODE)
+#
+# In this case, FindAutomoc4.cmake wraps a call to find_package(Automoc4
+# NO_MODULE) and adds an additional search directory for automoc4. Here
+# the result will be stored in AUTOMOC4_FOUND. The following
+# FIND_PACKAGE_HANDLE_STANDARD_ARGS() call produces a proper
+# success/error message.
+
+#=============================================================================
+# Copyright 2007-2009 Kitware, Inc.
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+# License text for the above reference.)
+
+include(${CMAKE_CURRENT_LIST_DIR}/FindPackageMessage.cmake)
+include(${CMAKE_CURRENT_LIST_DIR}/CMakeParseArguments.cmake)
+
+# internal helper macro
+macro(_FPHSA_FAILURE_MESSAGE _msg)
+ if (${_NAME}_FIND_REQUIRED)
+ message(FATAL_ERROR "${_msg}")
+ else ()
+ if (NOT ${_NAME}_FIND_QUIETLY)
+ message(STATUS "${_msg}")
+ endif ()
+ endif ()
+endmacro()
+
+
+# internal helper macro to generate the failure message when used in CONFIG_MODE:
+macro(_FPHSA_HANDLE_FAILURE_CONFIG_MODE)
+ # _CONFIG is set, but FOUND is false, this means that some other of the REQUIRED_VARS was not found:
+ if(${_NAME}_CONFIG)
+ _FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE}: missing: ${MISSING_VARS} (found ${${_NAME}_CONFIG} ${VERSION_MSG})")
+ else()
+ # If _CONSIDERED_CONFIGS is set, the config-file has been found, but no suitable version.
+ # List them all in the error message:
+ if(${_NAME}_CONSIDERED_CONFIGS)
+ set(configsText "")
+ list(LENGTH ${_NAME}_CONSIDERED_CONFIGS configsCount)
+ math(EXPR configsCount "${configsCount} - 1")
+ foreach(currentConfigIndex RANGE ${configsCount})
+ list(GET ${_NAME}_CONSIDERED_CONFIGS ${currentConfigIndex} filename)
+ list(GET ${_NAME}_CONSIDERED_VERSIONS ${currentConfigIndex} version)
+ set(configsText "${configsText} ${filename} (version ${version})\n")
+ endforeach()
+ if (${_NAME}_NOT_FOUND_MESSAGE)
+ set(configsText "${configsText} Reason given by package: ${${_NAME}_NOT_FOUND_MESSAGE}\n")
+ endif()
+ _FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE} ${VERSION_MSG}, checked the following files:\n${configsText}")
+
+ else()
+ # Simple case: No Config-file was found at all:
+ _FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE}: found neither ${_NAME}Config.cmake nor ${_NAME_LOWER}-config.cmake ${VERSION_MSG}")
+ endif()
+ endif()
+endmacro()
+
+
+function(FIND_PACKAGE_HANDLE_STANDARD_ARGS _NAME _FIRST_ARG)
+
+# set up the arguments for CMAKE_PARSE_ARGUMENTS and check whether we are in
+# new extended or in the "old" mode:
+ set(options CONFIG_MODE HANDLE_COMPONENTS)
+ set(oneValueArgs FAIL_MESSAGE VERSION_VAR FOUND_VAR)
+ set(multiValueArgs REQUIRED_VARS)
+ set(_KEYWORDS_FOR_EXTENDED_MODE ${options} ${oneValueArgs} ${multiValueArgs} )
+ list(FIND _KEYWORDS_FOR_EXTENDED_MODE "${_FIRST_ARG}" INDEX)
+
+ if(${INDEX} EQUAL -1)
+ set(FPHSA_FAIL_MESSAGE ${_FIRST_ARG})
+ set(FPHSA_REQUIRED_VARS ${ARGN})
+ set(FPHSA_VERSION_VAR)
+ else()
+
+ CMAKE_PARSE_ARGUMENTS(FPHSA "${options}" "${oneValueArgs}" "${multiValueArgs}" ${_FIRST_ARG} ${ARGN})
+
+ if(FPHSA_UNPARSED_ARGUMENTS)
+ message(FATAL_ERROR "Unknown keywords given to FIND_PACKAGE_HANDLE_STANDARD_ARGS(): \"${FPHSA_UNPARSED_ARGUMENTS}\"")
+ endif()
+
+ if(NOT FPHSA_FAIL_MESSAGE)
+ set(FPHSA_FAIL_MESSAGE "DEFAULT_MSG")
+ endif()
+ endif()
+
+# now that we collected all arguments, process them
+
+ if("x${FPHSA_FAIL_MESSAGE}" STREQUAL "xDEFAULT_MSG")
+ set(FPHSA_FAIL_MESSAGE "Could NOT find ${_NAME}")
+ endif()
+
+ # In config-mode, we rely on the variable _CONFIG, which is set by find_package()
+ # when it successfully found the config-file, including version checking:
+ if(FPHSA_CONFIG_MODE)
+ list(INSERT FPHSA_REQUIRED_VARS 0 ${_NAME}_CONFIG)
+ list(REMOVE_DUPLICATES FPHSA_REQUIRED_VARS)
+ set(FPHSA_VERSION_VAR ${_NAME}_VERSION)
+ endif()
+
+ if(NOT FPHSA_REQUIRED_VARS)
+ message(FATAL_ERROR "No REQUIRED_VARS specified for FIND_PACKAGE_HANDLE_STANDARD_ARGS()")
+ endif()
+
+ list(GET FPHSA_REQUIRED_VARS 0 _FIRST_REQUIRED_VAR)
+
+ string(TOUPPER ${_NAME} _NAME_UPPER)
+ string(TOLOWER ${_NAME} _NAME_LOWER)
+
+ if(FPHSA_FOUND_VAR)
+ if(FPHSA_FOUND_VAR MATCHES "^${_NAME}_FOUND$" OR FPHSA_FOUND_VAR MATCHES "^${_NAME_UPPER}_FOUND$")
+ set(_FOUND_VAR ${FPHSA_FOUND_VAR})
+ else()
+ message(FATAL_ERROR "The argument for FOUND_VAR is \"${FPHSA_FOUND_VAR}\", but only \"${_NAME}_FOUND\" and \"${_NAME_UPPER}_FOUND\" are valid names.")
+ endif()
+ else()
+ set(_FOUND_VAR ${_NAME_UPPER}_FOUND)
+ endif()
+
+ # collect all variables which were not found, so they can be printed, so the
+ # user knows better what went wrong (#6375)
+ set(MISSING_VARS "")
+ set(DETAILS "")
+ # check if all passed variables are valid
+ unset(${_FOUND_VAR})
+ foreach(_CURRENT_VAR ${FPHSA_REQUIRED_VARS})
+ if(NOT ${_CURRENT_VAR})
+ set(${_FOUND_VAR} FALSE)
+ set(MISSING_VARS "${MISSING_VARS} ${_CURRENT_VAR}")
+ else()
+ set(DETAILS "${DETAILS}[${${_CURRENT_VAR}}]")
+ endif()
+ endforeach()
+ if(NOT "${${_FOUND_VAR}}" STREQUAL "FALSE")
+ set(${_FOUND_VAR} TRUE)
+ endif()
+
+ # component handling
+ unset(FOUND_COMPONENTS_MSG)
+ unset(MISSING_COMPONENTS_MSG)
+
+ if(FPHSA_HANDLE_COMPONENTS)
+ foreach(comp ${${_NAME}_FIND_COMPONENTS})
+ if(${_NAME}_${comp}_FOUND)
+
+ if(NOT DEFINED FOUND_COMPONENTS_MSG)
+ set(FOUND_COMPONENTS_MSG "found components: ")
+ endif()
+ set(FOUND_COMPONENTS_MSG "${FOUND_COMPONENTS_MSG} ${comp}")
+
+ else()
+
+ if(NOT DEFINED MISSING_COMPONENTS_MSG)
+ set(MISSING_COMPONENTS_MSG "missing components: ")
+ endif()
+ set(MISSING_COMPONENTS_MSG "${MISSING_COMPONENTS_MSG} ${comp}")
+
+ if(${_NAME}_FIND_REQUIRED_${comp})
+ set(${_FOUND_VAR} FALSE)
+ set(MISSING_VARS "${MISSING_VARS} ${comp}")
+ endif()
+
+ endif()
+ endforeach()
+ set(COMPONENT_MSG "${FOUND_COMPONENTS_MSG} ${MISSING_COMPONENTS_MSG}")
+ set(DETAILS "${DETAILS}[c${COMPONENT_MSG}]")
+ endif()
+
+ # version handling:
+ set(VERSION_MSG "")
+ set(VERSION_OK TRUE)
+ set(VERSION ${${FPHSA_VERSION_VAR}})
+
+ # check with DEFINED here as the requested or found version may be "0"
+ if (DEFINED ${_NAME}_FIND_VERSION)
+ if(DEFINED ${FPHSA_VERSION_VAR})
+
+ if(${_NAME}_FIND_VERSION_EXACT) # exact version required
+ # count the dots in the version string
+ string(REGEX REPLACE "[^.]" "" _VERSION_DOTS "${VERSION}")
+ # add one dot because there is one dot more than there are components
+ string(LENGTH "${_VERSION_DOTS}." _VERSION_DOTS)
+ if (_VERSION_DOTS GREATER ${_NAME}_FIND_VERSION_COUNT)
+ # Because of the C++ implementation of find_package() ${_NAME}_FIND_VERSION_COUNT
+ # is at most 4 here. Therefore a simple lookup table is used.
+ if (${_NAME}_FIND_VERSION_COUNT EQUAL 1)
+ set(_VERSION_REGEX "[^.]*")
+ elseif (${_NAME}_FIND_VERSION_COUNT EQUAL 2)
+ set(_VERSION_REGEX "[^.]*\\.[^.]*")
+ elseif (${_NAME}_FIND_VERSION_COUNT EQUAL 3)
+ set(_VERSION_REGEX "[^.]*\\.[^.]*\\.[^.]*")
+ else ()
+ set(_VERSION_REGEX "[^.]*\\.[^.]*\\.[^.]*\\.[^.]*")
+ endif ()
+ string(REGEX REPLACE "^(${_VERSION_REGEX})\\..*" "\\1" _VERSION_HEAD "${VERSION}")
+ unset(_VERSION_REGEX)
+ if (NOT ${_NAME}_FIND_VERSION VERSION_EQUAL _VERSION_HEAD)
+ set(VERSION_MSG "Found unsuitable version \"${VERSION}\", but required is exact version \"${${_NAME}_FIND_VERSION}\"")
+ set(VERSION_OK FALSE)
+ else ()
+ set(VERSION_MSG "(found suitable exact version \"${VERSION}\")")
+ endif ()
+ unset(_VERSION_HEAD)
+ else ()
+ if (NOT ${_NAME}_FIND_VERSION VERSION_EQUAL VERSION)
+ set(VERSION_MSG "Found unsuitable version \"${VERSION}\", but required is exact version \"${${_NAME}_FIND_VERSION}\"")
+ set(VERSION_OK FALSE)
+ else ()
+ set(VERSION_MSG "(found suitable exact version \"${VERSION}\")")
+ endif ()
+ endif ()
+ unset(_VERSION_DOTS)
+
+ else() # minimum version specified:
+ if (${_NAME}_FIND_VERSION VERSION_GREATER VERSION)
+ set(VERSION_MSG "Found unsuitable version \"${VERSION}\", but required is at least \"${${_NAME}_FIND_VERSION}\"")
+ set(VERSION_OK FALSE)
+ else ()
+ set(VERSION_MSG "(found suitable version \"${VERSION}\", minimum required is \"${${_NAME}_FIND_VERSION}\")")
+ endif ()
+ endif()
+
+ else()
+
+ # if the package was not found, but a version was given, add that to the output:
+ if(${_NAME}_FIND_VERSION_EXACT)
+ set(VERSION_MSG "(Required is exact version \"${${_NAME}_FIND_VERSION}\")")
+ else()
+ set(VERSION_MSG "(Required is at least version \"${${_NAME}_FIND_VERSION}\")")
+ endif()
+
+ endif()
+ else ()
+ if(VERSION)
+ set(VERSION_MSG "(found version \"${VERSION}\")")
+ endif()
+ endif ()
+
+ if(VERSION_OK)
+ set(DETAILS "${DETAILS}[v${VERSION}(${${_NAME}_FIND_VERSION})]")
+ else()
+ set(${_FOUND_VAR} FALSE)
+ endif()
+
+
+ # print the result:
+ if (${_FOUND_VAR})
+ FIND_PACKAGE_MESSAGE(${_NAME} "Found ${_NAME}: ${${_FIRST_REQUIRED_VAR}} ${VERSION_MSG} ${COMPONENT_MSG}" "${DETAILS}")
+ else ()
+
+ if(FPHSA_CONFIG_MODE)
+ _FPHSA_HANDLE_FAILURE_CONFIG_MODE()
+ else()
+ if(NOT VERSION_OK)
+ _FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE}: ${VERSION_MSG} (found ${${_FIRST_REQUIRED_VAR}})")
+ else()
+ _FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE} (missing: ${MISSING_VARS}) ${VERSION_MSG}")
+ endif()
+ endif()
+
+ endif ()
+
+ set(${_FOUND_VAR} ${${_FOUND_VAR}} PARENT_SCOPE)
+
+endfunction()
diff --git a/stream_compaction/Project2-Stream-Compaction/cmake/FindPackageMessage.cmake b/stream_compaction/Project2-Stream-Compaction/cmake/FindPackageMessage.cmake
new file mode 100644
index 0000000..a0349d3
--- /dev/null
+++ b/stream_compaction/Project2-Stream-Compaction/cmake/FindPackageMessage.cmake
@@ -0,0 +1,57 @@
+#.rst:
+# FindPackageMessage
+# ------------------
+#
+#
+#
+# FIND_PACKAGE_MESSAGE(<name> "message for user" "find result details")
+#
+# This macro is intended to be used in FindXXX.cmake modules files. It
+# will print a message once for each unique find result. This is useful
+# for telling the user where a package was found. The first argument
+# specifies the name (XXX) of the package. The second argument
+# specifies the message to display. The third argument lists details
+# about the find result so that if they change the message will be
+# displayed again. The macro also obeys the QUIET argument to the
+# find_package command.
+#
+# Example:
+#
+# ::
+#
+# if(X11_FOUND)
+# FIND_PACKAGE_MESSAGE(X11 "Found X11: ${X11_X11_LIB}"
+# "[${X11_X11_LIB}][${X11_INCLUDE_DIR}]")
+# else()
+# ...
+# endif()
+
+#=============================================================================
+# Copyright 2008-2009 Kitware, Inc.
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+# License text for the above reference.)
+
+function(FIND_PACKAGE_MESSAGE pkg msg details)
+ # Avoid printing a message repeatedly for the same find result.
+ if(NOT ${pkg}_FIND_QUIETLY)
+ string(REPLACE "\n" "" details "${details}")
+ set(DETAILS_VAR FIND_PACKAGE_MESSAGE_DETAILS_${pkg})
+ if(NOT "${details}" STREQUAL "${${DETAILS_VAR}}")
+ # The message has not yet been printed.
+ message(STATUS "${msg}")
+
+ # Save the find details in the cache to avoid printing the same
+ # message again.
+ set("${DETAILS_VAR}" "${details}"
+ CACHE INTERNAL "Details about finding ${pkg}")
+ endif()
+ endif()
+endfunction()
diff --git a/stream_compaction/Project2-Stream-Compaction/img/BlockSize_vs_Runtime.png b/stream_compaction/Project2-Stream-Compaction/img/BlockSize_vs_Runtime.png
new file mode 100644
index 0000000..bbd12d9
Binary files /dev/null and b/stream_compaction/Project2-Stream-Compaction/img/BlockSize_vs_Runtime.png differ
diff --git a/stream_compaction/Project2-Stream-Compaction/img/BlockSz-128-DataSz-20.PNG b/stream_compaction/Project2-Stream-Compaction/img/BlockSz-128-DataSz-20.PNG
new file mode 100644
index 0000000..c980577
Binary files /dev/null and b/stream_compaction/Project2-Stream-Compaction/img/BlockSz-128-DataSz-20.PNG differ
diff --git a/stream_compaction/Project2-Stream-Compaction/img/DownSweepScan.png b/stream_compaction/Project2-Stream-Compaction/img/DownSweepScan.png
new file mode 100644
index 0000000..a8ecb33
Binary files /dev/null and b/stream_compaction/Project2-Stream-Compaction/img/DownSweepScan.png differ
diff --git a/stream_compaction/Project2-Stream-Compaction/img/NaiveScan.png b/stream_compaction/Project2-Stream-Compaction/img/NaiveScan.png
new file mode 100644
index 0000000..55f53ca
Binary files /dev/null and b/stream_compaction/Project2-Stream-Compaction/img/NaiveScan.png differ
diff --git a/stream_compaction/Project2-Stream-Compaction/img/SC1.png b/stream_compaction/Project2-Stream-Compaction/img/SC1.png
new file mode 100644
index 0000000..b73a929
Binary files /dev/null and b/stream_compaction/Project2-Stream-Compaction/img/SC1.png differ
diff --git a/stream_compaction/Project2-Stream-Compaction/img/SC2.png b/stream_compaction/Project2-Stream-Compaction/img/SC2.png
new file mode 100644
index 0000000..cdb2eab
Binary files /dev/null and b/stream_compaction/Project2-Stream-Compaction/img/SC2.png differ
diff --git a/stream_compaction/Project2-Stream-Compaction/img/Scan1.png b/stream_compaction/Project2-Stream-Compaction/img/Scan1.png
new file mode 100644
index 0000000..f0d07ad
Binary files /dev/null and b/stream_compaction/Project2-Stream-Compaction/img/Scan1.png differ
diff --git a/stream_compaction/Project2-Stream-Compaction/img/Scan1NP.png b/stream_compaction/Project2-Stream-Compaction/img/Scan1NP.png
new file mode 100644
index 0000000..07d3ab6
Binary files /dev/null and b/stream_compaction/Project2-Stream-Compaction/img/Scan1NP.png differ
diff --git a/stream_compaction/Project2-Stream-Compaction/img/Scan2.png b/stream_compaction/Project2-Stream-Compaction/img/Scan2.png
new file mode 100644
index 0000000..f010255
Binary files /dev/null and b/stream_compaction/Project2-Stream-Compaction/img/Scan2.png differ
diff --git a/stream_compaction/Project2-Stream-Compaction/img/Scan2NP.png b/stream_compaction/Project2-Stream-Compaction/img/Scan2NP.png
new file mode 100644
index 0000000..e663e7d
Binary files /dev/null and b/stream_compaction/Project2-Stream-Compaction/img/Scan2NP.png differ
diff --git a/stream_compaction/Project2-Stream-Compaction/img/UpSweepScan.png b/stream_compaction/Project2-Stream-Compaction/img/UpSweepScan.png
new file mode 100644
index 0000000..7ca9b14
Binary files /dev/null and b/stream_compaction/Project2-Stream-Compaction/img/UpSweepScan.png differ
diff --git a/stream_compaction/Project2-Stream-Compaction/img/example-1.png b/stream_compaction/Project2-Stream-Compaction/img/example-1.png
new file mode 100644
index 0000000..28633a6
Binary files /dev/null and b/stream_compaction/Project2-Stream-Compaction/img/example-1.png differ
diff --git a/stream_compaction/Project2-Stream-Compaction/img/example-2.jpg b/stream_compaction/Project2-Stream-Compaction/img/example-2.jpg
new file mode 100644
index 0000000..984c2fd
Binary files /dev/null and b/stream_compaction/Project2-Stream-Compaction/img/example-2.jpg differ
diff --git a/stream_compaction/Project2-Stream-Compaction/img/figure-39-2.jpg b/stream_compaction/Project2-Stream-Compaction/img/figure-39-2.jpg
new file mode 100644
index 0000000..bc9f9da
Binary files /dev/null and b/stream_compaction/Project2-Stream-Compaction/img/figure-39-2.jpg differ
diff --git a/stream_compaction/Project2-Stream-Compaction/img/figure-39-4.jpg b/stream_compaction/Project2-Stream-Compaction/img/figure-39-4.jpg
new file mode 100644
index 0000000..5888f20
Binary files /dev/null and b/stream_compaction/Project2-Stream-Compaction/img/figure-39-4.jpg differ
diff --git a/stream_compaction/Project2-Stream-Compaction/src/main.cpp b/stream_compaction/Project2-Stream-Compaction/src/main.cpp
new file mode 100644
index 0000000..9da124f
--- /dev/null
+++ b/stream_compaction/Project2-Stream-Compaction/src/main.cpp
@@ -0,0 +1,227 @@
+/**
+ * @file main.cpp
+ * @brief Stream compaction test program
+ * @authors Kai Ninomiya
+ * @date 2015
+ * @copyright University of Pennsylvania
+ */
+
+#include <cstdio>
+#include <stream_compaction/cpu.h>
+#include <stream_compaction/naive.h>
+#include <stream_compaction/efficient.h>
+#include <stream_compaction/thrust.h>
+#include "testing_helpers.hpp"
+
+#include <iostream>
+#include <fstream>
+using namespace std;
+
+//const int SIZE = 1 << 20; // feel free to change the size of array
+//const int NPOT = SIZE - 3; // Non-Power-Of-Two
+//int *a = new int[SIZE];
+//int *b = new int[SIZE];
+//int *c = new int[SIZE];
+
+int SIZE;
+int NPOT;
+int *a;
+int *b;
+int *c;
+
+
+int main(int argc, char* argv[]) {
+ // Scan tests
+
+ printf("\n");
+ printf("****************\n");
+ printf("** SCAN TESTS **\n");
+ printf("****************\n");
+
+ //ofstream outputFile1("Naive_Scan.txt");
+ //ofstream outputFile11("Naive_Scan_NP.txt");
+ //ofstream outputFile2("WorkEff_Scan.txt");
+ //ofstream outputFile22("WorkEff_Scan_NP.txt");
+
+
+ for (int sz = 20; sz < 21; sz++) {
+
+ //int SIZE = 1 << 20; // feel free to change the size of array
+ //SIZE = 1 << sz;
+
+ SIZE = 1<<25;
+
+
+ NPOT = SIZE - 3; // Non-Power-Of-Two
+ a = new int[SIZE];
+ b = new int[SIZE];
+ c = new int[SIZE];
+
+ genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case
+ a[SIZE - 1] = 0;
+ printArray(SIZE, a, true);
+
+ // CPU Scans ==================================================================================================
+
+ // initialize b using StreamCompaction::CPU::scan you implement
+ // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct.
+ // At first all cases passed because b && c are all zeroes.
+
+ zeroArray(SIZE, b);
+ printDesc("cpu scan, power-of-two");
+ StreamCompaction::CPU::scan(SIZE, b, a);
+ printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+ //outputFile1 << sz << " | size " << SIZE << " | " << StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation() << " " << "(std::chrono Measured)" << endl;
+ printArray(SIZE, b, true);
+
+ zeroArray(SIZE, c);
+ printDesc("cpu scan, non-power-of-two");
+ StreamCompaction::CPU::scan(NPOT, c, a);
+ printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+ //outputFile2 << sz << " | size " << SIZE << " | " << StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation() << " " << "(std::chrono Measured)" << endl;
+ printArray(NPOT, b, true);
+ printCmpResult(NPOT, b, c);
+
+
+ // GPU naive Scan ===========================================================================================
+
+ zeroArray(SIZE, c);
+ printDesc("GPU naive scan, power-of-two");
+ StreamCompaction::Naive::scan(SIZE, c, a);
+ printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ //outputFile1 << sz << " | size " << SIZE << " StreamCompaction::Naive::scan Poweof2 " << SIZE << " | " << StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation() << " (CUDA Measured)" << endl;
+ //printArray(SIZE, c, true);
+ printCmpResult(SIZE, b, c);
+
+ // For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
+ //onesArray(SIZE, c);
+ //printDesc("1s array for finding bugs");
+ //StreamCompaction::Naive::scan(SIZE, c, a);
+ //printArray(SIZE, c, true);
+
+ zeroArray(SIZE, c);
+ printDesc("GPU naive scan, non-power-of-two");
+ StreamCompaction::Naive::scan(NPOT, c, a);
+ printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ //outputFile11 << sz << " | size " << SIZE << " StreamCompaction::Naive::scan NonPoweof2 " << SIZE << " | " << StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation() << " (CUDA Measured)" << endl;
+ //printArray(SIZE, c, true);
+ printCmpResult(NPOT, b, c);
+
+ // GPU Work Eff Scan ===========================================================================================
+
+ zeroArray(SIZE, c);
+ printDesc("work-efficient scan, power-of-two");
+ StreamCompaction::Efficient::scan(SIZE, c, a);
+ printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ //outputFile2 << sz << " | size " << SIZE <<" StreamCompaction::Efficient::scan Poweof2 " << SIZE << " | " << StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation() << " (CUDA Measured)" << endl;
+ printCmpResult(SIZE, b, c);
+
+ zeroArray(SIZE, c);
+ printDesc("work-efficient scan, non-power-of-two");
+ StreamCompaction::Efficient::scan(NPOT, c, a);
+ printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ //outputFile22<< sz << " | size " << SIZE << " StreamCompaction::Efficient::scan NonPoweof2 " << SIZE << " | " << StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation() << " (CUDA Measured)" << endl;
+ //printArray(NPOT, c, true);
+ printCmpResult(NPOT, b, c);
+
+
+ // GPU Thrust Scan ===========================================================================================
+
+ zeroArray(SIZE, c);
+ printDesc("thrust scan, power-of-two");
+ StreamCompaction::Thrust::scan(SIZE, c, a);
+ printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ //outputFile1 << sz << " | size " << SIZE << " | " << StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation() << " " << "(CUDA Measured)" << endl;
+ //printArray(SIZE, c, true);
+ printCmpResult(SIZE, b, c);
+
+ zeroArray(SIZE, c);
+ printDesc("thrust scan, non-power-of-two");
+ StreamCompaction::Thrust::scan(NPOT, c, a);
+ printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ //outputFile2 << sz << " | size " << SIZE << " | " << StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation() << " " << "(CUDA Measured)" << endl;
+ //printArray(NPOT, c, true);
+ printCmpResult(NPOT, b, c);
+
+ }
+
+ printf("\n");
+ printf("*****************************\n");
+
+ printf("** STREAM COMPACTION TESTS **\n");
+ printf("*****************************\n");
+
+ //ofstream outputFile1("SC_CPU.txt");
+ //ofstream outputFile11("SC_CPU_NP.txt");
+ //ofstream outputFile2("SC_CPU_withScan.txt");
+ //ofstream outputFile3("SC_WorkEff.txt");
+ //ofstream outputFile33("SC_WorkEff_NP.txt");
+
+ for (int sz = 20; sz < 21; sz++) {
+
+ //int SIZE = 1 << 20; // feel free to change the size of array
+ SIZE = 1 << 25;
+
+ NPOT = SIZE - 3; // Non-Power-Of-Two
+ a = new int[SIZE];
+ b = new int[SIZE];
+ c = new int[SIZE];
+
+ // Compaction tests
+
+ genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case
+ a[SIZE - 1] = 0;
+ printArray(SIZE, a, true);
+
+ int count, expectedCount, expectedNPOT;
+
+ // initialize b using StreamCompaction::CPU::compactWithoutScan you implement
+ // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct.
+ zeroArray(SIZE, b);
+ printDesc("cpu compact without scan, power-of-two");
+ count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
+ printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+ //outputFile1 << sz << " " << SIZE << " " << StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation() << " " << "(std::chrono Measured)" << endl;
+ expectedCount = count;
+ printArray(count, b, true);
+ printCmpLenResult(count, expectedCount, b, b);
+
+ zeroArray(SIZE, c);
+ printDesc("cpu compact without scan, non-power-of-two");
+ count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
+ printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+ //outputFile11 << sz << " " << SIZE << " " << StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation() << " " << "(std::chrono Measured)" << endl;
+ expectedNPOT = count;
+ printArray(count, c, true);
+ printCmpLenResult(count, expectedNPOT, b, c);
+
+ zeroArray(SIZE, c);
+ printDesc("cpu compact with scan");
+ count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
+ printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+ //outputFile2 << sz << " " << SIZE << " " << StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation() << " " << "(std::chrono Measured)" << endl;
+ printArray(count, c, true);
+ printCmpLenResult(count, expectedCount, b, c);
+
+ zeroArray(SIZE, c);
+ printDesc("work-efficient compact, power-of-two");
+ count = StreamCompaction::Efficient::compact(SIZE, c, a);
+ printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ //outputFile3 << sz << " " << SIZE << " " << StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation() << " " << "(CUDA Measured)" << endl;
+ //printArray(count, c, true);
+ printCmpLenResult(count, expectedCount, b, c);
+
+ zeroArray(SIZE, c);
+ printDesc("work-efficient compact, non-power-of-two");
+ count = StreamCompaction::Efficient::compact(NPOT, c, a);
+ printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ //outputFile33 << sz << " " << SIZE << " " << StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation() << " " << "(CUDA Measured)" << endl;
+ //printArray(count, c, true);
+ printCmpLenResult(count, expectedNPOT, b, c);
+ }
+
+ system("pause"); // stop Win32 console from closing on exit
+ delete[] a;
+ delete[] b;
+ delete[] c;
+}
diff --git a/stream_compaction/Project2-Stream-Compaction/src/testing_helpers.hpp b/stream_compaction/Project2-Stream-Compaction/src/testing_helpers.hpp
new file mode 100644
index 0000000..daa629f
--- /dev/null
+++ b/stream_compaction/Project2-Stream-Compaction/src/testing_helpers.hpp
@@ -0,0 +1,77 @@
+#pragma once
+
+#include <cstdio>
+#include <cstdlib>
+#include <ctime>
+#include <string>
+#include <iostream>
+
+template<typename T>
+int cmpArrays(int n, T *a, T *b) {
+ for (int i = 0; i < n; i++) {
+ if (a[i] != b[i]) {
+ printf(" a[%d] = %d, b[%d] = %d\n", i, a[i], i, b[i]);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+void printDesc(const char *desc) {
+ printf("==== %s ====\n", desc);
+}
+
+template<typename T>
+void printCmpResult(int n, T *a, T *b) {
+ printf(" %s \n",
+ cmpArrays(n, a, b) ? "FAIL VALUE" : "passed");
+}
+
+template<typename T>
+void printCmpLenResult(int n, int expN, T *a, T *b) {
+ if (n != expN) {
+ printf(" expected %d elements, got %d\n", expN, n);
+ }
+ printf(" %s \n",
+ (n == -1 || n != expN) ? "FAIL COUNT" :
+ cmpArrays(n, a, b) ? "FAIL VALUE" : "passed");
+}
+
+void zeroArray(int n, int *a) {
+ for (int i = 0; i < n; i++) {
+ a[i] = 0;
+ }
+}
+
+void onesArray(int n, int *a) {
+ for (int i = 0; i < n; i++) {
+ a[i] = 1;
+ }
+}
+
+void genArray(int n, int *a, int maxval) {
+ srand(time(nullptr));
+
+ for (int i = 0; i < n; i++) {
+ a[i] = rand() % maxval;
+ }
+}
+
+void printArray(int n, int *a, bool abridged = false) {
+ printf(" [ ");
+ for (int i = 0; i < n; i++) {
+ if (abridged && i + 2 == 15 && n > 16) {
+ i = n - 2;
+ printf("... ");
+ }
+ printf("%3d ", a[i]);
+ }
+ printf("]\n");
+}
+
+template<typename T>
+void printElapsedTime(T time, std::string note = "")
+{
+ std::cout << " elapsed time: " << time << "ms " << note << std::endl;
+}
+
diff --git a/stream_compaction/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt b/stream_compaction/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt
new file mode 100644
index 0000000..4bb0dc2
--- /dev/null
+++ b/stream_compaction/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt
@@ -0,0 +1,17 @@
+set(SOURCE_FILES
+ "common.h"
+ "common.cu"
+ "cpu.h"
+ "cpu.cu"
+ "naive.h"
+ "naive.cu"
+ "efficient.h"
+ "efficient.cu"
+ "thrust.h"
+ "thrust.cu"
+ )
+
+cuda_add_library(stream_compaction
+ ${SOURCE_FILES}
+ OPTIONS -arch=sm_61
+ )
diff --git a/stream_compaction/Project2-Stream-Compaction/stream_compaction/common.cu b/stream_compaction/Project2-Stream-Compaction/stream_compaction/common.cu
new file mode 100644
index 0000000..ded83d0
--- /dev/null
+++ b/stream_compaction/Project2-Stream-Compaction/stream_compaction/common.cu
@@ -0,0 +1,54 @@
+#include "common.h"
+
+void checkCUDAErrorFn(const char *msg, const char *file, int line) {
+ cudaError_t err = cudaGetLastError();
+ if (cudaSuccess == err) {
+ return;
+ }
+
+ fprintf(stderr, "CUDA error");
+ if (file) {
+ fprintf(stderr, " (%s:%d)", file, line);
+ }
+ fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
+ exit(EXIT_FAILURE);
+}
+
+
+namespace StreamCompaction {
+ namespace Common {
+
+ /**
+ * Maps an array to an array of 0s and 1s for stream compaction. Elements
+ * which map to 0 will be removed, and elements which map to 1 will be kept.
+ */
+ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
+ // TODO
+ int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (tid >= n) return;
+
+ if(idata[tid]!=0){
+ bools[tid] = 1;
+ }
+ else {
+ bools[tid] = 0;
+ }
+ }
+
+ /**
+ * Performs scatter on an array. That is, for each element in idata,
+ * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
+ */
+ __global__ void kernScatter(int n, int *odata,
+ const int *idata, const int *bools, const int *indices) {
+ // TODO
+ int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (tid >= n) return;
+
+ if (bools[tid] == 1) {
+ odata[indices[tid]] = idata[tid];
+ }
+ }
+
+ }
+}
diff --git a/stream_compaction/Project2-Stream-Compaction/stream_compaction/common.h b/stream_compaction/Project2-Stream-Compaction/stream_compaction/common.h
new file mode 100644
index 0000000..52eccf1
--- /dev/null
+++ b/stream_compaction/Project2-Stream-Compaction/stream_compaction/common.h
@@ -0,0 +1,133 @@
+#pragma once
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <cstdio>
+#include <cstring>
+#include <cmath>
+#include <algorithm>
+#include <chrono>
+#include <stdexcept>
+
+#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
+#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
+#define blockSize 1024
+
+/**
+ * Check for CUDA errors; print and exit if there was a problem.
+ */
+void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1);
+
+inline int ilog2(int x) {
+ int lg = 0;
+ while (x >>= 1) {
+ ++lg;
+ }
+ return lg;
+}
+
+inline int ilog2ceil(int x) {
+ return x == 1 ? 0 : ilog2(x - 1) + 1;
+}
+
+namespace StreamCompaction {
+ namespace Common {
+ __global__ void kernMapToBoolean(int n, int *bools, const int *idata);
+
+ __global__ void kernScatter(int n, int *odata,
+ const int *idata, const int *bools, const int *indices);
+
+ /**
+ * This class is used for timing the performance
+ * Uncopyable and unmovable
+ *
+ * Adapted from WindyDarian(https://github.com/WindyDarian)
+ */
+ class PerformanceTimer
+ {
+ public:
+ PerformanceTimer()
+ {
+ cudaEventCreate(&event_start);
+ cudaEventCreate(&event_end);
+ }
+
+ ~PerformanceTimer()
+ {
+ cudaEventDestroy(event_start);
+ cudaEventDestroy(event_end);
+ }
+
+ void startCpuTimer()
+ {
+ if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); }
+ cpu_timer_started = true;
+
+ time_start_cpu = std::chrono::high_resolution_clock::now();
+ }
+
+ void endCpuTimer()
+ {
+ time_end_cpu = std::chrono::high_resolution_clock::now();
+
+ if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); }
+
+            std::chrono::duration<double, std::milli> duro = time_end_cpu - time_start_cpu;
+            prev_elapsed_time_cpu_milliseconds =
+                static_cast<float>(duro.count());
+
+ cpu_timer_started = false;
+ }
+
+ void startGpuTimer()
+ {
+ if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); }
+ gpu_timer_started = true;
+
+ cudaEventRecord(event_start);
+ }
+
+ void endGpuTimer()
+ {
+ cudaEventRecord(event_end);
+ cudaEventSynchronize(event_end);
+
+ if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); }
+
+ cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end);
+ gpu_timer_started = false;
+ }
+
+ float getCpuElapsedTimeForPreviousOperation() //noexcept //(damn I need VS 2015
+ {
+ return prev_elapsed_time_cpu_milliseconds;
+ }
+
+ float getGpuElapsedTimeForPreviousOperation() //noexcept
+ {
+ return prev_elapsed_time_gpu_milliseconds;
+ }
+
+ // remove copy and move functions
+ PerformanceTimer(const PerformanceTimer&) = delete;
+ PerformanceTimer(PerformanceTimer&&) = delete;
+ PerformanceTimer& operator=(const PerformanceTimer&) = delete;
+ PerformanceTimer& operator=(PerformanceTimer&&) = delete;
+
+ private:
+ cudaEvent_t event_start = nullptr;
+ cudaEvent_t event_end = nullptr;
+
+ using time_point_t = std::chrono::high_resolution_clock::time_point;
+ time_point_t time_start_cpu;
+ time_point_t time_end_cpu;
+
+ bool cpu_timer_started = false;
+ bool gpu_timer_started = false;
+
+ float prev_elapsed_time_cpu_milliseconds = 0.f;
+ float prev_elapsed_time_gpu_milliseconds = 0.f;
+ };
+ }
+}
diff --git a/stream_compaction/Project2-Stream-Compaction/stream_compaction/cpu.cu b/stream_compaction/Project2-Stream-Compaction/stream_compaction/cpu.cu
new file mode 100644
index 0000000..a5c30b1
--- /dev/null
+++ b/stream_compaction/Project2-Stream-Compaction/stream_compaction/cpu.cu
@@ -0,0 +1,101 @@
+#include <cstdio>
+#include "cpu.h"
+
+#include "common.h"
+
+namespace StreamCompaction {
+ namespace CPU {
+ using StreamCompaction::Common::PerformanceTimer;
+ PerformanceTimer& timer()
+ {
+ static PerformanceTimer timer;
+ return timer;
+ }
+
+ /**
+ * CPU scan (prefix sum).
+ * For performance analysis, this is supposed to be a simple for loop.
+ * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
+ */
+ void scan(int n, int *odata, const int *idata) {
+ bool tmp=true;
+ try {
+ timer().startCpuTimer();
+ }
+ catch (const std::runtime_error& e) {
+ tmp = false;
+ }
+
+ // TODO
+ if (n > 0) {
+ odata[0] = 0;
+ for (int i = 0; i < n-1; i++) {
+ odata[i+1] = idata[i] + odata[i];
+ }
+ }
+            if (tmp) { timer().endCpuTimer(); }
+ }
+
+ /**
+ * CPU stream compaction without using the scan function.
+ *
+ * @returns the number of elements remaining after compaction.
+ */
+ int compactWithoutScan(int n, int *odata, const int *idata) {
+ // TODO
+ if (n > 0) {
+ timer().startCpuTimer();
+ int counter = 0;
+ for (int i = 0; i < n; i++) {
+ if (idata[i] != 0) {
+ odata[counter] = idata[i];
+ counter+=1;
+ }
+ }
+ timer().endCpuTimer();
+ return counter;
+ }
+ return -1;
+ }
+
+ /**
+ * CPU stream compaction using scan and scatter, like the parallel version.
+ *
+ * @returns the number of elements remaining after compaction.
+ */
+ int compactWithScan(int n, int *odata, const int *idata) {
+ // TODO
+ if (n > 0) {
+ timer().startCpuTimer();
+
+ int * indicator = new int[n];
+ int * scanIndex = new int[n];
+ int tmp = 0;
+
+ // Compute indicator array
+ for (int i = 0; i < n; i++) {
+ if (idata[i] != 0) {
+ indicator[i] = 1;
+ }
+ else {
+ indicator[i] = 0;
+ }
+ }
+
+ // Compute scan
+ scan(n, scanIndex, indicator);
+
+ //Scatter
+ for (int i = 0; i < n; i++) {
+ if (indicator[i] == 1) {
+ odata[scanIndex[i]] = idata[i];
+ tmp = scanIndex[i];
+ }
+ }
+                timer().endCpuTimer();
+
+                delete[] indicator;
+                delete[] scanIndex;
+
+                return tmp + 1;
+ }
+ return -1;
+ }
+ }
+}
diff --git a/stream_compaction/Project2-Stream-Compaction/stream_compaction/cpu.h b/stream_compaction/Project2-Stream-Compaction/stream_compaction/cpu.h
new file mode 100644
index 0000000..236ce11
--- /dev/null
+++ b/stream_compaction/Project2-Stream-Compaction/stream_compaction/cpu.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include "common.h"
+
+namespace StreamCompaction {
+ namespace CPU {
+ StreamCompaction::Common::PerformanceTimer& timer();
+
+ void scan(int n, int *odata, const int *idata);
+
+ int compactWithoutScan(int n, int *odata, const int *idata);
+
+ int compactWithScan(int n, int *odata, const int *idata);
+ }
+}
diff --git a/stream_compaction/Project2-Stream-Compaction/stream_compaction/efficient.cu b/stream_compaction/Project2-Stream-Compaction/stream_compaction/efficient.cu
new file mode 100644
index 0000000..4679a31
--- /dev/null
+++ b/stream_compaction/Project2-Stream-Compaction/stream_compaction/efficient.cu
@@ -0,0 +1,500 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "common.h"
+#include "efficient.h"
+
+namespace StreamCompaction {
+ namespace Efficient {
+ using StreamCompaction::Common::PerformanceTimer;
+ PerformanceTimer& timer()
+ {
+ static PerformanceTimer timer;
+ return timer;
+ }
+
+ int *dev_arrayA;
+ int *dev_arrayB;
+
+ int *dev_bools;
+ int *dev_boolScans;
+
+ int *dev_idata;
+ int *dev_odata;
+
+ int * dev_indices;
+
+ int *dev_lastElements;
+ int *dev_lastElements2;
+
+ void printArray(int n, const int *a, bool abridged = false) {
+ printf(" [ ");
+ for (int i = 0; i < n; i++) {
+ if (abridged && i + 2 == 15 && n > 16) {
+ i = n - 2;
+ printf("... ");
+ }
+ printf("%3d ", a[i]);
+ }
+ printf("]\n");
+ }
+
+        __global__ void kernEffScanUpSweep(int N, int pow2d, int pow2d1, int* arrA) {
+            int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (k >= N) return;
+
+            if ((k % pow2d1) == 0 && (k + pow2d1 - 1) < N && (k + pow2d - 1) < N) {
+                arrA[k + pow2d1 - 1] += arrA[k + pow2d - 1];
+            }
+        }
+
+        __global__ void kernEffScanDownSweep(int N, int pow2d, int pow2d1, int* arrA) {
+            int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (k >= N) return;
+
+            int tmp = 0;
+
+            if ((k % pow2d1) == 0 && (k + pow2d1 - 1) < N && (k + pow2d - 1) < N) {
+                tmp = arrA[k + pow2d - 1];
+                arrA[k + pow2d - 1] = arrA[k + pow2d1 - 1];
+                arrA[k + pow2d1 - 1] += tmp;
+            }
+        }
+
+ __global__ void kernInitZero(int N, int* array) {
+
+ int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+ if (tid < N) {
+ array[tid] = 0;
+ }
+ }
+
+ __global__ void kernScanShared(int n, int * g_odata, int * g_idata) {
+
+ extern __shared__ int temp[]; // allocated on invocation
+
+ int thid = threadIdx.x;
+ int tid_read = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+ if (tid_read >= n) return;
+
+ int offset = 1;
+
+ temp[2 * thid] = g_idata[2 * tid_read]; // load input into shared memory
+ temp[2 * thid + 1] = g_idata[2 * tid_read + 1];
+
+ // build sum in place up the tree
+ for (int d = 2*blockDim.x >> 1; d > 0; d >>= 1)
+ {
+ __syncthreads();
+
+ if (thid < d)
+ {
+ int ai = offset * (2 * thid + 1) - 1;
+ int bi = offset * (2 * thid + 2) - 1;
+
+ temp[bi] += temp[ai];
+ }
+ offset *= 2;
+ }
+
+ if (thid == 0) { temp[2 * blockDim.x - 1] = 0; } // clear the last element
+
+ for (int d = 1; d < 2 * blockDim.x; d *= 2) // traverse down tree & build scan
+ {
+ offset >>= 1;
+ __syncthreads();
+
+ if (thid < d)
+ {
+ int ai = offset * (2 * thid + 1) - 1;
+ int bi = offset * (2 * thid + 2) - 1;
+
+ int t = temp[ai];
+ temp[ai] = temp[bi];
+ temp[bi] += t;
+
+ }
+ }
+
+ __syncthreads();
+
+ g_odata[2 * tid_read] = temp[2 * thid]; // write results to device memory
+ g_odata[2 * tid_read + 1] = temp[2 * thid + 1];
+ }
+
+ __global__ void kernGetLastElement(int n, int* s_data, int * g_odata, int * g_idata) {
+ int thid = threadIdx.x;
+
+ int tid_global = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+ if (tid_global >= n) return;
+
+ if (thid == blockDim.x - 1) {
+ s_data[blockIdx.x] = g_odata[tid_global] +g_idata[tid_global];
+ }
+ }
+
+ __global__ void kernUpdateScan(int n, int* s_data, int * g_odata, int * g_idata) {
+ int thid = threadIdx.x;
+ int tid_global = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+ if (tid_global >= n) return;
+
+ g_odata[tid_global] += s_data[blockIdx.x];
+
+ }
+
+ /*
+ * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+ */
+ /*
+ void scan(int n, int *odata, const int *idata) {
+
+ // TODO
+ int n_new = n;
+
+ //check for non-2powerN
+ if (1 << ilog2ceil(n) != n)
+ n_new = (1 << ilog2ceil(n));
+
+ int fullBlocksPerGrid((n_new + blockSize - 1) / blockSize);
+
+ cudaMalloc((void**)&dev_arrayA, n_new * sizeof(int));
+ checkCUDAErrorFn("cudaMalloc dev_arrayA failed!");
+
+ //Initialize to Zero
+            kernInitZero <<<fullBlocksPerGrid, blockSize>>> (n_new, dev_arrayA);
+ checkCUDAErrorFn("kernInitZero failed!");
+
+ // Fill dev_arrayA with idata
+ cudaMemcpy(dev_arrayA, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+ checkCUDAErrorFn("cudaMemcpyToSymbol from idata to dev_arrayA failed!");
+
+ bool tmp = true;
+ try {
+ timer().startGpuTimer();
+ //printf("IN WEScan timer started!\n");
+ }
+ catch (const std::runtime_error& e) {
+ tmp = false;
+ }
+
+ // Upstream
+ int pow2d1 = 0;
+ int pow2d = 0;
+ for (int d = 0; d <= ilog2ceil(n_new)-1; d++) {
+ pow2d = 1 << (d);
+ pow2d1 = 1 << (d+1);
+                kernEffScanUpSweep <<<fullBlocksPerGrid, blockSize>>> (n_new, pow2d, pow2d1, dev_arrayA);
+ checkCUDAErrorFn("kernEffScanUpSweep failed!");
+ }
+
+ // Downstream
+ int *zero = new int[1];
+ zero[0] = 0;
+ cudaMemcpy(dev_arrayA + n_new-1, zero, 1*sizeof(int), cudaMemcpyHostToDevice);
+
+ for (int d = ilog2ceil(n_new)-1; d >= 0; d--) {
+ pow2d = 1 << (d);
+ pow2d1 = 1 << (d + 1);
+                kernEffScanDownSweep <<<fullBlocksPerGrid, blockSize>>> (n_new, pow2d, pow2d1, dev_arrayA);
+                checkCUDAErrorFn("kernEffScanDownSweep failed!");
+ }
+
+ if (tmp == true) {
+ timer().endGpuTimer();
+ //printf("IN WEScan timer ended!\n");
+ }
+
+ // Copy back to cpu
+ cudaMemcpy(odata, dev_arrayA, n*sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAErrorFn("cudaMemcpyFromSymbol from dev_arrayA to odata failed!");
+
+ //printf("BBT Scan Final Computed : \n");
+ //printArray(n, odata, true);
+
+ cudaFree(dev_arrayA);
+ return;
+ }
+ */
+
+
+ void oldScan(int n_new, int *odata, int *idata) {
+
+ int fullBlocksPerGrid((n_new + blockSize - 1) / blockSize);
+
+ // Upstream
+ int pow2d1 = 0;
+ int pow2d = 0;
+ for (int d = 0; d <= ilog2ceil(n_new) - 1; d++) {
+ pow2d = 1 << (d);
+ pow2d1 = 1 << (d + 1);
+                kernEffScanUpSweep <<<fullBlocksPerGrid, blockSize>>> (n_new, pow2d, pow2d1, idata);
+ checkCUDAErrorFn("kernEffScanUpSweep failed!");
+ }
+
+ // Downstream
+            int zero = 0;
+            cudaMemcpy(idata + n_new - 1, &zero, sizeof(int), cudaMemcpyHostToDevice);
+
+ for (int d = ilog2ceil(n_new) - 1; d >= 0; d--) {
+ pow2d = 1 << (d);
+ pow2d1 = 1 << (d + 1);
+                kernEffScanDownSweep <<<fullBlocksPerGrid, blockSize>>> (n_new, pow2d, pow2d1, idata);
+                checkCUDAErrorFn("kernEffScanDownSweep failed!");
+ }
+
+ // Copy back to out
+ cudaMemcpy(odata, idata, n_new * sizeof(int), cudaMemcpyDeviceToDevice);
+ checkCUDAErrorFn("cudaMemcpyFromSymbol from dev_arrayB to odata failed!");
+ return;
+ }
+
+
+ void scan(int n, int *odata, const int *idata) {
+
+ // TODO
+ int n_new = n;
+ //int *tmp_print = new int[n];
+
+ //check for non-2powerN
+ if (1 << ilog2ceil(n) != n)
+ n_new = (1 << ilog2ceil(n));
+
+ int fullBlocksPerGrid((n_new + blockSize - 1) / blockSize);
+
+ cudaMalloc((void**)&dev_arrayA, n_new * sizeof(int));
+ checkCUDAErrorFn("cudaMalloc dev_arrayA failed!");
+
+ //Initialize to Zero
+            kernInitZero <<<fullBlocksPerGrid, blockSize>>> (n_new, dev_arrayA);
+ checkCUDAErrorFn("kernInitZero failed!");
+
+ // Fill dev_arrayA with idata
+ cudaMemcpy(dev_arrayA, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+ checkCUDAErrorFn("cudaMemcpyToSymbol from idata to dev_arrayA failed!");
+
+ // More arrays
+            cudaMalloc((void**)&dev_odata, n_new * sizeof(int));
+            checkCUDAErrorFn("cudaMalloc dev_odata failed!");
+
+            cudaMalloc((void**)&dev_lastElements, n_new * sizeof(int));
+            checkCUDAErrorFn("cudaMalloc dev_lastElements failed!");
+
+            cudaMalloc((void**)&dev_lastElements2, n_new * sizeof(int));
+            checkCUDAErrorFn("cudaMalloc dev_lastElements2 failed!");
+
+ bool tmp = true;
+ try {
+ timer().startGpuTimer();
+ //printf("IN WEScan timer started!\n");
+ }
+ catch (const std::runtime_error& e) {
+ tmp = false;
+ }
+
+ //printf("\n==========================STARTED WES================================\n");
+ //printf("Pre Scan Array \n");
+ //printArray(n, idata, true);
+
+ //fullBlocksPerGrid = 4;
+
+            kernScanShared <<<fullBlocksPerGrid, blockSize / 2, (2 * blockSize + blockSize / 8) * sizeof(int)>>> (n_new, dev_odata, dev_arrayA);
+ //cudaMemcpy(tmp_print, dev_odata, n_new * sizeof(int), cudaMemcpyDeviceToHost);
+ //checkCUDAErrorFn("cudaMemcpyFromSymbol from dev_odata to tmp_print failed!");
+ //printf("kernScanShared results per %d blocks\n", fullBlocksPerGrid);
+ //printArray(n_new, tmp_print, true);
+
+            kernGetLastElement <<<fullBlocksPerGrid, blockSize, blockSize * sizeof(int)>>> (n_new, dev_lastElements, dev_odata, dev_arrayA);
+ //cudaMemcpy(tmp_print, dev_lastElements, fullBlocksPerGrid * sizeof(int), cudaMemcpyDeviceToHost);
+ //checkCUDAErrorFn("cudaMemcpyFromSymbol from dev_odata to odata failed!");
+ //printf("kernGetLastElement results\n");
+ //printArray(fullBlocksPerGrid, tmp_print, true);
+
+ oldScan(fullBlocksPerGrid, dev_lastElements2, dev_lastElements);
+ //kernScanShared << < 1, blockSize / 2, blockSize * sizeof(int) >> > (n_new, dev_lastElements2, dev_lastElements);
+ //cudaMemcpy(tmp_print, dev_lastElements2, fullBlocksPerGrid * sizeof(int), cudaMemcpyDeviceToHost);
+ //checkCUDAErrorFn("cudaMemcpyFromSymbol from dev_odata to odata failed!");
+ //printf("scan on kernGetLastElement\n");
+ //printArray(fullBlocksPerGrid, tmp_print, true);
+
+            kernUpdateScan <<<fullBlocksPerGrid, blockSize>>> (n_new, dev_lastElements2, dev_odata, dev_arrayA);
+ //cudaMemcpy(tmp_print, dev_odata, n_new * sizeof(int), cudaMemcpyDeviceToHost);
+ //checkCUDAErrorFn("cudaMemcpyFromSymbol from dev_odata to odata failed!");
+ //printf("FINAL Scan results\n");
+ //printArray(n_new, tmp_print, true);
+ //printf("\n==========================FINISHED WES================================\n");
+
+
+ if (tmp == true) {
+ timer().endGpuTimer();
+ //printf("IN WEScan timer ended!\n");
+ }
+
+ // Copy back
+ cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAErrorFn("cudaMemcpyFromSymbol from dev_arrayA to odata failed!");
+
+ //printf("BBT Scan Final Computed : \n");
+ //printArray(n, odata, true);
+ cudaFree(dev_arrayA);
+ cudaFree(dev_odata);
+ cudaFree(dev_lastElements);
+ cudaFree(dev_lastElements2);
+
+ return;
+ }
+
+
+
+ void compact_scan(int n, int *dev_odata, int *dev_idata) {
+
+ // TODO
+ int n_new = n;
+ //int *tmp_print = new int[n];
+
+ //check for non-2powerN
+ if (1 << ilog2ceil(n) != n) {
+ n_new = (1 << ilog2ceil(n));
+ }
+
+ int fullBlocksPerGrid((n_new + blockSize - 1) / blockSize);
+
+ cudaMalloc((void**)&dev_arrayA, n_new * sizeof(int));
+ checkCUDAErrorFn("cudaMalloc dev_arrayA failed!");
+
+ //Initialize to Zero
+            kernInitZero <<<fullBlocksPerGrid, blockSize>>> (n_new, dev_arrayA);
+ checkCUDAErrorFn("kernInitZero failed!");
+
+ // Fill dev_arrayA with idata
+ cudaMemcpy(dev_arrayA, dev_idata, n * sizeof(int), cudaMemcpyDeviceToDevice);
+ checkCUDAErrorFn("cudaMemcpyToSymbol from idata to dev_arrayA failed!");
+
+ // More arrays
+            cudaMalloc((void**)&dev_lastElements, n_new * sizeof(int));
+            checkCUDAErrorFn("cudaMalloc dev_lastElements failed!");
+
+            cudaMalloc((void**)&dev_lastElements2, n_new * sizeof(int));
+            checkCUDAErrorFn("cudaMalloc dev_lastElements2 failed!");
+
+ bool tmp = true;
+ try {
+ timer().startGpuTimer();
+ //printf("IN WEScan timer started!\n");
+ }
+ catch (const std::runtime_error& e) {
+ tmp = false;
+ }
+
+            kernScanShared <<<fullBlocksPerGrid, blockSize / 2, (2 * blockSize + blockSize / 8) * sizeof(int)>>> (n_new, dev_odata, dev_arrayA);
+ //cudaMemcpy(tmp_print, dev_odata, n_new * sizeof(int), cudaMemcpyDeviceToHost);
+
+            kernGetLastElement <<<fullBlocksPerGrid, blockSize, blockSize * sizeof(int)>>> (n_new, dev_lastElements, dev_odata, dev_arrayA);
+ //cudaMemcpy(tmp_print, dev_lastElements, fullBlocksPerGrid * sizeof(int), cudaMemcpyDeviceToHost);
+
+ oldScan(fullBlocksPerGrid, dev_lastElements2, dev_lastElements);
+ //kernScanShared << < 1, blockSize / 2, blockSize * sizeof(int) >> > (n_new, dev_lastElements2, dev_lastElements);
+
+            kernUpdateScan <<<fullBlocksPerGrid, blockSize>>> (n_new, dev_lastElements2, dev_odata, dev_arrayA);
+ //cudaMemcpy(tmp_print, dev_odata, n_new * sizeof(int), cudaMemcpyDeviceToHost);
+
+ if (tmp == true) {
+ timer().endGpuTimer();
+ //printf("IN WEScan timer ended!\n");
+ }
+
+ // Copy back
+ //cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+ //checkCUDAErrorFn("cudaMemcpyFromSymbol from dev_arrayA to odata failed!");
+
+ //printf("BBT Scan Final Computed : \n");
+ //printArray(n, odata, true);
+ cudaFree(dev_arrayA);
+ cudaFree(dev_lastElements);
+ cudaFree(dev_lastElements2);
+
+ return;
+ }
+
+
+ /**
+ * Performs stream compaction on idata, storing the result into odata.
+ * All zeroes are discarded.
+ *
+ * @param n The number of elements in idata.
+ * @param odata The array into which to store elements.
+ * @param idata The array of elements to compact.
+ * @returns The number of elements remaining after compaction.
+ */
+
+ int compact(int n, int *odata, const int *idata) {
+
+ // TODO
+            int fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+ cudaMalloc((void**)&dev_bools, n * sizeof(int));
+ checkCUDAErrorFn("cudaMalloc dev_bools failed!");
+
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            checkCUDAErrorFn("cudaMalloc dev_idata failed!");
+
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAErrorFn("cudaMemcpy from idata to dev_idata failed!");
+
+            int n_new = n;
+            if (1 << ilog2ceil(n) != n) {
+                n_new = (1 << ilog2ceil(n));
+            } // allocate enough memory to handle non-power-of-two sizes
+ cudaMalloc((void**)&dev_indices, n_new * sizeof(int));
+ checkCUDAErrorFn("cudaMalloc dev_indices failed!");
+
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+            checkCUDAErrorFn("cudaMalloc dev_odata failed!");
+
+
+ timer().startGpuTimer();
+
+ //Compute bools
+            Common::kernMapToBoolean<<<fullBlocksPerGrid, blockSize>>>(n, dev_bools, dev_idata);
+ checkCUDAErrorFn("kernMapToBoolean failed!");
+
+ //compute scans
+ compact_scan(n, dev_indices, dev_bools);
+
+ //scatter
+            Common::kernScatter<<<fullBlocksPerGrid, blockSize>>>(n, dev_odata, dev_idata, dev_bools, dev_indices);
+ checkCUDAErrorFn("kernScatter failed!");
+
+ timer().endGpuTimer();
+
+ // Copy back to cpu
+ cudaMemcpy(odata, dev_odata, n*sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAErrorFn("cudaMemcpyFromSymbol from dev_odata to odata failed!");
+
+
+            int lastEl = 0;
+            cudaMemcpy(&lastEl, dev_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAErrorFn("cudaMemcpy from dev_indices to lastEl failed!");
+
+            //printf("GPU Compaction : \n");
+            //printArray(n, odata, true);
+
+            cudaFree(dev_bools);
+            cudaFree(dev_idata);
+            cudaFree(dev_indices);
+            cudaFree(dev_odata);
+
+            // Exclusive scan gives the last element's index; if it was kept, count is one more.
+            if (idata[n - 1] != 0) {
+                return lastEl + 1;
+            }
+            else {
+                return lastEl;
+            }
+ }
+ }
+}
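The up-sweep/down-sweep pair in `efficient.cu` implements the Blelloch work-efficient scan. Stripped of CUDA launch mechanics, the same index arithmetic can be sketched serially (illustrative only; it assumes the input length is already padded to a power of two, as `scan()` arranges with `1 << ilog2ceil(n)`):

```cpp
#include <cassert>
#include <vector>

// Serial sketch of the work-efficient (Blelloch) scan: the up-sweep builds
// partial sums in place as a binary tree reduction, then the down-sweep
// converts them into an exclusive scan. a.size() must be a power of two.
void blellochScan(std::vector<int>& a) {
    int n = (int)a.size();
    for (int d = 1; d < n; d <<= 1)           // up-sweep (reduce)
        for (int k = 0; k < n; k += 2 * d)
            a[k + 2 * d - 1] += a[k + d - 1];
    a[n - 1] = 0;                             // clear the root
    for (int d = n / 2; d >= 1; d >>= 1)      // down-sweep
        for (int k = 0; k < n; k += 2 * d) {
            int t = a[k + d - 1];
            a[k + d - 1] = a[k + 2 * d - 1];
            a[k + 2 * d - 1] += t;
        }
}
```

Each level of either sweep maps to one kernel launch on the GPU (`kernEffScanUpSweep` / `kernEffScanDownSweep`), which is where the O(n) total work comes from, versus O(n log n) for the naive scan.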
diff --git a/stream_compaction/Project2-Stream-Compaction/stream_compaction/efficient.h b/stream_compaction/Project2-Stream-Compaction/stream_compaction/efficient.h
new file mode 100644
index 0000000..803cb4f
--- /dev/null
+++ b/stream_compaction/Project2-Stream-Compaction/stream_compaction/efficient.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "common.h"
+
+namespace StreamCompaction {
+ namespace Efficient {
+ StreamCompaction::Common::PerformanceTimer& timer();
+
+ void scan(int n, int *odata, const int *idata);
+
+ int compact(int n, int *odata, const int *idata);
+ }
+}
diff --git a/stream_compaction/Project2-Stream-Compaction/stream_compaction/naive.cu b/stream_compaction/Project2-Stream-Compaction/stream_compaction/naive.cu
new file mode 100644
index 0000000..49b8259
--- /dev/null
+++ b/stream_compaction/Project2-Stream-Compaction/stream_compaction/naive.cu
@@ -0,0 +1,106 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "common.h"
+#include "naive.h"
+
+
+namespace StreamCompaction {
+ namespace Naive {
+ using StreamCompaction::Common::PerformanceTimer;
+ PerformanceTimer& timer()
+ {
+ static PerformanceTimer timer;
+ return timer;
+ }
+
+ int *dev_arrayA;
+ int *dev_arrayB;
+
+ void printArray(int n, int *a, bool abridged = false) {
+ printf(" [ ");
+ for (int i = 0; i < n; i++) {
+ if (abridged && i + 2 == 15 && n > 16) {
+ i = n - 2;
+ printf("... ");
+ }
+ printf("%3d ", a[i]);
+ }
+ printf("]\n");
+ }
+
+ // TODO: __global__
+ __global__ void kernPrefixSumScanArray(int N, int pow2d1, int* arrA, int*arrB) {
+ int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (k >= N) return;
+
+ if (k >= pow2d1) {
+ arrB[k] = arrA[k - (pow2d1)] + arrA[k];
+ }
+ }
+
+ __global__ void kernExclusiveShiftArray(int N, int* arrA, int*arrB) {
+ int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+ if (k >= N) return;
+
+ if (k == 0) {
+ arrA[0] = 0;
+ }
+ else {
+ arrA[k] = arrB[k-1];
+ }
+ }
+
+ /**
+ * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+ */
+ void scan(int n, int *odata, const int *idata) {
+
+ int fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+ cudaMalloc((void**)&dev_arrayA, n*sizeof(int));
+ checkCUDAErrorFn("cudaMalloc dev_arrayA failed!");
+
+ cudaMalloc((void**)&dev_arrayB, n*sizeof(int));
+ checkCUDAErrorFn("cudaMalloc dev_arrayB failed!");
+
+ // Fill dev_arrayA with idata
+ cudaMemcpy(dev_arrayA, idata, n*sizeof(int), cudaMemcpyHostToDevice);
+ checkCUDAErrorFn("cudaMemcpyToSymbol from idata to dev_arrayA failed!");
+
+ // Fill dev_arrayB with idata
+ cudaMemcpy(dev_arrayB, idata, n*sizeof(int), cudaMemcpyHostToDevice);
+ checkCUDAErrorFn("cudaMemcpyToSymbol from idata to dev_arrayB failed!");
+
+ timer().startGpuTimer();
+
+ // Call Scan Kernel
+ int pow2d1 = 0;
+
+ for (int d = 1; d <= ilog2ceil(n); d++) {
+ pow2d1 = 1 << (d - 1);
+                kernPrefixSumScanArray<<<fullBlocksPerGrid, blockSize>>>(n, pow2d1, dev_arrayA, dev_arrayB);
+                checkCUDAErrorFn("kernPrefixSumScanArray failed!");
+
+ //Copy
+ cudaMemcpy(dev_arrayA, dev_arrayB, n*sizeof(int), cudaMemcpyDeviceToDevice);
+ }
+
+            kernExclusiveShiftArray <<<fullBlocksPerGrid, blockSize>>> (n, dev_arrayA, dev_arrayB);
+ checkCUDAErrorFn("kernExclusiveShiftArray failed!");
+
+ timer().endGpuTimer();
+
+            // Copy the result back to the host
+ cudaMemcpy(odata, dev_arrayA, n*sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAErrorFn("cudaMemcpyFromSymbol from dev_arrayA to odata failed!");
+
+
+ cudaFree(dev_arrayA);
+ cudaFree(dev_arrayB);
+ }
+ }
+}
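For readers skimming the diff, the `scan` above implements the naive (Hillis-Steele) pattern: `ilog2ceil(n)` sweeps where element `k` adds in the element `2^(d-1)` positions back (ping-ponging between two buffers), followed by a right shift to turn the inclusive scan exclusive. A plain C++ sketch of the same logic, useful as a CPU reference when debugging the kernels (`naiveExclusiveScan` is a hypothetical helper, not part of the project sources):

```cpp
#include <vector>

// CPU reference for the naive (Hillis-Steele) scan used by the CUDA kernels:
// each sweep adds the element 2^(d-1) positions back, producing an inclusive
// scan; a final right shift converts it to an exclusive scan.
std::vector<int> naiveExclusiveScan(const std::vector<int>& in) {
    int n = static_cast<int>(in.size());
    std::vector<int> a = in, b = in;
    for (int pow2d1 = 1; pow2d1 < n; pow2d1 <<= 1) {  // pow2d1 = 2^(d-1)
        for (int k = 0; k < n; ++k) {                 // one "kernel launch"
            b[k] = (k >= pow2d1) ? a[k - pow2d1] + a[k] : a[k];
        }
        a = b;                                        // ping-pong copy (A <- B)
    }
    // Shift right by one to convert inclusive scan -> exclusive scan.
    std::vector<int> out(n);
    for (int k = 0; k < n; ++k) out[k] = (k == 0) ? 0 : a[k - 1];
    return out;
}
```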
diff --git a/stream_compaction/Project2-Stream-Compaction/stream_compaction/naive.h b/stream_compaction/Project2-Stream-Compaction/stream_compaction/naive.h
new file mode 100644
index 0000000..37dcb06
--- /dev/null
+++ b/stream_compaction/Project2-Stream-Compaction/stream_compaction/naive.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include "common.h"
+
+namespace StreamCompaction {
+ namespace Naive {
+ StreamCompaction::Common::PerformanceTimer& timer();
+
+ void scan(int n, int *odata, const int *idata);
+ }
+}
diff --git a/stream_compaction/Project2-Stream-Compaction/stream_compaction/thrust.cu b/stream_compaction/Project2-Stream-Compaction/stream_compaction/thrust.cu
new file mode 100644
index 0000000..e5310b3
--- /dev/null
+++ b/stream_compaction/Project2-Stream-Compaction/stream_compaction/thrust.cu
@@ -0,0 +1,44 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/scan.h>
+#include "common.h"
+#include "thrust.h"
+
+namespace StreamCompaction {
+ namespace Thrust {
+ using StreamCompaction::Common::PerformanceTimer;
+ PerformanceTimer& timer()
+ {
+ static PerformanceTimer timer;
+ return timer;
+ }
+ /**
+ * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+ */
+
+ void scan(int n, int *odata, const int *idata) {
+
+
+ thrust::host_vector<int> host_idata(idata, idata + n);
+ thrust::host_vector<int> host_odata(odata, odata + n);
+
+ thrust::device_vector<int> device_idata = host_idata;
+ thrust::device_vector<int> device_odata = host_odata;
+ checkCUDAErrorFn("thrust::device_vector copy of idata/odata failed!");
+
+ timer().startGpuTimer();
+ thrust::exclusive_scan(device_idata.begin(), device_idata.end(), device_odata.begin());
+ timer().endGpuTimer();
+
+ // Copy back to cpu
+ thrust::copy(device_odata.begin(), device_odata.end(), odata);
+ }
+ }
+}
diff --git a/stream_compaction/Project2-Stream-Compaction/stream_compaction/thrust.h b/stream_compaction/Project2-Stream-Compaction/stream_compaction/thrust.h
new file mode 100644
index 0000000..fe98206
--- /dev/null
+++ b/stream_compaction/Project2-Stream-Compaction/stream_compaction/thrust.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include "common.h"
+
+namespace StreamCompaction {
+ namespace Thrust {
+ StreamCompaction::Common::PerformanceTimer& timer();
+
+ void scan(int n, int *odata, const int *idata);
+ }
+}